Mirror of https://github.com/meilisearch/MeiliSearch (synced 2025-07-04 04:17:10 +02:00)

Commit e1fbfde6c4: Merge branch 'main' into merge-release-v1.8.1-in-main
105 changed files with 5863 additions and 1031 deletions
@@ -203,7 +203,7 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) {
            "string" => (field_name, AllowedType::String),
            "boolean" => (field_name, AllowedType::Boolean),
            "number" => (field_name, AllowedType::Number),
            // if the pattern isn't reconized, we keep the whole field.
            // if the pattern isn't recognized, we keep the whole field.
            _otherwise => (header, AllowedType::String),
        },
        None => (header, AllowedType::String),

@@ -32,6 +32,8 @@ pub enum InternalError {
    DatabaseClosing,
    #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))]
    DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> },
    #[error("Missing {key} in the fieldids weights mapping.")]
    FieldidsWeightsMapMissingEntry { key: FieldId },
    #[error(transparent)]
    FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry),
    #[error("Missing {key} in the field id mapping.")]

@@ -46,8 +48,6 @@ pub enum InternalError {
    GrenadInvalidFormatVersion,
    #[error("Invalid merge while processing {process}")]
    IndexingMergingKeys { process: &'static str },
    #[error("{}", HeedError::InvalidDatabaseTyping)]
    InvalidDatabaseTyping,
    #[error(transparent)]
    RayonThreadPool(#[from] ThreadPoolBuildError),
    #[error(transparent)]

@@ -117,10 +117,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
    InvalidGeoField(#[from] GeoError),
    #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
    InvalidVectorDimensions { expected: usize, found: usize },
    #[error("The `_vectors.{subfield}` field in the document with id: `{document_id}` is not an array. Was expecting an array of floats or an array of arrays of floats but instead got `{value}`.")]
    InvalidVectorsType { document_id: Value, value: Value, subfield: String },
    #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
    InvalidVectorsMapType { document_id: Value, value: Value },
    InvalidVectorsMapType { document_id: String, value: Value },
    #[error("{0}")]
    InvalidFilter(String),
    #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]

@@ -427,7 +425,6 @@ impl From<HeedError> for Error {
            // TODO use the encoding
            HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })),
            HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })),
            HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
            HeedError::DatabaseClosing => InternalError(DatabaseClosing),
            HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions),
        }
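Note on the new error variant above: the `#[error(...)]` attribute is what gives `FieldidsWeightsMapMissingEntry` its display message. A minimal standalone sketch of that formatting (not the milli crate itself; it assumes the `thiserror` crate the enum visibly relies on and the `FieldId = u16` alias from lib.rs):

    use thiserror::Error;

    type FieldId = u16;

    #[derive(Debug, Error)]
    enum InternalError {
        // Same attribute as in the hunk above: `{key}` is interpolated into the message.
        #[error("Missing {key} in the fieldids weights mapping.")]
        FieldidsWeightsMapMissingEntry { key: FieldId },
    }

    fn main() {
        let err = InternalError::FieldidsWeightsMapMissingEntry { key: 3 };
        assert_eq!(err.to_string(), "Missing 3 in the fieldids weights mapping.");
    }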
milli/src/fieldids_weights_map.rs (new file, 48 lines)

@@ -0,0 +1,48 @@
//! The fieldids weights map is in charge of linking the searchable fields with their weights.

use std::collections::HashMap;

use serde::{Deserialize, Serialize};

use crate::{FieldId, FieldsIdsMap, Weight};

#[derive(Debug, Default, Serialize, Deserialize)]
pub struct FieldidsWeightsMap {
    map: HashMap<FieldId, Weight>,
}

impl FieldidsWeightsMap {
    /// Insert a field id -> weight into the map.
    /// If the map did not have this key present, `None` is returned.
    /// If the map did have this key present, the value is updated, and the old value is returned.
    pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option<Weight> {
        self.map.insert(fid, weight)
    }

    /// Create the map from the fields ids map.
    /// Should only be called in the case there are NO searchable attributes.
    /// All the fields will be inserted in the order of the fields ids map with a weight of 0.
    pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self {
        FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() }
    }

    /// Removes a field id from the map, returning the associated weight previously in the map.
    pub fn remove(&mut self, fid: FieldId) -> Option<Weight> {
        self.map.remove(&fid)
    }

    /// Returns the weight corresponding to the key.
    pub fn weight(&self, fid: FieldId) -> Option<Weight> {
        self.map.get(&fid).copied()
    }

    /// Returns the highest weight contained in the map, if any.
    pub fn max_weight(&self) -> Option<Weight> {
        self.map.values().copied().max()
    }

    /// Return an iterator visiting all field ids in arbitrary order.
    pub fn ids(&self) -> impl Iterator<Item = FieldId> + '_ {
        self.map.keys().copied()
    }
}
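To make the intent of the new map concrete, here is a minimal standalone sketch of the call pattern (plain std types only; `FieldId` and `Weight` are assumed to be the `u16` aliases declared in milli's lib.rs, and `FieldsIdsMap`/serde support are left out):

    use std::collections::HashMap;

    type FieldId = u16;
    type Weight = u16;

    // Mirror of the map above: field id -> weight, where a lower weight means a higher-priority attribute.
    #[derive(Debug, Default)]
    struct FieldidsWeightsMap {
        map: HashMap<FieldId, Weight>,
    }

    impl FieldidsWeightsMap {
        fn insert(&mut self, fid: FieldId, weight: Weight) -> Option<Weight> {
            self.map.insert(fid, weight)
        }
        fn weight(&self, fid: FieldId) -> Option<Weight> {
            self.map.get(&fid).copied()
        }
        fn max_weight(&self) -> Option<Weight> {
            self.map.values().copied().max()
        }
    }

    fn main() {
        // Weights follow the order of the searchable attributes: fid 0 ranks before fid 3.
        let mut weights = FieldidsWeightsMap::default();
        weights.insert(0, 0);
        weights.insert(3, 1);
        assert_eq!(weights.weight(3), Some(1));
        assert_eq!(weights.max_weight(), Some(1));
    }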
@@ -195,7 +195,7 @@ mod tests {
    fn merge_cbo_roaring_bitmaps() {
        let mut buffer = Vec::new();

        let small_data = vec![
        let small_data = [
            RoaringBitmap::from_sorted_iter(1..4).unwrap(),
            RoaringBitmap::from_sorted_iter(2..5).unwrap(),
            RoaringBitmap::from_sorted_iter(4..6).unwrap(),

@@ -209,7 +209,7 @@ mod tests {
        let expected = RoaringBitmap::from_sorted_iter(1..6).unwrap();
        assert_eq!(bitmap, expected);

        let medium_data = vec![
        let medium_data = [
            RoaringBitmap::from_sorted_iter(1..4).unwrap(),
            RoaringBitmap::from_sorted_iter(2..5).unwrap(),
            RoaringBitmap::from_sorted_iter(4..8).unwrap(),
@@ -1,5 +1,6 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
use std::path::Path;

@@ -22,11 +23,12 @@ use crate::heed_codec::{
};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::vector::EmbeddingConfig;
use crate::vector::{Embedding, EmbeddingConfig};
use crate::{
    default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
    FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
    Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, BEU64,
    FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
    FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
    Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64,
};

pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;

@@ -42,6 +44,7 @@ pub mod main_key {
    pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
    pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution";
    pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
    pub const FIELDIDS_WEIGHTS_MAP_KEY: &str = "fieldids-weights-map";
    pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
    pub const GEO_RTREE_KEY: &str = "geo-rtree";
    pub const PRIMARY_KEY_KEY: &str = "primary-key";
@@ -181,7 +184,7 @@ impl Index {
        options.max_dbs(25);

        let env = options.open(path)?;
        let env = unsafe { options.open(path) }?;
        let mut wtxn = env.write_txn()?;
        let main = env.database_options().name(MAIN).create(&mut wtxn)?;
        let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?;

@@ -291,6 +294,11 @@ impl Index {
        self.env.read_txn()
    }

    /// Create a static read transaction to be able to read the index without keeping a reference to it.
    pub fn static_read_txn(&self) -> heed::Result<RoTxn<'static>> {
        self.env.clone().static_read_txn()
    }

    /// Returns the canonicalized path where the heed `Env` of this `Index` lives.
    pub fn path(&self) -> &Path {
        self.env.path()
@@ -414,6 +422,65 @@ impl Index {
            .unwrap_or_default())
    }

    /* fieldids weights map */
    // This maps the fields ids to their weights.
    // Their weights is defined by the ordering of the searchable attributes.

    /// Writes the fieldids weights map which associates the field ids to their weights
    pub(crate) fn put_fieldids_weights_map(
        &self,
        wtxn: &mut RwTxn,
        map: &FieldidsWeightsMap,
    ) -> heed::Result<()> {
        self.main.remap_types::<Str, SerdeJson<_>>().put(
            wtxn,
            main_key::FIELDIDS_WEIGHTS_MAP_KEY,
            map,
        )
    }

    /// Get the fieldids weights map which associates the field ids to their weights
    pub fn fieldids_weights_map(&self, rtxn: &RoTxn) -> heed::Result<FieldidsWeightsMap> {
        self.main
            .remap_types::<Str, SerdeJson<_>>()
            .get(rtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)?
            .map(Ok)
            .unwrap_or_else(|| {
                Ok(FieldidsWeightsMap::from_field_id_map_without_searchable(
                    &self.fields_ids_map(rtxn)?,
                ))
            })
    }

    /// Delete the fieldsids weights map
    pub fn delete_fieldids_weights_map(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
        self.main.remap_key_type::<Str>().delete(wtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)
    }

    pub fn searchable_fields_and_weights<'a>(
        &self,
        rtxn: &'a RoTxn,
    ) -> Result<Vec<(Cow<'a, str>, FieldId, Weight)>> {
        let fid_map = self.fields_ids_map(rtxn)?;
        let weight_map = self.fieldids_weights_map(rtxn)?;
        let searchable = self.searchable_fields(rtxn)?;

        searchable
            .into_iter()
            .map(|field| -> Result<_> {
                let fid = fid_map.id(&field).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
                    field_name: field.to_string(),
                    process: "searchable_fields_and_weights",
                })?;
                let weight = weight_map
                    .weight(fid)
                    .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?;

                Ok((field, fid, weight))
            })
            .collect()
    }

    /* geo rtree */

    /// Writes the provided `rtree` which associates coordinates to documents ids.
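`searchable_fields_and_weights` joins three lookups (the searchable field names, the fields ids map, and the new weights map) into `(name, fid, weight)` triples, failing if either mapping misses an entry. A standalone sketch of that join, with hypothetical in-memory maps standing in for the LMDB reads:

    use std::collections::HashMap;

    type FieldId = u16;
    type Weight = u16;

    // Stand-ins for fields_ids_map and fieldids_weights_map; the error type is a plain String here.
    fn searchable_fields_and_weights(
        searchable: &[&str],
        fid_map: &HashMap<String, FieldId>,
        weight_map: &HashMap<FieldId, Weight>,
    ) -> Result<Vec<(String, FieldId, Weight)>, String> {
        searchable
            .iter()
            .map(|&field| {
                let fid = *fid_map.get(field).ok_or_else(|| format!("missing field id for {field}"))?;
                let weight =
                    *weight_map.get(&fid).ok_or_else(|| format!("missing weight for fid {fid}"))?;
                Ok((field.to_string(), fid, weight))
            })
            .collect()
    }

    fn main() {
        let fid_map = HashMap::from([("name".to_string(), 0), ("realName".to_string(), 3)]);
        let weight_map = HashMap::from([(0, 0), (3, 1)]);
        let triples =
            searchable_fields_and_weights(&["name", "realName"], &fid_map, &weight_map).unwrap();
        assert_eq!(triples[0], ("name".to_string(), 0, 0));
        assert_eq!(triples[1], ("realName".to_string(), 3, 1));
    }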
@@ -578,33 +645,42 @@ impl Index {
        wtxn: &mut RwTxn,
        user_fields: &[&str],
        fields_ids_map: &FieldsIdsMap,
    ) -> heed::Result<()> {
    ) -> Result<()> {
        // We can write the user defined searchable fields as-is.
        self.put_user_defined_searchable_fields(wtxn, user_fields)?;

        let mut weights = FieldidsWeightsMap::default();

        // Now we generate the real searchable fields:
        // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion.
        // 2. Iterate over the user defined searchable fields.
        // 3. If a user defined field is a subset of a field defined in the fields_ids_map
        // (ie doggo.name is a subset of doggo) then we push it at the end of the fields.
        let mut real_fields = user_fields.to_vec();
        // (ie doggo.name is a subset of doggo) right after doggo and with the same weight.
        let mut real_fields = Vec::new();

        for field_from_map in fields_ids_map.names() {
            for user_field in user_fields {
        for (id, field_from_map) in fields_ids_map.iter() {
            for (weight, user_field) in user_fields.iter().enumerate() {
                if crate::is_faceted_by(field_from_map, user_field)
                    && !user_fields.contains(&field_from_map)
                    && !real_fields.contains(&field_from_map)
                {
                    real_fields.push(field_from_map);

                    let weight: u16 =
                        weight.try_into().map_err(|_| UserError::AttributeLimitReached)?;
                    weights.insert(id, weight);
                }
            }
        }

        self.put_searchable_fields(wtxn, &real_fields)
        self.put_searchable_fields(wtxn, &real_fields)?;
        self.put_fieldids_weights_map(wtxn, &weights)?;
        Ok(())
    }

    pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
        let did_delete_searchable = self.delete_searchable_fields(wtxn)?;
        let did_delete_user_defined = self.delete_user_defined_searchable_fields(wtxn)?;
        self.delete_fieldids_weights_map(wtxn)?;
        Ok(did_delete_searchable || did_delete_user_defined)
    }
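The loop above is where weights are decided: the weight of a real searchable field is the position of the user-defined attribute it falls under, so a nested field such as `doggo.age` inherits the weight of `doggo`. A standalone sketch of that assignment (hypothetical field names; the `is_faceted_by` prefix test is inlined and the `AttributeLimitReached` overflow check is omitted):

    type FieldId = u16;
    type Weight = u16;

    // Same prefix test as milli's `is_faceted_by`: `doggo.age` counts as being "under" `doggo`.
    fn is_nested_under(field: &str, parent: &str) -> bool {
        field.starts_with(parent) && field[parent.len()..].chars().next().map_or(true, |c| c == '.')
    }

    fn main() {
        // fields_ids_map order (id = position) and the user-defined searchable order.
        let fields_ids_map = ["doggo", "name", "realName", "doggo.age"];
        let user_fields = ["name", "doggo"];

        let mut real_fields: Vec<&str> = Vec::new();
        let mut weights: Vec<(FieldId, Weight)> = Vec::new();
        for (id, &field) in fields_ids_map.iter().enumerate() {
            for (weight, &user_field) in user_fields.iter().enumerate() {
                if is_nested_under(field, user_field) && !real_fields.contains(&field) {
                    real_fields.push(field);
                    weights.push((id as FieldId, weight as Weight));
                }
            }
        }
        // `doggo.age` inherits the weight of `doggo` (1); `name` keeps weight 0; `realName` is not searchable.
        assert_eq!(real_fields, ["doggo", "name", "doggo.age"]);
        assert_eq!(weights, [(0, 1), (1, 0), (3, 1)]);
    }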
@@ -623,28 +699,31 @@ impl Index {
    }

    /// Returns the searchable fields, those are the fields that are indexed,
    /// if the searchable fields aren't there it means that **all** the fields are indexed.
    pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Option<Vec<&'t str>>> {
    pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Vec<Cow<'t, str>>> {
        self.main
            .remap_types::<Str, SerdeBincode<Vec<&'t str>>>()
            .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)
            .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)?
            .map(|fields| Ok(fields.into_iter().map(Cow::Borrowed).collect()))
            .unwrap_or_else(|| {
                Ok(self
                    .fields_ids_map(rtxn)?
                    .names()
                    .map(|field| Cow::Owned(field.to_string()))
                    .collect())
            })
    }

    /// Identical to `searchable_fields`, but returns the ids instead.
    pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result<Option<Vec<FieldId>>> {
        match self.searchable_fields(rtxn)? {
            Some(fields) => {
                let fields_ids_map = self.fields_ids_map(rtxn)?;
                let mut fields_ids = Vec::new();
                for name in fields {
                    if let Some(field_id) = fields_ids_map.id(name) {
                        fields_ids.push(field_id);
                    }
                }
                Ok(Some(fields_ids))
    pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result<Vec<FieldId>> {
        let fields = self.searchable_fields(rtxn)?;
        let fields_ids_map = self.fields_ids_map(rtxn)?;
        let mut fields_ids = Vec::new();
        for name in fields {
            if let Some(field_id) = fields_ids_map.id(&name) {
                fields_ids.push(field_id);
            }
            None => Ok(None),
        }
        Ok(fields_ids)
    }

    /// Writes the searchable fields, when this list is specified, only these are indexed.
@ -1527,6 +1606,44 @@ impl Index {
|
|||
pub(crate) fn delete_search_cutoff(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
|
||||
self.main.remap_key_type::<Str>().delete(wtxn, main_key::SEARCH_CUTOFF)
|
||||
}
|
||||
|
||||
pub fn embeddings(
|
||||
&self,
|
||||
rtxn: &RoTxn<'_>,
|
||||
docid: DocumentId,
|
||||
) -> Result<BTreeMap<String, Vec<Embedding>>> {
|
||||
let mut res = BTreeMap::new();
|
||||
for row in self.embedder_category_id.iter(rtxn)? {
|
||||
let (embedder_name, embedder_id) = row?;
|
||||
let embedder_id = (embedder_id as u16) << 8;
|
||||
let mut embeddings = Vec::new();
|
||||
'vectors: for i in 0..=u8::MAX {
|
||||
let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
|
||||
.map(Some)
|
||||
.or_else(|e| match e {
|
||||
arroy::Error::MissingMetadata => Ok(None),
|
||||
e => Err(e),
|
||||
})
|
||||
.transpose();
|
||||
|
||||
let Some(reader) = reader else {
|
||||
break 'vectors;
|
||||
};
|
||||
|
||||
let embedding = reader?.item_vector(rtxn, docid)?;
|
||||
if let Some(embedding) = embedding {
|
||||
embeddings.push(embedding)
|
||||
} else {
|
||||
break 'vectors;
|
||||
}
|
||||
}
|
||||
|
||||
if !embeddings.is_empty() {
|
||||
res.insert(embedder_name.to_owned(), embeddings);
|
||||
}
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -1710,10 +1827,14 @@ pub(crate) mod tests {
|
|||
]))
|
||||
.unwrap();
|
||||
|
||||
db_snap!(index, field_distribution, 1);
|
||||
db_snap!(index, field_distribution, @r###"
|
||||
age 1 |
|
||||
id 2 |
|
||||
name 2 |
|
||||
"###);
|
||||
|
||||
db_snap!(index, word_docids,
|
||||
@r###"
|
||||
@r###"
|
||||
1 [0, ]
|
||||
2 [1, ]
|
||||
20 [1, ]
|
||||
|
@ -1722,18 +1843,6 @@ pub(crate) mod tests {
|
|||
"###
|
||||
);
|
||||
|
||||
db_snap!(index, field_distribution);
|
||||
|
||||
db_snap!(index, field_distribution,
|
||||
@r###"
|
||||
age 1 |
|
||||
id 2 |
|
||||
name 2 |
|
||||
"###
|
||||
);
|
||||
|
||||
// snapshot_index!(&index, "1", include: "^field_distribution$");
|
||||
|
||||
// we add all the documents a second time. we are supposed to get the same
|
||||
// field_distribution in the end
|
||||
index
|
||||
|
@ -1820,7 +1929,7 @@ pub(crate) mod tests {
|
|||
// ensure we get the right real searchable fields + user defined searchable fields
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let real = index.searchable_fields(&rtxn).unwrap().unwrap();
|
||||
let real = index.searchable_fields(&rtxn).unwrap();
|
||||
assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]);
|
||||
|
||||
let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap();
|
||||
|
@ -1840,7 +1949,7 @@ pub(crate) mod tests {
|
|||
// ensure we get the right real searchable fields + user defined searchable fields
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let real = index.searchable_fields(&rtxn).unwrap().unwrap();
|
||||
let real = index.searchable_fields(&rtxn).unwrap();
|
||||
assert_eq!(real, &["doggo", "name"]);
|
||||
let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap();
|
||||
assert_eq!(user_defined, &["doggo", "name"]);
|
||||
|
@ -1856,7 +1965,7 @@ pub(crate) mod tests {
|
|||
// ensure we get the right real searchable fields + user defined searchable fields
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let real = index.searchable_fields(&rtxn).unwrap().unwrap();
|
||||
let real = index.searchable_fields(&rtxn).unwrap();
|
||||
assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]);
|
||||
|
||||
let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap();
|
||||
|
@ -2395,6 +2504,14 @@ pub(crate) mod tests {
|
|||
11 0
|
||||
4 1
|
||||
"###);
|
||||
db_snap!(index, fields_ids_map, @r###"
|
||||
0 primary_key |
|
||||
"###);
|
||||
db_snap!(index, searchable_fields, @r###"["primary_key"]"###);
|
||||
db_snap!(index, fieldids_weights_map, @r###"
|
||||
fid weight
|
||||
0 0 |
|
||||
"###);
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
|
@ -2410,6 +2527,16 @@ pub(crate) mod tests {
|
|||
11 0
|
||||
4 1
|
||||
"###);
|
||||
db_snap!(index, fields_ids_map, @r###"
|
||||
0 primary_key |
|
||||
1 a |
|
||||
"###);
|
||||
db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###);
|
||||
db_snap!(index, fieldids_weights_map, @r###"
|
||||
fid weight
|
||||
0 0 |
|
||||
1 0 |
|
||||
"###);
|
||||
|
||||
index.delete_documents(Default::default());
|
||||
|
||||
|
@ -2420,6 +2547,16 @@ pub(crate) mod tests {
|
|||
11 0
|
||||
4 1
|
||||
"###);
|
||||
db_snap!(index, fields_ids_map, @r###"
|
||||
0 primary_key |
|
||||
1 a |
|
||||
"###);
|
||||
db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###);
|
||||
db_snap!(index, fieldids_weights_map, @r###"
|
||||
fid weight
|
||||
0 0 |
|
||||
1 0 |
|
||||
"###);
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
|
@ -2435,6 +2572,16 @@ pub(crate) mod tests {
|
|||
11 0
|
||||
4 1
|
||||
"###);
|
||||
db_snap!(index, fields_ids_map, @r###"
|
||||
0 primary_key |
|
||||
1 a |
|
||||
"###);
|
||||
db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###);
|
||||
db_snap!(index, fieldids_weights_map, @r###"
|
||||
fid weight
|
||||
0 0 |
|
||||
1 0 |
|
||||
"###);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let search = Search::new(&rtxn, &index);
|
||||
|
@ -2520,4 +2667,104 @@ pub(crate) mod tests {
|
|||
|
||||
db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn swapping_searchable_attributes() {
|
||||
// See https://github.com/meilisearch/meilisearch/issues/4484
|
||||
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_searchable_fields(vec![S("name")]);
|
||||
settings.set_filterable_fields(HashSet::from([S("age")]));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!({ "id": 1, "name": "Many", "age": 28, "realName": "Maxime" }))
|
||||
.unwrap();
|
||||
db_snap!(index, fields_ids_map, @r###"
|
||||
0 name |
|
||||
1 id |
|
||||
2 age |
|
||||
3 realName |
|
||||
"###);
|
||||
db_snap!(index, searchable_fields, @r###"["name"]"###);
|
||||
db_snap!(index, fieldids_weights_map, @r###"
|
||||
fid weight
|
||||
0 0 |
|
||||
"###);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_searchable_fields(vec![S("name"), S("realName")]);
|
||||
settings.set_filterable_fields(HashSet::from([S("age")]));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
// The order of the field id map shouldn't change
|
||||
db_snap!(index, fields_ids_map, @r###"
|
||||
0 name |
|
||||
1 id |
|
||||
2 age |
|
||||
3 realName |
|
||||
"###);
|
||||
db_snap!(index, searchable_fields, @r###"["name", "realName"]"###);
|
||||
db_snap!(index, fieldids_weights_map, @r###"
|
||||
fid weight
|
||||
0 0 |
|
||||
3 1 |
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn attribute_weights_after_swapping_searchable_attributes() {
|
||||
// See https://github.com/meilisearch/meilisearch/issues/4484
|
||||
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_searchable_fields(vec![S("name"), S("beverage")]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{ "id": 0, "name": "kefir", "beverage": "water" },
|
||||
{ "id": 1, "name": "tamo", "beverage": "kefir" }
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let mut search = index.search(&rtxn);
|
||||
let results = search.query("kefir").execute().unwrap();
|
||||
|
||||
// We should find kefir the dog first
|
||||
insta::assert_debug_snapshot!(results.documents_ids, @r###"
|
||||
[
|
||||
0,
|
||||
1,
|
||||
]
|
||||
"###);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_searchable_fields(vec![S("beverage"), S("name")]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let mut search = index.search(&rtxn);
|
||||
let results = search.query("kefir").execute().unwrap();
|
||||
|
||||
// We should find tamo first
|
||||
insta::assert_debug_snapshot!(results.documents_ids, @r###"
|
||||
[
|
||||
1,
|
||||
0,
|
||||
]
|
||||
"###);
|
||||
}
|
||||
}
|
||||
@@ -28,6 +28,7 @@ pub mod vector;
#[cfg(test)]
#[macro_use]
pub mod snapshot_tests;
mod fieldids_weights_map;

use std::collections::{BTreeMap, HashMap};
use std::convert::{TryFrom, TryInto};

@@ -52,6 +53,7 @@ pub use self::error::{
    Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
};
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fieldids_weights_map::FieldidsWeightsMap;
pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{
    BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,

@@ -77,6 +79,7 @@ pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
pub type FieldDistribution = BTreeMap<String, u64>;
pub type FieldId = u16;
pub type Weight = u16;
pub type Object = serde_json::Map<String, serde_json::Value>;
pub type Position = u32;
pub type RelativePosition = u16;

@@ -351,43 +354,13 @@ pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator<Item = impl AsR
/// assert!(!is_faceted_by("animaux.chien", "animaux.chie"));
/// ```
pub fn is_faceted_by(field: &str, facet: &str) -> bool {
    field.starts_with(facet)
        && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true)
    field.starts_with(facet) && field[facet.len()..].chars().next().map_or(true, |c| c == '.')
}

pub fn normalize_facet(original: &str) -> String {
    CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase()
}

/// Represents either a vector or an array of multiple vectors.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(transparent)]
pub struct VectorOrArrayOfVectors {
    #[serde(with = "either::serde_untagged_optional")]
    inner: Option<either::Either<Vec<f32>, Vec<Vec<f32>>>>,
}

impl VectorOrArrayOfVectors {
    pub fn into_array_of_vectors(self) -> Option<Vec<Vec<f32>>> {
        match self.inner? {
            either::Either::Left(vector) => Some(vec![vector]),
            either::Either::Right(vectors) => Some(vectors),
        }
    }
}

/// Normalize a vector by dividing the dimensions by the length of it.
pub fn normalize_vector(mut vector: Vec<f32>) -> Vec<f32> {
    let squared: f32 = vector.iter().map(|x| x * x).sum();
    let length = squared.sqrt();
    if length <= f32::EPSILON {
        vector
    } else {
        vector.iter_mut().for_each(|x| *x /= length);
        vector
    }
}

#[cfg(test)]
mod tests {
    use serde_json::json;
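For reference, the `normalize_vector` helper shown in the hunk above divides every component by the vector's Euclidean length. A quick standalone check of that math on a `[3.0, 4.0]` vector (length 5):

    // Standalone copy of the helper above, checked on a simple Pythagorean example.
    fn normalize_vector(mut vector: Vec<f32>) -> Vec<f32> {
        let squared: f32 = vector.iter().map(|x| x * x).sum();
        let length = squared.sqrt();
        if length <= f32::EPSILON {
            vector
        } else {
            vector.iter_mut().for_each(|x| *x /= length);
            vector
        }
    }

    fn main() {
        // sqrt(3^2 + 4^2) = 5, so the normalized vector is [0.6, 0.8].
        let v = normalize_vector(vec![3.0, 4.0]);
        assert!((v[0] - 0.6).abs() < 1e-6 && (v[1] - 0.8).abs() < 1e-6);
    }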
@@ -147,7 +147,7 @@ impl<'a> Search<'a> {

    pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
        if has_vector_search {
            let ctx = SearchContext::new(self.index, self.rtxn);
            let ctx = SearchContext::new(self.index, self.rtxn)?;
            filtered_universe(&ctx, &self.filter)
        } else {
            Ok(self.execute()?.candidates)

@@ -155,10 +155,10 @@ impl<'a> Search<'a> {
    }

    pub fn execute(&self) -> Result<SearchResult> {
        let mut ctx = SearchContext::new(self.index, self.rtxn);
        let mut ctx = SearchContext::new(self.index, self.rtxn)?;

        if let Some(searchable_attributes) = self.searchable_attributes {
            ctx.searchable_attributes(searchable_attributes)?;
            ctx.attributes_to_search_on(searchable_attributes)?;
        }

        let universe = filtered_universe(&ctx, &self.filter)?;
@@ -101,7 +101,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(

    let mut ranking_rule_universes: Vec<RoaringBitmap> =
        vec![RoaringBitmap::default(); ranking_rules_len];
    ranking_rule_universes[0] = universe.clone();
    ranking_rule_universes[0].clone_from(universe);
    let mut cur_ranking_rule_index = 0;

    /// Finish iterating over the current ranking rule, yielding

@@ -232,7 +232,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
    }

    cur_ranking_rule_index += 1;
    ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone();
    ranking_rule_universes[cur_ranking_rule_index].clone_from(&next_bucket.candidates);
    logger.start_iteration_ranking_rule(
        cur_ranking_rule_index,
        ranking_rules[cur_ranking_rule_index].as_ref(),
|
|
@ -163,7 +163,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||
Some(restricted_fids) => {
|
||||
let interned = self.word_interner.get(word).as_str();
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect();
|
||||
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
|
@ -192,7 +192,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||
Some(restricted_fids) => {
|
||||
let interned = self.word_interner.get(word).as_str();
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect();
|
||||
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
|
@ -242,7 +242,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||
Some(restricted_fids) => {
|
||||
let interned = self.word_interner.get(prefix).as_str();
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect();
|
||||
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
|
@ -271,7 +271,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||
Some(restricted_fids) => {
|
||||
let interned = self.word_interner.get(prefix).as_str();
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect();
|
||||
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
|
@ -315,11 +315,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||
.map_err(heed::Error::Decoding)?
|
||||
} else {
|
||||
// Compute the distance at the attribute level and store it in the cache.
|
||||
let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? {
|
||||
fids
|
||||
} else {
|
||||
self.index.fields_ids_map(self.txn)?.ids().collect()
|
||||
};
|
||||
let fids = self.index.searchable_fields_ids(self.txn)?;
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for fid in fids {
|
||||
// for each field, intersect left word bitmap and right word bitmap,
|
||||
|
@ -408,11 +404,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||
let prefix_docids = match proximity_precision {
|
||||
ProximityPrecision::ByAttribute => {
|
||||
// Compute the distance at the attribute level and store it in the cache.
|
||||
let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? {
|
||||
fids
|
||||
} else {
|
||||
self.index.fields_ids_map(self.txn)?.ids().collect()
|
||||
};
|
||||
let fids = self.index.searchable_fields_ids(self.txn)?;
|
||||
let mut prefix_docids = RoaringBitmap::new();
|
||||
// for each field, intersect left word bitmap and right word bitmap,
|
||||
// then merge the result in a global bitmap before storing it in the cache.
|
||||
|
|
|
@ -184,13 +184,7 @@ impl State {
|
|||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
|
||||
let searchable_fields_ids = {
|
||||
if let Some(fids) = ctx.index.searchable_fields_ids(ctx.txn)? {
|
||||
fids
|
||||
} else {
|
||||
ctx.index.fields_ids_map(ctx.txn)?.ids().collect()
|
||||
}
|
||||
};
|
||||
let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?;
|
||||
|
||||
let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
|
||||
// then check that there exists at least one attribute that has all of the terms
|
||||
|
|
|
@ -42,7 +42,7 @@ fn facet_number_values<'a>(
|
|||
}
|
||||
|
||||
/// Define the strategy used by the geo sort.
|
||||
/// The paramater represents the cache size, and, in the case of the Dynamic strategy,
|
||||
/// The parameter represents the cache size, and, in the case of the Dynamic strategy,
|
||||
/// the point where we move from using the iterative strategy to the rtree.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum Strategy {
|
||||
|
|
|
@ -258,7 +258,7 @@ pub(crate) mod tests {
|
|||
fn matching_words() {
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let mut ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
|
||||
let mut builder = TokenizerBuilder::default();
|
||||
let tokenizer = builder.build();
|
||||
let tokens = tokenizer.tokenize("split this world");
|
||||
|
|
|
@ -134,7 +134,7 @@ impl<'t> Matcher<'t, '_> {
|
|||
for (token_position, word_position, word) in words_positions {
|
||||
partial = match partial.match_token(word) {
|
||||
// token matches the partial match, but the match is not full,
|
||||
// we temporarly save the current token then we try to match the next one.
|
||||
// we temporarily save the current token then we try to match the next one.
|
||||
Some(MatchType::Partial(partial)) => {
|
||||
potential_matches.push((token_position, word_position, partial.char_len()));
|
||||
partial
|
||||
|
@ -506,7 +506,7 @@ mod tests {
|
|||
|
||||
impl<'a> MatcherBuilder<'a> {
|
||||
fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
|
||||
let mut ctx = SearchContext::new(index, rtxn);
|
||||
let mut ctx = SearchContext::new(index, rtxn).unwrap();
|
||||
let universe = filtered_universe(&ctx, &None).unwrap();
|
||||
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
|
||||
&mut ctx,
|
||||
|
@ -722,7 +722,7 @@ mod tests {
|
|||
@"…void void void void void split the world void void"
|
||||
);
|
||||
|
||||
// Text containing matches with diferent density.
|
||||
// Text containing matches with different density.
|
||||
let text = "split void the void void world void void void void void void void void void void split the world void void";
|
||||
let mut matcher = builder.build(text);
|
||||
// crop should return 10 last words with a marker at the start.
|
||||
|
|
|
@@ -49,13 +49,12 @@ pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use self::vector_sort::VectorSort;
use crate::error::FieldIdMapMissingEntry;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::apply_distinct_rule;
use crate::vector::Embedder;
use crate::{
    AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget,
    UserError,
    UserError, Weight,
};

/// A structure used throughout the execution of a search query.

@@ -71,8 +70,21 @@ pub struct SearchContext<'ctx> {
}

impl<'ctx> SearchContext<'ctx> {
    pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self {
        Self {
    pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Result<Self> {
        let searchable_fids = index.searchable_fields_and_weights(txn)?;
        let exact_attributes_ids = index.exact_attributes_ids(txn)?;

        let mut exact = Vec::new();
        let mut tolerant = Vec::new();
        for (_name, fid, weight) in searchable_fids {
            if exact_attributes_ids.contains(&fid) {
                exact.push((fid, weight));
            } else {
                tolerant.push((fid, weight));
            }
        }

        Ok(Self {
            index,
            txn,
            db_cache: <_>::default(),
@ -81,42 +93,39 @@ impl<'ctx> SearchContext<'ctx> {
|
|||
term_interner: <_>::default(),
|
||||
phrase_docids: <_>::default(),
|
||||
restricted_fids: None,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn searchable_attributes(&mut self, searchable_attributes: &'ctx [String]) -> Result<()> {
|
||||
let fids_map = self.index.fields_ids_map(self.txn)?;
|
||||
let searchable_names = self.index.searchable_fields(self.txn)?;
|
||||
pub fn attributes_to_search_on(
|
||||
&mut self,
|
||||
attributes_to_search_on: &'ctx [String],
|
||||
) -> Result<()> {
|
||||
let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?;
|
||||
let searchable_fields_weights = self.index.searchable_fields_and_weights(self.txn)?;
|
||||
let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?;
|
||||
|
||||
let mut wildcard = false;
|
||||
|
||||
let mut restricted_fids = RestrictedFids::default();
|
||||
let mut contains_wildcard = false;
|
||||
for field_name in searchable_attributes {
|
||||
for field_name in attributes_to_search_on {
|
||||
if field_name == "*" {
|
||||
contains_wildcard = true;
|
||||
wildcard = true;
|
||||
// we cannot early exit as we want to returns error in case of unknown fields
|
||||
continue;
|
||||
}
|
||||
let searchable_contains_name =
|
||||
searchable_names.as_ref().map(|sn| sn.iter().any(|name| name == field_name));
|
||||
let fid = match (fids_map.id(field_name), searchable_contains_name) {
|
||||
let searchable_weight =
|
||||
searchable_fields_weights.iter().find(|(name, _, _)| name == field_name);
|
||||
let (fid, weight) = match searchable_weight {
|
||||
// The Field id exist and the field is searchable
|
||||
(Some(fid), Some(true)) | (Some(fid), None) => fid,
|
||||
// The field is searchable but the Field id doesn't exist => Internal Error
|
||||
(None, Some(true)) => {
|
||||
return Err(FieldIdMapMissingEntry::FieldName {
|
||||
field_name: field_name.to_string(),
|
||||
process: "search",
|
||||
}
|
||||
.into())
|
||||
}
|
||||
// The field is not searchable, but the searchableAttributes are set to * => ignore field
|
||||
(None, None) => continue,
|
||||
Some((_name, fid, weight)) => (*fid, *weight),
|
||||
// The field is not searchable but the user didn't define any searchable attributes
|
||||
None if user_defined_searchable.is_none() => continue,
|
||||
// The field is not searchable => User error
|
||||
(_fid, Some(false)) => {
|
||||
let (valid_fields, hidden_fields) = match searchable_names {
|
||||
Some(sn) => self.index.remove_hidden_fields(self.txn, sn)?,
|
||||
None => self.index.remove_hidden_fields(self.txn, fids_map.names())?,
|
||||
};
|
||||
None => {
|
||||
let (valid_fields, hidden_fields) = self.index.remove_hidden_fields(
|
||||
self.txn,
|
||||
searchable_fields_weights.iter().map(|(name, _, _)| name),
|
||||
)?;
|
||||
|
||||
let field = field_name.to_string();
|
||||
return Err(UserError::InvalidSearchableAttribute {
|
||||
|
@ -129,13 +138,17 @@ impl<'ctx> SearchContext<'ctx> {
|
|||
};
|
||||
|
||||
if exact_attributes_ids.contains(&fid) {
|
||||
restricted_fids.exact.push(fid);
|
||||
restricted_fids.exact.push((fid, weight));
|
||||
} else {
|
||||
restricted_fids.tolerant.push(fid);
|
||||
restricted_fids.tolerant.push((fid, weight));
|
||||
};
|
||||
}
|
||||
|
||||
self.restricted_fids = (!contains_wildcard).then_some(restricted_fids);
|
||||
if wildcard {
|
||||
self.restricted_fids = None;
|
||||
} else {
|
||||
self.restricted_fids = Some(restricted_fids);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@@ -158,13 +171,13 @@ impl Word {

#[derive(Debug, Clone, Default)]
pub struct RestrictedFids {
    pub tolerant: Vec<FieldId>,
    pub exact: Vec<FieldId>,
    pub tolerant: Vec<(FieldId, Weight)>,
    pub exact: Vec<(FieldId, Weight)>,
}

impl RestrictedFids {
    pub fn contains(&self, fid: &FieldId) -> bool {
        self.tolerant.contains(fid) || self.exact.contains(fid)
        self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid)
    }
}
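`RestrictedFids` now carries a `(FieldId, Weight)` pair per restricted field, split into exact and typo-tolerant lists, and `contains` scans both. A standalone mirror of that lookup (the `FieldId`/`Weight` aliases are assumed to be `u16`, as in lib.rs):

    type FieldId = u16;
    type Weight = u16;

    // Fields restricted by `attributesToSearchOn`, split into exact-match and
    // typo-tolerant fields, each keeping the weight it was given.
    #[derive(Debug, Clone, Default)]
    struct RestrictedFids {
        tolerant: Vec<(FieldId, Weight)>,
        exact: Vec<(FieldId, Weight)>,
    }

    impl RestrictedFids {
        fn contains(&self, fid: &FieldId) -> bool {
            self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid)
        }
    }

    fn main() {
        let fids = RestrictedFids { tolerant: vec![(0, 0), (3, 1)], exact: vec![(2, 2)] };
        assert!(fids.contains(&2));
        assert!(!fids.contains(&7));
    }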
|
|
@ -119,7 +119,7 @@ pub fn located_query_terms_from_tokens(
|
|||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
// as we are evaluating a negative operator we put the phrase
|
||||
// in the negative one *but* we don't reset the negative operator
|
||||
// as we are immediatly starting a new negative phrase.
|
||||
// as we are immediately starting a new negative phrase.
|
||||
if negative_phrase {
|
||||
negative_phrases.push(located_query_term);
|
||||
} else {
|
||||
|
@ -366,7 +366,7 @@ mod tests {
|
|||
let tokens = tokenizer.tokenize(".");
|
||||
let index = temp_index_with_documents();
|
||||
let rtxn = index.read_txn()?;
|
||||
let mut ctx = SearchContext::new(&index, &rtxn);
|
||||
let mut ctx = SearchContext::new(&index, &rtxn)?;
|
||||
// panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
|
||||
let ExtractedTokens { query_terms, .. } =
|
||||
located_query_terms_from_tokens(&mut ctx, tokens, None)?;
|
||||
|
|
|
@@ -7,12 +7,12 @@ use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id;
use crate::search::new::SearchContext;
use crate::Result;
use crate::{FieldId, InternalError, Result};

#[derive(Clone, PartialEq, Eq, Hash)]
pub struct FidCondition {
    term: LocatedQueryTermSubset,
    fid: u16,
    fid: Option<FieldId>,
}

pub enum FidGraph {}
@@ -26,13 +26,15 @@ impl RankingRuleGraphTrait for FidGraph {
        universe: &RoaringBitmap,
    ) -> Result<ComputedCondition> {
        let FidCondition { term, .. } = condition;
        // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument
        let mut docids = compute_query_term_subset_docids_within_field_id(
            ctx,
            &term.term_subset,
            condition.fid,
        )?;
        docids &= universe;

        let docids = if let Some(fid) = condition.fid {
            // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument
            let docids =
                compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?;
            docids & universe
        } else {
            RoaringBitmap::new()
        };

        Ok(ComputedCondition {
            docids,
@@ -68,34 +70,29 @@ impl RankingRuleGraphTrait for FidGraph {
            all_fields.extend(fields);
        }

        let weights_map = ctx.index.fieldids_weights_map(ctx.txn)?;

        let mut edges = vec![];
        for fid in all_fields.iter().copied() {
            let weight = weights_map
                .weight(fid)
                .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?;
            edges.push((
                fid as u32 * term.term_ids.len() as u32,
                conditions_interner.insert(FidCondition { term: term.clone(), fid }),
                weight as u32 * term.term_ids.len() as u32,
                conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }),
            ));
        }

        // always lookup the max_fid if we don't already and add an artificial condition for max scoring
        let max_fid: Option<u16> = {
            if let Some(max_fid) = ctx
                .index
                .searchable_fields_ids(ctx.txn)?
                .map(|field_ids| field_ids.into_iter().max())
            {
                max_fid
            } else {
                ctx.index.fields_ids_map(ctx.txn)?.ids().max()
            }
        };
        let max_weight: Option<u16> = weights_map.max_weight();

        if let Some(max_fid) = max_fid {
            if !all_fields.contains(&max_fid) {
        if let Some(max_weight) = max_weight {
            if !all_fields.contains(&max_weight) {
                edges.push((
                    max_fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
                    max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
                    conditions_interner.insert(FidCondition {
                        term: term.clone(), // TODO remove this ugly clone
                        fid: max_fid,
                        fid: None,
                    }),
                ));
            }
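The change above swaps the attribute ranking rule's edge cost from `fid * term_ids.len()` to `weight * term_ids.len()`, so the cost now follows the order of the searchable attributes rather than the raw field id. A standalone sketch of that cost computation with hypothetical weights:

    type FieldId = u16;
    type Weight = u16;

    // Edge cost as used in the hunk above: a lower weight (higher-priority attribute)
    // yields a cheaper edge for the same number of query terms.
    fn edge_cost(weight: Weight, term_ids_len: usize) -> u32 {
        weight as u32 * term_ids_len as u32
    }

    fn main() {
        // Hypothetical weights: "name" -> 0, "realName" -> 1, for a 2-term query.
        let weights: &[(FieldId, Weight)] = &[(0, 0), (3, 1)];
        let costs: Vec<u32> = weights.iter().map(|&(_, w)| edge_cost(w, 2)).collect();
        assert_eq!(costs, vec![0, 2]);
    }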
|
|
@ -1,5 +1,5 @@
|
|||
use crate::index::tests::TempIndex;
|
||||
use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy};
|
||||
use crate::{db_snap, Criterion, Search, SearchResult, TermsMatchingStrategy};
|
||||
|
||||
fn create_index() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
@ -131,6 +131,19 @@ fn test_attribute_fid_simple() {
|
|||
#[test]
|
||||
fn test_attribute_fid_ngrams() {
|
||||
let index = create_index();
|
||||
db_snap!(index, fields_ids_map, @r###"
|
||||
0 id |
|
||||
1 title |
|
||||
2 description |
|
||||
3 plot |
|
||||
"###);
|
||||
db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###);
|
||||
db_snap!(index, fieldids_weights_map, @r###"
|
||||
fid weight
|
||||
1 0 |
|
||||
2 1 |
|
||||
3 2 |
|
||||
"###);
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
|
|
|
@ -0,0 +1,244 @@
|
|||
---
|
||||
source: milli/src/search/new/tests/attribute_fid.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
2,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 91,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 14,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 79,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 13,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 77,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 12,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 83,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
9,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 11,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 75,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 79,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 73,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
11,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 7,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 77,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
10,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
13,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
12,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 78,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
14,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 75,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 91,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
|
@ -308,6 +308,25 @@ pub fn snap_fields_ids_map(index: &Index) -> String {
|
|||
}
|
||||
snap
|
||||
}
|
||||
pub fn snap_fieldids_weights_map(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let weights_map = index.fieldids_weights_map(&rtxn).unwrap();
|
||||
|
||||
let mut snap = String::new();
|
||||
writeln!(&mut snap, "fid weight").unwrap();
|
||||
let mut field_ids: Vec<_> = weights_map.ids().collect();
|
||||
field_ids.sort();
|
||||
for field_id in field_ids {
|
||||
let weight = weights_map.weight(field_id).unwrap();
|
||||
writeln!(&mut snap, "{field_id:<3} {weight:<3} |").unwrap();
|
||||
}
|
||||
snap
|
||||
}
|
||||
pub fn snap_searchable_fields(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let searchable_fields = index.searchable_fields(&rtxn).unwrap();
|
||||
format!("{searchable_fields:?}")
|
||||
}
|
||||
pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap();
|
||||
|
@ -469,6 +488,12 @@ macro_rules! full_snap_of_db {
|
|||
($index:ident, fields_ids_map) => {{
|
||||
$crate::snapshot_tests::snap_fields_ids_map(&$index)
|
||||
}};
|
||||
($index:ident, fieldids_weights_map) => {{
|
||||
$crate::snapshot_tests::snap_fieldids_weights_map(&$index)
|
||||
}};
|
||||
($index:ident, searchable_fields) => {{
|
||||
$crate::snapshot_tests::snap_searchable_fields(&$index)
|
||||
}};
|
||||
($index:ident, geo_faceted_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index)
|
||||
}};
|
||||
|
|
|
@ -21,8 +21,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
|
|||
name = "clear_documents"
|
||||
)]
|
||||
pub fn execute(self) -> Result<u64> {
|
||||
puffin::profile_function!();
|
||||
|
||||
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
|
||||
let Index {
|
||||
env: _env,
|
||||
|
|
|
@ -499,7 +499,7 @@ impl FacetsUpdateIncrementalInner {
|
|||
ModificationResult::Expand | ModificationResult::Reduce { .. }
|
||||
)
|
||||
{
|
||||
// if any modification occured, insert it in the database.
|
||||
// if any modification occurred, insert it in the database.
|
||||
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
|
||||
Ok(insertion_key_modification)
|
||||
} else {
|
||||
|
|
|
@ -379,7 +379,7 @@ pub(crate) mod test_helpers {
|
|||
let mut options = heed::EnvOpenOptions::new();
|
||||
let options = options.map_size(4096 * 4 * 1000 * 100);
|
||||
let tempdir = tempfile::TempDir::new().unwrap();
|
||||
let env = options.open(tempdir.path()).unwrap();
|
||||
let env = unsafe { options.open(tempdir.path()) }.unwrap();
|
||||
let mut wtxn = env.write_txn().unwrap();
|
||||
let content = env.create_database(&mut wtxn, None).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
|
|
@ -29,8 +29,6 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
|||
autogenerate_docids: bool,
|
||||
reader: DocumentsBatchReader<R>,
|
||||
) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||
|
||||
let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
|
||||
|
|
|
@ -29,8 +29,6 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||
settings_diff: &InnerIndexSettingsDiff,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
@ -186,7 +184,7 @@ fn searchable_fields_changed(
|
|||
) -> bool {
|
||||
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||
if searchable_fields.contains(&field_id) {
|
||||
let del_add = KvReaderDelAdd::new(field_bytes);
|
||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
|
@ -298,7 +296,7 @@ fn lang_safe_tokens_from_document<'a>(
|
|||
/// Extract words mapped with their positions of a document.
|
||||
fn tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<Vec<FieldId>>,
|
||||
searchable_fields: &[FieldId],
|
||||
tokenizer: &Tokenizer,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
|
@ -309,7 +307,7 @@ fn tokens_from_document<'a>(
|
|||
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
// if field is searchable.
|
||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||
if searchable_fields.as_ref().contains(&field_id) {
|
||||
// extract deletion or addition only.
|
||||
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
|
||||
// parse json.
|
||||
|
|
|
@ -23,8 +23,6 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
|||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_number_docids_sorter = create_sorter(
|
||||
|
|
|
@ -28,8 +28,6 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
|||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let options = NormalizerOption { lossy: true, ..Default::default() };
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ pub struct ExtractedFacetValues {
|
|||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid the fid and the orginal value as key
|
||||
/// Returns the generated grenad reader containing the docid the fid and the original value as key
|
||||
/// and the normalized value as value extracted from the given chunk of documents.
|
||||
/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
|
@ -46,8 +46,6 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<ExtractedFacetValues> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
|
|
|
@ -26,8 +26,6 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
|||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
|
|
|
@ -21,8 +21,6 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
|||
primary_key_id: FieldId,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
|
|
|
@ -10,16 +10,16 @@ use bytemuck::cast_slice;
|
|||
use grenad::Writer;
|
||||
use itertools::EitherOrBoth;
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde_json::{from_slice, Value};
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::UserError;
|
||||
use crate::prompt::Prompt;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::try_split_at;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
|
||||
use crate::vector::Embedder;
|
||||
use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors};
|
||||
use crate::{DocumentId, Result, ThreadPoolNoAbort};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
|
@ -31,6 +31,10 @@ pub struct ExtractedVectorPoints {
|
|||
pub remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> prompt
|
||||
pub prompts: grenad::Reader<BufReader<File>>,
|
||||
|
||||
// embedder
|
||||
pub embedder_name: String,
|
||||
pub embedder: Arc<Embedder>,
|
||||
}
|
||||
|
||||
enum VectorStateDelta {
|
||||
|
@ -65,6 +69,19 @@ impl VectorStateDelta {
|
|||
}
|
||||
}
|
||||
|
||||
struct EmbedderVectorExtractor {
|
||||
embedder_name: String,
|
||||
embedder: Arc<Embedder>,
|
||||
prompt: Arc<Prompt>,
|
||||
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
manual_vectors_writer: Writer<BufWriter<File>>,
|
||||
// (docid) -> (prompt)
|
||||
prompts_writer: Writer<BufWriter<File>>,
|
||||
// (docid) -> ()
|
||||
remove_vectors_writer: Writer<BufWriter<File>>,
|
||||
}
|
||||
|
||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
|
||||
|
@ -73,34 +90,52 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
prompt: &Prompt,
|
||||
embedder_name: &str,
|
||||
) -> Result<ExtractedVectorPoints> {
|
||||
puffin::profile_function!();
|
||||
) -> Result<Vec<ExtractedVectorPoints>> {
|
||||
let reindex_vectors = settings_diff.reindex_vectors();
|
||||
|
||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||
// the vector field id may have changed
|
||||
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
// filter the old vector fid if the settings have been changed, forcing reindexing.
|
||||
let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors);
|
||||
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let mut manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
// (docid) -> (prompt)
|
||||
let mut prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
let mut extractors = Vec::new();
|
||||
for (embedder_name, (embedder, prompt)) in
|
||||
settings_diff.new.embedding_configs.clone().into_iter()
|
||||
{
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let mut remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
// (docid) -> (prompt)
|
||||
let prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
extractors.push(EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
manual_vectors_writer,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
});
|
||||
}
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
|
@ -114,152 +149,138 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
// since we only needs the primary key when we throw an error we create this getter to
|
||||
// since we only need the primary key when we throw an error we create this getter to
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
// the vector field id may have changed
|
||||
let old_vectors_fid = old_fields_ids_map.id("_vectors");
|
||||
// filter the old vector fid if the settings have been changed, forcing reindexing.
|
||||
let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
|
||||
let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid)
|
||||
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
|
||||
|
||||
let new_vectors_fid = new_fields_ids_map.id("_vectors");
|
||||
let vectors_field = {
|
||||
let del = old_vectors_fid
|
||||
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
let add = new_vectors_fid
|
||||
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
(del, add)
|
||||
};
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder: _,
|
||||
prompt,
|
||||
manual_vectors_writer,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
} in extractors.iter_mut()
|
||||
{
|
||||
let delta = match parsed_vectors.remove(embedder_name) {
|
||||
(Some(old), Some(new)) => {
|
||||
// no autogeneration
|
||||
let del_vectors = old.into_array_of_vectors();
|
||||
let add_vectors = new.into_array_of_vectors();
|
||||
|
||||
let (del_map, add_map) = vectors_field;
|
||||
|
||||
let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
|
||||
let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
|
||||
|
||||
let delta = match (del_value, add_value) {
|
||||
(Some(old), Some(new)) => {
|
||||
// no autogeneration
|
||||
let del_vectors = extract_vectors(old, document_id, embedder_name)?;
|
||||
let add_vectors = extract_vectors(new, document_id, embedder_name)?;
|
||||
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
)));
|
||||
}
|
||||
|
||||
VectorStateDelta::ManualDelta(del_vectors, add_vectors)
|
||||
}
|
||||
(Some(_old), None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
// becomes autogenerated
|
||||
VectorStateDelta::NowGenerated(prompt.render(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
new_fields_ids_map,
|
||||
)?)
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
(None, Some(new)) => {
|
||||
// was possibly autogenerated, remove all vectors for that document
|
||||
let add_vectors = extract_vectors(new, document_id, embedder_name)?;
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
)));
|
||||
}
|
||||
|
||||
VectorStateDelta::WasGeneratedNowManual(add_vectors)
|
||||
}
|
||||
(None, None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt = Some(prompt)
|
||||
// TODO: this filter works because we erase the vec database when an embedding setting changes.
|
||||
// When the vector pipeline is optimized, this should be removed.
|
||||
.filter(|_| !settings_diff.reindex_vectors())
|
||||
.map(|p| {
|
||||
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
|
||||
});
|
||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
if old_prompt.as_ref() != Some(&new_prompt) {
|
||||
let old_prompt = old_prompt.unwrap_or_default();
|
||||
tracing::trace!(
|
||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||
);
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
tracing::trace!("⏭️ Prompt unmodified, skipping");
|
||||
VectorStateDelta::NoChange
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// and we finally push the unique vectors into the writer
|
||||
push_vectors_diff(
|
||||
&mut remove_vectors_writer,
|
||||
&mut prompts_writer,
|
||||
&mut manual_vectors_writer,
|
||||
&mut key_buffer,
|
||||
delta,
|
||||
settings_diff,
|
||||
)?;
|
||||
VectorStateDelta::ManualDelta(del_vectors, add_vectors)
|
||||
}
|
||||
(Some(_old), None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
// becomes autogenerated
|
||||
VectorStateDelta::NowGenerated(prompt.render(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
new_fields_ids_map,
|
||||
)?)
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
(None, Some(new)) => {
|
||||
// was possibly autogenerated, remove all vectors for that document
|
||||
let add_vectors = new.into_array_of_vectors();
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
)));
|
||||
}
|
||||
|
||||
VectorStateDelta::WasGeneratedNowManual(add_vectors)
|
||||
}
|
||||
(None, None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt = Some(&prompt)
|
||||
// TODO: this filter works because we erase the vec database when an embedding setting changes.
|
||||
// When the vector pipeline is optimized, this should be removed.
|
||||
.filter(|_| !settings_diff.reindex_vectors())
|
||||
.map(|p| {
|
||||
p.render(obkv, DelAdd::Deletion, old_fields_ids_map)
|
||||
.unwrap_or_default()
|
||||
});
|
||||
let new_prompt =
|
||||
prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
if old_prompt.as_ref() != Some(&new_prompt) {
|
||||
let old_prompt = old_prompt.unwrap_or_default();
|
||||
tracing::trace!(
|
||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||
);
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
tracing::trace!("⏭️ Prompt unmodified, skipping");
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// and we finally push the unique vectors into the writer
|
||||
push_vectors_diff(
|
||||
remove_vectors_writer,
|
||||
prompts_writer,
|
||||
manual_vectors_writer,
|
||||
&mut key_buffer,
|
||||
delta,
|
||||
reindex_vectors,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ExtractedVectorPoints {
|
||||
// docid, _index -> KvWriterDelAdd -> Vector
|
||||
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
||||
// docid -> ()
|
||||
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
||||
// docid -> prompt
|
||||
prompts: writer_into_reader(prompts_writer)?,
|
||||
})
|
||||
}
|
||||
let mut results = Vec::new();
|
||||
|
||||
fn to_vector_map(
|
||||
obkv: KvReaderDelAdd,
|
||||
side: DelAdd,
|
||||
document_id: &impl Fn() -> Value,
|
||||
) -> Result<Option<serde_json::Map<String, Value>>> {
|
||||
Ok(if let Some(value) = obkv.get(side) {
|
||||
let Ok(value) = from_slice(value) else {
|
||||
let value = from_slice(value).map_err(InternalError::SerdeJson)?;
|
||||
return Err(crate::Error::UserError(UserError::InvalidVectorsMapType {
|
||||
document_id: document_id(),
|
||||
value,
|
||||
}));
|
||||
};
|
||||
Some(value)
|
||||
} else {
|
||||
None
|
||||
})
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt: _,
|
||||
manual_vectors_writer,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
} in extractors
|
||||
{
|
||||
results.push(ExtractedVectorPoints {
|
||||
// docid, _index -> KvWriterDelAdd -> Vector
|
||||
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
||||
// docid -> ()
|
||||
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
||||
// docid -> prompt
|
||||
prompts: writer_into_reader(prompts_writer)?,
|
||||
|
||||
embedder,
|
||||
embedder_name,
|
||||
})
|
||||
}
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
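Read as a whole, the per-embedder match in the hunk above behaves like a small state machine over the (old, new) presence of user-provided vectors. A hedged sketch of that mapping, with the prompt re-rendering and the u8::MAX vector-count check left out (the enum and function below are illustrative, not the crate's API):

// Simplified restatement of the decision taken for each embedder above.
enum Delta {
    ManualDelta,           // user provided vectors on both sides
    WasGeneratedNowManual, // vectors were generated, user now provides them
    NowGenerated,          // vectors must be (re)generated from the prompt
    NowRemoved,            // the document is gone, drop its vectors
    NoChange,              // prompt unchanged, keep the stored embeddings
}

fn classify(old: Option<&[Vec<f32>]>, new: Option<&[Vec<f32>]>, document_kept: bool) -> Delta {
    match (old, new) {
        (Some(_), Some(_)) => Delta::ManualDelta,
        (Some(_), None) if document_kept => Delta::NowGenerated,
        (Some(_), None) => Delta::NowRemoved,
        (None, Some(_)) => Delta::WasGeneratedNowManual,
        // in the real code this arm re-renders the prompt and only yields
        // NowGenerated when the rendered prompt actually changed
        (None, None) if document_kept => Delta::NoChange,
        (None, None) => Delta::NowRemoved,
    }
}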
/// Computes the diff between both Del and Add numbers and
|
||||
|
@ -270,14 +291,13 @@ fn push_vectors_diff(
|
|||
manual_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
delta: VectorStateDelta,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
reindex_vectors: bool,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
|
||||
if must_remove
|
||||
// TODO: the below condition works because we erase the vec database when an embedding setting changes.
|
||||
// When the vector pipeline is optimized, this should be removed.
|
||||
&& !settings_diff.reindex_vectors()
|
||||
&& !reindex_vectors
|
||||
{
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
remove_vectors_writer.insert(&key_buffer, [])?;
|
||||
|
@ -308,7 +328,7 @@ fn push_vectors_diff(
|
|||
EitherOrBoth::Left(vector) => {
|
||||
// TODO: the below condition works because we erase the vec database when an embedding setting changes.
|
||||
// When the vector pipeline is optimized, this should be removed.
|
||||
if !settings_diff.reindex_vectors() {
|
||||
if !reindex_vectors {
|
||||
// We insert only the Del part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
|
@ -336,26 +356,6 @@ fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
|
|||
a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
|
||||
}
|
||||
|
||||
/// Extracts the vectors from a JSON value.
|
||||
fn extract_vectors(
|
||||
value: Value,
|
||||
document_id: impl Fn() -> Value,
|
||||
name: &str,
|
||||
) -> Result<Vec<Vec<f32>>> {
|
||||
// FIXME: ugly clone of the vectors here
|
||||
match serde_json::from_value(value.clone()) {
|
||||
Ok(vectors) => {
|
||||
Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors).unwrap_or_default())
|
||||
}
|
||||
Err(_) => Err(UserError::InvalidVectorsType {
|
||||
document_id: document_id(),
|
||||
value,
|
||||
subfield: name.to_owned(),
|
||||
}
|
||||
.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
// docid, prompt
|
||||
|
@ -364,7 +364,6 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
|
|||
embedder: Arc<Embedder>,
|
||||
request_threads: &ThreadPoolNoAbort,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
|
||||
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk
|
||||
|
||||
|
|
|
@ -36,8 +36,6 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
|||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_fid_docids_sorter = create_sorter(
|
||||
|
@ -167,8 +165,6 @@ fn words_into_sorter(
|
|||
add_words: &BTreeSet<Vec<u8>>,
|
||||
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
|
|
|
@ -26,7 +26,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||
|
||||
|
@ -71,8 +70,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||
|
||||
// if the current document changes, we fill the sorter
|
||||
if current_document_id.map_or(false, |id| id != document_id) {
|
||||
puffin::profile_scope!("Document into sorter");
|
||||
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter");
|
||||
let _entered = span.enter();
|
||||
|
@ -163,7 +160,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
puffin::profile_scope!("Final document into sorter");
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "final_document_into_sorter");
|
||||
let _entered = span.enter();
|
||||
|
@ -176,7 +172,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||
)?;
|
||||
}
|
||||
{
|
||||
puffin::profile_scope!("sorter_into_reader");
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "sorter_into_reader");
|
||||
let _entered = span.enter();
|
||||
|
|
|
@ -25,8 +25,6 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
|||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_position_docids_sorter = create_sorter(
|
||||
|
@ -104,8 +102,6 @@ fn words_position_into_sorter(
|
|||
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
|
|
|
@ -46,8 +46,6 @@ pub(crate) fn data_from_obkv_documents(
|
|||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
original_obkv_chunks
|
||||
|
@ -88,7 +86,6 @@ pub(crate) fn data_from_obkv_documents(
|
|||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
"field-id-wordcount-docids",
|
||||
);
|
||||
run_extraction_task::<
|
||||
_,
|
||||
|
@ -115,7 +112,6 @@ pub(crate) fn data_from_obkv_documents(
|
|||
word_fid_docids_reader,
|
||||
}
|
||||
},
|
||||
"word-docids",
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
|
@ -125,7 +121,6 @@ pub(crate) fn data_from_obkv_documents(
|
|||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
TypedChunk::WordPositionDocids,
|
||||
"word-position-docids",
|
||||
);
|
||||
|
||||
run_extraction_task::<
|
||||
|
@ -139,7 +134,6 @@ pub(crate) fn data_from_obkv_documents(
|
|||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
"field-id-facet-string-docids",
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
|
@ -149,7 +143,6 @@ pub(crate) fn data_from_obkv_documents(
|
|||
lmdb_writer_sx.clone(),
|
||||
extract_facet_number_docids,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
"field-id-facet-number-docids",
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
|
@ -159,7 +152,6 @@ pub(crate) fn data_from_obkv_documents(
|
|||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -183,7 +175,6 @@ fn run_extraction_task<FE, FS, M>(
|
|||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
extract_fn: FE,
|
||||
serialize_fn: FS,
|
||||
name: &'static str,
|
||||
) where
|
||||
FE: Fn(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
|
@ -201,7 +192,7 @@ fn run_extraction_task<FE, FS, M>(
|
|||
rayon::spawn(move || {
|
||||
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
|
||||
let _entered = child_span.enter();
|
||||
puffin::profile_scope!("extract_multiple_chunks", name);
|
||||
|
||||
match extract_fn(chunk, indexer, &settings_diff) {
|
||||
Ok(chunk) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||
|
@ -224,27 +215,31 @@ fn send_original_documents_data(
|
|||
let original_documents_chunk =
|
||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
|
||||
let documents_chunk_cloned = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
|
||||
let request_threads = ThreadPoolNoAbortBuilder::new()
|
||||
.num_threads(crate::vector::REQUEST_PARALLELISM)
|
||||
.thread_name(|index| format!("embedding-request-{index}"))
|
||||
.build()?;
|
||||
|
||||
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
|
||||
let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
|
||||
// no point in indexing vectors without embedders
|
||||
&& (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
|
||||
|
||||
if index_vectors {
|
||||
let settings_diff = settings_diff.clone();
|
||||
|
||||
let original_documents_chunk = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
|
||||
let result = extract_vector_points(
|
||||
documents_chunk_cloned.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
&prompt,
|
||||
&name,
|
||||
);
|
||||
match result {
|
||||
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
|
||||
match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) {
|
||||
Ok(extracted_vectors) => {
|
||||
for ExtractedVectorPoints {
|
||||
manual_vectors,
|
||||
remove_vectors,
|
||||
prompts,
|
||||
embedder_name,
|
||||
embedder,
|
||||
} in extracted_vectors
|
||||
{
|
||||
let embeddings = match extract_embeddings(
|
||||
prompts,
|
||||
indexer,
|
||||
|
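The new `index_vectors` condition above gates the whole vector pipeline; as a rough restatement (a sketch under the assumption that only these three signals matter, with illustrative names):

// Vectors are extracted when documents changed or a vector-related setting
// changed, and only if at least one embedder is actually configured.
fn should_index_vectors(
    reindex_vectors: bool,
    settings_update_only: bool,
    has_embedders: bool,
) -> bool {
    (reindex_vectors || !settings_update_only) && has_embedders
}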
@ -253,28 +248,26 @@ fn send_original_documents_data(
|
|||
) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
if !(remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
||||
{
|
||||
let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension: embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name: name,
|
||||
embedder_name,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
}
|
||||
}
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
}
|
||||
}
|
||||
});
|
||||
|
|
|
@ -61,7 +61,6 @@ pub fn sorter_into_reader(
|
|||
sorter: grenad::Sorter<MergeFn>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
|
@ -182,8 +181,6 @@ where
|
|||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ mod typed_chunk;
|
|||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::io::{Read, Seek};
|
||||
use std::iter;
|
||||
use std::num::NonZeroU32;
|
||||
use std::result::Result as StdResult;
|
||||
use std::sync::Arc;
|
||||
|
@ -140,8 +141,6 @@ where
|
|||
mut self,
|
||||
reader: DocumentsBatchReader<R>,
|
||||
) -> Result<(Self, StdResult<u64, UserError>)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
// Early return when there is no document to add
|
||||
if reader.is_empty() {
|
||||
return Ok((self, Ok(0)));
|
||||
|
@ -186,8 +185,6 @@ where
|
|||
mut self,
|
||||
to_delete: Vec<String>,
|
||||
) -> Result<(Self, StdResult<u64, UserError>)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
// Early return when there is no document to add
|
||||
if to_delete.is_empty() {
|
||||
// Maintains Invariant: remove documents actually always returns Ok for the inner result
|
||||
|
@ -222,8 +219,6 @@ where
|
|||
mut self,
|
||||
to_delete: &RoaringBitmap,
|
||||
) -> Result<(Self, u64)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
// Early return when there is no document to add
|
||||
if to_delete.is_empty() {
|
||||
return Ok((self, 0));
|
||||
|
@ -248,8 +243,6 @@ where
|
|||
name = "index_documents"
|
||||
)]
|
||||
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
|
||||
puffin::profile_function!();
|
||||
|
||||
if self.added_documents == 0 && self.deleted_documents == 0 {
|
||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
|
||||
|
@ -278,8 +271,6 @@ where
|
|||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
FA: Fn() -> bool + Sync,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
let TransformOutput {
|
||||
primary_key,
|
||||
mut settings_diff,
|
||||
|
@ -337,7 +328,10 @@ where
|
|||
let min_chunk_size = 1024 * 512; // 512KiB
|
||||
|
||||
// compute the chunk size from the number of available threads and the input data size.
|
||||
let total_size = flattened_documents.metadata().map(|m| m.len());
|
||||
let total_size = match flattened_documents.as_ref() {
|
||||
Some(flattened_documents) => flattened_documents.metadata().map(|m| m.len()),
|
||||
None => Ok(default_chunk_size as u64),
|
||||
};
|
||||
let current_num_threads = pool.current_num_threads();
|
||||
// if we have more than 2 threads, create a number of chunks equal to 3/4 of the thread count
|
||||
let chunk_count = if current_num_threads > 2 {
|
||||
|
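The chunk sizing above can be pictured with a small, self-contained sketch; the 512KiB floor comes from the hunk itself, while the exact clamping done by the real code is assumed here (names are illustrative):

// Derive a per-chunk byte budget from the input size and the rayon pool size.
fn documents_chunk_size(total_size: u64, current_num_threads: usize) -> u64 {
    let min_chunk_size: u64 = 1024 * 512; // 512KiB, as in the hunk above
    // with more than 2 threads, aim for a number of chunks equal to 3/4 of them
    let chunk_count = if current_num_threads > 2 {
        (current_num_threads * 3 / 4).max(1)
    } else {
        current_num_threads.max(1)
    };
    (total_size / chunk_count as u64).max(min_chunk_size)
}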
@ -351,8 +345,14 @@ where
|
|||
}
|
||||
};
|
||||
|
||||
let original_documents = grenad::Reader::new(original_documents)?;
|
||||
let flattened_documents = grenad::Reader::new(flattened_documents)?;
|
||||
let original_documents = match original_documents {
|
||||
Some(original_documents) => Some(grenad::Reader::new(original_documents)?),
|
||||
None => None,
|
||||
};
|
||||
let flattened_documents = match flattened_documents {
|
||||
Some(flattened_documents) => Some(grenad::Reader::new(flattened_documents)?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
|
||||
|
||||
|
@ -371,15 +371,23 @@ where
|
|||
pool.install(|| {
|
||||
rayon::spawn(move || {
|
||||
let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks");
|
||||
let _enter = child_span.enter();
|
||||
puffin::profile_scope!("extract_and_send_grenad_chunks");
|
||||
// split obkv file into several chunks
|
||||
let original_chunk_iter =
|
||||
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
|
||||
let _enter = child_span.enter();
|
||||
|
||||
// split obkv file into several chunks
|
||||
let flattened_chunk_iter =
|
||||
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
|
||||
let original_chunk_iter = match original_documents {
|
||||
Some(original_documents) => {
|
||||
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size).map(either::Left)
|
||||
},
|
||||
None => Ok(either::Right(iter::empty())),
|
||||
};
|
||||
|
||||
// split obkv file into several chunks
|
||||
let flattened_chunk_iter = match flattened_documents {
|
||||
Some(flattened_documents) => {
|
||||
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size).map(either::Left)
|
||||
},
|
||||
None => Ok(either::Right(iter::empty())),
|
||||
};
|
||||
|
||||
let result = original_chunk_iter.and_then(|original_chunk| {
|
||||
let flattened_chunk = flattened_chunk_iter?;
|
||||
|
@ -533,7 +541,7 @@ where
|
|||
let writer_index = (embedder_index as u16) << 8;
|
||||
for k in 0..=u8::MAX {
|
||||
let writer =
|
||||
arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension)?;
|
||||
arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension);
|
||||
if writer.is_empty(wtxn)? {
|
||||
break;
|
||||
}
|
||||
|
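The `(embedder_index as u16) << 8` arithmetic above packs two identifiers into one 16-bit arroy index id: the embedder index in the high byte and the per-embedder store number `k` in the low byte. A minimal sketch of that packing:

// Compose and decompose the 16-bit arroy writer index used above.
fn arroy_store_id(embedder_index: u8, k: u8) -> u16 {
    ((embedder_index as u16) << 8) | (k as u16)
}

fn split_arroy_store_id(id: u16) -> (u8, u8) {
    ((id >> 8) as u8, (id & 0xff) as u8)
}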
@ -571,8 +579,6 @@ where
|
|||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
FA: Fn() -> bool + Sync,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
// Merged databases have already been indexed; we start from this count.
|
||||
let mut databases_seen = MERGED_DATABASE_COUNT;
|
||||
|
||||
|
@ -616,7 +622,6 @@ where
|
|||
{
|
||||
let span = tracing::trace_span!(target: "indexing::details", "compute_prefix_diffs");
|
||||
let _entered = span.enter();
|
||||
puffin::profile_scope!("compute_prefix_diffs");
|
||||
|
||||
current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||
|
||||
|
@ -756,8 +761,6 @@ fn execute_word_prefix_docids(
|
|||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
|
||||
builder.chunk_compression_type = indexer_config.chunk_compression_type;
|
||||
builder.chunk_compression_level = indexer_config.chunk_compression_level;
|
||||
|
@ -3237,6 +3240,7 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "all-tokenizations")]
|
||||
fn stored_detected_script_and_language_should_not_return_deleted_documents() {
|
||||
use charabia::{Language, Script};
|
||||
let index = TempIndex::new();
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
use std::borrow::Cow;
|
||||
use std::collections::btree_map::Entry as BEntry;
|
||||
use std::collections::hash_map::Entry as HEntry;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek};
|
||||
|
||||
|
@ -20,21 +20,21 @@ use super::{IndexDocumentsMethod, IndexerConfig};
|
|||
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::index::{db_name, main_key};
|
||||
use crate::update::del_add::{
|
||||
del_add_from_two_obkvs, into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd,
|
||||
};
|
||||
use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
|
||||
use crate::update::index_documents::GrenadParameters;
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result};
|
||||
use crate::{
|
||||
is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result,
|
||||
};
|
||||
|
||||
pub struct TransformOutput {
|
||||
pub primary_key: String,
|
||||
pub settings_diff: InnerIndexSettingsDiff,
|
||||
pub field_distribution: FieldDistribution,
|
||||
pub documents_count: usize,
|
||||
pub original_documents: File,
|
||||
pub flattened_documents: File,
|
||||
pub original_documents: Option<File>,
|
||||
pub flattened_documents: Option<File>,
|
||||
}
|
||||
|
||||
/// Extract the external ids, deduplicate and compute the new internal documents ids
|
||||
|
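With `original_documents` and `flattened_documents` now `Option<File>`, every consumer in this diff has to branch before building a reader. A hedged sketch of that pattern (a `match` keeps `?` usable with a fallible constructor; `map(..).transpose()` would be an equivalent alternative):

use std::fs::File;
use std::io;
use std::path::Path;

// Open the optional documents file only when it was produced by the transform.
fn open_optional(path: Option<&Path>) -> io::Result<Option<File>> {
    Ok(match path {
        Some(path) => Some(File::open(path)?),
        None => None,
    })
}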
@ -161,8 +161,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
FA: Fn() -> bool + Sync,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
|
||||
let external_documents_ids = self.index.external_documents_ids();
|
||||
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
|
||||
|
@ -375,8 +373,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
where
|
||||
FA: Fn() -> bool + Sync,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
// there may be duplicates in the documents to remove.
|
||||
to_remove.sort_unstable();
|
||||
to_remove.dedup();
|
||||
|
@ -466,8 +462,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
where
|
||||
FA: Fn() -> bool + Sync,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
let mut documents_deleted = 0;
|
||||
let mut document_sorter_value_buffer = Vec::new();
|
||||
let mut document_sorter_key_buffer = Vec::new();
|
||||
|
@ -686,8 +680,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
let primary_key = self
|
||||
.index
|
||||
.primary_key(wtxn)?
|
||||
|
@ -808,11 +800,15 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
})?;
|
||||
|
||||
let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?;
|
||||
let fields_ids_map = self.fields_ids_map;
|
||||
let primary_key_id = self.index.primary_key(wtxn)?.and_then(|name| fields_ids_map.id(name));
|
||||
let mut new_inner_settings = old_inner_settings.clone();
|
||||
new_inner_settings.fields_ids_map = self.fields_ids_map;
|
||||
new_inner_settings.fields_ids_map = fields_ids_map;
|
||||
|
||||
let settings_diff = InnerIndexSettingsDiff {
|
||||
old: old_inner_settings,
|
||||
new: new_inner_settings,
|
||||
primary_key_id,
|
||||
embedding_configs_updated: false,
|
||||
settings_update_only: false,
|
||||
};
|
||||
|
@ -822,10 +818,12 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
settings_diff,
|
||||
field_distribution,
|
||||
documents_count: self.documents_count,
|
||||
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
|
||||
flattened_documents: flattened_documents
|
||||
.into_inner()
|
||||
.map_err(|err| err.into_error())?,
|
||||
original_documents: Some(
|
||||
original_documents.into_inner().map_err(|err| err.into_error())?,
|
||||
),
|
||||
flattened_documents: Some(
|
||||
flattened_documents.into_inner().map_err(|err| err.into_error())?,
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -835,34 +833,66 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
fn rebind_existing_document(
|
||||
old_obkv: KvReader<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
original_obkv_buffer: &mut Vec<u8>,
|
||||
flattened_obkv_buffer: &mut Vec<u8>,
|
||||
modified_faceted_fields: &HashSet<String>,
|
||||
original_obkv_buffer: Option<&mut Vec<u8>>,
|
||||
flattened_obkv_buffer: Option<&mut Vec<u8>>,
|
||||
) -> Result<()> {
|
||||
let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone();
|
||||
let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone();
|
||||
// Always keep the primary key.
|
||||
let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) };
|
||||
|
||||
// If only the `searchableAttributes` has been changed, keep only the searchable fields.
|
||||
let must_reindex_searchables = settings_diff.reindex_searchable();
|
||||
let necessary_searchable_field = |id: FieldId| -> bool {
|
||||
must_reindex_searchables
|
||||
&& (settings_diff.old.searchable_fields_ids.contains(&id)
|
||||
|| settings_diff.new.searchable_fields_ids.contains(&id))
|
||||
};
|
||||
|
||||
// If only a faceted field has been added, keep only this field.
|
||||
let must_reindex_facets = settings_diff.reindex_facets();
|
||||
let necessary_faceted_field = |id: FieldId| -> bool {
|
||||
let field_name = settings_diff.new.fields_ids_map.name(id).unwrap();
|
||||
must_reindex_facets
|
||||
&& modified_faceted_fields
|
||||
.iter()
|
||||
.any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long))
|
||||
};
|
||||
|
||||
// Always provide all fields when vectors are involved because
|
||||
// we need the fields for the prompt/templating.
|
||||
let reindex_vectors = settings_diff.reindex_vectors();
|
||||
|
||||
let mut obkv_writer = KvWriter::<_, FieldId>::memory();
|
||||
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
|
||||
for (id, name) in new_fields_ids_map.iter() {
|
||||
if let Some(val) = old_fields_ids_map.id(name).and_then(|id| old_obkv.get(id)) {
|
||||
for (id, val) in old_obkv.iter() {
|
||||
if is_primary_key(id)
|
||||
|| necessary_searchable_field(id)
|
||||
|| necessary_faceted_field(id)
|
||||
|| reindex_vectors
|
||||
{
|
||||
obkv_writer.insert(id, val)?;
|
||||
}
|
||||
}
|
||||
let data = obkv_writer.into_inner()?;
|
||||
let new_obkv = KvReader::<FieldId>::new(&data);
|
||||
let obkv = KvReader::<FieldId>::new(&data);
|
||||
|
||||
// take the non-flattened version if flatten_from_fields_ids_map returns None.
|
||||
let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?;
|
||||
let old_flattened =
|
||||
old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::<FieldId>::new);
|
||||
let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?;
|
||||
let new_flattened =
|
||||
new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::<FieldId>::new);
|
||||
if let Some(original_obkv_buffer) = original_obkv_buffer {
|
||||
original_obkv_buffer.clear();
|
||||
into_del_add_obkv(obkv, DelAddOperation::DeletionAndAddition, original_obkv_buffer)?;
|
||||
}
|
||||
|
||||
original_obkv_buffer.clear();
|
||||
flattened_obkv_buffer.clear();
|
||||
if let Some(flattened_obkv_buffer) = flattened_obkv_buffer {
|
||||
// take the non-flattened version if flatten_from_fields_ids_map returns None.
|
||||
let mut fields_ids_map = settings_diff.new.fields_ids_map.clone();
|
||||
let flattened = Self::flatten_from_fields_ids_map(&obkv, &mut fields_ids_map)?;
|
||||
let flattened = flattened.as_deref().map_or(obkv, KvReader::new);
|
||||
|
||||
del_add_from_two_obkvs(&old_obkv, &new_obkv, original_obkv_buffer)?;
|
||||
del_add_from_two_obkvs(&old_flattened, &new_flattened, flattened_obkv_buffer)?;
|
||||
flattened_obkv_buffer.clear();
|
||||
into_del_add_obkv(
|
||||
flattened,
|
||||
DelAddOperation::DeletionAndAddition,
|
||||
flattened_obkv_buffer,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
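The rewritten `rebind_existing_document` keeps only the fields that the pending reindexing actually needs. The per-field decision in the loop above boils down to a predicate along these lines (a sketch where the booleans stand in for the settings-diff checks; names are illustrative):

// Should this field of an existing document be copied into the reduced obkv?
fn keep_field(
    is_primary_key: bool,
    reindex_searchable: bool,
    was_or_is_searchable: bool,
    reindex_facets: bool,
    matches_modified_facet: bool,
    reindex_vectors: bool,
) -> bool {
    is_primary_key
        || (reindex_searchable && was_or_is_searchable)
        || (reindex_facets && matches_modified_facet)
        // keep everything when vectors must be re-embedded, since the prompt
        // templates may reference any field
        || reindex_vectors
}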
@ -891,46 +921,63 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
let documents_count = documents_ids.len() as usize;
|
||||
|
||||
// We initialize the sorter with the user indexing settings.
|
||||
let mut original_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
self.indexer_settings.chunk_compression_type,
|
||||
self.indexer_settings.chunk_compression_level,
|
||||
self.indexer_settings.max_nb_chunks,
|
||||
self.indexer_settings.max_memory.map(|mem| mem / 2),
|
||||
);
|
||||
let mut original_sorter = if settings_diff.reindex_vectors() {
|
||||
Some(create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
self.indexer_settings.chunk_compression_type,
|
||||
self.indexer_settings.chunk_compression_level,
|
||||
self.indexer_settings.max_nb_chunks,
|
||||
self.indexer_settings.max_memory.map(|mem| mem / 2),
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// We initialize the sorter with the user indexing settings.
|
||||
let mut flattened_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
self.indexer_settings.chunk_compression_type,
|
||||
self.indexer_settings.chunk_compression_level,
|
||||
self.indexer_settings.max_nb_chunks,
|
||||
self.indexer_settings.max_memory.map(|mem| mem / 2),
|
||||
);
|
||||
let mut flattened_sorter =
|
||||
if settings_diff.reindex_searchable() || settings_diff.reindex_facets() {
|
||||
Some(create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
self.indexer_settings.chunk_compression_type,
|
||||
self.indexer_settings.chunk_compression_level,
|
||||
self.indexer_settings.max_nb_chunks,
|
||||
self.indexer_settings.max_memory.map(|mem| mem / 2),
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let mut original_obkv_buffer = Vec::new();
|
||||
let mut flattened_obkv_buffer = Vec::new();
|
||||
let mut document_sorter_key_buffer = Vec::new();
|
||||
for result in self.index.external_documents_ids().iter(wtxn)? {
|
||||
let (external_id, docid) = result?;
|
||||
let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
|
||||
)?;
|
||||
if original_sorter.is_some() || flattened_sorter.is_some() {
|
||||
let modified_faceted_fields = settings_diff.modified_faceted_fields();
|
||||
let mut original_obkv_buffer = Vec::new();
|
||||
let mut flattened_obkv_buffer = Vec::new();
|
||||
let mut document_sorter_key_buffer = Vec::new();
|
||||
for result in self.index.external_documents_ids().iter(wtxn)? {
|
||||
let (external_id, docid) = result?;
|
||||
let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
|
||||
)?;
|
||||
|
||||
Self::rebind_existing_document(
|
||||
old_obkv,
|
||||
&settings_diff,
|
||||
&mut original_obkv_buffer,
|
||||
&mut flattened_obkv_buffer,
|
||||
)?;
|
||||
Self::rebind_existing_document(
|
||||
old_obkv,
|
||||
&settings_diff,
|
||||
&modified_faceted_fields,
|
||||
Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()),
|
||||
Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()),
|
||||
)?;
|
||||
|
||||
document_sorter_key_buffer.clear();
|
||||
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
|
||||
original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?;
|
||||
flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?;
|
||||
if let Some(original_sorter) = original_sorter.as_mut() {
|
||||
document_sorter_key_buffer.clear();
|
||||
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
|
||||
original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?;
|
||||
}
|
||||
if let Some(flattened_sorter) = flattened_sorter.as_mut() {
|
||||
flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let grenad_params = GrenadParameters {
|
||||
|
@ -941,17 +988,22 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
};
|
||||
|
||||
// Once we have written all the documents, we merge everything into a Reader.
|
||||
let original_documents = sorter_into_reader(original_sorter, grenad_params)?;
|
||||
|
||||
let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?;
|
||||
let flattened_documents = match flattened_sorter {
|
||||
Some(flattened_sorter) => Some(sorter_into_reader(flattened_sorter, grenad_params)?),
|
||||
None => None,
|
||||
};
|
||||
let original_documents = match original_sorter {
|
||||
Some(original_sorter) => Some(sorter_into_reader(original_sorter, grenad_params)?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
Ok(TransformOutput {
|
||||
primary_key,
|
||||
field_distribution,
|
||||
settings_diff,
|
||||
documents_count,
|
||||
original_documents: original_documents.into_inner().into_inner(),
|
||||
flattened_documents: flattened_documents.into_inner().into_inner(),
|
||||
original_documents: original_documents.map(|od| od.into_inner().into_inner()),
|
||||
flattened_documents: flattened_documents.map(|fd| fd.into_inner().into_inner()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
@ -118,65 +118,6 @@ impl TypedChunk {
|
|||
}
|
||||
}
|
||||
|
||||
impl TypedChunk {
|
||||
pub fn to_debug_string(&self) -> String {
|
||||
match self {
|
||||
TypedChunk::FieldIdDocidFacetStrings(grenad) => {
|
||||
format!("FieldIdDocidFacetStrings {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetNumbers(grenad) => {
|
||||
format!("FieldIdDocidFacetNumbers {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::Documents(grenad) => {
|
||||
format!("Documents {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::FieldIdWordCountDocids(grenad) => {
|
||||
format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} => format!(
|
||||
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}",
|
||||
word_docids_reader.len(),
|
||||
exact_word_docids_reader.len(),
|
||||
word_fid_docids_reader.len()
|
||||
),
|
||||
TypedChunk::WordPositionDocids(grenad) => {
|
||||
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(grenad) => {
|
||||
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::FieldIdFacetStringDocids((grenad, _)) => {
|
||||
format!("FieldIdFacetStringDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::FieldIdFacetNumberDocids(grenad) => {
|
||||
format!("FieldIdFacetNumberDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::FieldIdFacetExistsDocids(grenad) => {
|
||||
format!("FieldIdFacetExistsDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsNullDocids(grenad) => {
|
||||
format!("FieldIdFacetIsNullDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsEmptyDocids(grenad) => {
|
||||
format!("FieldIdFacetIsEmptyDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::GeoPoints(grenad) => {
|
||||
format!("GeoPoints {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::VectorPoints{ remove_vectors, manual_vectors, embeddings, expected_dimension, embedder_name } => {
|
||||
format!("VectorPoints {{ remove_vectors: {}, manual_vectors: {}, embeddings: {}, dimension: {}, embedder_name: {} }}", remove_vectors.len(), manual_vectors.len(), embeddings.as_ref().map(|e| e.len()).unwrap_or_default(), expected_dimension, embedder_name)
|
||||
}
|
||||
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Write typed chunk in the corresponding LMDB database of the provided index.
|
||||
/// Return new documents seen.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
|
@ -185,14 +126,16 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||
index: &Index,
|
||||
wtxn: &mut RwTxn,
|
||||
) -> Result<(RoaringBitmap, bool)> {
|
||||
puffin::profile_function!(typed_chunks[0].to_debug_string());
|
||||
|
||||
let mut is_merged_database = false;
|
||||
match typed_chunks[0] {
|
||||
TypedChunk::Documents(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "documents");
|
||||
let _entered = span.enter();
|
||||
|
||||
let fields_ids_map = index.fields_ids_map(wtxn)?;
|
||||
let vectors_fid =
|
||||
fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::Documents(chunk) = typed_chunk else {
|
||||
|
@ -206,6 +149,10 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||
|
||||
let mut docids = index.documents_ids(wtxn)?;
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
|
||||
let embedders: BTreeSet<_> =
|
||||
index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect();
|
||||
let mut vectors_buffer = Vec::new();
|
||||
while let Some((key, reader)) = iter.next()? {
|
||||
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||
let reader: KvReader<FieldId> = KvReader::new(reader);
|
||||
|
@ -219,7 +166,35 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
writer.insert(field_id, addition)?;
|
||||
let addition = if vectors_fid == Some(field_id) {
|
||||
'vectors: {
|
||||
vectors_buffer.clear();
|
||||
let Ok(mut vectors) =
|
||||
crate::vector::parsed_vectors::ParsedVectors::from_bytes(
|
||||
addition,
|
||||
)
|
||||
else {
|
||||
// if the `_vectors` field cannot be parsed as a map of vectors, just write it as-is
|
||||
break 'vectors Some(addition);
|
||||
};
|
||||
vectors.retain_user_provided_vectors(&embedders);
|
||||
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
|
||||
if vectors.is_empty() {
|
||||
// skip writing empty `_vectors` map
|
||||
break 'vectors None;
|
||||
}
|
||||
|
||||
serde_json::to_writer(&mut vectors_buffer, &vectors)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
Some(vectors_buffer.as_slice())
|
||||
}
|
||||
} else {
|
||||
Some(addition)
|
||||
};
|
||||
|
||||
if let Some(addition) = addition {
|
||||
writer.insert(field_id, addition)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -661,7 +636,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||
)?;
|
||||
let writer_index = (embedder_index as u16) << 8;
|
||||
// FIXME: allow customizing distance
|
||||
let writers: std::result::Result<Vec<_>, _> = (0..=u8::MAX)
|
||||
let writers: Vec<_> = (0..=u8::MAX)
|
||||
.map(|k| {
|
||||
arroy::Writer::new(
|
||||
index.vector_arroy,
|
||||
|
@ -670,7 +645,6 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||
)
|
||||
})
|
||||
.collect();
|
||||
let writers = writers?;
|
||||
|
||||
// remove vectors for docids we want them removed
|
||||
let merger = remove_vectors_builder.build();
|
||||
|
@ -842,7 +816,6 @@ where
|
|||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
|
||||
|
|
|
@ -398,8 +398,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
FA: Fn() -> bool + Sync,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
// if the settings are set before any document update, we don't need to do anything, and
|
||||
// will set the primary key during the first document addition.
|
||||
if self.index.number_of_documents(self.wtxn)? == 0 {
|
||||
|
@ -461,50 +459,39 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
Ok(true)
|
||||
}
|
||||
|
||||
/// Updates the index's searchable attributes. This causes the field map to be recomputed to
|
||||
/// reflect the order of the searchable attributes.
|
||||
/// Updates the index's searchable attributes.
|
||||
fn update_searchable(&mut self) -> Result<bool> {
|
||||
match self.searchable_fields {
|
||||
Setting::Set(ref fields) => {
|
||||
// Check to see if the searchable fields changed before doing anything else
|
||||
let old_fields = self.index.searchable_fields(self.wtxn)?;
|
||||
let did_change = match old_fields {
|
||||
// If old_fields is Some, let's check to see if the fields actually changed
|
||||
Some(old_fields) => {
|
||||
let new_fields = fields.iter().map(String::as_str).collect::<Vec<_>>();
|
||||
new_fields != old_fields
|
||||
}
|
||||
// If old_fields is None, the fields have changed (because they are being set)
|
||||
None => true,
|
||||
let did_change = {
|
||||
let new_fields = fields.iter().map(String::as_str).collect::<Vec<_>>();
|
||||
new_fields != old_fields
|
||||
};
|
||||
if !did_change {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
// every time the searchable attributes are updated, we need to update the
|
||||
// ids for any settings that use the facets (distinct_fields, filterable_fields).
|
||||
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
|
||||
let mut new_fields_ids_map = FieldsIdsMap::new();
|
||||
// Since we're updating the settings we can only add new fields at the end of the field id map
|
||||
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
// fields are deduplicated, only the first occurrence is taken into account
|
||||
let names = fields.iter().unique().map(String::as_str).collect::<Vec<_>>();
|
||||
|
||||
// Add all the searchable attributes to the field map, and then add the
|
||||
// remaining fields from the old field map to the new one
|
||||
for name in names.iter() {
|
||||
new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
|
||||
}
|
||||
|
||||
for (_, name) in old_fields_ids_map.iter() {
|
||||
new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
|
||||
// The fields ids map won't change the field id of already present elements thus only the
|
||||
// new fields will be inserted.
|
||||
fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
|
||||
}
|
||||
|
||||
self.index.put_all_searchable_fields_from_fields_ids_map(
|
||||
self.wtxn,
|
||||
&names,
|
||||
&new_fields_ids_map,
|
||||
&fields_ids_map,
|
||||
)?;
|
||||
self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?;
|
||||
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
||||
Ok(true)
|
||||
}
|
||||
Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?),
|
||||
|
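The updated `update_searchable` relies on the fact that inserting a name into the fields ids map never reassigns an existing id; only unseen names get a fresh id. A toy illustration of that invariant (a plain map standing in for `FieldsIdsMap`):

use std::collections::BTreeMap;

// Insert searchable names: existing fields keep their id, new names get the next one.
fn insert_searchables(map: &mut BTreeMap<String, u16>, names: &[&str]) -> Option<()> {
    for name in names {
        if !map.contains_key(*name) {
            // mirrors the AttributeLimitReached guard above
            let next_id = u16::try_from(map.len()).ok()?;
            map.insert((*name).to_string(), next_id);
        }
    }
    Some(())
}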
@ -1078,10 +1065,17 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
|
||||
let embedding_configs_updated = self.update_embedding_configs()?;
|
||||
|
||||
let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
|
||||
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
|
||||
new_inner_settings.recompute_facets(self.wtxn, self.index)?;
|
||||
|
||||
let primary_key_id = self
|
||||
.index
|
||||
.primary_key(self.wtxn)?
|
||||
.and_then(|name| new_inner_settings.fields_ids_map.id(name));
|
||||
let inner_settings_diff = InnerIndexSettingsDiff {
|
||||
old: old_inner_settings,
|
||||
new: new_inner_settings,
|
||||
primary_key_id,
|
||||
embedding_configs_updated,
|
||||
settings_update_only: true,
|
||||
};
|
||||
|
@ -1097,10 +1091,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
pub struct InnerIndexSettingsDiff {
    pub(crate) old: InnerIndexSettings,
    pub(crate) new: InnerIndexSettings,

    pub(crate) primary_key_id: Option<FieldId>,
    // TODO: compare directly the embedders.
    pub(crate) embedding_configs_updated: bool,

    pub(crate) settings_update_only: bool,
}

@ -1110,13 +1103,8 @@ impl InnerIndexSettingsDiff {
    }

    pub fn reindex_searchable(&self) -> bool {
        self.old
            .fields_ids_map
            .iter()
            .zip(self.new.fields_ids_map.iter())
            .any(|(old, new)| old != new)
            || self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
                != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
        self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
            != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
            || self.old.allowed_separators != self.new.allowed_separators
            || self.old.dictionary != self.new.dictionary
            || self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields

@ -1143,15 +1131,7 @@ impl InnerIndexSettingsDiff {
            return true;
        }

        let faceted_updated =
            (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields);

        self.old
            .fields_ids_map
            .iter()
            .zip(self.new.fields_ids_map.iter())
            .any(|(old, new)| old != new)
            || faceted_updated
        (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields)
    }

    pub fn reindex_vectors(&self) -> bool {

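In the facet check just above, taking the set difference against the fields that actually exist in the documents means that declaring or removing a faceted field which no document contains does not trigger a reindex. A small illustration of that set arithmetic with std HashSets (the field names are invented for the example):

    use std::collections::HashSet;

    fn main() {
        let existing: HashSet<&str> = ["id", "name", "age"].into_iter().collect();
        let old_faceted: HashSet<&str> = ["age", "release_date"].into_iter().collect();
        let new_faceted: HashSet<&str> = ["age"].into_iter().collect();

        // `existing - faceted` is the set of existing, non-faceted fields; if it is the
        // same before and after the settings update, the facet databases can be kept.
        let unchanged = (&existing - &old_faceted) == (&existing - &new_faceted);
        assert!(unchanged); // "release_date" exists in no document, so dropping it changes nothing
    }
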
@ -1181,7 +1161,7 @@ pub(crate) struct InnerIndexSettings {
    pub user_defined_faceted_fields: HashSet<String>,
    pub user_defined_searchable_fields: Option<Vec<String>>,
    pub faceted_fields_ids: HashSet<FieldId>,
    pub searchable_fields_ids: Option<Vec<FieldId>>,
    pub searchable_fields_ids: Vec<FieldId>,
    pub exact_attributes: HashSet<FieldId>,
    pub proximity_precision: ProximityPrecision,
    pub embedding_configs: EmbeddingConfigs,

@ -1262,18 +1242,21 @@ impl InnerIndexSettings {

    // find and insert the new field ids
    pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
        let searchable_fields = self
            .user_defined_searchable_fields
            .as_ref()
            .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::<Vec<_>>());

        // in case new fields were introduced we're going to recreate the searchable fields.
        if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() {
            let searchable_fields =
                searchable_fields.iter().map(String::as_ref).collect::<Vec<_>>();
        if let Some(searchable_fields) = searchable_fields {
            index.put_all_searchable_fields_from_fields_ids_map(
                wtxn,
                &searchable_fields,
                &self.fields_ids_map,
            )?;
            let searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
            self.searchable_fields_ids = searchable_fields_ids;
        }
        let searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
        self.searchable_fields_ids = searchable_fields_ids;

        Ok(())
    }

@ -1546,12 +1529,13 @@ mod tests {
    use big_s::S;
    use heed::types::Bytes;
    use maplit::{btreemap, btreeset, hashset};
    use meili_snap::snapshot;

    use super::*;
    use crate::error::Error;
    use crate::index::tests::TempIndex;
    use crate::update::ClearDocuments;
    use crate::{Criterion, Filter, SearchResult};
    use crate::{db_snap, Criterion, Filter, SearchResult};

    #[test]
    fn set_and_reset_searchable_fields() {

@ -1580,6 +1564,17 @@ mod tests {

        wtxn.commit().unwrap();

        db_snap!(index, fields_ids_map, @r###"
        0 id |
        1 name |
        2 age |
        "###);
        db_snap!(index, searchable_fields, @r###"["name"]"###);
        db_snap!(index, fieldids_weights_map, @r###"
        fid weight
        1 0 |
        "###);

        // Check that the searchable field is correctly set to "name" only.
        let rtxn = index.read_txn().unwrap();
        // When we search for something that is not in

@ -1591,8 +1586,9 @@ mod tests {
        // we must find the appropriate document.
        let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap();
        let documents = index.documents(&rtxn, result.documents_ids).unwrap();
        let fid_map = index.fields_ids_map(&rtxn).unwrap();
        assert_eq!(documents.len(), 1);
        assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
        assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
        drop(rtxn);

        // We change the searchable fields to be the "name" field only.

@ -1602,14 +1598,31 @@ mod tests {
            })
            .unwrap();

        db_snap!(index, fields_ids_map, @r###"
        0 id |
        1 name |
        2 age |
        "###);
        db_snap!(index, searchable_fields, @r###"["id", "name", "age"]"###);
        db_snap!(index, fieldids_weights_map, @r###"
        fid weight
        0 0 |
        1 0 |
        2 0 |
        "###);

        // Check that the searchable fields have been reset and documents are found now.
        let rtxn = index.read_txn().unwrap();
        let fid_map = index.fields_ids_map(&rtxn).unwrap();
        let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap();
        snapshot!(format!("{user_defined_searchable_fields:?}"), @"None");
        // the searchable fields should contain all the fields
        let searchable_fields = index.searchable_fields(&rtxn).unwrap();
        assert_eq!(searchable_fields, None);
        snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###);
        let result = index.search(&rtxn).query("23").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 1);
        let documents = index.documents(&rtxn, result.documents_ids).unwrap();
        assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
        assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
    }

    #[test]

@ -52,8 +52,6 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
        common_prefix_fst_words: &[&[String]],
        del_prefix_fst_words: &HashSet<Vec<u8>>,
    ) -> Result<()> {
        puffin::profile_function!();

        // It is forbidden to keep a mutable reference into the database
        // and write into it at the same time, therefore we write into another file.
        let mut prefix_docids_sorter = create_sorter(

@ -57,7 +57,6 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
        common_prefix_fst_words: &[&[String]],
        del_prefix_fst_words: &HashSet<Vec<u8>>,
    ) -> Result<()> {
        puffin::profile_function!();
        debug!("Computing and writing the word levels integers docids into LMDB on disk...");

        let mut prefix_integer_docids_sorter = create_sorter(

@ -45,8 +45,6 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> {
        name = "words_prefix_fst"
    )]
    pub fn execute(self) -> Result<()> {
        puffin::profile_function!();

        let words_fst = self.index.words_fst(self.wtxn)?;

        let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length];

@ -13,6 +13,7 @@ pub mod error;
pub mod hf;
pub mod manual;
pub mod openai;
pub mod parsed_vectors;
pub mod settings;

pub mod ollama;

@ -147,6 +148,10 @@ impl EmbeddingConfigs {
        self.get(self.get_default_embedder_name())
    }

    pub fn inner_as_ref(&self) -> &HashMap<String, (Arc<Embedder>, Arc<Prompt>)> {
        &self.0
    }

    /// Get the name of the default embedder configuration.
    ///
    /// The default embedder is determined as follows:

207 milli/src/vector/parsed_vectors.rs Normal file
@ -0,0 +1,207 @@
use std::collections::{BTreeMap, BTreeSet};

use obkv::KvReader;
use serde_json::{from_slice, Value};

use super::Embedding;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::{FieldId, InternalError, UserError};

pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";

#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(untagged)]
pub enum Vectors {
    ImplicitlyUserProvided(VectorOrArrayOfVectors),
    Explicit(ExplicitVectors),
}

impl Vectors {
    pub fn into_array_of_vectors(self) -> Vec<Embedding> {
        match self {
            Vectors::ImplicitlyUserProvided(embeddings)
            | Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => {
                embeddings.into_array_of_vectors().unwrap_or_default()
            }
        }
    }
}

#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct ExplicitVectors {
    pub embeddings: VectorOrArrayOfVectors,
    pub user_provided: bool,
}

pub struct ParsedVectorsDiff {
    pub old: Option<BTreeMap<String, Vectors>>,
    pub new: Option<BTreeMap<String, Vectors>>,
}

impl ParsedVectorsDiff {
    pub fn new(
        documents_diff: KvReader<'_, FieldId>,
        old_vectors_fid: Option<FieldId>,
        new_vectors_fid: Option<FieldId>,
    ) -> Result<Self, Error> {
        let old = match old_vectors_fid
            .and_then(|vectors_fid| documents_diff.get(vectors_fid))
            .map(KvReaderDelAdd::new)
            .map(|obkv| to_vector_map(obkv, DelAdd::Deletion))
            .transpose()
        {
            Ok(del) => del,
            // ignore wrong shape for old version of documents, use an empty map in this case
            Err(Error::InvalidMap(value)) => {
                tracing::warn!(%value, "Previous version of the `_vectors` field had a wrong shape");
                Default::default()
            }
            Err(error) => {
                return Err(error);
            }
        }
        .flatten();
        let new = new_vectors_fid
            .and_then(|vectors_fid| documents_diff.get(vectors_fid))
            .map(KvReaderDelAdd::new)
            .map(|obkv| to_vector_map(obkv, DelAdd::Addition))
            .transpose()?
            .flatten();
        Ok(Self { old, new })
    }

    pub fn remove(&mut self, embedder_name: &str) -> (Option<Vectors>, Option<Vectors>) {
        let old = self.old.as_mut().and_then(|old| old.remove(embedder_name));
        let new = self.new.as_mut().and_then(|new| new.remove(embedder_name));
        (old, new)
    }
}

pub struct ParsedVectors(pub BTreeMap<String, Vectors>);

impl ParsedVectors {
    pub fn from_bytes(value: &[u8]) -> Result<Self, Error> {
        let Ok(value) = from_slice(value) else {
            let value = from_slice(value).map_err(Error::InternalSerdeJson)?;
            return Err(Error::InvalidMap(value));
        };
        Ok(ParsedVectors(value))
    }

    pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet<String>) {
        self.0.retain(|k, v| match v {
            Vectors::ImplicitlyUserProvided(_) => true,
            Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => {
                *user_provided
                // if the embedder is not in the config, then never touch it
                || !embedders.contains(k)
            }
        });
    }
}

pub enum Error {
    InvalidMap(Value),
    InternalSerdeJson(serde_json::Error),
}

impl Error {
    pub fn to_crate_error(self, document_id: String) -> crate::Error {
        match self {
            Error::InvalidMap(value) => {
                crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value })
            }
            Error::InternalSerdeJson(error) => {
                crate::Error::InternalError(InternalError::SerdeJson(error))
            }
        }
    }
}

fn to_vector_map(
    obkv: KvReaderDelAdd,
    side: DelAdd,
) -> Result<Option<BTreeMap<String, Vectors>>, Error> {
    Ok(if let Some(value) = obkv.get(side) {
        let ParsedVectors(parsed_vectors) = ParsedVectors::from_bytes(value)?;
        Some(parsed_vectors)
    } else {
        None
    })
}

/// Represents either a vector or an array of multiple vectors.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(transparent)]
pub struct VectorOrArrayOfVectors {
    #[serde(with = "either::serde_untagged_optional")]
    inner: Option<either::Either<Vec<Embedding>, Embedding>>,
}

impl VectorOrArrayOfVectors {
    pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
        match self.inner? {
            either::Either::Left(vectors) => Some(vectors),
            either::Either::Right(vector) => Some(vec![vector]),
        }
    }

    pub fn from_array_of_vectors(array_of_vec: Vec<Embedding>) -> Self {
        Self { inner: Some(either::Either::Left(array_of_vec)) }
    }
}

#[cfg(test)]
mod test {
    use super::VectorOrArrayOfVectors;

    #[test]
    fn array_of_vectors() {
        let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap();
        let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap();
        let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap();
        let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap();
        let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap();
        let two_vecs: VectorOrArrayOfVectors =
            serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap();

        insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null");
        insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]");
        insta::assert_json_snapshot!(one.into_array_of_vectors(), @r###"
        [
          [
            0.1
          ]
        ]
        "###);
        insta::assert_json_snapshot!(two.into_array_of_vectors(), @r###"
        [
          [
            0.1,
            0.2
          ]
        ]
        "###);
        insta::assert_json_snapshot!(one_vec.into_array_of_vectors(), @r###"
        [
          [
            0.1,
            0.2
          ]
        ]
        "###);
        insta::assert_json_snapshot!(two_vecs.into_array_of_vectors(), @r###"
        [
          [
            0.1,
            0.2
          ],
          [
            0.3,
            0.4
          ]
        ]
        "###);
    }
}
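Since the new file relies on #[serde(untagged)] to accept two `_vectors` shapes per embedder, here is a minimal, self-contained sketch of that parsing with the types re-declared locally in simplified form (the real `Vectors` also accepts a single bare vector through `either`, which is omitted here); it assumes the `serde` crate with the `derive` feature plus `serde_json`:

    use serde::Deserialize;

    type Embedding = Vec<f32>;

    // Simplified stand-in for the untagged `Vectors` enum above: a value is either
    // the embeddings directly, or an object carrying them plus the `userProvided` flag.
    #[derive(Deserialize, Debug)]
    #[serde(untagged)]
    enum VectorsSketch {
        Implicit(Vec<Embedding>),
        Explicit {
            embeddings: Vec<Embedding>,
            #[serde(rename = "userProvided")]
            user_provided: bool,
        },
    }

    fn main() {
        // Implicit shape: the embedder key maps directly to the vectors.
        let implicit: VectorsSketch = serde_json::from_str("[[0.1, 0.2]]").unwrap();
        // Explicit shape: a camelCase object with the vectors and the user-provided flag.
        let explicit: VectorsSketch =
            serde_json::from_str(r#"{"embeddings": [[0.1, 0.2]], "userProvided": true}"#).unwrap();
        println!("{implicit:?}\n{explicit:?}");
    }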