From c22460045c8b0d5a830caeae5c3b1856ccfa7b90 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 6 May 2024 14:49:45 +0200 Subject: [PATCH 01/14] Stops returning an option in the internal searchable fields --- milli/src/fieldids_weights_map.rs | 28 +++++ milli/src/index.rs | 108 ++++++++++++------ milli/src/lib.rs | 3 + milli/src/search/new/db_cache.rs | 12 +- milli/src/search/new/exact_attribute.rs | 8 +- milli/src/search/new/mod.rs | 17 +-- .../search/new/ranking_rule_graph/fid/mod.rs | 12 +- .../extract/extract_docid_word_positions.rs | 6 +- milli/src/update/settings.rs | 16 +-- 9 files changed, 120 insertions(+), 90 deletions(-) create mode 100644 milli/src/fieldids_weights_map.rs diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs new file mode 100644 index 000000000..255f6ab80 --- /dev/null +++ b/milli/src/fieldids_weights_map.rs @@ -0,0 +1,28 @@ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::{FieldId, Weight}; + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct FieldidsWeightsMap { + map: HashMap, +} + +impl FieldidsWeightsMap { + pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option { + self.map.insert(fid, weight) + } + + pub fn remove(&mut self, fid: FieldId) -> Option { + self.map.remove(&fid) + } + + pub fn weight(&self, fid: FieldId) -> Option { + self.map.get(&fid).copied() + } + + pub fn max_weight(&self) -> Option { + self.map.values().copied().max() + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 27b273393..b6b07404b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::convert::TryInto; use std::fs::File; use std::path::Path; @@ -25,8 +26,9 @@ use crate::proximity::ProximityPrecision; use crate::vector::EmbeddingConfig; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, - FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, - Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, BEU64, + FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, FieldidsWeightsMap, + GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, + BEU16, BEU32, BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -42,6 +44,7 @@ pub mod main_key { pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; + pub const FIELDIDS_WEIGHTS_MAP_KEY: &str = "fieldids-weights-map"; pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; pub const GEO_RTREE_KEY: &str = "geo-rtree"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -414,6 +417,32 @@ impl Index { .unwrap_or_default()) } + /* fieldids weights map */ + // This maps the fields ids to their weights. + // Their weights is defined by the ordering of the searchable attributes. + + /// Writes the fieldids weights map which associates the field ids to their weights + pub(crate) fn put_fieldids_weights_map( + &self, + wtxn: &mut RwTxn, + map: &FieldidsWeightsMap, + ) -> heed::Result<()> { + self.main.remap_types::>().put( + wtxn, + main_key::FIELDIDS_WEIGHTS_MAP_KEY, + map, + ) + } + + /// Get the fieldids weights map which associates the field ids to their weights + pub fn fieldids_weights_map(&self, rtxn: &RoTxn) -> heed::Result { + Ok(self + .main + .remap_types::>() + .get(rtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)? + .unwrap_or_default()) + } + /* geo rtree */ /// Writes the provided `rtree` which associates coordinates to documents ids. @@ -578,10 +607,12 @@ impl Index { wtxn: &mut RwTxn, user_fields: &[&str], fields_ids_map: &FieldsIdsMap, - ) -> heed::Result<()> { + ) -> Result<()> { // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; + let mut weights = self.fieldids_weights_map(&wtxn)?; + // Now we generate the real searchable fields: // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. // 2. Iterate over the user defined searchable fields. @@ -589,17 +620,23 @@ impl Index { // (ie doggo.name is a subset of doggo) then we push it at the end of the fields. let mut real_fields = user_fields.to_vec(); - for field_from_map in fields_ids_map.names() { - for user_field in user_fields { + for (id, field_from_map) in fields_ids_map.iter() { + for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) && !user_fields.contains(&field_from_map) { real_fields.push(field_from_map); + + let weight: u16 = + weight.try_into().map_err(|_| UserError::AttributeLimitReached)?; + weights.insert(id, weight as u16); } } } - self.put_searchable_fields(wtxn, &real_fields) + self.put_searchable_fields(wtxn, &real_fields)?; + self.put_fieldids_weights_map(wtxn, &weights)?; + Ok(()) } pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { @@ -623,28 +660,31 @@ impl Index { } /// Returns the searchable fields, those are the fields that are indexed, - /// if the searchable fields aren't there it means that **all** the fields are indexed. - pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { + pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { self.main .remap_types::>>() - .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY) + .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)? + .map(|fields| Ok(fields.into_iter().map(|field| Cow::Borrowed(field)).collect())) + .unwrap_or_else(|| { + Ok(self + .fields_ids_map(rtxn)? + .names() + .map(|field| Cow::Owned(field.to_string())) + .collect()) + }) } /// Identical to `searchable_fields`, but returns the ids instead. - pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result>> { - match self.searchable_fields(rtxn)? { - Some(fields) => { - let fields_ids_map = self.fields_ids_map(rtxn)?; - let mut fields_ids = Vec::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(name) { - fields_ids.push(field_id); - } - } - Ok(Some(fields_ids)) + pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.searchable_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields { + if let Some(field_id) = fields_ids_map.id(&name) { + fields_ids.push(field_id); } - None => Ok(None), } + Ok(fields_ids) } /// Writes the searchable fields, when this list is specified, only these are indexed. @@ -1710,10 +1750,14 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, field_distribution, 1); + db_snap!(index, field_distribution, @r###" + age 1 | + id 2 | + name 2 | + "###); db_snap!(index, word_docids, - @r###" + @r###" 1 [0, ] 2 [1, ] 20 [1, ] @@ -1722,18 +1766,6 @@ pub(crate) mod tests { "### ); - db_snap!(index, field_distribution); - - db_snap!(index, field_distribution, - @r###" - age 1 | - id 2 | - name 2 | - "### - ); - - // snapshot_index!(&index, "1", include: "^field_distribution$"); - // we add all the documents a second time. we are supposed to get the same // field_distribution in the end index @@ -1820,7 +1852,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); @@ -1840,7 +1872,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); assert_eq!(user_defined, &["doggo", "name"]); @@ -1856,7 +1888,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index a1e240464..881633b5c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -28,6 +28,7 @@ pub mod vector; #[cfg(test)] #[macro_use] pub mod snapshot_tests; +mod fieldids_weights_map; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; @@ -52,6 +53,7 @@ pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; pub use self::external_documents_ids::ExternalDocumentsIds; +pub use self::fieldids_weights_map::FieldidsWeightsMap; pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, @@ -77,6 +79,7 @@ pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; pub type FieldDistribution = BTreeMap; pub type FieldId = u16; +pub type Weight = u16; pub type Object = serde_json::Map; pub type Position = u32; pub type RelativePosition = u16; diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 62c921a1d..a99000f60 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -315,11 +315,7 @@ impl<'ctx> SearchContext<'ctx> { .map_err(heed::Error::Decoding)? } else { // Compute the distance at the attribute level and store it in the cache. - let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { - fids - } else { - self.index.fields_ids_map(self.txn)?.ids().collect() - }; + let fids = self.index.searchable_fields_ids(self.txn)?; let mut docids = RoaringBitmap::new(); for fid in fids { // for each field, intersect left word bitmap and right word bitmap, @@ -408,11 +404,7 @@ impl<'ctx> SearchContext<'ctx> { let prefix_docids = match proximity_precision { ProximityPrecision::ByAttribute => { // Compute the distance at the attribute level and store it in the cache. - let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { - fids - } else { - self.index.fields_ids_map(self.txn)?.ids().collect() - }; + let fids = self.index.searchable_fields_ids(self.txn)?; let mut prefix_docids = RoaringBitmap::new(); // for each field, intersect left word bitmap and right word bitmap, // then merge the result in a global bitmap before storing it in the cache. diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 7932f0c2a..41b70ae39 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -184,13 +184,7 @@ impl State { return Ok(State::Empty(query_graph.clone())); } - let searchable_fields_ids = { - if let Some(fids) = ctx.index.searchable_fields_ids(ctx.txn)? { - fids - } else { - ctx.index.fields_ids_map(ctx.txn)?.ids().collect() - } - }; + let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?; let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 617068ef8..acbb3638b 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -96,27 +96,22 @@ impl<'ctx> SearchContext<'ctx> { contains_wildcard = true; continue; } - let searchable_contains_name = - searchable_names.as_ref().map(|sn| sn.iter().any(|name| name == field_name)); + let searchable_contains_name = searchable_names.iter().any(|name| name == field_name); let fid = match (fids_map.id(field_name), searchable_contains_name) { // The Field id exist and the field is searchable - (Some(fid), Some(true)) | (Some(fid), None) => fid, + (Some(fid), true) => fid, // The field is searchable but the Field id doesn't exist => Internal Error - (None, Some(true)) => { + (None, true) => { return Err(FieldIdMapMissingEntry::FieldName { field_name: field_name.to_string(), process: "search", } .into()) } - // The field is not searchable, but the searchableAttributes are set to * => ignore field - (None, None) => continue, // The field is not searchable => User error - (_fid, Some(false)) => { - let (valid_fields, hidden_fields) = match searchable_names { - Some(sn) => self.index.remove_hidden_fields(self.txn, sn)?, - None => self.index.remove_hidden_fields(self.txn, fids_map.names())?, - }; + (_fid, false) => { + let (valid_fields, hidden_fields) = + self.index.remove_hidden_fields(self.txn, searchable_names)?; let field = field_name.to_string(); return Err(UserError::InvalidSearchableAttribute { diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index 8f3e0cc82..cf65249de 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -77,17 +77,7 @@ impl RankingRuleGraphTrait for FidGraph { } // always lookup the max_fid if we don't already and add an artificial condition for max scoring - let max_fid: Option = { - if let Some(max_fid) = ctx - .index - .searchable_fields_ids(ctx.txn)? - .map(|field_ids| field_ids.into_iter().max()) - { - max_fid - } else { - ctx.index.fields_ids_map(ctx.txn)?.ids().max() - } - }; + let max_fid: Option = ctx.index.searchable_fields_ids(ctx.txn)?.into_iter().max(); if let Some(max_fid) = max_fid { if !all_fields.contains(&max_fid) { diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 6af5bba6d..d97b6639e 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -186,7 +186,7 @@ fn searchable_fields_changed( ) -> bool { let searchable_fields = &settings_diff.new.searchable_fields_ids; for (field_id, field_bytes) in obkv.iter() { - if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + if searchable_fields.contains(&field_id) { let del_add = KvReaderDelAdd::new(field_bytes); match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { // if both fields are None, check the next field. @@ -298,7 +298,7 @@ fn lang_safe_tokens_from_document<'a>( /// Extract words mapped with their positions of a document. fn tokens_from_document<'a>( obkv: &KvReader, - searchable_fields: &Option>, + searchable_fields: &[FieldId], tokenizer: &Tokenizer, max_positions_per_attributes: u32, del_add: DelAdd, @@ -309,7 +309,7 @@ fn tokens_from_document<'a>( let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { // if field is searchable. - if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + if searchable_fields.as_ref().contains(&field_id) { // extract deletion or addition only. if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { // parse json. diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1997e966e..c0742a74a 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -468,14 +468,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Setting::Set(ref fields) => { // Check to see if the searchable fields changed before doing anything else let old_fields = self.index.searchable_fields(self.wtxn)?; - let did_change = match old_fields { - // If old_fields is Some, let's check to see if the fields actually changed - Some(old_fields) => { - let new_fields = fields.iter().map(String::as_str).collect::>(); - new_fields != old_fields - } - // If old_fields is None, the fields have changed (because they are being set) - None => true, + let did_change = { + let new_fields = fields.iter().map(String::as_str).collect::>(); + new_fields != old_fields }; if !did_change { return Ok(false); @@ -1172,7 +1167,7 @@ pub(crate) struct InnerIndexSettings { pub user_defined_faceted_fields: HashSet, pub user_defined_searchable_fields: Option>, pub faceted_fields_ids: HashSet, - pub searchable_fields_ids: Option>, + pub searchable_fields_ids: Vec, pub exact_attributes: HashSet, pub proximity_precision: ProximityPrecision, pub embedding_configs: EmbeddingConfigs, @@ -1517,6 +1512,7 @@ mod tests { use big_s::S; use heed::types::Bytes; use maplit::{btreemap, btreeset, hashset}; + use meili_snap::snapshot; use super::*; use crate::error::Error; @@ -1576,7 +1572,7 @@ mod tests { // Check that the searchable field have been reset and documents are found now. let rtxn = index.read_txn().unwrap(); let searchable_fields = index.searchable_fields(&rtxn).unwrap(); - assert_eq!(searchable_fields, None); + snapshot!(format!("{searchable_fields:?}"), @r###"["name", "id", "age"]"###); let result = index.search(&rtxn).query("23").execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); From 4e4a1ddff7807c1268579bda54c53cd6fca29547 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 7 May 2024 16:37:34 +0200 Subject: [PATCH 02/14] gate a test behind the required feature --- milli/src/update/index_documents/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index bb180a7ee..936ce1efc 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -3260,6 +3260,7 @@ mod tests { } #[test] + #[cfg(feature = "all-tokenizations")] fn stored_detected_script_and_language_should_not_return_deleted_documents() { use charabia::{Language, Script}; let index = TempIndex::new(); From 685f452fb2524c9c3f67218fb2dd273d59ba5110 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 7 May 2024 17:56:40 +0200 Subject: [PATCH 03/14] Fix the indexing of the searchable --- milli/examples/search.rs | 2 +- milli/src/fieldids_weights_map.rs | 4 + milli/src/index.rs | 85 ++++++++++- milli/src/search/mod.rs | 4 +- milli/src/search/new/db_cache.rs | 140 ++++++------------ .../src/search/new/matches/matching_words.rs | 2 +- milli/src/search/new/matches/mod.rs | 2 +- milli/src/search/new/mod.rs | 83 ++++++----- .../src/search/new/query_term/parse_query.rs | 2 +- milli/src/search/new/tests/attribute_fid.rs | 15 +- milli/src/snapshot_tests.rs | 25 ++++ milli/src/update/settings.rs | 25 ++-- 12 files changed, 235 insertions(+), 154 deletions(-) diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 8640acf42..3d10ec599 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -48,7 +48,7 @@ fn main() -> Result<(), Box> { let start = Instant::now(); - let mut ctx = SearchContext::new(&index, &txn); + let mut ctx = SearchContext::new(&index, &txn)?; let universe = filtered_universe(&ctx, &None)?; let docs = execute_search( diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index 255f6ab80..bead160e9 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -25,4 +25,8 @@ impl FieldidsWeightsMap { pub fn max_weight(&self) -> Option { self.map.values().copied().max() } + + pub fn ids<'a>(&'a self) -> impl Iterator + 'a { + self.map.keys().copied() + } } diff --git a/milli/src/index.rs b/milli/src/index.rs index b6b07404b..e9f0f75de 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -28,7 +28,7 @@ use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, - BEU16, BEU32, BEU64, + Weight, BEU16, BEU32, BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -443,6 +443,27 @@ impl Index { .unwrap_or_default()) } + pub fn searchable_fields_and_weights<'a>( + &self, + rtxn: &'a RoTxn, + ) -> heed::Result, FieldId, Weight)>> { + let fid_map = self.fields_ids_map(rtxn)?; + let weight_map = self.fieldids_weights_map(rtxn)?; + let searchable = self.searchable_fields(rtxn)?; + + Ok(searchable + .into_iter() + .map(|field| { + // the searchable attributes are a subset of the field id map + let fid = fid_map.id(&field).unwrap(); + // all the searchable fields have a weight + let weight = weight_map.weight(fid).unwrap(); + + (field, fid, weight) + }) + .collect()) + } + /* geo rtree */ /// Writes the provided `rtree` which associates coordinates to documents ids. @@ -605,9 +626,25 @@ impl Index { pub(crate) fn put_all_searchable_fields_from_fields_ids_map( &self, wtxn: &mut RwTxn, - user_fields: &[&str], + user_fields: Option<&[&str]>, fields_ids_map: &FieldsIdsMap, ) -> Result<()> { + // Special case if there is no user defined fields. + // Then the whole field id map is marked as searchable. + if user_fields.is_none() { + let mut weights = self.fieldids_weights_map(&wtxn)?; + let mut searchable = Vec::new(); + for (weight, (fid, name)) in fields_ids_map.iter().enumerate() { + searchable.push(name); + weights.insert(fid, weight as u16); + } + self.put_searchable_fields(wtxn, &searchable)?; + self.put_fieldids_weights_map(wtxn, &weights)?; + return Ok(()); + } + + let user_fields = user_fields.unwrap(); + // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; @@ -617,13 +654,13 @@ impl Index { // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. // 2. Iterate over the user defined searchable fields. // 3. If a user defined field is a subset of a field defined in the fields_ids_map - // (ie doggo.name is a subset of doggo) then we push it at the end of the fields. - let mut real_fields = user_fields.to_vec(); + // (ie doggo.name is a subset of doggo) right after doggo and with the same weight. + let mut real_fields = Vec::new(); for (id, field_from_map) in fields_ids_map.iter() { for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) - && !user_fields.contains(&field_from_map) + && !real_fields.contains(&field_from_map) { real_fields.push(field_from_map); @@ -2427,6 +2464,14 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); index .add_documents(documents!([ @@ -2442,6 +2487,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + "###); index.delete_documents(Default::default()); @@ -2452,6 +2507,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + "###); index .add_documents(documents!([ @@ -2467,6 +2532,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + "###); let rtxn = index.read_txn().unwrap(); let search = Search::new(&rtxn, &index); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index bab67e6bd..7427db3a1 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -147,7 +147,7 @@ impl<'a> Search<'a> { pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result { if has_vector_search { - let ctx = SearchContext::new(self.index, self.rtxn); + let ctx = SearchContext::new(self.index, self.rtxn)?; filtered_universe(&ctx, &self.filter) } else { Ok(self.execute()?.candidates) @@ -155,7 +155,7 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> Result { - let mut ctx = SearchContext::new(self.index, self.rtxn); + let mut ctx = SearchContext::new(self.index, self.rtxn)?; if let Some(searchable_attributes) = self.searchable_attributes { ctx.searchable_attributes(searchable_attributes)?; diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index a99000f60..4985f55e9 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -159,58 +159,36 @@ impl<'ctx> SearchContext<'ctx> { /// Retrieve or insert the given value in the `word_docids` database. fn get_db_word_docids(&mut self, word: Interned) -> Result> { - match &self.restricted_fids { - Some(restricted_fids) => { - let interned = self.word_interner.get(word).as_str(); - let keys: Vec<_> = - restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect(); + let interned = self.word_interner.get(word).as_str(); + let keys: Vec<_> = + self.searchable_fids.tolerant.iter().map(|(fid, _weight)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - &keys[..], - &mut self.db_cache.word_docids, - self.index.word_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) - } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - self.word_interner.get(word).as_str(), - &mut self.db_cache.word_docids, - self.index.word_docids.remap_data_type::(), - ), - } + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + &keys[..], + &mut self.db_cache.word_docids, + self.index.word_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) } fn get_db_exact_word_docids( &mut self, word: Interned, ) -> Result> { - match &self.restricted_fids { - Some(restricted_fids) => { - let interned = self.word_interner.get(word).as_str(); - let keys: Vec<_> = - restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect(); + let interned = self.word_interner.get(word).as_str(); + let keys: Vec<_> = + self.searchable_fids.exact.iter().map(|(fid, _weight)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - &keys[..], - &mut self.db_cache.exact_word_docids, - self.index.word_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) - } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - self.word_interner.get(word).as_str(), - &mut self.db_cache.exact_word_docids, - self.index.exact_word_docids.remap_data_type::(), - ), - } + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + &keys[..], + &mut self.db_cache.exact_word_docids, + self.index.word_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) } pub fn word_prefix_docids(&mut self, prefix: Word) -> Result> { @@ -238,58 +216,36 @@ impl<'ctx> SearchContext<'ctx> { &mut self, prefix: Interned, ) -> Result> { - match &self.restricted_fids { - Some(restricted_fids) => { - let interned = self.word_interner.get(prefix).as_str(); - let keys: Vec<_> = - restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect(); + let interned = self.word_interner.get(prefix).as_str(); + let keys: Vec<_> = + self.searchable_fids.tolerant.iter().map(|(fid, _weight)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - &keys[..], - &mut self.db_cache.word_prefix_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) - } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - self.word_interner.get(prefix).as_str(), - &mut self.db_cache.word_prefix_docids, - self.index.word_prefix_docids.remap_data_type::(), - ), - } + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + &keys[..], + &mut self.db_cache.word_prefix_docids, + self.index.word_prefix_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) } fn get_db_exact_word_prefix_docids( &mut self, prefix: Interned, ) -> Result> { - match &self.restricted_fids { - Some(restricted_fids) => { - let interned = self.word_interner.get(prefix).as_str(); - let keys: Vec<_> = - restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect(); + let interned = self.word_interner.get(prefix).as_str(); + let keys: Vec<_> = + self.searchable_fids.exact.iter().map(|(fid, _weight)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - &keys[..], - &mut self.db_cache.exact_word_prefix_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) - } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - self.word_interner.get(prefix).as_str(), - &mut self.db_cache.exact_word_prefix_docids, - self.index.exact_word_prefix_docids.remap_data_type::(), - ), - } + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + &keys[..], + &mut self.db_cache.exact_word_prefix_docids, + self.index.word_prefix_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) } pub fn get_db_word_pair_proximity_docids( @@ -465,8 +421,8 @@ impl<'ctx> SearchContext<'ctx> { word: Interned, fid: u16, ) -> Result> { - // if the requested fid isn't in the restricted list, return None. - if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { + // if the requested fid isn't in the list of searchable, return None. + if !self.searchable_fids.contains(&fid) { return Ok(None); } @@ -484,8 +440,8 @@ impl<'ctx> SearchContext<'ctx> { word_prefix: Interned, fid: u16, ) -> Result> { - // if the requested fid isn't in the restricted list, return None. - if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { + // if the requested fid isn't in the searchable list, return None. + if !self.searchable_fids.contains(&fid) { return Ok(None); } diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 56bf6c169..4db1c99c6 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -258,7 +258,7 @@ pub(crate) mod tests { fn matching_words() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); + let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap(); let mut builder = TokenizerBuilder::default(); let tokenizer = builder.build(); let tokens = tokenizer.tokenize("split this world"); diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 8f0069589..40e6f8dc8 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -506,7 +506,7 @@ mod tests { impl<'a> MatcherBuilder<'a> { fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self { - let mut ctx = SearchContext::new(index, rtxn); + let mut ctx = SearchContext::new(index, rtxn).unwrap(); let universe = filtered_universe(&ctx, &None).unwrap(); let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search( &mut ctx, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index acbb3638b..90d971fa3 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -49,13 +49,12 @@ pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; -use crate::error::FieldIdMapMissingEntry; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; use crate::vector::Embedder; use crate::{ AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget, - UserError, + UserError, Weight, }; /// A structure used throughout the execution of a search query. @@ -67,12 +66,25 @@ pub struct SearchContext<'ctx> { pub phrase_interner: DedupInterner, pub term_interner: Interner, pub phrase_docids: PhraseDocIdsCache, - pub restricted_fids: Option, + pub searchable_fids: SearchableFids, } impl<'ctx> SearchContext<'ctx> { - pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { - Self { + pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Result { + let searchable_fids = index.searchable_fields_and_weights(txn)?; + let exact_attributes_ids = index.exact_attributes_ids(txn)?; + + let mut exact = Vec::new(); + let mut tolerant = Vec::new(); + for (name, fid, weight) in searchable_fids { + if exact_attributes_ids.contains(&fid) { + exact.push((fid, weight)); + } else { + tolerant.push((fid, weight)); + } + } + + Ok(Self { index, txn, db_cache: <_>::default(), @@ -80,38 +92,32 @@ impl<'ctx> SearchContext<'ctx> { phrase_interner: <_>::default(), term_interner: <_>::default(), phrase_docids: <_>::default(), - restricted_fids: None, - } + searchable_fids: SearchableFids { tolerant, exact }, + }) } - pub fn searchable_attributes(&mut self, searchable_attributes: &'ctx [String]) -> Result<()> { + // TODO: TAMO continue here + pub fn searchable_attributes(&mut self, attributes_to_search_on: &'ctx [String]) -> Result<()> { + if attributes_to_search_on.contains(&String::from("*")) { + return Ok(()); + } + let fids_map = self.index.fields_ids_map(self.txn)?; - let searchable_names = self.index.searchable_fields(self.txn)?; + let searchable_names = self.index.searchable_fields_and_weights(self.txn)?; let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?; - let mut restricted_fids = RestrictedFids::default(); - let mut contains_wildcard = false; - for field_name in searchable_attributes { - if field_name == "*" { - contains_wildcard = true; - continue; - } - let searchable_contains_name = searchable_names.iter().any(|name| name == field_name); - let fid = match (fids_map.id(field_name), searchable_contains_name) { + let mut restricted_fids = SearchableFids::default(); + for field_name in attributes_to_search_on { + let searchable_weight = searchable_names.iter().find(|(name, _, _)| name == field_name); + let (fid, weight) = match searchable_weight { // The Field id exist and the field is searchable - (Some(fid), true) => fid, - // The field is searchable but the Field id doesn't exist => Internal Error - (None, true) => { - return Err(FieldIdMapMissingEntry::FieldName { - field_name: field_name.to_string(), - process: "search", - } - .into()) - } + Some((_name, fid, weight)) => (*fid, *weight), // The field is not searchable => User error - (_fid, false) => { - let (valid_fields, hidden_fields) = - self.index.remove_hidden_fields(self.txn, searchable_names)?; + None => { + let (valid_fields, hidden_fields) = self.index.remove_hidden_fields( + self.txn, + searchable_names.iter().map(|(name, _, _)| name), + )?; let field = field_name.to_string(); return Err(UserError::InvalidSearchableAttribute { @@ -124,13 +130,13 @@ impl<'ctx> SearchContext<'ctx> { }; if exact_attributes_ids.contains(&fid) { - restricted_fids.exact.push(fid); + restricted_fids.exact.push((fid, weight)); } else { - restricted_fids.tolerant.push(fid); + restricted_fids.tolerant.push((fid, weight)); }; } - self.restricted_fids = (!contains_wildcard).then_some(restricted_fids); + self.searchable_fids = restricted_fids; Ok(()) } @@ -152,14 +158,15 @@ impl Word { } #[derive(Debug, Clone, Default)] -pub struct RestrictedFids { - pub tolerant: Vec, - pub exact: Vec, +pub struct SearchableFids { + pub tolerant: Vec<(FieldId, Weight)>, + pub exact: Vec<(FieldId, Weight)>, } -impl RestrictedFids { +impl SearchableFids { pub fn contains(&self, fid: &FieldId) -> bool { - self.tolerant.contains(fid) || self.exact.contains(fid) + self.tolerant.iter().find(|(id, _)| id == fid).is_some() + || self.exact.iter().find(|(id, _)| id == fid).is_some() } } diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 93f5f081c..74b2ed564 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -366,7 +366,7 @@ mod tests { let tokens = tokenizer.tokenize("."); let index = temp_index_with_documents(); let rtxn = index.read_txn()?; - let mut ctx = SearchContext::new(&index, &rtxn); + let mut ctx = SearchContext::new(&index, &rtxn)?; // panics with `attempt to add with overflow` before let ExtractedTokens { query_terms, .. } = located_query_terms_from_tokens(&mut ctx, tokens, None)?; diff --git a/milli/src/search/new/tests/attribute_fid.rs b/milli/src/search/new/tests/attribute_fid.rs index 38225404c..61b0a743b 100644 --- a/milli/src/search/new/tests/attribute_fid.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -1,5 +1,5 @@ use crate::index::tests::TempIndex; -use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::{db_snap, Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -131,6 +131,19 @@ fn test_attribute_fid_simple() { #[test] fn test_attribute_fid_ngrams() { let index = create_index(); + db_snap!(index, fields_ids_map, @r###" + 0 title | + 1 description | + 2 plot | + 3 id | + "###); + db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + 2 2 | + "###); let txn = index.read_txn().unwrap(); diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 28c4cb45c..d79003747 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -308,6 +308,25 @@ pub fn snap_fields_ids_map(index: &Index) -> String { } snap } +pub fn snap_fieldids_weights_map(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let weights_map = index.fieldids_weights_map(&rtxn).unwrap(); + + let mut snap = String::new(); + writeln!(&mut snap, "fid weight").unwrap(); + let mut field_ids: Vec<_> = weights_map.ids().collect(); + field_ids.sort(); + for field_id in field_ids { + let weight = weights_map.weight(field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {weight:<3} |").unwrap(); + } + snap +} +pub fn snap_searchable_fields(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let searchable_fields = index.searchable_fields(&rtxn).unwrap(); + format!("{searchable_fields:?}") +} pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); @@ -469,6 +488,12 @@ macro_rules! full_snap_of_db { ($index:ident, fields_ids_map) => {{ $crate::snapshot_tests::snap_fields_ids_map(&$index) }}; + ($index:ident, fieldids_weights_map) => {{ + $crate::snapshot_tests::snap_fieldids_weights_map(&$index) + }}; + ($index:ident, searchable_fields) => {{ + $crate::snapshot_tests::snap_searchable_fields(&$index) + }}; ($index:ident, geo_faceted_documents_ids) => {{ $crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index) }}; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c0742a74a..19b2c5778 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -496,7 +496,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, - &names, + Some(&names), &new_fields_ids_map, )?; self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; @@ -1228,18 +1228,19 @@ impl InnerIndexSettings { // find and insert the new field ids pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { + let searchable_fields = self + .user_defined_searchable_fields + .as_ref() + .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::>()); + // in case new fields were introduced we're going to recreate the searchable fields. - if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() { - let searchable_fields = - searchable_fields.iter().map(String::as_ref).collect::>(); - index.put_all_searchable_fields_from_fields_ids_map( - wtxn, - &searchable_fields, - &self.fields_ids_map, - )?; - let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; - self.searchable_fields_ids = searchable_fields_ids; - } + index.put_all_searchable_fields_from_fields_ids_map( + wtxn, + searchable_fields.as_deref(), + &self.fields_ids_map, + )?; + let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; + self.searchable_fields_ids = searchable_fields_ids; Ok(()) } From 9ecde418531e199a9c558b4273e63e58649ff35d Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 May 2024 16:18:05 +0200 Subject: [PATCH 04/14] add a test on the current behaviour --- milli/src/index.rs | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index e9f0f75de..c66222ab1 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2627,4 +2627,52 @@ pub(crate) mod tests { db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted } + + #[test] + fn swapping_searchable_attributes() { + // See https://github.com/meilisearch/meilisearch/issues/4484 + + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name")]); + settings.set_filterable_fields(HashSet::from([S("age")])); + }) + .unwrap(); + + index + .add_documents(documents!({ "id": 1, "name": "Many", "age": 28, "realName": "Maxime" })) + .unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 name | + 1 id | + 2 age | + 3 realName | + "###); + db_snap!(index, searchable_fields, @r###"["name"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name"), S("realName")]); + settings.set_filterable_fields(HashSet::from([S("age")])); + }) + .unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 name | + 1 realName | + 2 id | + 3 age | + "###); + db_snap!(index, searchable_fields, @r###"["name", "realName"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + "###); + } } From b0afe0972e109bdaaa532ef5f125e02f83930ab0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 May 2024 16:49:08 +0200 Subject: [PATCH 05/14] stop updating the fields ids map when fields are only swapped --- milli/src/index.rs | 9 +++++---- milli/src/update/settings.rs | 21 ++++++++++++--------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index c66222ab1..d0d148d86 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2662,17 +2662,18 @@ pub(crate) mod tests { settings.set_filterable_fields(HashSet::from([S("age")])); }) .unwrap(); + // The order of the field id map shouldn't change db_snap!(index, fields_ids_map, @r###" 0 name | - 1 realName | - 2 id | - 3 age | + 1 id | + 2 age | + 3 realName | "###); db_snap!(index, searchable_fields, @r###"["name", "realName"]"###); db_snap!(index, fieldids_weights_map, @r###" fid weight 0 0 | - 1 1 | + 3 1 | "###); } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 19b2c5778..6875e6f47 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -12,6 +12,7 @@ use time::OffsetDateTime; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; +use crate::documents::FieldIdMapper; use crate::error::UserError; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::order_by_map::OrderByMap; @@ -461,8 +462,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(true) } - /// Updates the index's searchable attributes. This causes the field map to be recomputed to - /// reflect the order of the searchable attributes. + /// Updates the index's searchable attributes. fn update_searchable(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { @@ -480,17 +480,20 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // ids for any settings that uses the facets. (distinct_fields, filterable_fields). let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let mut new_fields_ids_map = FieldsIdsMap::new(); - // fields are deduplicated, only the first occurrence is taken into account - let names = fields.iter().unique().map(String::as_str).collect::>(); + // Since we're updating the settings we can only add new fields at the end of the field id map + let mut new_fields_ids_map = old_fields_ids_map.clone(); + let names = fields + .iter() + // fields are deduplicated, only the first occurrence is taken into account + .unique() + .map(String::as_str) + .collect::>(); // Add all the searchable attributes to the field map, and then add the // remaining fields from the old field map to the new one for name in names.iter() { - new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; - } - - for (_, name) in old_fields_ids_map.iter() { + // The fields ids map won't change the field id of already present elements thus only the + // new fields will be inserted. new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } From a0082c4df9f3cc5497678d4d6989dbba8674f31c Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 14 May 2024 10:45:06 +0200 Subject: [PATCH 06/14] add a failing test on the attribute ranking rule --- milli/src/index.rs | 51 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index d0d148d86..accfff719 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2662,6 +2662,7 @@ pub(crate) mod tests { settings.set_filterable_fields(HashSet::from([S("age")])); }) .unwrap(); + // The order of the field id map shouldn't change db_snap!(index, fields_ids_map, @r###" 0 name | @@ -2676,4 +2677,54 @@ pub(crate) mod tests { 3 1 | "###); } + + #[test] + fn attribute_weights_after_swapping_searchable_attributes() { + // See https://github.com/meilisearch/meilisearch/issues/4484 + + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name"), S("beverage")]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "id": 0, "name": "kefir", "beverage": "water" }, + { "id": 1, "name": "tamo", "beverage": "kefir" } + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("kefir").execute().unwrap(); + + // We should find kefir the dog first + insta::assert_debug_snapshot!(results.documents_ids, @r###" + [ + 0, + 1, + ] + "###); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("beverage"), S("name")]); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("kefir").execute().unwrap(); + + // We should find tamo first + insta::assert_debug_snapshot!(results.documents_ids, @r###" + [ + 0, + 1, + ] + "###); + } } From caa6a7149ac6967580c6e17a4a62f06f64f8312a Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 14 May 2024 16:56:08 +0200 Subject: [PATCH 07/14] make the attribute ranking rule use the weights and fix the tests --- meilisearch/src/search_queue.rs | 3 + milli/src/index.rs | 2 +- milli/src/search/new/mod.rs | 29 ++- .../search/new/ranking_rule_graph/fid/mod.rs | 38 +-- milli/src/search/new/tests/attribute_fid.rs | 14 +- ...attribute_fid__attribute_fid_ngrams-4.snap | 244 ++++++++++++++++++ milli/src/update/settings.rs | 12 +- 7 files changed, 306 insertions(+), 36 deletions(-) create mode 100644 milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap diff --git a/meilisearch/src/search_queue.rs b/meilisearch/src/search_queue.rs index 6d5044d20..0fe9a5a53 100644 --- a/meilisearch/src/search_queue.rs +++ b/meilisearch/src/search_queue.rs @@ -85,6 +85,9 @@ impl SearchQueue { }, search_request = receive_new_searches.recv() => { + if search_request.is_none() { + continue; + } // this unwrap is safe because we're sure the `SearchQueue` still lives somewhere in actix-web let search_request = search_request.unwrap(); if searches_running < usize::from(parallelism) && queue.is_empty() { diff --git a/milli/src/index.rs b/milli/src/index.rs index accfff719..49f78f3cd 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2722,8 +2722,8 @@ pub(crate) mod tests { // We should find tamo first insta::assert_debug_snapshot!(results.documents_ids, @r###" [ - 0, 1, + 0, ] "###); } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 90d971fa3..9a2ff5b02 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -76,7 +76,7 @@ impl<'ctx> SearchContext<'ctx> { let mut exact = Vec::new(); let mut tolerant = Vec::new(); - for (name, fid, weight) in searchable_fids { + for (_name, fid, weight) in searchable_fids { if exact_attributes_ids.contains(&fid) { exact.push((fid, weight)); } else { @@ -96,22 +96,26 @@ impl<'ctx> SearchContext<'ctx> { }) } - // TODO: TAMO continue here pub fn searchable_attributes(&mut self, attributes_to_search_on: &'ctx [String]) -> Result<()> { - if attributes_to_search_on.contains(&String::from("*")) { - return Ok(()); - } - - let fids_map = self.index.fields_ids_map(self.txn)?; + let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?; let searchable_names = self.index.searchable_fields_and_weights(self.txn)?; let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?; + let mut wildcard = false; + let mut restricted_fids = SearchableFids::default(); for field_name in attributes_to_search_on { + if field_name == "*" { + wildcard = true; + // we cannot early exit as we want to returns error in case of unknown fields + continue; + } let searchable_weight = searchable_names.iter().find(|(name, _, _)| name == field_name); let (fid, weight) = match searchable_weight { // The Field id exist and the field is searchable Some((_name, fid, weight)) => (*fid, *weight), + // The field is not searchable but the user didn't define any searchable attributes + None if user_defined_searchable.is_none() => continue, // The field is not searchable => User error None => { let (valid_fields, hidden_fields) = self.index.remove_hidden_fields( @@ -136,7 +140,16 @@ impl<'ctx> SearchContext<'ctx> { }; } - self.searchable_fids = restricted_fids; + if wildcard { + let (exact, tolerant) = searchable_names + .iter() + .map(|(_name, fid, weight)| (*fid, *weight)) + .partition(|(fid, _weight)| exact_attributes_ids.contains(fid)); + + self.searchable_fids = SearchableFids { tolerant, exact }; + } else { + self.searchable_fids = restricted_fids; + } Ok(()) } diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index cf65249de..e10f2fbab 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -7,12 +7,12 @@ use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id; use crate::search::new::SearchContext; -use crate::Result; +use crate::{FieldId, Result}; #[derive(Clone, PartialEq, Eq, Hash)] pub struct FidCondition { term: LocatedQueryTermSubset, - fid: u16, + fid: Option, } pub enum FidGraph {} @@ -26,13 +26,16 @@ impl RankingRuleGraphTrait for FidGraph { universe: &RoaringBitmap, ) -> Result { let FidCondition { term, .. } = condition; - // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument - let mut docids = compute_query_term_subset_docids_within_field_id( - ctx, - &term.term_subset, - condition.fid, - )?; - docids &= universe; + + let docids = if let Some(fid) = condition.fid { + // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument + let mut docids = + compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?; + docids &= universe; + docids + } else { + RoaringBitmap::new() + }; Ok(ComputedCondition { docids, @@ -68,24 +71,27 @@ impl RankingRuleGraphTrait for FidGraph { all_fields.extend(fields); } + let weights_map = ctx.index.fieldids_weights_map(ctx.txn)?; + let mut edges = vec![]; for fid in all_fields.iter().copied() { + let weight = weights_map.weight(fid).unwrap(); edges.push(( - fid as u32 * term.term_ids.len() as u32, - conditions_interner.insert(FidCondition { term: term.clone(), fid }), + weight as u32 * term.term_ids.len() as u32, + conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }), )); } // always lookup the max_fid if we don't already and add an artificial condition for max scoring - let max_fid: Option = ctx.index.searchable_fields_ids(ctx.txn)?.into_iter().max(); + let max_weight: Option = weights_map.max_weight(); - if let Some(max_fid) = max_fid { - if !all_fields.contains(&max_fid) { + if let Some(max_weight) = max_weight { + if !all_fields.contains(&max_weight) { edges.push(( - max_fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. + max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. conditions_interner.insert(FidCondition { term: term.clone(), // TODO remove this ugly clone - fid: max_fid, + fid: None, }), )); } diff --git a/milli/src/search/new/tests/attribute_fid.rs b/milli/src/search/new/tests/attribute_fid.rs index 61b0a743b..c595887ba 100644 --- a/milli/src/search/new/tests/attribute_fid.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -132,17 +132,17 @@ fn test_attribute_fid_simple() { fn test_attribute_fid_ngrams() { let index = create_index(); db_snap!(index, fields_ids_map, @r###" - 0 title | - 1 description | - 2 plot | - 3 id | + 0 id | + 1 title | + 2 description | + 3 plot | "###); db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###); db_snap!(index, fieldids_weights_map, @r###" fid weight - 0 0 | - 1 1 | - 2 2 | + 1 0 | + 2 1 | + 3 2 | "###); let txn = index.read_txn().unwrap(); diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap new file mode 100644 index 000000000..930a21626 --- /dev/null +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap @@ -0,0 +1,244 @@ +--- +source: milli/src/search/new/tests/attribute_fid.rs +expression: "format!(\"{document_ids_scores:#?}\")" +--- +[ + ( + 2, + [ + Fid( + Rank { + rank: 19, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 91, + max_rank: 91, + }, + ), + ], + ), + ( + 6, + [ + Fid( + Rank { + rank: 15, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 5, + [ + Fid( + Rank { + rank: 14, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 79, + max_rank: 91, + }, + ), + ], + ), + ( + 4, + [ + Fid( + Rank { + rank: 13, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 77, + max_rank: 91, + }, + ), + ], + ), + ( + 3, + [ + Fid( + Rank { + rank: 12, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 83, + max_rank: 91, + }, + ), + ], + ), + ( + 9, + [ + Fid( + Rank { + rank: 11, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 75, + max_rank: 91, + }, + ), + ], + ), + ( + 8, + [ + Fid( + Rank { + rank: 10, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 79, + max_rank: 91, + }, + ), + ], + ), + ( + 7, + [ + Fid( + Rank { + rank: 10, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 73, + max_rank: 91, + }, + ), + ], + ), + ( + 11, + [ + Fid( + Rank { + rank: 7, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 77, + max_rank: 91, + }, + ), + ], + ), + ( + 10, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 13, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 12, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 78, + max_rank: 91, + }, + ), + ], + ), + ( + 14, + [ + Fid( + Rank { + rank: 5, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 75, + max_rank: 91, + }, + ), + ], + ), + ( + 0, + [ + Fid( + Rank { + rank: 1, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 91, + max_rank: 91, + }, + ), + ], + ), +] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 6875e6f47..2e8ac157c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -12,7 +12,6 @@ use time::OffsetDateTime; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; -use crate::documents::FieldIdMapper; use crate::error::UserError; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::order_by_map::OrderByMap; @@ -1562,8 +1561,9 @@ mod tests { // we must find the appropriate document. let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap(); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); + let fid_map = index.fields_ids_map(&rtxn).unwrap(); assert_eq!(documents.len(), 1); - assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); drop(rtxn); // We change the searchable fields to be the "name" field only. @@ -1575,12 +1575,16 @@ mod tests { // Check that the searchable field have been reset and documents are found now. let rtxn = index.read_txn().unwrap(); + let fid_map = index.fields_ids_map(&rtxn).unwrap(); + let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap(); + snapshot!(format!("{user_defined_searchable_fields:?}"), @"None"); + // the searchable fields should contain all the fields let searchable_fields = index.searchable_fields(&rtxn).unwrap(); - snapshot!(format!("{searchable_fields:?}"), @r###"["name", "id", "age"]"###); + snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###); let result = index.search(&rtxn).query("23").execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); - assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); } #[test] From 9fffb8e83dd13cbe2d88655a258e5391b648d01e Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 14 May 2024 17:20:57 +0200 Subject: [PATCH 08/14] make clippy happy --- index-scheduler/src/utils.rs | 4 ++-- meilisearch/src/search.rs | 2 +- milli/src/fieldids_weights_map.rs | 2 +- milli/src/index.rs | 8 ++++---- milli/src/search/new/bucket_sort.rs | 4 ++-- milli/src/search/new/mod.rs | 3 +-- 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/index-scheduler/src/utils.rs b/index-scheduler/src/utils.rs index 9f6f90db2..260ff6ee4 100644 --- a/index-scheduler/src/utils.rs +++ b/index-scheduler/src/utils.rs @@ -272,9 +272,9 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) { } for index_uid in index_uids { if index_uid == swap.0 { - *index_uid = swap.1.to_owned(); + swap.1.clone_into(index_uid); } else if index_uid == swap.1 { - *index_uid = swap.0.to_owned(); + swap.0.clone_into(index_uid); } } } diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index a383434a2..34ebe463d 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -730,7 +730,7 @@ pub fn perform_search( let mut ids = BTreeSet::new(); for attr in attrs { if attr == "*" { - ids = displayed_ids.clone(); + ids.clone_from(&displayed_ids); break; } diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index bead160e9..fdfe8fba2 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -26,7 +26,7 @@ impl FieldidsWeightsMap { self.map.values().copied().max() } - pub fn ids<'a>(&'a self) -> impl Iterator + 'a { + pub fn ids(&self) -> impl Iterator + '_ { self.map.keys().copied() } } diff --git a/milli/src/index.rs b/milli/src/index.rs index 49f78f3cd..7fe9da0ff 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -632,7 +632,7 @@ impl Index { // Special case if there is no user defined fields. // Then the whole field id map is marked as searchable. if user_fields.is_none() { - let mut weights = self.fieldids_weights_map(&wtxn)?; + let mut weights = self.fieldids_weights_map(wtxn)?; let mut searchable = Vec::new(); for (weight, (fid, name)) in fields_ids_map.iter().enumerate() { searchable.push(name); @@ -648,7 +648,7 @@ impl Index { // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; - let mut weights = self.fieldids_weights_map(&wtxn)?; + let mut weights = self.fieldids_weights_map(wtxn)?; // Now we generate the real searchable fields: // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. @@ -666,7 +666,7 @@ impl Index { let weight: u16 = weight.try_into().map_err(|_| UserError::AttributeLimitReached)?; - weights.insert(id, weight as u16); + weights.insert(id, weight); } } } @@ -701,7 +701,7 @@ impl Index { self.main .remap_types::>>() .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)? - .map(|fields| Ok(fields.into_iter().map(|field| Cow::Borrowed(field)).collect())) + .map(|fields| Ok(fields.into_iter().map(Cow::Borrowed).collect())) .unwrap_or_else(|| { Ok(self .fields_ids_map(rtxn)? diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index 521fcb983..e9bc5449d 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -101,7 +101,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( let mut ranking_rule_universes: Vec = vec![RoaringBitmap::default(); ranking_rules_len]; - ranking_rule_universes[0] = universe.clone(); + ranking_rule_universes[0].clone_from(universe); let mut cur_ranking_rule_index = 0; /// Finish iterating over the current ranking rule, yielding @@ -232,7 +232,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( } cur_ranking_rule_index += 1; - ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); + ranking_rule_universes[cur_ranking_rule_index].clone_from(&next_bucket.candidates); logger.start_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index].as_ref(), diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 9a2ff5b02..b7514cbb5 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -178,8 +178,7 @@ pub struct SearchableFids { impl SearchableFids { pub fn contains(&self, fid: &FieldId) -> bool { - self.tolerant.iter().find(|(id, _)| id == fid).is_some() - || self.exact.iter().find(|(id, _)| id == fid).is_some() + self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid) } } From 7ec4e2a3fbb89821f3a153a9adee05e405183720 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 15 May 2024 15:02:26 +0200 Subject: [PATCH 09/14] apply all style review comments --- meilisearch/src/search_queue.rs | 12 +++++---- milli/src/error.rs | 2 ++ milli/src/fieldids_weights_map.rs | 9 +++++++ milli/src/index.rs | 27 ++++++++++--------- .../search/new/ranking_rule_graph/fid/mod.rs | 11 ++++---- milli/src/update/settings.rs | 20 +++++--------- 6 files changed, 45 insertions(+), 36 deletions(-) diff --git a/meilisearch/src/search_queue.rs b/meilisearch/src/search_queue.rs index 0fe9a5a53..415da0c15 100644 --- a/meilisearch/src/search_queue.rs +++ b/meilisearch/src/search_queue.rs @@ -85,11 +85,13 @@ impl SearchQueue { }, search_request = receive_new_searches.recv() => { - if search_request.is_none() { - continue; - } - // this unwrap is safe because we're sure the `SearchQueue` still lives somewhere in actix-web - let search_request = search_request.unwrap(); + let search_request = match search_request { + Some(search_request) => search_request, + // This should never happen while actix-web is running, but it's not a reason to crash + // and it can generate a lot of noise in the tests. + None => continue, + }; + if searches_running < usize::from(parallelism) && queue.is_empty() { searches_running += 1; // if the search requests die it's not a hard error on our side diff --git a/milli/src/error.rs b/milli/src/error.rs index e4550de1f..009781fcf 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -32,6 +32,8 @@ pub enum InternalError { DatabaseClosing, #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))] DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, + #[error("Missing {key} in the fieldids weights mapping.")] + FieldidsWeightsMapMissingEntry { key: FieldId }, #[error(transparent)] FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry), #[error("Missing {key} in the field id mapping.")] diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index fdfe8fba2..72720a02a 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -1,3 +1,5 @@ +//! The fieldids weights map is in charge of storing linking the searchable fields with their weights. + use std::collections::HashMap; use serde::{Deserialize, Serialize}; @@ -10,22 +12,29 @@ pub struct FieldidsWeightsMap { } impl FieldidsWeightsMap { + /// Insert a field id -> weigth into the map. + /// If the map did not have this key present, `None` is returned. + /// If the map did have this key present, the value is updated, and the old value is returned. pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option { self.map.insert(fid, weight) } + /// Removes a field id from the map, returning the associated weight previously in the map. pub fn remove(&mut self, fid: FieldId) -> Option { self.map.remove(&fid) } + /// Returns weight corresponding to the key. pub fn weight(&self, fid: FieldId) -> Option { self.map.get(&fid).copied() } + /// Returns highest weight contained in the map if any. pub fn max_weight(&self) -> Option { self.map.values().copied().max() } + /// Return an iterator visiting all field ids in arbitrary order. pub fn ids(&self) -> impl Iterator + '_ { self.map.keys().copied() } diff --git a/milli/src/index.rs b/milli/src/index.rs index 7fe9da0ff..c565cdd5b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -26,9 +26,9 @@ use crate::proximity::ProximityPrecision; use crate::vector::EmbeddingConfig; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, - FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, FieldidsWeightsMap, - GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, - Weight, BEU16, BEU32, BEU64, + FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, + FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, + Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -446,22 +446,25 @@ impl Index { pub fn searchable_fields_and_weights<'a>( &self, rtxn: &'a RoTxn, - ) -> heed::Result, FieldId, Weight)>> { + ) -> Result, FieldId, Weight)>> { let fid_map = self.fields_ids_map(rtxn)?; let weight_map = self.fieldids_weights_map(rtxn)?; let searchable = self.searchable_fields(rtxn)?; - Ok(searchable + searchable .into_iter() - .map(|field| { - // the searchable attributes are a subset of the field id map - let fid = fid_map.id(&field).unwrap(); - // all the searchable fields have a weight - let weight = weight_map.weight(fid).unwrap(); + .map(|field| -> Result<_> { + let fid = fid_map.id(&field).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: field.to_string(), + process: "searchable_fields_and_weights", + })?; + let weight = weight_map + .weight(fid) + .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?; - (field, fid, weight) + Ok((field, fid, weight)) }) - .collect()) + .collect() } /* geo rtree */ diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index e10f2fbab..a4a08ea46 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -7,7 +7,7 @@ use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id; use crate::search::new::SearchContext; -use crate::{FieldId, Result}; +use crate::{FieldId, InternalError, Result}; #[derive(Clone, PartialEq, Eq, Hash)] pub struct FidCondition { @@ -29,10 +29,9 @@ impl RankingRuleGraphTrait for FidGraph { let docids = if let Some(fid) = condition.fid { // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument - let mut docids = + let docids = compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?; - docids &= universe; - docids + docids & universe } else { RoaringBitmap::new() }; @@ -75,7 +74,9 @@ impl RankingRuleGraphTrait for FidGraph { let mut edges = vec![]; for fid in all_fields.iter().copied() { - let weight = weights_map.weight(fid).unwrap(); + let weight = weights_map + .weight(fid) + .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?; edges.push(( weight as u32 * term.term_ids.len() as u32, conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }), diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 2e8ac157c..c66148813 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -475,33 +475,25 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { return Ok(false); } - // every time the searchable attributes are updated, we need to update the - // ids for any settings that uses the facets. (distinct_fields, filterable_fields). - let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - // Since we're updating the settings we can only add new fields at the end of the field id map - let mut new_fields_ids_map = old_fields_ids_map.clone(); - let names = fields - .iter() - // fields are deduplicated, only the first occurrence is taken into account - .unique() - .map(String::as_str) - .collect::>(); + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + // fields are deduplicated, only the first occurrence is taken into account + let names = fields.iter().unique().map(String::as_str).collect::>(); // Add all the searchable attributes to the field map, and then add the // remaining fields from the old field map to the new one for name in names.iter() { // The fields ids map won't change the field id of already present elements thus only the // new fields will be inserted. - new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; + fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, Some(&names), - &new_fields_ids_map, + &fields_ids_map, )?; - self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; Ok(true) } Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), From ad4d8502b3583f734f7508dea2e14656a8dea946 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 15 May 2024 17:16:10 +0200 Subject: [PATCH 10/14] stops storing the whole fieldids weights map when no searchable are defined --- milli/src/fieldids_weights_map.rs | 9 ++++++- milli/src/index.rs | 36 ++++++++++++---------------- milli/src/update/settings.rs | 40 +++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 29 deletions(-) diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index 72720a02a..5ca2a6146 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use crate::{FieldId, Weight}; +use crate::{FieldId, FieldsIdsMap, Weight}; #[derive(Debug, Default, Serialize, Deserialize)] pub struct FieldidsWeightsMap { @@ -19,6 +19,13 @@ impl FieldidsWeightsMap { self.map.insert(fid, weight) } + /// Create the map from the fields ids maps. + /// Should only be called in the case there are NO searchable attributes. + /// The weights and the fields ids will have the same values. + pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { + FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, fid)).collect() } + } + /// Removes a field id from the map, returning the associated weight previously in the map. pub fn remove(&mut self, fid: FieldId) -> Option { self.map.remove(&fid) diff --git a/milli/src/index.rs b/milli/src/index.rs index c565cdd5b..36f0b339e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -436,11 +436,20 @@ impl Index { /// Get the fieldids weights map which associates the field ids to their weights pub fn fieldids_weights_map(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self - .main + self.main .remap_types::>() .get(rtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)? - .unwrap_or_default()) + .map(Ok) + .unwrap_or_else(|| { + Ok(FieldidsWeightsMap::from_field_id_map_without_searchable( + &self.fields_ids_map(rtxn)?, + )) + }) + } + + /// Delete the fieldsids weights map + pub fn delete_fieldids_weights_map(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.remap_key_type::().delete(wtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY) } pub fn searchable_fields_and_weights<'a>( @@ -629,29 +638,13 @@ impl Index { pub(crate) fn put_all_searchable_fields_from_fields_ids_map( &self, wtxn: &mut RwTxn, - user_fields: Option<&[&str]>, + user_fields: &[&str], fields_ids_map: &FieldsIdsMap, ) -> Result<()> { - // Special case if there is no user defined fields. - // Then the whole field id map is marked as searchable. - if user_fields.is_none() { - let mut weights = self.fieldids_weights_map(wtxn)?; - let mut searchable = Vec::new(); - for (weight, (fid, name)) in fields_ids_map.iter().enumerate() { - searchable.push(name); - weights.insert(fid, weight as u16); - } - self.put_searchable_fields(wtxn, &searchable)?; - self.put_fieldids_weights_map(wtxn, &weights)?; - return Ok(()); - } - - let user_fields = user_fields.unwrap(); - // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; - let mut weights = self.fieldids_weights_map(wtxn)?; + let mut weights = FieldidsWeightsMap::default(); // Now we generate the real searchable fields: // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. @@ -682,6 +675,7 @@ impl Index { pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { let did_delete_searchable = self.delete_searchable_fields(wtxn)?; let did_delete_user_defined = self.delete_user_defined_searchable_fields(wtxn)?; + self.delete_fieldids_weights_map(wtxn)?; Ok(did_delete_searchable || did_delete_user_defined) } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c66148813..046644dc4 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -490,7 +490,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, - Some(&names), + &names, &fields_ids_map, )?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -1228,11 +1228,13 @@ impl InnerIndexSettings { .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::>()); // in case new fields were introduced we're going to recreate the searchable fields. - index.put_all_searchable_fields_from_fields_ids_map( - wtxn, - searchable_fields.as_deref(), - &self.fields_ids_map, - )?; + if let Some(searchable_fields) = searchable_fields { + index.put_all_searchable_fields_from_fields_ids_map( + wtxn, + &searchable_fields, + &self.fields_ids_map, + )?; + } let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; self.searchable_fields_ids = searchable_fields_ids; @@ -1513,7 +1515,7 @@ mod tests { use crate::error::Error; use crate::index::tests::TempIndex; use crate::update::ClearDocuments; - use crate::{Criterion, Filter, SearchResult}; + use crate::{db_snap, Criterion, Filter, SearchResult}; #[test] fn set_and_reset_searchable_fields() { @@ -1542,6 +1544,17 @@ mod tests { wtxn.commit().unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 name | + 2 age | + "###); + db_snap!(index, searchable_fields, @r###"["name"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 1 0 | + "###); + // Check that the searchable field is correctly set to "name" only. let rtxn = index.read_txn().unwrap(); // When we search for something that is not in @@ -1565,6 +1578,19 @@ mod tests { }) .unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 name | + 2 age | + "###); + db_snap!(index, searchable_fields, @r###"["id", "name", "age"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 1 | + 2 2 | + "###); + // Check that the searchable field have been reset and documents are found now. let rtxn = index.read_txn().unwrap(); let fid_map = index.fields_ids_map(&rtxn).unwrap(); From 5542f1d9f11c2c6d7ad691af11ed1b5177a13168 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 15 May 2024 18:00:39 +0200 Subject: [PATCH 11/14] get back to what we were doingb efore in the DB cache and with the restricted field id --- milli/src/search/new/db_cache.rs | 140 ++++++++++++++++++++----------- milli/src/search/new/mod.rs | 19 ++--- 2 files changed, 99 insertions(+), 60 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 4985f55e9..4fa0765e0 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -159,36 +159,58 @@ impl<'ctx> SearchContext<'ctx> { /// Retrieve or insert the given value in the `word_docids` database. fn get_db_word_docids(&mut self, word: Interned) -> Result> { - let interned = self.word_interner.get(word).as_str(); - let keys: Vec<_> = - self.searchable_fids.tolerant.iter().map(|(fid, _weight)| (interned, *fid)).collect(); + match &self.restricted_fids { + Some(restricted_fids) => { + let interned = self.word_interner.get(word).as_str(); + let keys: Vec<_> = + restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - &keys[..], - &mut self.db_cache.word_docids, - self.index.word_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + &keys[..], + &mut self.db_cache.word_docids, + self.index.word_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) + } + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + self.word_interner.get(word).as_str(), + &mut self.db_cache.word_docids, + self.index.word_docids.remap_data_type::(), + ), + } } fn get_db_exact_word_docids( &mut self, word: Interned, ) -> Result> { - let interned = self.word_interner.get(word).as_str(); - let keys: Vec<_> = - self.searchable_fids.exact.iter().map(|(fid, _weight)| (interned, *fid)).collect(); + match &self.restricted_fids { + Some(restricted_fids) => { + let interned = self.word_interner.get(word).as_str(); + let keys: Vec<_> = + restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - word, - &keys[..], - &mut self.db_cache.exact_word_docids, - self.index.word_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + &keys[..], + &mut self.db_cache.exact_word_docids, + self.index.word_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) + } + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + self.word_interner.get(word).as_str(), + &mut self.db_cache.exact_word_docids, + self.index.exact_word_docids.remap_data_type::(), + ), + } } pub fn word_prefix_docids(&mut self, prefix: Word) -> Result> { @@ -216,36 +238,58 @@ impl<'ctx> SearchContext<'ctx> { &mut self, prefix: Interned, ) -> Result> { - let interned = self.word_interner.get(prefix).as_str(); - let keys: Vec<_> = - self.searchable_fids.tolerant.iter().map(|(fid, _weight)| (interned, *fid)).collect(); + match &self.restricted_fids { + Some(restricted_fids) => { + let interned = self.word_interner.get(prefix).as_str(); + let keys: Vec<_> = + restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - &keys[..], - &mut self.db_cache.word_prefix_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + &keys[..], + &mut self.db_cache.word_prefix_docids, + self.index.word_prefix_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) + } + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + self.word_interner.get(prefix).as_str(), + &mut self.db_cache.word_prefix_docids, + self.index.word_prefix_docids.remap_data_type::(), + ), + } } fn get_db_exact_word_prefix_docids( &mut self, prefix: Interned, ) -> Result> { - let interned = self.word_interner.get(prefix).as_str(); - let keys: Vec<_> = - self.searchable_fids.exact.iter().map(|(fid, _weight)| (interned, *fid)).collect(); + match &self.restricted_fids { + Some(restricted_fids) => { + let interned = self.word_interner.get(prefix).as_str(); + let keys: Vec<_> = + restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( - self.txn, - prefix, - &keys[..], - &mut self.db_cache.exact_word_prefix_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), - merge_cbo_roaring_bitmaps, - ) + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + &keys[..], + &mut self.db_cache.exact_word_prefix_docids, + self.index.word_prefix_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) + } + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + self.word_interner.get(prefix).as_str(), + &mut self.db_cache.exact_word_prefix_docids, + self.index.exact_word_prefix_docids.remap_data_type::(), + ), + } } pub fn get_db_word_pair_proximity_docids( @@ -421,8 +465,8 @@ impl<'ctx> SearchContext<'ctx> { word: Interned, fid: u16, ) -> Result> { - // if the requested fid isn't in the list of searchable, return None. - if !self.searchable_fids.contains(&fid) { + // if the requested fid isn't in the restricted list, return None. + if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { return Ok(None); } @@ -440,8 +484,8 @@ impl<'ctx> SearchContext<'ctx> { word_prefix: Interned, fid: u16, ) -> Result> { - // if the requested fid isn't in the searchable list, return None. - if !self.searchable_fids.contains(&fid) { + // if the requested fid isn't in the restricted list, return None. + if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { return Ok(None); } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index b7514cbb5..2cea96fce 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -66,7 +66,7 @@ pub struct SearchContext<'ctx> { pub phrase_interner: DedupInterner, pub term_interner: Interner, pub phrase_docids: PhraseDocIdsCache, - pub searchable_fids: SearchableFids, + pub restricted_fids: Option, } impl<'ctx> SearchContext<'ctx> { @@ -92,7 +92,7 @@ impl<'ctx> SearchContext<'ctx> { phrase_interner: <_>::default(), term_interner: <_>::default(), phrase_docids: <_>::default(), - searchable_fids: SearchableFids { tolerant, exact }, + restricted_fids: None, }) } @@ -103,7 +103,7 @@ impl<'ctx> SearchContext<'ctx> { let mut wildcard = false; - let mut restricted_fids = SearchableFids::default(); + let mut restricted_fids = RestrictedFids::default(); for field_name in attributes_to_search_on { if field_name == "*" { wildcard = true; @@ -141,14 +141,9 @@ impl<'ctx> SearchContext<'ctx> { } if wildcard { - let (exact, tolerant) = searchable_names - .iter() - .map(|(_name, fid, weight)| (*fid, *weight)) - .partition(|(fid, _weight)| exact_attributes_ids.contains(fid)); - - self.searchable_fids = SearchableFids { tolerant, exact }; + self.restricted_fids = None; } else { - self.searchable_fids = restricted_fids; + self.restricted_fids = Some(restricted_fids); } Ok(()) @@ -171,12 +166,12 @@ impl Word { } #[derive(Debug, Clone, Default)] -pub struct SearchableFids { +pub struct RestrictedFids { pub tolerant: Vec<(FieldId, Weight)>, pub exact: Vec<(FieldId, Weight)>, } -impl SearchableFids { +impl RestrictedFids { pub fn contains(&self, fid: &FieldId) -> bool { self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid) } From c78a2fa4f5dfdf9dc487d32ce7df6a52a2b02c64 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 15 May 2024 18:04:42 +0200 Subject: [PATCH 12/14] rename method and variable around the attributes to search on feature --- milli/src/search/mod.rs | 2 +- milli/src/search/new/mod.rs | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7427db3a1..ca0eda49e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -158,7 +158,7 @@ impl<'a> Search<'a> { let mut ctx = SearchContext::new(self.index, self.rtxn)?; if let Some(searchable_attributes) = self.searchable_attributes { - ctx.searchable_attributes(searchable_attributes)?; + ctx.attributes_to_search_on(searchable_attributes)?; } let universe = filtered_universe(&ctx, &self.filter)?; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 2cea96fce..5e4c2f829 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -96,9 +96,12 @@ impl<'ctx> SearchContext<'ctx> { }) } - pub fn searchable_attributes(&mut self, attributes_to_search_on: &'ctx [String]) -> Result<()> { + pub fn attributes_to_search_on( + &mut self, + attributes_to_search_on: &'ctx [String], + ) -> Result<()> { let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?; - let searchable_names = self.index.searchable_fields_and_weights(self.txn)?; + let searchable_fields_weights = self.index.searchable_fields_and_weights(self.txn)?; let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?; let mut wildcard = false; @@ -110,7 +113,8 @@ impl<'ctx> SearchContext<'ctx> { // we cannot early exit as we want to returns error in case of unknown fields continue; } - let searchable_weight = searchable_names.iter().find(|(name, _, _)| name == field_name); + let searchable_weight = + searchable_fields_weights.iter().find(|(name, _, _)| name == field_name); let (fid, weight) = match searchable_weight { // The Field id exist and the field is searchable Some((_name, fid, weight)) => (*fid, *weight), @@ -120,7 +124,7 @@ impl<'ctx> SearchContext<'ctx> { None => { let (valid_fields, hidden_fields) = self.index.remove_hidden_fields( self.txn, - searchable_names.iter().map(|(name, _, _)| name), + searchable_fields_weights.iter().map(|(name, _, _)| name), )?; let field = field_name.to_string(); From f2d0a59f1da3a83875e57a38fb5c45e0af993b3f Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 16 May 2024 01:06:33 +0200 Subject: [PATCH 13/14] when no searchable attributes are defined, makes all the weight equals to zero --- meilisearch/tests/search/hybrid.rs | 8 ++++---- meilisearch/tests/search/mod.rs | 2 +- meilisearch/tests/search/restrict_searchable.rs | 4 ++-- milli/src/fieldids_weights_map.rs | 4 ++-- milli/src/index.rs | 6 +++--- milli/src/update/settings.rs | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 68ae4c0aa..67f7909b9 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -85,8 +85,8 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); - snapshot!(response["semanticHitCount"], @"1"); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index .search_post( @@ -331,7 +331,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.8848484848484849}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => @@ -374,6 +374,6 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index b4350f686..f601e2b03 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -921,7 +921,7 @@ async fn test_score_details() { "order": 3, "attributeRankingOrderScore": 1.0, "queryWordDistanceScore": 0.8095238095238095, - "score": 0.9727891156462584 + "score": 0.8095238095238095 }, "exactness": { "order": 4, diff --git a/meilisearch/tests/search/restrict_searchable.rs b/meilisearch/tests/search/restrict_searchable.rs index 7bbdca38f..f52efa1f4 100644 --- a/meilisearch/tests/search/restrict_searchable.rs +++ b/meilisearch/tests/search/restrict_searchable.rs @@ -285,10 +285,10 @@ async fn attributes_ranking_rule_order() { @r###" [ { - "id": "2" + "id": "1" }, { - "id": "1" + "id": "2" } ] "### diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index 5ca2a6146..a737632a4 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -21,9 +21,9 @@ impl FieldidsWeightsMap { /// Create the map from the fields ids maps. /// Should only be called in the case there are NO searchable attributes. - /// The weights and the fields ids will have the same values. + /// All the fields will be inserted in the order of the fields ids map with a weight of 0. pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { - FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, fid)).collect() } + FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() } } /// Removes a field id from the map, returning the associated weight previously in the map. diff --git a/milli/src/index.rs b/milli/src/index.rs index 36f0b339e..42b9cb111 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2492,7 +2492,7 @@ pub(crate) mod tests { db_snap!(index, fieldids_weights_map, @r###" fid weight 0 0 | - 1 1 | + 1 0 | "###); index.delete_documents(Default::default()); @@ -2512,7 +2512,7 @@ pub(crate) mod tests { db_snap!(index, fieldids_weights_map, @r###" fid weight 0 0 | - 1 1 | + 1 0 | "###); index @@ -2537,7 +2537,7 @@ pub(crate) mod tests { db_snap!(index, fieldids_weights_map, @r###" fid weight 0 0 | - 1 1 | + 1 0 | "###); let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 046644dc4..0599bb9d8 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1587,8 +1587,8 @@ mod tests { db_snap!(index, fieldids_weights_map, @r###" fid weight 0 0 | - 1 1 | - 2 2 | + 1 0 | + 2 0 | "###); // Check that the searchable field have been reset and documents are found now. From 673b6e1dc0f9ad6d688c5f8da7295d1f4e041c5f Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 16 May 2024 11:28:14 +0200 Subject: [PATCH 14/14] fix a flaky test --- meilisearch/tests/snapshot/mod.rs | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/meilisearch/tests/snapshot/mod.rs b/meilisearch/tests/snapshot/mod.rs index 1312aa9ca..67e80f45b 100644 --- a/meilisearch/tests/snapshot/mod.rs +++ b/meilisearch/tests/snapshot/mod.rs @@ -1,6 +1,5 @@ use std::time::Duration; -use actix_rt::time::sleep; use meili_snap::{json_string, snapshot}; use meilisearch::option::ScheduleSnapshot; use meilisearch::Opt; @@ -53,11 +52,29 @@ async fn perform_snapshot() { index.load_test_set().await; - server.index("test1").create(Some("prim")).await; + let (task, code) = server.index("test1").create(Some("prim")).await; + meili_snap::snapshot!(code, @"202 Accepted"); - index.wait_task(2).await; + index.wait_task(task.uid()).await; - sleep(Duration::from_secs(2)).await; + // wait for the _next task_ to process, aka the snapshot that should be enqueued at some point + + println!("waited for the next task to finish"); + let now = std::time::Instant::now(); + let next_task = task.uid() + 1; + loop { + let (value, code) = index.get_task(next_task).await; + dbg!(&value); + if code != 404 && value["status"].as_str() == Some("succeeded") { + break; + } + + if now.elapsed() > Duration::from_secs(30) { + panic!("The snapshot didn't schedule in 30s even though it was supposed to be scheduled every 2s: {}", + serde_json::to_string_pretty(&value).unwrap() + ); + } + } let temp = tempfile::tempdir().unwrap();