diff --git a/index-scheduler/src/utils.rs b/index-scheduler/src/utils.rs index 9f6f90db2..260ff6ee4 100644 --- a/index-scheduler/src/utils.rs +++ b/index-scheduler/src/utils.rs @@ -272,9 +272,9 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) { } for index_uid in index_uids { if index_uid == swap.0 { - *index_uid = swap.1.to_owned(); + swap.1.clone_into(index_uid); } else if index_uid == swap.1 { - *index_uid = swap.0.to_owned(); + swap.0.clone_into(index_uid); } } } diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index a383434a2..34ebe463d 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -730,7 +730,7 @@ pub fn perform_search( let mut ids = BTreeSet::new(); for attr in attrs { if attr == "*" { - ids = displayed_ids.clone(); + ids.clone_from(&displayed_ids); break; } diff --git a/meilisearch/src/search_queue.rs b/meilisearch/src/search_queue.rs index 6d5044d20..415da0c15 100644 --- a/meilisearch/src/search_queue.rs +++ b/meilisearch/src/search_queue.rs @@ -85,8 +85,13 @@ impl SearchQueue { }, search_request = receive_new_searches.recv() => { - // this unwrap is safe because we're sure the `SearchQueue` still lives somewhere in actix-web - let search_request = search_request.unwrap(); + let search_request = match search_request { + Some(search_request) => search_request, + // This should never happen while actix-web is running, but it's not a reason to crash + // and it can generate a lot of noise in the tests. + None => continue, + }; + if searches_running < usize::from(parallelism) && queue.is_empty() { searches_running += 1; // if the search requests die it's not a hard error on our side diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 68ae4c0aa..67f7909b9 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -85,8 +85,8 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); - snapshot!(response["semanticHitCount"], @"1"); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index .search_post( @@ -331,7 +331,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.8848484848484849}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => @@ -374,6 +374,6 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index b4350f686..f601e2b03 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -921,7 +921,7 @@ async fn test_score_details() { "order": 3, "attributeRankingOrderScore": 1.0, "queryWordDistanceScore": 0.8095238095238095, - "score": 0.9727891156462584 + "score": 0.8095238095238095 }, "exactness": { "order": 4, diff --git a/meilisearch/tests/search/restrict_searchable.rs b/meilisearch/tests/search/restrict_searchable.rs index 7bbdca38f..f52efa1f4 100644 --- a/meilisearch/tests/search/restrict_searchable.rs +++ b/meilisearch/tests/search/restrict_searchable.rs @@ -285,10 +285,10 @@ async fn attributes_ranking_rule_order() { @r###" [ { - "id": "2" + "id": "1" }, { - "id": "1" + "id": "2" } ] "### diff --git a/meilisearch/tests/snapshot/mod.rs b/meilisearch/tests/snapshot/mod.rs index 1312aa9ca..67e80f45b 100644 --- a/meilisearch/tests/snapshot/mod.rs +++ b/meilisearch/tests/snapshot/mod.rs @@ -1,6 +1,5 @@ use std::time::Duration; -use actix_rt::time::sleep; use meili_snap::{json_string, snapshot}; use meilisearch::option::ScheduleSnapshot; use meilisearch::Opt; @@ -53,11 +52,29 @@ async fn perform_snapshot() { index.load_test_set().await; - server.index("test1").create(Some("prim")).await; + let (task, code) = server.index("test1").create(Some("prim")).await; + meili_snap::snapshot!(code, @"202 Accepted"); - index.wait_task(2).await; + index.wait_task(task.uid()).await; - sleep(Duration::from_secs(2)).await; + // wait for the _next task_ to process, aka the snapshot that should be enqueued at some point + + println!("waited for the next task to finish"); + let now = std::time::Instant::now(); + let next_task = task.uid() + 1; + loop { + let (value, code) = index.get_task(next_task).await; + dbg!(&value); + if code != 404 && value["status"].as_str() == Some("succeeded") { + break; + } + + if now.elapsed() > Duration::from_secs(30) { + panic!("The snapshot didn't schedule in 30s even though it was supposed to be scheduled every 2s: {}", + serde_json::to_string_pretty(&value).unwrap() + ); + } + } let temp = tempfile::tempdir().unwrap(); diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 8640acf42..3d10ec599 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -48,7 +48,7 @@ fn main() -> Result<(), Box> { let start = Instant::now(); - let mut ctx = SearchContext::new(&index, &txn); + let mut ctx = SearchContext::new(&index, &txn)?; let universe = filtered_universe(&ctx, &None)?; let docs = execute_search( diff --git a/milli/src/error.rs b/milli/src/error.rs index e4550de1f..009781fcf 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -32,6 +32,8 @@ pub enum InternalError { DatabaseClosing, #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))] DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, + #[error("Missing {key} in the fieldids weights mapping.")] + FieldidsWeightsMapMissingEntry { key: FieldId }, #[error(transparent)] FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry), #[error("Missing {key} in the field id mapping.")] diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs new file mode 100644 index 000000000..a737632a4 --- /dev/null +++ b/milli/src/fieldids_weights_map.rs @@ -0,0 +1,48 @@ +//! The fieldids weights map is in charge of storing linking the searchable fields with their weights. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::{FieldId, FieldsIdsMap, Weight}; + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct FieldidsWeightsMap { + map: HashMap, +} + +impl FieldidsWeightsMap { + /// Insert a field id -> weigth into the map. + /// If the map did not have this key present, `None` is returned. + /// If the map did have this key present, the value is updated, and the old value is returned. + pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option { + self.map.insert(fid, weight) + } + + /// Create the map from the fields ids maps. + /// Should only be called in the case there are NO searchable attributes. + /// All the fields will be inserted in the order of the fields ids map with a weight of 0. + pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { + FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() } + } + + /// Removes a field id from the map, returning the associated weight previously in the map. + pub fn remove(&mut self, fid: FieldId) -> Option { + self.map.remove(&fid) + } + + /// Returns weight corresponding to the key. + pub fn weight(&self, fid: FieldId) -> Option { + self.map.get(&fid).copied() + } + + /// Returns highest weight contained in the map if any. + pub fn max_weight(&self) -> Option { + self.map.values().copied().max() + } + + /// Return an iterator visiting all field ids in arbitrary order. + pub fn ids(&self) -> impl Iterator + '_ { + self.map.keys().copied() + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 27b273393..42b9cb111 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::convert::TryInto; use std::fs::File; use std::path::Path; @@ -25,8 +26,9 @@ use crate::proximity::ProximityPrecision; use crate::vector::EmbeddingConfig; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, - FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, - Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, BEU64, + FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, + FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, + Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -42,6 +44,7 @@ pub mod main_key { pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; + pub const FIELDIDS_WEIGHTS_MAP_KEY: &str = "fieldids-weights-map"; pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; pub const GEO_RTREE_KEY: &str = "geo-rtree"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -414,6 +417,65 @@ impl Index { .unwrap_or_default()) } + /* fieldids weights map */ + // This maps the fields ids to their weights. + // Their weights is defined by the ordering of the searchable attributes. + + /// Writes the fieldids weights map which associates the field ids to their weights + pub(crate) fn put_fieldids_weights_map( + &self, + wtxn: &mut RwTxn, + map: &FieldidsWeightsMap, + ) -> heed::Result<()> { + self.main.remap_types::>().put( + wtxn, + main_key::FIELDIDS_WEIGHTS_MAP_KEY, + map, + ) + } + + /// Get the fieldids weights map which associates the field ids to their weights + pub fn fieldids_weights_map(&self, rtxn: &RoTxn) -> heed::Result { + self.main + .remap_types::>() + .get(rtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)? + .map(Ok) + .unwrap_or_else(|| { + Ok(FieldidsWeightsMap::from_field_id_map_without_searchable( + &self.fields_ids_map(rtxn)?, + )) + }) + } + + /// Delete the fieldsids weights map + pub fn delete_fieldids_weights_map(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.remap_key_type::().delete(wtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY) + } + + pub fn searchable_fields_and_weights<'a>( + &self, + rtxn: &'a RoTxn, + ) -> Result, FieldId, Weight)>> { + let fid_map = self.fields_ids_map(rtxn)?; + let weight_map = self.fieldids_weights_map(rtxn)?; + let searchable = self.searchable_fields(rtxn)?; + + searchable + .into_iter() + .map(|field| -> Result<_> { + let fid = fid_map.id(&field).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: field.to_string(), + process: "searchable_fields_and_weights", + })?; + let weight = weight_map + .weight(fid) + .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?; + + Ok((field, fid, weight)) + }) + .collect() + } + /* geo rtree */ /// Writes the provided `rtree` which associates coordinates to documents ids. @@ -578,33 +640,42 @@ impl Index { wtxn: &mut RwTxn, user_fields: &[&str], fields_ids_map: &FieldsIdsMap, - ) -> heed::Result<()> { + ) -> Result<()> { // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; + let mut weights = FieldidsWeightsMap::default(); + // Now we generate the real searchable fields: // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. // 2. Iterate over the user defined searchable fields. // 3. If a user defined field is a subset of a field defined in the fields_ids_map - // (ie doggo.name is a subset of doggo) then we push it at the end of the fields. - let mut real_fields = user_fields.to_vec(); + // (ie doggo.name is a subset of doggo) right after doggo and with the same weight. + let mut real_fields = Vec::new(); - for field_from_map in fields_ids_map.names() { - for user_field in user_fields { + for (id, field_from_map) in fields_ids_map.iter() { + for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) - && !user_fields.contains(&field_from_map) + && !real_fields.contains(&field_from_map) { real_fields.push(field_from_map); + + let weight: u16 = + weight.try_into().map_err(|_| UserError::AttributeLimitReached)?; + weights.insert(id, weight); } } } - self.put_searchable_fields(wtxn, &real_fields) + self.put_searchable_fields(wtxn, &real_fields)?; + self.put_fieldids_weights_map(wtxn, &weights)?; + Ok(()) } pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { let did_delete_searchable = self.delete_searchable_fields(wtxn)?; let did_delete_user_defined = self.delete_user_defined_searchable_fields(wtxn)?; + self.delete_fieldids_weights_map(wtxn)?; Ok(did_delete_searchable || did_delete_user_defined) } @@ -623,28 +694,31 @@ impl Index { } /// Returns the searchable fields, those are the fields that are indexed, - /// if the searchable fields aren't there it means that **all** the fields are indexed. - pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { + pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { self.main .remap_types::>>() - .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY) + .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)? + .map(|fields| Ok(fields.into_iter().map(Cow::Borrowed).collect())) + .unwrap_or_else(|| { + Ok(self + .fields_ids_map(rtxn)? + .names() + .map(|field| Cow::Owned(field.to_string())) + .collect()) + }) } /// Identical to `searchable_fields`, but returns the ids instead. - pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result>> { - match self.searchable_fields(rtxn)? { - Some(fields) => { - let fields_ids_map = self.fields_ids_map(rtxn)?; - let mut fields_ids = Vec::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(name) { - fields_ids.push(field_id); - } - } - Ok(Some(fields_ids)) + pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.searchable_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields { + if let Some(field_id) = fields_ids_map.id(&name) { + fields_ids.push(field_id); } - None => Ok(None), } + Ok(fields_ids) } /// Writes the searchable fields, when this list is specified, only these are indexed. @@ -1710,10 +1784,14 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, field_distribution, 1); + db_snap!(index, field_distribution, @r###" + age 1 | + id 2 | + name 2 | + "###); db_snap!(index, word_docids, - @r###" + @r###" 1 [0, ] 2 [1, ] 20 [1, ] @@ -1722,18 +1800,6 @@ pub(crate) mod tests { "### ); - db_snap!(index, field_distribution); - - db_snap!(index, field_distribution, - @r###" - age 1 | - id 2 | - name 2 | - "### - ); - - // snapshot_index!(&index, "1", include: "^field_distribution$"); - // we add all the documents a second time. we are supposed to get the same // field_distribution in the end index @@ -1820,7 +1886,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); @@ -1840,7 +1906,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); assert_eq!(user_defined, &["doggo", "name"]); @@ -1856,7 +1922,7 @@ pub(crate) mod tests { // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); - let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + let real = index.searchable_fields(&rtxn).unwrap(); assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); @@ -2395,6 +2461,14 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); index .add_documents(documents!([ @@ -2410,6 +2484,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 0 | + "###); index.delete_documents(Default::default()); @@ -2420,6 +2504,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 0 | + "###); index .add_documents(documents!([ @@ -2435,6 +2529,16 @@ pub(crate) mod tests { 11 0 4 1 "###); + db_snap!(index, fields_ids_map, @r###" + 0 primary_key | + 1 a | + "###); + db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 0 | + "###); let rtxn = index.read_txn().unwrap(); let search = Search::new(&rtxn, &index); @@ -2520,4 +2624,104 @@ pub(crate) mod tests { db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted } + + #[test] + fn swapping_searchable_attributes() { + // See https://github.com/meilisearch/meilisearch/issues/4484 + + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name")]); + settings.set_filterable_fields(HashSet::from([S("age")])); + }) + .unwrap(); + + index + .add_documents(documents!({ "id": 1, "name": "Many", "age": 28, "realName": "Maxime" })) + .unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 name | + 1 id | + 2 age | + 3 realName | + "###); + db_snap!(index, searchable_fields, @r###"["name"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name"), S("realName")]); + settings.set_filterable_fields(HashSet::from([S("age")])); + }) + .unwrap(); + + // The order of the field id map shouldn't change + db_snap!(index, fields_ids_map, @r###" + 0 name | + 1 id | + 2 age | + 3 realName | + "###); + db_snap!(index, searchable_fields, @r###"["name", "realName"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 3 1 | + "###); + } + + #[test] + fn attribute_weights_after_swapping_searchable_attributes() { + // See https://github.com/meilisearch/meilisearch/issues/4484 + + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("name"), S("beverage")]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "id": 0, "name": "kefir", "beverage": "water" }, + { "id": 1, "name": "tamo", "beverage": "kefir" } + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("kefir").execute().unwrap(); + + // We should find kefir the dog first + insta::assert_debug_snapshot!(results.documents_ids, @r###" + [ + 0, + 1, + ] + "###); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("beverage"), S("name")]); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("kefir").execute().unwrap(); + + // We should find tamo first + insta::assert_debug_snapshot!(results.documents_ids, @r###" + [ + 1, + 0, + ] + "###); + } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index a1e240464..881633b5c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -28,6 +28,7 @@ pub mod vector; #[cfg(test)] #[macro_use] pub mod snapshot_tests; +mod fieldids_weights_map; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; @@ -52,6 +53,7 @@ pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; pub use self::external_documents_ids::ExternalDocumentsIds; +pub use self::fieldids_weights_map::FieldidsWeightsMap; pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, @@ -77,6 +79,7 @@ pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; pub type FieldDistribution = BTreeMap; pub type FieldId = u16; +pub type Weight = u16; pub type Object = serde_json::Map; pub type Position = u32; pub type RelativePosition = u16; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index bab67e6bd..ca0eda49e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -147,7 +147,7 @@ impl<'a> Search<'a> { pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result { if has_vector_search { - let ctx = SearchContext::new(self.index, self.rtxn); + let ctx = SearchContext::new(self.index, self.rtxn)?; filtered_universe(&ctx, &self.filter) } else { Ok(self.execute()?.candidates) @@ -155,10 +155,10 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> Result { - let mut ctx = SearchContext::new(self.index, self.rtxn); + let mut ctx = SearchContext::new(self.index, self.rtxn)?; if let Some(searchable_attributes) = self.searchable_attributes { - ctx.searchable_attributes(searchable_attributes)?; + ctx.attributes_to_search_on(searchable_attributes)?; } let universe = filtered_universe(&ctx, &self.filter)?; diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index 521fcb983..e9bc5449d 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -101,7 +101,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( let mut ranking_rule_universes: Vec = vec![RoaringBitmap::default(); ranking_rules_len]; - ranking_rule_universes[0] = universe.clone(); + ranking_rule_universes[0].clone_from(universe); let mut cur_ranking_rule_index = 0; /// Finish iterating over the current ranking rule, yielding @@ -232,7 +232,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( } cur_ranking_rule_index += 1; - ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); + ranking_rule_universes[cur_ranking_rule_index].clone_from(&next_bucket.candidates); logger.start_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index].as_ref(), diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 62c921a1d..4fa0765e0 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -163,7 +163,7 @@ impl<'ctx> SearchContext<'ctx> { Some(restricted_fids) => { let interned = self.word_interner.get(word).as_str(); let keys: Vec<_> = - restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect(); + restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( self.txn, @@ -192,7 +192,7 @@ impl<'ctx> SearchContext<'ctx> { Some(restricted_fids) => { let interned = self.word_interner.get(word).as_str(); let keys: Vec<_> = - restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect(); + restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( self.txn, @@ -242,7 +242,7 @@ impl<'ctx> SearchContext<'ctx> { Some(restricted_fids) => { let interned = self.word_interner.get(prefix).as_str(); let keys: Vec<_> = - restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect(); + restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( self.txn, @@ -271,7 +271,7 @@ impl<'ctx> SearchContext<'ctx> { Some(restricted_fids) => { let interned = self.word_interner.get(prefix).as_str(); let keys: Vec<_> = - restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect(); + restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( self.txn, @@ -315,11 +315,7 @@ impl<'ctx> SearchContext<'ctx> { .map_err(heed::Error::Decoding)? } else { // Compute the distance at the attribute level and store it in the cache. - let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { - fids - } else { - self.index.fields_ids_map(self.txn)?.ids().collect() - }; + let fids = self.index.searchable_fields_ids(self.txn)?; let mut docids = RoaringBitmap::new(); for fid in fids { // for each field, intersect left word bitmap and right word bitmap, @@ -408,11 +404,7 @@ impl<'ctx> SearchContext<'ctx> { let prefix_docids = match proximity_precision { ProximityPrecision::ByAttribute => { // Compute the distance at the attribute level and store it in the cache. - let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { - fids - } else { - self.index.fields_ids_map(self.txn)?.ids().collect() - }; + let fids = self.index.searchable_fields_ids(self.txn)?; let mut prefix_docids = RoaringBitmap::new(); // for each field, intersect left word bitmap and right word bitmap, // then merge the result in a global bitmap before storing it in the cache. diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 7932f0c2a..41b70ae39 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -184,13 +184,7 @@ impl State { return Ok(State::Empty(query_graph.clone())); } - let searchable_fields_ids = { - if let Some(fids) = ctx.index.searchable_fields_ids(ctx.txn)? { - fids - } else { - ctx.index.fields_ids_map(ctx.txn)?.ids().collect() - } - }; + let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?; let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 56bf6c169..4db1c99c6 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -258,7 +258,7 @@ pub(crate) mod tests { fn matching_words() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); + let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap(); let mut builder = TokenizerBuilder::default(); let tokenizer = builder.build(); let tokens = tokenizer.tokenize("split this world"); diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 8f0069589..40e6f8dc8 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -506,7 +506,7 @@ mod tests { impl<'a> MatcherBuilder<'a> { fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self { - let mut ctx = SearchContext::new(index, rtxn); + let mut ctx = SearchContext::new(index, rtxn).unwrap(); let universe = filtered_universe(&ctx, &None).unwrap(); let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search( &mut ctx, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 617068ef8..5e4c2f829 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -49,13 +49,12 @@ pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; -use crate::error::FieldIdMapMissingEntry; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; use crate::vector::Embedder; use crate::{ AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget, - UserError, + UserError, Weight, }; /// A structure used throughout the execution of a search query. @@ -71,8 +70,21 @@ pub struct SearchContext<'ctx> { } impl<'ctx> SearchContext<'ctx> { - pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { - Self { + pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Result { + let searchable_fids = index.searchable_fields_and_weights(txn)?; + let exact_attributes_ids = index.exact_attributes_ids(txn)?; + + let mut exact = Vec::new(); + let mut tolerant = Vec::new(); + for (_name, fid, weight) in searchable_fids { + if exact_attributes_ids.contains(&fid) { + exact.push((fid, weight)); + } else { + tolerant.push((fid, weight)); + } + } + + Ok(Self { index, txn, db_cache: <_>::default(), @@ -81,42 +93,39 @@ impl<'ctx> SearchContext<'ctx> { term_interner: <_>::default(), phrase_docids: <_>::default(), restricted_fids: None, - } + }) } - pub fn searchable_attributes(&mut self, searchable_attributes: &'ctx [String]) -> Result<()> { - let fids_map = self.index.fields_ids_map(self.txn)?; - let searchable_names = self.index.searchable_fields(self.txn)?; + pub fn attributes_to_search_on( + &mut self, + attributes_to_search_on: &'ctx [String], + ) -> Result<()> { + let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?; + let searchable_fields_weights = self.index.searchable_fields_and_weights(self.txn)?; let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?; + let mut wildcard = false; + let mut restricted_fids = RestrictedFids::default(); - let mut contains_wildcard = false; - for field_name in searchable_attributes { + for field_name in attributes_to_search_on { if field_name == "*" { - contains_wildcard = true; + wildcard = true; + // we cannot early exit as we want to returns error in case of unknown fields continue; } - let searchable_contains_name = - searchable_names.as_ref().map(|sn| sn.iter().any(|name| name == field_name)); - let fid = match (fids_map.id(field_name), searchable_contains_name) { + let searchable_weight = + searchable_fields_weights.iter().find(|(name, _, _)| name == field_name); + let (fid, weight) = match searchable_weight { // The Field id exist and the field is searchable - (Some(fid), Some(true)) | (Some(fid), None) => fid, - // The field is searchable but the Field id doesn't exist => Internal Error - (None, Some(true)) => { - return Err(FieldIdMapMissingEntry::FieldName { - field_name: field_name.to_string(), - process: "search", - } - .into()) - } - // The field is not searchable, but the searchableAttributes are set to * => ignore field - (None, None) => continue, + Some((_name, fid, weight)) => (*fid, *weight), + // The field is not searchable but the user didn't define any searchable attributes + None if user_defined_searchable.is_none() => continue, // The field is not searchable => User error - (_fid, Some(false)) => { - let (valid_fields, hidden_fields) = match searchable_names { - Some(sn) => self.index.remove_hidden_fields(self.txn, sn)?, - None => self.index.remove_hidden_fields(self.txn, fids_map.names())?, - }; + None => { + let (valid_fields, hidden_fields) = self.index.remove_hidden_fields( + self.txn, + searchable_fields_weights.iter().map(|(name, _, _)| name), + )?; let field = field_name.to_string(); return Err(UserError::InvalidSearchableAttribute { @@ -129,13 +138,17 @@ impl<'ctx> SearchContext<'ctx> { }; if exact_attributes_ids.contains(&fid) { - restricted_fids.exact.push(fid); + restricted_fids.exact.push((fid, weight)); } else { - restricted_fids.tolerant.push(fid); + restricted_fids.tolerant.push((fid, weight)); }; } - self.restricted_fids = (!contains_wildcard).then_some(restricted_fids); + if wildcard { + self.restricted_fids = None; + } else { + self.restricted_fids = Some(restricted_fids); + } Ok(()) } @@ -158,13 +171,13 @@ impl Word { #[derive(Debug, Clone, Default)] pub struct RestrictedFids { - pub tolerant: Vec, - pub exact: Vec, + pub tolerant: Vec<(FieldId, Weight)>, + pub exact: Vec<(FieldId, Weight)>, } impl RestrictedFids { pub fn contains(&self, fid: &FieldId) -> bool { - self.tolerant.contains(fid) || self.exact.contains(fid) + self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid) } } diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 93f5f081c..74b2ed564 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -366,7 +366,7 @@ mod tests { let tokens = tokenizer.tokenize("."); let index = temp_index_with_documents(); let rtxn = index.read_txn()?; - let mut ctx = SearchContext::new(&index, &rtxn); + let mut ctx = SearchContext::new(&index, &rtxn)?; // panics with `attempt to add with overflow` before let ExtractedTokens { query_terms, .. } = located_query_terms_from_tokens(&mut ctx, tokens, None)?; diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index 8f3e0cc82..a4a08ea46 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -7,12 +7,12 @@ use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id; use crate::search::new::SearchContext; -use crate::Result; +use crate::{FieldId, InternalError, Result}; #[derive(Clone, PartialEq, Eq, Hash)] pub struct FidCondition { term: LocatedQueryTermSubset, - fid: u16, + fid: Option, } pub enum FidGraph {} @@ -26,13 +26,15 @@ impl RankingRuleGraphTrait for FidGraph { universe: &RoaringBitmap, ) -> Result { let FidCondition { term, .. } = condition; - // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument - let mut docids = compute_query_term_subset_docids_within_field_id( - ctx, - &term.term_subset, - condition.fid, - )?; - docids &= universe; + + let docids = if let Some(fid) = condition.fid { + // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument + let docids = + compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?; + docids & universe + } else { + RoaringBitmap::new() + }; Ok(ComputedCondition { docids, @@ -68,34 +70,29 @@ impl RankingRuleGraphTrait for FidGraph { all_fields.extend(fields); } + let weights_map = ctx.index.fieldids_weights_map(ctx.txn)?; + let mut edges = vec![]; for fid in all_fields.iter().copied() { + let weight = weights_map + .weight(fid) + .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?; edges.push(( - fid as u32 * term.term_ids.len() as u32, - conditions_interner.insert(FidCondition { term: term.clone(), fid }), + weight as u32 * term.term_ids.len() as u32, + conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }), )); } // always lookup the max_fid if we don't already and add an artificial condition for max scoring - let max_fid: Option = { - if let Some(max_fid) = ctx - .index - .searchable_fields_ids(ctx.txn)? - .map(|field_ids| field_ids.into_iter().max()) - { - max_fid - } else { - ctx.index.fields_ids_map(ctx.txn)?.ids().max() - } - }; + let max_weight: Option = weights_map.max_weight(); - if let Some(max_fid) = max_fid { - if !all_fields.contains(&max_fid) { + if let Some(max_weight) = max_weight { + if !all_fields.contains(&max_weight) { edges.push(( - max_fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. + max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. conditions_interner.insert(FidCondition { term: term.clone(), // TODO remove this ugly clone - fid: max_fid, + fid: None, }), )); } diff --git a/milli/src/search/new/tests/attribute_fid.rs b/milli/src/search/new/tests/attribute_fid.rs index 38225404c..c595887ba 100644 --- a/milli/src/search/new/tests/attribute_fid.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -1,5 +1,5 @@ use crate::index::tests::TempIndex; -use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::{db_snap, Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -131,6 +131,19 @@ fn test_attribute_fid_simple() { #[test] fn test_attribute_fid_ngrams() { let index = create_index(); + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 title | + 2 description | + 3 plot | + "###); + db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 1 0 | + 2 1 | + 3 2 | + "###); let txn = index.read_txn().unwrap(); diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap new file mode 100644 index 000000000..930a21626 --- /dev/null +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams-4.snap @@ -0,0 +1,244 @@ +--- +source: milli/src/search/new/tests/attribute_fid.rs +expression: "format!(\"{document_ids_scores:#?}\")" +--- +[ + ( + 2, + [ + Fid( + Rank { + rank: 19, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 91, + max_rank: 91, + }, + ), + ], + ), + ( + 6, + [ + Fid( + Rank { + rank: 15, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 5, + [ + Fid( + Rank { + rank: 14, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 79, + max_rank: 91, + }, + ), + ], + ), + ( + 4, + [ + Fid( + Rank { + rank: 13, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 77, + max_rank: 91, + }, + ), + ], + ), + ( + 3, + [ + Fid( + Rank { + rank: 12, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 83, + max_rank: 91, + }, + ), + ], + ), + ( + 9, + [ + Fid( + Rank { + rank: 11, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 75, + max_rank: 91, + }, + ), + ], + ), + ( + 8, + [ + Fid( + Rank { + rank: 10, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 79, + max_rank: 91, + }, + ), + ], + ), + ( + 7, + [ + Fid( + Rank { + rank: 10, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 73, + max_rank: 91, + }, + ), + ], + ), + ( + 11, + [ + Fid( + Rank { + rank: 7, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 77, + max_rank: 91, + }, + ), + ], + ), + ( + 10, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 13, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 81, + max_rank: 91, + }, + ), + ], + ), + ( + 12, + [ + Fid( + Rank { + rank: 6, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 78, + max_rank: 91, + }, + ), + ], + ), + ( + 14, + [ + Fid( + Rank { + rank: 5, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 75, + max_rank: 91, + }, + ), + ], + ), + ( + 0, + [ + Fid( + Rank { + rank: 1, + max_rank: 19, + }, + ), + Position( + Rank { + rank: 91, + max_rank: 91, + }, + ), + ], + ), +] diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 28c4cb45c..d79003747 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -308,6 +308,25 @@ pub fn snap_fields_ids_map(index: &Index) -> String { } snap } +pub fn snap_fieldids_weights_map(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let weights_map = index.fieldids_weights_map(&rtxn).unwrap(); + + let mut snap = String::new(); + writeln!(&mut snap, "fid weight").unwrap(); + let mut field_ids: Vec<_> = weights_map.ids().collect(); + field_ids.sort(); + for field_id in field_ids { + let weight = weights_map.weight(field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {weight:<3} |").unwrap(); + } + snap +} +pub fn snap_searchable_fields(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let searchable_fields = index.searchable_fields(&rtxn).unwrap(); + format!("{searchable_fields:?}") +} pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); @@ -469,6 +488,12 @@ macro_rules! full_snap_of_db { ($index:ident, fields_ids_map) => {{ $crate::snapshot_tests::snap_fields_ids_map(&$index) }}; + ($index:ident, fieldids_weights_map) => {{ + $crate::snapshot_tests::snap_fieldids_weights_map(&$index) + }}; + ($index:ident, searchable_fields) => {{ + $crate::snapshot_tests::snap_searchable_fields(&$index) + }}; ($index:ident, geo_faceted_documents_ids) => {{ $crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index) }}; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 6af5bba6d..d97b6639e 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -186,7 +186,7 @@ fn searchable_fields_changed( ) -> bool { let searchable_fields = &settings_diff.new.searchable_fields_ids; for (field_id, field_bytes) in obkv.iter() { - if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + if searchable_fields.contains(&field_id) { let del_add = KvReaderDelAdd::new(field_bytes); match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { // if both fields are None, check the next field. @@ -298,7 +298,7 @@ fn lang_safe_tokens_from_document<'a>( /// Extract words mapped with their positions of a document. fn tokens_from_document<'a>( obkv: &KvReader, - searchable_fields: &Option>, + searchable_fields: &[FieldId], tokenizer: &Tokenizer, max_positions_per_attributes: u32, del_add: DelAdd, @@ -309,7 +309,7 @@ fn tokens_from_document<'a>( let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { // if field is searchable. - if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + if searchable_fields.as_ref().contains(&field_id) { // extract deletion or addition only. if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { // parse json. diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index bb180a7ee..936ce1efc 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -3260,6 +3260,7 @@ mod tests { } #[test] + #[cfg(feature = "all-tokenizations")] fn stored_detected_script_and_language_should_not_return_deleted_documents() { use charabia::{Language, Script}; let index = TempIndex::new(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1997e966e..0599bb9d8 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -461,50 +461,39 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(true) } - /// Updates the index's searchable attributes. This causes the field map to be recomputed to - /// reflect the order of the searchable attributes. + /// Updates the index's searchable attributes. fn update_searchable(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { // Check to see if the searchable fields changed before doing anything else let old_fields = self.index.searchable_fields(self.wtxn)?; - let did_change = match old_fields { - // If old_fields is Some, let's check to see if the fields actually changed - Some(old_fields) => { - let new_fields = fields.iter().map(String::as_str).collect::>(); - new_fields != old_fields - } - // If old_fields is None, the fields have changed (because they are being set) - None => true, + let did_change = { + let new_fields = fields.iter().map(String::as_str).collect::>(); + new_fields != old_fields }; if !did_change { return Ok(false); } - // every time the searchable attributes are updated, we need to update the - // ids for any settings that uses the facets. (distinct_fields, filterable_fields). - let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - - let mut new_fields_ids_map = FieldsIdsMap::new(); + // Since we're updating the settings we can only add new fields at the end of the field id map + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // fields are deduplicated, only the first occurrence is taken into account let names = fields.iter().unique().map(String::as_str).collect::>(); // Add all the searchable attributes to the field map, and then add the // remaining fields from the old field map to the new one for name in names.iter() { - new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; - } - - for (_, name) in old_fields_ids_map.iter() { - new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; + // The fields ids map won't change the field id of already present elements thus only the + // new fields will be inserted. + fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, &names, - &new_fields_ids_map, + &fields_ids_map, )?; - self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; Ok(true) } Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), @@ -1172,7 +1161,7 @@ pub(crate) struct InnerIndexSettings { pub user_defined_faceted_fields: HashSet, pub user_defined_searchable_fields: Option>, pub faceted_fields_ids: HashSet, - pub searchable_fields_ids: Option>, + pub searchable_fields_ids: Vec, pub exact_attributes: HashSet, pub proximity_precision: ProximityPrecision, pub embedding_configs: EmbeddingConfigs, @@ -1233,18 +1222,21 @@ impl InnerIndexSettings { // find and insert the new field ids pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { + let searchable_fields = self + .user_defined_searchable_fields + .as_ref() + .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::>()); + // in case new fields were introduced we're going to recreate the searchable fields. - if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() { - let searchable_fields = - searchable_fields.iter().map(String::as_ref).collect::>(); + if let Some(searchable_fields) = searchable_fields { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, &self.fields_ids_map, )?; - let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; - self.searchable_fields_ids = searchable_fields_ids; } + let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; + self.searchable_fields_ids = searchable_fields_ids; Ok(()) } @@ -1517,12 +1509,13 @@ mod tests { use big_s::S; use heed::types::Bytes; use maplit::{btreemap, btreeset, hashset}; + use meili_snap::snapshot; use super::*; use crate::error::Error; use crate::index::tests::TempIndex; use crate::update::ClearDocuments; - use crate::{Criterion, Filter, SearchResult}; + use crate::{db_snap, Criterion, Filter, SearchResult}; #[test] fn set_and_reset_searchable_fields() { @@ -1551,6 +1544,17 @@ mod tests { wtxn.commit().unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 name | + 2 age | + "###); + db_snap!(index, searchable_fields, @r###"["name"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 1 0 | + "###); + // Check that the searchable field is correctly set to "name" only. let rtxn = index.read_txn().unwrap(); // When we search for something that is not in @@ -1562,8 +1566,9 @@ mod tests { // we must find the appropriate document. let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap(); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); + let fid_map = index.fields_ids_map(&rtxn).unwrap(); assert_eq!(documents.len(), 1); - assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); drop(rtxn); // We change the searchable fields to be the "name" field only. @@ -1573,14 +1578,31 @@ mod tests { }) .unwrap(); + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 name | + 2 age | + "###); + db_snap!(index, searchable_fields, @r###"["id", "name", "age"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 0 | + 2 0 | + "###); + // Check that the searchable field have been reset and documents are found now. let rtxn = index.read_txn().unwrap(); + let fid_map = index.fields_ids_map(&rtxn).unwrap(); + let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap(); + snapshot!(format!("{user_defined_searchable_fields:?}"), @"None"); + // the searchable fields should contain all the fields let searchable_fields = index.searchable_fields(&rtxn).unwrap(); - assert_eq!(searchable_fields, None); + snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###); let result = index.search(&rtxn).query("23").execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); - assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); } #[test]