From 7a84697570c4f03f903328d0da7145941a1ef445 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 21 May 2024 17:08:45 +0200 Subject: [PATCH] never store the _vectors as searchable or faceted fields --- milli/src/fieldids_weights_map.rs | 10 ++- milli/src/index.rs | 101 +++++++++++++++++++++++++++++- milli/src/update/settings.rs | 22 +++++-- 3 files changed, 124 insertions(+), 9 deletions(-) diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index a737632a4..2bf828711 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use crate::{FieldId, FieldsIdsMap, Weight}; +use crate::{vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME, FieldId, FieldsIdsMap, Weight}; #[derive(Debug, Default, Serialize, Deserialize)] pub struct FieldidsWeightsMap { @@ -23,7 +23,13 @@ impl FieldidsWeightsMap { /// Should only be called in the case there are NO searchable attributes. /// All the fields will be inserted in the order of the fields ids map with a weight of 0. pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { - FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() } + FieldidsWeightsMap { + map: fid_map + .iter() + .filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME)) + .map(|(fid, _name)| (fid, 0)) + .collect(), + } } /// Removes a field id from the map, returning the associated weight previously in the map. diff --git a/milli/src/index.rs b/milli/src/index.rs index 3c502d541..ef4936ed1 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,6 +23,7 @@ use crate::heed_codec::{ }; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::{Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, @@ -644,6 +645,7 @@ impl Index { &self, wtxn: &mut RwTxn, user_fields: &[&str], + non_searchable_fields_ids: &[FieldId], fields_ids_map: &FieldsIdsMap, ) -> Result<()> { // We can write the user defined searchable fields as-is. @@ -662,6 +664,7 @@ impl Index { for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) && !real_fields.contains(&field_from_map) + && !non_searchable_fields_ids.contains(&id) { real_fields.push(field_from_map); @@ -708,6 +711,7 @@ impl Index { Ok(self .fields_ids_map(rtxn)? .names() + .filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME)) .map(|field| Cow::Owned(field.to_string())) .collect()) }) @@ -1669,15 +1673,17 @@ pub(crate) mod tests { use big_s::S; use heed::{EnvOpenOptions, RwTxn}; - use maplit::hashset; + use maplit::{btreemap, hashset}; use tempfile::TempDir; use crate::documents::DocumentsBatchReader; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, + self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, + Settings, }; + use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult}; pub(crate) struct TempIndex { @@ -2783,4 +2789,95 @@ pub(crate) mod tests { ] "###); } + + #[test] + fn vectors_are_never_indexed_as_searchable_or_filterable() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "id": 0, "_vectors": { "doggo": [2345] } }, + { "id": 1, "_vectors": { "doggo": [6789] } }, + ])) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @r###"["id"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + drop(rtxn); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]); + settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]); + }) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @"[]"); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + + let mut search = index.search(&rtxn); + let results = search + .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) + .execute() + .unwrap(); + assert!(results.candidates.is_empty()); + + index + .update_settings(|settings| { + settings.set_embedder_settings(btreemap! { + S("doggo") => Setting::Set(EmbeddingSettings { + dimensions: Setting::Set(1), + source: Setting::Set(EmbedderSource::UserProvided), + ..EmbeddingSettings::default()}), + }); + }) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @"[]"); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + + let mut search = index.search(&rtxn); + let results = search + .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) + .execute() + .unwrap(); + assert!(results.candidates.is_empty()); + } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index be9b6b74e..68c31fabb 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -19,6 +19,7 @@ use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{FieldId, FieldsIdsMap, Index, Result}; @@ -490,6 +491,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, &names, + &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME), &fields_ids_map, )?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -1252,6 +1254,8 @@ pub(crate) struct InnerIndexSettings { pub embedding_configs: EmbeddingConfigs, pub existing_fields: HashSet, pub geo_fields_ids: Option<(FieldId, FieldId)>, + pub non_searchable_fields_ids: Vec, + pub non_faceted_fields_ids: Vec, } impl InnerIndexSettings { @@ -1265,8 +1269,8 @@ impl InnerIndexSettings { let user_defined_searchable_fields = user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; - let searchable_fields_ids = index.searchable_fields_ids(rtxn)?; - let faceted_fields_ids = index.faceted_fields_ids(rtxn)?; + let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?; + let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = embedders(index.embedding_configs(rtxn)?)?; @@ -1294,6 +1298,10 @@ impl InnerIndexSettings { None => None, }; + let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); + searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); + faceted_fields_ids.retain(|id| !vectors_fids.contains(id)); + Ok(Self { stop_words, allowed_separators, @@ -1308,6 +1316,8 @@ impl InnerIndexSettings { embedding_configs, existing_fields, geo_fields_ids, + non_searchable_fields_ids: vectors_fids.clone(), + non_faceted_fields_ids: vectors_fids.clone(), }) } @@ -1315,9 +1325,10 @@ impl InnerIndexSettings { pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { let new_facets = self .fields_ids_map - .names() - .filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields)) - .map(|field| field.to_string()) + .iter() + .filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid)) + .filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields)) + .map(|(_fid, field)| field.to_string()) .collect(); index.put_faceted_fields(wtxn, &new_facets)?; @@ -1337,6 +1348,7 @@ impl InnerIndexSettings { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, + &self.non_searchable_fields_ids, &self.fields_ids_map, )?; }