mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-29 16:24:26 +01:00
never store the _vectors as searchable or faceted fields
This commit is contained in:
parent
4148fbbe85
commit
7a84697570
@ -4,7 +4,7 @@ use std::collections::HashMap;
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::{FieldId, FieldsIdsMap, Weight};
|
use crate::{vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME, FieldId, FieldsIdsMap, Weight};
|
||||||
|
|
||||||
#[derive(Debug, Default, Serialize, Deserialize)]
|
#[derive(Debug, Default, Serialize, Deserialize)]
|
||||||
pub struct FieldidsWeightsMap {
|
pub struct FieldidsWeightsMap {
|
||||||
@ -23,7 +23,13 @@ impl FieldidsWeightsMap {
|
|||||||
/// Should only be called in the case there are NO searchable attributes.
|
/// Should only be called in the case there are NO searchable attributes.
|
||||||
/// All the fields will be inserted in the order of the fields ids map with a weight of 0.
|
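/// All the fields will be inserted in the order of the fields ids map with a weight of 0, except the reserved `_vectors` fields (those matching `RESERVED_VECTORS_FIELD_NAME`), which are never searchable.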
|
||||||
pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self {
|
pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self {
|
||||||
FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() }
|
FieldidsWeightsMap {
|
||||||
|
map: fid_map
|
||||||
|
.iter()
|
||||||
|
.filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
|
||||||
|
.map(|(fid, _name)| (fid, 0))
|
||||||
|
.collect(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Removes a field id from the map, returning the associated weight previously in the map.
|
/// Removes a field id from the map, returning the associated weight previously in the map.
|
||||||
|
@ -23,6 +23,7 @@ use crate::heed_codec::{
|
|||||||
};
|
};
|
||||||
use crate::order_by_map::OrderByMap;
|
use crate::order_by_map::OrderByMap;
|
||||||
use crate::proximity::ProximityPrecision;
|
use crate::proximity::ProximityPrecision;
|
||||||
|
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
|
||||||
use crate::vector::{Embedding, EmbeddingConfig};
|
use crate::vector::{Embedding, EmbeddingConfig};
|
||||||
use crate::{
|
use crate::{
|
||||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||||
@ -644,6 +645,7 @@ impl Index {
|
|||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
user_fields: &[&str],
|
user_fields: &[&str],
|
||||||
|
non_searchable_fields_ids: &[FieldId],
|
||||||
fields_ids_map: &FieldsIdsMap,
|
fields_ids_map: &FieldsIdsMap,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
// We can write the user defined searchable fields as-is.
|
// We can write the user defined searchable fields as-is.
|
||||||
@ -662,6 +664,7 @@ impl Index {
|
|||||||
for (weight, user_field) in user_fields.iter().enumerate() {
|
for (weight, user_field) in user_fields.iter().enumerate() {
|
||||||
if crate::is_faceted_by(field_from_map, user_field)
|
if crate::is_faceted_by(field_from_map, user_field)
|
||||||
&& !real_fields.contains(&field_from_map)
|
&& !real_fields.contains(&field_from_map)
|
||||||
|
&& !non_searchable_fields_ids.contains(&id)
|
||||||
{
|
{
|
||||||
real_fields.push(field_from_map);
|
real_fields.push(field_from_map);
|
||||||
|
|
||||||
@ -708,6 +711,7 @@ impl Index {
|
|||||||
Ok(self
|
Ok(self
|
||||||
.fields_ids_map(rtxn)?
|
.fields_ids_map(rtxn)?
|
||||||
.names()
|
.names()
|
||||||
|
.filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
|
||||||
.map(|field| Cow::Owned(field.to_string()))
|
.map(|field| Cow::Owned(field.to_string()))
|
||||||
.collect())
|
.collect())
|
||||||
})
|
})
|
||||||
@ -1669,15 +1673,17 @@ pub(crate) mod tests {
|
|||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
use heed::{EnvOpenOptions, RwTxn};
|
use heed::{EnvOpenOptions, RwTxn};
|
||||||
use maplit::hashset;
|
use maplit::{btreemap, hashset};
|
||||||
use tempfile::TempDir;
|
use tempfile::TempDir;
|
||||||
|
|
||||||
use crate::documents::DocumentsBatchReader;
|
use crate::documents::DocumentsBatchReader;
|
||||||
use crate::error::{Error, InternalError};
|
use crate::error::{Error, InternalError};
|
||||||
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
||||||
use crate::update::{
|
use crate::update::{
|
||||||
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
|
||||||
|
Settings,
|
||||||
};
|
};
|
||||||
|
use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
|
||||||
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
|
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
|
||||||
|
|
||||||
pub(crate) struct TempIndex {
|
pub(crate) struct TempIndex {
|
||||||
@ -2783,4 +2789,95 @@ pub(crate) mod tests {
|
|||||||
]
|
]
|
||||||
"###);
|
"###);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn vectors_are_never_indexed_as_searchable_or_filterable() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{ "id": 0, "_vectors": { "doggo": [2345] } },
|
||||||
|
{ "id": 1, "_vectors": { "doggo": [6789] } },
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, fields_ids_map, @r###"
|
||||||
|
0 id |
|
||||||
|
1 _vectors |
|
||||||
|
2 _vectors.doggo |
|
||||||
|
"###);
|
||||||
|
db_snap!(index, searchable_fields, @r###"["id"]"###);
|
||||||
|
db_snap!(index, fieldids_weights_map, @r###"
|
||||||
|
fid weight
|
||||||
|
0 0 |
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let mut search = index.search(&rtxn);
|
||||||
|
let results = search.query("2345").execute().unwrap();
|
||||||
|
assert!(results.candidates.is_empty());
|
||||||
|
drop(rtxn);
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|settings| {
|
||||||
|
settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]);
|
||||||
|
settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, fields_ids_map, @r###"
|
||||||
|
0 id |
|
||||||
|
1 _vectors |
|
||||||
|
2 _vectors.doggo |
|
||||||
|
"###);
|
||||||
|
db_snap!(index, searchable_fields, @"[]");
|
||||||
|
db_snap!(index, fieldids_weights_map, @r###"
|
||||||
|
fid weight
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let mut search = index.search(&rtxn);
|
||||||
|
let results = search.query("2345").execute().unwrap();
|
||||||
|
assert!(results.candidates.is_empty());
|
||||||
|
|
||||||
|
let mut search = index.search(&rtxn);
|
||||||
|
let results = search
|
||||||
|
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
|
||||||
|
.execute()
|
||||||
|
.unwrap();
|
||||||
|
assert!(results.candidates.is_empty());
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|settings| {
|
||||||
|
settings.set_embedder_settings(btreemap! {
|
||||||
|
S("doggo") => Setting::Set(EmbeddingSettings {
|
||||||
|
dimensions: Setting::Set(1),
|
||||||
|
source: Setting::Set(EmbedderSource::UserProvided),
|
||||||
|
..EmbeddingSettings::default()}),
|
||||||
|
});
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, fields_ids_map, @r###"
|
||||||
|
0 id |
|
||||||
|
1 _vectors |
|
||||||
|
2 _vectors.doggo |
|
||||||
|
"###);
|
||||||
|
db_snap!(index, searchable_fields, @"[]");
|
||||||
|
db_snap!(index, fieldids_weights_map, @r###"
|
||||||
|
fid weight
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let mut search = index.search(&rtxn);
|
||||||
|
let results = search.query("2345").execute().unwrap();
|
||||||
|
assert!(results.candidates.is_empty());
|
||||||
|
|
||||||
|
let mut search = index.search(&rtxn);
|
||||||
|
let results = search
|
||||||
|
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
|
||||||
|
.execute()
|
||||||
|
.unwrap();
|
||||||
|
assert!(results.candidates.is_empty());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -19,6 +19,7 @@ use crate::order_by_map::OrderByMap;
|
|||||||
use crate::proximity::ProximityPrecision;
|
use crate::proximity::ProximityPrecision;
|
||||||
use crate::update::index_documents::IndexDocumentsMethod;
|
use crate::update::index_documents::IndexDocumentsMethod;
|
||||||
use crate::update::{IndexDocuments, UpdateIndexingStep};
|
use crate::update::{IndexDocuments, UpdateIndexingStep};
|
||||||
|
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
|
||||||
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
|
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
|
||||||
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
|
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
|
||||||
use crate::{FieldId, FieldsIdsMap, Index, Result};
|
use crate::{FieldId, FieldsIdsMap, Index, Result};
|
||||||
@ -490,6 +491,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
self.index.put_all_searchable_fields_from_fields_ids_map(
|
self.index.put_all_searchable_fields_from_fields_ids_map(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
&names,
|
&names,
|
||||||
|
&fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME),
|
||||||
&fields_ids_map,
|
&fields_ids_map,
|
||||||
)?;
|
)?;
|
||||||
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
||||||
@ -1252,6 +1254,8 @@ pub(crate) struct InnerIndexSettings {
|
|||||||
pub embedding_configs: EmbeddingConfigs,
|
pub embedding_configs: EmbeddingConfigs,
|
||||||
pub existing_fields: HashSet<String>,
|
pub existing_fields: HashSet<String>,
|
||||||
pub geo_fields_ids: Option<(FieldId, FieldId)>,
|
pub geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||||
|
pub non_searchable_fields_ids: Vec<FieldId>,
|
||||||
|
pub non_faceted_fields_ids: Vec<FieldId>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl InnerIndexSettings {
|
impl InnerIndexSettings {
|
||||||
@ -1265,8 +1269,8 @@ impl InnerIndexSettings {
|
|||||||
let user_defined_searchable_fields =
|
let user_defined_searchable_fields =
|
||||||
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
|
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
|
||||||
let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
|
let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
|
||||||
let searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
|
let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
|
||||||
let faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
|
let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
|
||||||
let exact_attributes = index.exact_attributes_ids(rtxn)?;
|
let exact_attributes = index.exact_attributes_ids(rtxn)?;
|
||||||
let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
|
let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
|
||||||
let embedding_configs = embedders(index.embedding_configs(rtxn)?)?;
|
let embedding_configs = embedders(index.embedding_configs(rtxn)?)?;
|
||||||
@ -1294,6 +1298,10 @@ impl InnerIndexSettings {
|
|||||||
None => None,
|
None => None,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME);
|
||||||
|
searchable_fields_ids.retain(|id| !vectors_fids.contains(id));
|
||||||
|
faceted_fields_ids.retain(|id| !vectors_fids.contains(id));
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
stop_words,
|
stop_words,
|
||||||
allowed_separators,
|
allowed_separators,
|
||||||
@ -1308,6 +1316,8 @@ impl InnerIndexSettings {
|
|||||||
embedding_configs,
|
embedding_configs,
|
||||||
existing_fields,
|
existing_fields,
|
||||||
geo_fields_ids,
|
geo_fields_ids,
|
||||||
|
non_searchable_fields_ids: vectors_fids.clone(),
|
||||||
|
non_faceted_fields_ids: vectors_fids.clone(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1315,9 +1325,10 @@ impl InnerIndexSettings {
|
|||||||
pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
|
pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
|
||||||
let new_facets = self
|
let new_facets = self
|
||||||
.fields_ids_map
|
.fields_ids_map
|
||||||
.names()
|
.iter()
|
||||||
.filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields))
|
.filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid))
|
||||||
.map(|field| field.to_string())
|
.filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields))
|
||||||
|
.map(|(_fid, field)| field.to_string())
|
||||||
.collect();
|
.collect();
|
||||||
index.put_faceted_fields(wtxn, &new_facets)?;
|
index.put_faceted_fields(wtxn, &new_facets)?;
|
||||||
|
|
||||||
@ -1337,6 +1348,7 @@ impl InnerIndexSettings {
|
|||||||
index.put_all_searchable_fields_from_fields_ids_map(
|
index.put_all_searchable_fields_from_fields_ids_map(
|
||||||
wtxn,
|
wtxn,
|
||||||
&searchable_fields,
|
&searchable_fields,
|
||||||
|
&self.non_searchable_fields_ids,
|
||||||
&self.fields_ids_map,
|
&self.fields_ids_map,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user