4631: Split the field id map from the weight of each field r=Kerollmops a=irevoire

# Pull Request

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/4484

## What does this PR do?
- Make the (internal) searchable fields database always contain the searchable fields (instead of `None` when the user-defined searchable fields were not set)
- Introduce a new « fieldids_weights_map » that maps each field id to its weight (see the sketch after this list)
- Ensure that when two searchable fields are swapped, the field id map doesn't change anymore (and thus doesn't trigger a re-index)
- Use the weight instead of the order of the searchable fields in the attribute ranking rule at search time
- When no searchable attributes are defined, give every field a weight of zero
- When a field is declared as searchable and contains nested fields, all its subfields share the same weight
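
To make the split concrete, here is a minimal sketch of the idea — not the engine's actual code, just the two structures side by side, using the `FieldId = u16` and `Weight = u16` aliases this PR introduces:

```rust
use std::collections::HashMap;

type FieldId = u16;
type Weight = u16;

fn main() {
    // The fields ids map assigns ids in first-seen order and never re-orders them.
    let fields: Vec<(&str, FieldId)> = vec![("name", 0), ("id", 1), ("age", 2), ("realName", 3)];

    // searchableAttributes = ["name", "realName"]: the position in the
    // setting defines the weight, stored in a separate map.
    let mut weights: HashMap<FieldId, Weight> = HashMap::new();
    weights.insert(0, 0); // "name" is listed first
    weights.insert(3, 1); // "realName" is listed second

    // Swapping the setting to ["realName", "name"] only rewrites the weights
    // map; the field ids above stay stable, so no re-index is triggered.
    weights.insert(0, 1);
    weights.insert(3, 0);

    for (name, fid) in &fields {
        match weights.get(fid) {
            Some(weight) => println!("{name} (fid {fid}) -> weight {weight}"),
            None => println!("{name} (fid {fid}) -> not searchable"),
        }
    }
}
```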

## Impact on relevancy

### When no searchable attributes are declared

When no searchable attributes are declared, all the fields have the same importance, instead of the engine arbitrarily giving more importance to the fields it encountered earliest in the life of the index.

This means that before this PR, sending the following JSON:
```json
[
  { "id": 0, "name": "kefir", "color": "white" },
  { "id": 1, "name": "white", "last name": "spirit" }
]
```

Would make the field `name` more important than the fields `color` or `last name`.
This means that searching for `white` would automatically rank document `1` higher than document `0`.

After this PR, all the fields have the same weight, and none are considered more important than others.

### When a nested field is made searchable

The second behavior change introduced by this PR concerns nested fields. Say you're sending this document, for example:

```json
{
  "id": 0,
  "name": "tamo",
  "doggo": {
    "name": "kefir",
    "surname": "le kef"
  },
  "catto": "gromez"
}
```

Previously, defining the searchable attributes as `["name", "doggo", "catto"]` actually defined the « real » searchable attributes in the engine as `["name", "doggo", "catto", "doggo.name", "doggo.surname"]`, which means that `doggo.name` and `doggo.surname` were _NOT_ where the user expected them and had completely different weights than `doggo`.
In this PR, the weights have been unified, and the « real » searchable fields look like this:
```
[ "name", "doggo", "doggo.name", "doggo.surname", "catto"]
   ^^^^    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^    ^^^^^
 Weight 0                Weight 1                  Weight 2
```
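
A rough sketch of this expansion, simplified from the new `put_all_searchable_fields_from_fields_ids_map` in the diff below (the real engine uses `crate::is_faceted_by` for the subset test; a plain prefix check stands in for it here):

```rust
/// Hypothetical stand-in for `crate::is_faceted_by`:
/// `field` is `parent` itself or one of its dotted subfields.
fn is_subfield_of(field: &str, parent: &str) -> bool {
    field == parent
        || (field.starts_with(parent) && field[parent.len()..].starts_with('.'))
}

fn main() {
    // Every field known to the index, in fields-ids-map order.
    let fields_in_index = ["id", "name", "doggo", "doggo.name", "doggo.surname", "catto"];
    // The user-defined searchable attributes; their position defines the weight.
    let user_fields = ["name", "doggo", "catto"];

    let mut real_fields: Vec<(&str, u16)> = Vec::new();
    for field in fields_in_index {
        for (weight, user_field) in user_fields.iter().enumerate() {
            // A subfield lands right after its parent and shares its weight.
            if is_subfield_of(field, user_field)
                && !real_fields.iter().any(|(f, _)| *f == field)
            {
                real_fields.push((field, weight as u16));
            }
        }
    }

    // [("name", 0), ("doggo", 1), ("doggo.name", 1), ("doggo.surname", 1), ("catto", 2)]
    println!("{real_fields:?}");
}
```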

Co-authored-by: Tamo <tamo@meilisearch.com>

27 changed files with 765 additions and 185 deletions


@@ -272,9 +272,9 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) {
         }
         for index_uid in index_uids {
             if index_uid == swap.0 {
-                *index_uid = swap.1.to_owned();
+                swap.1.clone_into(index_uid);
             } else if index_uid == swap.1 {
-                *index_uid = swap.0.to_owned();
+                swap.0.clone_into(index_uid);
             }
         }
     }


@@ -730,7 +730,7 @@ pub fn perform_search(
     let mut ids = BTreeSet::new();
     for attr in attrs {
         if attr == "*" {
-            ids = displayed_ids.clone();
+            ids.clone_from(&displayed_ids);
             break;
         }


@@ -85,8 +85,13 @@ impl SearchQueue {
             },
             search_request = receive_new_searches.recv() => {
-                // this unwrap is safe because we're sure the `SearchQueue` still lives somewhere in actix-web
-                let search_request = search_request.unwrap();
+                let search_request = match search_request {
+                    Some(search_request) => search_request,
+                    // This should never happen while actix-web is running, but it's not a reason to crash
+                    // and it can generate a lot of noise in the tests.
+                    None => continue,
+                };
                 if searches_running < usize::from(parallelism) && queue.is_empty() {
                     searches_running += 1;
                     // if the search requests die it's not a hard error on our side


@@ -85,8 +85,8 @@ async fn simple_search() {
     )
     .await;
     snapshot!(code, @"200 OK");
-    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
-    snapshot!(response["semanticHitCount"], @"1");
+    snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
+    snapshot!(response["semanticHitCount"], @"2");

     let (response, code) = index
         .search_post(
@@ -331,7 +331,7 @@ async fn query_combination() {
     .await;
     snapshot!(code, @"200 OK");
-    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.8848484848484849}]"###);
+    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###);
     snapshot!(response["semanticHitCount"], @"null");

     // query + vector, no hybrid keyword =>
@@ -374,6 +374,6 @@ async fn query_combination() {
     .await;
     snapshot!(code, @"200 OK");
-    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848}]"###);
+    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###);
     snapshot!(response["semanticHitCount"], @"0");
 }


@@ -921,7 +921,7 @@ async fn test_score_details() {
     "order": 3,
     "attributeRankingOrderScore": 1.0,
     "queryWordDistanceScore": 0.8095238095238095,
-    "score": 0.9727891156462584
+    "score": 0.8095238095238095
 },
 "exactness": {
     "order": 4,


@@ -285,10 +285,10 @@ async fn attributes_ranking_rule_order() {
     @r###"
     [
       {
-        "id": "2"
+        "id": "1"
       },
       {
-        "id": "1"
+        "id": "2"
       }
     ]
     "###


@@ -1,6 +1,5 @@
 use std::time::Duration;

-use actix_rt::time::sleep;
 use meili_snap::{json_string, snapshot};
 use meilisearch::option::ScheduleSnapshot;
 use meilisearch::Opt;
@@ -53,11 +52,29 @@ async fn perform_snapshot() {
     index.load_test_set().await;

-    server.index("test1").create(Some("prim")).await;
+    let (task, code) = server.index("test1").create(Some("prim")).await;
+    meili_snap::snapshot!(code, @"202 Accepted");

-    index.wait_task(2).await;
+    index.wait_task(task.uid()).await;

-    sleep(Duration::from_secs(2)).await;
+    // wait for the _next task_ to process, aka the snapshot that should be enqueued at some point
+    println!("waited for the next task to finish");
+    let now = std::time::Instant::now();
+    let next_task = task.uid() + 1;
+    loop {
+        let (value, code) = index.get_task(next_task).await;
+        dbg!(&value);
+        if code != 404 && value["status"].as_str() == Some("succeeded") {
+            break;
+        }
+
+        if now.elapsed() > Duration::from_secs(30) {
+            panic!("The snapshot didn't schedule in 30s even though it was supposed to be scheduled every 2s: {}",
+                serde_json::to_string_pretty(&value).unwrap()
+            );
+        }
+    }

     let temp = tempfile::tempdir().unwrap();


@@ -48,7 +48,7 @@ fn main() -> Result<(), Box<dyn Error>> {
     let start = Instant::now();

-    let mut ctx = SearchContext::new(&index, &txn);
+    let mut ctx = SearchContext::new(&index, &txn)?;
     let universe = filtered_universe(&ctx, &None)?;
     let docs = execute_search(


@@ -32,6 +32,8 @@ pub enum InternalError {
     DatabaseClosing,
     #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))]
     DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> },
+    #[error("Missing {key} in the fieldids weights mapping.")]
+    FieldidsWeightsMapMissingEntry { key: FieldId },
     #[error(transparent)]
     FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry),
     #[error("Missing {key} in the field id mapping.")]


@@ -0,0 +1,48 @@
+//! The fieldids weights map is in charge of linking the searchable fields with their weights.
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+use crate::{FieldId, FieldsIdsMap, Weight};
+
+#[derive(Debug, Default, Serialize, Deserialize)]
+pub struct FieldidsWeightsMap {
+    map: HashMap<FieldId, Weight>,
+}
+
+impl FieldidsWeightsMap {
+    /// Insert a field id -> weight into the map.
+    /// If the map did not have this key present, `None` is returned.
+    /// If the map did have this key present, the value is updated, and the old value is returned.
+    pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option<Weight> {
+        self.map.insert(fid, weight)
+    }
+
+    /// Create the map from the fields ids maps.
+    /// Should only be called in the case there are NO searchable attributes.
+    /// All the fields will be inserted in the order of the fields ids map with a weight of 0.
+    pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self {
+        FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() }
+    }
+
+    /// Removes a field id from the map, returning the associated weight previously in the map.
+    pub fn remove(&mut self, fid: FieldId) -> Option<Weight> {
+        self.map.remove(&fid)
+    }
+
+    /// Returns weight corresponding to the key.
+    pub fn weight(&self, fid: FieldId) -> Option<Weight> {
+        self.map.get(&fid).copied()
+    }
+
+    /// Returns highest weight contained in the map if any.
+    pub fn max_weight(&self) -> Option<Weight> {
+        self.map.values().copied().max()
+    }
+
+    /// Return an iterator visiting all field ids in arbitrary order.
+    pub fn ids(&self) -> impl Iterator<Item = FieldId> + '_ {
+        self.map.keys().copied()
+    }
+}
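
As a quick illustration of how this new map behaves, here is a self-contained sketch — a trimmed local copy of the struct above with `FieldId`/`Weight` stand-in aliases, not the crate's public API — exercising it the way the attribute ranking rule does (the fids/weights mirror the `fieldids_weights_map` test snapshot further down):

```rust
use std::collections::HashMap;

type FieldId = u16;
type Weight = u16;

#[derive(Debug, Default)]
struct FieldidsWeightsMap {
    map: HashMap<FieldId, Weight>,
}

impl FieldidsWeightsMap {
    fn insert(&mut self, fid: FieldId, weight: Weight) -> Option<Weight> {
        self.map.insert(fid, weight)
    }
    fn weight(&self, fid: FieldId) -> Option<Weight> {
        self.map.get(&fid).copied()
    }
    fn max_weight(&self) -> Option<Weight> {
        self.map.values().copied().max()
    }
}

fn main() {
    // searchableAttributes = ["title", "description", "plot"]
    let mut weights = FieldidsWeightsMap::default();
    weights.insert(1, 0); // title
    weights.insert(2, 1); // description
    weights.insert(3, 2); // plot

    assert_eq!(weights.weight(2), Some(1));
    assert_eq!(weights.weight(0), None); // `id` is not searchable
    assert_eq!(weights.max_weight(), Some(2)); // used for the max attribute score
    println!("ok");
}
```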


@@ -1,5 +1,6 @@
 use std::borrow::Cow;
 use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
+use std::convert::TryInto;
 use std::fs::File;
 use std::path::Path;
@@ -25,8 +26,9 @@ use crate::proximity::ProximityPrecision;
 use crate::vector::EmbeddingConfig;
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
-    FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
-    Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, BEU64,
+    FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
+    FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
+    Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64,
 };

 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@@ -42,6 +44,7 @@ pub mod main_key {
     pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
     pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution";
     pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
+    pub const FIELDIDS_WEIGHTS_MAP_KEY: &str = "fieldids-weights-map";
     pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
     pub const GEO_RTREE_KEY: &str = "geo-rtree";
     pub const PRIMARY_KEY_KEY: &str = "primary-key";
@@ -414,6 +417,65 @@ impl Index {
             .unwrap_or_default())
     }

+    /* fieldids weights map */
+    // This maps the fields ids to their weights.
+    // Their weights is defined by the ordering of the searchable attributes.
+
+    /// Writes the fieldids weights map which associates the field ids to their weights
+    pub(crate) fn put_fieldids_weights_map(
+        &self,
+        wtxn: &mut RwTxn,
+        map: &FieldidsWeightsMap,
+    ) -> heed::Result<()> {
+        self.main.remap_types::<Str, SerdeJson<_>>().put(
+            wtxn,
+            main_key::FIELDIDS_WEIGHTS_MAP_KEY,
+            map,
+        )
+    }
+
+    /// Get the fieldids weights map which associates the field ids to their weights
+    pub fn fieldids_weights_map(&self, rtxn: &RoTxn) -> heed::Result<FieldidsWeightsMap> {
+        self.main
+            .remap_types::<Str, SerdeJson<_>>()
+            .get(rtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)?
+            .map(Ok)
+            .unwrap_or_else(|| {
+                Ok(FieldidsWeightsMap::from_field_id_map_without_searchable(
+                    &self.fields_ids_map(rtxn)?,
+                ))
+            })
+    }
+
+    /// Delete the fieldsids weights map
+    pub fn delete_fieldids_weights_map(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.remap_key_type::<Str>().delete(wtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)
+    }
+
+    pub fn searchable_fields_and_weights<'a>(
+        &self,
+        rtxn: &'a RoTxn,
+    ) -> Result<Vec<(Cow<'a, str>, FieldId, Weight)>> {
+        let fid_map = self.fields_ids_map(rtxn)?;
+        let weight_map = self.fieldids_weights_map(rtxn)?;
+        let searchable = self.searchable_fields(rtxn)?;
+
+        searchable
+            .into_iter()
+            .map(|field| -> Result<_> {
+                let fid = fid_map.id(&field).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
+                    field_name: field.to_string(),
+                    process: "searchable_fields_and_weights",
+                })?;
+                let weight = weight_map
+                    .weight(fid)
+                    .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?;
+
+                Ok((field, fid, weight))
+            })
+            .collect()
+    }
+
     /* geo rtree */

     /// Writes the provided `rtree` which associates coordinates to documents ids.
@@ -578,33 +640,42 @@ impl Index {
         wtxn: &mut RwTxn,
         user_fields: &[&str],
         fields_ids_map: &FieldsIdsMap,
-    ) -> heed::Result<()> {
+    ) -> Result<()> {
         // We can write the user defined searchable fields as-is.
         self.put_user_defined_searchable_fields(wtxn, user_fields)?;

+        let mut weights = FieldidsWeightsMap::default();
+
         // Now we generate the real searchable fields:
         // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion.
         // 2. Iterate over the user defined searchable fields.
         // 3. If a user defined field is a subset of a field defined in the fields_ids_map
-        // (ie doggo.name is a subset of doggo) then we push it at the end of the fields.
-        let mut real_fields = user_fields.to_vec();
-
-        for field_from_map in fields_ids_map.names() {
-            for user_field in user_fields {
+        // (ie doggo.name is a subset of doggo) right after doggo and with the same weight.
+        let mut real_fields = Vec::new();
+
+        for (id, field_from_map) in fields_ids_map.iter() {
+            for (weight, user_field) in user_fields.iter().enumerate() {
                 if crate::is_faceted_by(field_from_map, user_field)
-                    && !user_fields.contains(&field_from_map)
+                    && !real_fields.contains(&field_from_map)
                 {
                     real_fields.push(field_from_map);
+
+                    let weight: u16 =
+                        weight.try_into().map_err(|_| UserError::AttributeLimitReached)?;
+                    weights.insert(id, weight);
                 }
             }
         }

-        self.put_searchable_fields(wtxn, &real_fields)
+        self.put_searchable_fields(wtxn, &real_fields)?;
+        self.put_fieldids_weights_map(wtxn, &weights)?;
+
+        Ok(())
     }

     pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
         let did_delete_searchable = self.delete_searchable_fields(wtxn)?;
         let did_delete_user_defined = self.delete_user_defined_searchable_fields(wtxn)?;
+        self.delete_fieldids_weights_map(wtxn)?;
+
         Ok(did_delete_searchable || did_delete_user_defined)
     }
@@ -623,28 +694,31 @@ impl Index {
     }

     /// Returns the searchable fields, those are the fields that are indexed,
-    /// if the searchable fields aren't there it means that **all** the fields are indexed.
-    pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Option<Vec<&'t str>>> {
+    pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Vec<Cow<'t, str>>> {
         self.main
             .remap_types::<Str, SerdeBincode<Vec<&'t str>>>()
-            .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)
+            .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)?
+            .map(|fields| Ok(fields.into_iter().map(Cow::Borrowed).collect()))
+            .unwrap_or_else(|| {
+                Ok(self
+                    .fields_ids_map(rtxn)?
+                    .names()
+                    .map(|field| Cow::Owned(field.to_string()))
+                    .collect())
+            })
     }

     /// Identical to `searchable_fields`, but returns the ids instead.
-    pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result<Option<Vec<FieldId>>> {
-        match self.searchable_fields(rtxn)? {
-            Some(fields) => {
-                let fields_ids_map = self.fields_ids_map(rtxn)?;
-                let mut fields_ids = Vec::new();
-                for name in fields {
-                    if let Some(field_id) = fields_ids_map.id(name) {
-                        fields_ids.push(field_id);
-                    }
-                }
-                Ok(Some(fields_ids))
-            }
-            None => Ok(None),
-        }
+    pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result<Vec<FieldId>> {
+        let fields = self.searchable_fields(rtxn)?;
+        let fields_ids_map = self.fields_ids_map(rtxn)?;
+        let mut fields_ids = Vec::new();
+        for name in fields {
+            if let Some(field_id) = fields_ids_map.id(&name) {
+                fields_ids.push(field_id);
+            }
+        }
+        Ok(fields_ids)
     }

     /// Writes the searchable fields, when this list is specified, only these are indexed.
@@ -1710,10 +1784,14 @@ pub(crate) mod tests {
         ]))
         .unwrap();

-        db_snap!(index, field_distribution, 1);
+        db_snap!(index, field_distribution, @r###"
+        age 1 |
+        id 2 |
+        name 2 |
+        "###);

         db_snap!(index, word_docids,
             @r###"
         1 [0, ]
         2 [1, ]
         20 [1, ]
@@ -1722,18 +1800,6 @@ pub(crate) mod tests {
         "###
         );

-        db_snap!(index, field_distribution);
-
-        db_snap!(index, field_distribution,
-            @r###"
-        age 1 |
-        id 2 |
-        name 2 |
-        "###
-        );
-
-        // snapshot_index!(&index, "1", include: "^field_distribution$");
-
         // we add all the documents a second time. we are supposed to get the same
         // field_distribution in the end
         index
@@ -1820,7 +1886,7 @@ pub(crate) mod tests {
         // ensure we get the right real searchable fields + user defined searchable fields
         let rtxn = index.read_txn().unwrap();

-        let real = index.searchable_fields(&rtxn).unwrap().unwrap();
+        let real = index.searchable_fields(&rtxn).unwrap();
         assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]);

         let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap();
@@ -1840,7 +1906,7 @@ pub(crate) mod tests {
         // ensure we get the right real searchable fields + user defined searchable fields
         let rtxn = index.read_txn().unwrap();

-        let real = index.searchable_fields(&rtxn).unwrap().unwrap();
+        let real = index.searchable_fields(&rtxn).unwrap();
         assert_eq!(real, &["doggo", "name"]);
         let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap();
         assert_eq!(user_defined, &["doggo", "name"]);
@@ -1856,7 +1922,7 @@ pub(crate) mod tests {
         // ensure we get the right real searchable fields + user defined searchable fields
         let rtxn = index.read_txn().unwrap();

-        let real = index.searchable_fields(&rtxn).unwrap().unwrap();
+        let real = index.searchable_fields(&rtxn).unwrap();
         assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]);

         let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap();
@@ -2395,6 +2461,14 @@ pub(crate) mod tests {
         11 0
         4 1
         "###);
+        db_snap!(index, fields_ids_map, @r###"
+        0 primary_key |
+        "###);
+        db_snap!(index, searchable_fields, @r###"["primary_key"]"###);
+        db_snap!(index, fieldids_weights_map, @r###"
+        fid weight
+        0 0 |
+        "###);

         index
             .add_documents(documents!([
@@ -2410,6 +2484,16 @@ pub(crate) mod tests {
         11 0
         4 1
         "###);
+        db_snap!(index, fields_ids_map, @r###"
+        0 primary_key |
+        1 a |
+        "###);
+        db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###);
+        db_snap!(index, fieldids_weights_map, @r###"
+        fid weight
+        0 0 |
+        1 0 |
+        "###);

         index.delete_documents(Default::default());
@@ -2420,6 +2504,16 @@ pub(crate) mod tests {
         11 0
         4 1
         "###);
+        db_snap!(index, fields_ids_map, @r###"
+        0 primary_key |
+        1 a |
+        "###);
+        db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###);
+        db_snap!(index, fieldids_weights_map, @r###"
+        fid weight
+        0 0 |
+        1 0 |
+        "###);

         index
             .add_documents(documents!([
@@ -2435,6 +2529,16 @@ pub(crate) mod tests {
         11 0
         4 1
         "###);
+        db_snap!(index, fields_ids_map, @r###"
+        0 primary_key |
+        1 a |
+        "###);
+        db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###);
+        db_snap!(index, fieldids_weights_map, @r###"
+        fid weight
+        0 0 |
+        1 0 |
+        "###);

         let rtxn = index.read_txn().unwrap();
         let search = Search::new(&rtxn, &index);
@@ -2520,4 +2624,104 @@ pub(crate) mod tests {
         db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted
     }
+
+    #[test]
+    fn swapping_searchable_attributes() {
+        // See https://github.com/meilisearch/meilisearch/issues/4484
+
+        let index = TempIndex::new();
+
+        index
+            .update_settings(|settings| {
+                settings.set_searchable_fields(vec![S("name")]);
+                settings.set_filterable_fields(HashSet::from([S("age")]));
+            })
+            .unwrap();
+
+        index
+            .add_documents(documents!({ "id": 1, "name": "Many", "age": 28, "realName": "Maxime" }))
+            .unwrap();
+        db_snap!(index, fields_ids_map, @r###"
+        0 name |
+        1 id |
+        2 age |
+        3 realName |
+        "###);
+        db_snap!(index, searchable_fields, @r###"["name"]"###);
+        db_snap!(index, fieldids_weights_map, @r###"
+        fid weight
+        0 0 |
+        "###);
+
+        index
+            .update_settings(|settings| {
+                settings.set_searchable_fields(vec![S("name"), S("realName")]);
+                settings.set_filterable_fields(HashSet::from([S("age")]));
+            })
+            .unwrap();
+
+        // The order of the field id map shouldn't change
+        db_snap!(index, fields_ids_map, @r###"
+        0 name |
+        1 id |
+        2 age |
+        3 realName |
+        "###);
+        db_snap!(index, searchable_fields, @r###"["name", "realName"]"###);
+        db_snap!(index, fieldids_weights_map, @r###"
+        fid weight
+        0 0 |
+        3 1 |
+        "###);
+    }
+
+    #[test]
+    fn attribute_weights_after_swapping_searchable_attributes() {
+        // See https://github.com/meilisearch/meilisearch/issues/4484
+
+        let index = TempIndex::new();
+
+        index
+            .update_settings(|settings| {
+                settings.set_searchable_fields(vec![S("name"), S("beverage")]);
+            })
+            .unwrap();
+
+        index
+            .add_documents(documents!([
+                { "id": 0, "name": "kefir", "beverage": "water" },
+                { "id": 1, "name": "tamo", "beverage": "kefir" }
+            ]))
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let mut search = index.search(&rtxn);
+        let results = search.query("kefir").execute().unwrap();
+
+        // We should find kefir the dog first
+        insta::assert_debug_snapshot!(results.documents_ids, @r###"
+        [
+            0,
+            1,
+        ]
+        "###);
+
+        index
+            .update_settings(|settings| {
+                settings.set_searchable_fields(vec![S("beverage"), S("name")]);
+            })
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let mut search = index.search(&rtxn);
+        let results = search.query("kefir").execute().unwrap();
+
+        // We should find tamo first
+        insta::assert_debug_snapshot!(results.documents_ids, @r###"
+        [
+            1,
+            0,
+        ]
+        "###);
+    }
 }


@@ -28,6 +28,7 @@ pub mod vector;
 #[cfg(test)]
 #[macro_use]
 pub mod snapshot_tests;
+mod fieldids_weights_map;

 use std::collections::{BTreeMap, HashMap};
 use std::convert::{TryFrom, TryInto};
@@ -52,6 +53,7 @@ pub use self::error::{
     Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
 };
 pub use self::external_documents_ids::ExternalDocumentsIds;
+pub use self::fieldids_weights_map::FieldidsWeightsMap;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{
     BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
@@ -77,6 +79,7 @@ pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
 pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
 pub type FieldDistribution = BTreeMap<String, u64>;
 pub type FieldId = u16;
+pub type Weight = u16;
 pub type Object = serde_json::Map<String, serde_json::Value>;
 pub type Position = u32;
 pub type RelativePosition = u16;


@@ -147,7 +147,7 @@ impl<'a> Search<'a> {
     pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
         if has_vector_search {
-            let ctx = SearchContext::new(self.index, self.rtxn);
+            let ctx = SearchContext::new(self.index, self.rtxn)?;
             filtered_universe(&ctx, &self.filter)
         } else {
             Ok(self.execute()?.candidates)
@@ -155,10 +155,10 @@ impl<'a> Search<'a> {
     }

     pub fn execute(&self) -> Result<SearchResult> {
-        let mut ctx = SearchContext::new(self.index, self.rtxn);
+        let mut ctx = SearchContext::new(self.index, self.rtxn)?;

         if let Some(searchable_attributes) = self.searchable_attributes {
-            ctx.searchable_attributes(searchable_attributes)?;
+            ctx.attributes_to_search_on(searchable_attributes)?;
         }

         let universe = filtered_universe(&ctx, &self.filter)?;


@@ -101,7 +101,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     let mut ranking_rule_universes: Vec<RoaringBitmap> =
         vec![RoaringBitmap::default(); ranking_rules_len];
-    ranking_rule_universes[0] = universe.clone();
+    ranking_rule_universes[0].clone_from(universe);
     let mut cur_ranking_rule_index = 0;

     /// Finish iterating over the current ranking rule, yielding
@@ -232,7 +232,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     }
     cur_ranking_rule_index += 1;
-    ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone();
+    ranking_rule_universes[cur_ranking_rule_index].clone_from(&next_bucket.candidates);
     logger.start_iteration_ranking_rule(
         cur_ranking_rule_index,
         ranking_rules[cur_ranking_rule_index].as_ref(),


@@ -163,7 +163,7 @@ impl<'ctx> SearchContext<'ctx> {
             Some(restricted_fids) => {
                 let interned = self.word_interner.get(word).as_str();
                 let keys: Vec<_> =
-                    restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect();
+                    restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();

                 DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
                     self.txn,
@@ -192,7 +192,7 @@ impl<'ctx> SearchContext<'ctx> {
             Some(restricted_fids) => {
                 let interned = self.word_interner.get(word).as_str();
                 let keys: Vec<_> =
-                    restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect();
+                    restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();

                 DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
                     self.txn,
@@ -242,7 +242,7 @@ impl<'ctx> SearchContext<'ctx> {
             Some(restricted_fids) => {
                 let interned = self.word_interner.get(prefix).as_str();
                 let keys: Vec<_> =
-                    restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect();
+                    restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();

                 DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
                     self.txn,
@@ -271,7 +271,7 @@ impl<'ctx> SearchContext<'ctx> {
             Some(restricted_fids) => {
                 let interned = self.word_interner.get(prefix).as_str();
                 let keys: Vec<_> =
-                    restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect();
+                    restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();

                 DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
                     self.txn,
@@ -315,11 +315,7 @@ impl<'ctx> SearchContext<'ctx> {
                 .map_err(heed::Error::Decoding)?
         } else {
             // Compute the distance at the attribute level and store it in the cache.
-            let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? {
-                fids
-            } else {
-                self.index.fields_ids_map(self.txn)?.ids().collect()
-            };
+            let fids = self.index.searchable_fields_ids(self.txn)?;
             let mut docids = RoaringBitmap::new();
             for fid in fids {
                 // for each field, intersect left word bitmap and right word bitmap,
@@ -408,11 +404,7 @@ impl<'ctx> SearchContext<'ctx> {
         let prefix_docids = match proximity_precision {
             ProximityPrecision::ByAttribute => {
                 // Compute the distance at the attribute level and store it in the cache.
-                let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? {
-                    fids
-                } else {
-                    self.index.fields_ids_map(self.txn)?.ids().collect()
-                };
+                let fids = self.index.searchable_fields_ids(self.txn)?;
                 let mut prefix_docids = RoaringBitmap::new();
                 // for each field, intersect left word bitmap and right word bitmap,
                 // then merge the result in a global bitmap before storing it in the cache.


@@ -184,13 +184,7 @@ impl State {
             return Ok(State::Empty(query_graph.clone()));
         }

-        let searchable_fields_ids = {
-            if let Some(fids) = ctx.index.searchable_fields_ids(ctx.txn)? {
-                fids
-            } else {
-                ctx.index.fields_ids_map(ctx.txn)?.ids().collect()
-            }
-        };
+        let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?;

         let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
         // then check that there exists at least one attribute that has all of the terms


@@ -258,7 +258,7 @@ pub(crate) mod tests {
     fn matching_words() {
         let temp_index = temp_index_with_documents();
         let rtxn = temp_index.read_txn().unwrap();
-        let mut ctx = SearchContext::new(&temp_index, &rtxn);
+        let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
         let mut builder = TokenizerBuilder::default();
         let tokenizer = builder.build();
         let tokens = tokenizer.tokenize("split this world");


@@ -506,7 +506,7 @@ mod tests {
     impl<'a> MatcherBuilder<'a> {
         fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
-            let mut ctx = SearchContext::new(index, rtxn);
+            let mut ctx = SearchContext::new(index, rtxn).unwrap();
             let universe = filtered_universe(&ctx, &None).unwrap();
             let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
                 &mut ctx,


@@ -49,13 +49,12 @@ pub use self::geo_sort::Strategy as GeoSortStrategy;
 use self::graph_based_ranking_rule::Words;
 use self::interner::Interned;
 use self::vector_sort::VectorSort;
-use crate::error::FieldIdMapMissingEntry;
 use crate::score_details::{ScoreDetails, ScoringStrategy};
 use crate::search::new::distinct::apply_distinct_rule;
 use crate::vector::Embedder;
 use crate::{
     AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget,
-    UserError,
+    UserError, Weight,
 };

 /// A structure used throughout the execution of a search query.
@@ -71,8 +70,21 @@ pub struct SearchContext<'ctx> {
 }

 impl<'ctx> SearchContext<'ctx> {
-    pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self {
-        Self {
+    pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Result<Self> {
+        let searchable_fids = index.searchable_fields_and_weights(txn)?;
+        let exact_attributes_ids = index.exact_attributes_ids(txn)?;
+
+        let mut exact = Vec::new();
+        let mut tolerant = Vec::new();
+        for (_name, fid, weight) in searchable_fids {
+            if exact_attributes_ids.contains(&fid) {
+                exact.push((fid, weight));
+            } else {
+                tolerant.push((fid, weight));
+            }
+        }
+
+        Ok(Self {
             index,
             txn,
             db_cache: <_>::default(),
@@ -81,42 +93,39 @@ impl<'ctx> SearchContext<'ctx> {
             term_interner: <_>::default(),
             phrase_docids: <_>::default(),
             restricted_fids: None,
-        }
+        })
     }

-    pub fn searchable_attributes(&mut self, searchable_attributes: &'ctx [String]) -> Result<()> {
-        let fids_map = self.index.fields_ids_map(self.txn)?;
-        let searchable_names = self.index.searchable_fields(self.txn)?;
+    pub fn attributes_to_search_on(
+        &mut self,
+        attributes_to_search_on: &'ctx [String],
+    ) -> Result<()> {
+        let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?;
+        let searchable_fields_weights = self.index.searchable_fields_and_weights(self.txn)?;
         let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?;

+        let mut wildcard = false;
+
         let mut restricted_fids = RestrictedFids::default();
-        let mut contains_wildcard = false;
-        for field_name in searchable_attributes {
+        for field_name in attributes_to_search_on {
             if field_name == "*" {
-                contains_wildcard = true;
+                wildcard = true;
+                // we cannot early exit as we want to returns error in case of unknown fields
                 continue;
             }
-            let searchable_contains_name =
-                searchable_names.as_ref().map(|sn| sn.iter().any(|name| name == field_name));
-            let fid = match (fids_map.id(field_name), searchable_contains_name) {
+            let searchable_weight =
+                searchable_fields_weights.iter().find(|(name, _, _)| name == field_name);
+            let (fid, weight) = match searchable_weight {
                 // The Field id exist and the field is searchable
-                (Some(fid), Some(true)) | (Some(fid), None) => fid,
-                // The field is searchable but the Field id doesn't exist => Internal Error
-                (None, Some(true)) => {
-                    return Err(FieldIdMapMissingEntry::FieldName {
-                        field_name: field_name.to_string(),
-                        process: "search",
-                    }
-                    .into())
-                }
-                // The field is not searchable, but the searchableAttributes are set to * => ignore field
-                (None, None) => continue,
+                Some((_name, fid, weight)) => (*fid, *weight),
+                // The field is not searchable but the user didn't define any searchable attributes
+                None if user_defined_searchable.is_none() => continue,
                 // The field is not searchable => User error
-                (_fid, Some(false)) => {
-                    let (valid_fields, hidden_fields) = match searchable_names {
-                        Some(sn) => self.index.remove_hidden_fields(self.txn, sn)?,
-                        None => self.index.remove_hidden_fields(self.txn, fids_map.names())?,
-                    };
+                None => {
+                    let (valid_fields, hidden_fields) = self.index.remove_hidden_fields(
+                        self.txn,
+                        searchable_fields_weights.iter().map(|(name, _, _)| name),
+                    )?;

                     let field = field_name.to_string();
                     return Err(UserError::InvalidSearchableAttribute {
@@ -129,13 +138,17 @@ impl<'ctx> SearchContext<'ctx> {
             };

             if exact_attributes_ids.contains(&fid) {
-                restricted_fids.exact.push(fid);
+                restricted_fids.exact.push((fid, weight));
             } else {
-                restricted_fids.tolerant.push(fid);
+                restricted_fids.tolerant.push((fid, weight));
             };
         }

-        self.restricted_fids = (!contains_wildcard).then_some(restricted_fids);
+        if wildcard {
+            self.restricted_fids = None;
+        } else {
+            self.restricted_fids = Some(restricted_fids);
+        }

         Ok(())
     }
@@ -158,13 +171,13 @@ impl Word {
 #[derive(Debug, Clone, Default)]
 pub struct RestrictedFids {
-    pub tolerant: Vec<FieldId>,
-    pub exact: Vec<FieldId>,
+    pub tolerant: Vec<(FieldId, Weight)>,
+    pub exact: Vec<(FieldId, Weight)>,
 }

 impl RestrictedFids {
     pub fn contains(&self, fid: &FieldId) -> bool {
-        self.tolerant.contains(fid) || self.exact.contains(fid)
+        self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid)
     }
 }


@@ -366,7 +366,7 @@ mod tests {
         let tokens = tokenizer.tokenize(".");

         let index = temp_index_with_documents();
         let rtxn = index.read_txn()?;
-        let mut ctx = SearchContext::new(&index, &rtxn);
+        let mut ctx = SearchContext::new(&index, &rtxn)?;
         // panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
         let ExtractedTokens { query_terms, .. } =
             located_query_terms_from_tokens(&mut ctx, tokens, None)?;


@@ -7,12 +7,12 @@ use crate::search::new::interner::{DedupInterner, Interned};
 use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id;
 use crate::search::new::SearchContext;
-use crate::Result;
+use crate::{FieldId, InternalError, Result};

 #[derive(Clone, PartialEq, Eq, Hash)]
 pub struct FidCondition {
     term: LocatedQueryTermSubset,
-    fid: u16,
+    fid: Option<FieldId>,
 }

 pub enum FidGraph {}
@@ -26,13 +26,15 @@ impl RankingRuleGraphTrait for FidGraph {
         universe: &RoaringBitmap,
     ) -> Result<ComputedCondition> {
         let FidCondition { term, .. } = condition;

-        // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument
-        let mut docids = compute_query_term_subset_docids_within_field_id(
-            ctx,
-            &term.term_subset,
-            condition.fid,
-        )?;
-        docids &= universe;
+        let docids = if let Some(fid) = condition.fid {
+            // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument
+            let docids =
+                compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?;
+            docids & universe
+        } else {
+            RoaringBitmap::new()
+        };

         Ok(ComputedCondition {
             docids,
@@ -68,34 +70,29 @@ impl RankingRuleGraphTrait for FidGraph {
             all_fields.extend(fields);
         }

+        let weights_map = ctx.index.fieldids_weights_map(ctx.txn)?;
+
         let mut edges = vec![];
         for fid in all_fields.iter().copied() {
+            let weight = weights_map
+                .weight(fid)
+                .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?;
             edges.push((
-                fid as u32 * term.term_ids.len() as u32,
-                conditions_interner.insert(FidCondition { term: term.clone(), fid }),
+                weight as u32 * term.term_ids.len() as u32,
+                conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }),
             ));
         }

         // always lookup the max_fid if we don't already and add an artificial condition for max scoring
-        let max_fid: Option<u16> = {
-            if let Some(max_fid) = ctx
-                .index
-                .searchable_fields_ids(ctx.txn)?
-                .map(|field_ids| field_ids.into_iter().max())
-            {
-                max_fid
-            } else {
-                ctx.index.fields_ids_map(ctx.txn)?.ids().max()
-            }
-        };
+        let max_weight: Option<u16> = weights_map.max_weight();

-        if let Some(max_fid) = max_fid {
-            if !all_fields.contains(&max_fid) {
+        if let Some(max_weight) = max_weight {
+            if !all_fields.contains(&max_weight) {
                 edges.push((
-                    max_fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
+                    max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
                     conditions_interner.insert(FidCondition {
                         term: term.clone(), // TODO remove this ugly clone
-                        fid: max_fid,
+                        fid: None,
                     }),
                 ));
             }


@@ -1,5 +1,5 @@
 use crate::index::tests::TempIndex;
-use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy};
+use crate::{db_snap, Criterion, Search, SearchResult, TermsMatchingStrategy};

 fn create_index() -> TempIndex {
     let index = TempIndex::new();
@@ -131,6 +131,19 @@ fn test_attribute_fid_simple() {
 #[test]
 fn test_attribute_fid_ngrams() {
     let index = create_index();
+    db_snap!(index, fields_ids_map, @r###"
+    0 id |
+    1 title |
+    2 description |
+    3 plot |
+    "###);
+    db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###);
+    db_snap!(index, fieldids_weights_map, @r###"
+    fid weight
+    1 0 |
+    2 1 |
+    3 2 |
+    "###);

     let txn = index.read_txn().unwrap();


@ -0,0 +1,244 @@
---
source: milli/src/search/new/tests/attribute_fid.rs
expression: "format!(\"{document_ids_scores:#?}\")"
---
[
(
2,
[
Fid(
Rank {
rank: 19,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
(
6,
[
Fid(
Rank {
rank: 15,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
5,
[
Fid(
Rank {
rank: 14,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
4,
[
Fid(
Rank {
rank: 13,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
3,
[
Fid(
Rank {
rank: 12,
max_rank: 19,
},
),
Position(
Rank {
rank: 83,
max_rank: 91,
},
),
],
),
(
9,
[
Fid(
Rank {
rank: 11,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
8,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
7,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 73,
max_rank: 91,
},
),
],
),
(
11,
[
Fid(
Rank {
rank: 7,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
10,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
13,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
12,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 78,
max_rank: 91,
},
),
],
),
(
14,
[
Fid(
Rank {
rank: 5,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
0,
[
Fid(
Rank {
rank: 1,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
]


@@ -308,6 +308,25 @@ pub fn snap_fields_ids_map(index: &Index) -> String {
     }
     snap
 }
+pub fn snap_fieldids_weights_map(index: &Index) -> String {
+    let rtxn = index.read_txn().unwrap();
+    let weights_map = index.fieldids_weights_map(&rtxn).unwrap();
+
+    let mut snap = String::new();
+    writeln!(&mut snap, "fid weight").unwrap();
+    let mut field_ids: Vec<_> = weights_map.ids().collect();
+    field_ids.sort();
+    for field_id in field_ids {
+        let weight = weights_map.weight(field_id).unwrap();
+        writeln!(&mut snap, "{field_id:<3} {weight:<3} |").unwrap();
+    }
+
+    snap
+}
+pub fn snap_searchable_fields(index: &Index) -> String {
+    let rtxn = index.read_txn().unwrap();
+    let searchable_fields = index.searchable_fields(&rtxn).unwrap();
+    format!("{searchable_fields:?}")
+}
 pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
     let rtxn = index.read_txn().unwrap();
     let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap();
@@ -469,6 +488,12 @@ macro_rules! full_snap_of_db {
     ($index:ident, fields_ids_map) => {{
         $crate::snapshot_tests::snap_fields_ids_map(&$index)
     }};
+    ($index:ident, fieldids_weights_map) => {{
+        $crate::snapshot_tests::snap_fieldids_weights_map(&$index)
+    }};
+    ($index:ident, searchable_fields) => {{
+        $crate::snapshot_tests::snap_searchable_fields(&$index)
+    }};
     ($index:ident, geo_faceted_documents_ids) => {{
         $crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index)
     }};


@@ -186,7 +186,7 @@ fn searchable_fields_changed(
 ) -> bool {
     let searchable_fields = &settings_diff.new.searchable_fields_ids;
     for (field_id, field_bytes) in obkv.iter() {
-        if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
+        if searchable_fields.contains(&field_id) {
             let del_add = KvReaderDelAdd::new(field_bytes);
             match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
                 // if both fields are None, check the next field.
@@ -298,7 +298,7 @@ fn lang_safe_tokens_from_document<'a>(
 /// Extract words mapped with their positions of a document.
 fn tokens_from_document<'a>(
     obkv: &KvReader<FieldId>,
-    searchable_fields: &Option<Vec<FieldId>>,
+    searchable_fields: &[FieldId],
     tokenizer: &Tokenizer,
     max_positions_per_attributes: u32,
     del_add: DelAdd,
@@ -309,7 +309,7 @@ fn tokens_from_document<'a>(
     let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
     for (field_id, field_bytes) in obkv.iter() {
         // if field is searchable.
-        if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
+        if searchable_fields.as_ref().contains(&field_id) {
             // extract deletion or addition only.
             if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
                 // parse json.


@@ -3260,6 +3260,7 @@ mod tests {
     }

     #[test]
+    #[cfg(feature = "all-tokenizations")]
     fn stored_detected_script_and_language_should_not_return_deleted_documents() {
         use charabia::{Language, Script};

         let index = TempIndex::new();


@@ -461,50 +461,39 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         Ok(true)
     }

-    /// Updates the index's searchable attributes. This causes the field map to be recomputed to
-    /// reflect the order of the searchable attributes.
+    /// Updates the index's searchable attributes.
     fn update_searchable(&mut self) -> Result<bool> {
         match self.searchable_fields {
             Setting::Set(ref fields) => {
                 // Check to see if the searchable fields changed before doing anything else
                 let old_fields = self.index.searchable_fields(self.wtxn)?;
-                let did_change = match old_fields {
-                    // If old_fields is Some, let's check to see if the fields actually changed
-                    Some(old_fields) => {
-                        let new_fields = fields.iter().map(String::as_str).collect::<Vec<_>>();
-                        new_fields != old_fields
-                    }
-                    // If old_fields is None, the fields have changed (because they are being set)
-                    None => true,
+                let did_change = {
+                    let new_fields = fields.iter().map(String::as_str).collect::<Vec<_>>();
+                    new_fields != old_fields
                 };
                 if !did_change {
                     return Ok(false);
                 }

-                // every time the searchable attributes are updated, we need to update the
-                // ids for any settings that uses the facets. (distinct_fields, filterable_fields).
-                let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
-
-                let mut new_fields_ids_map = FieldsIdsMap::new();
+                // Since we're updating the settings we can only add new fields at the end of the field id map
+                let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
                 // fields are deduplicated, only the first occurrence is taken into account
                 let names = fields.iter().unique().map(String::as_str).collect::<Vec<_>>();

                 // Add all the searchable attributes to the field map, and then add the
                 // remaining fields from the old field map to the new one
                 for name in names.iter() {
-                    new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
-                }
-
-                for (_, name) in old_fields_ids_map.iter() {
-                    new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
+                    // The fields ids map won't change the field id of already present elements thus only the
+                    // new fields will be inserted.
+                    fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
                 }

                 self.index.put_all_searchable_fields_from_fields_ids_map(
                     self.wtxn,
                     &names,
-                    &new_fields_ids_map,
+                    &fields_ids_map,
                 )?;
-                self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?;
+                self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
                 Ok(true)
             }
             Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?),
@@ -1172,7 +1161,7 @@ pub(crate) struct InnerIndexSettings {
     pub user_defined_faceted_fields: HashSet<String>,
     pub user_defined_searchable_fields: Option<Vec<String>>,
     pub faceted_fields_ids: HashSet<FieldId>,
-    pub searchable_fields_ids: Option<Vec<FieldId>>,
+    pub searchable_fields_ids: Vec<FieldId>,
     pub exact_attributes: HashSet<FieldId>,
     pub proximity_precision: ProximityPrecision,
     pub embedding_configs: EmbeddingConfigs,
@@ -1233,18 +1222,21 @@ impl InnerIndexSettings {
     // find and insert the new field ids
     pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
+        let searchable_fields = self
+            .user_defined_searchable_fields
+            .as_ref()
+            .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::<Vec<_>>());
+
         // in case new fields were introduced we're going to recreate the searchable fields.
-        if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() {
-            let searchable_fields =
-                searchable_fields.iter().map(String::as_ref).collect::<Vec<_>>();
+        if let Some(searchable_fields) = searchable_fields {
            index.put_all_searchable_fields_from_fields_ids_map(
                 wtxn,
                 &searchable_fields,
                 &self.fields_ids_map,
             )?;
-            let searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
-            self.searchable_fields_ids = searchable_fields_ids;
         }
+        let searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
+        self.searchable_fields_ids = searchable_fields_ids;

         Ok(())
     }
@@ -1517,12 +1509,13 @@ mod tests {
     use big_s::S;
     use heed::types::Bytes;
     use maplit::{btreemap, btreeset, hashset};
+    use meili_snap::snapshot;

     use super::*;
     use crate::error::Error;
     use crate::index::tests::TempIndex;
     use crate::update::ClearDocuments;
-    use crate::{Criterion, Filter, SearchResult};
+    use crate::{db_snap, Criterion, Filter, SearchResult};

     #[test]
     fn set_and_reset_searchable_fields() {
@@ -1551,6 +1544,17 @@ mod tests {
         wtxn.commit().unwrap();

+        db_snap!(index, fields_ids_map, @r###"
+        0 id |
+        1 name |
+        2 age |
+        "###);
+        db_snap!(index, searchable_fields, @r###"["name"]"###);
+        db_snap!(index, fieldids_weights_map, @r###"
+        fid weight
+        1 0 |
+        "###);
+
         // Check that the searchable field is correctly set to "name" only.
         let rtxn = index.read_txn().unwrap();
         // When we search for something that is not in
@@ -1562,8 +1566,9 @@ mod tests {
         // we must find the appropriate document.
         let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap();
         let documents = index.documents(&rtxn, result.documents_ids).unwrap();
+        let fid_map = index.fields_ids_map(&rtxn).unwrap();
         assert_eq!(documents.len(), 1);
-        assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
+        assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
         drop(rtxn);

         // We change the searchable fields to be the "name" field only.
@@ -1573,14 +1578,31 @@ mod tests {
         })
         .unwrap();

+        db_snap!(index, fields_ids_map, @r###"
+        0 id |
+        1 name |
+        2 age |
+        "###);
+        db_snap!(index, searchable_fields, @r###"["id", "name", "age"]"###);
+        db_snap!(index, fieldids_weights_map, @r###"
+        fid weight
+        0 0 |
+        1 0 |
+        2 0 |
+        "###);
+
         // Check that the searchable field have been reset and documents are found now.
         let rtxn = index.read_txn().unwrap();
+        let fid_map = index.fields_ids_map(&rtxn).unwrap();
+        let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap();
+        snapshot!(format!("{user_defined_searchable_fields:?}"), @"None");
+        // the searchable fields should contain all the fields
         let searchable_fields = index.searchable_fields(&rtxn).unwrap();
-        assert_eq!(searchable_fields, None);
+        snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###);
         let result = index.search(&rtxn).query("23").execute().unwrap();
         assert_eq!(result.documents_ids.len(), 1);
         let documents = index.documents(&rtxn, result.documents_ids).unwrap();
-        assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
+        assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
     }

     #[test]