Introduce a new way to determine the operations to perform on the fields

This commit is contained in:
Clément Renault 2024-05-29 17:46:28 +02:00
parent bc210bdc00
commit d9e5074189
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
2 changed files with 34 additions and 41 deletions

View File

@ -843,29 +843,6 @@ impl<'a, 'i> Transform<'a, 'i> {
// Always keep the primary key.
let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) };
// If only the `searchableAttributes` has been changed, keep only the searchable fields.
// However, if only new searchable attributes are added, this function will
// return false has fields do not need to be reindexed.
let must_reindex_searchables = settings_diff.reindex_searchable();
let must_index_only_additional_searchables = &settings_diff.only_additional_fields();
let necessary_searchable_field_to_reindex = move |id: FieldId| -> bool {
must_index_only_additional_searchables.is_none()
&& must_reindex_searchables
&& (settings_diff.old.searchable_fields_ids.contains(&id)
|| settings_diff.new.searchable_fields_ids.contains(&id))
};
// If only new `searchableAttributes` are present, keep only those ones.
let additional_searchable_field_only = move |id: FieldId| -> bool {
match must_index_only_additional_searchables {
Some(additional_fields) => {
let additional_field = settings_diff.new.fields_ids_map.name(id).unwrap();
additional_fields.contains(additional_field)
}
None => false,
}
};
// If only a faceted field has been added, keep only this field.
let must_reindex_facets = settings_diff.reindex_facets();
let necessary_faceted_field = |id: FieldId| -> bool {
@ -880,20 +857,16 @@ impl<'a, 'i> Transform<'a, 'i> {
// we need the fields for the prompt/templating.
let reindex_vectors = settings_diff.reindex_vectors();
// The set of additional searchable fields only,
// the only purpose of these fields is to be indexed from scratch.
let mut additional_searchables_only = HashSet::new();
// The operations that we must perform on the different fields.
let mut operations = HashMap::new();
let mut obkv_writer = KvWriter::<_, FieldId>::memory();
for (id, val) in old_obkv.iter() {
if is_primary_key(id)
|| necessary_searchable_field_to_reindex(id)
|| necessary_faceted_field(id)
|| reindex_vectors
{
if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors {
operations.insert(id, DelAddOperation::DeletionAndAddition);
obkv_writer.insert(id, val)?;
} else if additional_searchable_field_only(id) {
additional_searchables_only.insert(id);
} else if let Some(operation) = settings_diff.reindex_searchable_id(id) {
operations.insert(id, operation);
obkv_writer.insert(id, val)?;
}
}
@ -913,13 +886,7 @@ impl<'a, 'i> Transform<'a, 'i> {
flattened_obkv_buffer.clear();
into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| {
// If the field is only required because it is an additional
// searchable field only define it as an DelAdd::Addition only.
if additional_searchables_only.contains(&id) {
DelAddOperation::Addition
} else {
DelAddOperation::DeletionAndAddition
}
operations.get(&id).copied().unwrap_or(DelAddOperation::DeletionAndAddition)
})?;
}

View File

@ -9,6 +9,7 @@ use itertools::{EitherOrBoth, Itertools};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime;
use super::del_add::DelAddOperation;
use super::index_documents::{IndexDocumentsConfig, Transform};
use super::IndexerConfig;
use crate::criterion::Criterion;
@ -1112,6 +1113,31 @@ impl InnerIndexSettingsDiff {
|| self.old.proximity_precision != self.new.proximity_precision
}
pub fn reindex_searchable_id(&self, id: FieldId) -> Option<DelAddOperation> {
if self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
!= self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|| self.old.allowed_separators != self.new.allowed_separators
|| self.old.dictionary != self.new.dictionary
|| self.old.exact_attributes != self.new.exact_attributes
// Here we can be much more optimal by just deleting the proximity database
|| self.old.proximity_precision != self.new.proximity_precision
{
Some(DelAddOperation::DeletionAndAddition)
} else if let Some(only_additional_fields) = self.only_additional_fields() {
let additional_field = self.new.fields_ids_map.name(id).unwrap();
if only_additional_fields.contains(additional_field) {
Some(DelAddOperation::Addition)
} else {
None
}
} else if self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields
{
Some(DelAddOperation::DeletionAndAddition)
} else {
None
}
}
/// Returns only the additional searchable fields.
/// If any other searchable field has been modified, returns None.
pub fn only_additional_fields(&self) -> Option<HashSet<String>> {
@ -1599,7 +1625,7 @@ mod tests {
// When we search for something that is not in
// the searchable fields it must not return any document.
let result = index.search(&rtxn).query("23").execute().unwrap();
assert!(result.documents_ids.is_empty());
assert_eq!(result.documents_ids, Vec::<u32>::new());
// When we search for something that is in the searchable fields
// we must find the appropriate document.