Construct the minimal OBKVs according to the settings diff

This commit is contained in:
Clément Renault 2024-05-22 16:05:55 +02:00
parent bc5663e673
commit fe17c0f52e
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
4 changed files with 122 additions and 64 deletions

View File

@ -680,6 +680,26 @@ async fn search_facet_distribution() {
}, },
) )
.await; .await;
index.update_settings(json!({"filterableAttributes": ["doggos.name"]})).await;
index.wait_task(5).await;
index
.search(
json!({
"facets": ["doggos.name"]
}),
|response, code| {
assert_eq!(code, 200, "{}", response);
let dist = response["facetDistribution"].as_object().unwrap();
assert_eq!(dist.len(), 1);
assert_eq!(
dist["doggos.name"],
json!({ "bobby": 1, "buddy": 1, "gros bill": 1, "turbo": 1, "fast": 1})
);
},
)
.await;
} }
#[actix_rt::test] #[actix_rt::test]

View File

@ -354,8 +354,7 @@ pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator<Item = impl AsR
/// assert!(!is_faceted_by("animaux.chien", "animaux.chie")); /// assert!(!is_faceted_by("animaux.chien", "animaux.chie"));
/// ``` /// ```
pub fn is_faceted_by(field: &str, facet: &str) -> bool { pub fn is_faceted_by(field: &str, facet: &str) -> bool {
field.starts_with(facet) field.starts_with(facet) && field[facet.len()..].chars().next().map_or(true, |c| c == '.')
&& field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true)
} }
pub fn normalize_facet(original: &str) -> String { pub fn normalize_facet(original: &str) -> String {

View File

@ -1,7 +1,7 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::btree_map::Entry as BEntry; use std::collections::btree_map::Entry as BEntry;
use std::collections::hash_map::Entry as HEntry; use std::collections::hash_map::Entry as HEntry;
use std::collections::HashMap; use std::collections::{HashMap, HashSet};
use std::fs::File; use std::fs::File;
use std::io::{Read, Seek}; use std::io::{Read, Seek};
@ -20,13 +20,13 @@ use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
use crate::error::{Error, InternalError, UserError}; use crate::error::{Error, InternalError, UserError};
use crate::index::{db_name, main_key}; use crate::index::{db_name, main_key};
use crate::update::del_add::{ use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
del_add_from_two_obkvs, into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd,
};
use crate::update::index_documents::GrenadParameters; use crate::update::index_documents::GrenadParameters;
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; use crate::{
is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result,
};
pub struct TransformOutput { pub struct TransformOutput {
pub primary_key: String, pub primary_key: String,
@ -808,11 +808,15 @@ impl<'a, 'i> Transform<'a, 'i> {
})?; })?;
let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?; let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?;
let fields_ids_map = self.fields_ids_map;
let primary_key_id = self.index.primary_key(wtxn)?.and_then(|name| fields_ids_map.id(name));
let mut new_inner_settings = old_inner_settings.clone(); let mut new_inner_settings = old_inner_settings.clone();
new_inner_settings.fields_ids_map = self.fields_ids_map; new_inner_settings.fields_ids_map = fields_ids_map;
let settings_diff = InnerIndexSettingsDiff { let settings_diff = InnerIndexSettingsDiff {
old: old_inner_settings, old: old_inner_settings,
new: new_inner_settings, new: new_inner_settings,
primary_key_id,
embedding_configs_updated: false, embedding_configs_updated: false,
settings_update_only: false, settings_update_only: false,
}; };
@ -837,37 +841,66 @@ impl<'a, 'i> Transform<'a, 'i> {
fn rebind_existing_document( fn rebind_existing_document(
old_obkv: KvReader<FieldId>, old_obkv: KvReader<FieldId>,
settings_diff: &InnerIndexSettingsDiff, settings_diff: &InnerIndexSettingsDiff,
original_obkv_buffer: &mut Vec<u8>, modified_faceted_fields: &HashSet<String>,
flattened_obkv_buffer: &mut Vec<u8>, original_obkv_buffer: Option<&mut Vec<u8>>,
flattened_obkv_buffer: Option<&mut Vec<u8>>,
) -> Result<()> { ) -> Result<()> {
// TODO do a XOR of the faceted fields // Always keep the primary key.
// TODO if reindex_searchable returns true store all searchables else none let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) };
// TODO no longer useful after Tamo's PR
let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone(); // If only the `searchableAttributes` has been changed, keep only the searchable fields.
let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone(); let must_reindex_searchables = settings_diff.reindex_searchable();
let necessary_searchable_field = |id: FieldId| -> bool {
must_reindex_searchables
&& (settings_diff.old.searchable_fields_ids.contains(&id)
|| settings_diff.new.searchable_fields_ids.contains(&id))
};
// If only a faceted field has been added, keep only this field.
let must_reindex_facets = settings_diff.reindex_facets();
let necessary_faceted_field = |id: FieldId| -> bool {
let field_name = settings_diff.new.fields_ids_map.name(id).unwrap();
must_reindex_facets
&& modified_faceted_fields
.iter()
.any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long))
};
// Alway provide all fields when vectors are involved because
// we need the fields for the prompt/templating.
let reindex_vectors = settings_diff.reindex_vectors();
let mut obkv_writer = KvWriter::<_, FieldId>::memory(); let mut obkv_writer = KvWriter::<_, FieldId>::memory();
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. for (id, val) in old_obkv.iter() {
for (id, name) in new_fields_ids_map.iter() { if is_primary_key(id)
if let Some(val) = old_fields_ids_map.id(name).and_then(|id| old_obkv.get(id)) { || necessary_searchable_field(id)
|| necessary_faceted_field(id)
|| reindex_vectors
{
obkv_writer.insert(id, val)?; obkv_writer.insert(id, val)?;
} }
} }
let data = obkv_writer.into_inner()?; let data = obkv_writer.into_inner()?;
let new_obkv = KvReader::<FieldId>::new(&data); let obkv = KvReader::<FieldId>::new(&data);
// take the non-flattened version if flatten_from_fields_ids_map returns None.
let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?;
let old_flattened =
old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::<FieldId>::new);
let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?;
let new_flattened =
new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::<FieldId>::new);
if let Some(original_obkv_buffer) = original_obkv_buffer {
original_obkv_buffer.clear(); original_obkv_buffer.clear();
flattened_obkv_buffer.clear(); into_del_add_obkv(obkv, DelAddOperation::DeletionAndAddition, original_obkv_buffer)?;
}
del_add_from_two_obkvs(&old_obkv, &new_obkv, original_obkv_buffer)?; if let Some(flattened_obkv_buffer) = flattened_obkv_buffer {
del_add_from_two_obkvs(&old_flattened, &new_flattened, flattened_obkv_buffer)?; // take the non-flattened version if flatten_from_fields_ids_map returns None.
let mut fields_ids_map = settings_diff.new.fields_ids_map.clone();
let flattened = Self::flatten_from_fields_ids_map(&obkv, &mut fields_ids_map)?;
let flattened = flattened.as_deref().map_or(obkv, KvReader::new);
flattened_obkv_buffer.clear();
into_del_add_obkv(
flattened,
DelAddOperation::DeletionAndAddition,
flattened_obkv_buffer,
)?;
}
Ok(()) Ok(())
} }
@ -924,6 +957,8 @@ impl<'a, 'i> Transform<'a, 'i> {
None None
}; };
if original_sorter.is_some() || flattened_sorter.is_some() {
let modified_faceted_fields = settings_diff.modified_faceted_fields();
let mut original_obkv_buffer = Vec::new(); let mut original_obkv_buffer = Vec::new();
let mut flattened_obkv_buffer = Vec::new(); let mut flattened_obkv_buffer = Vec::new();
let mut document_sorter_key_buffer = Vec::new(); let mut document_sorter_key_buffer = Vec::new();
@ -936,20 +971,22 @@ impl<'a, 'i> Transform<'a, 'i> {
Self::rebind_existing_document( Self::rebind_existing_document(
old_obkv, old_obkv,
&settings_diff, &settings_diff,
&mut original_obkv_buffer, &modified_faceted_fields,
&mut flattened_obkv_buffer, Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()),
Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()),
)?; )?;
if let Some(original_sorter) = original_sorter.as_mut() {
document_sorter_key_buffer.clear(); document_sorter_key_buffer.clear();
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
if let Some(original_sorter) = original_sorter.as_mut() {
original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?;
} }
if let Some(flattened_sorter) = flattened_sorter.as_mut() { if let Some(flattened_sorter) = flattened_sorter.as_mut() {
flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?;
} }
} }
}
let grenad_params = GrenadParameters { let grenad_params = GrenadParameters {
chunk_compression_type: self.indexer_settings.chunk_compression_type, chunk_compression_type: self.indexer_settings.chunk_compression_type,

View File

@ -1067,10 +1067,17 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
let embedding_configs_updated = self.update_embedding_configs()?; let embedding_configs_updated = self.update_embedding_configs()?;
let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
new_inner_settings.recompute_facets(self.wtxn, self.index)?;
let primary_key_id = self
.index
.primary_key(self.wtxn)?
.and_then(|name| new_inner_settings.fields_ids_map.id(name));
let inner_settings_diff = InnerIndexSettingsDiff { let inner_settings_diff = InnerIndexSettingsDiff {
old: old_inner_settings, old: old_inner_settings,
new: new_inner_settings, new: new_inner_settings,
primary_key_id,
embedding_configs_updated, embedding_configs_updated,
settings_update_only: true, settings_update_only: true,
}; };
@ -1086,10 +1093,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
pub struct InnerIndexSettingsDiff { pub struct InnerIndexSettingsDiff {
pub(crate) old: InnerIndexSettings, pub(crate) old: InnerIndexSettings,
pub(crate) new: InnerIndexSettings, pub(crate) new: InnerIndexSettings,
pub(crate) primary_key_id: Option<FieldId>,
// TODO: compare directly the embedders. // TODO: compare directly the embedders.
pub(crate) embedding_configs_updated: bool, pub(crate) embedding_configs_updated: bool,
pub(crate) settings_update_only: bool, pub(crate) settings_update_only: bool,
} }
@ -1127,15 +1133,7 @@ impl InnerIndexSettingsDiff {
return true; return true;
} }
let faceted_updated = (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields)
(existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields);
self.old
.fields_ids_map
.iter()
.zip(self.new.fields_ids_map.iter())
.any(|(old, new)| old != new)
|| faceted_updated
} }
pub fn reindex_vectors(&self) -> bool { pub fn reindex_vectors(&self) -> bool {
@ -1145,6 +1143,10 @@ impl InnerIndexSettingsDiff {
pub fn settings_update_only(&self) -> bool { pub fn settings_update_only(&self) -> bool {
self.settings_update_only self.settings_update_only
} }
pub fn modified_faceted_fields(&self) -> HashSet<String> {
&self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields
}
} }
#[derive(Clone)] #[derive(Clone)]