5091: Settings opt out r=Kerollmops a=ManyTheFish

# Pull Request

Related PRD: https://www.notion.so/meilisearch/API-usage-Settings-to-opt-out-indexing-features-fff4b06b651f8108ade3f858aeb16b14?pvs=4

## Related issue
Fixes #4979 

- [x] Add setting opt-out
- [x] Add analytics
- [x] Add tests


Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Many the fish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-11-26 15:50:28 +00:00 committed by GitHub
commit d0b2c0a523
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
36 changed files with 1032 additions and 114 deletions

View file

@ -34,10 +34,12 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
} else {
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
let facet_search = settings_diff.new.facet_search;
extract_facet_string_docids_document_update(
docid_fid_facet_string,
indexer,
localized_field_ids,
facet_search,
)
}
}
@ -51,6 +53,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
localized_field_ids: &LocalizedFieldIds,
facet_search: bool,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
let max_memory = indexer.max_memory_by_thread();
@ -96,7 +99,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
let normalized_value = str::from_utf8(normalized_value_bytes)?;
// Facet search normalization
{
if facet_search {
let locales = localized_field_ids.locales(field_id);
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
@ -179,8 +182,10 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
let are_same_locales = old_locales == new_locales;
let reindex_facet_search =
settings_diff.new.facet_search && !settings_diff.old.facet_search;
if is_same_value && are_same_locales {
if is_same_value && are_same_locales && !reindex_facet_search {
continue;
}
@ -191,18 +196,26 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
let normalized_value = str::from_utf8(normalized_value_bytes)?;
// Facet search normalization
{
let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
let new_hyper_normalized_value = if are_same_locales {
&old_hyper_normalized_value
if settings_diff.new.facet_search {
let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales);
let old_hyper_normalized_value;
let old_hyper_normalized_value = if !settings_diff.old.facet_search
|| deladd_reader.get(DelAdd::Deletion).is_none()
{
// if the facet search is disabled in the old settings or if no facet string is deleted,
// we don't need to normalize the facet string.
None
} else if are_same_locales {
Some(&new_hyper_normalized_value)
} else {
&normalize_facet_string(normalized_value, new_locales)
old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
Some(&old_hyper_normalized_value)
};
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
// if the facet string is the same, we can put the deletion and addition in the same obkv.
if old_hyper_normalized_value == new_hyper_normalized_value.as_str() {
if old_hyper_normalized_value == Some(&new_hyper_normalized_value) {
// nothing to do if we delete and re-add the value.
if is_same_value {
continue;
@ -222,7 +235,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
} else {
// if the facet string is different, we need to insert the deletion and addition in different obkv because the related key is different.
// deletion
if deladd_reader.get(DelAdd::Deletion).is_some() {
if let Some(old_hyper_normalized_value) = old_hyper_normalized_value {
// insert old value
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
buffer.clear();

View file

@ -80,7 +80,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
let new_faceted_fids: BTreeSet<_> =
settings_diff.new.faceted_fields_ids.iter().copied().collect();
if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
if !settings_diff.settings_update_only || settings_diff.reindex_facets() {
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
let obkv = obkv::KvReader::from_slice(value);
@ -112,8 +112,10 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
(field_id, None, add_value)
}
EitherOrBoth::Both(&field_id, _) => {
// during settings update, recompute the changing settings only.
if settings_diff.settings_update_only {
// during settings update, recompute the changing settings only unless a global change is detected.
if settings_diff.settings_update_only
&& !settings_diff.global_facet_settings_changed()
{
continue;
}

View file

@ -29,6 +29,7 @@ pub use self::transform::{Transform, TransformOutput};
use super::new::StdResult;
use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError};
use crate::index::{PrefixSearch, PrefixSettings};
use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
@ -82,8 +83,6 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
#[derive(Default, Debug, Clone)]
pub struct IndexDocumentsConfig {
pub words_prefix_threshold: Option<u32>,
pub max_prefix_length: Option<usize>,
pub words_positions_level_group_size: Option<NonZeroU32>,
pub words_positions_min_level_size: Option<NonZeroU32>,
pub update_method: IndexDocumentsMethod,
@ -565,14 +564,32 @@ where
self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?;
// Run the words prefixes update operation.
let mut builder = WordsPrefixesFst::new(self.wtxn, self.index);
if let Some(value) = self.config.words_prefix_threshold {
builder.threshold(value);
let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } =
self.index.prefix_settings(self.wtxn)?;
// If the prefix search is enabled at indexing time, we compute the prefixes.
if compute_prefixes == PrefixSearch::IndexingTime {
let mut builder = WordsPrefixesFst::new(self.wtxn, self.index);
builder.threshold(prefix_count_threshold);
builder.max_prefix_length(max_prefix_length);
builder.execute()?;
} else {
// If the prefix search is disabled at indexing time, we delete the previous words prefixes fst
// and all the associated docids databases.
self.index.delete_words_prefixes_fst(self.wtxn)?;
self.index.word_prefix_docids.clear(self.wtxn)?;
self.index.exact_word_prefix_docids.clear(self.wtxn)?;
self.index.word_prefix_position_docids.clear(self.wtxn)?;
self.index.word_prefix_fid_docids.clear(self.wtxn)?;
databases_seen += 3;
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
return Ok(());
}
if let Some(value) = self.config.max_prefix_length {
builder.max_prefix_length(value);
}
builder.execute()?;
if (self.should_abort)() {
return Err(Error::InternalError(InternalError::AbortedIndexation));

View file

@ -667,14 +667,23 @@ impl<'a, 'i> Transform<'a, 'i> {
let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) };
// If only a faceted field has been added, keep only this field.
let must_reindex_facets = settings_diff.reindex_facets();
let necessary_faceted_field = |id: FieldId| -> bool {
let field_name = settings_diff.new.fields_ids_map.name(id).unwrap();
must_reindex_facets
&& modified_faceted_fields
.iter()
.any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long))
};
let global_facet_settings_changed = settings_diff.global_facet_settings_changed();
let facet_fids_changed = settings_diff.facet_fids_changed();
let necessary_faceted_field =
|id: FieldId| -> bool {
let field_name = settings_diff.new.fields_ids_map.name(id).unwrap();
if global_facet_settings_changed {
settings_diff.new.user_defined_faceted_fields.iter().any(|long| {
is_faceted_by(long, field_name) || is_faceted_by(field_name, long)
})
} else if facet_fids_changed {
modified_faceted_fields.iter().any(|long| {
is_faceted_by(long, field_name) || is_faceted_by(field_name, long)
})
} else {
false
}
};
// Always provide all fields when vectors are involved because
// we need the fields for the prompt/templating.