mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 12:54:26 +01:00
Merge #4845
4845: Fix perf regression facet strings r=ManyTheFish a=dureuill Benchmarks between v1.9 and v1.10 show a performance regression of about x2 (+3dB regression) for most indexing workloads (+44s for hackernews). [Benchmark interpretation in the engine weekly meeting](https://www.notion.so/meilisearch/Engine-weekly-4d49560d374c4a87b4e3d126a261d4a0?pvs=4#98a709683276450295fcfe1f8ea5cef3). - Initial investigation pointed to #4819 as the origin of the regression. - Further investigation points towards the hypernormalization of each facet value in `extract_facet_string_docids` - Most of the slowdown is in `normalize_facet_strings`, and precisely in `detection.language()`. This PR improves the situation (-10s compared with `main` for hackernews, so only +34s regression compared with `v1.9`) by skipping normalization when it can be skipped. I'm not sure how to fix the root cause though. Should we skip facet locale normalization for now? Cc `@ManyTheFish` --- Tentative resolution options: 1. remove locale normalization from facet. I'm not sure why this is required, I believe we weren't doing this before, so maybe we can stop doing that again. 2. don't do language detection when it can be helped: won't help with the regressions in benchmark, but maybe we can skip language detection when the locales contain only one language? 3. use a faster language detection library: `@Kerollmops` told me about https://github.com/quickwit-oss/whichlang which bolsters x10 to x100 throughput compared with whatlang. Should we consider replacing whatlang with whichlang? Now I understand whichlang supports fewer languages than whatlang, so I also suggest: 4. use whichlang when the list of locales is empty (autodetection), or when it only contains locales that whichlang can detect. If the list of locales contains locales that whichlang *cannot* detect, **then** use whatlang instead. --- > [!CAUTION] > this PR contains a commit that adds detailed spans, that were used to detect which part of `extract_facet_string_docids` was taking too much time. As this commit adds spans that are called too often and adds 7s overhead, it should be removed before landing. Co-authored-by: Louis Dureuil <louis@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
commit
ee62d9ce30
@ -1369,12 +1369,18 @@ pub fn perform_facet_search(
|
||||
None => TimeBudget::default(),
|
||||
};
|
||||
|
||||
// In the faceted search context, we want to use the intersection between the locales provided by the user
|
||||
// and the locales of the facet string.
|
||||
// If the facet string is not localized, we **ignore** the locales provided by the user because the facet data has no locale.
|
||||
// If the user does not provide locales, we use the locales of the facet string.
|
||||
let localized_attributes = index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||
let locales = locales.or_else(|| {
|
||||
localized_attributes
|
||||
let localized_attributes_locales =
|
||||
localized_attributes.into_iter().find(|attr| attr.match_str(&facet_name));
|
||||
let locales = localized_attributes_locales.map(|attr| {
|
||||
attr.locales
|
||||
.into_iter()
|
||||
.find(|attr| attr.match_str(&facet_name))
|
||||
.map(|attr| attr.locales)
|
||||
.filter(|locale| locales.as_ref().map_or(true, |locales| locales.contains(locale)))
|
||||
.collect()
|
||||
});
|
||||
|
||||
let (search, _, _, _) =
|
||||
|
@ -339,10 +339,18 @@ impl ValuesCollection {
|
||||
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
|
||||
let options = NormalizerOption { lossy: true, ..Default::default() };
|
||||
let mut detection = StrDetection::new(facet_string, locales);
|
||||
|
||||
// Detect the language of the facet string only if several locales are explicitly provided.
|
||||
let language = match locales {
|
||||
Some(&[language]) => Some(language),
|
||||
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let token = Token {
|
||||
lemma: std::borrow::Cow::Borrowed(facet_string),
|
||||
script: detection.script(),
|
||||
language: detection.language(),
|
||||
language,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
|
@ -12,6 +12,7 @@ use heed::BytesEncode;
|
||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
|
||||
use crate::localized_attributes_rules::LocalizedFieldIds;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
|
||||
@ -28,6 +29,116 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
if settings_diff.settings_update_only() {
|
||||
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
|
||||
} else {
|
||||
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
|
||||
extract_facet_string_docids_document_update(
|
||||
docid_fid_facet_string,
|
||||
indexer,
|
||||
localized_field_ids,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
localized_field_ids: &LocalizedFieldIds,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_btreeset_string,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let locales = localized_field_ids.locales(field_id);
|
||||
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// as the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
obkv.insert(deladd_key, val)?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
|
||||
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
|
||||
}
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
@ -60,6 +171,15 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
|
||||
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
|
||||
|
||||
let are_same_locales = old_locales == new_locales;
|
||||
|
||||
if is_same_value && are_same_locales {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
@ -68,15 +188,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
|
||||
let old_hyper_normalized_value = normalize_facet_string(normalized_value, locales);
|
||||
let locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
|
||||
let new_hyper_normalized_value = normalize_facet_string(normalized_value, locales);
|
||||
let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
|
||||
let new_hyper_normalized_value = if are_same_locales {
|
||||
&old_hyper_normalized_value
|
||||
} else {
|
||||
&normalize_facet_string(normalized_value, new_locales)
|
||||
};
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// if the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
if old_hyper_normalized_value == new_hyper_normalized_value {
|
||||
if old_hyper_normalized_value == new_hyper_normalized_value.as_str() {
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
@ -148,12 +270,21 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
|
||||
/// Normalizes the facet string and truncates it to the max length.
|
||||
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
|
||||
let options = NormalizerOption { lossy: true, ..Default::default() };
|
||||
let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
|
||||
let mut detection = StrDetection::new(facet_string, locales);
|
||||
|
||||
let script = detection.script();
|
||||
// Detect the language of the facet string only if several locales are explicitly provided.
|
||||
let language = match locales {
|
||||
Some(&[language]) => Some(language),
|
||||
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let token = Token {
|
||||
lemma: std::borrow::Cow::Borrowed(facet_string),
|
||||
script: detection.script(),
|
||||
language: detection.language(),
|
||||
script,
|
||||
language,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user