mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-25 22:34:28 +01:00
Merge #4864
4864: Don't remove facet value when multiple original values map to the same normalized value r=ManyTheFish a=dureuill

# Pull Request

## Related issue
Fixes #4860

> [!WARNING]
> This PR contains a fix to the immediate issue, but it looks like the underlying data model is faulty: there is only one possible "original" value for each normalized value in a facet of a document, while because of array values (or manually written nested fields, if you're evil), it is technically possible to have multiple, distinct original values mapping to the same normalized value.

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
commit
07c8ed0459
@ -9,7 +9,7 @@ use std::result::Result as StdResult;
|
|||||||
use bytemuck::bytes_of;
|
use bytemuck::bytes_of;
|
||||||
use grenad::Sorter;
|
use grenad::Sorter;
|
||||||
use heed::BytesEncode;
|
use heed::BytesEncode;
|
||||||
use itertools::{merge_join_by, EitherOrBoth};
|
use itertools::{merge_join_by, EitherOrBoth, Itertools};
|
||||||
use ordered_float::OrderedFloat;
|
use ordered_float::OrderedFloat;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::{from_slice, Value};
|
use serde_json::{from_slice, Value};
|
||||||
@ -317,11 +317,15 @@ fn deladd_obkv_cbo_roaring_bitmaps(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Truncates a string to the biggest valid LMDB key size.
|
/// Truncates a string to the biggest valid LMDB key size.
|
||||||
fn truncate_string(s: String) -> String {
|
fn truncate_str(s: &str) -> &str {
|
||||||
s.char_indices()
|
let index = s
|
||||||
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
.char_indices()
|
||||||
.map(|(_, c)| c)
|
.map(|(idx, _)| idx)
|
||||||
.collect()
|
.chain(std::iter::once(s.len()))
|
||||||
|
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
|
||||||
|
.last();
|
||||||
|
|
||||||
|
&s[..index.unwrap_or(0)]
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Computes the diff between both Del and Add numbers and
|
/// Computes the diff between both Del and Add numbers and
|
||||||
@ -401,36 +405,102 @@ where
|
|||||||
del_strings.dedup();
|
del_strings.dedup();
|
||||||
add_strings.dedup();
|
add_strings.dedup();
|
||||||
|
|
||||||
|
let del_strings = del_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||||
|
let add_strings = add_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||||
|
|
||||||
let merged_strings_iter = itertools::merge_join_by(
|
let merged_strings_iter = itertools::merge_join_by(
|
||||||
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||||
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||||
|del, add| del.cmp(add),
|
|(normalized_del, _), (normalized_add, _)| normalized_del.cmp(normalized_add),
|
||||||
);
|
);
|
||||||
|
|
||||||
// insert normalized and original facet string in sorter
|
// insert normalized and original facet string in sorter
|
||||||
for eob in merged_strings_iter {
|
for eob in merged_strings_iter {
|
||||||
key_buffer.truncate(TRUNCATE_SIZE);
|
key_buffer.truncate(TRUNCATE_SIZE);
|
||||||
match eob {
|
let (side, normalized, original) = match eob {
|
||||||
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
EitherOrBoth::Both((normalized, del), (_, add)) => {
|
||||||
EitherOrBoth::Left((normalized, original)) => {
|
let merged_strings_iter =
|
||||||
let truncated = truncate_string(normalized);
|
itertools::merge_join_by(del, add, |(_, original_del), (_, original_add)| {
|
||||||
|
original_del.cmp(original_add)
|
||||||
|
});
|
||||||
|
|
||||||
|
// FIXME: we're in a bit of a pickle here, because we're only saving **one** original value per side,
|
||||||
|
// but we possibly have multiple original values that changed in the case where the field is an
|
||||||
|
// array of multiple values that normalize to the same value.
|
||||||
|
// (e.g. "foo" = ["bar", "Bar", "bAr", "baR"]. I'm not judging why you would do that ¯\_(ツ)_/¯)
|
||||||
|
//
|
||||||
|
// We'll work best effort by ignoring when the same value appears in both sides, deleting the first
|
||||||
|
// value that is only in the old version, and adding the first value that is only in the new version
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
let mut del = None;
|
||||||
|
let mut add = None;
|
||||||
|
let mut both = None;
|
||||||
|
|
||||||
|
for eob in merged_strings_iter {
|
||||||
|
match eob {
|
||||||
|
EitherOrBoth::Both((_normalized, original), _) => {
|
||||||
|
both = match both {
|
||||||
|
Some(both) => Some(both),
|
||||||
|
None => Some(original),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EitherOrBoth::Left((_normalized, original)) => {
|
||||||
|
del = match del {
|
||||||
|
Some(del) => Some(del),
|
||||||
|
None => Some(original),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right((_normalized, original)) => {
|
||||||
|
add = match add {
|
||||||
|
Some(add) => Some(add),
|
||||||
|
None => Some(original),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(del) = del {
|
||||||
|
obkv.insert(DelAdd::Deletion, del)?;
|
||||||
|
}
|
||||||
|
if let Some(add) = add
|
||||||
|
// prefer the newly added, but if there is none, keep a value in the list of values
|
||||||
|
// since the normalized value appears both in old and new, we should never remove it.
|
||||||
|
.or(both)
|
||||||
|
{
|
||||||
|
obkv.insert(DelAdd::Addition, add)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let truncated = truncate_str(normalized);
|
||||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||||
|
|
||||||
let mut obkv = KvWriterDelAdd::memory();
|
|
||||||
obkv.insert(DelAdd::Deletion, original)?;
|
|
||||||
let bytes = obkv.into_inner()?;
|
let bytes = obkv.into_inner()?;
|
||||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
EitherOrBoth::Right((normalized, original)) => {
|
EitherOrBoth::Left((_normalized, mut original)) => {
|
||||||
let truncated = truncate_string(normalized);
|
// FIXME: we only consider the first value for the purpose of facet search
|
||||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||||
|
let Some((normalized, original)) = original.next() else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
(DelAdd::Deletion, normalized, original)
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right((_normalized, mut original)) => {
|
||||||
|
// FIXME: we only consider the first value for the purpose of facet search
|
||||||
|
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||||
|
let Some((normalized, original)) = original.next() else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
(DelAdd::Addition, normalized, original)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let truncated = truncate_str(normalized);
|
||||||
|
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||||
|
|
||||||
let mut obkv = KvWriterDelAdd::memory();
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
obkv.insert(DelAdd::Addition, original)?;
|
obkv.insert(side, original)?;
|
||||||
let bytes = obkv.into_inner()?;
|
let bytes = obkv.into_inner()?;
|
||||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
Loading…
Reference in New Issue
Block a user