mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 05:54:30 +01:00
Group by normalized values, and make sure you don't remove a value when there remains at least one value that normalizes to it
This commit is contained in:
parent
b44e17c4c3
commit
2f10273d14
@ -9,7 +9,7 @@ use std::result::Result as StdResult;
|
|||||||
use bytemuck::bytes_of;
|
use bytemuck::bytes_of;
|
||||||
use grenad::Sorter;
|
use grenad::Sorter;
|
||||||
use heed::BytesEncode;
|
use heed::BytesEncode;
|
||||||
use itertools::{merge_join_by, EitherOrBoth};
|
use itertools::{merge_join_by, EitherOrBoth, Itertools};
|
||||||
use ordered_float::OrderedFloat;
|
use ordered_float::OrderedFloat;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::{from_slice, Value};
|
use serde_json::{from_slice, Value};
|
||||||
@ -401,36 +401,102 @@ where
|
|||||||
del_strings.dedup();
|
del_strings.dedup();
|
||||||
add_strings.dedup();
|
add_strings.dedup();
|
||||||
|
|
||||||
|
let del_strings = del_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||||
|
let add_strings = add_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||||
|
|
||||||
let merged_strings_iter = itertools::merge_join_by(
|
let merged_strings_iter = itertools::merge_join_by(
|
||||||
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||||
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||||
|del, add| del.cmp(add),
|
|(normalized_del, _), (normalized_add, _)| normalized_del.cmp(normalized_add),
|
||||||
);
|
);
|
||||||
|
|
||||||
// insert normalized and original facet string in sorter
|
// insert normalized and original facet string in sorter
|
||||||
for eob in merged_strings_iter {
|
for eob in merged_strings_iter {
|
||||||
key_buffer.truncate(TRUNCATE_SIZE);
|
key_buffer.truncate(TRUNCATE_SIZE);
|
||||||
match eob {
|
let (side, normalized, original) = match eob {
|
||||||
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
EitherOrBoth::Both((normalized, del), (_, add)) => {
|
||||||
EitherOrBoth::Left((normalized, original)) => {
|
let merged_strings_iter =
|
||||||
let truncated = truncate_string(normalized);
|
itertools::merge_join_by(del, add, |(_, original_del), (_, original_add)| {
|
||||||
|
original_del.cmp(original_add)
|
||||||
|
});
|
||||||
|
|
||||||
|
// FIXME: we're in a bit of a pickle here, because we're only saving **one** original value per side,
|
||||||
|
// but we possibly have multiple original values that changed in the case where the field is an
|
||||||
|
// array of multiple values that normalize to the same value.
|
||||||
|
// (e.g. "foo" = ["bar", "Bar", "bAr", "baR"]. I'm not judging why you would do that ¯\_(ツ)_/¯)
|
||||||
|
//
|
||||||
|
// We'll work best effort by ignoring when the same value appears in both sides, deleting the first
|
||||||
|
// value that is only in the old version, and adding the first value that is only in the new version
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
let mut del = None;
|
||||||
|
let mut add = None;
|
||||||
|
let mut both = None;
|
||||||
|
|
||||||
|
for eob in merged_strings_iter {
|
||||||
|
match eob {
|
||||||
|
EitherOrBoth::Both((_normalized, original), _) => {
|
||||||
|
both = match both {
|
||||||
|
Some(both) => Some(both),
|
||||||
|
None => Some(original),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EitherOrBoth::Left((_normalized, original)) => {
|
||||||
|
del = match del {
|
||||||
|
Some(del) => Some(del),
|
||||||
|
None => Some(original),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right((_normalized, original)) => {
|
||||||
|
add = match add {
|
||||||
|
Some(add) => Some(add),
|
||||||
|
None => Some(original),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(del) = del {
|
||||||
|
obkv.insert(DelAdd::Deletion, del)?;
|
||||||
|
}
|
||||||
|
if let Some(add) = add
|
||||||
|
// prefer the newly added, but if there is none, keep a value in the list of values
|
||||||
|
// since the normalized value appears both in old and new, we should never remove it.
|
||||||
|
.or(both)
|
||||||
|
{
|
||||||
|
obkv.insert(DelAdd::Addition, add)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let truncated = truncate_string(normalized.clone());
|
||||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||||
|
|
||||||
let mut obkv = KvWriterDelAdd::memory();
|
|
||||||
obkv.insert(DelAdd::Deletion, original)?;
|
|
||||||
let bytes = obkv.into_inner()?;
|
let bytes = obkv.into_inner()?;
|
||||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
EitherOrBoth::Right((normalized, original)) => {
|
EitherOrBoth::Left((_normalized, mut original)) => {
|
||||||
let truncated = truncate_string(normalized);
|
// FIXME: we only consider the first value for the purpose of facet search
|
||||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||||
|
let Some((normalized, original)) = original.next() else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
(DelAdd::Deletion, normalized, original)
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right((_normalized, mut original)) => {
|
||||||
|
// FIXME: we only consider the first value for the purpose of facet search
|
||||||
|
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||||
|
let Some((normalized, original)) = original.next() else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
(DelAdd::Addition, normalized, original)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let truncated = truncate_string(normalized.clone());
|
||||||
|
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||||
|
|
||||||
let mut obkv = KvWriterDelAdd::memory();
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
obkv.insert(DelAdd::Addition, original)?;
|
obkv.insert(side, original)?;
|
||||||
let bytes = obkv.into_inner()?;
|
let bytes = obkv.into_inner()?;
|
||||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user