mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 12:54:26 +01:00
Merge #4680
4680: Speedup additional searchables r=Kerollmops a=ManyTheFish Fixes #4492. ## To Do - [x] Do not call the `InnerSettingsDiff::only_additional_fields` function too many times Co-authored-by: Clément Renault <clement@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
commit
40b2345394
@ -40,11 +40,26 @@ pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
|
||||
operation: DelAddOperation,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
into_del_add_obkv_conditional_operation(reader, buffer, |_| operation)
|
||||
}
|
||||
|
||||
/// Akin to the [into_del_add_obkv] function but lets you
|
||||
/// conditionally define the `DelAdd` variant based on the obkv key.
|
||||
pub fn into_del_add_obkv_conditional_operation<K, F>(
|
||||
reader: obkv::KvReader<K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
operation: F,
|
||||
) -> std::io::Result<()>
|
||||
where
|
||||
K: obkv::Key + PartialOrd,
|
||||
F: Fn(K) -> DelAddOperation,
|
||||
{
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
for (key, value) in reader.iter() {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let operation = operation(key);
|
||||
if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) {
|
||||
value_writer.insert(DelAdd::Deletion, value)?;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
@ -9,7 +9,7 @@ use std::result::Result as StdResult;
|
||||
use bytemuck::bytes_of;
|
||||
use grenad::Sorter;
|
||||
use heed::BytesEncode;
|
||||
use itertools::EitherOrBoth;
|
||||
use itertools::{merge_join_by, EitherOrBoth};
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{from_slice, Value};
|
||||
@ -18,7 +18,7 @@ use FilterableValues::{Empty, Null, Values};
|
||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
@ -75,149 +75,181 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
let mut numbers_key_buffer = Vec::new();
|
||||
let mut strings_key_buffer = Vec::new();
|
||||
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
let old_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.old.faceted_fields_ids.iter().copied().collect();
|
||||
let new_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.new.faceted_fields_ids.iter().copied().collect();
|
||||
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id);
|
||||
let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id);
|
||||
if delete_faceted || add_faceted {
|
||||
numbers_key_buffer.clear();
|
||||
strings_key_buffer.clear();
|
||||
if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
let get_document_json_value = move |field_id, side| {
|
||||
obkv.get(field_id)
|
||||
.map(KvReaderDelAdd::new)
|
||||
.and_then(|kv| kv.get(side))
|
||||
.map(from_slice)
|
||||
.transpose()
|
||||
.map_err(InternalError::SerdeJson)
|
||||
};
|
||||
// iterate over the faceted fields instead of over the whole document.
|
||||
for eob in
|
||||
merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| {
|
||||
old.cmp(new)
|
||||
})
|
||||
{
|
||||
let (field_id, del_value, add_value) = match eob {
|
||||
EitherOrBoth::Left(&field_id) => {
|
||||
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
|
||||
|
||||
// Set key to the field_id
|
||||
// Note: this encoding is consistent with FieldIdCodec
|
||||
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
// deletion only
|
||||
(field_id, del_value, None)
|
||||
}
|
||||
EitherOrBoth::Right(&field_id) => {
|
||||
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
|
||||
|
||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||
let document = DocumentId::from_be_bytes(document);
|
||||
// addition only
|
||||
(field_id, None, add_value)
|
||||
}
|
||||
EitherOrBoth::Both(&field_id, _) => {
|
||||
// during settings update, recompute the changing settings only.
|
||||
if settings_diff.settings_update_only {
|
||||
continue;
|
||||
}
|
||||
|
||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||
numbers_key_buffer.extend_from_slice(docid_bytes);
|
||||
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
|
||||
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
|
||||
|
||||
let del_add_obkv = obkv::KvReader::new(field_bytes);
|
||||
let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted)
|
||||
{
|
||||
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||
None => None,
|
||||
};
|
||||
let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) {
|
||||
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||
None => None,
|
||||
(field_id, del_value, add_value)
|
||||
}
|
||||
};
|
||||
|
||||
// We insert the document id on the Del and the Add side if the field exists.
|
||||
let (ref mut del_exists, ref mut add_exists) =
|
||||
facet_exists_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_null, ref mut add_is_null) =
|
||||
facet_is_null_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_empty, ref mut add_is_empty) =
|
||||
facet_is_empty_docids.entry(field_id).or_default();
|
||||
if del_value.is_some() || add_value.is_some() {
|
||||
numbers_key_buffer.clear();
|
||||
strings_key_buffer.clear();
|
||||
|
||||
if del_value.is_some() {
|
||||
del_exists.insert(document);
|
||||
}
|
||||
if add_value.is_some() {
|
||||
add_exists.insert(document);
|
||||
}
|
||||
// Set key to the field_id
|
||||
// Note: this encoding is consistent with FieldIdCodec
|
||||
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
|
||||
let del_geo_support = settings_diff
|
||||
.old
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let add_geo_support = settings_diff
|
||||
.new
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let del_filterable_values =
|
||||
del_value.map(|value| extract_facet_values(&value, del_geo_support));
|
||||
let add_filterable_values =
|
||||
add_value.map(|value| extract_facet_values(&value, add_geo_support));
|
||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||
let document = DocumentId::from_be_bytes(document);
|
||||
|
||||
// Those closures are just here to simplify things a bit.
|
||||
let mut insert_numbers_diff = |del_numbers, add_numbers| {
|
||||
insert_numbers_diff(
|
||||
&mut fid_docid_facet_numbers_sorter,
|
||||
&mut numbers_key_buffer,
|
||||
del_numbers,
|
||||
add_numbers,
|
||||
)
|
||||
};
|
||||
let mut insert_strings_diff = |del_strings, add_strings| {
|
||||
insert_strings_diff(
|
||||
&mut fid_docid_facet_strings_sorter,
|
||||
&mut strings_key_buffer,
|
||||
del_strings,
|
||||
add_strings,
|
||||
)
|
||||
};
|
||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||
numbers_key_buffer.extend_from_slice(docid_bytes);
|
||||
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(None, None) => (),
|
||||
(Some(del_filterable_values), None) => match del_filterable_values {
|
||||
Null => {
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
},
|
||||
(None, Some(add_filterable_values)) => match add_filterable_values {
|
||||
Null => {
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
}
|
||||
},
|
||||
(Some(del_filterable_values), Some(add_filterable_values)) => {
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(Null, Null) | (Empty, Empty) => (),
|
||||
(Null, Empty) => {
|
||||
del_is_null.insert(document);
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
(Empty, Null) => {
|
||||
del_is_empty.insert(document);
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
(Null, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
// We insert the document id on the Del and the Add side if the field exists.
|
||||
let (ref mut del_exists, ref mut add_exists) =
|
||||
facet_exists_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_null, ref mut add_is_null) =
|
||||
facet_is_null_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_empty, ref mut add_is_empty) =
|
||||
facet_is_empty_docids.entry(field_id).or_default();
|
||||
|
||||
if del_value.is_some() {
|
||||
del_exists.insert(document);
|
||||
}
|
||||
if add_value.is_some() {
|
||||
add_exists.insert(document);
|
||||
}
|
||||
|
||||
let del_geo_support = settings_diff
|
||||
.old
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let add_geo_support = settings_diff
|
||||
.new
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let del_filterable_values =
|
||||
del_value.map(|value| extract_facet_values(&value, del_geo_support));
|
||||
let add_filterable_values =
|
||||
add_value.map(|value| extract_facet_values(&value, add_geo_support));
|
||||
|
||||
// Those closures are just here to simplify things a bit.
|
||||
let mut insert_numbers_diff = |del_numbers, add_numbers| {
|
||||
insert_numbers_diff(
|
||||
&mut fid_docid_facet_numbers_sorter,
|
||||
&mut numbers_key_buffer,
|
||||
del_numbers,
|
||||
add_numbers,
|
||||
)
|
||||
};
|
||||
let mut insert_strings_diff = |del_strings, add_strings| {
|
||||
insert_strings_diff(
|
||||
&mut fid_docid_facet_strings_sorter,
|
||||
&mut strings_key_buffer,
|
||||
del_strings,
|
||||
add_strings,
|
||||
)
|
||||
};
|
||||
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(None, None) => (),
|
||||
(Some(del_filterable_values), None) => match del_filterable_values {
|
||||
Null => {
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
(Empty, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
Empty => {
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
(Values { numbers, strings }, Null) => {
|
||||
add_is_null.insert(document);
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(Values { numbers, strings }, Empty) => {
|
||||
add_is_empty.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
},
|
||||
(None, Some(add_filterable_values)) => match add_filterable_values {
|
||||
Null => {
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
(
|
||||
Values { numbers: del_numbers, strings: del_strings },
|
||||
Values { numbers: add_numbers, strings: add_strings },
|
||||
) => {
|
||||
insert_numbers_diff(del_numbers, add_numbers)?;
|
||||
insert_strings_diff(del_strings, add_strings)?;
|
||||
Empty => {
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
}
|
||||
},
|
||||
(Some(del_filterable_values), Some(add_filterable_values)) => {
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(Null, Null) | (Empty, Empty) => (),
|
||||
(Null, Empty) => {
|
||||
del_is_null.insert(document);
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
(Empty, Null) => {
|
||||
del_is_empty.insert(document);
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
(Null, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
(Empty, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
(Values { numbers, strings }, Null) => {
|
||||
add_is_null.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(Values { numbers, strings }, Empty) => {
|
||||
add_is_empty.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(
|
||||
Values { numbers: del_numbers, strings: del_strings },
|
||||
Values { numbers: add_numbers, strings: add_strings },
|
||||
) => {
|
||||
insert_numbers_diff(del_numbers, add_numbers)?;
|
||||
insert_strings_diff(del_strings, add_strings)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -26,11 +26,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||
|
||||
// early return if the data shouldn't be deleted nor created.
|
||||
if !any_deletion && !any_addition {
|
||||
if settings_diff.settings_update_only && !settings_diff.reindex_proximities() {
|
||||
let writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
@ -39,8 +36,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
return writer_into_reader(writer);
|
||||
}
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
||||
.map(|_| {
|
||||
create_sorter(
|
||||
|
@ -369,6 +369,7 @@ where
|
||||
|
||||
// Run extraction pipeline in parallel.
|
||||
pool.install(|| {
|
||||
let settings_diff_cloned = settings_diff.clone();
|
||||
rayon::spawn(move || {
|
||||
let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks");
|
||||
let _enter = child_span.enter();
|
||||
@ -398,7 +399,7 @@ where
|
||||
pool_params,
|
||||
lmdb_writer_sx.clone(),
|
||||
primary_key_id,
|
||||
settings_diff.clone(),
|
||||
settings_diff_cloned,
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
});
|
||||
@ -425,7 +426,7 @@ where
|
||||
Err(status) => {
|
||||
if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
|
||||
let (docids, is_merged_database) =
|
||||
write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?;
|
||||
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?;
|
||||
if !docids.is_empty() {
|
||||
final_documents_ids |= docids;
|
||||
let documents_seen_count = final_documents_ids.len();
|
||||
|
@ -20,7 +20,10 @@ use super::{IndexDocumentsMethod, IndexerConfig};
|
||||
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::index::{db_name, main_key};
|
||||
use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
|
||||
use crate::update::del_add::{
|
||||
into_del_add_obkv, into_del_add_obkv_conditional_operation, DelAdd, DelAddOperation,
|
||||
KvReaderDelAdd,
|
||||
};
|
||||
use crate::update::index_documents::GrenadParameters;
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
@ -805,13 +808,15 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
let mut new_inner_settings = old_inner_settings.clone();
|
||||
new_inner_settings.fields_ids_map = fields_ids_map;
|
||||
|
||||
let settings_diff = InnerIndexSettingsDiff {
|
||||
old: old_inner_settings,
|
||||
new: new_inner_settings,
|
||||
let embedding_configs_updated = false;
|
||||
let settings_update_only = false;
|
||||
let settings_diff = InnerIndexSettingsDiff::new(
|
||||
old_inner_settings,
|
||||
new_inner_settings,
|
||||
primary_key_id,
|
||||
embedding_configs_updated: false,
|
||||
settings_update_only: false,
|
||||
};
|
||||
embedding_configs_updated,
|
||||
settings_update_only,
|
||||
);
|
||||
|
||||
Ok(TransformOutput {
|
||||
primary_key,
|
||||
@ -840,14 +845,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
// Always keep the primary key.
|
||||
let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) };
|
||||
|
||||
// If only the `searchableAttributes` has been changed, keep only the searchable fields.
|
||||
let must_reindex_searchables = settings_diff.reindex_searchable();
|
||||
let necessary_searchable_field = |id: FieldId| -> bool {
|
||||
must_reindex_searchables
|
||||
&& (settings_diff.old.searchable_fields_ids.contains(&id)
|
||||
|| settings_diff.new.searchable_fields_ids.contains(&id))
|
||||
};
|
||||
|
||||
// If only a faceted field has been added, keep only this field.
|
||||
let must_reindex_facets = settings_diff.reindex_facets();
|
||||
let necessary_faceted_field = |id: FieldId| -> bool {
|
||||
@ -862,13 +859,16 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
// we need the fields for the prompt/templating.
|
||||
let reindex_vectors = settings_diff.reindex_vectors();
|
||||
|
||||
// The operations that we must perform on the different fields.
|
||||
let mut operations = HashMap::new();
|
||||
|
||||
let mut obkv_writer = KvWriter::<_, FieldId>::memory();
|
||||
for (id, val) in old_obkv.iter() {
|
||||
if is_primary_key(id)
|
||||
|| necessary_searchable_field(id)
|
||||
|| necessary_faceted_field(id)
|
||||
|| reindex_vectors
|
||||
{
|
||||
if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors {
|
||||
operations.insert(id, DelAddOperation::DeletionAndAddition);
|
||||
obkv_writer.insert(id, val)?;
|
||||
} else if let Some(operation) = settings_diff.reindex_searchable_id(id) {
|
||||
operations.insert(id, operation);
|
||||
obkv_writer.insert(id, val)?;
|
||||
}
|
||||
}
|
||||
@ -887,11 +887,9 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
let flattened = flattened.as_deref().map_or(obkv, KvReader::new);
|
||||
|
||||
flattened_obkv_buffer.clear();
|
||||
into_del_add_obkv(
|
||||
flattened,
|
||||
DelAddOperation::DeletionAndAddition,
|
||||
flattened_obkv_buffer,
|
||||
)?;
|
||||
into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| {
|
||||
operations.get(&id).copied().unwrap_or(DelAddOperation::DeletionAndAddition)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -901,6 +899,11 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
/// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument.
|
||||
///
|
||||
// TODO this can be done in parallel by using the rayon `ThreadPool`.
|
||||
#[tracing::instrument(
|
||||
level = "trace"
|
||||
skip(self, wtxn, settings_diff),
|
||||
target = "indexing::documents"
|
||||
)]
|
||||
pub fn prepare_for_documents_reindexing(
|
||||
self,
|
||||
wtxn: &mut heed::RwTxn<'i>,
|
||||
|
@ -7,7 +7,7 @@ use bytemuck::allocation::pod_collect_to_vec;
|
||||
use charabia::{Language, Script};
|
||||
use grenad::{Merger, MergerBuilder};
|
||||
use heed::types::Bytes;
|
||||
use heed::RwTxn;
|
||||
use heed::{BytesDecode, RwTxn};
|
||||
use obkv::{KvReader, KvWriter};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
@ -20,13 +20,16 @@ use super::MergeFn;
|
||||
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||
use crate::facet::FacetType;
|
||||
use crate::index::db_name::DOCUMENTS;
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
use crate::update::index_documents::helpers::{
|
||||
as_cloneable_grenad, keep_latest_obkv, try_split_array_at,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{
|
||||
lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError,
|
||||
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
|
||||
Result, SerializationError, U8StrStrCodec,
|
||||
};
|
||||
|
||||
/// This struct accumulates and group the TypedChunks
|
||||
@ -122,9 +125,10 @@ impl TypedChunk {
|
||||
/// Return new documents seen.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
pub(crate) fn write_typed_chunk_into_index(
|
||||
typed_chunks: Vec<TypedChunk>,
|
||||
index: &Index,
|
||||
wtxn: &mut RwTxn,
|
||||
index: &Index,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
typed_chunks: Vec<TypedChunk>,
|
||||
) -> Result<(RoaringBitmap, bool)> {
|
||||
let mut is_merged_database = false;
|
||||
match typed_chunks[0] {
|
||||
@ -485,13 +489,22 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
if settings_diff.only_additional_fields.is_some() {
|
||||
write_proximity_entries_into_database_additional_searchables(
|
||||
merger,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
)?;
|
||||
} else {
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
}
|
||||
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetNumbers(_) => {
|
||||
@ -830,3 +843,51 @@ where
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Akin to the `write_entries_into_database` function but specialized
|
||||
/// for the case when we only index additional searchable fields only.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn write_proximity_entries_into_database_additional_searchables<R>(
|
||||
merger: Merger<R, MergeFn>,
|
||||
database: &heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
wtxn: &mut RwTxn,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
{
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
let (proximity_to_insert, word1, word2) =
|
||||
U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?;
|
||||
let data_to_insert = match KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||
Some(value) => {
|
||||
CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
|
||||
}
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let mut data_to_remove = RoaringBitmap::new();
|
||||
for prox in 1..(MAX_DISTANCE as u8) {
|
||||
let key = (prox, word1, word2);
|
||||
let database_value = database.get(wtxn, &key)?.unwrap_or_default();
|
||||
let value = if prox == proximity_to_insert {
|
||||
// Proximity that should be changed.
|
||||
// Union values and remove lower proximity data
|
||||
(&database_value | &data_to_insert) - &data_to_remove
|
||||
} else {
|
||||
// Remove lower proximity data
|
||||
&database_value - &data_to_remove
|
||||
};
|
||||
|
||||
// add the current data in data_to_remove for the next proximities
|
||||
data_to_remove |= &value;
|
||||
|
||||
if database_value != value {
|
||||
database.put(wtxn, &key, &value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
@ -9,6 +9,7 @@ use itertools::{EitherOrBoth, Itertools};
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use super::del_add::DelAddOperation;
|
||||
use super::index_documents::{IndexDocumentsConfig, Transform};
|
||||
use super::IndexerConfig;
|
||||
use crate::criterion::Criterion;
|
||||
@ -1072,13 +1073,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
.index
|
||||
.primary_key(self.wtxn)?
|
||||
.and_then(|name| new_inner_settings.fields_ids_map.id(name));
|
||||
let inner_settings_diff = InnerIndexSettingsDiff {
|
||||
old: old_inner_settings,
|
||||
new: new_inner_settings,
|
||||
let settings_update_only = true;
|
||||
let inner_settings_diff = InnerIndexSettingsDiff::new(
|
||||
old_inner_settings,
|
||||
new_inner_settings,
|
||||
primary_key_id,
|
||||
embedding_configs_updated,
|
||||
settings_update_only: true,
|
||||
};
|
||||
settings_update_only,
|
||||
);
|
||||
|
||||
if inner_settings_diff.any_reindexing_needed() {
|
||||
self.reindex(&progress_callback, &should_abort, inner_settings_diff)?;
|
||||
@ -1095,21 +1097,104 @@ pub struct InnerIndexSettingsDiff {
|
||||
// TODO: compare directly the embedders.
|
||||
pub(crate) embedding_configs_updated: bool,
|
||||
pub(crate) settings_update_only: bool,
|
||||
/// The set of only the additional searchable fields.
|
||||
/// If any other searchable field has been modified, is set to None.
|
||||
pub(crate) only_additional_fields: Option<HashSet<String>>,
|
||||
|
||||
// Cache the check to see if all the stop_words, allowed_separators, dictionary,
|
||||
// exact_attributes, proximity_precision are different.
|
||||
pub(crate) cache_reindex_searchable_without_user_defined: bool,
|
||||
// Cache the check to see if the user_defined_searchables are different.
|
||||
pub(crate) cache_user_defined_searchables: bool,
|
||||
// Cache the check to see if the exact_attributes are different.
|
||||
pub(crate) cache_exact_attributes: bool,
|
||||
}
|
||||
|
||||
impl InnerIndexSettingsDiff {
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::settings")]
|
||||
pub(crate) fn new(
|
||||
old_settings: InnerIndexSettings,
|
||||
new_settings: InnerIndexSettings,
|
||||
primary_key_id: Option<FieldId>,
|
||||
embedding_configs_updated: bool,
|
||||
settings_update_only: bool,
|
||||
) -> Self {
|
||||
let only_additional_fields = match (
|
||||
&old_settings.user_defined_searchable_fields,
|
||||
&new_settings.user_defined_searchable_fields,
|
||||
) {
|
||||
(None, None) | (Some(_), None) | (None, Some(_)) => None, // None means *
|
||||
(Some(old), Some(new)) => {
|
||||
let old: HashSet<_> = old.iter().cloned().collect();
|
||||
let new: HashSet<_> = new.iter().cloned().collect();
|
||||
if old.difference(&new).next().is_none() {
|
||||
// if no field has been removed return only the additional ones
|
||||
Some(&new - &old).filter(|x| !x.is_empty())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let cache_reindex_searchable_without_user_defined = {
|
||||
old_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|
||||
!= new_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|
||||
|| old_settings.allowed_separators != new_settings.allowed_separators
|
||||
|| old_settings.dictionary != new_settings.dictionary
|
||||
|| old_settings.proximity_precision != new_settings.proximity_precision
|
||||
};
|
||||
|
||||
let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
|
||||
|
||||
let cache_user_defined_searchables = old_settings.user_defined_searchable_fields
|
||||
!= new_settings.user_defined_searchable_fields;
|
||||
|
||||
InnerIndexSettingsDiff {
|
||||
old: old_settings,
|
||||
new: new_settings,
|
||||
primary_key_id,
|
||||
embedding_configs_updated,
|
||||
settings_update_only,
|
||||
only_additional_fields,
|
||||
cache_reindex_searchable_without_user_defined,
|
||||
cache_user_defined_searchables,
|
||||
cache_exact_attributes,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn any_reindexing_needed(&self) -> bool {
|
||||
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
|
||||
}
|
||||
|
||||
pub fn reindex_searchable(&self) -> bool {
|
||||
self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|
||||
!= self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|
||||
|| self.old.allowed_separators != self.new.allowed_separators
|
||||
|| self.old.dictionary != self.new.dictionary
|
||||
|| self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields
|
||||
|| self.old.exact_attributes != self.new.exact_attributes
|
||||
|| self.old.proximity_precision != self.new.proximity_precision
|
||||
self.cache_reindex_searchable_without_user_defined
|
||||
|| self.cache_exact_attributes
|
||||
|| self.cache_user_defined_searchables
|
||||
}
|
||||
|
||||
pub fn reindex_proximities(&self) -> bool {
|
||||
// if any searchable settings force the reindexing
|
||||
(self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables)
|
||||
// and if any settings needs the proximity database created
|
||||
&& (self.old.proximity_precision == ProximityPrecision::ByAttribute
|
||||
|| self.new.proximity_precision == ProximityPrecision::ByAttribute)
|
||||
}
|
||||
|
||||
pub fn reindex_searchable_id(&self, id: FieldId) -> Option<DelAddOperation> {
|
||||
if self.cache_reindex_searchable_without_user_defined || self.cache_exact_attributes {
|
||||
Some(DelAddOperation::DeletionAndAddition)
|
||||
} else if let Some(only_additional_fields) = &self.only_additional_fields {
|
||||
let additional_field = self.new.fields_ids_map.name(id).unwrap();
|
||||
if only_additional_fields.contains(additional_field) {
|
||||
Some(DelAddOperation::Addition)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else if self.cache_user_defined_searchables {
|
||||
Some(DelAddOperation::DeletionAndAddition)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reindex_facets(&self) -> bool {
|
||||
@ -1580,7 +1665,7 @@ mod tests {
|
||||
// When we search for something that is not in
|
||||
// the searchable fields it must not return any document.
|
||||
let result = index.search(&rtxn).query("23").execute().unwrap();
|
||||
assert!(result.documents_ids.is_empty());
|
||||
assert_eq!(result.documents_ids, Vec::<u32>::new());
|
||||
|
||||
// When we search for something that is in the searchable fields
|
||||
// we must find the appropriate document.
|
||||
|
Loading…
Reference in New Issue
Block a user