mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-10 21:44:34 +01:00
finish work
This commit is contained in:
parent
b5e4a55af6
commit
02c3d6b265
@ -34,6 +34,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let force_reindexing = settings_diff.reindex_searchable();
|
||||
|
||||
// initialize destination values.
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
@ -54,12 +55,15 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
// initialize tokenizer.
|
||||
// TODO: Fix ugly allocation
|
||||
/// TODO: Fix ugly allocation
|
||||
let old_stop_words = settings_diff.old.stop_words.as_ref();
|
||||
let old_separators: Option<Vec<_>> =
|
||||
settings_diff.old.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
|
||||
let old_separators: Option<Vec<_>> = settings_diff
|
||||
.old
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let old_dictionary: Option<Vec<_>> =
|
||||
settings_diff.old.dictionary.map(|s| s.iter().map(String::as_str).collect());
|
||||
settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut del_builder = tokenizer_builder(
|
||||
old_stop_words,
|
||||
old_separators.as_deref(),
|
||||
@ -68,12 +72,15 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
);
|
||||
let del_tokenizer = del_builder.build();
|
||||
|
||||
// TODO: Fix ugly allocation
|
||||
/// TODO: Fix ugly allocation
|
||||
let new_stop_words = settings_diff.new.stop_words.as_ref();
|
||||
let new_separators: Option<Vec<_>> =
|
||||
settings_diff.new.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
|
||||
let new_separators: Option<Vec<_>> = settings_diff
|
||||
.new
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let new_dictionary: Option<Vec<_>> =
|
||||
settings_diff.new.dictionary.map(|s| s.iter().map(String::as_str).collect());
|
||||
settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut add_builder = tokenizer_builder(
|
||||
new_stop_words,
|
||||
new_separators.as_deref(),
|
||||
@ -92,10 +99,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !searchable_fields_changed(
|
||||
&KvReader::<FieldId>::new(value),
|
||||
&settings_diff.new.searchable_fields_ids,
|
||||
) {
|
||||
if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -180,8 +184,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<Vec<FieldId>>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> bool {
|
||||
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||
let del_add = KvReaderDelAdd::new(field_bytes);
|
||||
@ -262,12 +267,14 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
// then we don't rerun the extraction.
|
||||
if !script_language.is_empty() {
|
||||
// build a new temporary tokenizer including the allow list.
|
||||
// TODO: Fix ugly allocation
|
||||
/// TODO: Fix ugly allocation
|
||||
let stop_words = settings.stop_words.as_ref();
|
||||
let separators: Option<Vec<_>> =
|
||||
settings.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
|
||||
let separators: Option<Vec<_>> = settings
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary: Option<Vec<_>> =
|
||||
settings.dictionary.map(|s| s.iter().map(String::as_str).collect());
|
||||
settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut builder =
|
||||
tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
|
||||
let tokenizer = builder.build();
|
||||
|
@ -17,8 +17,9 @@ use crate::error::UserError;
|
||||
use crate::prompt::Prompt;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::try_split_at;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::Embedder;
|
||||
use crate::{DocumentId, FieldsIdsMap, InternalError, Result, VectorOrArrayOfVectors};
|
||||
use crate::{DocumentId, InternalError, Result, VectorOrArrayOfVectors};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
@ -71,12 +72,15 @@ impl VectorStateDelta {
|
||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
field_id_map: &FieldsIdsMap,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
prompt: &Prompt,
|
||||
embedder_name: &str,
|
||||
) -> Result<ExtractedVectorPoints> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let mut manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
@ -98,8 +102,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let vectors_fid = field_id_map.id("_vectors");
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
@ -116,15 +118,29 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
let vectors_field = vectors_fid
|
||||
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(|obkv| to_vector_maps(obkv, document_id))
|
||||
.transpose()?;
|
||||
// the vector field id may have changed
|
||||
let old_vectors_fid = old_fields_ids_map.id("_vectors");
|
||||
// filter the old vector fid if the settings has been changed forcing reindexing.
|
||||
let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
|
||||
|
||||
let (del_map, add_map) = vectors_field.unzip();
|
||||
let del_map = del_map.flatten();
|
||||
let add_map = add_map.flatten();
|
||||
let new_vectors_fid = new_fields_ids_map.id("_vectors");
|
||||
let vectors_field = {
|
||||
let del = old_vectors_fid
|
||||
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
let add = new_vectors_fid
|
||||
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
(del, add)
|
||||
};
|
||||
|
||||
let (del_map, add_map) = vectors_field;
|
||||
|
||||
let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
|
||||
let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
|
||||
@ -155,7 +171,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
VectorStateDelta::NowGenerated(prompt.render(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
field_id_map,
|
||||
&new_fields_ids_map,
|
||||
)?)
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
@ -182,9 +198,10 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
|
||||
if document_is_kept {
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt =
|
||||
prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default();
|
||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?;
|
||||
let old_prompt = prompt
|
||||
.render(obkv, DelAdd::Deletion, &old_fields_ids_map)
|
||||
.unwrap_or_default();
|
||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, &new_fields_ids_map)?;
|
||||
if old_prompt != new_prompt {
|
||||
tracing::trace!(
|
||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||
@ -220,15 +237,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
})
|
||||
}
|
||||
|
||||
fn to_vector_maps(
|
||||
obkv: KvReaderDelAdd,
|
||||
document_id: impl Fn() -> Value,
|
||||
) -> Result<(Option<serde_json::Map<String, Value>>, Option<serde_json::Map<String, Value>>)> {
|
||||
let del = to_vector_map(obkv, DelAdd::Deletion, &document_id)?;
|
||||
let add = to_vector_map(obkv, DelAdd::Addition, &document_id)?;
|
||||
Ok((del, add))
|
||||
}
|
||||
|
||||
fn to_vector_map(
|
||||
obkv: KvReaderDelAdd,
|
||||
side: DelAdd,
|
||||
|
@ -121,16 +121,16 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
let (w, fid) = StrBEU16Codec::bytes_decode(key)
|
||||
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
|
||||
if let Some(word) = word {
|
||||
if word.as_str() != w {
|
||||
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
|
||||
if let Some(current) = word.as_ref() {
|
||||
if current != w {
|
||||
docids_into_writers(¤t, &deletions, &additions, &mut word_docids_writer)?;
|
||||
docids_into_writers(
|
||||
&word,
|
||||
¤t,
|
||||
&exact_deletions,
|
||||
&exact_additions,
|
||||
&mut exact_word_docids_writer,
|
||||
);
|
||||
let word = Some(w.to_string());
|
||||
)?;
|
||||
word = Some(w.to_string());
|
||||
// clear buffers
|
||||
deletions.clear();
|
||||
additions.clear();
|
||||
@ -138,7 +138,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
exact_additions.clear();
|
||||
}
|
||||
} else {
|
||||
let word = Some(w.to_string());
|
||||
word = Some(w.to_string());
|
||||
}
|
||||
|
||||
// merge all deletions
|
||||
@ -169,13 +169,13 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
}
|
||||
|
||||
if let Some(word) = word {
|
||||
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
|
||||
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer)?;
|
||||
docids_into_writers(
|
||||
&word,
|
||||
&exact_deletions,
|
||||
&exact_additions,
|
||||
&mut exact_word_docids_writer,
|
||||
);
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok((
|
||||
@ -253,7 +253,7 @@ where
|
||||
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
);
|
||||
)?;
|
||||
}
|
||||
// additions:
|
||||
if !additions.is_empty() {
|
||||
@ -262,7 +262,7 @@ where
|
||||
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
);
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert everything in the same writer.
|
||||
|
@ -11,7 +11,7 @@ use super::helpers::{
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||
use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{DocumentId, Result};
|
||||
@ -24,9 +24,20 @@ use crate::{DocumentId, Result};
|
||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||
|
||||
// early return if the data shouldn't be deleted nor created.
|
||||
if !any_deletion && !any_addition {
|
||||
return tempfile::tempfile()
|
||||
.map_err(Into::into)
|
||||
.map(BufReader::new)
|
||||
.and_then(grenad::Reader::new)
|
||||
.map_err(Into::into);
|
||||
}
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
@ -79,6 +90,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
if !any_deletion {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// deletions
|
||||
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
|
||||
for (position, word) in KvReaderU16::new(deletion).iter() {
|
||||
@ -108,6 +123,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
Ok(())
|
||||
},
|
||||
|| {
|
||||
if !any_addition {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// additions
|
||||
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||
for (position, word) in KvReaderU16::new(addition).iter() {
|
||||
|
@ -9,7 +9,6 @@ mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod extract_word_position_docids;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
@ -30,7 +29,6 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result};
|
||||
|
||||
@ -200,12 +198,14 @@ fn run_extraction_task<FE, FS, M>(
|
||||
M: Send,
|
||||
{
|
||||
let current_span = tracing::Span::current();
|
||||
/// TODO: remove clone
|
||||
let settings_diff = settings_diff.clone();
|
||||
|
||||
rayon::spawn(move || {
|
||||
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks");
|
||||
let _entered = child_span.enter();
|
||||
puffin::profile_scope!("extract_multiple_chunks", name);
|
||||
match extract_fn(chunk, indexer, settings_diff) {
|
||||
match extract_fn(chunk, indexer, &settings_diff) {
|
||||
Ok(chunk) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||
}
|
||||
@ -235,50 +235,54 @@ fn send_original_documents_data(
|
||||
.thread_name(|index| format!("embedding-request-{index}"))
|
||||
.build()?;
|
||||
|
||||
rayon::spawn(move || {
|
||||
for (name, (embedder, prompt)) in embedders {
|
||||
let result = extract_vector_points(
|
||||
documents_chunk_cloned.clone(),
|
||||
indexer,
|
||||
&field_id_map,
|
||||
&prompt,
|
||||
&name,
|
||||
);
|
||||
match result {
|
||||
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
|
||||
let embeddings = match extract_embeddings(
|
||||
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
|
||||
/// TODO: remove clone
|
||||
let settings_diff = settings_diff.clone();
|
||||
rayon::spawn(move || {
|
||||
for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
|
||||
let result = extract_vector_points(
|
||||
documents_chunk_cloned.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
&prompt,
|
||||
&name,
|
||||
);
|
||||
match result {
|
||||
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
|
||||
let embeddings = match extract_embeddings(
|
||||
prompts,
|
||||
indexer,
|
||||
embedder.clone(),
|
||||
&request_threads,
|
||||
) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
None
|
||||
}
|
||||
};
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
if !(remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
||||
{
|
||||
let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension: embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name: name,
|
||||
}));
|
||||
if !(remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
||||
{
|
||||
let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension: embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name: name,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
}
|
||||
}
|
||||
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: create a custom internal error
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
|
||||
|
@ -6,7 +6,6 @@ mod typed_chunk;
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::io::{Read, Seek};
|
||||
use std::iter::FromIterator;
|
||||
use std::num::NonZeroU32;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
@ -281,7 +280,7 @@ where
|
||||
|
||||
let TransformOutput {
|
||||
primary_key,
|
||||
settings_diff,
|
||||
mut settings_diff,
|
||||
field_distribution,
|
||||
documents_count,
|
||||
original_documents,
|
||||
@ -319,13 +318,8 @@ where
|
||||
) = crossbeam_channel::unbounded();
|
||||
|
||||
// get the primary key field id
|
||||
let primary_key_id = output.settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
|
||||
let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
|
||||
|
||||
// get searchable fields for word databases
|
||||
let searchable_fields =
|
||||
self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
|
||||
// get filterable fields for facet databases
|
||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
||||
// get the fid of the `_geo.lat` and `_geo.lng` fields.
|
||||
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
|
||||
@ -348,12 +342,6 @@ where
|
||||
None => None,
|
||||
};
|
||||
|
||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||
let separators = self.index.allowed_separators(self.wtxn)?;
|
||||
let dictionary = self.index.dictionary(self.wtxn)?;
|
||||
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
|
||||
let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
|
||||
|
||||
let pool_params = GrenadParameters {
|
||||
chunk_compression_type: self.indexer_config.chunk_compression_type,
|
||||
chunk_compression_level: self.indexer_config.chunk_compression_level,
|
||||
|
@ -1,12 +1,11 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::btree_map::Entry as BEntry;
|
||||
use std::collections::hash_map::Entry as HEntry;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek};
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
use heed::RoTxn;
|
||||
use itertools::Itertools;
|
||||
use obkv::{KvReader, KvReaderU16, KvWriter};
|
||||
use roaring::RoaringBitmap;
|
||||
@ -814,7 +813,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
let settings_diff = InnerIndexSettingsDiff {
|
||||
old: old_inner_settings,
|
||||
new: new_inner_settings,
|
||||
embedding_configs_updated: true,
|
||||
embedding_configs_updated: false,
|
||||
settings_update_only: false,
|
||||
};
|
||||
|
||||
Ok(TransformOutput {
|
||||
@ -844,13 +844,16 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
obkv_writer.insert(id, val)?;
|
||||
}
|
||||
}
|
||||
let new_obkv = KvReader::<FieldId>::new(&obkv_writer.into_inner()?);
|
||||
let data = obkv_writer.into_inner()?;
|
||||
let new_obkv = KvReader::<FieldId>::new(&data);
|
||||
|
||||
// take the non-flattened version if flatten_from_fields_ids_map returns None.
|
||||
let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?
|
||||
.map_or_else(|| old_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
|
||||
let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?
|
||||
.map_or_else(|| new_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
|
||||
let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?;
|
||||
let old_flattened =
|
||||
old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::<FieldId>::new);
|
||||
let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?;
|
||||
let new_flattened =
|
||||
new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::<FieldId>::new);
|
||||
|
||||
original_obkv_buffer.clear();
|
||||
flattened_obkv_buffer.clear();
|
||||
|
@ -1010,6 +1010,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
}
|
||||
Setting::NotSet => false,
|
||||
};
|
||||
|
||||
// if any changes force a reindexing
|
||||
// clear the vector database.
|
||||
if update {
|
||||
self.index.vector_arroy.clear(self.wtxn)?;
|
||||
}
|
||||
|
||||
Ok(update)
|
||||
}
|
||||
|
||||
@ -1077,6 +1084,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
old: old_inner_settings,
|
||||
new: new_inner_settings,
|
||||
embedding_configs_updated,
|
||||
settings_update_only: true,
|
||||
};
|
||||
|
||||
if inner_settings_diff.any_reindexing_needed() {
|
||||
@ -1087,20 +1095,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct InnerIndexSettingsDiff {
|
||||
pub old: InnerIndexSettings,
|
||||
pub new: InnerIndexSettings,
|
||||
#[derive(Clone)]
|
||||
pub struct InnerIndexSettingsDiff {
|
||||
pub(crate) old: InnerIndexSettings,
|
||||
pub(crate) new: InnerIndexSettings,
|
||||
|
||||
// TODO: compare directly the embedders.
|
||||
pub embedding_configs_updated: bool,
|
||||
pub(crate) embedding_configs_updated: bool,
|
||||
|
||||
pub(crate) settings_update_only: bool,
|
||||
}
|
||||
|
||||
impl InnerIndexSettingsDiff {
|
||||
fn any_reindexing_needed(&self) -> bool {
|
||||
pub fn any_reindexing_needed(&self) -> bool {
|
||||
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
|
||||
}
|
||||
|
||||
fn reindex_searchable(&self) -> bool {
|
||||
pub fn reindex_searchable(&self) -> bool {
|
||||
self.old
|
||||
.fields_ids_map
|
||||
.iter()
|
||||
@ -1115,13 +1126,13 @@ impl InnerIndexSettingsDiff {
|
||||
|| self.old.proximity_precision != self.new.proximity_precision
|
||||
}
|
||||
|
||||
fn reindex_facets(&self) -> bool {
|
||||
let existing_fields = self.new.existing_fields;
|
||||
pub fn reindex_facets(&self) -> bool {
|
||||
let existing_fields = &self.new.existing_fields;
|
||||
if existing_fields.iter().any(|field| field.contains('.')) {
|
||||
return true;
|
||||
}
|
||||
|
||||
let old_faceted_fields = self.old.user_defined_faceted_fields;
|
||||
let old_faceted_fields = &self.old.user_defined_faceted_fields;
|
||||
if old_faceted_fields.iter().any(|field| field.contains('.')) {
|
||||
return true;
|
||||
}
|
||||
@ -1129,13 +1140,13 @@ impl InnerIndexSettingsDiff {
|
||||
// If there is new faceted fields we indicate that we must reindex as we must
|
||||
// index new fields as facets. It means that the distinct attribute,
|
||||
// an Asc/Desc criterion or a filtered attribute as be added or removed.
|
||||
let new_faceted_fields = self.new.user_defined_faceted_fields;
|
||||
let new_faceted_fields = &self.new.user_defined_faceted_fields;
|
||||
if new_faceted_fields.iter().any(|field| field.contains('.')) {
|
||||
return true;
|
||||
}
|
||||
|
||||
let faceted_updated =
|
||||
(&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields);
|
||||
(existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields);
|
||||
|
||||
self.old
|
||||
.fields_ids_map
|
||||
@ -1145,9 +1156,13 @@ impl InnerIndexSettingsDiff {
|
||||
|| faceted_updated
|
||||
}
|
||||
|
||||
fn reindex_vectors(&self) -> bool {
|
||||
pub fn reindex_vectors(&self) -> bool {
|
||||
self.embedding_configs_updated
|
||||
}
|
||||
|
||||
pub fn settings_update_only(&self) -> bool {
|
||||
self.settings_update_only
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
|
Loading…
x
Reference in New Issue
Block a user