finish work

This commit is contained in:
ManyTheFish 2024-04-03 11:19:45 +02:00
parent b5e4a55af6
commit 02c3d6b265
8 changed files with 171 additions and 127 deletions

View File

@ -34,6 +34,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let max_positions_per_attributes = max_positions_per_attributes
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
let max_memory = indexer.max_memory_by_thread();
let force_reindexing = settings_diff.reindex_searchable();
// initialize destination values.
let mut documents_ids = RoaringBitmap::new();
@ -54,12 +55,15 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let mut value_buffer = Vec::new();
// initialize tokenizer.
// TODO: Fix ugly allocation
/// TODO: Fix ugly allocation
let old_stop_words = settings_diff.old.stop_words.as_ref();
let old_separators: Option<Vec<_>> =
settings_diff.old.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
let old_separators: Option<Vec<_>> = settings_diff
.old
.allowed_separators
.as_ref()
.map(|s| s.iter().map(String::as_str).collect());
let old_dictionary: Option<Vec<_>> =
settings_diff.old.dictionary.map(|s| s.iter().map(String::as_str).collect());
settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let mut del_builder = tokenizer_builder(
old_stop_words,
old_separators.as_deref(),
@ -68,12 +72,15 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
);
let del_tokenizer = del_builder.build();
// TODO: Fix ugly allocation
/// TODO: Fix ugly allocation
let new_stop_words = settings_diff.new.stop_words.as_ref();
let new_separators: Option<Vec<_>> =
settings_diff.new.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
let new_separators: Option<Vec<_>> = settings_diff
.new
.allowed_separators
.as_ref()
.map(|s| s.iter().map(String::as_str).collect());
let new_dictionary: Option<Vec<_>> =
settings_diff.new.dictionary.map(|s| s.iter().map(String::as_str).collect());
settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let mut add_builder = tokenizer_builder(
new_stop_words,
new_separators.as_deref(),
@ -92,10 +99,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let obkv = KvReader::<FieldId>::new(value);
// if the searchable fields didn't change, skip the searchable indexing for this document.
if !searchable_fields_changed(
&KvReader::<FieldId>::new(value),
&settings_diff.new.searchable_fields_ids,
) {
if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
continue;
}
@ -180,8 +184,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
/// Check if any searchable fields of a document changed.
fn searchable_fields_changed(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<Vec<FieldId>>,
settings_diff: &InnerIndexSettingsDiff,
) -> bool {
let searchable_fields = &settings_diff.new.searchable_fields_ids;
for (field_id, field_bytes) in obkv.iter() {
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
let del_add = KvReaderDelAdd::new(field_bytes);
@ -262,12 +267,14 @@ fn lang_safe_tokens_from_document<'a>(
// then we don't rerun the extraction.
if !script_language.is_empty() {
// build a new temporary tokenizer including the allow list.
// TODO: Fix ugly allocation
/// TODO: Fix ugly allocation
let stop_words = settings.stop_words.as_ref();
let separators: Option<Vec<_>> =
settings.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
let separators: Option<Vec<_>> = settings
.allowed_separators
.as_ref()
.map(|s| s.iter().map(String::as_str).collect());
let dictionary: Option<Vec<_>> =
settings.dictionary.map(|s| s.iter().map(String::as_str).collect());
settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let mut builder =
tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
let tokenizer = builder.build();

View File

@ -17,8 +17,9 @@ use crate::error::UserError;
use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::Embedder;
use crate::{DocumentId, FieldsIdsMap, InternalError, Result, VectorOrArrayOfVectors};
use crate::{DocumentId, InternalError, Result, VectorOrArrayOfVectors};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@ -71,12 +72,15 @@ impl VectorStateDelta {
pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
field_id_map: &FieldsIdsMap,
settings_diff: &InnerIndexSettingsDiff,
prompt: &Prompt,
embedder_name: &str,
) -> Result<ExtractedVectorPoints> {
puffin::profile_function!();
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
// (docid, _index) -> KvWriterDelAdd -> Vector
let mut manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
@ -98,8 +102,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
tempfile::tempfile()?,
);
let vectors_fid = field_id_map.id("_vectors");
let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
@ -116,15 +118,29 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
// lazily get it when needed
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
let vectors_field = vectors_fid
// the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id("_vectors");
// filter out the old vector fid if the settings have been changed, forcing a reindexing.
let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
let new_vectors_fid = new_fields_ids_map.id("_vectors");
let vectors_field = {
let del = old_vectors_fid
.and_then(|vectors_fid| obkv.get(vectors_fid))
.map(KvReaderDelAdd::new)
.map(|obkv| to_vector_maps(obkv, document_id))
.transpose()?;
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
.transpose()?
.flatten();
let add = new_vectors_fid
.and_then(|vectors_fid| obkv.get(vectors_fid))
.map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
.transpose()?
.flatten();
(del, add)
};
let (del_map, add_map) = vectors_field.unzip();
let del_map = del_map.flatten();
let add_map = add_map.flatten();
let (del_map, add_map) = vectors_field;
let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
@ -155,7 +171,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
VectorStateDelta::NowGenerated(prompt.render(
obkv,
DelAdd::Addition,
field_id_map,
&new_fields_ids_map,
)?)
} else {
VectorStateDelta::NowRemoved
@ -182,9 +198,10 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt =
prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default();
let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?;
let old_prompt = prompt
.render(obkv, DelAdd::Deletion, &old_fields_ids_map)
.unwrap_or_default();
let new_prompt = prompt.render(obkv, DelAdd::Addition, &new_fields_ids_map)?;
if old_prompt != new_prompt {
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
@ -220,15 +237,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
})
}
/// Reads both sides of a del/add `_vectors` entry.
///
/// Calls `to_vector_map` once for the `DelAdd::Deletion` side and once for the
/// `DelAdd::Addition` side of `obkv`, in that order, and returns the two results
/// as a `(deletion, addition)` pair. The first error encountered is returned
/// as-is; the addition side is not read if the deletion side fails.
///
/// `document_id` is only borrowed and handed to `to_vector_map` for each side.
fn to_vector_maps(
    obkv: KvReaderDelAdd,
    document_id: impl Fn() -> Value,
) -> Result<(Option<serde_json::Map<String, Value>>, Option<serde_json::Map<String, Value>>)> {
    let id_getter = &document_id;
    // Tuple operands are evaluated left to right, so the deletion side is still read first.
    Ok((
        to_vector_map(obkv, DelAdd::Deletion, id_getter)?,
        to_vector_map(obkv, DelAdd::Addition, id_getter)?,
    ))
}
fn to_vector_map(
obkv: KvReaderDelAdd,
side: DelAdd,

View File

@ -121,16 +121,16 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
let (w, fid) = StrBEU16Codec::bytes_decode(key)
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
if let Some(word) = word {
if word.as_str() != w {
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
if let Some(current) = word.as_ref() {
if current != w {
docids_into_writers(&current, &deletions, &additions, &mut word_docids_writer)?;
docids_into_writers(
&word,
&current,
&exact_deletions,
&exact_additions,
&mut exact_word_docids_writer,
);
let word = Some(w.to_string());
)?;
word = Some(w.to_string());
// clear buffers
deletions.clear();
additions.clear();
@ -138,7 +138,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
exact_additions.clear();
}
} else {
let word = Some(w.to_string());
word = Some(w.to_string());
}
// merge all deletions
@ -169,13 +169,13 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
}
if let Some(word) = word {
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer)?;
docids_into_writers(
&word,
&exact_deletions,
&exact_additions,
&mut exact_word_docids_writer,
);
)?;
}
Ok((
@ -253,7 +253,7 @@ where
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
})?,
);
)?;
}
// additions:
if !additions.is_empty() {
@ -262,7 +262,7 @@ where
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
})?,
);
)?;
}
// insert everything in the same writer.

View File

@ -11,7 +11,7 @@ use super::helpers::{
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::proximity::{index_proximity, MAX_DISTANCE};
use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{DocumentId, Result};
@ -24,9 +24,20 @@ use crate::{DocumentId, Result};
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
// early return if the data shouldn't be deleted nor created.
if !any_deletion && !any_addition {
return tempfile::tempfile()
.map_err(Into::into)
.map(BufReader::new)
.and_then(grenad::Reader::new)
.map_err(Into::into);
}
let max_memory = indexer.max_memory_by_thread();
@ -79,6 +90,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
let (del, add): (Result<_>, Result<_>) = rayon::join(
|| {
if !any_deletion {
return Ok(());
}
// deletions
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
for (position, word) in KvReaderU16::new(deletion).iter() {
@ -108,6 +123,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
Ok(())
},
|| {
if !any_addition {
return Ok(());
}
// additions
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
for (position, word) in KvReaderU16::new(addition).iter() {

View File

@ -9,7 +9,6 @@ mod extract_word_docids;
mod extract_word_pair_proximity_docids;
mod extract_word_position_docids;
use std::collections::HashSet;
use std::fs::File;
use std::io::BufReader;
@ -30,7 +29,6 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::proximity::ProximityPrecision;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result};
@ -200,12 +198,14 @@ fn run_extraction_task<FE, FS, M>(
M: Send,
{
let current_span = tracing::Span::current();
/// TODO: remove clone
let settings_diff = settings_diff.clone();
rayon::spawn(move || {
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
let _entered = child_span.enter();
puffin::profile_scope!("extract_multiple_chunks", name);
match extract_fn(chunk, indexer, settings_diff) {
match extract_fn(chunk, indexer, &settings_diff) {
Ok(chunk) => {
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
}
@ -235,12 +235,15 @@ fn send_original_documents_data(
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
/// TODO: remove clone
let settings_diff = settings_diff.clone();
rayon::spawn(move || {
for (name, (embedder, prompt)) in embedders {
for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
let result = extract_vector_points(
documents_chunk_cloned.clone(),
indexer,
&field_id_map,
&settings_diff,
&prompt,
&name,
);
@ -279,6 +282,7 @@ fn send_original_documents_data(
}
}
});
}
// TODO: create a custom internal error
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));

View File

@ -6,7 +6,6 @@ mod typed_chunk;
use std::collections::{HashMap, HashSet};
use std::io::{Read, Seek};
use std::iter::FromIterator;
use std::num::NonZeroU32;
use std::result::Result as StdResult;
@ -281,7 +280,7 @@ where
let TransformOutput {
primary_key,
settings_diff,
mut settings_diff,
field_distribution,
documents_count,
original_documents,
@ -319,13 +318,8 @@ where
) = crossbeam_channel::unbounded();
// get the primary key field id
let primary_key_id = output.settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
// get searchable fields for word databases
let searchable_fields =
self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
// get filterable fields for facet databases
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
// get the fid of the `_geo.lat` and `_geo.lng` fields.
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
@ -348,12 +342,6 @@ where
None => None,
};
let stop_words = self.index.stop_words(self.wtxn)?;
let separators = self.index.allowed_separators(self.wtxn)?;
let dictionary = self.index.dictionary(self.wtxn)?;
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
let pool_params = GrenadParameters {
chunk_compression_type: self.indexer_config.chunk_compression_type,
chunk_compression_level: self.indexer_config.chunk_compression_level,

View File

@ -1,12 +1,11 @@
use std::borrow::Cow;
use std::collections::btree_map::Entry as BEntry;
use std::collections::hash_map::Entry as HEntry;
use std::collections::{HashMap, HashSet};
use std::collections::HashMap;
use std::fs::File;
use std::io::{Read, Seek};
use fxhash::FxHashMap;
use heed::RoTxn;
use itertools::Itertools;
use obkv::{KvReader, KvReaderU16, KvWriter};
use roaring::RoaringBitmap;
@ -814,7 +813,8 @@ impl<'a, 'i> Transform<'a, 'i> {
let settings_diff = InnerIndexSettingsDiff {
old: old_inner_settings,
new: new_inner_settings,
embedding_configs_updated: true,
embedding_configs_updated: false,
settings_update_only: false,
};
Ok(TransformOutput {
@ -844,13 +844,16 @@ impl<'a, 'i> Transform<'a, 'i> {
obkv_writer.insert(id, val)?;
}
}
let new_obkv = KvReader::<FieldId>::new(&obkv_writer.into_inner()?);
let data = obkv_writer.into_inner()?;
let new_obkv = KvReader::<FieldId>::new(&data);
// take the non-flattened version if flatten_from_fields_ids_map returns None.
let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?
.map_or_else(|| old_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?
.map_or_else(|| new_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?;
let old_flattened =
old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::<FieldId>::new);
let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?;
let new_flattened =
new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::<FieldId>::new);
original_obkv_buffer.clear();
flattened_obkv_buffer.clear();

View File

@ -1010,6 +1010,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
}
Setting::NotSet => false,
};
// if any change forces a reindexing,
// clear the vector database.
if update {
self.index.vector_arroy.clear(self.wtxn)?;
}
Ok(update)
}
@ -1077,6 +1084,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
old: old_inner_settings,
new: new_inner_settings,
embedding_configs_updated,
settings_update_only: true,
};
if inner_settings_diff.any_reindexing_needed() {
@ -1087,20 +1095,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
}
}
pub(crate) struct InnerIndexSettingsDiff {
pub old: InnerIndexSettings,
pub new: InnerIndexSettings,
#[derive(Clone)]
pub struct InnerIndexSettingsDiff {
pub(crate) old: InnerIndexSettings,
pub(crate) new: InnerIndexSettings,
// TODO: compare directly the embedders.
pub embedding_configs_updated: bool,
pub(crate) embedding_configs_updated: bool,
pub(crate) settings_update_only: bool,
}
impl InnerIndexSettingsDiff {
fn any_reindexing_needed(&self) -> bool {
pub fn any_reindexing_needed(&self) -> bool {
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
}
fn reindex_searchable(&self) -> bool {
pub fn reindex_searchable(&self) -> bool {
self.old
.fields_ids_map
.iter()
@ -1115,13 +1126,13 @@ impl InnerIndexSettingsDiff {
|| self.old.proximity_precision != self.new.proximity_precision
}
fn reindex_facets(&self) -> bool {
let existing_fields = self.new.existing_fields;
pub fn reindex_facets(&self) -> bool {
let existing_fields = &self.new.existing_fields;
if existing_fields.iter().any(|field| field.contains('.')) {
return true;
}
let old_faceted_fields = self.old.user_defined_faceted_fields;
let old_faceted_fields = &self.old.user_defined_faceted_fields;
if old_faceted_fields.iter().any(|field| field.contains('.')) {
return true;
}
@ -1129,13 +1140,13 @@ impl InnerIndexSettingsDiff {
// If there are new faceted fields we indicate that we must reindex as we must
// index new fields as facets. It means that the distinct attribute,
// an Asc/Desc criterion or a filtered attribute has been added or removed.
let new_faceted_fields = self.new.user_defined_faceted_fields;
let new_faceted_fields = &self.new.user_defined_faceted_fields;
if new_faceted_fields.iter().any(|field| field.contains('.')) {
return true;
}
let faceted_updated =
(&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields);
(existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields);
self.old
.fields_ids_map
@ -1145,9 +1156,13 @@ impl InnerIndexSettingsDiff {
|| faceted_updated
}
fn reindex_vectors(&self) -> bool {
pub fn reindex_vectors(&self) -> bool {
self.embedding_configs_updated
}
/// Returns the value of the `settings_update_only` flag stored on this diff.
///
/// NOTE(review): in the visible changes the flag is set to `true` where the diff is
/// built in `Settings` and to `false` where it is built in `Transform` — confirm
/// there are no other construction sites.
pub fn settings_update_only(&self) -> bool {
self.settings_update_only
}
}
#[derive(Clone)]