mirror of https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00

Merge #5187

5187: Bring back v1.12.0 of pre-release changes into `main` r=irevoire a=curquiza

Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Louis Dureuil <louis@meilisearch.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: Many the fish <many@meilisearch.com>

This commit is contained in commit d3491851bc.
168 changed files with 5778 additions and 2556 deletions
@@ -280,7 +280,7 @@ fn starts_with(selector: &str, key: &str) -> bool {
 pub fn validate_document_id_str(document_id: &str) -> Option<&str> {
     if document_id.is_empty()
-        || document_id.len() > 512
+        || document_id.len() >= 512
         || !document_id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
     {
         None
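In other words, the longest accepted document id is now 511 bytes, which the updated error message further down also reflects. A minimal check of the boundary (assuming the function returns `Some(id)` when all checks pass):

    // 511 bytes: accepted; 512 bytes: rejected by the `>= 512` check above.
    assert!(validate_document_id_str(&"a".repeat(511)).is_some());
    assert!(validate_document_id_str(&"a".repeat(512)).is_none());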
@@ -3,6 +3,7 @@ use std::convert::Infallible;
use std::fmt::Write;
use std::{io, str};

use bstr::BString;
use heed::{Error as HeedError, MdbError};
use rayon::ThreadPoolBuildError;
use rhai::EvalAltResult;
@@ -61,6 +62,10 @@ pub enum InternalError {
     Serialization(#[from] SerializationError),
     #[error(transparent)]
     Store(#[from] MdbError),
+    #[error("Cannot delete {key:?} from database {database_name}: {error}")]
+    StoreDeletion { database_name: &'static str, key: BString, error: heed::Error },
+    #[error("Cannot insert {key:?} and value with length {value_length} into database {database_name}: {error}")]
+    StorePut { database_name: &'static str, key: BString, value_length: usize, error: heed::Error },
     #[error(transparent)]
     Utf8(#[from] str::Utf8Error),
     #[error("An indexation process was explicitly aborted")]
@@ -109,7 +114,7 @@ pub enum UserError {
         "Document identifier `{}` is invalid. \
         A document identifier can be of type integer or string, \
         only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \
-        and can not be more than 512 bytes.", .document_id.to_string()
+        and can not be more than 511 bytes.", .document_id.to_string()
     )]
     InvalidDocumentId { document_id: Value },
     #[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))]
@@ -97,7 +97,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
 
     fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
         let mut v = vec![value.size];
-        CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v);
+        CboRoaringBitmapCodec::serialize_into_vec(&value.bitmap, &mut v);
         Ok(Cow::Owned(v))
     }
 }
@@ -27,18 +27,27 @@ impl CboRoaringBitmapCodec {
         }
     }
 
-    pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec<u8>) {
+    pub fn serialize_into_vec(roaring: &RoaringBitmap, vec: &mut Vec<u8>) {
+        Self::serialize_into_writer(roaring, vec).unwrap()
+    }
+
+    pub fn serialize_into_writer<W: io::Write>(
+        roaring: &RoaringBitmap,
+        mut writer: W,
+    ) -> io::Result<()> {
         if roaring.len() <= THRESHOLD as u64 {
             // If the number of items (u32s) to encode is less than or equal to the threshold
             // it means that it would weigh the same or less than the RoaringBitmap
             // header, so we directly encode them using ByteOrder instead.
             for integer in roaring {
-                vec.write_u32::<NativeEndian>(integer).unwrap();
+                writer.write_u32::<NativeEndian>(integer)?;
             }
         } else {
             // Otherwise, we use the classic RoaringBitmapCodec that writes a header.
-            roaring.serialize_into(vec).unwrap();
+            roaring.serialize_into(writer)?;
         }
+
+        Ok(())
     }
 
     pub fn deserialize_from(mut bytes: &[u8]) -> io::Result<RoaringBitmap> {
@@ -143,7 +152,7 @@ impl CboRoaringBitmapCodec {
             return Ok(None);
         }
 
-        Self::serialize_into(&previous, buffer);
+        Self::serialize_into_vec(&previous, buffer);
         Ok(Some(&buffer[..]))
     }
 }
@@ -169,7 +178,7 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
 
     fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
         let mut vec = Vec::with_capacity(Self::serialized_size(item));
-        Self::serialize_into(item, &mut vec);
+        Self::serialize_into_vec(item, &mut vec);
         Ok(Cow::Owned(vec))
     }
 }
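The rename splits the old infallible helper into a Vec-specific wrapper and a generic writer variant. A small usage sketch, assuming `CboRoaringBitmapCodec` and a `RoaringBitmap` are in scope (call sites are illustrative, names taken from the diff above):

    use roaring::RoaringBitmap;

    fn encode_examples(bitmap: &RoaringBitmap) -> std::io::Result<Vec<u8>> {
        // In-memory buffer: the Vec variant cannot fail, so it unwraps internally.
        let mut buffer = Vec::new();
        CboRoaringBitmapCodec::serialize_into_vec(bitmap, &mut buffer);

        // Arbitrary writer (file, socket, ...): errors are propagated with `?`.
        let mut out = Vec::new();
        CboRoaringBitmapCodec::serialize_into_writer(bitmap, &mut out)?;
        Ok(out)
    }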
@@ -70,6 +70,8 @@ pub mod main_key {
     pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
     pub const SEARCH_CUTOFF: &str = "search_cutoff";
     pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules";
+    pub const FACET_SEARCH: &str = "facet_search";
+    pub const PREFIX_SEARCH: &str = "prefix_search";
 }
 
 pub mod db_name {
@@ -1233,6 +1235,10 @@ impl Index {
         )
     }
 
+    pub(crate) fn delete_words_prefixes_fst(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
+        self.main.remap_key_type::<Str>().delete(wtxn, main_key::WORDS_PREFIXES_FST_KEY)
+    }
+
     /// Returns the FST which is the words prefixes dictionary of the engine.
     pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn<'t>) -> Result<fst::Set<Cow<'t, [u8]>>> {
         match self.main.remap_types::<Str, Bytes>().get(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? {
@@ -1562,6 +1568,41 @@ impl Index {
         self.main.remap_key_type::<Str>().delete(txn, main_key::PROXIMITY_PRECISION)
     }
 
+    pub fn prefix_search(&self, txn: &RoTxn<'_>) -> heed::Result<Option<PrefixSearch>> {
+        self.main.remap_types::<Str, SerdeBincode<PrefixSearch>>().get(txn, main_key::PREFIX_SEARCH)
+    }
+
+    pub(crate) fn put_prefix_search(
+        &self,
+        txn: &mut RwTxn<'_>,
+        val: PrefixSearch,
+    ) -> heed::Result<()> {
+        self.main.remap_types::<Str, SerdeBincode<PrefixSearch>>().put(
+            txn,
+            main_key::PREFIX_SEARCH,
+            &val,
+        )
+    }
+
+    pub(crate) fn delete_prefix_search(&self, txn: &mut RwTxn<'_>) -> heed::Result<bool> {
+        self.main.remap_key_type::<Str>().delete(txn, main_key::PREFIX_SEARCH)
+    }
+
+    pub fn facet_search(&self, txn: &RoTxn<'_>) -> heed::Result<bool> {
+        self.main
+            .remap_types::<Str, SerdeBincode<bool>>()
+            .get(txn, main_key::FACET_SEARCH)
+            .map(|v| v.unwrap_or(true))
+    }
+
+    pub(crate) fn put_facet_search(&self, txn: &mut RwTxn<'_>, val: bool) -> heed::Result<()> {
+        self.main.remap_types::<Str, SerdeBincode<bool>>().put(txn, main_key::FACET_SEARCH, &val)
+    }
+
+    pub(crate) fn delete_facet_search(&self, txn: &mut RwTxn<'_>) -> heed::Result<bool> {
+        self.main.remap_key_type::<Str>().delete(txn, main_key::FACET_SEARCH)
+    }
+
     pub fn localized_attributes_rules(
         &self,
         rtxn: &RoTxn<'_>,
@@ -1647,12 +1688,9 @@ impl Index {
         Ok(res)
     }
 
-    pub fn prefix_settings(&self, _rtxn: &RoTxn<'_>) -> Result<PrefixSettings> {
-        Ok(PrefixSettings {
-            compute_prefixes: true,
-            max_prefix_length: 4,
-            prefix_count_threshold: 100,
-        })
+    pub fn prefix_settings(&self, rtxn: &RoTxn<'_>) -> Result<PrefixSettings> {
+        let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default();
+        Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
     }
 }
@@ -1665,9 +1703,17 @@ pub struct IndexEmbeddingConfig {
 
 #[derive(Debug, Deserialize, Serialize)]
 pub struct PrefixSettings {
-    pub prefix_count_threshold: u64,
+    pub prefix_count_threshold: usize,
     pub max_prefix_length: usize,
-    pub compute_prefixes: bool,
+    pub compute_prefixes: PrefixSearch,
 }
 
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "camelCase")]
+pub enum PrefixSearch {
+    #[default]
+    IndexingTime,
+    Disabled,
+}
+
 #[derive(Serialize, Deserialize)]
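The new `PrefixSearch` enum replaces the boolean `compute_prefixes` flag: prefixes are either computed at indexing time (the default) or disabled. A small sketch of how the stored setting drives `prefix_settings`, assuming an open `Index` and the usual `wtxn`/`rtxn` transactions (illustrative only):

    // Illustrative: persist the setting, then read it back at indexing time.
    index.put_prefix_search(&mut wtxn, PrefixSearch::Disabled)?;

    // A missing key falls back to the #[default] variant, PrefixSearch::IndexingTime.
    let settings = index.prefix_settings(&rtxn)?;
    if settings.compute_prefixes == PrefixSearch::IndexingTime {
        // build the words-prefixes FST and the prefix docids databases
    } else {
        // delete the FST and clear the prefix docids databases (see the indexing hunk further down)
    }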
@@ -1688,6 +1734,7 @@ pub(crate) mod tests {
 
     use crate::error::{Error, InternalError};
     use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
+    use crate::progress::Progress;
     use crate::update::new::indexer;
     use crate::update::settings::InnerIndexSettings;
     use crate::update::{
@@ -1764,7 +1811,7 @@ pub(crate) mod tests {
                    None,
                    &mut new_fields_ids_map,
                    &|| false,
                    &|_progress| (),
                    Progress::default(),
                )?;

                if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
@@ -1775,6 +1822,7 @@ pub(crate) mod tests {
                indexer::index(
                    wtxn,
                    &self.inner,
                    &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
                    indexer_config.grenad_parameters(),
                    &db_fields_ids_map,
                    new_fields_ids_map,
@@ -1782,7 +1830,7 @@ pub(crate) mod tests {
                    &document_changes,
                    embedders,
                    &|| false,
                    &|_| (),
                    &Progress::default(),
                )
            })
            .unwrap()?;
@@ -1854,7 +1902,7 @@ pub(crate) mod tests {
                    None,
                    &mut new_fields_ids_map,
                    &|| false,
                    &|_progress| (),
                    Progress::default(),
                )?;

                if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
@@ -1865,6 +1913,7 @@ pub(crate) mod tests {
                indexer::index(
                    wtxn,
                    &self.inner,
                    &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
                    indexer_config.grenad_parameters(),
                    &db_fields_ids_map,
                    new_fields_ids_map,
@@ -1872,7 +1921,7 @@ pub(crate) mod tests {
                    &document_changes,
                    embedders,
                    &|| false,
                    &|_| (),
                    &Progress::default(),
                )
            })
            .unwrap()?;
@@ -1934,7 +1983,7 @@ pub(crate) mod tests {
                None,
                &mut new_fields_ids_map,
                &|| false,
                &|_progress| (),
                Progress::default(),
            )
            .unwrap();

@@ -1945,6 +1994,7 @@ pub(crate) mod tests {
            indexer::index(
                &mut wtxn,
                &index.inner,
                &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
                indexer_config.grenad_parameters(),
                &db_fields_ids_map,
                new_fields_ids_map,
@@ -1952,7 +2002,7 @@ pub(crate) mod tests {
                &document_changes,
                embedders,
                &|| should_abort.load(Relaxed),
                &|_| (),
                &Progress::default(),
            )
        })
        .unwrap()
@@ -1,6 +1,7 @@
#![cfg_attr(all(test, fuzzing), feature(no_coverage))]
#![allow(clippy::type_complexity)]

#[cfg(not(windows))]
#[cfg(test)]
#[global_allocator]
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
@@ -30,6 +31,7 @@ pub mod vector;
 #[macro_use]
 pub mod snapshot_tests;
 mod fieldids_weights_map;
+pub mod progress;
 
 use std::collections::{BTreeMap, HashMap};
 use std::convert::{TryFrom, TryInto};
crates/milli/src/progress.rs (new file, 152 lines)
@@ -0,0 +1,152 @@
use std::any::TypeId;
use std::borrow::Cow;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, RwLock};

use serde::Serialize;

pub trait Step: 'static + Send + Sync {
    fn name(&self) -> Cow<'static, str>;
    fn current(&self) -> u32;
    fn total(&self) -> u32;
}

#[derive(Clone, Default)]
pub struct Progress {
    steps: Arc<RwLock<Vec<(TypeId, Box<dyn Step>)>>>,
}

impl Progress {
    pub fn update_progress<P: Step>(&self, sub_progress: P) {
        let mut steps = self.steps.write().unwrap();
        let step_type = TypeId::of::<P>();
        if let Some(idx) = steps.iter().position(|(id, _)| *id == step_type) {
            steps.truncate(idx);
        }
        steps.push((step_type, Box::new(sub_progress)));
    }

    // TODO: This code should be in meilisearch_types but cannot because milli can't depend on meilisearch_types
    pub fn as_progress_view(&self) -> ProgressView {
        let steps = self.steps.read().unwrap();

        let mut percentage = 0.0;
        let mut prev_factors = 1.0;

        let mut step_view = Vec::with_capacity(steps.len());
        for (_, step) in steps.iter() {
            prev_factors *= step.total() as f32;
            percentage += step.current() as f32 / prev_factors;

            step_view.push(ProgressStepView {
                current_step: step.name(),
                finished: step.current(),
                total: step.total(),
            });
        }

        ProgressView { steps: step_view, percentage: percentage * 100.0 }
    }
}

/// This trait lets you use the AtomicSubStep defined right below.
/// The name must be a const that never changed but that can't be enforced by the type system because it make the trait non object-safe.
/// By forcing the Default trait + the &'static str we make it harder to miss-use the trait.
pub trait NamedStep: 'static + Send + Sync + Default {
    fn name(&self) -> &'static str;
}

/// Structure to quickly define steps that need very quick, lockless updating of their current step.
/// You can use this struct if:
/// - The name of the step doesn't change
/// - The total number of steps doesn't change
pub struct AtomicSubStep<Name: NamedStep> {
    unit_name: Name,
    current: Arc<AtomicU32>,
    total: u32,
}

impl<Name: NamedStep> AtomicSubStep<Name> {
    pub fn new(total: u32) -> (Arc<AtomicU32>, Self) {
        let current = Arc::new(AtomicU32::new(0));
        (current.clone(), Self { current, total, unit_name: Name::default() })
    }
}

impl<Name: NamedStep> Step for AtomicSubStep<Name> {
    fn name(&self) -> Cow<'static, str> {
        self.unit_name.name().into()
    }

    fn current(&self) -> u32 {
        self.current.load(Ordering::Relaxed)
    }

    fn total(&self) -> u32 {
        self.total
    }
}

#[macro_export]
macro_rules! make_enum_progress {
    ($visibility:vis enum $name:ident { $($variant:ident,)+ }) => {
        #[repr(u8)]
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)]
        #[allow(clippy::enum_variant_names)]
        $visibility enum $name {
            $($variant),+
        }

        impl Step for $name {
            fn name(&self) -> Cow<'static, str> {
                use convert_case::Casing;

                match self {
                    $(
                        $name::$variant => stringify!($variant).from_case(convert_case::Case::Camel).to_case(convert_case::Case::Lower).into()
                    ),+
                }
            }

            fn current(&self) -> u32 {
                *self as u32
            }

            fn total(&self) -> u32 {
                Self::CARDINALITY as u32
            }
        }
    };
}

#[macro_export]
macro_rules! make_atomic_progress {
    ($struct_name:ident alias $atomic_struct_name:ident => $step_name:literal) => {
        #[derive(Default, Debug, Clone, Copy)]
        pub struct $struct_name {}
        impl NamedStep for $struct_name {
            fn name(&self) -> &'static str {
                $step_name
            }
        }
        pub type $atomic_struct_name = AtomicSubStep<$struct_name>;
    };
}

make_atomic_progress!(Document alias AtomicDocumentStep => "document" );
make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" );

#[derive(Debug, Serialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct ProgressView {
    pub steps: Vec<ProgressStepView>,
    pub percentage: f32,
}

#[derive(Debug, Serialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct ProgressStepView {
    pub current_step: Cow<'static, str>,
    pub finished: u32,
    pub total: u32,
}
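To illustrate how this new progress module is meant to be used, here is a sketch based only on the APIs shown above. The enum name, its variants, and the step totals are made up for the example; the macro also requires `Sequence` (from the enum_iterator crate), `Step`, and `Cow` to be in scope at the call site:

    // Hypothetical steps enum defined with the macro from the new module.
    make_enum_progress! {
        pub enum IndexingStep {
            PreparingPayloads,
            WritingDocuments,
            FinalizingDatabases,
        }
    }

    fn report(progress: &Progress) {
        progress.update_progress(IndexingStep::WritingDocuments);

        // A lockless sub-step counter for per-document progress.
        let (counter, sub_step) = AtomicDocumentStep::new(1_000);
        progress.update_progress(sub_step);
        counter.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        // Serializable snapshot exposed to callers (e.g. the task queue).
        let view = progress.as_progress_view();
        println!("{}%", view.percentage);
    }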
@@ -3,12 +3,13 @@ use std::collections::BTreeMap;
 use std::fmt::{self, Debug};
 
 use bumpalo::Bump;
+use bumparaw_collections::{RawMap, RawVec, Value};
 use liquid::model::{
     ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State,
     Value as LiquidValue,
 };
 use liquid::{ObjectView, ValueView};
-use raw_collections::{RawMap, RawVec};
+use rustc_hash::FxBuildHasher;
 use serde_json::value::RawValue;
 
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
@@ -195,7 +196,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc
 }
 
 impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> {
-    fn as_debug(&self) -> &dyn fmt::Debug {
+    fn as_debug(&self) -> &dyn Debug {
         self
     }
     fn render(&self) -> liquid::model::DisplayCow<'_> {
@@ -243,14 +244,13 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc,
     }
 }
 
-#[derive(Debug)]
 struct ParseableValue<'doc> {
-    value: raw_collections::Value<'doc>,
+    value: Value<'doc, FxBuildHasher>,
 }
 
 impl<'doc> ParseableValue<'doc> {
     pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self {
-        let value = raw_collections::Value::from_raw_value(value, doc_alloc).unwrap();
+        let value = Value::from_raw_value_and_hasher(value, FxBuildHasher, doc_alloc).unwrap();
        Self { value }
     }
 
@@ -260,19 +260,19 @@ impl<'doc> ParseableValue<'doc> {
 }
 
 // transparent newtype for implementing ValueView
-#[repr(transparent)]
-#[derive(Debug)]
-struct ParseableMap<'doc>(RawMap<'doc>);
+#[repr(transparent)]
+struct ParseableMap<'doc>(RawMap<'doc, FxBuildHasher>);
 
 // transparent newtype for implementing ValueView
-#[repr(transparent)]
-#[derive(Debug)]
+#[repr(transparent)]
 struct ParseableArray<'doc>(RawVec<'doc>);
 
 impl<'doc> ParseableMap<'doc> {
-    pub fn as_parseable<'a>(map: &'a RawMap<'doc>) -> &'a ParseableMap<'doc> {
+    pub fn as_parseable<'a>(map: &'a RawMap<'doc, FxBuildHasher>) -> &'a ParseableMap<'doc> {
         // SAFETY: repr(transparent)
-        unsafe { &*(map as *const RawMap as *const Self) }
+        unsafe { &*(map as *const RawMap<FxBuildHasher> as *const Self) }
     }
 }
@@ -447,8 +447,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
     }
 
     fn render(&self) -> DisplayCow<'_> {
-        use raw_collections::value::Number;
-        use raw_collections::Value;
+        use bumparaw_collections::value::Number;
+        use bumparaw_collections::Value;
 
         match &self.value {
             Value::Null => LiquidValue::Nil.render(),
             Value::Bool(v) => v.render(),
@@ -464,8 +465,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
     }
 
     fn source(&self) -> DisplayCow<'_> {
-        use raw_collections::value::Number;
-        use raw_collections::Value;
+        use bumparaw_collections::value::Number;
+        use bumparaw_collections::Value;
 
         match &self.value {
             Value::Null => LiquidValue::Nil.source(),
             Value::Bool(v) => ValueView::source(v),
@@ -481,8 +483,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
     }
 
     fn type_name(&self) -> &'static str {
-        use raw_collections::value::Number;
-        use raw_collections::Value;
+        use bumparaw_collections::value::Number;
+        use bumparaw_collections::Value;
 
         match &self.value {
             Value::Null => LiquidValue::Nil.type_name(),
             Value::Bool(v) => v.type_name(),
@@ -498,7 +501,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
     }
 
     fn query_state(&self, state: State) -> bool {
-        use raw_collections::Value;
+        use bumparaw_collections::Value;
 
         match &self.value {
             Value::Null => ValueView::query_state(&LiquidValue::Nil, state),
             Value::Bool(v) => ValueView::query_state(v, state),
@@ -515,7 +519,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
     }
 
     fn to_kstr(&self) -> KStringCow<'_> {
-        use raw_collections::Value;
+        use bumparaw_collections::Value;
 
         match &self.value {
             Value::Null => ValueView::to_kstr(&LiquidValue::Nil),
             Value::Bool(v) => ValueView::to_kstr(v),
@@ -527,12 +532,14 @@ impl<'doc> ValueView for ParseableValue<'doc> {
     }
 
     fn to_value(&self) -> LiquidValue {
-        use raw_collections::Value;
+        use bumparaw_collections::value::Number;
+        use bumparaw_collections::Value;
 
         match &self.value {
             Value::Null => LiquidValue::Nil,
             Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)),
             Value::Number(number) => match number {
-                raw_collections::value::Number::PosInt(number) => {
+                Number::PosInt(number) => {
                     let number: i64 = match (*number).try_into() {
                         Ok(number) => number,
                         Err(_) => {
@@ -541,12 +548,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
                     };
                     LiquidValue::Scalar(ScalarCow::new(number))
                 }
-                raw_collections::value::Number::NegInt(number) => {
-                    LiquidValue::Scalar(ScalarCow::new(*number))
-                }
-                raw_collections::value::Number::Finite(number) => {
-                    LiquidValue::Scalar(ScalarCow::new(*number))
-                }
+                Number::NegInt(number) => LiquidValue::Scalar(ScalarCow::new(*number)),
+                Number::Finite(number) => LiquidValue::Scalar(ScalarCow::new(*number)),
             },
             Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())),
             Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(),
@@ -555,8 +558,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
     }
 
     fn as_scalar(&self) -> Option<liquid::model::ScalarCow<'_>> {
-        use raw_collections::value::Number;
-        use raw_collections::Value;
+        use bumparaw_collections::value::Number;
+        use bumparaw_collections::Value;
 
         match &self.value {
             Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)),
             Value::Number(number) => match number {
@@ -576,34 +580,41 @@ impl<'doc> ValueView for ParseableValue<'doc> {
     }
 
     fn is_scalar(&self) -> bool {
-        use raw_collections::Value;
+        use bumparaw_collections::Value;
 
         matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_))
     }
 
     fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> {
-        if let raw_collections::Value::Array(array) = &self.value {
+        if let Value::Array(array) = &self.value {
             return Some(ParseableArray::as_parseable(array) as _);
         }
         None
     }
 
     fn is_array(&self) -> bool {
-        matches!(&self.value, raw_collections::Value::Array(_))
+        matches!(&self.value, bumparaw_collections::Value::Array(_))
     }
 
     fn as_object(&self) -> Option<&dyn ObjectView> {
-        if let raw_collections::Value::Object(object) = &self.value {
+        if let Value::Object(object) = &self.value {
            return Some(ParseableMap::as_parseable(object) as _);
         }
         None
     }
 
     fn is_object(&self) -> bool {
-        matches!(&self.value, raw_collections::Value::Object(_))
+        matches!(&self.value, bumparaw_collections::Value::Object(_))
     }
 
     fn is_nil(&self) -> bool {
-        matches!(&self.value, raw_collections::Value::Null)
+        matches!(&self.value, bumparaw_collections::Value::Null)
     }
 }
 
+impl Debug for ParseableValue<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("ParseableValue").field("value", &self.value).finish()
+    }
+}
@@ -38,6 +38,16 @@ pub struct RenderPromptError {
     pub fault: FaultSource,
 }
 impl RenderPromptError {
+    pub(crate) fn missing_context_with_external_docid(
+        external_docid: String,
+        inner: liquid::Error,
+    ) -> RenderPromptError {
+        Self {
+            kind: RenderPromptErrorKind::MissingContextWithExternalDocid(external_docid, inner),
+            fault: FaultSource::User,
+        }
+    }
+
     pub(crate) fn missing_context(inner: liquid::Error) -> RenderPromptError {
         Self { kind: RenderPromptErrorKind::MissingContext(inner), fault: FaultSource::User }
     }
@@ -47,6 +57,8 @@ impl RenderPromptError {
 pub enum RenderPromptErrorKind {
     #[error("missing field in document: {0}")]
     MissingContext(liquid::Error),
+    #[error("missing field in document `{0}`: {1}")]
+    MissingContextWithExternalDocid(String, liquid::Error),
 }
 
 impl From<RenderPromptError> for crate::Error {
@@ -119,6 +119,7 @@ impl Prompt {
         'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents
     >(
         &self,
+        external_docid: &str,
         document: impl crate::update::new::document::Document<'a> + Debug,
         field_id_map: &RefCell<GlobalFieldsIdsMap>,
         doc_alloc: &'doc Bump,
@@ -130,9 +131,12 @@ impl Prompt {
             self.max_bytes.unwrap_or_else(default_max_bytes).get(),
             doc_alloc,
         );
-        self.template
-            .render_to(&mut rendered, &context)
-            .map_err(RenderPromptError::missing_context)?;
+        self.template.render_to(&mut rendered, &context).map_err(|liquid_error| {
+            RenderPromptError::missing_context_with_external_docid(
+                external_docid.to_owned(),
+                liquid_error,
+            )
+        })?;
         Ok(std::str::from_utf8(rendered.into_bump_slice())
             .expect("render can only write UTF-8 because all inputs and processing preserve utf-8"))
     }
@@ -207,7 +207,11 @@ impl<'a> Search<'a> {
                 Ok(embedding) => embedding,
                 Err(error) => {
                     tracing::error!(error=%error, "Embedding failed");
-                    return Ok((keyword_results, Some(0)));
+                    return Ok(return_keyword_results(
+                        self.limit,
+                        self.offset,
+                        keyword_results,
+                    ));
                 }
             }
         }
@@ -274,7 +274,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     last_match_last_token_position_plus_one
                 } else {
                     // we have matched the end of possible tokens, there's nothing to advance
-                    tokens.len() - 1
+                    tokens.len()
                 }
             };
 
@@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy;
 use self::graph_based_ranking_rule::Words;
 use self::interner::Interned;
 use self::vector_sort::VectorSort;
+use crate::index::PrefixSearch;
 use crate::localized_attributes_rules::LocalizedFieldIds;
 use crate::score_details::{ScoreDetails, ScoringStrategy};
 use crate::search::new::distinct::apply_distinct_rule;
@@ -68,6 +69,7 @@ pub struct SearchContext<'ctx> {
     pub term_interner: Interner<QueryTerm>,
     pub phrase_docids: PhraseDocIdsCache,
     pub restricted_fids: Option<RestrictedFids>,
+    pub prefix_search: PrefixSearch,
 }
 
 impl<'ctx> SearchContext<'ctx> {
@@ -85,6 +87,8 @@ impl<'ctx> SearchContext<'ctx> {
             }
         }
 
+        let prefix_search = index.prefix_search(txn)?.unwrap_or_default();
+
         Ok(Self {
             index,
             txn,
@@ -94,9 +98,14 @@ impl<'ctx> SearchContext<'ctx> {
             term_interner: <_>::default(),
             phrase_docids: <_>::default(),
             restricted_fids: None,
+            prefix_search,
         })
     }
 
+    pub fn is_prefix_search_allowed(&self) -> bool {
+        self.prefix_search != PrefixSearch::Disabled
+    }
+
     pub fn attributes_to_search_on(
         &mut self,
         attributes_to_search_on: &'ctx [String],
@@ -28,6 +28,7 @@ pub fn located_query_terms_from_tokens(
     words_limit: Option<usize>,
 ) -> Result<ExtractedTokens> {
     let nbr_typos = number_of_typos_allowed(ctx)?;
+    let allow_prefix_search = ctx.is_prefix_search_allowed();
 
     let mut query_terms = Vec::new();
 
@@ -94,7 +95,7 @@ pub fn located_query_terms_from_tokens(
                     ctx,
                     word,
                     nbr_typos(word),
-                    true,
+                    allow_prefix_search,
                     false,
                 )?;
                 let located_term = LocatedQueryTerm {
@@ -193,15 +193,23 @@ pub fn compute_phrase_docids(
     if words.is_empty() {
         return Ok(RoaringBitmap::new());
     }
-    let mut candidates = RoaringBitmap::new();
+    let mut candidates = None;
     for word in words.iter().flatten().copied() {
         if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? {
-            candidates |= word_docids;
+            if let Some(candidates) = candidates.as_mut() {
+                *candidates &= word_docids;
+            } else {
+                candidates = Some(word_docids);
+            }
         } else {
             return Ok(RoaringBitmap::new());
         }
     }
 
+    let Some(mut candidates) = candidates else {
+        return Ok(RoaringBitmap::new());
+    };
+
     let winsize = words.len().min(3);
 
     for win in words.windows(winsize) {
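The change above switches the initial phrase candidates from a union (`|=`) to an intersection (`&=`) of the word posting lists, so only documents containing every word of the phrase remain candidates before the window checks. A standalone sketch of the intersection pattern using the roaring crate (names are illustrative):

    use roaring::RoaringBitmap;

    fn phrase_candidates(posting_lists: &[RoaringBitmap]) -> RoaringBitmap {
        let mut candidates: Option<RoaringBitmap> = None;
        for docids in posting_lists {
            match candidates.as_mut() {
                // Intersect with every subsequent word's documents.
                Some(candidates) => *candidates &= docids,
                // Start from the first word's documents.
                None => candidates = Some(docids.clone()),
            }
        }
        // No words at all means no candidates.
        candidates.unwrap_or_default()
    }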
@@ -5,6 +5,7 @@ use bumpalo::Bump;
 use heed::EnvOpenOptions;
 use maplit::{btreemap, hashset};
 
+use crate::progress::Progress;
 use crate::update::new::indexer;
 use crate::update::{IndexDocumentsMethod, IndexerConfig, Settings};
 use crate::vector::EmbeddingConfigs;
@@ -72,7 +73,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
        None,
        &mut new_fields_ids_map,
        &|| false,
        &|_progress| (),
        Progress::default(),
    )
    .unwrap();

@@ -83,6 +84,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
    indexer::index(
        &mut wtxn,
        &index,
        &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
        config.grenad_parameters(),
        &db_fields_ids_map,
        new_fields_ids_map,
@@ -90,7 +92,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
        &document_changes,
        embedders,
        &|| false,
        &|_| (),
        &Progress::default(),
    )
    .unwrap();

@@ -172,6 +172,14 @@ impl<'i> FacetsUpdate<'i> {
             incremental_update.execute(wtxn)?;
         }
 
+        if !self.index.facet_search(wtxn)? {
+            // If facet search is disabled, we don't need to compute facet search databases.
+            // We clear the facet search databases.
+            self.index.facet_id_string_fst.clear(wtxn)?;
+            self.index.facet_id_normalized_string_strings.clear(wtxn)?;
+            return Ok(());
+        }
+
         match self.normalized_delta_data {
             Some(data) => index_facet_search(wtxn, data, self.index),
             None => Ok(()),
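Where the toggle comes from: the new `facet_search(wtxn)?` check reads the boolean stored under `main_key::FACET_SEARCH`, defaulting to `true`, as shown in the `Index::facet_search` accessor earlier. A hypothetical settings flow, for illustration only:

    // Illustrative: once facet search is disabled, the next facets update clears
    // facet_id_string_fst and facet_id_normalized_string_strings, as in the hunk above.
    index.put_facet_search(&mut wtxn, false)?;
    assert!(!index.facet_search(&rtxn)?);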
@@ -58,9 +58,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         .map(|s| s.iter().map(String::as_str).collect());
     let old_dictionary: Option<Vec<_>> =
         settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-    let del_builder =
+    let mut del_builder =
         tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref());
-    let del_tokenizer = del_builder.into_tokenizer();
+    let del_tokenizer = del_builder.build();
 
     let new_stop_words = settings_diff.new.stop_words.as_ref();
     let new_separators: Option<Vec<_>> = settings_diff
@@ -70,9 +70,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         .map(|s| s.iter().map(String::as_str).collect());
     let new_dictionary: Option<Vec<_>> =
         settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-    let add_builder =
+    let mut add_builder =
         tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref());
-    let add_tokenizer = add_builder.into_tokenizer();
+    let add_tokenizer = add_builder.build();
 
     // iterate over documents.
     let mut cursor = obkv_documents.into_cursor()?;
@@ -34,10 +34,12 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
         extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
     } else {
         let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
+        let facet_search = settings_diff.new.facet_search;
         extract_facet_string_docids_document_update(
             docid_fid_facet_string,
             indexer,
             localized_field_ids,
+            facet_search,
         )
     }
 }
@@ -51,6 +53,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
     docid_fid_facet_string: grenad::Reader<R>,
     indexer: GrenadParameters,
     localized_field_ids: &LocalizedFieldIds,
+    facet_search: bool,
 ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
     let max_memory = indexer.max_memory_by_thread();
 
@@ -96,7 +99,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
         let normalized_value = str::from_utf8(normalized_value_bytes)?;
 
         // Facet search normalization
-        {
+        if facet_search {
             let locales = localized_field_ids.locales(field_id);
             let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
 
@@ -179,8 +182,10 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
         let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
 
         let are_same_locales = old_locales == new_locales;
+        let reindex_facet_search =
+            settings_diff.new.facet_search && !settings_diff.old.facet_search;
 
-        if is_same_value && are_same_locales {
+        if is_same_value && are_same_locales && !reindex_facet_search {
             continue;
         }
 
@@ -191,18 +196,26 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
         let normalized_value = str::from_utf8(normalized_value_bytes)?;
 
         // Facet search normalization
-        {
-            let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
-            let new_hyper_normalized_value = if are_same_locales {
-                &old_hyper_normalized_value
+        if settings_diff.new.facet_search {
+            let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales);
+            let old_hyper_normalized_value;
+            let old_hyper_normalized_value = if !settings_diff.old.facet_search
+                || deladd_reader.get(DelAdd::Deletion).is_none()
+            {
+                // if the facet search is disabled in the old settings or if no facet string is deleted,
+                // we don't need to normalize the facet string.
+                None
+            } else if are_same_locales {
+                Some(&new_hyper_normalized_value)
             } else {
-                &normalize_facet_string(normalized_value, new_locales)
+                old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
+                Some(&old_hyper_normalized_value)
             };
 
             let set = BTreeSet::from_iter(std::iter::once(normalized_value));
 
             // if the facet string is the same, we can put the deletion and addition in the same obkv.
-            if old_hyper_normalized_value == new_hyper_normalized_value.as_str() {
+            if old_hyper_normalized_value == Some(&new_hyper_normalized_value) {
                 // nothing to do if we delete and re-add the value.
                 if is_same_value {
                     continue;
@@ -222,7 +235,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
             } else {
                 // if the facet string is different, we need to insert the deletion and addition in different obkv because the related key is different.
                 // deletion
-                if deladd_reader.get(DelAdd::Deletion).is_some() {
+                if let Some(old_hyper_normalized_value) = old_hyper_normalized_value {
                     // insert old value
                     let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
                     buffer.clear();
@@ -80,7 +80,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     let new_faceted_fids: BTreeSet<_> =
         settings_diff.new.faceted_fields_ids.iter().copied().collect();
 
-    if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
+    if !settings_diff.settings_update_only || settings_diff.reindex_facets() {
         let mut cursor = obkv_documents.into_cursor()?;
         while let Some((docid_bytes, value)) = cursor.move_on_next()? {
             let obkv = obkv::KvReader::from_slice(value);
@@ -112,8 +112,10 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                 (field_id, None, add_value)
             }
             EitherOrBoth::Both(&field_id, _) => {
-                // during settings update, recompute the changing settings only.
-                if settings_diff.settings_update_only {
+                // during settings update, recompute the changing settings only unless a global change is detected.
+                if settings_diff.settings_update_only
+                    && !settings_diff.global_facet_settings_changed()
+                {
                     continue;
                 }
 
@@ -29,6 +29,7 @@ pub use self::transform::{Transform, TransformOutput};
 use super::new::StdResult;
 use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError};
+use crate::index::{PrefixSearch, PrefixSettings};
 use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
@@ -82,8 +83,6 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
 
 #[derive(Default, Debug, Clone)]
 pub struct IndexDocumentsConfig {
-    pub words_prefix_threshold: Option<u32>,
-    pub max_prefix_length: Option<usize>,
     pub words_positions_level_group_size: Option<NonZeroU32>,
     pub words_positions_min_level_size: Option<NonZeroU32>,
     pub update_method: IndexDocumentsMethod,
@@ -565,14 +564,32 @@ where
             self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?;
 
         // Run the words prefixes update operation.
-        let mut builder = WordsPrefixesFst::new(self.wtxn, self.index);
-        if let Some(value) = self.config.words_prefix_threshold {
-            builder.threshold(value);
+        let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } =
+            self.index.prefix_settings(self.wtxn)?;
+
+        // If the prefix search is enabled at indexing time, we compute the prefixes.
+        if compute_prefixes == PrefixSearch::IndexingTime {
+            let mut builder = WordsPrefixesFst::new(self.wtxn, self.index);
+            builder.threshold(prefix_count_threshold);
+            builder.max_prefix_length(max_prefix_length);
+            builder.execute()?;
+        } else {
+            // If the prefix search is disabled at indexing time, we delete the previous words prefixes fst.
+            // And all the associated docids databases.
+            self.index.delete_words_prefixes_fst(self.wtxn)?;
+            self.index.word_prefix_docids.clear(self.wtxn)?;
+            self.index.exact_word_prefix_docids.clear(self.wtxn)?;
+            self.index.word_prefix_position_docids.clear(self.wtxn)?;
+            self.index.word_prefix_fid_docids.clear(self.wtxn)?;
+
+            databases_seen += 3;
+            (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+                databases_seen,
+                total_databases: TOTAL_POSTING_DATABASE_COUNT,
+            });
+
+            return Ok(());
         }
-        if let Some(value) = self.config.max_prefix_length {
-            builder.max_prefix_length(value);
-        }
-        builder.execute()?;
 
         if (self.should_abort)() {
             return Err(Error::InternalError(InternalError::AbortedIndexation));
@@ -749,6 +766,7 @@ mod tests {
     use crate::documents::mmap_from_objects;
     use crate::index::tests::TempIndex;
    use crate::index::IndexEmbeddingConfig;
+    use crate::progress::Progress;
     use crate::search::TermsMatchingStrategy;
     use crate::update::new::indexer;
     use crate::update::Setting;
@@ -1947,7 +1965,7 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

@@ -2131,13 +2149,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2145,7 +2164,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -2192,13 +2211,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2206,7 +2226,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -2244,13 +2264,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2258,7 +2279,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -2295,13 +2316,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2309,7 +2331,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -2348,13 +2370,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2362,7 +2385,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -2406,13 +2429,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2420,7 +2444,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -2457,13 +2481,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2471,7 +2496,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -2508,13 +2533,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2522,7 +2548,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -2701,13 +2727,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2715,7 +2742,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -2759,13 +2786,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2773,7 +2801,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -2814,13 +2842,14 @@ mod tests {
            None,
            &mut new_fields_ids_map,
            &|| false,
            &|_progress| (),
            Progress::default(),
        )
        .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
@@ -2828,7 +2857,7 @@ mod tests {
            &document_changes,
            embedders,
            &|| false,
            &|_| (),
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();
@@ -667,14 +667,23 @@ impl<'a, 'i> Transform<'a, 'i> {
         let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) };
 
         // If only a faceted field has been added, keep only this field.
-        let must_reindex_facets = settings_diff.reindex_facets();
-        let necessary_faceted_field = |id: FieldId| -> bool {
-            let field_name = settings_diff.new.fields_ids_map.name(id).unwrap();
-            must_reindex_facets
-                && modified_faceted_fields
-                    .iter()
-                    .any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long))
-        };
+        let global_facet_settings_changed = settings_diff.global_facet_settings_changed();
+        let facet_fids_changed = settings_diff.facet_fids_changed();
+        let necessary_faceted_field =
+            |id: FieldId| -> bool {
+                let field_name = settings_diff.new.fields_ids_map.name(id).unwrap();
+                if global_facet_settings_changed {
+                    settings_diff.new.user_defined_faceted_fields.iter().any(|long| {
+                        is_faceted_by(long, field_name) || is_faceted_by(field_name, long)
+                    })
+                } else if facet_fids_changed {
+                    modified_faceted_fields.iter().any(|long| {
+                        is_faceted_by(long, field_name) || is_faceted_by(field_name, long)
+                    })
+                } else {
+                    false
+                }
+            };
 
         // Alway provide all fields when vectors are involved because
         // we need the fields for the prompt/templating.
(File diff suppressed because it is too large.)
@@ -1,7 +1,8 @@
 use std::collections::{BTreeMap, BTreeSet};
 
+use bumparaw_collections::RawMap;
 use heed::RoTxn;
-use raw_collections::RawMap;
+use rustc_hash::FxBuildHasher;
 use serde_json::value::RawValue;
 
 use super::vector_document::VectorDocument;
@@ -385,12 +386,12 @@ pub type Entry<'doc> = (&'doc str, &'doc RawValue);
 
 #[derive(Debug)]
 pub struct Versions<'doc> {
-    data: RawMap<'doc>,
+    data: RawMap<'doc, FxBuildHasher>,
 }
 
 impl<'doc> Versions<'doc> {
     pub fn multiple(
-        mut versions: impl Iterator<Item = Result<RawMap<'doc>>>,
+        mut versions: impl Iterator<Item = Result<RawMap<'doc, FxBuildHasher>>>,
     ) -> Result<Option<Self>> {
         let Some(data) = versions.next() else { return Ok(None) };
         let mut data = data?;
@@ -403,7 +404,7 @@ impl<'doc> Versions<'doc> {
         Ok(Some(Self::single(data)))
     }
 
-    pub fn single(version: RawMap<'doc>) -> Self {
+    pub fn single(version: RawMap<'doc, FxBuildHasher>) -> Self {
         Self { data: version }
     }
 
@@ -1,7 +1,10 @@
 use bumpalo::Bump;
 use heed::RoTxn;
 
-use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions};
+use super::document::{
+    Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions,
+};
+use super::extract::perm_json_p;
 use super::vector_document::{
     MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions,
 };
@@ -164,6 +167,80 @@ impl<'doc> Update<'doc> {
         }
     }
 
+    /// Returns whether the updated version of the document is different from the current version for the passed subset of fields.
+    ///
+    /// `true` if at least one top-level-field that is a exactly a member of field or a parent of a member of field changed.
+    /// Otherwise `false`.
+    pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>(
+        &self,
+        fields: Option<&[&str]>,
+        rtxn: &'t RoTxn,
+        index: &'t Index,
+        mapper: &'t Mapper,
+    ) -> Result<bool> {
+        let mut changed = false;
+        let mut cached_current = None;
+        let mut updated_selected_field_count = 0;
+
+        for entry in self.updated().iter_top_level_fields() {
+            let (key, updated_value) = entry?;
+
+            if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip {
+                continue;
+            }
+
+            updated_selected_field_count += 1;
+            let current = match cached_current {
+                Some(current) => current,
+                None => self.current(rtxn, index, mapper)?,
+            };
+            let current_value = current.top_level_field(key)?;
+            let Some(current_value) = current_value else {
+                changed = true;
+                break;
+            };
+
+            if current_value.get() != updated_value.get() {
+                changed = true;
+                break;
+            }
+            cached_current = Some(current);
+        }
+
+        if !self.has_deletion {
+            // no field deletion, so fields that don't appear in `updated` cannot have changed
+            return Ok(changed);
+        }
+
+        if changed {
+            return Ok(true);
+        }
+
+        // we saw all updated fields, and set `changed` if any field wasn't in `current`.
+        // so if there are as many fields in `current` as in `updated`, then nothing changed.
+        // If there is any more fields in `current`, then they are missing in `updated`.
+        let has_deleted_fields = {
+            let current = match cached_current {
+                Some(current) => current,
+                None => self.current(rtxn, index, mapper)?,
+            };
+
+            let mut current_selected_field_count = 0;
+            for entry in current.iter_top_level_fields() {
+                let (key, _) = entry?;
+
+                if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip {
+                    continue;
+                }
+                current_selected_field_count += 1;
+            }
+
+            current_selected_field_count != updated_selected_field_count
+        };
+
+        Ok(has_deleted_fields)
+    }
+
     pub fn updated_vectors(
         &self,
         doc_alloc: &'doc Bump,
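A sketch of how this helper might be called to decide whether a document's prompt needs re-rendering. The surrounding variables (`update`, `rtxn`, `index`, `fields_ids_map`, the field names) are assumptions for illustration; only the method signature comes from the diff above:

    // Hypothetical call site: skip re-embedding when none of the fields used by the
    // prompt template changed between the current and updated document versions.
    let fields: Option<&[&str]> = Some(&["title", "overview"]);
    let changed = update.has_changed_for_fields(fields, rtxn, index, fields_ids_map)?;
    if !changed {
        // reuse the previously computed embedding for this document
    }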
@@ -69,12 +69,12 @@ use std::io::BufReader;
 use std::{io, iter, mem};
 
 use bumpalo::Bump;
+use bumparaw_collections::bbbul::{BitPacker, BitPacker4x};
+use bumparaw_collections::map::FrozenMap;
+use bumparaw_collections::{Bbbul, FrozenBbbul};
 use grenad::ReaderCursor;
 use hashbrown::hash_map::RawEntryMut;
 use hashbrown::HashMap;
-use raw_collections::bbbul::{BitPacker, BitPacker4x};
-use raw_collections::map::FrozenMap;
-use raw_collections::{Bbbul, FrozenBbbul};
 use roaring::RoaringBitmap;
 use rustc_hash::FxBuildHasher;
 
@@ -177,12 +177,12 @@ impl<'extractor> BalancedCaches<'extractor> {
         Ok(())
     }
 
-    pub fn freeze(&mut self) -> Result<Vec<FrozenCache<'_, 'extractor>>> {
+    pub fn freeze(&mut self, source_id: usize) -> Result<Vec<FrozenCache<'_, 'extractor>>> {
         match &mut self.caches {
             InnerCaches::Normal(NormalCaches { caches }) => caches
                 .iter_mut()
                 .enumerate()
-                .map(|(bucket, map)| {
+                .map(|(bucket_id, map)| {
                     // safety: we are transmuting the Bbbul into a FrozenBbbul
                     // that are the same size.
                     let map = unsafe {
@@ -201,14 +201,19 @@ impl<'extractor> BalancedCaches<'extractor> {
                         >,
                         >(map)
                     };
-                    Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() })
+                    Ok(FrozenCache {
+                        source_id,
+                        bucket_id,
+                        cache: FrozenMap::new(map),
+                        spilled: Vec::new(),
+                    })
                 })
                 .collect(),
             InnerCaches::Spilling(SpillingCaches { caches, spilled_entries, .. }) => caches
                 .iter_mut()
                 .zip(mem::take(spilled_entries))
                 .enumerate()
-                .map(|(bucket, (map, sorter))| {
+                .map(|(bucket_id, (map, sorter))| {
                     let spilled = sorter
                         .into_reader_cursors()?
                         .into_iter()
@@ -234,7 +239,7 @@ impl<'extractor> BalancedCaches<'extractor> {
                         >,
                         >(map)
                     };
-                    Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled })
+                    Ok(FrozenCache { source_id, bucket_id, cache: FrozenMap::new(map), spilled })
                 })
                 .collect(),
         }
@@ -415,21 +420,21 @@ fn spill_entry_to_sorter(
     match deladd {
         DelAddRoaringBitmap { del: Some(del), add: None } => {
             cbo_buffer.clear();
-            CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
+            CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer);
             value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
         }
         DelAddRoaringBitmap { del: None, add: Some(add) } => {
             cbo_buffer.clear();
-            CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
+            CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer);
             value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
         }
         DelAddRoaringBitmap { del: Some(del), add: Some(add) } => {
             cbo_buffer.clear();
-            CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
+            CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer);
             value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
 
             cbo_buffer.clear();
-            CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
+            CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer);
             value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
         }
         DelAddRoaringBitmap { del: None, add: None } => return Ok(()),
@ -440,7 +445,8 @@ fn spill_entry_to_sorter(
|
|||
}
|
||||
|
||||
pub struct FrozenCache<'a, 'extractor> {
|
||||
bucket: usize,
|
||||
bucket_id: usize,
|
||||
source_id: usize,
|
||||
cache: FrozenMap<
|
||||
'a,
|
||||
'extractor,
|
||||
|
@ -457,40 +463,36 @@ pub fn transpose_and_freeze_caches<'a, 'extractor>(
|
|||
let width = caches.first().map(BalancedCaches::buckets).unwrap_or(0);
|
||||
let mut bucket_caches: Vec<_> = iter::repeat_with(Vec::new).take(width).collect();
|
||||
|
||||
for thread_cache in caches {
|
||||
for frozen in thread_cache.freeze()? {
|
||||
bucket_caches[frozen.bucket].push(frozen);
|
||||
for (thread_index, thread_cache) in caches.iter_mut().enumerate() {
|
||||
for frozen in thread_cache.freeze(thread_index)? {
|
||||
bucket_caches[frozen.bucket_id].push(frozen);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(bucket_caches)
|
||||
}
|
||||
|
||||
/// Merges the caches that must be all associated to the same bucket.
|
||||
/// Merges the caches that must be all associated to the same bucket
|
||||
/// but make sure to sort the different buckets before performing the merges.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - If the bucket IDs in these frozen caches are not exactly the same.
|
||||
pub fn merge_caches<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()>
|
||||
pub fn merge_caches_sorted<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()>
|
||||
where
|
||||
F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>,
|
||||
{
|
||||
let mut maps = Vec::new();
|
||||
let mut readers = Vec::new();
|
||||
let mut current_bucket = None;
|
||||
for FrozenCache { bucket, cache, ref mut spilled } in frozen {
|
||||
assert_eq!(*current_bucket.get_or_insert(bucket), bucket);
|
||||
maps.push(cache);
|
||||
readers.append(spilled);
|
||||
}
|
||||
|
||||
// First manage the spilled entries by looking into the HashMaps,
|
||||
// merge them and mark them as dummy.
|
||||
let mut heap = BinaryHeap::new();
|
||||
for (source_index, source) in readers.into_iter().enumerate() {
|
||||
let mut cursor = source.into_cursor()?;
|
||||
if cursor.move_on_next()?.is_some() {
|
||||
heap.push(Entry { cursor, source_index });
|
||||
let mut current_bucket = None;
|
||||
for FrozenCache { source_id, bucket_id, cache, spilled } in frozen {
|
||||
assert_eq!(*current_bucket.get_or_insert(bucket_id), bucket_id);
|
||||
maps.push((source_id, cache));
|
||||
for reader in spilled {
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
if cursor.move_on_next()?.is_some() {
|
||||
heap.push(Entry { cursor, source_id });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -507,25 +509,29 @@ where
let mut output = DelAddRoaringBitmap::from_bytes(first_value)?;
while let Some(mut entry) = heap.peek_mut() {
if let Some((key, _value)) = entry.cursor.current() {
if first_key == key {
let new = DelAddRoaringBitmap::from_bytes(first_value)?;
output = output.merge(new);
// When we are done we the current value of this entry move make
// it move forward and let the heap reorganize itself (on drop)
if entry.cursor.move_on_next()?.is_none() {
PeekMut::pop(entry);
}
} else {
if let Some((key, value)) = entry.cursor.current() {
if first_key != key {
break;
}

let new = DelAddRoaringBitmap::from_bytes(value)?;
output = output.merge(new);
// When we are done we the current value of this entry move make
// it move forward and let the heap reorganize itself (on drop)
if entry.cursor.move_on_next()?.is_none() {
PeekMut::pop(entry);
}
}
}

// Once we merged all of the spilled bitmaps we must also
// fetch the entries from the non-spilled entries (the HashMaps).
for (map_index, map) in maps.iter_mut().enumerate() {
if first_entry.source_index != map_index {
for (source_id, map) in maps.iter_mut() {
debug_assert!(
!(map.get(first_key).is_some() && first_entry.source_id == *source_id),
"A thread should not have spiled a key that has been inserted in the cache"
);
if first_entry.source_id != *source_id {
if let Some(new) = map.get_mut(first_key) {
output.union_and_clear_bbbul(new);
}

@@ -537,22 +543,22 @@ where
// Don't forget to put the first entry back into the heap.
if first_entry.cursor.move_on_next()?.is_some() {
heap.push(first_entry)
heap.push(first_entry);
}
}

// Then manage the content on the HashMap entries that weren't taken (mem::take).
while let Some(mut map) = maps.pop() {
for (key, bbbul) in map.iter_mut() {
// Make sure we don't try to work with entries already managed by the spilled
if bbbul.is_empty() {
continue;
}
while let Some((_, mut map)) = maps.pop() {
// Make sure we don't try to work with entries already managed by the spilled
let mut ordered_entries: Vec<_> =
map.iter_mut().filter(|(_, bbbul)| !bbbul.is_empty()).collect();
ordered_entries.sort_unstable_by_key(|(key, _)| *key);

for (key, bbbul) in ordered_entries {
let mut output = DelAddRoaringBitmap::empty();
output.union_and_clear_bbbul(bbbul);

for rhs in maps.iter_mut() {
for (_, rhs) in maps.iter_mut() {
if let Some(new) = rhs.get_mut(key) {
output.union_and_clear_bbbul(new);
}

@@ -568,14 +574,14 @@ where
struct Entry<R> {
cursor: ReaderCursor<R>,
source_index: usize,
source_id: usize,
}

impl<R> Ord for Entry<R> {
fn cmp(&self, other: &Entry<R>) -> Ordering {
let skey = self.cursor.current().map(|(k, _)| k);
let okey = other.cursor.current().map(|(k, _)| k);
skey.cmp(&okey).then(self.source_index.cmp(&other.source_index)).reverse()
skey.cmp(&okey).then(self.source_id.cmp(&other.source_id)).reverse()
}
}

@@ -12,13 +12,14 @@ use crate::update::new::thread_local::FullySend;
use crate::update::new::DocumentChange;
use crate::vector::EmbeddingConfigs;
use crate::Result;
pub struct DocumentsExtractor<'a> {
document_sender: &'a DocumentsSender<'a>,

pub struct DocumentsExtractor<'a, 'b> {
document_sender: DocumentsSender<'a, 'b>,
embedders: &'a EmbeddingConfigs,
}

impl<'a> DocumentsExtractor<'a> {
pub fn new(document_sender: &'a DocumentsSender<'a>, embedders: &'a EmbeddingConfigs) -> Self {
impl<'a, 'b> DocumentsExtractor<'a, 'b> {
pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self {
Self { document_sender, embedders }
}
}

@@ -29,7 +30,7 @@ pub struct DocumentExtractorData {
pub field_distribution_delta: HashMap<String, i64>,
}

impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
impl<'a, 'b, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a, 'b> {
type Data = FullySend<RefCell<DocumentExtractorData>>;

fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> {

@@ -16,23 +16,23 @@ use crate::update::del_add::DelAdd;
use crate::update::new::channel::FieldIdDocidFacetSender;
use crate::update::new::extract::perm_json_p;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH};

pub struct FacetedExtractorData<'a> {
pub struct FacetedExtractorData<'a, 'b> {
attributes_to_extract: &'a [&'a str],
sender: &'a FieldIdDocidFacetSender<'a>,
sender: &'a FieldIdDocidFacetSender<'a, 'b>,
grenad_parameters: GrenadParameters,
buckets: usize,
}

impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> {
impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> {
type Data = RefCell<BalancedCaches<'extractor>>;

fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {

@@ -97,6 +97,15 @@ impl FacetedDocidsExtractor {
},
),
DocumentChange::Update(inner) => {
if !inner.has_changed_for_fields(
Some(attributes_to_extract),
rtxn,
index,
context.db_fields_ids_map,
)? {
return Ok(());
}

extract_document_facets(
attributes_to_extract,
inner.current(rtxn, index, context.db_fields_ids_map)?,

@@ -318,7 +327,7 @@ impl<'doc> DelAddFacetValue<'doc> {
docid: DocumentId,
sender: &FieldIdDocidFacetSender,
doc_alloc: &Bump,
) -> std::result::Result<(), crossbeam_channel::SendError<()>> {
) -> crate::Result<()> {
let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
for ((fid, value), deladd) in self.strings {
if let Ok(s) = std::str::from_utf8(&value) {

@@ -364,26 +373,16 @@ fn truncate_str(s: &str) -> &str {
impl FacetedDocidsExtractor {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
pub fn run_extraction<
'pl,
'fid,
'indexer,
'index,
'extractor,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
sender: &FieldIdDocidFacetSender,
step: Step,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let index = indexing_context.index;
let rtxn = index.read_txn()?;

@@ -1,6 +1,6 @@
use std::cell::RefCell;
use std::fs::File;
use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _};
use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Seek as _, Write as _};
use std::{iter, mem, result};

use bumpalo::Bump;

@@ -97,30 +97,34 @@ pub struct FrozenGeoExtractorData<'extractor> {
impl<'extractor> FrozenGeoExtractorData<'extractor> {
pub fn iter_and_clear_removed(
&mut self,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
mem::take(&mut self.removed)
) -> io::Result<impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_> {
Ok(mem::take(&mut self.removed)
.iter()
.copied()
.map(Ok)
.chain(iterator_over_spilled_geopoints(&mut self.spilled_removed))
.chain(iterator_over_spilled_geopoints(&mut self.spilled_removed)?))
}

pub fn iter_and_clear_inserted(
&mut self,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
mem::take(&mut self.inserted)
) -> io::Result<impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_> {
Ok(mem::take(&mut self.inserted)
.iter()
.copied()
.map(Ok)
.chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted))
.chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted)?))
}
}

fn iterator_over_spilled_geopoints(
spilled: &mut Option<BufReader<File>>,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
) -> io::Result<impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_> {
let mut spilled = spilled.take();
iter::from_fn(move || match &mut spilled {
if let Some(spilled) = &mut spilled {
spilled.rewind()?;
}

Ok(iter::from_fn(move || match &mut spilled {
Some(file) => {
let geopoint_bytes = &mut [0u8; mem::size_of::<ExtractedGeoPoint>()];
match file.read_exact(geopoint_bytes) {

@@ -130,7 +134,7 @@ fn iterator_over_spilled_geopoints(
}
}
None => None,
})
}))
}

impl<'extractor> Extractor<'extractor> for GeoExtractor {

@@ -157,7 +161,9 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
let mut data_ref = context.data.borrow_mut_or_yield();

for change in changes {
if max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm) {
if data_ref.spilled_removed.is_none()
&& max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm)
{
// We must spill as we allocated too much memory
data_ref.spilled_removed = tempfile::tempfile().map(BufWriter::new).map(Some)?;
data_ref.spilled_inserted = tempfile::tempfile().map(BufWriter::new).map(Some)?;

@@ -6,30 +6,31 @@ mod searchable;
mod vectors;

use bumpalo::Bump;
pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap};
pub use cache::{
merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap,
};
pub use documents::*;
pub use faceted::*;
pub use geo::*;
pub use searchable::*;
pub use vectors::EmbeddingExtractor;

use super::indexer::document_changes::{DocumentChanges, IndexingContext, Progress};
use super::steps::Step;
use super::indexer::document_changes::{DocumentChanges, IndexingContext};
use super::steps::IndexingStep;
use super::thread_local::{FullySend, ThreadLocal};
use crate::update::GrenadParameters;
use crate::Result;

pub trait DocidsExtractor {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: Step,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync;
MSP: Fn() -> bool + Sync;
}

/// TODO move in permissive json pointer

@@ -11,10 +11,10 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::extract::perm_json_p::contained_in;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;

@@ -28,7 +28,7 @@ pub struct WordDocidsBalancedCaches<'extractor> {
exact_word_docids: BalancedCaches<'extractor>,
word_position_docids: BalancedCaches<'extractor>,
fid_word_count_docids: BalancedCaches<'extractor>,
fid_word_count: HashMap<FieldId, (usize, usize)>,
fid_word_count: HashMap<FieldId, (Option<usize>, Option<usize>)>,
current_docid: Option<DocumentId>,
}

@@ -85,8 +85,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
self.fid_word_count
.entry(field_id)
.and_modify(|(_current_count, new_count)| *new_count += 1)
.or_insert((0, 1));
.and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1)
.or_insert((None, Some(1)));
self.current_docid = Some(docid);

Ok(())

@@ -130,8 +130,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
self.fid_word_count
.entry(field_id)
.and_modify(|(current_count, _new_count)| *current_count += 1)
.or_insert((1, 0));
.and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1)
.or_insert((Some(1), None));

self.current_docid = Some(docid);

@@ -141,14 +141,18 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
fn flush_fid_word_count(&mut self, buffer: &mut BumpVec<u8>) -> Result<()> {
for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
if current_count != new_count {
if current_count <= MAX_COUNTED_WORDS {
if let Some(current_count) =
current_count.filter(|current_count| *current_count <= MAX_COUNTED_WORDS)
{
buffer.clear();
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.push(current_count as u8);
self.fid_word_count_docids
.insert_del_u32(buffer, self.current_docid.unwrap())?;
}
if new_count <= MAX_COUNTED_WORDS {
if let Some(new_count) =
new_count.filter(|new_count| *new_count <= MAX_COUNTED_WORDS)
{
buffer.clear();
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.push(new_count as u8);

@@ -235,25 +239,15 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
pub struct WordDocidsExtractors;

impl WordDocidsExtractors {
pub fn run_extraction<
'pl,
'fid,
'indexer,
'index,
'extractor,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: Step,
step: IndexingStep,
) -> Result<WordDocidsCaches<'extractor>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let index = indexing_context.index;
let rtxn = index.read_txn()?;

@@ -351,6 +345,15 @@ impl WordDocidsExtractors {
)?;
}
DocumentChange::Update(inner) => {
if !inner.has_changed_for_fields(
document_tokenizer.attribute_to_extract,
&context.rtxn,
context.index,
context.db_fields_ids_map,
)? {
return Ok(());
}

let mut token_fn = |fname: &str, fid, pos, word: &str| {
cached_sorter.insert_del_u32(
fid,

@@ -70,6 +70,15 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
)?;
}
DocumentChange::Update(inner) => {
if !inner.has_changed_for_fields(
document_tokenizer.attribute_to_extract,
rtxn,
index,
context.db_fields_ids_map,
)? {
return Ok(());
}

let document = inner.current(rtxn, index, context.db_fields_ids_map)?;
process_document_tokens(
document,

@@ -14,9 +14,9 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer};
use super::cache::BalancedCaches;
use super::DocidsExtractor;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;

@@ -56,16 +56,15 @@ impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
}

pub trait SearchableExtractor: Sized + Sync {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: Step,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let rtxn = indexing_context.index.read_txn()?;
let stop_words = indexing_context.index.stop_words(&rtxn)?;

@@ -134,16 +133,15 @@ pub trait SearchableExtractor: Sized + Sync {
}

impl<T: SearchableExtractor> DocidsExtractor for T {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: Step,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
Self::run_extraction(
grenad_parameters,

@@ -176,9 +176,10 @@ pub fn tokenizer_builder<'a>(
#[cfg(test)]
mod test {
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use charabia::TokenizerBuilder;
use meili_snap::snapshot;
use raw_collections::RawMap;
use rustc_hash::FxBuildHasher;
use serde_json::json;
use serde_json::value::RawValue;

@@ -234,7 +235,7 @@ mod test {
let bump = Bump::new();
let document: &RawValue = serde_json::from_str(&document).unwrap();
let document = RawMap::from_raw_value(document, &bump).unwrap();
let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, &bump).unwrap();

let document = Versions::single(document);
let document = DocumentFromVersions::new(&document);

@@ -18,17 +18,17 @@ use crate::vector::error::{
use crate::vector::{Embedder, Embedding, EmbeddingConfigs};
use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError};

pub struct EmbeddingExtractor<'a> {
pub struct EmbeddingExtractor<'a, 'b> {
embedders: &'a EmbeddingConfigs,
sender: &'a EmbeddingSender<'a>,
sender: EmbeddingSender<'a, 'b>,
possible_embedding_mistakes: PossibleEmbeddingMistakes,
threads: &'a ThreadPoolNoAbort,
}

impl<'a> EmbeddingExtractor<'a> {
impl<'a, 'b> EmbeddingExtractor<'a, 'b> {
pub fn new(
embedders: &'a EmbeddingConfigs,
sender: &'a EmbeddingSender<'a>,
sender: EmbeddingSender<'a, 'b>,
field_distribution: &'a FieldDistribution,
threads: &'a ThreadPoolNoAbort,
) -> Self {

@@ -43,7 +43,7 @@ pub struct EmbeddingExtractorData<'extractor>(
unsafe impl MostlySend for EmbeddingExtractorData<'_> {}

impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
type Data = RefCell<EmbeddingExtractorData<'extractor>>;

fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result<Self::Data> {

@@ -130,6 +130,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
);
} else if new_vectors.regenerate {
let new_rendered = prompt.render_document(
update.external_document_id(),
update.current(
&context.rtxn,
context.index,

@@ -139,6 +140,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
&context.doc_alloc,
)?;
let old_rendered = prompt.render_document(
update.external_document_id(),
update.merged(
&context.rtxn,
context.index,

@@ -158,6 +160,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
}
} else if old_vectors.regenerate {
let old_rendered = prompt.render_document(
update.external_document_id(),
update.current(
&context.rtxn,
context.index,

@@ -167,6 +170,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
&context.doc_alloc,
)?;
let new_rendered = prompt.render_document(
update.external_document_id(),
update.merged(
&context.rtxn,
context.index,

@@ -216,6 +220,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
);
} else if new_vectors.regenerate {
let rendered = prompt.render_document(
insertion.external_document_id(),
insertion.inserted(),
context.new_fields_ids_map,
&context.doc_alloc,

@@ -229,6 +234,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
}
} else {
let rendered = prompt.render_document(
insertion.external_document_id(),
insertion.inserted(),
context.new_fields_ids_map,
&context.doc_alloc,

@@ -259,7 +265,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
// Currently this is the case as:
// 1. BVec are inside of the bumaplo
// 2. All other fields are either trivial (u8) or references.
struct Chunks<'a, 'extractor> {
struct Chunks<'a, 'b, 'extractor> {
texts: BVec<'a, &'a str>,
ids: BVec<'a, DocumentId>,

@@ -270,11 +276,11 @@ struct Chunks<'a, 'extractor> {
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
threads: &'a ThreadPoolNoAbort,
sender: &'a EmbeddingSender<'a>,
sender: EmbeddingSender<'a, 'b>,
has_manual_generation: Option<&'a str>,
}

impl<'a, 'extractor> Chunks<'a, 'extractor> {
impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
#[allow(clippy::too_many_arguments)]
pub fn new(
embedder: &'a Embedder,

@@ -284,7 +290,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
threads: &'a ThreadPoolNoAbort,
sender: &'a EmbeddingSender<'a>,
sender: EmbeddingSender<'a, 'b>,
doc_alloc: &'a Bump,
) -> Self {
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();

@@ -368,7 +374,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
unused_vectors_distribution: &UnusedVectorsDistributionBump,
threads: &ThreadPoolNoAbort,
sender: &EmbeddingSender<'a>,
sender: EmbeddingSender<'a, 'b>,
has_manual_generation: Option<&'a str>,
) -> Result<()> {
if let Some(external_docid) = has_manual_generation {

@@ -103,6 +103,8 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")]
pub fn merge_and_write(self, index: &Index, wtxn: &mut RwTxn, rtxn: &RoTxn) -> Result<()> {
tracing::trace!("merge facet strings for facet search: {:?}", self.registered_facets);

let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?;
let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString);
builder.extend(reader);

@@ -118,12 +120,15 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?;

if current_field_id != Some(field_id) {
if let Some(fst_merger_builder) = fst_merger_builder {
if let (Some(current_field_id), Some(fst_merger_builder)) =
(current_field_id, fst_merger_builder)
{
let mmap = fst_merger_builder.build(&mut callback)?;
index
.facet_id_string_fst
.remap_data_type::<Bytes>()
.put(wtxn, &field_id, &mmap)?;
index.facet_id_string_fst.remap_data_type::<Bytes>().put(
wtxn,
&current_field_id,
&mmap,
)?;
}

fst = index.facet_id_string_fst.get(rtxn, &field_id)?;

@@ -1,6 +1,8 @@
use std::ops::ControlFlow;

use bumpalo::Bump;
use bumparaw_collections::RawVec;
use rustc_hash::FxBuildHasher;
use serde::de::{DeserializeSeed, Deserializer as _, Visitor};
use serde_json::value::RawValue;

@@ -360,7 +362,7 @@ impl<'a> DeserrRawValue<'a> {
}

pub struct DeserrRawVec<'a> {
vec: raw_collections::RawVec<'a>,
vec: RawVec<'a>,
alloc: &'a Bump,
}

@@ -379,7 +381,7 @@ impl<'a> deserr::Sequence for DeserrRawVec<'a> {
}

pub struct DeserrRawVecIter<'a> {
it: raw_collections::vec::iter::IntoIter<'a>,
it: bumparaw_collections::vec::iter::IntoIter<'a>,
alloc: &'a Bump,
}

@@ -393,7 +395,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> {
}

pub struct DeserrRawMap<'a> {
map: raw_collections::RawMap<'a>,
map: bumparaw_collections::RawMap<'a, FxBuildHasher>,
alloc: &'a Bump,
}

@@ -416,7 +418,7 @@ impl<'a> deserr::Map for DeserrRawMap<'a> {
}

pub struct DeserrRawMapIter<'a> {
it: raw_collections::map::iter::IntoIter<'a>,
it: bumparaw_collections::map::iter::IntoIter<'a>,
alloc: &'a Bump,
}

@@ -615,7 +617,7 @@ impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> {
where
A: serde::de::SeqAccess<'de>,
{
let mut raw_vec = raw_collections::RawVec::new_in(self.alloc);
let mut raw_vec = RawVec::new_in(self.alloc);
while let Some(next) = seq.next_element()? {
raw_vec.push(next);
}

@@ -1,4 +1,5 @@
use std::cell::{Cell, RefCell};
use std::sync::atomic::Ordering;
use std::sync::{Arc, RwLock};

use bumpalo::Bump;

@@ -7,8 +8,9 @@ use rayon::iter::IndexedParallelIterator;
use super::super::document_change::DocumentChange;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::progress::{AtomicDocumentStep, Progress};
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result};

@@ -70,7 +72,7 @@ impl<
F: FnOnce(&'extractor Bump) -> Result<T>,
{
let doc_alloc =
doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024))));
doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024))));
let doc_alloc = doc_alloc.0.take();
let fields_ids_map = fields_ids_map_store
.get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into());

@@ -133,10 +135,8 @@ pub struct IndexingContext<
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
> where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
pub index: &'index Index,
pub db_fields_ids_map: &'indexer FieldsIdsMap,

@@ -144,7 +144,7 @@ pub struct IndexingContext<
pub doc_allocs: &'indexer ThreadLocal<FullySend<Cell<Bump>>>,
pub fields_ids_map_store: &'indexer ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
pub must_stop_processing: &'indexer MSP,
pub send_progress: &'indexer SP,
pub progress: &'indexer Progress,
}

impl<

@@ -152,18 +152,15 @@ impl<
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
> Copy
for IndexingContext<
'fid, // invariant lifetime of fields ids map
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
}

@@ -172,18 +169,15 @@ impl<
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
> Clone
for IndexingContext<
'fid, // invariant lifetime of fields ids map
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
fn clone(&self) -> Self {
*self

@@ -202,7 +196,6 @@ pub fn extract<
EX,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
document_changes: &DC,
extractor: &EX,

@@ -213,18 +206,18 @@ pub fn extract<
doc_allocs,
fields_ids_map_store,
must_stop_processing,
send_progress,
}: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
progress,
}: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
datastore: &'data ThreadLocal<EX::Data>,
step: Step,
step: IndexingStep,
) -> Result<()>
where
EX: Extractor<'extractor>,
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
tracing::trace!("We are resetting the extractor allocators");
progress.update_progress(step);
// Clean up and reuse the extractor allocs
for extractor_alloc in extractor_allocs.iter_mut() {
tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes());

@@ -232,9 +225,11 @@ where
}

let total_documents = document_changes.len() as u32;
let (step, progress_step) = AtomicDocumentStep::new(total_documents);
progress.update_progress(progress_step);

let pi = document_changes.iter(CHUNK_SIZE);
pi.enumerate().try_arc_for_each_try_init(
pi.try_arc_for_each_try_init(
|| {
DocumentChangeContext::new(
index,

@@ -247,13 +242,10 @@ where
move |index_alloc| extractor.init_data(index_alloc),
)
},
|context, (finished_documents, items)| {
|context, items| {
if (must_stop_processing)() {
return Err(Arc::new(InternalError::AbortedIndexation.into()));
}
let finished_documents = (finished_documents * CHUNK_SIZE) as u32;

(send_progress)(Progress::from_step_substep(step, finished_documents, total_documents));

// Clean up and reuse the document-specific allocator
context.doc_alloc.reset();

@@ -264,6 +256,7 @@ where
});

let res = extractor.process(changes, context).map_err(Arc::new);
step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);

// send back the doc_alloc in the pool
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));

@@ -271,32 +264,7 @@ where
res
},
)?;

(send_progress)(Progress::from_step_substep(step, total_documents, total_documents));
step.store(total_documents, Ordering::Relaxed);

Ok(())
}

pub struct Progress {
pub finished_steps: u16,
pub total_steps: u16,
pub step_name: &'static str,
pub finished_total_substep: Option<(u32, u32)>,
}

impl Progress {
pub fn from_step(step: Step) -> Self {
Self {
finished_steps: step.finished_steps(),
total_steps: Step::total_steps(),
step_name: step.name(),
finished_total_substep: None,
}
}
pub fn from_step_substep(step: Step, finished_substep: u32, total_substep: u32) -> Self {
Self {
finished_total_substep: Some((finished_substep, total_substep)),
..Progress::from_step(step)
}
}
}

@@ -92,11 +92,12 @@ mod test {
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::index::tests::TempIndex;
use crate::progress::Progress;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, Extractor, IndexingContext,
};
use crate::update::new::indexer::DocumentDeletion;
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{MostlySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::DocumentId;

@@ -164,7 +165,7 @@ mod test {
doc_allocs: &doc_allocs,
fields_ids_map_store: &fields_ids_map_store,
must_stop_processing: &(|| false),
send_progress: &(|_progress| {}),
progress: &Progress::default(),
};

for _ in 0..3 {

@@ -176,7 +177,7 @@ mod test {
context,
&mut extractor_allocs,
&datastore,
Step::ExtractingDocuments,
IndexingStep::ExtractingDocuments,
)
.unwrap();

@@ -1,19 +1,23 @@
use std::sync::atomic::Ordering;

use bumpalo::collections::CollectIn;
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use hashbrown::hash_map::Entry;
use heed::RoTxn;
use memmap2::Mmap;
use raw_collections::RawMap;
use rayon::slice::ParallelSlice;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue;
use serde_json::Deserializer;

use super::super::document_change::DocumentChange;
use super::document_changes::{DocumentChangeContext, DocumentChanges, Progress};
use super::document_changes::{DocumentChangeContext, DocumentChanges};
use super::retrieve_or_guess_primary_key;
use crate::documents::PrimaryKey;
use crate::progress::{AtomicPayloadStep, Progress};
use crate::update::new::document::Versions;
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::MostlySend;
use crate::update::new::{Deletion, Insertion, Update};
use crate::update::{AvailableIds, IndexDocumentsMethod};

@@ -44,7 +48,7 @@ impl<'pl> DocumentOperation<'pl> {
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")]
pub fn into_changes<MSP, SP>(
pub fn into_changes<MSP>(
self,
indexer: &'pl Bump,
index: &Index,

@@ -52,12 +56,12 @@ impl<'pl> DocumentOperation<'pl> {
primary_key_from_op: Option<&'pl str>,
new_fields_ids_map: &mut FieldsIdsMap,
must_stop_processing: &MSP,
send_progress: &SP,
progress: Progress,
) -> Result<(DocumentOperationChanges<'pl>, Vec<PayloadStats>, Option<PrimaryKey<'pl>>)>
where
MSP: Fn() -> bool,
SP: Fn(Progress),
{
progress.update_progress(IndexingStep::PreparingPayloads);
let Self { operations, method } = self;

let documents_ids = index.documents_ids(rtxn)?;

@@ -67,16 +71,14 @@ impl<'pl> DocumentOperation<'pl> {
let mut primary_key = None;

let payload_count = operations.len();
let (step, progress_step) = AtomicPayloadStep::new(payload_count as u32);
progress.update_progress(progress_step);

for (payload_index, operation) in operations.into_iter().enumerate() {
if must_stop_processing() {
return Err(InternalError::AbortedIndexation.into());
}
send_progress(Progress::from_step_substep(
Step::PreparingPayloads,
payload_index as u32,
payload_count as u32,
));
step.store(payload_index as u32, Ordering::Relaxed);

let mut bytes = 0;
let result = match operation {

@@ -117,12 +119,7 @@ impl<'pl> DocumentOperation<'pl> {
};
operations_stats.push(PayloadStats { document_count, bytes, error });
}

send_progress(Progress::from_step_substep(
Step::PreparingPayloads,
payload_count as u32,
payload_count as u32,
));
step.store(payload_count as u32, Ordering::Relaxed);

// TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> =

@@ -166,8 +163,9 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
// Only guess the primary key if it is the first document
let retrieved_primary_key = if previous_offset == 0 {
let doc =
RawMap::from_raw_value(doc, indexer).map(Some).map_err(UserError::SerdeJson)?;
let doc = RawMap::from_raw_value_and_hasher(doc, FxBuildHasher, indexer)
.map(Some)
.map_err(UserError::SerdeJson)?;

let result = retrieve_or_guess_primary_key(
rtxn,

@@ -545,8 +543,9 @@ impl MergeChanges for MergeDocumentForReplacement {
match operations.last() {
Some(InnerDocOp::Addition(DocumentOffset { content })) => {
let document = serde_json::from_slice(content).unwrap();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
.map_err(UserError::SerdeJson)?;
let document =
RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;

if is_new {
Ok(Some(DocumentChange::Insertion(Insertion::create(

@@ -632,8 +631,9 @@ impl MergeChanges for MergeDocumentForUpdates {
}
};
let document = serde_json::from_slice(content).unwrap();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
.map_err(UserError::SerdeJson)?;
let document =
RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;

Some(Versions::single(document))
}

@@ -647,8 +647,9 @@ impl MergeChanges for MergeDocumentForUpdates {
};

let document = serde_json::from_slice(content).unwrap();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
.map_err(UserError::SerdeJson)?;
let document =
RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;
Ok(document)
});
Versions::multiple(versions)?

@@ -1,9 +1,11 @@
use std::cmp::Ordering;
use std::sync::atomic::AtomicBool;
use std::sync::{OnceLock, RwLock};
use std::thread::{self, Builder};

use big_s::S;
use document_changes::{extract, DocumentChanges, IndexingContext, Progress};
use bumparaw_collections::RawMap;
use document_changes::{extract, DocumentChanges, IndexingContext};
pub use document_deletion::DocumentDeletion;
pub use document_operation::{DocumentOperation, PayloadStats};
use hashbrown::HashMap;

@@ -12,7 +14,7 @@ use heed::{RoTxn, RwTxn};
use itertools::{merge_join_by, EitherOrBoth};
pub use partial_dump::PartialDump;
use rand::SeedableRng as _;
use raw_collections::RawMap;
use rustc_hash::FxBuildHasher;
use time::OffsetDateTime;
pub use update_by_function::UpdateByFunction;

@@ -20,7 +22,7 @@ use super::channel::*;
use super::extract::*;
use super::facet_search_builder::FacetSearchBuilder;
use super::merger::FacetFieldIdsDelta;
use super::steps::Step;
use super::steps::IndexingStep;
use super::thread_local::ThreadLocal;
use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder};
use super::words_prefix_docids::{

@@ -31,6 +33,7 @@ use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
use crate::facet::FacetType;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
use crate::progress::Progress;
use crate::proximity::ProximityPrecision;
use crate::update::del_add::DelAdd;
use crate::update::new::extract::EmbeddingExtractor;

@@ -41,7 +44,7 @@ use crate::update::settings::InnerIndexSettings;
use crate::update::{FacetsUpdateBulk, GrenadParameters};
use crate::vector::{ArroyWrapper, EmbeddingConfigs, Embeddings};
use crate::{
FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort,
Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort,
ThreadPoolNoAbortBuilder, UserError,
};

@@ -58,9 +61,10 @@ mod update_by_function;
///
/// TODO return stats
#[allow(clippy::too_many_arguments)] // clippy: 😝
pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>(
pub fn index<'pl, 'indexer, 'index, DC, MSP>(
wtxn: &mut RwTxn,
index: &'index Index,
pool: &ThreadPoolNoAbort,
grenad_parameters: GrenadParameters,
db_fields_ids_map: &'indexer FieldsIdsMap,
new_fields_ids_map: FieldsIdsMap,

@@ -68,14 +72,44 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>(
document_changes: &DC,
embedders: EmbeddingConfigs,
must_stop_processing: &'indexer MSP,
send_progress: &'indexer SP,
progress: &'indexer Progress,
) -> Result<()>
where
DC: DocumentChanges<'pl>,
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000);
let mut bbbuffers = Vec::new();
let finished_extraction = AtomicBool::new(false);

// We reduce the actual memory used to 5%. The reason we do this here and not in Meilisearch
// is because we still use the old indexer for the settings and it is highly impacted by the
// max memory. So we keep the changes here and will remove these changes once we use the new
// indexer to also index settings. Related to #5125 and #5141.
let grenad_parameters = GrenadParameters {
max_memory: grenad_parameters.max_memory.map(|mm| mm * 5 / 100),
..grenad_parameters
};

// We compute and remove the allocated BBQueues buffers capacity from the indexing memory.
let minimum_capacity = 50 * 1024 * 1024 * pool.current_num_threads(); // 50 MiB
let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or(
(grenad_parameters, 2 * minimum_capacity), // 100 MiB by thread by default
|max_memory| {
// 2% of the indexing memory
let total_bbbuffer_capacity = (max_memory / 100 / 2).max(minimum_capacity);
let new_grenad_parameters = GrenadParameters {
max_memory: Some(
max_memory.saturating_sub(total_bbbuffer_capacity).max(100 * 1024 * 1024),
),
..grenad_parameters
};
(new_grenad_parameters, total_bbbuffer_capacity)
},
);

let (extractor_sender, mut writer_receiver) = pool
.install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000))
.unwrap();

let metadata_builder = MetadataBuilder::from_index(index, wtxn)?;
let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder);

|
|||
doc_allocs: &doc_allocs,
|
||||
fields_ids_map_store: &fields_ids_map_store,
|
||||
must_stop_processing,
|
||||
send_progress,
|
||||
progress,
|
||||
};
|
||||
|
||||
let mut index_embeddings = index.embedding_configs(wtxn)?;
|
||||
let mut field_distribution = index.field_distribution(wtxn)?;
|
||||
let mut document_ids = index.documents_ids(wtxn)?;
|
||||
|
||||
thread::scope(|s| -> Result<()> {
|
||||
let indexer_span = tracing::Span::current();
|
||||
let embedders = &embedders;
|
||||
let finished_extraction = &finished_extraction;
|
||||
// prevent moving the field_distribution and document_ids in the inner closure...
|
||||
let field_distribution = &mut field_distribution;
|
||||
let document_ids = &mut document_ids;
|
||||
let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
|
||||
let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract");
|
||||
let _entered = span.enter();
|
||||
|
||||
let rtxn = index.read_txn()?;
|
||||
|
||||
// document but we need to create a function that collects and compresses documents.
|
||||
let document_sender = extractor_sender.documents();
|
||||
let document_extractor = DocumentsExtractor::new(&document_sender, embedders);
|
||||
let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
|
||||
extract(document_changes,
|
||||
&document_extractor,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
&datastore,
|
||||
Step::ExtractingDocuments,
|
||||
)?;
|
||||
|
||||
for document_extractor_data in datastore {
|
||||
let document_extractor_data = document_extractor_data.0.into_inner();
|
||||
for (field, delta) in document_extractor_data.field_distribution_delta {
|
||||
let current = field_distribution.entry(field).or_default();
|
||||
// adding the delta should never cause a negative result, as we are removing fields that previously existed.
|
||||
*current = current.saturating_add_signed(delta);
|
||||
}
|
||||
document_extractor_data.docids_delta.apply_to(document_ids);
|
||||
}
|
||||
|
||||
field_distribution.retain(|_, v| *v != 0);
|
||||
|
||||
let facet_field_ids_delta;
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
|
||||
pool.install(move || {
|
||||
let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract");
|
||||
let _entered = span.enter();
|
||||
|
||||
facet_field_ids_delta = merge_and_send_facet_docids(
|
||||
FacetedDocidsExtractor::run_extraction(
|
||||
grenad_parameters,
|
||||
let rtxn = index.read_txn()?;
|
||||
|
||||
// document but we need to create a function that collects and compresses documents.
|
||||
let document_sender = extractor_sender.documents();
|
||||
let document_extractor = DocumentsExtractor::new(document_sender, embedders);
|
||||
let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents");
|
||||
let _entered = span.enter();
|
||||
extract(
|
||||
document_changes,
|
||||
&document_extractor,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
&extractor_sender.field_id_docid_facet_sender(),
|
||||
Step::ExtractingFacets
|
||||
)?,
|
||||
FacetDatabases::new(index),
|
||||
index,
|
||||
extractor_sender.facet_docids(),
|
||||
)?;
|
||||
}
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
|
||||
let WordDocidsCaches {
|
||||
word_docids,
|
||||
word_fid_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
fid_word_count_docids,
|
||||
} = WordDocidsExtractors::run_extraction(
grenad_parameters,
document_changes,
indexing_context,
&mut extractor_allocs,
Step::ExtractingWords
)?;

// TODO Word Docids Merger
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
let _entered = span.enter();
merge_and_send_docids(
word_docids,
index.word_docids.remap_types(),
index,
extractor_sender.docids::<WordDocids>(),
&indexing_context.must_stop_processing,
&datastore,
IndexingStep::ExtractingDocuments,
)?;
}

// Word Fid Docids Merging
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "documents");
let _entered = span.enter();
merge_and_send_docids(
word_fid_docids,
index.word_fid_docids.remap_types(),
index,
extractor_sender.docids::<WordFidDocids>(),
&indexing_context.must_stop_processing,
)?;
for document_extractor_data in datastore {
let document_extractor_data = document_extractor_data.0.into_inner();
for (field, delta) in document_extractor_data.field_distribution_delta {
let current = field_distribution.entry(field).or_default();
// adding the delta should never cause a negative result, as we are removing fields that previously existed.
*current = current.saturating_add_signed(delta);
}
document_extractor_data.docids_delta.apply_to(document_ids);
}

field_distribution.retain(|_, v| *v != 0);
}

// Exact Word Docids Merging
let facet_field_ids_delta;

{
let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
let _entered = span.enter();
merge_and_send_docids(
exact_word_docids,
index.exact_word_docids.remap_types(),
index,
extractor_sender.docids::<ExactWordDocids>(),
&indexing_context.must_stop_processing,
)?;
}
let caches = {
let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "faceted");
let _entered = span.enter();

// Word Position Docids Merging
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
let _entered = span.enter();
merge_and_send_docids(
word_position_docids,
index.word_position_docids.remap_types(),
index,
extractor_sender.docids::<WordPositionDocids>(),
&indexing_context.must_stop_processing,
)?;
}
FacetedDocidsExtractor::run_extraction(
|
||||
grenad_parameters,
|
||||
document_changes,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
&extractor_sender.field_id_docid_facet_sender(),
|
||||
IndexingStep::ExtractingFacets
|
||||
)?
|
||||
};
|
||||
|
||||
// Fid Word Count Docids Merging
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
fid_word_count_docids,
|
||||
index.field_id_word_count_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<FidWordCountDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted");
|
||||
let _entered = span.enter();
|
||||
|
||||
// run the proximity extraction only if the precision is by word
|
||||
// this works only if the settings didn't change during this transaction.
|
||||
let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default();
|
||||
if proximity_precision == ProximityPrecision::ByWord {
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
|
||||
let caches = <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
|
||||
grenad_parameters,
|
||||
document_changes,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
Step::ExtractingWordProximity,
|
||||
)?;
|
||||
|
||||
merge_and_send_docids(
|
||||
caches,
|
||||
index.word_pair_proximity_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<WordPairProximityDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
'vectors: {
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut index_embeddings = index.embedding_configs(&rtxn)?;
|
||||
if index_embeddings.is_empty() {
|
||||
break 'vectors;
|
||||
}
|
||||
|
||||
let embedding_sender = extractor_sender.embeddings();
|
||||
let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads());
|
||||
let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, Step::ExtractingEmbeddings)?;
|
||||
|
||||
for config in &mut index_embeddings {
|
||||
'data: for data in datastore.iter_mut() {
|
||||
let data = &mut data.get_mut().0;
|
||||
let Some(deladd) = data.remove(&config.name) else { continue 'data; };
|
||||
deladd.apply_to(&mut config.user_provided);
|
||||
facet_field_ids_delta = merge_and_send_facet_docids(
|
||||
caches,
|
||||
FacetDatabases::new(index),
|
||||
index,
|
||||
extractor_sender.facet_docids(),
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
embedding_sender.finish(index_embeddings).unwrap();
|
||||
}
|
||||
{
|
||||
let WordDocidsCaches {
|
||||
word_docids,
|
||||
word_fid_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
fid_word_count_docids,
|
||||
} = {
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
'geo: {
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "geo");
|
||||
let _entered = span.enter();
|
||||
WordDocidsExtractors::run_extraction(
|
||||
grenad_parameters,
|
||||
document_changes,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
IndexingStep::ExtractingWords
|
||||
)?
|
||||
};
|
||||
|
||||
let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else {
|
||||
break 'geo;
|
||||
};
|
||||
let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
extract(
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
&datastore,
|
||||
Step::WritingGeoPoints
|
||||
)?;
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
word_docids,
|
||||
index.word_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<WordDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
merge_and_send_rtree(
|
||||
datastore,
|
||||
&rtxn,
|
||||
index,
|
||||
extractor_sender.geo(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
word_fid_docids,
|
||||
index.word_fid_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<WordFidDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH");
|
||||
let _entered = span.enter();
|
||||
(indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase));
|
||||
}
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
exact_word_docids,
|
||||
index.exact_word_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<ExactWordDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
Result::Ok(facet_field_ids_delta)
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
word_position_docids,
|
||||
index.word_position_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<WordPositionDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
fid_word_count_docids,
|
||||
index.field_id_word_count_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<FidWordCountDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
// run the proximity extraction only if the precision is by word
|
||||
// this works only if the settings didn't change during this transaction.
|
||||
let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default();
|
||||
if proximity_precision == ProximityPrecision::ByWord {
|
||||
let caches = {
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
<WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
|
||||
grenad_parameters,
|
||||
document_changes,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
IndexingStep::ExtractingWordProximity,
|
||||
)?
|
||||
};
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
merge_and_send_docids(
|
||||
caches,
|
||||
index.word_pair_proximity_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<WordPairProximityDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
'vectors: {
|
||||
if index_embeddings.is_empty() {
|
||||
break 'vectors;
|
||||
}
|
||||
|
||||
let embedding_sender = extractor_sender.embeddings();
|
||||
let extractor = EmbeddingExtractor::new(embedders, embedding_sender, field_distribution, request_threads());
|
||||
let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors");
|
||||
let _entered = span.enter();
|
||||
|
||||
extract(
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
&datastore,
|
||||
IndexingStep::ExtractingEmbeddings,
|
||||
)?;
|
||||
}
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors");
|
||||
let _entered = span.enter();
|
||||
|
||||
for config in &mut index_embeddings {
|
||||
'data: for data in datastore.iter_mut() {
|
||||
let data = &mut data.get_mut().0;
|
||||
let Some(deladd) = data.remove(&config.name) else { continue 'data; };
|
||||
deladd.apply_to(&mut config.user_provided);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
'geo: {
|
||||
let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else {
|
||||
break 'geo;
|
||||
};
|
||||
let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "geo");
|
||||
let _entered = span.enter();
|
||||
|
||||
extract(
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
&datastore,
|
||||
IndexingStep::WritingGeoPoints
|
||||
)?;
|
||||
}
|
||||
|
||||
merge_and_send_rtree(
|
||||
datastore,
|
||||
&rtxn,
|
||||
index,
|
||||
extractor_sender.geo(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
indexing_context.progress.update_progress(IndexingStep::WritingToDatabase);
|
||||
finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
Result::Ok((facet_field_ids_delta, index_embeddings))
|
||||
}).unwrap()
|
||||
})?;
|
||||
|
||||
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
|
||||
|
||||
let vector_arroy = index.vector_arroy;
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||
let indexer_span = tracing::Span::current();
|
||||
let arroy_writers: Result<HashMap<_, _>> = embedders
|
||||
.inner_as_ref()
|
||||
|
@ -351,94 +415,116 @@ where
|
|||
})
|
||||
.collect();
|
||||
|
||||
// Used by by the ArroySetVector to copy the embedding into an
|
||||
// aligned memory area, required by arroy to accept a new vector.
|
||||
let mut aligned_embedding = Vec::new();
|
||||
let mut arroy_writers = arroy_writers?;
|
||||
for operation in writer_receiver {
|
||||
match operation {
|
||||
WriterOperation::DbOperation(db_operation) => {
|
||||
let database = db_operation.database(index);
|
||||
match db_operation.entry() {
|
||||
EntryOperation::Delete(e) => {
|
||||
if !database.delete(wtxn, e.entry())? {
|
||||
unreachable!("We tried to delete an unknown key")
|
||||
}
|
||||
}
|
||||
EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?,
|
||||
}
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "all");
|
||||
let _entered = span.enter();
|
||||
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "post_merge");
|
||||
let mut _entered_post_merge = None;
|
||||
|
||||
while let Some(action) = writer_receiver.recv_action() {
|
||||
if _entered_post_merge.is_none()
|
||||
&& finished_extraction.load(std::sync::atomic::Ordering::Relaxed)
|
||||
{
|
||||
_entered_post_merge = Some(span.enter());
|
||||
}
|
||||
WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation {
|
||||
ArroyOperation::DeleteVectors { docid } => {
|
||||
for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in
|
||||
&mut arroy_writers
|
||||
{
|
||||
let dimensions = *dimensions;
|
||||
writer.del_items(wtxn, dimensions, docid)?;
|
||||
|
||||
match action {
|
||||
ReceiverAction::WakeUp => (),
|
||||
ReceiverAction::LargeEntry(LargeEntry { database, key, value }) => {
|
||||
let database_name = database.database_name();
|
||||
let database = database.database(index);
|
||||
if let Err(error) = database.put(wtxn, &key, &value) {
|
||||
return Err(Error::InternalError(InternalError::StorePut {
|
||||
database_name,
|
||||
key: bstr::BString::from(&key[..]),
|
||||
value_length: value.len(),
|
||||
error,
|
||||
}));
|
||||
}
|
||||
}
|
||||
ArroyOperation::SetVectors {
|
||||
docid,
|
||||
embedder_id,
|
||||
embeddings: raw_embeddings,
|
||||
} => {
|
||||
ReceiverAction::LargeVectors(large_vectors) => {
|
||||
let LargeVectors { docid, embedder_id, .. } = large_vectors;
|
||||
let (_, _, writer, dimensions) =
|
||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||
// TODO: switch to Embeddings
|
||||
let mut embeddings = Embeddings::new(*dimensions);
|
||||
for embedding in raw_embeddings {
|
||||
embeddings.append(embedding).unwrap();
|
||||
for embedding in large_vectors.read_embeddings(*dimensions) {
|
||||
embeddings.push(embedding.to_vec()).unwrap();
|
||||
}
|
||||
|
||||
writer.del_items(wtxn, *dimensions, docid)?;
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
}
|
||||
ArroyOperation::SetVector { docid, embedder_id, embedding } => {
|
||||
let (_, _, writer, dimensions) =
|
||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||
writer.del_items(wtxn, *dimensions, docid)?;
|
||||
writer.add_item(wtxn, docid, &embedding)?;
|
||||
}
|
||||
ArroyOperation::Finish { configs } => {
|
||||
let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build");
|
||||
let _entered = span.enter();
|
||||
}
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(
|
||||
Step::WritingEmbeddingsToDatabase,
|
||||
));
|
||||
|
||||
for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in
|
||||
&mut arroy_writers
|
||||
{
|
||||
let dimensions = *dimensions;
|
||||
writer.build_and_quantize(
|
||||
wtxn,
|
||||
&mut rng,
|
||||
dimensions,
|
||||
false,
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
index.put_embedding_configs(wtxn, configs)?;
|
||||
}
|
||||
},
|
||||
// Every time there is a message in the channel we search
// for new entries in the BBQueue buffers.
|
||||
write_from_bbqueue(
|
||||
&mut writer_receiver,
|
||||
index,
|
||||
wtxn,
|
||||
&arroy_writers,
|
||||
&mut aligned_embedding,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Once the extractor/writer channel is closed
|
||||
// we must process the remaining BBQueue messages.
|
||||
write_from_bbqueue(
|
||||
&mut writer_receiver,
|
||||
index,
|
||||
wtxn,
|
||||
&arroy_writers,
|
||||
&mut aligned_embedding,
|
||||
)?;
|
||||
}
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors));
|
||||
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
|
||||
|
||||
let facet_field_ids_delta = extractor_handle.join().unwrap()?;
|
||||
let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?;
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets));
|
||||
'vectors: {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build");
|
||||
let _entered = span.enter();
|
||||
|
||||
if index_embeddings.is_empty() {
|
||||
break 'vectors;
|
||||
}
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase);
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||
for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers {
|
||||
let dimensions = *dimensions;
|
||||
writer.build_and_quantize(
|
||||
wtxn,
|
||||
&mut rng,
|
||||
dimensions,
|
||||
false,
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
index.put_embedding_configs(wtxn, index_embeddings)?;
|
||||
}
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets);
|
||||
if index.facet_search(wtxn)? {
|
||||
compute_facet_search_database(index, wtxn, global_fields_ids_map)?;
|
||||
}
|
||||
|
||||
compute_facet_search_database(index, wtxn, global_fields_ids_map)?;
|
||||
compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(Step::PostProcessingWords));
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::PostProcessingWords);
|
||||
if let Some(prefix_delta) = compute_word_fst(index, wtxn)? {
|
||||
compute_prefix_database(index, wtxn, prefix_delta, grenad_parameters)?;
|
||||
}
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(Step::Finalizing));
|
||||
indexing_context.progress.update_progress(IndexingStep::Finalizing);
|
||||
|
||||
Ok(()) as Result<_>
|
||||
})?;
|
||||
|
@ -464,6 +550,72 @@ where
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// A function dedicated to managing all the available BBQueue frames.
///
/// It reads all the available frames, does the corresponding database operations,
/// and stops when no frames are available.
|
||||
fn write_from_bbqueue(
|
||||
writer_receiver: &mut WriterBbqueueReceiver<'_>,
|
||||
index: &Index,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
arroy_writers: &HashMap<u8, (&str, &crate::vector::Embedder, ArroyWrapper, usize)>,
|
||||
aligned_embedding: &mut Vec<f32>,
|
||||
) -> crate::Result<()> {
|
||||
while let Some(frame_with_header) = writer_receiver.recv_frame() {
|
||||
match frame_with_header.header() {
|
||||
EntryHeader::DbOperation(operation) => {
|
||||
let database_name = operation.database.database_name();
|
||||
let database = operation.database.database(index);
|
||||
let frame = frame_with_header.frame();
|
||||
match operation.key_value(frame) {
|
||||
(key, Some(value)) => {
|
||||
if let Err(error) = database.put(wtxn, key, value) {
|
||||
return Err(Error::InternalError(InternalError::StorePut {
|
||||
database_name,
|
||||
key: key.into(),
|
||||
value_length: value.len(),
|
||||
error,
|
||||
}));
|
||||
}
|
||||
}
|
||||
(key, None) => match database.delete(wtxn, key) {
|
||||
Ok(false) => {
|
||||
unreachable!("We tried to delete an unknown key: {key:?}")
|
||||
}
|
||||
Ok(_) => (),
|
||||
Err(error) => {
|
||||
return Err(Error::InternalError(InternalError::StoreDeletion {
|
||||
database_name,
|
||||
key: key.into(),
|
||||
error,
|
||||
}));
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => {
|
||||
for (_index, (_name, _embedder, writer, dimensions)) in arroy_writers {
|
||||
let dimensions = *dimensions;
|
||||
writer.del_items(wtxn, dimensions, docid)?;
|
||||
}
|
||||
}
|
||||
EntryHeader::ArroySetVectors(asvs) => {
|
||||
let ArroySetVectors { docid, embedder_id, .. } = asvs;
|
||||
let frame = frame_with_header.frame();
|
||||
let (_, _, writer, dimensions) =
|
||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||
let mut embeddings = Embeddings::new(*dimensions);
|
||||
let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding);
|
||||
embeddings.append(all_embeddings.to_vec()).unwrap();
|
||||
writer.del_items(wtxn, *dimensions, docid)?;
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||
fn compute_prefix_database(
|
||||
index: &Index,
|
||||
|
@ -618,7 +770,7 @@ pub fn retrieve_or_guess_primary_key<'a>(
|
|||
index: &Index,
|
||||
new_fields_ids_map: &mut FieldsIdsMap,
|
||||
primary_key_from_op: Option<&'a str>,
|
||||
first_document: Option<RawMap<'a>>,
|
||||
first_document: Option<RawMap<'a, FxBuildHasher>>,
|
||||
) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> {
|
||||
// make sure that we have a declared primary key, either fetching it from the index or attempting to guess it.
|
||||
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
use std::ops::DerefMut;
|
||||
|
||||
use bumparaw_collections::RawMap;
|
||||
use rayon::iter::IndexedParallelIterator;
|
||||
use rustc_hash::FxBuildHasher;
|
||||
use serde_json::value::RawValue;
|
||||
|
||||
use super::document_changes::{DocumentChangeContext, DocumentChanges};
|
||||
|
@ -75,7 +77,7 @@ where
|
|||
self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?;
|
||||
let external_document_id = external_document_id.to_de();
|
||||
|
||||
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
|
||||
let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
|
||||
let insertion = Insertion::create(docid, external_document_id, Versions::single(document));
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
use raw_collections::RawMap;
|
||||
use bumparaw_collections::RawMap;
|
||||
use rayon::iter::IndexedParallelIterator;
|
||||
use rayon::slice::ParallelSlice as _;
|
||||
use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};
|
||||
use roaring::RoaringBitmap;
|
||||
use rustc_hash::FxBuildHasher;
|
||||
|
||||
use super::document_changes::DocumentChangeContext;
|
||||
use super::DocumentChanges;
|
||||
|
@ -160,8 +161,12 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
|
|||
if document_id != new_document_id {
|
||||
Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey))
|
||||
} else {
|
||||
let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
let raw_new_doc = RawMap::from_raw_value_and_hasher(
|
||||
raw_new_doc,
|
||||
FxBuildHasher,
|
||||
doc_alloc,
|
||||
)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
|
||||
Ok(Some(DocumentChange::Update(Update::create(
|
||||
docid,
|
||||
|
|
|
@ -9,8 +9,8 @@ use roaring::RoaringBitmap;
|
|||
|
||||
use super::channel::*;
|
||||
use super::extract::{
|
||||
merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind,
|
||||
GeoExtractorData,
|
||||
merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap,
|
||||
FacetKind, GeoExtractorData,
|
||||
};
|
||||
use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
|
||||
|
||||
|
@ -19,7 +19,7 @@ pub fn merge_and_send_rtree<'extractor, MSP>(
|
|||
datastore: impl IntoIterator<Item = RefCell<GeoExtractorData<'extractor>>>,
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
geo_sender: GeoSender<'_>,
|
||||
geo_sender: GeoSender<'_, '_>,
|
||||
must_stop_processing: &MSP,
|
||||
) -> Result<()>
|
||||
where
|
||||
|
@ -34,7 +34,7 @@ where
|
|||
}
|
||||
|
||||
let mut frozen = data.into_inner().freeze()?;
|
||||
for result in frozen.iter_and_clear_removed() {
|
||||
for result in frozen.iter_and_clear_removed()? {
|
||||
let extracted_geo_point = result?;
|
||||
let removed = rtree.remove(&GeoPoint::from(extracted_geo_point));
|
||||
debug_assert!(removed.is_some());
|
||||
|
@ -42,7 +42,7 @@ where
|
|||
debug_assert!(removed);
|
||||
}
|
||||
|
||||
for result in frozen.iter_and_clear_inserted() {
|
||||
for result in frozen.iter_and_clear_inserted()? {
|
||||
let extracted_geo_point = result?;
|
||||
rtree.insert(GeoPoint::from(extracted_geo_point));
|
||||
let inserted = faceted.insert(extracted_geo_point.docid);
|
||||
|
@ -56,38 +56,37 @@ where
|
|||
|
||||
let rtree_mmap = unsafe { Mmap::map(&file)? };
|
||||
geo_sender.set_rtree(rtree_mmap).unwrap();
|
||||
geo_sender.set_geo_faceted(&faceted).unwrap();
|
||||
geo_sender.set_geo_faceted(&faceted)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
||||
pub fn merge_and_send_docids<'extractor, MSP>(
|
||||
pub fn merge_and_send_docids<'extractor, MSP, D>(
|
||||
mut caches: Vec<BalancedCaches<'extractor>>,
|
||||
database: Database<Bytes, Bytes>,
|
||||
index: &Index,
|
||||
docids_sender: impl DocidsSender + Sync,
|
||||
docids_sender: WordDocidsSender<D>,
|
||||
must_stop_processing: &MSP,
|
||||
) -> Result<()>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
D: DatabaseType + Sync,
|
||||
{
|
||||
transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| {
|
||||
let rtxn = index.read_txn()?;
|
||||
let mut buffer = Vec::new();
|
||||
if must_stop_processing() {
|
||||
return Err(InternalError::AbortedIndexation.into());
|
||||
}
|
||||
merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| {
|
||||
merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| {
|
||||
let current = database.get(&rtxn, key)?;
|
||||
match merge_cbo_bitmaps(current, del, add)? {
|
||||
Operation::Write(bitmap) => {
|
||||
let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer);
|
||||
docids_sender.write(key, value).unwrap();
|
||||
docids_sender.write(key, &bitmap)?;
|
||||
Ok(())
|
||||
}
|
||||
Operation::Delete => {
|
||||
docids_sender.delete(key).unwrap();
|
||||
docids_sender.delete(key)?;
|
||||
Ok(())
|
||||
}
|
||||
Operation::Ignore => Ok(()),
|
||||
|
@ -101,26 +100,24 @@ pub fn merge_and_send_facet_docids<'extractor>(
|
|||
mut caches: Vec<BalancedCaches<'extractor>>,
|
||||
database: FacetDatabases,
|
||||
index: &Index,
|
||||
docids_sender: impl DocidsSender + Sync,
|
||||
docids_sender: FacetDocidsSender,
|
||||
) -> Result<FacetFieldIdsDelta> {
|
||||
transpose_and_freeze_caches(&mut caches)?
|
||||
.into_par_iter()
|
||||
.map(|frozen| {
|
||||
let mut facet_field_ids_delta = FacetFieldIdsDelta::default();
|
||||
let rtxn = index.read_txn()?;
|
||||
let mut buffer = Vec::new();
|
||||
merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| {
|
||||
merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| {
|
||||
let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?;
|
||||
match merge_cbo_bitmaps(current, del, add)? {
|
||||
Operation::Write(bitmap) => {
|
||||
facet_field_ids_delta.register_from_key(key);
|
||||
let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer);
|
||||
docids_sender.write(key, value).unwrap();
|
||||
docids_sender.write(key, &bitmap)?;
|
||||
Ok(())
|
||||
}
|
||||
Operation::Delete => {
|
||||
facet_field_ids_delta.register_from_key(key);
|
||||
docids_sender.delete(key).unwrap();
|
||||
docids_sender.delete(key)?;
|
||||
Ok(())
|
||||
}
|
||||
Operation::Ignore => Ok(()),
|
||||
|
@ -238,8 +235,12 @@ fn merge_cbo_bitmaps(
|
|||
(Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange
(Some(current), None, Some(add)) => Ok(Operation::Write(current | add)),
(Some(current), Some(del), add) => {
debug_assert!(
del.is_subset(&current),
"del is not a subset of current, which must be impossible."
);
let output = match add {
Some(add) => (&current - del) | add,
Some(add) => (&current - (&del - &add)) | (add - del),
None => &current - del,
};
if output.is_empty() {
@ -252,10 +253,3 @@ fn merge_cbo_bitmaps(
}
}

/// TODO Return the slice directly from the serialize_into method
fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec<u8>) -> &'b [u8] {
buffer.clear();
CboRoaringBitmapCodec::serialize_into(bitmap, buffer);
buffer.as_slice()
}
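The arm rewritten above changes how a key's pending deletions and additions combine with the stored bitmap: with the old `(&current - del) | add`, a docid listed in both `del` and `add` always ends up present, while the new `(&current - (&del - &add)) | (add - del)` leaves such a docid in whatever state it currently has. A minimal standalone sketch (an editor illustration, not part of this patch) of the one case where the two expressions differ:

```rust
use roaring::RoaringBitmap;

fn main() {
    // current = {1}, del = {1, 2}, add = {2, 3}: docid 2 is both deleted and
    // re-added by this batch, but is absent from the current bitmap.
    let current: RoaringBitmap = [1u32].into_iter().collect();
    let del: RoaringBitmap = [1u32, 2].into_iter().collect();
    let add: RoaringBitmap = [2u32, 3].into_iter().collect();

    // Old formula: everything in `add` always ends up present.
    let old = (&current - &del) | &add;
    // New formula: a docid in both `del` and `add` keeps its current state.
    let new = (&current - (&del - &add)) | (&add - &del);

    assert_eq!(old, [2u32, 3].into_iter().collect::<RoaringBitmap>());
    assert_eq!(new, [3u32].into_iter().collect::<RoaringBitmap>());
}
```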
|
||||
|
|
|
@ -5,6 +5,7 @@ pub trait RefCellExt<T: ?Sized> {
|
|||
&self,
|
||||
) -> std::result::Result<RefMut<'_, T>, std::cell::BorrowMutError>;
|
||||
|
||||
#[track_caller]
|
||||
fn borrow_mut_or_yield(&self) -> RefMut<'_, T> {
|
||||
self.try_borrow_mut_or_yield().unwrap()
|
||||
}
|
||||
|
|
|
@ -1,8 +1,12 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use enum_iterator::Sequence;
|
||||
|
||||
use crate::progress::Step;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)]
|
||||
#[repr(u16)]
|
||||
pub enum Step {
|
||||
#[repr(u8)]
|
||||
pub enum IndexingStep {
|
||||
PreparingPayloads,
|
||||
ExtractingDocuments,
|
||||
ExtractingFacets,
|
||||
|
@ -11,37 +15,38 @@ pub enum Step {
|
|||
ExtractingEmbeddings,
|
||||
WritingGeoPoints,
|
||||
WritingToDatabase,
|
||||
WritingEmbeddingsToDatabase,
|
||||
WaitingForExtractors,
|
||||
WritingEmbeddingsToDatabase,
|
||||
PostProcessingFacets,
|
||||
PostProcessingWords,
|
||||
Finalizing,
|
||||
}
|
||||
|
||||
impl Step {
|
||||
pub fn name(&self) -> &'static str {
|
||||
impl Step for IndexingStep {
|
||||
fn name(&self) -> Cow<'static, str> {
|
||||
match self {
|
||||
Step::PreparingPayloads => "preparing update file",
|
||||
Step::ExtractingDocuments => "extracting documents",
|
||||
Step::ExtractingFacets => "extracting facets",
|
||||
Step::ExtractingWords => "extracting words",
|
||||
Step::ExtractingWordProximity => "extracting word proximity",
|
||||
Step::ExtractingEmbeddings => "extracting embeddings",
|
||||
Step::WritingGeoPoints => "writing geo points",
|
||||
Step::WritingToDatabase => "writing to database",
|
||||
Step::WritingEmbeddingsToDatabase => "writing embeddings to database",
|
||||
Step::WaitingForExtractors => "waiting for extractors",
|
||||
Step::PostProcessingFacets => "post-processing facets",
|
||||
Step::PostProcessingWords => "post-processing words",
|
||||
Step::Finalizing => "finalizing",
|
||||
IndexingStep::PreparingPayloads => "preparing update file",
|
||||
IndexingStep::ExtractingDocuments => "extracting documents",
|
||||
IndexingStep::ExtractingFacets => "extracting facets",
|
||||
IndexingStep::ExtractingWords => "extracting words",
|
||||
IndexingStep::ExtractingWordProximity => "extracting word proximity",
|
||||
IndexingStep::ExtractingEmbeddings => "extracting embeddings",
|
||||
IndexingStep::WritingGeoPoints => "writing geo points",
|
||||
IndexingStep::WritingToDatabase => "writing to database",
|
||||
IndexingStep::WaitingForExtractors => "waiting for extractors",
|
||||
IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database",
|
||||
IndexingStep::PostProcessingFacets => "post-processing facets",
|
||||
IndexingStep::PostProcessingWords => "post-processing words",
|
||||
IndexingStep::Finalizing => "finalizing",
|
||||
}
|
||||
.into()
|
||||
}
|
||||
|
||||
pub fn finished_steps(self) -> u16 {
|
||||
self as u16
|
||||
fn current(&self) -> u32 {
|
||||
*self as u32
|
||||
}
|
||||
|
||||
pub const fn total_steps() -> u16 {
|
||||
Self::CARDINALITY as u16
|
||||
fn total(&self) -> u32 {
|
||||
Self::CARDINALITY as u32
|
||||
}
|
||||
}
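For reference, the pattern used by `IndexingStep` above — a `#[repr(u8)]` enum whose discriminant doubles as the zero-based step index, with the total taken from the number of variants — can be sketched in isolation. The trait and enum below are hypothetical stand-ins for illustration, not the crate's actual `Step` trait or the `enum_iterator` derive:

```rust
use std::borrow::Cow;

// Hypothetical stand-in for the crate's progress Step trait.
trait ProgressStep {
    fn name(&self) -> Cow<'static, str>;
    fn current(&self) -> u32;
    fn total(&self) -> u32;
}

#[derive(Clone, Copy)]
#[repr(u8)]
enum DemoStep {
    Extracting,
    Writing,
    Finalizing,
}

impl DemoStep {
    // Stand-in for enum_iterator::Sequence::CARDINALITY.
    const CARDINALITY: u32 = 3;
}

impl ProgressStep for DemoStep {
    fn name(&self) -> Cow<'static, str> {
        match self {
            DemoStep::Extracting => "extracting",
            DemoStep::Writing => "writing",
            DemoStep::Finalizing => "finalizing",
        }
        .into()
    }

    fn current(&self) -> u32 {
        // The discriminant is the zero-based position of the step.
        *self as u32
    }

    fn total(&self) -> u32 {
        Self::CARDINALITY
    }
}

fn main() {
    let step = DemoStep::Writing;
    println!("{} ({}/{})", step.name(), step.current() + 1, step.total());
}
```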
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
use std::collections::BTreeSet;
|
||||
|
||||
use bumpalo::Bump;
|
||||
use bumparaw_collections::RawMap;
|
||||
use deserr::{Deserr, IntoValue};
|
||||
use heed::RoTxn;
|
||||
use raw_collections::RawMap;
|
||||
use rustc_hash::FxBuildHasher;
|
||||
use serde::Serialize;
|
||||
use serde_json::value::RawValue;
|
||||
|
||||
|
@ -84,7 +85,7 @@ pub struct VectorDocumentFromDb<'t> {
|
|||
docid: DocumentId,
|
||||
embedding_config: Vec<IndexEmbeddingConfig>,
|
||||
index: &'t Index,
|
||||
vectors_field: Option<RawMap<'t>>,
|
||||
vectors_field: Option<RawMap<'t, FxBuildHasher>>,
|
||||
rtxn: &'t RoTxn<'t>,
|
||||
doc_alloc: &'t Bump,
|
||||
}
|
||||
|
@ -102,9 +103,10 @@ impl<'t> VectorDocumentFromDb<'t> {
|
|||
};
|
||||
let vectors = document.vectors_field()?;
|
||||
let vectors_field = match vectors {
|
||||
Some(vectors) => {
|
||||
Some(RawMap::from_raw_value(vectors, doc_alloc).map_err(InternalError::SerdeJson)?)
|
||||
}
|
||||
Some(vectors) => Some(
|
||||
RawMap::from_raw_value_and_hasher(vectors, FxBuildHasher, doc_alloc)
|
||||
.map_err(InternalError::SerdeJson)?,
|
||||
),
|
||||
None => None,
|
||||
};
|
||||
|
||||
|
@ -220,7 +222,7 @@ fn entry_from_raw_value(
|
|||
|
||||
pub struct VectorDocumentFromVersions<'doc> {
|
||||
external_document_id: &'doc str,
|
||||
vectors: RawMap<'doc>,
|
||||
vectors: RawMap<'doc, FxBuildHasher>,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
}
|
||||
|
||||
|
@ -233,8 +235,8 @@ impl<'doc> VectorDocumentFromVersions<'doc> {
|
|||
) -> Result<Option<Self>> {
|
||||
let document = DocumentFromVersions::new(versions);
|
||||
if let Some(vectors_field) = document.vectors_field()? {
|
||||
let vectors =
|
||||
RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?;
|
||||
let vectors = RawMap::from_raw_value_and_hasher(vectors_field, FxBuildHasher, bump)
|
||||
.map_err(UserError::SerdeJson)?;
|
||||
Ok(Some(Self { external_document_id, vectors, embedders }))
|
||||
} else {
|
||||
Ok(None)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use std::collections::HashSet;
|
||||
use std::collections::BTreeSet;
|
||||
use std::io::BufWriter;
|
||||
|
||||
use fst::{Set, SetBuilder, Streamer};
|
||||
|
@ -75,18 +75,18 @@ pub struct PrefixData {
|
|||
|
||||
#[derive(Debug)]
|
||||
pub struct PrefixDelta {
|
||||
pub modified: HashSet<Prefix>,
|
||||
pub deleted: HashSet<Prefix>,
|
||||
pub modified: BTreeSet<Prefix>,
|
||||
pub deleted: BTreeSet<Prefix>,
|
||||
}
|
||||
|
||||
struct PrefixFstBuilder {
|
||||
prefix_count_threshold: u64,
|
||||
prefix_count_threshold: usize,
|
||||
max_prefix_length: usize,
|
||||
/// TODO: Replace the full memory allocation
|
||||
prefix_fst_builders: Vec<SetBuilder<Vec<u8>>>,
|
||||
current_prefix: Vec<Prefix>,
|
||||
current_prefix_count: Vec<u64>,
|
||||
modified_prefixes: HashSet<Prefix>,
|
||||
current_prefix_count: Vec<usize>,
|
||||
modified_prefixes: BTreeSet<Prefix>,
|
||||
current_prefix_is_modified: Vec<bool>,
|
||||
}
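Several of the structures above swap `HashSet<Prefix>` for `BTreeSet<Prefix>`. The practical difference, and presumably the motivation here, is iteration order: a `BTreeSet` always walks its contents in sorted order, so code that iterates the computed prefixes behaves deterministically across runs. A tiny illustration (an editor sketch, not part of the patch):

```rust
use std::collections::{BTreeSet, HashSet};

fn main() {
    let prefixes = ["tea", "ten", "tap", "the"];

    // HashSet iteration order depends on the hasher state and may differ between runs.
    let hashed: HashSet<&str> = prefixes.into_iter().collect();
    println!("{hashed:?}");

    // BTreeSet always yields the prefixes in sorted order.
    let sorted: BTreeSet<&str> = prefixes.into_iter().collect();
    assert_eq!(sorted.into_iter().collect::<Vec<_>>(), vec!["tap", "tea", "ten", "the"]);
}
```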
|
||||
|
||||
|
@ -95,7 +95,7 @@ impl PrefixFstBuilder {
|
|||
let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } =
|
||||
prefix_settings;
|
||||
|
||||
if !compute_prefixes {
|
||||
if compute_prefixes != crate::index::PrefixSearch::IndexingTime {
|
||||
return None;
|
||||
}
|
||||
|
||||
|
@ -110,7 +110,7 @@ impl PrefixFstBuilder {
|
|||
prefix_fst_builders,
|
||||
current_prefix: vec![Prefix::new(); max_prefix_length],
|
||||
current_prefix_count: vec![0; max_prefix_length],
|
||||
modified_prefixes: HashSet::new(),
|
||||
modified_prefixes: BTreeSet::new(),
|
||||
current_prefix_is_modified: vec![false; max_prefix_length],
|
||||
})
|
||||
}
|
||||
|
@ -180,7 +180,7 @@ impl PrefixFstBuilder {
|
|||
let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? };
|
||||
let new_prefix_fst = Set::new(&prefix_fst_mmap)?;
|
||||
let old_prefix_fst = index.words_prefixes_fst(rtxn)?;
|
||||
let mut deleted_prefixes = HashSet::new();
|
||||
let mut deleted_prefixes = BTreeSet::new();
|
||||
{
|
||||
let mut deleted_prefixes_stream = old_prefix_fst.op().add(&new_prefix_fst).difference();
|
||||
while let Some(prefix) = deleted_prefixes_stream.next() {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
use std::cell::RefCell;
|
||||
use std::collections::HashSet;
|
||||
use std::collections::BTreeSet;
|
||||
use std::io::{BufReader, BufWriter, Read, Seek, Write};
|
||||
|
||||
use hashbrown::HashMap;
|
||||
|
@ -37,8 +37,8 @@ impl WordPrefixDocids {
|
|||
fn execute(
|
||||
self,
|
||||
wtxn: &mut heed::RwTxn,
|
||||
prefix_to_compute: &HashSet<Prefix>,
|
||||
prefix_to_delete: &HashSet<Prefix>,
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
) -> Result<()> {
|
||||
delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?;
|
||||
self.recompute_modified_prefixes(wtxn, prefix_to_compute)
|
||||
|
@ -48,7 +48,7 @@ impl WordPrefixDocids {
|
|||
fn recompute_modified_prefixes(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
prefixes: &HashSet<Prefix>,
|
||||
prefixes: &BTreeSet<Prefix>,
|
||||
) -> Result<()> {
|
||||
// We fetch the docids associated with the newly added word prefix fst only.
// And collect the CboRoaringBitmaps pointers in a HashMap.
|
||||
|
@ -76,7 +76,7 @@ impl WordPrefixDocids {
|
|||
.union()?;
|
||||
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&output, buffer);
|
||||
CboRoaringBitmapCodec::serialize_into_vec(&output, buffer);
|
||||
index.push(PrefixEntry { prefix, serialized_length: buffer.len() });
|
||||
file.write_all(buffer)
|
||||
})?;
|
||||
|
@ -127,7 +127,7 @@ impl<'a, 'rtxn> FrozenPrefixBitmaps<'a, 'rtxn> {
|
|||
pub fn from_prefixes(
|
||||
database: Database<Bytes, CboRoaringBitmapCodec>,
|
||||
rtxn: &'rtxn RoTxn,
|
||||
prefixes: &'a HashSet<Prefix>,
|
||||
prefixes: &'a BTreeSet<Prefix>,
|
||||
) -> heed::Result<Self> {
|
||||
let database = database.remap_data_type::<Bytes>();
|
||||
|
||||
|
@ -173,8 +173,8 @@ impl WordPrefixIntegerDocids {
|
|||
fn execute(
|
||||
self,
|
||||
wtxn: &mut heed::RwTxn,
|
||||
prefix_to_compute: &HashSet<Prefix>,
|
||||
prefix_to_delete: &HashSet<Prefix>,
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
) -> Result<()> {
|
||||
delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?;
|
||||
self.recompute_modified_prefixes(wtxn, prefix_to_compute)
|
||||
|
@ -184,7 +184,7 @@ impl WordPrefixIntegerDocids {
|
|||
fn recompute_modified_prefixes(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
prefixes: &HashSet<Prefix>,
|
||||
prefixes: &BTreeSet<Prefix>,
|
||||
) -> Result<()> {
|
||||
// We fetch the docids associated with the newly added word prefix fst only.
// And collect the CboRoaringBitmaps pointers in a HashMap.
|
||||
|
@ -211,7 +211,7 @@ impl WordPrefixIntegerDocids {
|
|||
.union()?;
|
||||
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&output, buffer);
|
||||
CboRoaringBitmapCodec::serialize_into_vec(&output, buffer);
|
||||
index.push(PrefixIntegerEntry { prefix, pos, serialized_length: buffer.len() });
|
||||
file.write_all(buffer)?;
|
||||
}
|
||||
|
@ -262,7 +262,7 @@ impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> {
|
|||
pub fn from_prefixes(
|
||||
database: Database<Bytes, CboRoaringBitmapCodec>,
|
||||
rtxn: &'rtxn RoTxn,
|
||||
prefixes: &'a HashSet<Prefix>,
|
||||
prefixes: &'a BTreeSet<Prefix>,
|
||||
) -> heed::Result<Self> {
|
||||
let database = database.remap_data_type::<Bytes>();
|
||||
|
||||
|
@ -291,7 +291,7 @@ unsafe impl<'a, 'rtxn> Sync for FrozenPrefixIntegerBitmaps<'a, 'rtxn> {}
|
|||
fn delete_prefixes(
|
||||
wtxn: &mut RwTxn,
|
||||
prefix_database: &Database<Bytes, CboRoaringBitmapCodec>,
|
||||
prefixes: &HashSet<Prefix>,
|
||||
prefixes: &BTreeSet<Prefix>,
|
||||
) -> Result<()> {
|
||||
// We remove all the entries that are no more required in this word prefix docids database.
|
||||
for prefix in prefixes {
|
||||
|
@ -309,8 +309,8 @@ fn delete_prefixes(
|
|||
pub fn compute_word_prefix_docids(
|
||||
wtxn: &mut RwTxn,
|
||||
index: &Index,
|
||||
prefix_to_compute: &HashSet<Prefix>,
|
||||
prefix_to_delete: &HashSet<Prefix>,
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
grenad_parameters: GrenadParameters,
|
||||
) -> Result<()> {
|
||||
WordPrefixDocids::new(
|
||||
|
@ -325,8 +325,8 @@ pub fn compute_word_prefix_docids(
|
|||
pub fn compute_exact_word_prefix_docids(
|
||||
wtxn: &mut RwTxn,
|
||||
index: &Index,
|
||||
prefix_to_compute: &HashSet<Prefix>,
|
||||
prefix_to_delete: &HashSet<Prefix>,
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
grenad_parameters: GrenadParameters,
|
||||
) -> Result<()> {
|
||||
WordPrefixDocids::new(
|
||||
|
@ -341,8 +341,8 @@ pub fn compute_exact_word_prefix_docids(
|
|||
pub fn compute_word_prefix_fid_docids(
|
||||
wtxn: &mut RwTxn,
|
||||
index: &Index,
|
||||
prefix_to_compute: &HashSet<Prefix>,
|
||||
prefix_to_delete: &HashSet<Prefix>,
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
grenad_parameters: GrenadParameters,
|
||||
) -> Result<()> {
|
||||
WordPrefixIntegerDocids::new(
|
||||
|
@ -357,8 +357,8 @@ pub fn compute_word_prefix_fid_docids(
|
|||
pub fn compute_word_prefix_position_docids(
|
||||
wtxn: &mut RwTxn,
|
||||
index: &Index,
|
||||
prefix_to_compute: &HashSet<Prefix>,
|
||||
prefix_to_delete: &HashSet<Prefix>,
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
grenad_parameters: GrenadParameters,
|
||||
) -> Result<()> {
|
||||
WordPrefixIntegerDocids::new(
|
||||
|
|
|
@ -17,7 +17,8 @@ use super::IndexerConfig;
|
|||
use crate::criterion::Criterion;
|
||||
use crate::error::UserError;
|
||||
use crate::index::{
|
||||
IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
|
||||
IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO,
|
||||
DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
|
||||
};
|
||||
use crate::order_by_map::OrderByMap;
|
||||
use crate::prompt::default_max_bytes;
|
||||
|
@ -177,6 +178,8 @@ pub struct Settings<'a, 't, 'i> {
|
|||
embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>,
|
||||
search_cutoff: Setting<u64>,
|
||||
localized_attributes_rules: Setting<Vec<LocalizedAttributesRule>>,
|
||||
prefix_search: Setting<PrefixSearch>,
|
||||
facet_search: Setting<bool>,
|
||||
}
|
||||
|
||||
impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
|
@ -212,6 +215,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
embedder_settings: Setting::NotSet,
|
||||
search_cutoff: Setting::NotSet,
|
||||
localized_attributes_rules: Setting::NotSet,
|
||||
prefix_search: Setting::NotSet,
|
||||
facet_search: Setting::NotSet,
|
||||
indexer_config,
|
||||
}
|
||||
}
|
||||
|
@ -418,6 +423,22 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
self.localized_attributes_rules = Setting::Reset;
|
||||
}
|
||||
|
||||
pub fn set_prefix_search(&mut self, value: PrefixSearch) {
|
||||
self.prefix_search = Setting::Set(value);
|
||||
}
|
||||
|
||||
pub fn reset_prefix_search(&mut self) {
|
||||
self.prefix_search = Setting::Reset;
|
||||
}
|
||||
|
||||
pub fn set_facet_search(&mut self, value: bool) {
|
||||
self.facet_search = Setting::Set(value);
|
||||
}
|
||||
|
||||
pub fn reset_facet_search(&mut self) {
|
||||
self.facet_search = Setting::Reset;
|
||||
}
|
||||
|
||||
#[tracing::instrument(
|
||||
level = "trace"
|
||||
skip(self, progress_callback, should_abort, settings_diff),
|
||||
|
@ -944,7 +965,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
false
|
||||
} else {
|
||||
self.index.put_proximity_precision(self.wtxn, new)?;
|
||||
true
|
||||
old.is_some() || new != ProximityPrecision::default()
|
||||
}
|
||||
}
|
||||
Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?,
|
||||
|
@ -954,6 +975,42 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
Ok(changed)
|
||||
}
|
||||
|
||||
fn update_prefix_search(&mut self) -> Result<bool> {
|
||||
let changed = match self.prefix_search {
|
||||
Setting::Set(new) => {
|
||||
let old = self.index.prefix_search(self.wtxn)?;
|
||||
if old == Some(new) {
|
||||
false
|
||||
} else {
|
||||
self.index.put_prefix_search(self.wtxn, new)?;
|
||||
old.is_some() || new != PrefixSearch::default()
|
||||
}
|
||||
}
|
||||
Setting::Reset => self.index.delete_prefix_search(self.wtxn)?,
|
||||
Setting::NotSet => false,
|
||||
};
|
||||
|
||||
Ok(changed)
|
||||
}
|
||||
|
||||
fn update_facet_search(&mut self) -> Result<bool> {
|
||||
let changed = match self.facet_search {
|
||||
Setting::Set(new) => {
|
||||
let old = self.index.facet_search(self.wtxn)?;
|
||||
if old == new {
|
||||
false
|
||||
} else {
|
||||
self.index.put_facet_search(self.wtxn, new)?;
|
||||
true
|
||||
}
|
||||
}
|
||||
Setting::Reset => self.index.delete_facet_search(self.wtxn)?,
|
||||
Setting::NotSet => false,
|
||||
};
|
||||
|
||||
Ok(changed)
|
||||
}
|
||||
|
||||
fn update_embedding_configs(&mut self) -> Result<BTreeMap<String, EmbedderAction>> {
|
||||
match std::mem::take(&mut self.embedder_settings) {
|
||||
Setting::Set(configs) => self.update_embedding_configs_set(configs),
|
||||
|
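The `update_prefix_search` and `update_facet_search` methods above follow the same three-state pattern: a setting is either explicitly set, explicitly reset to its default, or left untouched, and the method reports whether the stored value actually changed. A self-contained sketch of that pattern (the `Setting` enum here is a simplified stand-in, not the crate's type):

```rust
// Simplified stand-in for the crate's Setting<T>, for illustration only.
enum Setting<T> {
    Set(T),
    Reset,
    NotSet,
}

/// Applies a three-state setting to a stored value and reports whether it changed.
fn apply_setting<T: PartialEq + Default>(setting: Setting<T>, stored: &mut Option<T>) -> bool {
    match setting {
        Setting::Set(new) => {
            if stored.as_ref() == Some(&new) {
                false
            } else {
                // Mirrors update_prefix_search: only report a change when a value was
                // already stored or the new value differs from the default.
                let changed = stored.is_some() || new != T::default();
                *stored = Some(new);
                changed
            }
        }
        Setting::Reset => stored.take().is_some(),
        Setting::NotSet => false,
    }
}

fn main() {
    let mut facet_search: Option<bool> = None;
    assert!(apply_setting(Setting::Set(true), &mut facet_search)); // true != bool::default()
    assert!(!apply_setting(Setting::Set(true), &mut facet_search)); // unchanged
    assert!(!apply_setting(Setting::NotSet, &mut facet_search)); // untouched
    assert!(apply_setting(Setting::Reset, &mut facet_search)); // removed
}
```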
@ -1203,6 +1260,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
self.update_searchable()?;
|
||||
self.update_exact_attributes()?;
|
||||
self.update_proximity_precision()?;
|
||||
self.update_prefix_search()?;
|
||||
self.update_facet_search()?;
|
||||
self.update_localized_attributes_rules()?;
|
||||
|
||||
let embedding_config_updates = self.update_embedding_configs()?;
|
||||
|
@ -1282,6 +1341,7 @@ impl InnerIndexSettingsDiff {
|
|||
|| old_settings.allowed_separators != new_settings.allowed_separators
|
||||
|| old_settings.dictionary != new_settings.dictionary
|
||||
|| old_settings.proximity_precision != new_settings.proximity_precision
|
||||
|| old_settings.prefix_search != new_settings.prefix_search
|
||||
|| old_settings.localized_searchable_fields_ids
|
||||
!= new_settings.localized_searchable_fields_ids
|
||||
};
|
||||
|
@ -1372,7 +1432,7 @@ impl InnerIndexSettingsDiff {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn reindex_facets(&self) -> bool {
|
||||
pub fn facet_fids_changed(&self) -> bool {
|
||||
let existing_fields = &self.new.existing_fields;
|
||||
if existing_fields.iter().any(|field| field.contains('.')) {
|
||||
return true;
|
||||
|
@ -1392,7 +1452,15 @@ impl InnerIndexSettingsDiff {
|
|||
}
|
||||
|
||||
(existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields)
|
||||
|| self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids
|
||||
}
|
||||
|
||||
pub fn global_facet_settings_changed(&self) -> bool {
|
||||
self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids
|
||||
|| self.old.facet_search != self.new.facet_search
|
||||
}
|
||||
|
||||
pub fn reindex_facets(&self) -> bool {
|
||||
self.facet_fids_changed() || self.global_facet_settings_changed()
|
||||
}
|
||||
|
||||
pub fn reindex_vectors(&self) -> bool {
|
||||
|
@ -1432,6 +1500,8 @@ pub(crate) struct InnerIndexSettings {
|
|||
pub non_faceted_fields_ids: Vec<FieldId>,
|
||||
pub localized_searchable_fields_ids: LocalizedFieldIds,
|
||||
pub localized_faceted_fields_ids: LocalizedFieldIds,
|
||||
pub prefix_search: PrefixSearch,
|
||||
pub facet_search: bool,
|
||||
}
|
||||
|
||||
impl InnerIndexSettings {
|
||||
|
@ -1457,6 +1527,8 @@ impl InnerIndexSettings {
|
|||
Some(embedding_configs) => embedding_configs,
|
||||
None => embedders(index.embedding_configs(rtxn)?)?,
|
||||
};
|
||||
let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default();
|
||||
let facet_search = index.facet_search(rtxn)?;
|
||||
let existing_fields: HashSet<_> = index
|
||||
.field_distribution(rtxn)?
|
||||
.into_iter()
|
||||
|
@ -1514,6 +1586,8 @@ impl InnerIndexSettings {
|
|||
non_faceted_fields_ids: vectors_fids.clone(),
|
||||
localized_searchable_fields_ids,
|
||||
localized_faceted_fields_ids,
|
||||
prefix_search,
|
||||
facet_search,
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -2721,6 +2795,8 @@ mod tests {
|
|||
embedder_settings,
|
||||
search_cutoff,
|
||||
localized_attributes_rules,
|
||||
prefix_search,
|
||||
facet_search,
|
||||
} = settings;
|
||||
assert!(matches!(searchable_fields, Setting::NotSet));
|
||||
assert!(matches!(displayed_fields, Setting::NotSet));
|
||||
|
@ -2746,6 +2822,8 @@ mod tests {
|
|||
assert!(matches!(embedder_settings, Setting::NotSet));
|
||||
assert!(matches!(search_cutoff, Setting::NotSet));
|
||||
assert!(matches!(localized_attributes_rules, Setting::NotSet));
|
||||
assert!(matches!(prefix_search, Setting::NotSet));
|
||||
assert!(matches!(facet_search, Setting::NotSet));
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ use crate::{Index, Result, SmallString32};
|
|||
pub struct WordsPrefixesFst<'t, 'i> {
|
||||
wtxn: &'t mut RwTxn<'i>,
|
||||
index: &'i Index,
|
||||
threshold: u32,
|
||||
threshold: usize,
|
||||
max_prefix_length: usize,
|
||||
}
|
||||
|
||||
|
@ -24,8 +24,8 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> {
|
|||
///
|
||||
/// Default value is 100. This value must be higher than 50 and will be clamped
|
||||
/// to this bound otherwise.
|
||||
pub fn threshold(&mut self, value: u32) -> &mut Self {
|
||||
self.threshold = value.max(50);
|
||||
pub fn threshold(&mut self, value: usize) -> &mut Self {
|
||||
self.threshold = value;
|
||||
self
|
||||
}
|
||||
|
||||
|
@ -34,7 +34,7 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> {
|
|||
/// Default value is `4` bytes. This value must be between 1 and 25 and will be clamped
/// to these bounds otherwise.
|
||||
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
||||
self.max_prefix_length = value.clamp(1, 25);
|
||||
self.max_prefix_length = value;
|
||||
self
|
||||
}
|
||||
|
||||
|
|
|
@ -475,7 +475,7 @@ impl<F> Embeddings<F> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Append a flat vector of embeddings a the end of the embeddings.
|
||||
/// Append a flat vector of embeddings at the end of the embeddings.
|
||||
///
|
||||
/// If `embeddings.len() % self.dimension != 0`, then the append operation fails.
|
||||
pub fn append(&mut self, mut embeddings: Vec<F>) -> Result<(), Vec<F>> {