mirror of https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 04:17:10 +02:00

Merge pull request #5509 from meilisearch/release-v1.14.0-tmp

Bring back changes from v1.14.0 to main

commit a500fa053c

43 changed files with 1047 additions and 508 deletions
@@ -1,8 +1,13 @@
-use heed::types::Bytes;
+use std::mem;
+
 use heed::Database;
+use heed::DatabaseStat;
 use heed::RoTxn;
+use heed::Unspecified;
 use serde::{Deserialize, Serialize};
 
+use crate::BEU32;
+
 #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
 #[serde(rename_all = "camelCase")]
 /// The stats of a database.
@@ -20,58 +25,24 @@ impl DatabaseStats {
     ///
     /// This function iterates over the whole database and computes the stats.
     /// It is not efficient and should be cached somewhere.
-    pub(crate) fn new(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'_>) -> heed::Result<Self> {
-        let mut database_stats =
-            Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 };
-
-        let mut iter = database.iter(rtxn)?;
-        while let Some((key, value)) = iter.next().transpose()? {
-            let key_size = key.len() as u64;
-            let value_size = value.len() as u64;
-            database_stats.total_key_size += key_size;
-            database_stats.total_value_size += value_size;
-        }
-
-        database_stats.number_of_entries = database.len(rtxn)?;
-
-        Ok(database_stats)
-    }
-
-    /// Recomputes the stats of the database and returns the new stats.
-    ///
-    /// This function is used to update the stats of the database when some keys are modified.
-    /// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states.
-    pub(crate) fn recompute<I, K>(
-        mut stats: Self,
-        database: Database<Bytes, Bytes>,
-        before_rtxn: &RoTxn<'_>,
-        after_rtxn: &RoTxn<'_>,
-        modified_keys: I,
-    ) -> heed::Result<Self>
-    where
-        I: IntoIterator<Item = K>,
-        K: AsRef<[u8]>,
-    {
-        for key in modified_keys {
-            let key = key.as_ref();
-            if let Some(value) = database.get(after_rtxn, key)? {
-                let key_size = key.len() as u64;
-                let value_size = value.len() as u64;
-                stats.total_key_size = stats.total_key_size.saturating_add(key_size);
-                stats.total_value_size = stats.total_value_size.saturating_add(value_size);
-            }
-
-            if let Some(value) = database.get(before_rtxn, key)? {
-                let key_size = key.len() as u64;
-                let value_size = value.len() as u64;
-                stats.total_key_size = stats.total_key_size.saturating_sub(key_size);
-                stats.total_value_size = stats.total_value_size.saturating_sub(value_size);
-            }
-        }
-
-        stats.number_of_entries = database.len(after_rtxn)?;
-
-        Ok(stats)
+    pub(crate) fn new(
+        database: Database<BEU32, Unspecified>,
+        rtxn: &RoTxn<'_>,
+    ) -> heed::Result<Self> {
+        let DatabaseStat { page_size, depth: _, branch_pages, leaf_pages, overflow_pages, entries } =
+            database.stat(rtxn)?;
+
+        // We first take the total size without overflow pages as the overflow pages contains the values and only that.
+        let total_size = (branch_pages + leaf_pages + overflow_pages) * page_size as usize;
+        // We compute an estimated size for the keys.
+        let total_key_size = entries * (mem::size_of::<u32>() + 4);
+        let total_value_size = total_size - total_key_size;
+
+        Ok(Self {
+            number_of_entries: entries as u64,
+            total_key_size: total_key_size as u64,
+            total_value_size: total_value_size as u64,
+        })
     }
 
     pub fn average_key_size(&self) -> u64 {
@@ -86,6 +57,10 @@ impl DatabaseStats {
         self.number_of_entries
     }
 
+    pub fn total_size(&self) -> u64 {
+        self.total_key_size + self.total_value_size
+    }
+
    pub fn total_key_size(&self) -> u64 {
        self.total_key_size
    }
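
For context on the hunks above: DatabaseStats::new no longer iterates over every entry; it now estimates sizes from LMDB page counts. A standalone sketch of that arithmetic (plain integers stand in for heed's DatabaseStat fields; the 4-byte per-entry key overhead mirrors the `mem::size_of::<u32>() + 4` term in the hunk):

    // Sketch of the page-based estimation: keys are big-endian u32 docids
    // (4 bytes) plus an assumed ~4 bytes of per-entry overhead; values are
    // credited with whatever the counted pages leave over.
    fn estimated_sizes(
        page_size: u32,
        branch_pages: usize,
        leaf_pages: usize,
        overflow_pages: usize,
        entries: usize,
    ) -> (u64, u64) {
        let total_size = (branch_pages + leaf_pages + overflow_pages) * page_size as usize;
        let total_key_size = entries * (std::mem::size_of::<u32>() + 4);
        let total_value_size = total_size.saturating_sub(total_key_size);
        (total_key_size as u64, total_value_size as u64)
    }

    fn main() {
        // 4 KiB pages, 2 branch + 100 leaf + 10 overflow pages, 5_000 entries.
        let (keys, values) = estimated_sizes(4096, 2, 100, 10, 5_000);
        println!("keys: {keys} B, values: {values} B");
    }

The sketch uses saturating_sub where the hunk subtracts directly; that is a defensive choice for the example only.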
@@ -154,6 +154,14 @@ and can not be more than 511 bytes.", .document_id.to_string()
     InvalidGeoField(#[from] Box<GeoError>),
     #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
     InvalidVectorDimensions { expected: usize, found: usize },
+    #[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
+    InvalidIndexingVectorDimensions {
+        embedder_name: String,
+        document_id: String,
+        embedding_index: usize,
+        expected: usize,
+        found: usize,
+    },
     #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
     InvalidVectorsMapType { document_id: String, value: Value },
     #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
@@ -3,8 +3,9 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fs::File;
 use std::path::Path;
 
-use heed::{types::*, WithoutTls};
+use heed::{types::*, DatabaseStat, WithoutTls};
 use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
+use indexmap::IndexMap;
 use roaring::RoaringBitmap;
 use rstar::RTree;
 use serde::{Deserialize, Serialize};
@@ -410,38 +411,6 @@ impl Index {
         Ok(count.unwrap_or_default())
     }
 
-    /// Updates the stats of the documents database based on the previous stats and the modified docids.
-    pub fn update_documents_stats(
-        &self,
-        wtxn: &mut RwTxn<'_>,
-        modified_docids: roaring::RoaringBitmap,
-    ) -> Result<()> {
-        let before_rtxn = self.read_txn()?;
-        let document_stats = match self.documents_stats(&before_rtxn)? {
-            Some(before_stats) => DatabaseStats::recompute(
-                before_stats,
-                self.documents.remap_types(),
-                &before_rtxn,
-                wtxn,
-                modified_docids.iter().map(|docid| docid.to_be_bytes()),
-            )?,
-            None => {
-                // This should never happen when there are already documents in the index, the documents stats should be present.
-                // If it happens, it means that the index was not properly initialized/upgraded.
-                debug_assert_eq!(
-                    self.documents.len(&before_rtxn)?,
-                    0,
-                    "The documents stats should be present when there are documents in the index"
-                );
-                tracing::warn!("No documents stats found, creating new ones");
-                DatabaseStats::new(self.documents.remap_types(), &*wtxn)?
-            }
-        };
-
-        self.put_documents_stats(wtxn, document_stats)?;
-        Ok(())
-    }
-
     /// Writes the stats of the documents database.
     pub fn put_documents_stats(
         &self,
@@ -1755,6 +1724,122 @@ impl Index {
         }
         Ok(stats)
     }
+
+    /// Check if the word is indexed in the index.
+    ///
+    /// This function checks if the word is indexed in the index by looking at the word_docids and exact_word_docids.
+    ///
+    /// # Arguments
+    ///
+    /// * `rtxn`: The read transaction.
+    /// * `word`: The word to check.
+    pub fn contains_word(&self, rtxn: &RoTxn<'_>, word: &str) -> Result<bool> {
+        Ok(self.word_docids.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some()
+            || self.exact_word_docids.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some())
+    }
+
+    /// Returns the sizes in bytes of each of the index database at the given rtxn.
+    pub fn database_sizes(&self, rtxn: &RoTxn<'_>) -> heed::Result<IndexMap<&'static str, usize>> {
+        let Self {
+            env: _,
+            main,
+            external_documents_ids,
+            word_docids,
+            exact_word_docids,
+            word_prefix_docids,
+            exact_word_prefix_docids,
+            word_pair_proximity_docids,
+            word_position_docids,
+            word_fid_docids,
+            word_prefix_position_docids,
+            word_prefix_fid_docids,
+            field_id_word_count_docids,
+            facet_id_f64_docids,
+            facet_id_string_docids,
+            facet_id_normalized_string_strings,
+            facet_id_string_fst,
+            facet_id_exists_docids,
+            facet_id_is_null_docids,
+            facet_id_is_empty_docids,
+            field_id_docid_facet_f64s,
+            field_id_docid_facet_strings,
+            vector_arroy,
+            embedder_category_id,
+            documents,
+        } = self;
+
+        fn compute_size(stats: DatabaseStat) -> usize {
+            let DatabaseStat {
+                page_size,
+                depth: _,
+                branch_pages,
+                leaf_pages,
+                overflow_pages,
+                entries: _,
+            } = stats;
+
+            (branch_pages + leaf_pages + overflow_pages) * page_size as usize
+        }
+
+        let mut sizes = IndexMap::new();
+        sizes.insert("main", main.stat(rtxn).map(compute_size)?);
+        sizes
+            .insert("external_documents_ids", external_documents_ids.stat(rtxn).map(compute_size)?);
+        sizes.insert("word_docids", word_docids.stat(rtxn).map(compute_size)?);
+        sizes.insert("exact_word_docids", exact_word_docids.stat(rtxn).map(compute_size)?);
+        sizes.insert("word_prefix_docids", word_prefix_docids.stat(rtxn).map(compute_size)?);
+        sizes.insert(
+            "exact_word_prefix_docids",
+            exact_word_prefix_docids.stat(rtxn).map(compute_size)?,
+        );
+        sizes.insert(
+            "word_pair_proximity_docids",
+            word_pair_proximity_docids.stat(rtxn).map(compute_size)?,
+        );
+        sizes.insert("word_position_docids", word_position_docids.stat(rtxn).map(compute_size)?);
+        sizes.insert("word_fid_docids", word_fid_docids.stat(rtxn).map(compute_size)?);
+        sizes.insert(
+            "word_prefix_position_docids",
+            word_prefix_position_docids.stat(rtxn).map(compute_size)?,
+        );
+        sizes
+            .insert("word_prefix_fid_docids", word_prefix_fid_docids.stat(rtxn).map(compute_size)?);
+        sizes.insert(
+            "field_id_word_count_docids",
+            field_id_word_count_docids.stat(rtxn).map(compute_size)?,
+        );
+        sizes.insert("facet_id_f64_docids", facet_id_f64_docids.stat(rtxn).map(compute_size)?);
+        sizes
+            .insert("facet_id_string_docids", facet_id_string_docids.stat(rtxn).map(compute_size)?);
+        sizes.insert(
+            "facet_id_normalized_string_strings",
+            facet_id_normalized_string_strings.stat(rtxn).map(compute_size)?,
+        );
+        sizes.insert("facet_id_string_fst", facet_id_string_fst.stat(rtxn).map(compute_size)?);
+        sizes
+            .insert("facet_id_exists_docids", facet_id_exists_docids.stat(rtxn).map(compute_size)?);
+        sizes.insert(
+            "facet_id_is_null_docids",
+            facet_id_is_null_docids.stat(rtxn).map(compute_size)?,
+        );
+        sizes.insert(
+            "facet_id_is_empty_docids",
+            facet_id_is_empty_docids.stat(rtxn).map(compute_size)?,
+        );
+        sizes.insert(
+            "field_id_docid_facet_f64s",
+            field_id_docid_facet_f64s.stat(rtxn).map(compute_size)?,
+        );
+        sizes.insert(
+            "field_id_docid_facet_strings",
+            field_id_docid_facet_strings.stat(rtxn).map(compute_size)?,
+        );
+        sizes.insert("vector_arroy", vector_arroy.stat(rtxn).map(compute_size)?);
+        sizes.insert("embedder_category_id", embedder_category_id.stat(rtxn).map(compute_size)?);
+        sizes.insert("documents", documents.stat(rtxn).map(compute_size)?);
+
+        Ok(sizes)
+    }
 }
 
 #[derive(Debug, Deserialize, Serialize)]
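
A hypothetical caller sketch for the new database_sizes method (the names and numbers below are made up; only the IndexMap<&'static str, usize> shape comes from the hunk above). IndexMap preserves insertion order, so a report keeps the declaration order of the databases:

    use indexmap::IndexMap;

    fn main() {
        // Stand-in for Index::database_sizes output: database name -> size in bytes.
        let mut sizes: IndexMap<&'static str, usize> = IndexMap::new();
        sizes.insert("main", 16_384);
        sizes.insert("word_docids", 1 << 20);
        sizes.insert("documents", 8 << 20);

        for (name, size) in &sizes {
            println!("{name:>12}: {size} B");
        }
    }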
@@ -190,8 +190,18 @@ macro_rules! make_atomic_progress {
     };
 }
 
-make_atomic_progress!(Document alias AtomicDocumentStep => "document" );
-make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" );
+make_atomic_progress!(Document alias AtomicDocumentStep => "document");
+make_atomic_progress!(Payload alias AtomicPayloadStep => "payload");
 
+make_enum_progress! {
+    pub enum MergingWordCache {
+        WordDocids,
+        WordFieldIdDocids,
+        ExactWordDocids,
+        WordPositionDocids,
+        FieldIdWordCountDocids,
+    }
+}
+
 #[derive(Debug, Serialize, Clone, ToSchema)]
 #[serde(rename_all = "camelCase")]

@@ -173,16 +173,18 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
             ranking_rule_scores.push(ScoreDetails::Skipped);
 
             // remove candidates from the universe without adding them to result if their score is below the threshold
-            if let Some(ranking_score_threshold) = ranking_score_threshold {
-                let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
-                if current_score < ranking_score_threshold {
-                    all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
-                    back!();
-                    continue;
-                }
-            }
+            let is_below_threshold =
+                ranking_score_threshold.is_some_and(|ranking_score_threshold| {
+                    let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
+                    current_score < ranking_score_threshold
+                });
 
-            maybe_add_to_results!(bucket);
+            if is_below_threshold {
+                all_candidates -= &bucket;
+                all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
+            } else {
+                maybe_add_to_results!(bucket);
+            }
 
             ranking_rule_scores.pop();
 
@@ -237,23 +239,24 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
         );
 
         // remove candidates from the universe without adding them to result if their score is below the threshold
-        if let Some(ranking_score_threshold) = ranking_score_threshold {
+        let is_below_threshold = ranking_score_threshold.is_some_and(|ranking_score_threshold| {
             let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
-            if current_score < ranking_score_threshold {
-                all_candidates -=
-                    next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
-                back!();
-                continue;
-            }
-        }
+            current_score < ranking_score_threshold
+        });
 
         ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
 
         if cur_ranking_rule_index == ranking_rules_len - 1
             || (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1)
             || cur_offset + (next_bucket.candidates.len() as usize) < from
+            || is_below_threshold
         {
-            maybe_add_to_results!(next_bucket.candidates);
+            if is_below_threshold {
+                all_candidates -= &next_bucket.candidates;
+                all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
+            } else {
+                maybe_add_to_results!(next_bucket.candidates);
+            }
            ranking_rule_scores.pop();
            continue;
        }
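
The behavioral change above: a bucket whose global score falls below rankingScoreThreshold is now subtracted from the candidate set instead of triggering back!() and a retry, and it is never pushed to the results. A self-contained model of that control flow (plain Vec<u32> stands in for the roaring bitmaps):

    // Buckets below the threshold are dropped from the candidate set;
    // everything else is appended to the results as before.
    fn place_bucket(
        bucket: Vec<u32>,
        score: f64,
        threshold: Option<f64>,
        results: &mut Vec<u32>,
        all_candidates: &mut Vec<u32>,
    ) {
        let is_below_threshold = threshold.is_some_and(|t| score < t);
        if is_below_threshold {
            all_candidates.retain(|id| !bucket.contains(id));
        } else {
            results.extend(bucket);
        }
    }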
@@ -1,10 +1,12 @@
 use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::collections::BTreeSet;
 use std::ops::ControlFlow;
 
 use fst::automaton::Str;
-use fst::{Automaton, IntoStreamer, Streamer};
+use fst::{IntoStreamer, Streamer};
+use heed::types::DecodeIgnore;
+use itertools::{merge_join_by, EitherOrBoth};
 
 use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
 use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
@@ -16,16 +18,10 @@ use crate::{Result, MAX_WORD_LENGTH};
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum NumberOfTypos {
-    Zero,
     One,
     Two,
 }
 
-pub enum ZeroOrOneTypo {
-    Zero,
-    One,
-}
-
 impl Interned<QueryTerm> {
     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
         let s = ctx.term_interner.get_mut(self);
@@ -47,34 +43,45 @@ impl Interned<QueryTerm> {
 }
 
 fn find_zero_typo_prefix_derivations(
+    ctx: &mut SearchContext<'_>,
     word_interned: Interned<String>,
-    fst: fst::Set<Cow<'_, [u8]>>,
-    word_interner: &mut DedupInterner<String>,
     mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
 ) -> Result<()> {
-    let word = word_interner.get(word_interned).to_owned();
+    let word = ctx.word_interner.get(word_interned).to_owned();
     let word = word.as_str();
-    let prefix = Str::new(word).starts_with();
-    let mut stream = fst.search(prefix).into_stream();
 
-    while let Some(derived_word) = stream.next() {
-        let derived_word = std::str::from_utf8(derived_word)?.to_owned();
-        let derived_word_interned = word_interner.insert(derived_word);
-        if derived_word_interned != word_interned {
-            let cf = visit(derived_word_interned)?;
-            if cf.is_break() {
-                break;
+    let words =
+        ctx.index.word_docids.remap_data_type::<DecodeIgnore>().prefix_iter(ctx.txn, word)?;
+    let exact_words =
+        ctx.index.exact_word_docids.remap_data_type::<DecodeIgnore>().prefix_iter(ctx.txn, word)?;
+
+    for eob in merge_join_by(words, exact_words, |lhs, rhs| match (lhs, rhs) {
+        (Ok((word, _)), Ok((exact_word, _))) => word.cmp(exact_word),
+        (Err(_), _) | (_, Err(_)) => Ordering::Equal,
+    }) {
+        match eob {
+            EitherOrBoth::Both(kv, _) | EitherOrBoth::Left(kv) | EitherOrBoth::Right(kv) => {
+                let (derived_word, _) = kv?;
+                let derived_word = derived_word.to_string();
+                let derived_word_interned = ctx.word_interner.insert(derived_word);
+                if derived_word_interned != word_interned {
+                    let cf = visit(derived_word_interned)?;
+                    if cf.is_break() {
+                        break;
+                    }
+                }
             }
         }
     }
 
     Ok(())
 }
 
-fn find_zero_one_typo_derivations(
+fn find_one_typo_derivations(
     ctx: &mut SearchContext<'_>,
     word_interned: Interned<String>,
     is_prefix: bool,
-    mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
+    mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
 ) -> Result<()> {
     let fst = ctx.get_words_fst()?;
     let word = ctx.word_interner.get(word_interned).to_owned();
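
The rewritten prefix derivation walks word_docids and exact_word_docids in one pass instead of searching the words FST. A runnable sketch of the merge_join_by pattern it relies on (toy sorted slices stand in for the two LMDB prefix iterators):

    use itertools::{merge_join_by, EitherOrBoth};

    fn main() {
        // Two sorted word streams, merged without duplicates, like
        // word_docids and exact_word_docids above.
        let words = ["car", "cart", "carton"];
        let exact_words = ["card", "cart"];

        for eob in merge_join_by(words.iter(), exact_words.iter(), |l, r| l.cmp(r)) {
            let derived = match eob {
                EitherOrBoth::Both(w, _) | EitherOrBoth::Left(w) | EitherOrBoth::Right(w) => w,
            };
            println!("{derived}"); // car, card, cart, carton
        }
    }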
@@ -89,16 +96,9 @@ fn find_zero_one_typo_derivations(
         let derived_word = ctx.word_interner.insert(derived_word.to_owned());
         let d = dfa.distance(state.1);
         match d.to_u8() {
-            0 => {
-                if derived_word != word_interned {
-                    let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
-                    if cf.is_break() {
-                        break;
-                    }
-                }
-            }
+            0 => (),
             1 => {
-                let cf = visit(derived_word, ZeroOrOneTypo::One)?;
+                let cf = visit(derived_word)?;
                 if cf.is_break() {
                     break;
                 }
@@ -111,7 +111,7 @@ fn find_zero_one_typo_derivations(
     Ok(())
 }
 
-fn find_zero_one_two_typo_derivations(
+fn find_one_two_typo_derivations(
     word_interned: Interned<String>,
     is_prefix: bool,
     fst: fst::Set<Cow<'_, [u8]>>,
@@ -144,14 +144,7 @@ fn find_zero_one_two_typo_derivations(
             // correct distance
             let d = second_dfa.distance((state.1).0);
             match d.to_u8() {
-                0 => {
-                    if derived_word_interned != word_interned {
-                        let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
-                        if cf.is_break() {
-                            break;
-                        }
-                    }
-                }
+                0 => (),
                 1 => {
                     let cf = visit(derived_word_interned, NumberOfTypos::One)?;
                     if cf.is_break() {
@@ -194,8 +187,6 @@ pub fn partially_initialized_term_from_word(
         });
     }
 
-    let fst = ctx.index.words_fst(ctx.txn)?;
-
     let use_prefix_db = is_prefix
         && (ctx
             .index
@@ -215,24 +206,19 @@ pub fn partially_initialized_term_from_word(
     let mut zero_typo = None;
     let mut prefix_of = BTreeSet::new();
 
-    if fst.contains(word) || ctx.index.exact_word_docids.get(ctx.txn, word)?.is_some() {
+    if ctx.index.contains_word(ctx.txn, word)? {
         zero_typo = Some(word_interned);
     }
 
     if is_prefix && use_prefix_db.is_none() {
-        find_zero_typo_prefix_derivations(
-            word_interned,
-            fst,
-            &mut ctx.word_interner,
-            |derived_word| {
-                if prefix_of.len() < limits::MAX_PREFIX_COUNT {
-                    prefix_of.insert(derived_word);
-                    Ok(ControlFlow::Continue(()))
-                } else {
-                    Ok(ControlFlow::Break(()))
-                }
-            },
-        )?;
+        find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| {
+            if prefix_of.len() < limits::MAX_PREFIX_COUNT {
+                prefix_of.insert(derived_word);
+                Ok(ControlFlow::Continue(()))
+            } else {
+                Ok(ControlFlow::Break(()))
+            }
+        })?;
     }
 
     let synonyms = ctx.index.synonyms(ctx.txn)?;
     let mut synonym_word_count = 0;
@@ -295,18 +281,13 @@ impl Interned<QueryTerm> {
         let mut one_typo_words = BTreeSet::new();
 
         if *max_nbr_typos > 0 {
-            find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
-                match nbr_typos {
-                    ZeroOrOneTypo::Zero => {}
-                    ZeroOrOneTypo::One => {
-                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
-                            one_typo_words.insert(derived_word);
-                        } else {
-                            return Ok(ControlFlow::Break(()));
-                        }
-                    }
-                }
-                Ok(ControlFlow::Continue(()))
+            find_one_typo_derivations(ctx, original, is_prefix, |derived_word| {
+                if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
+                    one_typo_words.insert(derived_word);
+                    Ok(ControlFlow::Continue(()))
+                } else {
+                    Ok(ControlFlow::Break(()))
+                }
             })?;
         }
 
@@ -357,7 +338,7 @@ impl Interned<QueryTerm> {
         let mut two_typo_words = BTreeSet::new();
 
         if *max_nbr_typos > 0 {
-            find_zero_one_two_typo_derivations(
+            find_one_two_typo_derivations(
                 *original,
                 *is_prefix,
                 ctx.index.words_fst(ctx.txn)?,
@@ -370,7 +351,6 @@ impl Interned<QueryTerm> {
                     return Ok(ControlFlow::Break(()));
                 }
                 match nbr_typos {
-                    NumberOfTypos::Zero => {}
                     NumberOfTypos::One => {
                         if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
                             one_typo_words.insert(derived_word);
@@ -28,6 +28,7 @@ pub use self::helpers::*;
 pub use self::transform::{Transform, TransformOutput};
 use super::facet::clear_facet_levels_based_on_settings_diff;
 use super::new::StdResult;
+use crate::database_stats::DatabaseStats;
 use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError};
 use crate::index::{PrefixSearch, PrefixSettings};
@@ -476,7 +477,8 @@ where
 
         if !settings_diff.settings_update_only {
             // Update the stats of the documents database when there is a document update.
-            self.index.update_documents_stats(self.wtxn, modified_docids)?;
+            let stats = DatabaseStats::new(self.index.documents.remap_data_type(), self.wtxn)?;
+            self.index.put_documents_stats(self.wtxn, stats)?;
         }
         // We write the field distribution into the main database
         self.index.put_field_distribution(self.wtxn, &field_distribution)?;

@@ -1,5 +1,6 @@
 use bumpalo::Bump;
 use heed::RoTxn;
+use serde_json::Value;
 
 use super::document::{
     Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions,
@@ -10,7 +11,7 @@ use super::vector_document::{
 use crate::attribute_patterns::PatternMatch;
 use crate::documents::FieldIdMapper;
 use crate::vector::EmbeddingConfigs;
-use crate::{DocumentId, Index, Result};
+use crate::{DocumentId, Index, InternalError, Result};
 
 pub enum DocumentChange<'doc> {
     Deletion(Deletion<'doc>),
@@ -243,6 +244,29 @@ impl<'doc> Update<'doc> {
         Ok(has_deleted_fields)
     }
 
+    /// Returns `true` if the geo fields have changed.
+    pub fn has_changed_for_geo_fields<'t, Mapper: FieldIdMapper>(
+        &self,
+        rtxn: &'t RoTxn,
+        index: &'t Index,
+        mapper: &'t Mapper,
+    ) -> Result<bool> {
+        let current = self.current(rtxn, index, mapper)?;
+        let current_geo = current.geo_field()?;
+        let updated_geo = self.only_changed_fields().geo_field()?;
+        match (current_geo, updated_geo) {
+            (Some(current_geo), Some(updated_geo)) => {
+                let current: Value =
+                    serde_json::from_str(current_geo.get()).map_err(InternalError::SerdeJson)?;
+                let updated: Value =
+                    serde_json::from_str(updated_geo.get()).map_err(InternalError::SerdeJson)?;
+                Ok(current != updated)
+            }
+            (None, None) => Ok(false),
+            _ => Ok(true),
+        }
+    }
+
     pub fn only_changed_vectors(
         &self,
         doc_alloc: &'doc Bump,
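
has_changed_for_geo_fields parses both sides to serde_json::Value so the comparison is structural rather than textual. A minimal sketch of that property:

    use serde_json::Value;

    fn main() -> Result<(), serde_json::Error> {
        let current: Value = serde_json::from_str(r#"{"lat": 1.0, "lng": 2.0}"#)?;
        let updated: Value = serde_json::from_str(r#"{"lng": 2.0, "lat": 1.0}"#)?;
        // Key order does not matter for Value equality: prints "changed: false".
        println!("changed: {}", current != updated);
        Ok(())
    }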
@@ -117,7 +117,7 @@ impl FacetedDocidsExtractor {
                 },
             ),
             DocumentChange::Update(inner) => {
-                if !inner.has_changed_for_fields(
+                let has_changed = inner.has_changed_for_fields(
                     &mut |field_name| {
                         match_faceted_field(
                             field_name,
@@ -130,7 +130,10 @@ impl FacetedDocidsExtractor {
                     rtxn,
                     index,
                     context.db_fields_ids_map,
-                )? {
+                )?;
+                let has_changed_for_geo_fields =
+                    inner.has_changed_for_geo_fields(rtxn, index, context.db_fields_ids_map)?;
+                if !has_changed && !has_changed_for_geo_fields {
                     return Ok(());
                 }
 

@@ -121,6 +121,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
                     // do we have set embeddings?
                     if let Some(embeddings) = new_vectors.embeddings {
                         chunks.set_vectors(
+                            update.external_document_id(),
                             update.docid(),
                             embeddings
                                 .into_vec(&context.doc_alloc, embedder_name)
@@ -128,7 +129,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
                                     document_id: update.external_document_id().to_string(),
                                     error: error.to_string(),
                                 })?,
-                        );
+                        )?;
                     } else if new_vectors.regenerate {
                         let new_rendered = prompt.render_document(
                             update.external_document_id(),
@@ -209,6 +210,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
                     chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
                     if let Some(embeddings) = new_vectors.embeddings {
                         chunks.set_vectors(
+                            insertion.external_document_id(),
                             insertion.docid(),
                             embeddings
                                 .into_vec(&context.doc_alloc, embedder_name)
@@ -218,7 +220,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
                                         .to_string(),
                                     error: error.to_string(),
                                 })?,
-                        );
+                        )?;
                     } else if new_vectors.regenerate {
                         let rendered = prompt.render_document(
                             insertion.external_document_id(),
@@ -273,6 +275,7 @@ struct Chunks<'a, 'b, 'extractor> {
     embedder: &'a Embedder,
     embedder_id: u8,
     embedder_name: &'a str,
+    dimensions: usize,
     prompt: &'a Prompt,
     possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
     user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
@@ -297,6 +300,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
         let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
         let texts = BVec::with_capacity_in(capacity, doc_alloc);
         let ids = BVec::with_capacity_in(capacity, doc_alloc);
+        let dimensions = embedder.dimensions();
         Self {
             texts,
             ids,
@@ -309,6 +313,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
             embedder_name,
             user_provided,
             has_manual_generation: None,
+            dimensions,
         }
     }
 
@@ -490,7 +495,25 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
         }
     }
 
-    fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
+    fn set_vectors(
+        &self,
+        external_docid: &'a str,
+        docid: DocumentId,
+        embeddings: Vec<Embedding>,
+    ) -> Result<()> {
+        for (embedding_index, embedding) in embeddings.iter().enumerate() {
+            if embedding.len() != self.dimensions {
+                return Err(UserError::InvalidIndexingVectorDimensions {
+                    expected: self.dimensions,
+                    found: embedding.len(),
+                    embedder_name: self.embedder_name.to_string(),
+                    document_id: external_docid.to_string(),
+                    embedding_index,
+                }
+                .into());
+            }
+        }
         self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
+        Ok(())
     }
 }
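
The reworked set_vectors validates every manually provided embedding against the embedder's configured dimension count before anything is sent to the writer. A standalone model of that check (plain String errors stand in for UserError):

    fn check_dimensions(embeddings: &[Vec<f32>], expected: usize) -> Result<(), String> {
        for (embedding_index, embedding) in embeddings.iter().enumerate() {
            if embedding.len() != expected {
                return Err(format!(
                    "embedding #{embedding_index} has {} dimensions, expected {expected}",
                    embedding.len()
                ));
            }
        }
        Ok(())
    }

    fn main() {
        let embeddings = vec![vec![0.0_f32; 768], vec![0.0_f32; 384]];
        // The second embedding fails the check.
        println!("{:?}", check_dimensions(&embeddings, 768));
    }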
@@ -13,6 +13,7 @@ use super::super::thread_local::{FullySend, ThreadLocal};
 use super::super::FacetFieldIdsDelta;
 use super::document_changes::{extract, DocumentChanges, IndexingContext};
 use crate::index::IndexEmbeddingConfig;
+use crate::progress::MergingWordCache;
 use crate::proximity::ProximityPrecision;
 use crate::update::new::extract::EmbeddingExtractor;
 use crate::update::new::merger::merge_and_send_rtree;
@@ -96,6 +97,7 @@ where
                     {
                         let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted");
                         let _entered = span.enter();
+                        indexing_context.progress.update_progress(IndexingStep::MergingFacetCaches);
 
                         facet_field_ids_delta = merge_and_send_facet_docids(
                             caches,
@@ -117,7 +119,6 @@ where
                     } = {
                         let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
                         let _entered = span.enter();
-
                         WordDocidsExtractors::run_extraction(
                             document_changes,
                             indexing_context,
@@ -126,9 +127,13 @@ where
                         )?
                     };
 
+                    indexing_context.progress.update_progress(IndexingStep::MergingWordCaches);
+
                     {
                         let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
                         let _entered = span.enter();
+                        indexing_context.progress.update_progress(MergingWordCache::WordDocids);
+
                         merge_and_send_docids(
                             word_docids,
                             index.word_docids.remap_types(),
@@ -142,6 +147,8 @@ where
                         let span =
                             tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
                         let _entered = span.enter();
+                        indexing_context.progress.update_progress(MergingWordCache::WordFieldIdDocids);
+
                         merge_and_send_docids(
                             word_fid_docids,
                             index.word_fid_docids.remap_types(),
@@ -155,6 +162,8 @@ where
                         let span =
                             tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
                         let _entered = span.enter();
+                        indexing_context.progress.update_progress(MergingWordCache::ExactWordDocids);
+
                         merge_and_send_docids(
                             exact_word_docids,
                             index.exact_word_docids.remap_types(),
@@ -168,6 +177,8 @@ where
                         let span =
                             tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
                         let _entered = span.enter();
+                        indexing_context.progress.update_progress(MergingWordCache::WordPositionDocids);
+
                         merge_and_send_docids(
                             word_position_docids,
                             index.word_position_docids.remap_types(),
@@ -181,6 +192,8 @@ where
                         let span =
                             tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
                         let _entered = span.enter();
+                        indexing_context.progress.update_progress(MergingWordCache::FieldIdWordCountDocids);
+
                         merge_and_send_docids(
                             fid_word_count_docids,
                             index.field_id_word_count_docids.remap_types(),
@@ -210,6 +223,7 @@ where
                     {
                         let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids");
                         let _entered = span.enter();
+                        indexing_context.progress.update_progress(IndexingStep::MergingWordProximity);
 
                         merge_and_send_docids(
                             caches,

@@ -234,7 +234,6 @@ where
             embedders,
             field_distribution,
             document_ids,
-            modified_docids,
         )?;
 
         Ok(congestion)

@@ -7,12 +7,13 @@ use itertools::{merge_join_by, EitherOrBoth};
 use super::document_changes::IndexingContext;
 use crate::facet::FacetType;
 use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
+use crate::progress::Progress;
 use crate::update::del_add::DelAdd;
 use crate::update::facet::new_incremental::FacetsUpdateIncremental;
 use crate::update::facet::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
 use crate::update::new::facet_search_builder::FacetSearchBuilder;
 use crate::update::new::merger::FacetFieldIdDelta;
-use crate::update::new::steps::IndexingStep;
+use crate::update::new::steps::{IndexingStep, PostProcessingFacets, PostProcessingWords};
 use crate::update::new::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder};
 use crate::update::new::words_prefix_docids::{
     compute_exact_word_prefix_docids, compute_word_prefix_docids, compute_word_prefix_fid_docids,
@@ -33,11 +34,23 @@ where
     let index = indexing_context.index;
     indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets);
-    compute_facet_level_database(index, wtxn, facet_field_ids_delta, &mut global_fields_ids_map)?;
-    compute_facet_search_database(index, wtxn, global_fields_ids_map)?;
+    compute_facet_level_database(
+        index,
+        wtxn,
+        facet_field_ids_delta,
+        &mut global_fields_ids_map,
+        indexing_context.progress,
+    )?;
+    compute_facet_search_database(index, wtxn, global_fields_ids_map, indexing_context.progress)?;
 
     indexing_context.progress.update_progress(IndexingStep::PostProcessingWords);
-    if let Some(prefix_delta) = compute_word_fst(index, wtxn)? {
-        compute_prefix_database(index, wtxn, prefix_delta, indexing_context.grenad_parameters)?;
+    if let Some(prefix_delta) = compute_word_fst(index, wtxn, indexing_context.progress)? {
+        compute_prefix_database(
+            index,
+            wtxn,
+            prefix_delta,
+            indexing_context.grenad_parameters,
+            indexing_context.progress,
+        )?;
     };
     Ok(())
 }
@@ -48,21 +61,32 @@ fn compute_prefix_database(
     wtxn: &mut RwTxn,
     prefix_delta: PrefixDelta,
     grenad_parameters: &GrenadParameters,
+    progress: &Progress,
 ) -> Result<()> {
     let PrefixDelta { modified, deleted } = prefix_delta;
-    // Compute word prefix docids
+
+    progress.update_progress(PostProcessingWords::WordPrefixDocids);
     compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
-    // Compute exact word prefix docids
+
+    progress.update_progress(PostProcessingWords::ExactWordPrefixDocids);
     compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
-    // Compute word prefix fid docids
+
+    progress.update_progress(PostProcessingWords::WordPrefixFieldIdDocids);
     compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
-    // Compute word prefix position docids
+
+    progress.update_progress(PostProcessingWords::WordPrefixPositionDocids);
     compute_word_prefix_position_docids(wtxn, index, &modified, &deleted, grenad_parameters)
 }
 
 #[tracing::instrument(level = "trace", skip_all, target = "indexing")]
-fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result<Option<PrefixDelta>> {
+fn compute_word_fst(
+    index: &Index,
+    wtxn: &mut RwTxn,
+    progress: &Progress,
+) -> Result<Option<PrefixDelta>> {
     let rtxn = index.read_txn()?;
+    progress.update_progress(PostProcessingWords::WordFst);
+
     let words_fst = index.words_fst(&rtxn)?;
     let mut word_fst_builder = WordFstBuilder::new(&words_fst)?;
     let prefix_settings = index.prefix_settings(&rtxn)?;
@@ -112,8 +136,10 @@ fn compute_facet_search_database(
     index: &Index,
     wtxn: &mut RwTxn,
     global_fields_ids_map: GlobalFieldsIdsMap,
+    progress: &Progress,
 ) -> Result<()> {
     let rtxn = index.read_txn()?;
+    progress.update_progress(PostProcessingFacets::FacetSearch);
 
     // if the facet search is not enabled, we can skip the rest of the function
     if !index.facet_search(wtxn)? {
@@ -171,10 +197,16 @@ fn compute_facet_level_database(
     wtxn: &mut RwTxn,
     mut facet_field_ids_delta: FacetFieldIdsDelta,
     global_fields_ids_map: &mut GlobalFieldsIdsMap,
+    progress: &Progress,
 ) -> Result<()> {
     let rtxn = index.read_txn()?;
 
     let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?;
-    for (fid, delta) in facet_field_ids_delta.consume_facet_string_delta() {
+    let mut deltas: Vec<_> = facet_field_ids_delta.consume_facet_string_delta().collect();
+    // We move all bulks at the front and incrementals (others) at the end.
+    deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 });
+
+    for (fid, delta) in deltas {
         // skip field ids that should not be facet leveled
         let Some(metadata) = global_fields_ids_map.metadata(fid) else {
             continue;
@@ -187,11 +219,13 @@ fn compute_facet_level_database(
         let _entered = span.enter();
         match delta {
             FacetFieldIdDelta::Bulk => {
+                progress.update_progress(PostProcessingFacets::StringsBulk);
                 tracing::debug!(%fid, "bulk string facet processing");
                 FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::String)
                     .execute(wtxn)?
             }
             FacetFieldIdDelta::Incremental(delta_data) => {
+                progress.update_progress(PostProcessingFacets::StringsIncremental);
                 tracing::debug!(%fid, len=%delta_data.len(), "incremental string facet processing");
                 FacetsUpdateIncremental::new(
                     index,
@@ -207,16 +241,22 @@ fn compute_facet_level_database(
         }
     }
 
-    for (fid, delta) in facet_field_ids_delta.consume_facet_number_delta() {
+    let mut deltas: Vec<_> = facet_field_ids_delta.consume_facet_number_delta().collect();
+    // We move all bulks at the front and incrementals (others) at the end.
+    deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 });
+
+    for (fid, delta) in deltas {
         let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number");
         let _entered = span.enter();
         match delta {
             FacetFieldIdDelta::Bulk => {
+                progress.update_progress(PostProcessingFacets::NumbersBulk);
                 tracing::debug!(%fid, "bulk number facet processing");
                 FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::Number)
                     .execute(wtxn)?
             }
             FacetFieldIdDelta::Incremental(delta_data) => {
+                progress.update_progress(PostProcessingFacets::NumbersIncremental);
                 tracing::debug!(%fid, len=%delta_data.len(), "incremental number facet processing");
                 FacetsUpdateIncremental::new(
                    index,
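
The sort_by_key trick above orders bulk facet updates before incremental ones with a stable 0/1 key. A self-contained model (the enum payload is a stand-in for the real delta data):

    #[derive(Debug)]
    enum FacetFieldIdDelta {
        Bulk,
        Incremental(usize), // stand-in for the real delta payload
    }

    fn main() {
        let mut deltas = vec![
            (3_u16, FacetFieldIdDelta::Incremental(12)),
            (1, FacetFieldIdDelta::Bulk),
            (2, FacetFieldIdDelta::Incremental(4)),
        ];
        deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 });
        // Bulk first; incrementals keep their relative order (stable sort).
        println!("{deltas:?}");
    }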
@@ -7,6 +7,7 @@ use rand::SeedableRng as _;
 use time::OffsetDateTime;
 
 use super::super::channel::*;
+use crate::database_stats::DatabaseStats;
 use crate::documents::PrimaryKey;
 use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
 use crate::index::IndexEmbeddingConfig;
@@ -142,7 +143,6 @@ pub(super) fn update_index(
     embedders: EmbeddingConfigs,
     field_distribution: std::collections::BTreeMap<String, u64>,
     document_ids: roaring::RoaringBitmap,
-    modified_docids: roaring::RoaringBitmap,
 ) -> Result<()> {
     index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?;
     if let Some(new_primary_key) = new_primary_key {
@@ -153,7 +153,8 @@ pub(super) fn update_index(
     index.put_field_distribution(wtxn, &field_distribution)?;
     index.put_documents_ids(wtxn, &document_ids)?;
     index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
-    index.update_documents_stats(wtxn, modified_docids)?;
+    let stats = DatabaseStats::new(index.documents.remap_data_type(), wtxn)?;
+    index.put_documents_stats(wtxn, stats)?;
     Ok(())
 }

@@ -82,14 +82,8 @@ where
         merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| {
             let current = database.get(&rtxn, key)?;
             match merge_cbo_bitmaps(current, del, add)? {
-                Operation::Write(bitmap) => {
-                    docids_sender.write(key, &bitmap)?;
-                    Ok(())
-                }
-                Operation::Delete => {
-                    docids_sender.delete(key)?;
-                    Ok(())
-                }
+                Operation::Write(bitmap) => docids_sender.write(key, &bitmap),
+                Operation::Delete => docids_sender.delete(key),
                 Operation::Ignore => Ok(()),
             }
         })
@@ -130,7 +124,6 @@ pub fn merge_and_send_facet_docids<'extractor>(
                     Operation::Ignore => Ok(()),
                 }
             })?;
-
                 Ok(facet_field_ids_delta)
             })
             .reduce(

@@ -1,52 +1,42 @@
-use std::borrow::Cow;
+use crate::make_enum_progress;
 
-use enum_iterator::Sequence;
-
-use crate::progress::Step;
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)]
-#[repr(u8)]
-pub enum IndexingStep {
-    PreparingPayloads,
-    ExtractingDocuments,
-    ExtractingFacets,
-    ExtractingWords,
-    ExtractingWordProximity,
-    ExtractingEmbeddings,
-    WritingGeoPoints,
-    WaitingForDatabaseWrites,
-    WaitingForExtractors,
-    WritingEmbeddingsToDatabase,
-    PostProcessingFacets,
-    PostProcessingWords,
-    Finalizing,
-}
-
-impl Step for IndexingStep {
-    fn name(&self) -> Cow<'static, str> {
-        match self {
-            IndexingStep::PreparingPayloads => "preparing update file",
-            IndexingStep::ExtractingDocuments => "extracting documents",
-            IndexingStep::ExtractingFacets => "extracting facets",
-            IndexingStep::ExtractingWords => "extracting words",
-            IndexingStep::ExtractingWordProximity => "extracting word proximity",
-            IndexingStep::ExtractingEmbeddings => "extracting embeddings",
-            IndexingStep::WritingGeoPoints => "writing geo points",
-            IndexingStep::WaitingForDatabaseWrites => "waiting for database writes",
-            IndexingStep::WaitingForExtractors => "waiting for extractors",
-            IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database",
-            IndexingStep::PostProcessingFacets => "post-processing facets",
-            IndexingStep::PostProcessingWords => "post-processing words",
-            IndexingStep::Finalizing => "finalizing",
-        }
-        .into()
-    }
-
-    fn current(&self) -> u32 {
-        *self as u32
-    }
-
-    fn total(&self) -> u32 {
-        Self::CARDINALITY as u32
+make_enum_progress! {
+    pub enum IndexingStep {
+        PreparingPayloads,
+        ExtractingDocuments,
+        ExtractingFacets,
+        ExtractingWords,
+        ExtractingWordProximity,
+        ExtractingEmbeddings,
+        MergingFacetCaches,
+        MergingWordCaches,
+        MergingWordProximity,
+        WritingGeoPoints,
+        WaitingForDatabaseWrites,
+        WaitingForExtractors,
+        WritingEmbeddingsToDatabase,
+        PostProcessingFacets,
+        PostProcessingWords,
+        Finalizing,
     }
 }
+
+make_enum_progress! {
+    pub enum PostProcessingFacets {
+        StringsBulk,
+        StringsIncremental,
+        NumbersBulk,
+        NumbersIncremental,
+        FacetSearch,
+    }
+}
+
+make_enum_progress! {
+    pub enum PostProcessingWords {
+        WordFst,
+        WordPrefixDocids,
+        ExactWordPrefixDocids,
+        WordPrefixFieldIdDocids,
+        WordPrefixPositionDocids,
+    }
+}

@@ -1331,8 +1331,21 @@ impl InnerIndexSettingsDiff {
 
         let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
 
-        let cache_user_defined_searchables = old_settings.user_defined_searchable_attributes
-            != new_settings.user_defined_searchable_attributes;
+        // Check if any searchable field has been added or removed from the list.
+        // Changing the order should not be considered as a change for reindexing.
+        let cache_user_defined_searchables = match (
+            &old_settings.user_defined_searchable_attributes,
+            &new_settings.user_defined_searchable_attributes,
+        ) {
+            (Some(old), Some(new)) => {
+                let old: BTreeSet<_> = old.iter().collect();
+                let new: BTreeSet<_> = new.iter().collect();
+
+                old != new
+            }
+            (None, None) => false,
+            _otherwise => true,
+        };
 
         // if the user-defined searchables changed, then we need to reindex prompts.
         if cache_user_defined_searchables {
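
The new comparison makes reordering searchable attributes a no-op for reindexing: both lists are compared as sets. A minimal sketch of that property:

    use std::collections::BTreeSet;

    fn main() {
        let old = ["title", "overview", "genres"];
        let new = ["genres", "title", "overview"];

        let old_set: BTreeSet<_> = old.iter().collect();
        let new_set: BTreeSet<_> = new.iter().collect();

        // Prints "reindex: false": same fields, different order.
        println!("reindex: {}", old_set != new_set);
    }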