Merge branch 'main' into all-cpus-in-import-dump

This commit is contained in:
nnethercott 2025-05-12 21:48:12 +02:00
commit 75a7e40a27
74 changed files with 2402 additions and 1726 deletions

View file

@ -127,7 +127,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
// merge all deletions
let obkv = KvReaderDelAdd::from_slice(value);
if let Some(value) = obkv.get(DelAdd::Deletion) {
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid)
|| settings_diff.old.disabled_typos_terms.is_exact(w);
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
obkv.insert(DelAdd::Deletion, value)?;
@ -139,7 +140,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
}
// merge all additions
if let Some(value) = obkv.get(DelAdd::Addition) {
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid)
|| settings_diff.new.disabled_typos_terms.is_exact(w);
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
obkv.insert(DelAdd::Addition, value)?;

View file

@ -273,14 +273,11 @@ pub(crate) fn write_typed_chunk_into_index(
unreachable!();
};
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
let clonable_exact_word_docids =
unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
word_docids_builder.push(word_docids_reader.into_cursor()?);
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
}
let word_docids_merger = word_docids_builder.build();

View file

@ -319,8 +319,11 @@ impl WordDocidsExtractors {
let doc_alloc = &context.doc_alloc;
let exact_attributes = index.exact_attributes(rtxn)?;
let is_exact_attribute =
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
let is_exact = |fname: &str, word: &str| {
exact_attributes.iter().any(|attr| contained_in(fname, attr))
|| disabled_typos_terms.is_exact(word)
};
match document_change {
DocumentChange::Deletion(inner) => {
let mut token_fn = |fname: &str, fid, pos, word: &str| {
@ -328,7 +331,7 @@ impl WordDocidsExtractors {
fid,
pos,
word,
is_exact_attribute(fname),
is_exact(fname, word),
inner.docid(),
doc_alloc,
)
@ -356,7 +359,7 @@ impl WordDocidsExtractors {
fid,
pos,
word,
is_exact_attribute(fname),
is_exact(fname, word),
inner.docid(),
doc_alloc,
)
@ -372,7 +375,7 @@ impl WordDocidsExtractors {
fid,
pos,
word,
is_exact_attribute(fname),
is_exact(fname, word),
inner.docid(),
doc_alloc,
)
@ -389,7 +392,7 @@ impl WordDocidsExtractors {
fid,
pos,
word,
is_exact_attribute(fname),
is_exact(fname, word),
inner.docid(),
doc_alloc,
)

View file

@ -9,6 +9,7 @@ pub use document_operation::{DocumentOperation, PayloadStats};
use hashbrown::HashMap;
use heed::RwTxn;
pub use partial_dump::PartialDump;
pub use post_processing::recompute_word_fst_from_word_docids_database;
pub use update_by_function::UpdateByFunction;
pub use write::ChannelCongestion;
use write::{build_vectors, update_index, write_to_db};

View file

@ -131,6 +131,20 @@ fn compute_word_fst(
}
}
/// Rebuilds the words FST from the keys of the `word_docids` database and stores
/// it in the main database under `WORDS_FST_KEY`.
///
/// Every key yielded by `index.word_docids` is registered as an addition on a
/// `WordFstBuilder` seeded with a default `fst::Set`; the values of the database
/// are not decoded (`DecodeIgnore`).
///
/// # Errors
///
/// Propagates any error raised while creating the builder, iterating the
/// database, registering a word, building the FST, or writing the result.
pub fn recompute_word_fst_from_word_docids_database(index: &Index, wtxn: &mut RwTxn) -> Result<()> {
    // Seed the builder with a default set so the result only contains the words
    // registered below.
    let base_fst = fst::Set::default().map_data(std::borrow::Cow::Owned)?;
    let mut builder = WordFstBuilder::new(&base_fst)?;

    // Only the keys matter here, so the associated data is skipped.
    for entry in index.word_docids.iter(wtxn)?.remap_data_type::<DecodeIgnore>() {
        let (word, _) = entry?;
        builder.register_word(DelAdd::Addition, word.as_ref())?;
    }

    // The second element returned by `build` is not needed by this function.
    let (rebuilt_fst, _) = builder.build(index, wtxn)?;
    index.main.remap_types::<Str, Bytes>().put(wtxn, WORDS_FST_KEY, &rebuilt_fst)?;

    Ok(())
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_search")]
fn compute_facet_search_database(
index: &Index,

View file

@ -17,6 +17,7 @@ use super::IndexerConfig;
use crate::attribute_patterns::PatternMatch;
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::criterion::Criterion;
use crate::disabled_typos_terms::DisabledTyposTerms;
use crate::error::UserError;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::filterable_attributes_rules::match_faceted_field;
@ -169,6 +170,7 @@ pub struct Settings<'a, 't, 'i> {
synonyms: Setting<BTreeMap<String, Vec<String>>>,
primary_key: Setting<String>,
authorize_typos: Setting<bool>,
disable_on_numbers: Setting<bool>,
min_word_len_two_typos: Setting<u8>,
min_word_len_one_typo: Setting<u8>,
exact_words: Setting<BTreeSet<String>>,
@ -207,6 +209,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
synonyms: Setting::NotSet,
primary_key: Setting::NotSet,
authorize_typos: Setting::NotSet,
disable_on_numbers: Setting::NotSet,
exact_words: Setting::NotSet,
min_word_len_two_typos: Setting::NotSet,
min_word_len_one_typo: Setting::NotSet,
@ -354,6 +357,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.min_word_len_one_typo = Setting::Reset;
}
/// Records `Setting::Set(disable_on_numbers)` as the pending value of the
/// `disable_on_numbers` setting.
///
/// Nothing is written to the index here; the pending value is consumed by
/// `update_disabled_typos_terms`.
// NOTE(review): presumably controls typo tolerance on numeric terms — confirm
// against `DisabledTyposTerms`.
pub fn set_disable_on_numbers(&mut self, disable_on_numbers: bool) {
self.disable_on_numbers = Setting::Set(disable_on_numbers);
}
/// Marks the `disable_on_numbers` setting as `Setting::Reset`.
///
/// Nothing is written to the index here; when `update_disabled_typos_terms`
/// runs, a `Reset` restores the field to
/// `DisabledTyposTerms::default().disable_on_numbers`.
pub fn reset_disable_on_numbers(&mut self) {
self.disable_on_numbers = Setting::Reset;
}
pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
self.exact_words = Setting::Set(words);
}
@ -866,6 +877,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Ok(())
}
/// Applies the pending `disable_on_numbers` setting to the
/// `DisabledTyposTerms` stored in the index.
///
/// The stored value is loaded, its `disable_on_numbers` field is overwritten
/// on `Setting::Set` or restored to the default on `Setting::Reset`, and the
/// result is written back. With `Setting::NotSet` the loaded value is written
/// back unchanged.
///
/// # Errors
///
/// Propagates any error from reading, deleting, or writing the stored terms.
fn update_disabled_typos_terms(&mut self) -> Result<()> {
    let mut terms = self.index.disabled_typos_terms(self.wtxn)?;

    match self.disable_on_numbers {
        Setting::NotSet => {}
        Setting::Set(flag) => terms.disable_on_numbers = flag,
        Setting::Reset => {
            // NOTE(review): the stored entry is deleted here and then written
            // again by the `put` below, so the delete looks redundant — confirm
            // before removing it.
            self.index.delete_disabled_typos_terms(self.wtxn)?;
            terms.disable_on_numbers = DisabledTyposTerms::default().disable_on_numbers;
        }
    }

    self.index.put_disabled_typos_terms(self.wtxn, &terms)?;
    Ok(())
}
fn update_exact_words(&mut self) -> Result<()> {
match self.exact_words {
Setting::Set(ref mut words) => {
@ -1246,6 +1275,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.update_prefix_search()?;
self.update_facet_search()?;
self.update_localized_attributes_rules()?;
self.update_disabled_typos_terms()?;
let embedding_config_updates = self.update_embedding_configs()?;
@ -1327,6 +1357,7 @@ impl InnerIndexSettingsDiff {
|| old_settings.prefix_search != new_settings.prefix_search
|| old_settings.localized_attributes_rules
!= new_settings.localized_attributes_rules
|| old_settings.disabled_typos_terms != new_settings.disabled_typos_terms
};
let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
@ -1526,6 +1557,7 @@ pub(crate) struct InnerIndexSettings {
pub user_defined_searchable_attributes: Option<Vec<String>>,
pub sortable_fields: HashSet<String>,
pub exact_attributes: HashSet<FieldId>,
pub disabled_typos_terms: DisabledTyposTerms,
pub proximity_precision: ProximityPrecision,
pub embedding_configs: EmbeddingConfigs,
pub geo_fields_ids: Option<(FieldId, FieldId)>,
@ -1574,7 +1606,7 @@ impl InnerIndexSettings {
.map(|fields| fields.into_iter().map(|f| f.to_string()).collect());
let builder = MetadataBuilder::from_index(index, rtxn)?;
let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder);
let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
Ok(Self {
stop_words,
allowed_separators,
@ -1592,6 +1624,7 @@ impl InnerIndexSettings {
geo_fields_ids,
prefix_search,
facet_search,
disabled_typos_terms,
})
}

View file

@ -896,6 +896,7 @@ fn test_correct_settings_init() {
localized_attributes_rules,
prefix_search,
facet_search,
disable_on_numbers,
} = settings;
assert!(matches!(searchable_fields, Setting::NotSet));
assert!(matches!(displayed_fields, Setting::NotSet));
@ -923,6 +924,7 @@ fn test_correct_settings_init() {
assert!(matches!(localized_attributes_rules, Setting::NotSet));
assert!(matches!(prefix_search, Setting::NotSet));
assert!(matches!(facet_search, Setting::NotSet));
assert!(matches!(disable_on_numbers, Setting::NotSet));
})
.unwrap();
}

View file

@ -1,12 +1,14 @@
mod v1_12;
mod v1_13;
mod v1_14;
mod v1_15;
use heed::RwTxn;
use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13};
use v1_14::Latest_V1_13_To_Latest_V1_14;
use v1_15::Latest_V1_14_To_Latest_V1_15;
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::progress::{Progress, VariableNameStep};
use crate::{Index, InternalError, Result};
@ -23,12 +25,16 @@ trait UpgradeIndex {
}
/// Return true if the cached stats of the index must be regenerated
pub fn upgrade(
pub fn upgrade<MSP>(
wtxn: &mut RwTxn,
index: &Index,
db_version: (u32, u32, u32),
must_stop_processing: MSP,
progress: Progress,
) -> Result<bool> {
) -> Result<bool>
where
MSP: Fn() -> bool + Sync,
{
let from = index.get_version(wtxn)?.unwrap_or(db_version);
let upgrade_functions: &[&dyn UpgradeIndex] = &[
&V1_12_To_V1_12_3 {},
@ -36,6 +42,10 @@ pub fn upgrade(
&V1_13_0_To_V1_13_1 {},
&V1_13_1_To_Latest_V1_13 {},
&Latest_V1_13_To_Latest_V1_14 {},
&Latest_V1_14_To_Latest_V1_15 {},
// This is the last upgrade function; it will be called when the index is up to date.
// Any other upgrade function should be added before this one.
&ToCurrentNoOp {},
];
let start = match from {
@ -43,8 +53,9 @@ pub fn upgrade(
(1, 12, 3..) => 1,
(1, 13, 0) => 2,
(1, 13, _) => 4,
(1, 14, _) => 5,
// We must handle the current version in the match because, in case of a failure, some indexes may have been upgraded but not others.
(1, 14, _) => 4,
(1, 15, _) => 6,
(major, minor, patch) => {
return Err(InternalError::CannotUpgradeToVersion(major, minor, patch).into())
}
@ -56,6 +67,9 @@ pub fn upgrade(
let mut current_version = from;
let mut regenerate_stats = false;
for (i, upgrade) in upgrade_path.iter().enumerate() {
if (must_stop_processing)() {
return Err(crate::Error::InternalError(InternalError::AbortedIndexation));
}
let target = upgrade.target_version();
progress.update_progress(VariableNameStep::<UpgradeVersion>::new(
format!(
@ -77,3 +91,22 @@ pub fn upgrade(
Ok(regenerate_stats)
}
/// Terminal upgrade step that performs no work.
///
/// Its target version is the triple built from the crate's `VERSION_MAJOR`,
/// `VERSION_MINOR` and `VERSION_PATCH` constants.
// No `#[allow(non_camel_case_types)]` is needed here: unlike the versioned
// upgrade structs (e.g. `Latest_V1_14_To_Latest_V1_15`), this name is already
// UpperCamelCase, so the lint never fires.
struct ToCurrentNoOp {}

impl UpgradeIndex for ToCurrentNoOp {
    /// Does nothing and always returns `Ok(false)`; every argument is ignored.
    // NOTE(review): `false` presumably means the cached index stats do not need
    // regenerating — confirm against the `UpgradeIndex` trait contract.
    fn upgrade(
        &self,
        _wtxn: &mut RwTxn,
        _index: &Index,
        _original: (u32, u32, u32),
        _progress: Progress,
    ) -> Result<bool> {
        Ok(false)
    }

    /// Returns `(VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH)`.
    fn target_version(&self) -> (u32, u32, u32) {
        (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH)
    }
}

View file

@ -1,7 +1,6 @@
use heed::RwTxn;
use super::UpgradeIndex;
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::database_stats::DatabaseStats;
use crate::progress::Progress;
use crate::{make_enum_progress, Index, Result};
@ -51,10 +50,6 @@ impl UpgradeIndex for V1_13_1_To_Latest_V1_13 {
}
fn target_version(&self) -> (u32, u32, u32) {
(
VERSION_MAJOR.parse().unwrap(),
VERSION_MINOR.parse().unwrap(),
VERSION_PATCH.parse().unwrap(),
)
(1, 13, 3)
}
}

View file

@ -0,0 +1,35 @@
use heed::RwTxn;
use super::UpgradeIndex;
use crate::progress::Progress;
use crate::update::new::indexer::recompute_word_fst_from_word_docids_database;
use crate::{make_enum_progress, Index, Result};
/// Upgrade step whose target version is `(1, 15, 0)`.
#[allow(non_camel_case_types)]
pub(super) struct Latest_V1_14_To_Latest_V1_15();
impl UpgradeIndex for Latest_V1_14_To_Latest_V1_15 {
/// Rebuilds the word FST by calling
/// `recompute_word_fst_from_word_docids_database` on `index` within `wtxn`.
///
/// `_original` is unused. Returns `Ok(false)` on success; an error from the
/// recomputation is propagated with `?`.
fn upgrade(
&self,
wtxn: &mut RwTxn,
index: &Index,
_original: (u32, u32, u32),
progress: Progress,
) -> Result<bool> {
// Recompute the word FST from the word docids database.
// The macro declares a single-variant enum that is reported through
// `progress` just below.
make_enum_progress! {
enum TypoTolerance {
RecomputeWordFst,
}
};
progress.update_progress(TypoTolerance::RecomputeWordFst);
recompute_word_fst_from_word_docids_database(index, wtxn)?;
// NOTE(review): `false` presumably means the cached index stats do not need
// regenerating — confirm against the `UpgradeIndex` trait contract.
Ok(false)
}
/// Returns `(1, 15, 0)`.
// NOTE(review): presumably the version the index is at once this step has
// run — confirm against the caller in `upgrade`.
fn target_version(&self) -> (u32, u32, u32) {
(1, 15, 0)
}
}