Add disableOnNumbers setting

This commit is contained in:
ManyTheFish 2025-04-03 17:31:05 +02:00
parent 9fd9fcb03e
commit 63a4dfa2a8
14 changed files with 135 additions and 32 deletions

View file

@ -0,0 +1,50 @@
use heed::{
types::{SerdeJson, Str},
RoTxn, RwTxn,
};
use serde::{Deserialize, Serialize};
use crate::{index::main_key, Index};
/// Index-level rules describing which terms must not tolerate typos.
///
/// The value is (de)serialized with camelCase field names, so
/// `disable_on_numbers` appears as `disableOnNumbers` in its serialized form.
/// The derived `Default` leaves every rule turned off.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
pub struct DisabledTyposTerms {
/// When `true`, words made only of numeric and/or ASCII punctuation
/// characters are treated as exact (see `DisabledTyposTerms::is_exact`).
pub disable_on_numbers: bool,
}
impl Index {
    /// Reads the [`DisabledTyposTerms`] stored in the main database under
    /// `main_key::DISABLED_TYPOS_TERMS`.
    ///
    /// Returns `DisabledTyposTerms::default()` when no value has been stored.
    ///
    /// # Errors
    /// Propagates any `heed` error raised while reading or decoding the entry.
    pub fn disabled_typos_terms(&self, txn: &RoTxn<'_>) -> heed::Result<DisabledTyposTerms> {
        let stored = self
            .main
            .remap_types::<Str, SerdeJson<DisabledTyposTerms>>()
            .get(txn, main_key::DISABLED_TYPOS_TERMS)?;
        Ok(stored.unwrap_or_default())
    }

    /// Stores `disabled_typos_terms` as JSON in the main database under
    /// `main_key::DISABLED_TYPOS_TERMS`, overwriting any previous value.
    ///
    /// # Errors
    /// Propagates any `heed` error raised while encoding or writing the entry.
    pub(crate) fn put_disabled_typos_terms(
        &self,
        txn: &mut RwTxn<'_>,
        disabled_typos_terms: &DisabledTyposTerms,
    ) -> heed::Result<()> {
        // `disabled_typos_terms` is already a reference: pass it through as-is
        // instead of borrowing it a second time.
        self.main.remap_types::<Str, SerdeJson<DisabledTyposTerms>>().put(
            txn,
            main_key::DISABLED_TYPOS_TERMS,
            disabled_typos_terms,
        )?;
        Ok(())
    }

    /// Removes the entry stored under `main_key::DISABLED_TYPOS_TERMS`, so that
    /// later reads fall back to the default value.
    ///
    /// The value returned by the underlying `delete` call is discarded.
    ///
    /// # Errors
    /// Propagates any `heed` error raised by the deletion.
    pub(crate) fn delete_disabled_typos_terms(&self, txn: &mut RwTxn<'_>) -> heed::Result<()> {
        self.main
            .remap_types::<Str, SerdeJson<DisabledTyposTerms>>()
            .delete(txn, main_key::DISABLED_TYPOS_TERMS)?;
        Ok(())
    }
}
impl DisabledTyposTerms {
    /// Tells whether `word` must be handled as an exact term.
    ///
    /// Always `false` while `disable_on_numbers` is off. Otherwise `true` when
    /// every character of `word` is numeric (`char::is_numeric`) or ASCII
    /// punctuation; an empty `word` therefore also yields `true`.
    pub fn is_exact(&self, word: &str) -> bool {
        if !self.disable_on_numbers {
            return false;
        }

        // A "number-like" word may mix digits with punctuation such as `.`, `-` or `/`.
        let is_digit_or_punctuation = |ch: char| ch.is_numeric() || ch.is_ascii_punctuation();
        word.chars().all(is_digit_or_punctuation)
    }
}

View file

@ -78,6 +78,7 @@ pub mod main_key {
pub const FACET_SEARCH: &str = "facet_search";
pub const PREFIX_SEARCH: &str = "prefix_search";
pub const DOCUMENTS_STATS: &str = "documents_stats";
pub const DISABLED_TYPOS_TERMS: &str = "disabled_typos_terms";
}
pub mod db_name {

View file

@ -12,6 +12,7 @@ mod asc_desc;
mod attribute_patterns;
mod criterion;
pub mod database_stats;
pub mod disabled_typos_terms;
mod error;
mod external_documents_ids;
pub mod facet;

View file

@ -127,7 +127,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
// merge all deletions
let obkv = KvReaderDelAdd::from_slice(value);
if let Some(value) = obkv.get(DelAdd::Deletion) {
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid)
|| settings_diff.old.disabled_typos_terms.is_exact(&w);
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
obkv.insert(DelAdd::Deletion, value)?;
@ -139,7 +140,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
}
// merge all additions
if let Some(value) = obkv.get(DelAdd::Addition) {
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid)
|| settings_diff.new.disabled_typos_terms.is_exact(&w);
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
obkv.insert(DelAdd::Addition, value)?;

View file

@ -273,14 +273,11 @@ pub(crate) fn write_typed_chunk_into_index(
unreachable!();
};
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
let clonable_exact_word_docids =
unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
word_docids_builder.push(word_docids_reader.into_cursor()?);
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
}
let word_docids_merger = word_docids_builder.build();

View file

@ -319,8 +319,11 @@ impl WordDocidsExtractors {
let doc_alloc = &context.doc_alloc;
let exact_attributes = index.exact_attributes(rtxn)?;
let is_exact_attribute =
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
let is_exact = |fname: &str, word: &str| {
exact_attributes.iter().any(|attr| contained_in(fname, attr))
|| disabled_typos_terms.is_exact(word)
};
match document_change {
DocumentChange::Deletion(inner) => {
let mut token_fn = |fname: &str, fid, pos, word: &str| {
@ -328,7 +331,7 @@ impl WordDocidsExtractors {
fid,
pos,
word,
is_exact_attribute(fname),
is_exact(fname, word),
inner.docid(),
doc_alloc,
)
@ -356,7 +359,7 @@ impl WordDocidsExtractors {
fid,
pos,
word,
is_exact_attribute(fname),
is_exact(fname, word),
inner.docid(),
doc_alloc,
)
@ -372,7 +375,7 @@ impl WordDocidsExtractors {
fid,
pos,
word,
is_exact_attribute(fname),
is_exact(fname, word),
inner.docid(),
doc_alloc,
)
@ -389,7 +392,7 @@ impl WordDocidsExtractors {
fid,
pos,
word,
is_exact_attribute(fname),
is_exact(fname, word),
inner.docid(),
doc_alloc,
)

View file

@ -17,6 +17,7 @@ use super::IndexerConfig;
use crate::attribute_patterns::PatternMatch;
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::criterion::Criterion;
use crate::disabled_typos_terms::DisabledTyposTerms;
use crate::error::UserError;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::filterable_attributes_rules::match_faceted_field;
@ -169,6 +170,7 @@ pub struct Settings<'a, 't, 'i> {
synonyms: Setting<BTreeMap<String, Vec<String>>>,
primary_key: Setting<String>,
authorize_typos: Setting<bool>,
disabled_typos_terms: Setting<DisabledTyposTerms>,
min_word_len_two_typos: Setting<u8>,
min_word_len_one_typo: Setting<u8>,
exact_words: Setting<BTreeSet<String>>,
@ -207,6 +209,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
synonyms: Setting::NotSet,
primary_key: Setting::NotSet,
authorize_typos: Setting::NotSet,
disabled_typos_terms: Setting::NotSet,
exact_words: Setting::NotSet,
min_word_len_two_typos: Setting::NotSet,
min_word_len_one_typo: Setting::NotSet,
@ -354,6 +357,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.min_word_len_one_typo = Setting::Reset;
}
/// Records `disabled_typos_terms` as the new value to apply by storing it as
/// `Setting::Set` on this builder; nothing is written to the index here.
pub fn set_disabled_typos_terms(&mut self, disabled_typos_terms: DisabledTyposTerms) {
self.disabled_typos_terms = Setting::Set(disabled_typos_terms);
}
/// Records `words` as the new exact-words set by storing it as `Setting::Set`
/// on this builder; nothing is written to the index here.
pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
self.exact_words = Setting::Set(words);
}
@ -866,6 +873,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Ok(())
}
/// Applies the pending `disabled_typos_terms` setting to the index.
///
/// * `Setting::NotSet` leaves the stored value untouched.
/// * `Setting::Reset` deletes the stored value.
/// * `Setting::Set` writes the provided value.
///
/// Any error from the underlying index call is propagated.
fn update_disabled_typos_terms(&mut self) -> Result<()> {
    match self.disabled_typos_terms {
        Setting::NotSet => {}
        Setting::Reset => self.index.delete_disabled_typos_terms(self.wtxn)?,
        Setting::Set(terms) => self.index.put_disabled_typos_terms(self.wtxn, &terms)?,
    }

    Ok(())
}
fn update_exact_words(&mut self) -> Result<()> {
match self.exact_words {
Setting::Set(ref mut words) => {
@ -1246,6 +1266,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.update_prefix_search()?;
self.update_facet_search()?;
self.update_localized_attributes_rules()?;
self.update_disabled_typos_terms()?;
let embedding_config_updates = self.update_embedding_configs()?;
@ -1327,6 +1348,7 @@ impl InnerIndexSettingsDiff {
|| old_settings.prefix_search != new_settings.prefix_search
|| old_settings.localized_attributes_rules
!= new_settings.localized_attributes_rules
|| old_settings.disabled_typos_terms != new_settings.disabled_typos_terms
};
let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
@ -1526,6 +1548,7 @@ pub(crate) struct InnerIndexSettings {
pub user_defined_searchable_attributes: Option<Vec<String>>,
pub sortable_fields: HashSet<String>,
pub exact_attributes: HashSet<FieldId>,
pub disabled_typos_terms: DisabledTyposTerms,
pub proximity_precision: ProximityPrecision,
pub embedding_configs: EmbeddingConfigs,
pub geo_fields_ids: Option<(FieldId, FieldId)>,
@ -1574,7 +1597,7 @@ impl InnerIndexSettings {
.map(|fields| fields.into_iter().map(|f| f.to_string()).collect());
let builder = MetadataBuilder::from_index(index, rtxn)?;
let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder);
let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
Ok(Self {
stop_words,
allowed_separators,
@ -1592,6 +1615,7 @@ impl InnerIndexSettings {
geo_fields_ids,
prefix_search,
facet_search,
disabled_typos_terms,
})
}

View file

@ -896,6 +896,7 @@ fn test_correct_settings_init() {
localized_attributes_rules,
prefix_search,
facet_search,
disabled_typos_terms,
} = settings;
assert!(matches!(searchable_fields, Setting::NotSet));
assert!(matches!(displayed_fields, Setting::NotSet));
@ -923,6 +924,7 @@ fn test_correct_settings_init() {
assert!(matches!(localized_attributes_rules, Setting::NotSet));
assert!(matches!(prefix_search, Setting::NotSet));
assert!(matches!(facet_search, Setting::NotSet));
assert!(matches!(disabled_typos_terms, Setting::NotSet));
})
.unwrap();
}