diff --git a/cli/src/main.rs b/cli/src/main.rs
index 503b02887..542b9d472 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -99,8 +99,10 @@ impl Settings {
             })
             .collect();
 
+        let exact_attributes = index.exact_attributes(&txn)?;
+
         println!(
-            "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\n",
+            "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n\t{}\n",
             displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"),
             searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"),
             filterable_attributes.join("\n\t"),
@@ -109,6 +111,7 @@ impl Settings {
             stop_words.join("\n\t"),
             distinct_field.unwrap_or_default(),
             synonyms.into_iter().map(|(k, v)| format!("\n\t{}:\n{:?}", k, v)).collect::<String>(),
+            exact_attributes.join("\n\t"),
         );
         Ok(())
     }
@@ -463,6 +466,8 @@ struct SettingsUpdate {
     filterable_attributes: Option<Vec<String>>,
     #[structopt(long)]
     criteria: Option<Vec<String>>,
+    #[structopt(long)]
+    exact_attributes: Option<Vec<String>>,
 }
 
 impl Performer for SettingsUpdate {
@@ -489,6 +494,14 @@ impl Performer for SettingsUpdate {
             }
         }
 
+        if let Some(exact_attributes) = self.exact_attributes {
+            if !exact_attributes.is_empty() {
+                update.set_exact_attributes(exact_attributes.into_iter().collect());
+            } else {
+                update.reset_exact_attributes();
+            }
+        }
+
         let mut bars = Vec::new();
         let progesses = MultiProgress::new();
         for _ in 0..4 {
diff --git a/infos/src/main.rs b/infos/src/main.rs
index dc98d410d..05c168233 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -29,6 +29,8 @@ const ALL_DATABASE_NAMES: &[&str] = &[
     FACET_ID_STRING_DOCIDS,
     FIELD_ID_DOCID_FACET_F64S,
     FIELD_ID_DOCID_FACET_STRINGS,
+    EXACT_WORD_DOCIDS,
+    EXACT_WORD_PREFIX_DOCIDS,
     DOCUMENTS,
 ];
 
@@ -384,6 +386,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
         field_id_word_count_docids,
         facet_id_f64_docids,
         facet_id_string_docids,
+        exact_word_docids,
+        exact_word_prefix_docids,
         field_id_docid_facet_f64s: _,
         field_id_docid_facet_strings: _,
         documents,
@@ -436,6 +440,14 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
         }
     }
 
+    for result in exact_word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
+        let (word, value) = result?;
+        heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
+        if heap.len() > limit {
+            heap.pop();
+        }
+    }
+
     for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
         let (word, value) = result?;
         heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name)));
@@ -444,6 +456,14 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
         }
     }
 
+    for result in exact_word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
+        let (word, value) = result?;
+        heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name)));
+        if heap.len() > limit {
+            heap.pop();
+        }
+    }
+
     for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? {
         let ((docid, word), value) = result?;
         let key = format!("{} {}", docid, word);
@@ -967,6 +987,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> anyhow::Result<()> {
         facet_id_string_docids,
         field_id_docid_facet_f64s,
         field_id_docid_facet_strings,
+        exact_word_prefix_docids,
+        exact_word_docids,
         documents,
     } = index;
 
@@ -991,6 +1013,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> anyhow::Result<()> {
             FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(),
             FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(),
             FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(),
+            EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(),
+            EXACT_WORD_PREFIX_DOCIDS => exact_word_prefix_docids.as_polymorph(),
             DOCUMENTS => documents.as_polymorph(),
 
             unknown => anyhow::bail!("unknown database {:?}", unknown),
diff --git a/milli/src/index.rs b/milli/src/index.rs
index c0be985da..42170bc80 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -53,12 +53,15 @@ pub mod main_key {
     pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
     pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
     pub const EXACT_WORDS: &str = "exact-words";
+    pub const EXACT_ATTRIBUTES: &str = "exact-attributes";
 }
 
 pub mod db_name {
     pub const MAIN: &str = "main";
     pub const WORD_DOCIDS: &str = "word-docids";
+    pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
     pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
+    pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids";
     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
@@ -82,9 +85,16 @@ pub struct Index {
     /// A word and all the documents ids containing the word.
     pub word_docids: Database<Str, RoaringBitmapCodec>,
+
+    /// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
+    pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
+
     /// A prefix of word and all the documents ids containing this prefix.
     pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
 
+    /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
+    pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
+
     /// Maps a word and a document id (u32) to all the positions where the given word appears.
     pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
@@ -118,13 +128,15 @@ impl Index {
     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(14);
+        options.max_dbs(16);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
         let env = options.open(path)?;
         let main = env.create_poly_database(Some(MAIN))?;
         let word_docids = env.create_database(Some(WORD_DOCIDS))?;
+        let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?;
         let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
+        let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?;
         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
         let word_prefix_pair_proximity_docids =
@@ -145,7 +157,9 @@ impl Index {
             env,
             main,
             word_docids,
+            exact_word_docids,
             word_prefix_docids,
+            exact_word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
             word_prefix_pair_proximity_docids,
@@ -949,6 +963,33 @@ impl Index {
         )?;
         Ok(())
     }
+
+    /// Returns the exact attributes: attributes for which typo is disallowed.
+    pub fn exact_attributes<'t>(&self, txn: &'t RoTxn) -> Result<Vec<&'t str>> {
+        Ok(self
+            .main
+            .get::<_, Str, SerdeBincode<Vec<&str>>>(txn, main_key::EXACT_ATTRIBUTES)?
+            .unwrap_or_default())
+    }
+
+    /// Returns the list of exact attributes field ids.
+    pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result<HashSet<FieldId>> {
+        let attrs = self.exact_attributes(txn)?;
+        let fid_map = self.fields_ids_map(txn)?;
+        Ok(attrs.iter().filter_map(|attr| fid_map.id(attr)).collect())
+    }
+
+    /// Writes the exact attributes to the database.
+    pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> {
+        self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?;
+        Ok(())
+    }
+
+    /// Clears the exact attributes from the store.
+    pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result<()> {
+        self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?;
+        Ok(())
+    }
 }
 
 #[cfg(test)]
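Note (editor): the four methods above are the whole persistence surface for the new setting. A minimal usage sketch, not part of the patch, assuming an open `Index`, the imports this file already uses, and a caller returning `milli::Result<()>`:

```rust
// Read the exact attributes back and resolve them to field ids.
// `exact_attributes` returns the names stored under the `exact-attributes`
// key of the main database; `exact_attributes_ids` maps them through the
// fields ids map, silently skipping names that are not indexed yet.
let rtxn = index.read_txn()?;
let names: Vec<&str> = index.exact_attributes(&rtxn)?;
let ids: HashSet<FieldId> = index.exact_attributes_ids(&rtxn)?;
```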
diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs
index 1dbfd2524..05305d724 100644
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@@ -68,7 +68,9 @@ impl Default for Candidates {
 pub trait Context<'c> {
     fn documents_ids(&self) -> heed::Result<RoaringBitmap>;
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
     fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
     fn word_pair_proximity_docids(
         &self,
         left: &str,
@@ -118,10 +120,18 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
         self.index.word_docids.get(self.rtxn, &word)
     }
 
+    fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
+        self.index.exact_word_docids.get(self.rtxn, &word)
+    }
+
     fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
         self.index.word_prefix_docids.get(self.rtxn, &word)
     }
 
+    fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
+        self.index.exact_word_prefix_docids.get(self.rtxn, &word)
+    }
+
     fn word_pair_proximity_docids(
         &self,
         left: &str,
@@ -392,26 +402,42 @@ fn query_docids(
     wdcache: &mut WordDerivationsCache,
 ) -> Result<RoaringBitmap> {
     match &query.kind {
-        QueryKind::Exact { word, .. } => {
+        QueryKind::Exact { word, original_typo } => {
             if query.prefix && ctx.in_prefix_cache(&word) {
-                Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default())
+                let mut docids = ctx.word_prefix_docids(&word)?.unwrap_or_default();
+                // only add the exact docids if the word hasn't been derived
+                if *original_typo == 0 {
+                    docids |= ctx.exact_word_prefix_docids(&word)?.unwrap_or_default();
+                }
+                Ok(docids)
             } else if query.prefix {
                 let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?;
                 let mut docids = RoaringBitmap::new();
                 for (word, _typo) in words {
-                    let current_docids = ctx.word_docids(&word)?.unwrap_or_default();
-                    docids |= current_docids;
+                    docids |= ctx.word_docids(&word)?.unwrap_or_default();
+                    // only add the exact docids if the word hasn't been derived
+                    if *original_typo == 0 {
+                        docids |= ctx.exact_word_docids(&word)?.unwrap_or_default();
+                    }
                 }
                 Ok(docids)
             } else {
-                Ok(ctx.word_docids(&word)?.unwrap_or_default())
+                let mut docids = ctx.word_docids(&word)?.unwrap_or_default();
+                // only add the exact docids if the word hasn't been derived
+                if *original_typo == 0 {
+                    docids |= ctx.exact_word_docids(&word)?.unwrap_or_default();
+                }
+                Ok(docids)
             }
         }
         QueryKind::Tolerant { typo, word } => {
             let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?;
             let mut docids = RoaringBitmap::new();
-            for (word, _typo) in words {
-                let current_docids = ctx.word_docids(&word)?.unwrap_or_default();
+            for (word, typo) in words {
+                let mut current_docids = ctx.word_docids(&word)?.unwrap_or_default();
+                if *typo == 0 {
+                    current_docids |= ctx.exact_word_docids(&word)?.unwrap_or_default()
+                }
                 docids |= current_docids;
             }
             Ok(docids)
@@ -512,7 +538,9 @@ pub mod test {
     pub struct TestContext<'t> {
         words_fst: fst::Set<Cow<'t, [u8]>>,
        word_docids: HashMap<String, RoaringBitmap>,
+        exact_word_docids: HashMap<String, RoaringBitmap>,
         word_prefix_docids: HashMap<String, RoaringBitmap>,
+        exact_word_prefix_docids: HashMap<String, RoaringBitmap>,
         word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
         word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
         docid_words: HashMap<u32, Vec<String>>,
@@ -527,10 +555,18 @@ pub mod test {
             Ok(self.word_docids.get(&word.to_string()).cloned())
         }
 
+        fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
+            Ok(self.exact_word_docids.get(&word.to_string()).cloned())
+        }
+
         fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
             Ok(self.word_prefix_docids.get(&word.to_string()).cloned())
         }
 
+        fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
+            Ok(self.exact_word_prefix_docids.get(&word.to_string()).cloned())
+        }
+
         fn word_pair_proximity_docids(
             &self,
             left: &str,
@@ -643,6 +679,8 @@ pub mod test {
                 s("morning") => random_postings(rng, 125),
             };
 
+            let exact_word_docids = HashMap::new();
+
             let mut docid_words = HashMap::new();
             for (word, docids) in word_docids.iter() {
                 for docid in docids {
@@ -657,6 +695,8 @@ pub mod test {
                 s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")],
             };
 
+            let exact_word_prefix_docids = HashMap::new();
+
             let mut word_pair_proximity_docids = HashMap::new();
             let mut word_prefix_pair_proximity_docids = HashMap::new();
             for (lword, lcandidates) in &word_docids {
@@ -712,7 +752,9 @@ pub mod test {
             TestContext {
                 words_fst,
                 word_docids,
+                exact_word_docids,
                 word_prefix_docids,
+                exact_word_prefix_docids,
                 word_pair_proximity_docids,
                 word_prefix_pair_proximity_docids,
                 docid_words,
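Note (editor): all three `Exact` branches and the `Tolerant` branch apply the same rule, which condenses to the sketch below (illustrative only; `word_candidates` is not a function of this patch):

```rust
// Candidates for a single query word: regular postings always count,
// but postings coming from exact attributes only count when the word
// reached this point with zero typos, i.e. when it was not derived.
fn word_candidates(ctx: &dyn Context, word: &str, typos: u8) -> heed::Result<RoaringBitmap> {
    let mut docids = ctx.word_docids(word)?.unwrap_or_default();
    if typos == 0 {
        docids |= ctx.exact_word_docids(word)?.unwrap_or_default();
    }
    Ok(docids)
}
```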
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 4eccae8ce..ff9d3f4e9 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1267,6 +1267,7 @@ mod test {
             QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
         );
     }
+
     #[test]
     fn disable_typo_on_word() {
         let query = "goodbye";
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index 644547b91..3665d2313 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -19,7 +19,9 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             env: _env,
             main: _main,
             word_docids,
+            exact_word_docids,
             word_prefix_docids,
+            exact_word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
             word_prefix_pair_proximity_docids,
@@ -55,7 +57,9 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         // Clear the other databases.
         word_docids.clear(self.wtxn)?;
+        exact_word_docids.clear(self.wtxn)?;
         word_prefix_docids.clear(self.wtxn)?;
+        exact_word_prefix_docids.clear(self.wtxn)?;
         docid_word_positions.clear(self.wtxn)?;
         word_pair_proximity_docids.clear(self.wtxn)?;
         word_prefix_pair_proximity_docids.clear(self.wtxn)?;
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 402cc61dd..77c32f0fb 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -2,8 +2,8 @@ use std::collections::btree_map::Entry;
 use std::collections::HashMap;
 
 use fst::IntoStreamer;
-use heed::types::ByteSlice;
-use heed::{BytesDecode, BytesEncode};
+use heed::types::{ByteSlice, Str};
+use heed::{BytesDecode, BytesEncode, Database};
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
@@ -16,7 +16,10 @@ use crate::heed_codec::facet::{
 };
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::index::{db_name, main_key};
-use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
+use crate::{
+    DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32,
+    BEU32,
+};
 
 pub struct DeleteDocuments<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -108,7 +111,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             env: _env,
             main: _main,
             word_docids,
+            exact_word_docids,
             word_prefix_docids,
+            exact_word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
             field_id_word_count_docids,
@@ -204,25 +209,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         // We iterate over the words and delete the documents ids
         // from the word docids database.
         for (word, must_remove) in &mut words {
-            // We create an iterator to be able to get the content and delete the word docids.
-            // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
-            // the LMDB B-Tree two times but only once.
-            let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?;
-            if let Some((key, mut docids)) = iter.next().transpose()? {
-                if key == word.as_str() {
-                    let previous_len = docids.len();
-                    docids -= &self.documents_ids;
-                    if docids.is_empty() {
-                        // safety: we don't keep references from inside the LMDB database.
-                        unsafe { iter.del_current()? };
-                        *must_remove = true;
-                    } else if docids.len() != previous_len {
-                        let key = key.to_owned();
-                        // safety: we don't keep references from inside the LMDB database.
-                        unsafe { iter.put_current(&key, &docids)? };
-                    }
-                }
-            }
+            remove_from_word_docids(
+                self.wtxn,
+                word_docids,
+                word.as_str(),
+                must_remove,
+                &self.documents_ids,
+            )?;
+
+            remove_from_word_docids(
+                self.wtxn,
+                exact_word_docids,
+                word.as_str(),
+                must_remove,
+                &self.documents_ids,
+            )?;
         }
 
         // We construct an FST set that contains the words to delete from the words FST.
@@ -254,34 +255,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         // We write the new words FST into the main database.
         self.index.put_words_fst(self.wtxn, &new_words_fst)?;
 
-        // We iterate over the word prefix docids database and remove the deleted documents ids
-        // from every docids lists. We register the empty prefixes in an fst Set for futur deletion.
-        let mut prefixes_to_delete = fst::SetBuilder::memory();
-        let mut iter = word_prefix_docids.iter_mut(self.wtxn)?;
-        while let Some(result) = iter.next() {
-            let (prefix, mut docids) = result?;
-            let prefix = prefix.to_owned();
-            let previous_len = docids.len();
-            docids -= &self.documents_ids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-                prefixes_to_delete.insert(prefix)?;
-            } else if docids.len() != previous_len {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&prefix, &docids)? };
-            }
-        }
+        let prefixes_to_delete =
+            remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?;
 
-        drop(iter);
+        let exact_prefix_to_delete = remove_from_word_prefix_docids(
+            self.wtxn,
+            exact_word_prefix_docids,
+            &self.documents_ids,
+        )?;
+
+        let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union();
 
         // We compute the new prefix FST and write it only if there is a change.
-        let prefixes_to_delete = prefixes_to_delete.into_set();
-        if !prefixes_to_delete.is_empty() {
+        if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() {
             let new_words_prefixes_fst = {
                 // We retrieve the current words prefixes FST from the database.
                 let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?;
-                let difference = words_prefixes_fst.op().add(&prefixes_to_delete).difference();
+                let difference =
+                    words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference();
 
                 // We stream the new external ids that does no more contains the to-delete external ids.
                 let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory();
@@ -457,6 +448,64 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
     }
 }
 
+fn remove_from_word_prefix_docids(
+    txn: &mut heed::RwTxn,
+    db: &Database<Str, RoaringBitmapCodec>,
+    to_remove: &RoaringBitmap,
+) -> Result<fst::Set<Vec<u8>>> {
+    let mut prefixes_to_delete = fst::SetBuilder::memory();
+
+    // We iterate over the word prefix docids database and remove the deleted documents ids
+    // from every docids list. We register the empty prefixes in an fst Set for future deletion.
+    let mut iter = db.iter_mut(txn)?;
+    while let Some(result) = iter.next() {
+        let (prefix, mut docids) = result?;
+        let prefix = prefix.to_owned();
+        let previous_len = docids.len();
+        docids -= to_remove;
+        if docids.is_empty() {
+            // safety: we don't keep references from inside the LMDB database.
+            unsafe { iter.del_current()? };
+            prefixes_to_delete.insert(prefix)?;
+        } else if docids.len() != previous_len {
+            // safety: we don't keep references from inside the LMDB database.
+            unsafe { iter.put_current(&prefix, &docids)? };
        }
+    }
+
+    Ok(prefixes_to_delete.into_set())
+}
+
+fn remove_from_word_docids(
+    txn: &mut heed::RwTxn,
+    db: &heed::Database<Str, RoaringBitmapCodec>,
+    word: &str,
+    must_remove: &mut bool,
+    to_remove: &RoaringBitmap,
+) -> Result<()> {
+    // We create an iterator to be able to get the content and delete the word docids.
+    // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
+    // the LMDB B-Tree two times but only once.
+    let mut iter = db.prefix_iter_mut(txn, &word)?;
+    if let Some((key, mut docids)) = iter.next().transpose()? {
+        if key == word {
+            let previous_len = docids.len();
+            docids -= to_remove;
+            if docids.is_empty() {
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.del_current()? };
+                *must_remove = true;
+            } else if docids.len() != previous_len {
+                let key = key.to_owned();
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.put_current(&key, &docids)? };
+            }
+        }
+    }
+
+    Ok(())
+}
+
 fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>(
     wtxn: &'a mut heed::RwTxn,
     db: &heed::Database<C>,
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 80d68298a..f3a44162b 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -1,3 +1,4 @@
+use std::collections::HashSet;
 use std::fs::File;
 use std::io;
 use std::iter::FromIterator;
@@ -10,17 +11,22 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::Result;
+use crate::update::index_documents::helpers::read_u32_ne_bytes;
+use crate::{relative_from_absolute_position, FieldId, Result};
 
 /// Extracts the word and the documents ids where this word appear.
 ///
 /// Returns a grenad reader with the list of extracted words and
 /// documents ids from the given chunk of docid word positions.
+///
+/// The first returned reader is the one for normal word_docids, and the second one is for
+/// exact_word_docids.
 #[logging_timer::time]
 pub fn extract_word_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+    exact_attributes: &HashSet<FieldId>,
+) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
     let max_memory = indexer.max_memory_by_thread();
 
     let mut word_docids_sorter = create_sorter(
@@ -28,20 +34,53 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory,
+        max_memory.map(|x| x / 2),
+    );
+
+    let mut exact_word_docids_sorter = create_sorter(
+        merge_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|x| x / 2),
     );
 
     let mut value_buffer = Vec::new();
     let mut cursor = docid_word_positions.into_cursor()?;
-    while let Some((key, _value)) = cursor.move_on_next()? {
+    while let Some((key, positions)) = cursor.move_on_next()? {
         let (document_id_bytes, word_bytes) = try_split_array_at(key)
             .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);
 
         let bitmap = RoaringBitmap::from_iter(Some(document_id));
         serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
+
+        // If there are no exact attributes, we do not need to iterate over positions.
+        if exact_attributes.is_empty() {
+            word_docids_sorter.insert(word_bytes, &value_buffer)?;
+        } else {
+            let mut added_to_exact = false;
+            let mut added_to_word_docids = false;
+            for position in read_u32_ne_bytes(positions) {
+                // as soon as we know that this word has been added to both sorters, we don't
+                // need to iterate over the positions.
+                if added_to_exact && added_to_word_docids {
+                    break;
+                }
+                let (fid, _) = relative_from_absolute_position(position);
+                if exact_attributes.contains(&fid) && !added_to_exact {
+                    exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
+                    added_to_exact = true;
+                } else if !added_to_word_docids {
+                    word_docids_sorter.insert(word_bytes, &value_buffer)?;
+                    added_to_word_docids = true;
+                }
+            }
+        }
     }
 
-    sorter_into_reader(word_docids_sorter, indexer)
+    Ok((
+        sorter_into_reader(word_docids_sorter, indexer)?,
+        sorter_into_reader(exact_word_docids_sorter, indexer)?,
+    ))
 }
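Note (editor): the position loop above relies on milli's absolute-position layout, where the high 16 bits carry the field id and the low 16 bits the position of the word inside that field. Under that assumption (the real helper lives in milli's lib.rs and is not shown in this patch), `relative_from_absolute_position` behaves like the illustrative reimplementation below:

```rust
type FieldId = u16;
type RelativePosition = u16;

// Split an absolute word position into (field id, position within the field).
fn relative_from_absolute_position(absolute: u32) -> (FieldId, RelativePosition) {
    ((absolute >> 16) as FieldId, (absolute & 0xFFFF) as RelativePosition)
}
```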
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 4c81b9334..8f6797a3b 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -26,7 +26,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
 use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{
     as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
-    merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
+    merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader,
 };
 use super::{helpers, TypedChunk};
 use crate::{FieldId, Result};
@@ -43,6 +43,7 @@ pub(crate) fn data_from_obkv_documents(
     geo_field_id: Option<FieldId>,
     stop_words: Option<fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
+    exact_attributes: HashSet<FieldId>,
 ) -> Result<()> {
     let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks
         .par_bridge()
@@ -66,7 +67,7 @@ pub(crate) fn data_from_obkv_documents(
         (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks),
     ) = result?;
 
-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
        docid_word_positions_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),
@@ -76,7 +77,7 @@ pub(crate) fn data_from_obkv_documents(
         "word-pair-proximity-docids",
     );
 
-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_word_positions_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),
@@ -86,17 +87,20 @@ pub(crate) fn data_from_obkv_documents(
         "field-id-wordcount-docids",
     );
 
-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
         docid_word_positions_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),
-        extract_word_docids,
+        move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
         merge_roaring_bitmaps,
-        TypedChunk::WordDocids,
+        |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
+            word_docids_reader,
+            exact_word_docids_reader,
+        },
         "word-docids",
     );
 
-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_word_positions_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),
@@ -106,7 +110,7 @@ pub(crate) fn data_from_obkv_documents(
         "word-position-docids",
     );
 
-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_fid_facet_strings_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),
@@ -116,7 +120,7 @@ pub(crate) fn data_from_obkv_documents(
         "field-id-facet-string-docids",
     );
 
-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_fid_facet_numbers_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),
@@ -133,7 +137,7 @@ pub(crate) fn data_from_obkv_documents(
 /// Generated grenad chunks are merged using the merge_fn.
 /// The result of merged chunks is serialized as TypedChunk using the serialize_fn
 /// and sent into lmdb_writer_sx.
-fn spawn_extraction_task<FE, FS>(
+fn spawn_extraction_task<FE, FS, M>(
     chunks: Vec<grenad::Reader<CursorClonableMmap>>,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
@@ -142,19 +146,21 @@ fn spawn_extraction_task<FE, FS, M>(
     serialize_fn: FS,
     name: &'static str,
 ) where
-    FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<grenad::Reader<File>> + Sync + Send + 'static,
+    FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M::Output> + Sync + Send + 'static,
-    FS: Fn(grenad::Reader<File>) -> TypedChunk + Sync + Send + 'static,
+    FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static,
+    M: MergeableReader + FromParallelIterator<M::Output> + Send + 'static,
+    M::Output: Send,
 {
     rayon::spawn(move || {
-        let chunks: Result<Vec<_>> =
+        let chunks: Result<M> =
             chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect();
         rayon::spawn(move || match chunks {
             Ok(chunks) => {
                 debug!("merge {} database", name);
-                let reader = merge_readers(chunks, merge_fn, indexer);
+                let reader = chunks.merge(merge_fn, &indexer);
                 let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r)));
             }
             Err(e) => {
diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index e0ac3a175..9d5a67d78 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -78,25 +78,62 @@ pub unsafe fn as_cloneable_grenad(
     Ok(reader)
 }
 
-pub fn merge_readers<R: io::Read + io::Seek>(
-    readers: Vec<grenad::Reader<R>>,
-    merge_fn: MergeFn,
-    indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
-    let mut merger_builder = grenad::MergerBuilder::new(merge_fn);
-    for reader in readers {
-        merger_builder.push(reader.into_cursor()?);
+pub trait MergeableReader
+where
+    Self: Sized,
+{
+    type Output;
+
+    fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
+}
+
+impl MergeableReader for Vec<grenad::Reader<File>> {
+    type Output = grenad::Reader<File>;
+
+    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
+        let mut merger = MergerBuilder::new(merge_fn);
+        self.into_iter().try_for_each(|r| merger.push(r))?;
+        merger.finish(params)
+    }
+}
+
+impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
+    type Output = (grenad::Reader<File>, grenad::Reader<File>);
+
+    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
+        let mut m1 = MergerBuilder::new(merge_fn);
+        let mut m2 = MergerBuilder::new(merge_fn);
+        for (r1, r2) in self.into_iter() {
+            m1.push(r1)?;
+            m2.push(r2)?;
+        }
+        Ok((m1.finish(params)?, m2.finish(params)?))
+    }
+}
+
+struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
+
+impl<R: io::Read + io::Seek> MergerBuilder<R> {
+    fn new(merge_fn: MergeFn) -> Self {
+        Self(grenad::MergerBuilder::new(merge_fn))
     }
-    let merger = merger_builder.build();
-    let mut writer = create_writer(
-        indexer.chunk_compression_type,
-        indexer.chunk_compression_level,
-        tempfile::tempfile()?,
-    );
-    merger.write_into_stream_writer(&mut writer)?;
 
-    Ok(writer_into_reader(writer)?)
+    fn push(&mut self, reader: grenad::Reader<R>) -> Result<()> {
+        self.0.push(reader.into_cursor()?);
+        Ok(())
+    }
+
+    fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
+        let merger = self.0.build();
+        let mut writer = create_writer(
+            params.chunk_compression_type,
+            params.chunk_compression_level,
+            tempfile::tempfile()?,
+        );
+        merger.write_into_stream_writer(&mut writer)?;
+
+        Ok(writer_into_reader(writer)?)
+    }
 }
 
 #[derive(Debug, Clone, Copy)]
@@ -240,3 +277,8 @@ pub fn sorter_into_lmdb_database(
     debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
     Ok(())
 }
+
+/// Used when trying to merge readers, but you don't actually care about the values.
+pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+    Ok(Cow::Owned(Vec::new()))
+}
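Note (editor): the point of `MergeableReader` is that `spawn_extraction_task` can now merge whatever shape a chunk extractor produces. A hedged usage sketch (`merge_word_docids_chunks` is not part of the patch; `GrenadParameters`, `MergeFn`, and `merge_roaring_bitmaps` are the helpers defined in this crate):

```rust
use std::fs::File;

// Merge the per-chunk (word_docids, exact_word_docids) pairs produced by
// `extract_word_docids` into one reader per database, preserving the split.
fn merge_word_docids_chunks(
    chunks: Vec<(grenad::Reader<File>, grenad::Reader<File>)>,
    params: &GrenadParameters,
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
    chunks.merge(merge_roaring_bitmaps, params)
}
```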
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index 22c1cfd6c..79d0d0466 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto};
 pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
-    as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_readers,
-    sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
-    GrenadParameters,
+    as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
+    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database,
+    writer_into_reader, GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv,
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 2d3004444..0e6e59e10 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -4,11 +4,13 @@ mod transform;
 mod typed_chunk;
 
 use std::collections::HashSet;
-use std::io::{Read, Seek};
+use std::io::{Cursor, Read, Seek};
 use std::iter::FromIterator;
 use std::num::{NonZeroU32, NonZeroUsize};
 
 use crossbeam_channel::{Receiver, Sender};
+use heed::types::Str;
+use heed::Database;
 use log::debug;
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
@@ -28,7 +30,7 @@ use crate::update::{
     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
 };
-use crate::{Index, Result};
+use crate::{Index, Result, RoaringBitmapCodec};
 
 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 5;
@@ -226,6 +228,7 @@ where
         };
 
         let stop_words = self.index.stop_words(self.wtxn)?;
+        let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
 
         // Run extraction pipeline in parallel.
         pool.install(|| {
@@ -255,6 +258,7 @@ where
                 geo_field_id,
                 stop_words,
                 self.indexer_config.max_positions_per_attributes,
+                exact_attributes,
             )
         });
 
@@ -282,6 +286,7 @@ where
         let mut word_pair_proximity_docids = None;
         let mut word_position_docids = None;
         let mut word_docids = None;
+        let mut exact_word_docids = None;
 
         let mut databases_seen = 0;
         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
@@ -291,10 +296,13 @@ where
 
         for result in lmdb_writer_rx {
             let typed_chunk = match result? {
-                TypedChunk::WordDocids(chunk) => {
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
                     word_docids = Some(cloneable_chunk);
-                    TypedChunk::WordDocids(chunk)
+                    let cloneable_chunk =
+                        unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
+                    exact_word_docids = Some(cloneable_chunk);
+                    TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
                 }
                 TypedChunk::WordPairProximityDocids(chunk) => {
                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
@@ -346,6 +354,7 @@ where
 
         self.execute_prefix_databases(
             word_docids,
+            exact_word_docids,
             word_pair_proximity_docids,
             word_position_docids,
         )?;
@@ -357,6 +366,7 @@ where
     pub fn execute_prefix_databases(
         self,
         word_docids: Option<grenad::Reader<CursorClonableMmap>>,
+        exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
         word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>,
         word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
     ) -> Result<()>
@@ -425,14 +435,25 @@ where
         });
 
         if let Some(word_docids) = word_docids {
-            // Run the word prefix docids update operation.
-            let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
-            builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
-            builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
-            builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
-            builder.max_memory = self.indexer_config.max_memory;
-            builder.execute(
+            execute_word_prefix_docids(
+                self.wtxn,
                 word_docids,
+                self.index.word_docids.clone(),
+                self.index.word_prefix_docids.clone(),
+                &self.indexer_config,
+                &new_prefix_fst_words,
+                &common_prefix_fst_words,
+                &del_prefix_fst_words,
+            )?;
+        }
+
+        if let Some(exact_word_docids) = exact_word_docids {
+            execute_word_prefix_docids(
+                self.wtxn,
+                exact_word_docids,
+                self.index.exact_word_docids.clone(),
+                self.index.exact_word_prefix_docids.clone(),
+                &self.indexer_config,
                 &new_prefix_fst_words,
                 &common_prefix_fst_words,
                 &del_prefix_fst_words,
@@ -497,6 +518,32 @@ where
     }
 }
 
+/// Run the word prefix docids update operation.
+fn execute_word_prefix_docids(
+    txn: &mut heed::RwTxn,
+    reader: grenad::Reader<Cursor<ClonableMmap>>,
+    word_docids_db: Database<Str, RoaringBitmapCodec>,
+    word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
+    indexer_config: &IndexerConfig,
+    new_prefix_fst_words: &[String],
+    common_prefix_fst_words: &[&[String]],
+    del_prefix_fst_words: &HashSet<Vec<u8>>,
+) -> Result<()> {
+    let cursor = reader.into_cursor()?;
+    let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
+    builder.chunk_compression_type = indexer_config.chunk_compression_type;
+    builder.chunk_compression_level = indexer_config.chunk_compression_level;
+    builder.max_nb_chunks = indexer_config.max_nb_chunks;
+    builder.max_memory = indexer_config.max_memory;
+    builder.execute(
+        cursor,
+        &new_prefix_fst_words,
+        &common_prefix_fst_words,
+        &del_prefix_fst_words,
+    )?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
     use std::io::Cursor;
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 77ea31138..26b97c3a0 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -3,14 +3,16 @@ use std::convert::TryInto;
 use std::fs::File;
 use std::io;
 
+use grenad::MergerBuilder;
 use heed::types::ByteSlice;
 use heed::{BytesDecode, RwTxn};
 use roaring::RoaringBitmap;
 
 use super::helpers::{
-    self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
-    CursorClonableMmap,
+    self, merge_ignore_values, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap,
+    valid_lmdb_key, CursorClonableMmap,
 };
+use super::{ClonableMmap, MergeFn};
 use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
 use crate::update::index_documents::helpers::as_cloneable_grenad;
 use crate::{
@@ -25,7 +27,10 @@ pub(crate) enum TypedChunk {
     Documents(grenad::Reader<CursorClonableMmap>),
     FieldIdWordcountDocids(grenad::Reader<File>),
     NewDocumentsIds(RoaringBitmap),
-    WordDocids(grenad::Reader<File>),
+    WordDocids {
+        word_docids_reader: grenad::Reader<File>,
+        exact_word_docids_reader: grenad::Reader<File>,
+    },
     WordPositionDocids(grenad::Reader<File>),
     WordPairProximityDocids(grenad::Reader<File>),
     FieldIdFacetStringDocids(grenad::Reader<File>),
@@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index(
         TypedChunk::NewDocumentsIds(documents_ids) => {
             return Ok((documents_ids, is_merged_database))
         }
-        TypedChunk::WordDocids(word_docids_iter) => {
-            let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?;
+        TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+            let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
             append_entries_into_database(
                 word_docids_iter.clone(),
                 &index.word_docids,
@@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index(
                 merge_roaring_bitmaps,
             )?;
 
+            let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
+            append_entries_into_database(
+                exact_word_docids_iter.clone(),
+                &index.exact_word_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_roaring_bitmaps,
+            )?;
+
             // create fst from word docids
-            let mut builder = fst::SetBuilder::memory();
-            let mut cursor = word_docids_iter.into_cursor()?;
-            while let Some((word, _value)) = cursor.move_on_next()? {
-                // This is a lexicographically ordered word position
-                // we use the key to construct the words fst.
-                builder.insert(word)?;
-            }
-            let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?;
+            let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
             let db_fst = index.words_fst(wtxn)?;
 
             // merge new fst with database fst
@@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index(
     Ok((RoaringBitmap::new(), is_merged_database))
 }
 
+fn merge_word_docids_reader_into_fst(
+    word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
+    exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
+) -> Result<fst::Set<Vec<u8>>> {
+    let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
+    merger_builder.push(word_docids_iter.into_cursor()?);
+    merger_builder.push(exact_word_docids_iter.into_cursor()?);
+    let mut iter = merger_builder.build().into_stream_merger_iter()?;
+    let mut builder = fst::SetBuilder::memory();
+
+    while let Some((k, _)) = iter.next()? {
+        builder.insert(k)?;
+    }
+
+    Ok(builder.into_set())
+}
+
 fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
     let new_value = RoaringBitmap::deserialize_from(new_value)?;
     let db_value = RoaringBitmap::deserialize_from(db_value)?;
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 503fbd06e..7a26361d4 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -93,6 +93,8 @@ pub struct Settings<'a, 't, 'u, 'i> {
     min_word_len_two_typos: Setting<u8>,
     min_word_len_one_typo: Setting<u8>,
     exact_words: Setting<BTreeSet<String>>,
+    /// Attributes on which typo tolerance is disabled.
+    exact_attributes: Setting<HashSet<String>>,
 }
 
 impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@@ -117,6 +119,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             exact_words: Setting::NotSet,
             min_word_len_two_typos: Setting::Reset,
             min_word_len_one_typo: Setting::Reset,
+            exact_attributes: Setting::Reset,
             indexer_config,
         }
     }
@@ -226,6 +229,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         self.exact_words = Setting::Reset;
     }
 
+    pub fn set_exact_attributes(&mut self, attrs: HashSet<String>) {
+        self.exact_attributes = Setting::Set(attrs);
+    }
+
+    pub fn reset_exact_attributes(&mut self) {
+        self.exact_attributes = Setting::Reset;
+    }
+
     fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
     where
         F: Fn(UpdateIndexingStep) + Sync,
@@ -411,6 +422,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         }
     }
 
+    fn update_exact_attributes(&mut self) -> Result<bool> {
+        match self.exact_attributes {
+            Setting::Set(ref attrs) => {
+                let attrs = attrs.iter().map(String::as_str).collect::<Vec<_>>();
+                self.index.put_exact_attributes(&mut self.wtxn, &attrs)?;
+                Ok(true)
+            }
+            Setting::Reset => {
+                self.index.delete_exact_attributes(&mut self.wtxn)?;
+                Ok(true)
+            }
+            Setting::NotSet => Ok(false),
+        }
+    }
+
     fn update_filterable(&mut self) -> Result<()> {
         match self.filterable_fields {
             Setting::Set(ref fields) => {
@@ -579,8 +605,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         let stop_words_updated = self.update_stop_words()?;
         let synonyms_updated = self.update_synonyms()?;
         let searchable_updated = self.update_searchable()?;
+        let exact_attributes_updated = self.update_exact_attributes()?;
 
-        if stop_words_updated || faceted_updated || synonyms_updated || searchable_updated {
+        if stop_words_updated
+            || faceted_updated
+            || synonyms_updated
+            || searchable_updated
+            || exact_attributes_updated
+        {
             self.reindex(&progress_callback, old_fields_ids_map)?;
         }
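Note (editor): end to end, the setting is driven through the `Settings` builder like every other setting; a minimal sketch, assuming an open `index` and a caller returning `milli::Result<()>` (this mirrors the test added at the bottom of this patch):

```rust
let mut wtxn = index.write_txn()?;
let config = IndexerConfig::default();
let mut builder = Settings::new(&mut wtxn, &index, &config);
// Disable typo tolerance on `title` only; executing triggers a reindex.
builder.set_exact_attributes(["title".to_string()].into_iter().collect());
builder.execute(|_| ())?;
wtxn.commit()?;
```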
diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs
index 076816f09..2887b5583 100644
--- a/milli/src/update/word_prefix_docids.rs
+++ b/milli/src/update/word_prefix_docids.rs
@@ -1,16 +1,18 @@
 use std::collections::{HashMap, HashSet};
 
 use grenad::CompressionType;
-use heed::types::ByteSlice;
+use heed::types::{ByteSlice, Str};
+use heed::Database;
 
 use crate::update::index_documents::{
     create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, MergeFn,
 };
-use crate::{Index, Result};
+use crate::{Result, RoaringBitmapCodec};
 
 pub struct WordPrefixDocids<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
-    index: &'i Index,
+    word_docids: Database<Str, RoaringBitmapCodec>,
+    word_prefix_docids: Database<Str, RoaringBitmapCodec>,
     pub(crate) chunk_compression_type: CompressionType,
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) max_nb_chunks: Option<usize>,
@@ -20,11 +22,13 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
 impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
     pub fn new(
         wtxn: &'t mut heed::RwTxn<'i, 'u>,
-        index: &'i Index,
+        word_docids: Database<Str, RoaringBitmapCodec>,
+        word_prefix_docids: Database<Str, RoaringBitmapCodec>,
     ) -> WordPrefixDocids<'t, 'u, 'i> {
         WordPrefixDocids {
             wtxn,
-            index,
+            word_docids,
+            word_prefix_docids,
             chunk_compression_type: CompressionType::None,
             chunk_compression_level: None,
             max_nb_chunks: None,
@@ -35,7 +39,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
     #[logging_timer::time("WordPrefixDocids::{}")]
     pub fn execute(
         self,
-        new_word_docids: grenad::Reader<CursorClonableMmap>,
+        mut new_word_docids_iter: grenad::ReaderCursor<CursorClonableMmap>,
         new_prefix_fst_words: &[String],
         common_prefix_fst_words: &[&[String]],
         del_prefix_fst_words: &HashSet<Vec<u8>>,
@@ -51,7 +55,6 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         );
 
         if !common_prefix_fst_words.is_empty() {
-            let mut new_word_docids_iter = new_word_docids.into_cursor()?;
             let mut current_prefixes: Option<&&[String]> = None;
             let mut prefixes_cache = HashMap::new();
             while let Some((word, data)) = new_word_docids_iter.move_on_next()? {
@@ -84,7 +87,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         }
 
         // We fetch the docids associated to the newly added word prefix fst only.
-        let db = self.index.word_docids.remap_data_type::<ByteSlice>();
+        let db = self.word_docids.remap_data_type::<ByteSlice>();
         for prefix in new_prefix_fst_words {
             let prefix = std::str::from_utf8(prefix.as_bytes())?;
             for result in db.prefix_iter(self.wtxn, prefix)? {
@@ -94,7 +97,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         }
 
         // We remove all the entries that are no more required in this word prefix docids database.
-        let mut iter = self.index.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data();
+        let mut iter = self.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data();
         while let Some((prefix, _)) = iter.next().transpose()? {
             if del_prefix_fst_words.contains(prefix.as_bytes()) {
                 unsafe { iter.del_current()? };
@@ -106,7 +109,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
 
         // We finally write the word prefix docids into the LMDB database.
         sorter_into_lmdb_database(
             self.wtxn,
-            *self.index.word_prefix_docids.as_polymorph(),
+            *self.word_prefix_docids.as_polymorph(),
             prefix_docids_sorter,
             merge_roaring_bitmaps,
         )?;
diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs
index ef080db9f..786fdbcae 100644
--- a/milli/tests/search/query_criteria.rs
+++ b/milli/tests/search/query_criteria.rs
@@ -373,7 +373,7 @@ fn criteria_mixup() {
 fn criteria_ascdesc() {
     let path = tempfile::tempdir().unwrap();
     let mut options = EnvOpenOptions::new();
-    options.map_size(10 * 1024 * 1024); // 10 MB
+    options.map_size(12 * 1024 * 1024); // 12 MB
     let index = Index::new(options, &path).unwrap();
 
     let mut wtxn = index.write_txn().unwrap();
diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs
index df15fb768..35cc4b4c2 100644
--- a/milli/tests/search/typo_tolerance.rs
+++ b/milli/tests/search/typo_tolerance.rs
@@ -170,3 +170,41 @@ fn test_typo_disabled_on_word() {
     let result = search.execute().unwrap();
     assert_eq!(result.documents_ids.len(), 1);
 }
+
+#[test]
+fn test_disable_typo_on_attribute() {
+    let criteria = [Typo];
+    let index = super::setup_search_index_with_criteria(&criteria);
+
+    // basic typo search with default typo settings
+    {
+        let txn = index.read_txn().unwrap();
+
+        let mut search = Search::new(&txn, &index);
+        // typo in `antebel(l)um`
+        search.query("antebelum");
+        search.limit(10);
+        search.authorize_typos(true);
+        search.optional_words(true);
+
+        let result = search.execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+    }
+
+    let mut txn = index.write_txn().unwrap();
+
+    let config = IndexerConfig::default();
+    let mut builder = Settings::new(&mut txn, &index, &config);
+    // disable typos on `description`
+    builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect());
+    builder.execute(|_| ()).unwrap();
+
+    let mut search = Search::new(&txn, &index);
+    search.query("antebelum");
+    search.limit(10);
+    search.authorize_typos(true);
+    search.optional_words(true);
+
+    let result = search.execute().unwrap();
+    assert_eq!(result.documents_ids.len(), 0);
+}
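Note (editor): a natural companion assertion, not included in the patch: with typos disabled on `description`, the exactly spelled query must still match, since exact postings are only skipped for derived words.

```rust
// Continuing the test above, in the same transaction.
let mut search = Search::new(&txn, &index);
search.query("antebellum"); // exact spelling, zero typos
search.limit(10);
search.authorize_typos(true);
search.optional_words(true);

let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1);
```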