From 7e9d56a9e75391724f2c24a6f892a17dd7c30c5b Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 16:25:15 +0100 Subject: [PATCH 01/28] disable typos on exact words --- milli/src/search/query_tree.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 4eccae8ce..7d13f27a3 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -623,7 +623,9 @@ mod test { } fn exact_words(&self) -> crate::Result>> { - Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap()) + let builder = fst::SetBuilder::new(Vec::new()).unwrap(); + let data = builder.into_inner().unwrap(); + Ok(fst::Set::new(Cow::Owned(data)).unwrap()) } } From c882d8daf0dd174c8bb8c51734493e6814780d24 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 22 Mar 2022 09:55:49 +0100 Subject: [PATCH 02/28] add test for exact words --- milli/src/search/query_tree.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 7d13f27a3..ff9d3f4e9 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -623,9 +623,7 @@ mod test { } fn exact_words(&self) -> crate::Result>> { - let builder = fst::SetBuilder::new(Vec::new()).unwrap(); - let data = builder.into_inner().unwrap(); - Ok(fst::Set::new(Cow::Owned(data)).unwrap()) + Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap()) } } @@ -1269,6 +1267,7 @@ mod test { QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() } ); } + #[test] fn disable_typo_on_word() { let query = "goodbye"; From f82d4b36eb37212df5b3b9f42120fcef50419108 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 22 Mar 2022 19:07:59 +0100 Subject: [PATCH 03/28] introduce exact attribute setting --- milli/src/index.rs | 18 ++++++++++++++++++ milli/src/update/settings.rs | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index c0be985da..f4e17d93c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -53,6 +53,7 @@ pub mod main_key { pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; pub const EXACT_WORDS: &str = "exact-words"; + pub const EXACT_ATTRIBUTES: &str = "exact-attributes"; } pub mod db_name { @@ -949,6 +950,23 @@ impl Index { )?; Ok(()) } + + pub fn exact_attributes<'t>(&self, txn: &'t RoTxn) -> Result> { + Ok(self + .main + .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? + .unwrap_or_default()) + } + + pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; + Ok(()) + } + + pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result<()> { + self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?; + Ok(()) + } } #[cfg(test)] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 503fbd06e..3ed2a4152 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -93,6 +93,8 @@ pub struct Settings<'a, 't, 'u, 'i> { min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, exact_words: Setting>, + /// attributes on which typo tolerance is not enabled. 
+ exact_attributes: Setting>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -117,6 +119,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { exact_words: Setting::NotSet, min_word_len_two_typos: Setting::Reset, min_word_len_one_typo: Setting::Reset, + exact_attributes: Setting::Reset, indexer_config, } } @@ -226,6 +229,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.exact_words = Setting::Reset; } + pub fn set_exact_attributes(&mut self, attrs: HashSet) { + self.exact_attributes = Setting::Set(attrs); + } + + pub fn reset_exact_attributes(&mut self) { + self.exact_attributes = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -411,6 +422,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + fn update_exact_attributes(&mut self) -> Result { + match self.exact_attributes { + Setting::Set(ref attrs) => { + let attrs = attrs.iter().map(String::as_str).collect::>(); + self.index.put_exact_attributes(&mut self.wtxn, &attrs)?; + Ok(true) + } + Setting::Reset => { + self.index.delete_exact_attributes(&mut self.wtxn)?; + Ok(true) + } + Setting::NotSet => Ok(false), + } + } + fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { @@ -579,8 +605,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let stop_words_updated = self.update_stop_words()?; let synonyms_updated = self.update_synonyms()?; let searchable_updated = self.update_searchable()?; + let exact_attributes_updated = self.update_exact_attributes()?; - if stop_words_updated || faceted_updated || synonyms_updated || searchable_updated { + if stop_words_updated + || faceted_updated + || synonyms_updated + || searchable_updated + || exact_attributes_updated + { self.reindex(&progress_callback, old_fields_ids_map)?; } From 5f9f82757dbebec7087cd56b2e624e372c3bbb4f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 23 Mar 2022 14:48:15 +0100 Subject: [PATCH 04/28] refactor spawn_extraction_task --- .../src/update/index_documents/extract/mod.rs | 26 +++---- .../index_documents/helpers/grenad_helpers.rs | 69 ++++++++++++++----- .../src/update/index_documents/helpers/mod.rs | 4 +- 3 files changed, 69 insertions(+), 30 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 4c81b9334..100431237 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -26,7 +26,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, - merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, + merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ -66,7 +66,7 @@ pub(crate) fn data_from_obkv_documents( (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), ) = result?; - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -76,7 +76,7 @@ pub(crate) fn data_from_obkv_documents( "word-pair-proximity-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), 
lmdb_writer_sx.clone(), @@ -86,7 +86,7 @@ pub(crate) fn data_from_obkv_documents( "field-id-wordcount-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -96,7 +96,7 @@ pub(crate) fn data_from_obkv_documents( "word-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -106,7 +106,7 @@ pub(crate) fn data_from_obkv_documents( "word-position-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -116,7 +116,7 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-string-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_numbers_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents( /// Generated grenad chunks are merged using the merge_fn. /// The result of merged chunks is serialized as TypedChunk using the serialize_fn /// and sent into lmdb_writer_sx. -fn spawn_extraction_task( +fn spawn_extraction_task( chunks: Vec>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, @@ -142,19 +142,21 @@ fn spawn_extraction_task( serialize_fn: FS, name: &'static str, ) where - FE: Fn(grenad::Reader, GrenadParameters) -> Result> + FE: Fn(grenad::Reader, GrenadParameters) -> Result + Sync + Send + 'static, - FS: Fn(grenad::Reader) -> TypedChunk + Sync + Send + 'static, + FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static, + M: MergeableReader + FromParallelIterator + Send + 'static, + M::Output: Send, { rayon::spawn(move || { - let chunks: Result> = + let chunks: Result = chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect(); rayon::spawn(move || match chunks { Ok(chunks) => { debug!("merge {} database", name); - let reader = merge_readers(chunks, merge_fn, indexer); + let reader = chunks.merge(merge_fn, &indexer); let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))); } Err(e) => { diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index e0ac3a175..fc28860b2 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -78,25 +78,62 @@ pub unsafe fn as_cloneable_grenad( Ok(reader) } -pub fn merge_readers( - readers: Vec>, - merge_fn: MergeFn, - indexer: GrenadParameters, -) -> Result> { - let mut merger_builder = grenad::MergerBuilder::new(merge_fn); - for reader in readers { - merger_builder.push(reader.into_cursor()?); +pub trait MergeableReader +where + Self: Sized, +{ + type Output; + + fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result; +} + +impl MergeableReader for Vec> { + type Output = grenad::Reader; + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut merger = MergerBuilder::new(merge_fn); + self.into_iter().try_for_each(|r| merger.push(r))?; + merger.finish(params) + } +} + +impl MergeableReader for Vec<(grenad::Reader, grenad::Reader)> { + type Output = (grenad::Reader, grenad::Reader); + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut m1 = MergerBuilder::new(merge_fn); + let mut m2 = MergerBuilder::new(merge_fn); + for (r1, r2) in self.into_iter() { + m1.push(r1)?; + m2.push(r2)?; + } + 
Ok((m1.finish(params)?, m2.finish(params)?)) + } +} + +struct MergerBuilder(grenad::MergerBuilder); + +impl MergerBuilder { + fn new(merge_fn: MergeFn) -> Self { + Self(grenad::MergerBuilder::new(merge_fn)) } - let merger = merger_builder.build(); - let mut writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); - merger.write_into_stream_writer(&mut writer)?; + fn push(&mut self, reader: grenad::Reader) -> Result<()> { + self.0.push(reader.into_cursor()?); + Ok(()) + } - Ok(writer_into_reader(writer)?) + fn finish(self, params: &GrenadParameters) -> Result> { + let merger = self.0.build(); + let mut writer = create_writer( + params.chunk_compression_type, + params.chunk_compression_level, + tempfile::tempfile()?, + ); + merger.write_into_stream_writer(&mut writer)?; + + Ok(writer_into_reader(writer)?) + } } #[derive(Debug, Clone, Copy)] diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 22c1cfd6c..f4940af1d 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_readers, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, - GrenadParameters, + GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, From 0a77be4ec02f29df26242a6ffa7a94ddcb3b0724 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 15:22:57 +0100 Subject: [PATCH 05/28] introduce exact_word_docids db --- milli/src/index.rs | 9 ++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 71 +++++++++++++------ .../extract/extract_word_docids.rs | 12 +++- .../src/update/index_documents/extract/mod.rs | 7 +- .../index_documents/helpers/grenad_helpers.rs | 5 ++ .../src/update/index_documents/helpers/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 18 +++-- .../src/update/index_documents/typed_chunk.rs | 49 +++++++++---- milli/src/update/word_prefix_docids.rs | 5 +- 10 files changed, 133 insertions(+), 47 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f4e17d93c..8f9c9beb7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -59,6 +59,7 @@ pub mod main_key { pub mod db_name { pub const MAIN: &str = "main"; pub const WORD_DOCIDS: &str = "word-docids"; + pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; @@ -83,6 +84,10 @@ pub struct Index { /// A word and all the documents ids containing the word. pub word_docids: Database, + + /// A word and all the documents ids containing the word, from attributes for which typos are not allowed. + pub exact_word_docids: Database, + /// A prefix of word and all the documents ids containing this prefix. 
pub word_prefix_docids: Database, @@ -119,12 +124,13 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(14); + options.max_dbs(15); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; let main = env.create_poly_database(Some(MAIN))?; let word_docids = env.create_database(Some(WORD_DOCIDS))?; + let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; @@ -146,6 +152,7 @@ impl Index { env, main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 644547b91..57c0969c7 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { env: _env, main: _main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, @@ -55,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // Clear the other databases. word_docids.clear(self.wtxn)?; + exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 402cc61dd..46a4721c0 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,7 +2,7 @@ use std::collections::btree_map::Entry; use std::collections::HashMap; use fst::IntoStreamer; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Str}; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -16,7 +16,10 @@ use crate::heed_codec::facet::{ }; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; -use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; +use crate::{ + DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32, + BEU32, +}; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -108,6 +111,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { env: _env, main: _main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, @@ -204,25 +208,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We iterate over the words and delete the documents ids // from the word docids database. for (word, must_remove) in &mut words { - // We create an iterator to be able to get the content and delete the word docids. - // It's faster to acquire a cursor to get and delete or put, as we avoid traversing - // the LMDB B-Tree two times but only once. - let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; - if let Some((key, mut docids)) = iter.next().transpose()? { - if key == word.as_str() { - let previous_len = docids.len(); - docids -= &self.documents_ids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? 
}; - *must_remove = true; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } - } + remove_from_word_docids( + self.wtxn, + word_docids, + word.as_str(), + must_remove, + &self.documents_ids, + )?; + + remove_from_word_docids( + self.wtxn, + exact_word_docids, + word.as_str(), + must_remove, + &self.documents_ids, + )?; } // We construct an FST set that contains the words to delete from the words FST. @@ -457,6 +457,35 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } +fn remove_from_word_docids( + txn: &mut heed::RwTxn, + db: &heed::Database, + word: &str, + must_remove: &mut bool, + to_remove: &RoaringBitmap, +) -> Result<()> { + // We create an iterator to be able to get the content and delete the word docids. + // It's faster to acquire a cursor to get and delete or put, as we avoid traversing + // the LMDB B-Tree two times but only once. + let mut iter = db.prefix_iter_mut(txn, &word)?; + if let Some((key, mut docids)) = iter.next().transpose()? { + if key == word { + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + *must_remove = true; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } + } + } + Ok(()) +} + fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 80d68298a..03bfada21 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -10,17 +10,21 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::update::index_documents::MergeFn; use crate::Result; /// Extracts the word and the documents ids where this word appear. /// /// Returns a grenad reader with the list of extracted words and /// documents ids from the given chunk of docid word positions. 
+/// +/// The first returned reader in the one for normal word_docids, and the second one is for +/// exact_word_docids #[logging_timer::time] pub fn extract_word_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, -) -> Result> { +) -> Result<(grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut word_docids_sorter = create_sorter( @@ -43,5 +47,9 @@ pub fn extract_word_docids( word_docids_sorter.insert(word_bytes, &value_buffer)?; } - sorter_into_reader(word_docids_sorter, indexer) + let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn); + Ok(( + sorter_into_reader(word_docids_sorter, indexer)?, + sorter_into_reader(empty_sorter, indexer)?, + )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 100431237..4e7f211ce 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -86,13 +86,16 @@ pub(crate) fn data_from_obkv_documents( "field-id-wordcount-docids", ); - spawn_extraction_task::<_, _, Vec>>( + spawn_extraction_task::<_, _, Vec<(grenad::Reader, grenad::Reader)>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), extract_word_docids, merge_roaring_bitmaps, - TypedChunk::WordDocids, + |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + }, "word-docids", ); diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index fc28860b2..fb5242910 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -277,3 +277,8 @@ pub fn sorter_into_lmdb_database( debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) } + +/// Used when trying to merge readers, but you don't actually care about the values. 
+pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { + Ok(Cow::Owned(Vec::new())) +} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index f4940af1d..4642bcf14 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,7 +8,7 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, GrenadParameters, MergeableReader, }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2d3004444..633b72cc9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,7 +20,7 @@ pub use self::helpers::{ fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, }; -use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; +use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -282,6 +282,7 @@ where let mut word_pair_proximity_docids = None; let mut word_position_docids = None; let mut word_docids = None; + let mut _exact_word_docids = None; let mut databases_seen = 0; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -291,10 +292,13 @@ where for result in lmdb_writer_rx { let typed_chunk = match result? { - TypedChunk::WordDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; word_docids = Some(cloneable_chunk); - TypedChunk::WordDocids(chunk) + let cloneable_chunk = + unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; + _exact_word_docids = Some(cloneable_chunk); + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } } TypedChunk::WordPairProximityDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; @@ -425,6 +429,10 @@ where }); if let Some(word_docids) = word_docids { + let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); + word_docids_builder.push(word_docids.into_cursor()?); + // TODO: push exact_word_docids + let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; // Run the word prefix docids update operation. 
let mut builder = WordPrefixDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; @@ -432,7 +440,7 @@ where builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; builder.execute( - word_docids, + word_docids_iter, &new_prefix_fst_words, &common_prefix_fst_words, &del_prefix_fst_words, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 77ea31138..be440114f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -3,14 +3,16 @@ use std::convert::TryInto; use std::fs::File; use std::io; +use grenad::MergerBuilder; use heed::types::ByteSlice; use heed::{BytesDecode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ - self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, + self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, }; +use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ @@ -25,7 +27,10 @@ pub(crate) enum TypedChunk { Documents(grenad::Reader), FieldIdWordcountDocids(grenad::Reader), NewDocumentsIds(RoaringBitmap), - WordDocids(grenad::Reader), + WordDocids { + word_docids_reader: grenad::Reader, + exact_word_docids_reader: grenad::Reader, + }, WordPositionDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), @@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index( TypedChunk::NewDocumentsIds(documents_ids) => { return Ok((documents_ids, is_merged_database)) } - TypedChunk::WordDocids(word_docids_iter) => { - let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?; + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; append_entries_into_database( word_docids_iter.clone(), &index.word_docids, @@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index( merge_roaring_bitmaps, )?; + let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; + append_entries_into_database( + exact_word_docids_iter.clone(), + &index.exact_word_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_roaring_bitmaps, + )?; + // create fst from word docids - let mut builder = fst::SetBuilder::memory(); - let mut cursor = word_docids_iter.into_cursor()?; - while let Some((word, _value)) = cursor.move_on_next()? { - // This is a lexicographically ordered word position - // we use the key to construct the words fst. 
- builder.insert(word)?; - } - let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?; + let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?; let db_fst = index.words_fst(wtxn)?; // merge new fst with database fst @@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index( Ok((RoaringBitmap::new(), is_merged_database)) } +fn merge_word_docids_reader_into_fst( + word_docids_iter: grenad::Reader>, + exact_word_docids_iter: grenad::Reader>, +) -> Result>> { + let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn); + merger_builder.push(word_docids_iter.into_cursor()?); + merger_builder.push(exact_word_docids_iter.into_cursor()?); + let mut iter = merger_builder.build().into_stream_merger_iter()?; + let mut builder = fst::SetBuilder::memory(); + + while let Some((k, _)) = iter.next()? { + builder.insert(k)?; + } + + Ok(builder.into_set()) +} + fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> { let new_value = RoaringBitmap::deserialize_from(new_value)?; let db_value = RoaringBitmap::deserialize_from(db_value)?; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 076816f09..4114f8baf 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -35,7 +35,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixDocids::{}")] pub fn execute( self, - new_word_docids: grenad::Reader, + mut new_word_docids_iter: grenad::MergerIter, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -51,10 +51,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { ); if !common_prefix_fst_words.is_empty() { - let mut new_word_docids_iter = new_word_docids.into_cursor()?; let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.move_on_next()? { + while let Some((word, data)) = new_word_docids_iter.next()? 
{ current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { From 5451c64d5d84ecbc154dc7708ad1c72c62336f6e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 15:55:29 +0100 Subject: [PATCH 06/28] increase criteria asc desc test map size --- milli/tests/search/query_criteria.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index ef080db9f..786fdbcae 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -373,7 +373,7 @@ fn criteria_mixup() { fn criteria_ascdesc() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB + options.map_size(12 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); From 8d46a5b0b5d86b85a4c865a72522b915d540ccce Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 17:00:29 +0100 Subject: [PATCH 07/28] extract exact word docids --- milli/src/index.rs | 5 +++ milli/src/lib.rs | 4 ++ .../extract/extract_word_docids.rs | 43 ++++++++++++++++--- .../src/update/index_documents/extract/mod.rs | 3 +- milli/src/update/index_documents/mod.rs | 2 + 5 files changed, 50 insertions(+), 7 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 8f9c9beb7..3d6d954f0 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -964,6 +964,11 @@ impl Index { .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? .unwrap_or_default()) } + pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result> { + let attrs = self.exact_attributes(txn)?; + let fid_map = self.fields_ids_map(txn)?; + Ok(attrs.iter().filter_map(|attr| fid_map.id(attr)).collect()) + } pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ba2bd9b0f..b68c76048 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -74,6 +74,10 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi (field_id as u32) << 16 | (relative as u32) } +pub fn field_id_from_position(position: u32) -> FieldId { + (position >> 16 & 0xffff) as u16 +} + /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 03bfada21..5f231e5aa 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::fs::File; use std::io; use std::iter::FromIterator; @@ -10,8 +11,8 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::update::index_documents::MergeFn; -use crate::Result; +use crate::update::index_documents::helpers::read_u32_ne_bytes; +use crate::{field_id_from_position, FieldId, Result}; /// Extracts the word and the documents ids where this word appear. 
/// @@ -24,6 +25,7 @@ use crate::Result; pub fn extract_word_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, + exact_attributes: &HashSet, ) -> Result<(grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); @@ -35,21 +37,50 @@ pub fn extract_word_docids( max_memory, ); + let mut exact_word_docids_sorter = create_sorter( + merge_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + let mut value_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; - while let Some((key, _value)) = cursor.move_on_next()? { + while let Some((key, positions)) = cursor.move_on_next()? { let (document_id_bytes, word_bytes) = try_split_array_at(key) .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let bitmap = RoaringBitmap::from_iter(Some(document_id)); serialize_roaring_bitmap(&bitmap, &mut value_buffer)?; - word_docids_sorter.insert(word_bytes, &value_buffer)?; + + // If there are no exact attributes, we do not need to iterate over positions. + if exact_attributes.is_empty() { + word_docids_sorter.insert(word_bytes, &value_buffer)?; + } else { + let mut added_to_exact = false; + let mut added_to_word_docids = false; + for position in read_u32_ne_bytes(positions) { + // as soon as we know that this word had been to both readers, we don't need to + // iterate over the positions. + if added_to_exact && added_to_word_docids { + break; + } + let fid = field_id_from_position(position); + if exact_attributes.contains(&fid) && !added_to_exact { + exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; + added_to_exact = true; + } else if !added_to_word_docids { + word_docids_sorter.insert(word_bytes, &value_buffer)?; + added_to_word_docids = true; + } + } + } } - let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn); Ok(( sorter_into_reader(word_docids_sorter, indexer)?, - sorter_into_reader(empty_sorter, indexer)?, + sorter_into_reader(exact_word_docids_sorter, indexer)?, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 4e7f211ce..8f6797a3b 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -43,6 +43,7 @@ pub(crate) fn data_from_obkv_documents( geo_field_id: Option, stop_words: Option>, max_positions_per_attributes: Option, + exact_attributes: HashSet, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() @@ -90,7 +91,7 @@ pub(crate) fn data_from_obkv_documents( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), - extract_word_docids, + move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), merge_roaring_bitmaps, |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { word_docids_reader, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 633b72cc9..c490e93da 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -226,6 +226,7 @@ where }; let stop_words = self.index.stop_words(self.wtxn)?; + let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; // Run extraction pipeline in parallel. 
pool.install(|| { @@ -255,6 +256,7 @@ where geo_field_id, stop_words, self.indexer_config.max_positions_per_attributes, + exact_attributes, ) }); From c4c6e3535290c88016e6a74f0f015563432e7fc9 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 19:25:11 +0100 Subject: [PATCH 08/28] query exact_word_docids in resolve_query_tree --- milli/src/search/criteria/mod.rs | 20 ++++++++++++++++++-- milli/src/update/index_documents/mod.rs | 11 ++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1dbfd2524..df9189239 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -68,6 +68,7 @@ impl Default for Candidates { pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; + fn exact_word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; fn word_pair_proximity_docids( &self, @@ -118,6 +119,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.word_docids.get(self.rtxn, &word) } + fn exact_word_docids(&self, word: &str) -> heed::Result> { + self.index.exact_word_docids.get(self.rtxn, &word) + } + fn word_prefix_docids(&self, word: &str) -> heed::Result> { self.index.word_prefix_docids.get(self.rtxn, &word) } @@ -400,11 +405,14 @@ fn query_docids( let mut docids = RoaringBitmap::new(); for (word, _typo) in words { let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); - docids |= current_docids; + let exact_current_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); + docids |= current_docids | exact_current_docids; } Ok(docids) } else { - Ok(ctx.word_docids(&word)?.unwrap_or_default()) + let word_docids = ctx.word_docids(&word)?.unwrap_or_default(); + let exact_word_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); + Ok(word_docids | exact_word_docids) } } QueryKind::Tolerant { typo, word } => { @@ -512,6 +520,7 @@ pub mod test { pub struct TestContext<'t> { words_fst: fst::Set>, word_docids: HashMap, + exact_word_docids: HashMap, word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, @@ -527,6 +536,10 @@ pub mod test { Ok(self.word_docids.get(&word.to_string()).cloned()) } + fn exact_word_docids(&self, word: &str) -> heed::Result> { + Ok(self.exact_word_docids.get(&word.to_string()).cloned()) + } + fn word_prefix_docids(&self, word: &str) -> heed::Result> { Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) } @@ -643,6 +656,8 @@ pub mod test { s("morning") => random_postings(rng, 125), }; + let exact_word_docids = HashMap::new(); + let mut docid_words = HashMap::new(); for (word, docids) in word_docids.iter() { for docid in docids { @@ -712,6 +727,7 @@ pub mod test { TestContext { words_fst, word_docids, + exact_word_docids, word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c490e93da..54d30f8fb 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -284,7 +284,7 @@ where let mut word_pair_proximity_docids = None; let mut word_position_docids = None; let mut word_docids = None; - let mut _exact_word_docids = None; + let mut exact_word_docids = None; let mut databases_seen = 0; 
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -299,7 +299,7 @@ where word_docids = Some(cloneable_chunk); let cloneable_chunk = unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; - _exact_word_docids = Some(cloneable_chunk); + exact_word_docids = Some(cloneable_chunk); TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } } TypedChunk::WordPairProximityDocids(chunk) => { @@ -352,6 +352,7 @@ where self.execute_prefix_databases( word_docids, + exact_word_docids, word_pair_proximity_docids, word_position_docids, )?; @@ -363,6 +364,7 @@ where pub fn execute_prefix_databases( self, word_docids: Option>, + exact_word_docids: Option>, word_pair_proximity_docids: Option>, word_position_docids: Option>, ) -> Result<()> @@ -433,7 +435,10 @@ where if let Some(word_docids) = word_docids { let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); word_docids_builder.push(word_docids.into_cursor()?); - // TODO: push exact_word_docids + if let Some(exact_word_docids) = exact_word_docids { + word_docids_builder.push(exact_word_docids.into_cursor()?); + } + let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; // Run the word prefix docids update operation. let mut builder = WordPrefixDocids::new(self.wtxn, self.index); From ba0bb29cd8a1b748b325c2854ce9ea6daaf127a1 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 10:20:39 +0100 Subject: [PATCH 09/28] refactor WordPrefixDocids to take dbs instead of indexes --- milli/src/update/index_documents/mod.rs | 6 +++++- milli/src/update/word_prefix_docids.rs | 20 ++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 54d30f8fb..91d108c72 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -441,7 +441,11 @@ where let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; // Run the word prefix docids update operation. 
- let mut builder = WordPrefixDocids::new(self.wtxn, self.index); + let mut builder = WordPrefixDocids::new( + self.wtxn, + self.index.word_docids.clone(), + self.index.word_prefix_docids.clone(), + ); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; builder.chunk_compression_level = self.indexer_config.chunk_compression_level; builder.max_nb_chunks = self.indexer_config.max_nb_chunks; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 4114f8baf..b166812a5 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -1,16 +1,18 @@ use std::collections::{HashMap, HashSet}; use grenad::CompressionType; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Str}; +use heed::Database; use crate::update::index_documents::{ create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, MergeFn, }; -use crate::{Index, Result}; +use crate::{Result, RoaringBitmapCodec}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, + word_docids: Database, + word_prefix_docids: Database, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) max_nb_chunks: Option, @@ -20,11 +22,13 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, + word_docids: Database, + word_prefixes_docids: Database, ) -> WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, - index, + word_docids, + word_prefix_docids: word_prefixes_docids, chunk_compression_type: CompressionType::None, chunk_compression_level: None, max_nb_chunks: None, @@ -83,7 +87,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } // We fetch the docids associated to the newly added word prefix fst only. - let db = self.index.word_docids.remap_data_type::(); + let db = self.word_docids.remap_data_type::(); for prefix in new_prefix_fst_words { let prefix = std::str::from_utf8(prefix.as_bytes())?; for result in db.prefix_iter(self.wtxn, prefix)? { @@ -93,7 +97,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } // We remove all the entries that are no more required in this word prefix docids database. - let mut iter = self.index.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + let mut iter = self.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data(); while let Some((prefix, _)) = iter.next().transpose()? { if del_prefix_fst_words.contains(prefix.as_bytes()) { unsafe { iter.del_current()? }; @@ -105,7 +109,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // We finally write the word prefix docids into the LMDB database. 
sorter_into_lmdb_database( self.wtxn, - *self.index.word_prefix_docids.as_polymorph(), + *self.word_prefix_docids.as_polymorph(), prefix_docids_sorter, merge_roaring_bitmaps, )?; From 6dd2e4ffbd97bac64d0d3a7a5c39a51b0a5639a5 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 10:49:34 +0100 Subject: [PATCH 10/28] introduce exact_word_prefix database in index --- milli/src/index.rs | 8 +++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 67 ++++++++++++++++++---------- 3 files changed, 53 insertions(+), 24 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 3d6d954f0..80f62f684 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -61,6 +61,7 @@ pub mod db_name { pub const WORD_DOCIDS: &str = "word-docids"; pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; + pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; @@ -91,6 +92,9 @@ pub struct Index { /// A prefix of word and all the documents ids containing this prefix. pub word_prefix_docids: Database, + /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. + pub exact_word_prefix_docids: Database, + /// Maps a word and a document id (u32) to all the positions where the given word appears. pub docid_word_positions: Database, @@ -124,7 +128,7 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(15); + options.max_dbs(16); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -132,6 +136,7 @@ impl Index { let word_docids = env.create_database(Some(WORD_DOCIDS))?; let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; + let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let word_prefix_pair_proximity_docids = @@ -154,6 +159,7 @@ impl Index { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 57c0969c7..3665d2313 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -21,6 +21,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, @@ -58,6 +59,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; + exact_word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 46a4721c0..58c4d4f70 100644 --- 
a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::{BytesDecode, BytesEncode}; +use heed::{BytesDecode, BytesEncode, Database}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -113,6 +113,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, field_id_word_count_docids, @@ -254,34 +255,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We write the new words FST into the main database. self.index.put_words_fst(self.wtxn, &new_words_fst)?; - // We iterate over the word prefix docids database and remove the deleted documents ids - // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. - let mut prefixes_to_delete = fst::SetBuilder::memory(); - let mut iter = word_prefix_docids.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (prefix, mut docids) = result?; - let prefix = prefix.to_owned(); - let previous_len = docids.len(); - docids -= &self.documents_ids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - prefixes_to_delete.insert(prefix)?; - } else if docids.len() != previous_len { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&prefix, &docids)? }; - } - } + let prefixes_to_delete = + remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?; - drop(iter); + let exact_prefix_to_delete = remove_from_word_prefix_docids( + self.wtxn, + exact_word_prefix_docids, + &self.documents_ids, + )?; + + let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); // We compute the new prefix FST and write it only if there is a change. - let prefixes_to_delete = prefixes_to_delete.into_set(); - if !prefixes_to_delete.is_empty() { + if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() { let new_words_prefixes_fst = { // We retrieve the current words prefixes FST from the database. let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?; - let difference = words_prefixes_fst.op().add(&prefixes_to_delete).difference(); + let difference = + words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference(); // We stream the new external ids that does no more contains the to-delete external ids. let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory(); @@ -457,6 +448,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } +fn remove_from_word_prefix_docids( + txn: &mut heed::RwTxn, + db: &Database, + to_remove: &RoaringBitmap, +) -> Result>> { + let mut prefixes_to_delete = fst::SetBuilder::memory(); + + // We iterate over the word prefix docids database and remove the deleted documents ids + // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. + let mut iter = db.iter_mut(txn)?; + while let Some(result) = iter.next() { + let (prefix, mut docids) = result?; + let prefix = prefix.to_owned(); + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? 
}; + prefixes_to_delete.insert(prefix)?; + } else if docids.len() != previous_len { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&prefix, &docids)? }; + } + } + + drop(iter); + + Ok(prefixes_to_delete.into_set()) +} + fn remove_from_word_docids( txn: &mut heed::RwTxn, db: &heed::Database, From e8f06f6c0606b130b2e398246bb55ceeb51602b3 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 16:17:55 +0100 Subject: [PATCH 11/28] extract exact_word_prefix_docids --- milli/src/update/index_documents/mod.rs | 66 ++++++++++++++++++------- milli/src/update/word_prefix_docids.rs | 8 +-- 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 91d108c72..0e6e59e10 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -4,11 +4,13 @@ mod transform; mod typed_chunk; use std::collections::HashSet; -use std::io::{Read, Seek}; +use std::io::{Cursor, Read, Seek}; use std::iter::FromIterator; use std::num::{NonZeroU32, NonZeroUsize}; use crossbeam_channel::{Receiver, Sender}; +use heed::types::Str; +use heed::Database; use log::debug; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -20,7 +22,7 @@ pub use self::helpers::{ fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, }; -use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters}; +use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -28,7 +30,7 @@ use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; -use crate::{Index, Result}; +use crate::{Index, Result, RoaringBitmapCodec}; static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 5; @@ -433,25 +435,25 @@ where }); if let Some(word_docids) = word_docids { - let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); - word_docids_builder.push(word_docids.into_cursor()?); - if let Some(exact_word_docids) = exact_word_docids { - word_docids_builder.push(exact_word_docids.into_cursor()?); - } - - let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; - // Run the word prefix docids update operation. 
- let mut builder = WordPrefixDocids::new( + execute_word_prefix_docids( self.wtxn, + word_docids, self.index.word_docids.clone(), self.index.word_prefix_docids.clone(), - ); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - builder.max_nb_chunks = self.indexer_config.max_nb_chunks; - builder.max_memory = self.indexer_config.max_memory; - builder.execute( - word_docids_iter, + &self.indexer_config, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } + + if let Some(exact_word_docids) = exact_word_docids { + execute_word_prefix_docids( + self.wtxn, + exact_word_docids, + self.index.exact_word_docids.clone(), + self.index.exact_word_prefix_docids.clone(), + &self.indexer_config, &new_prefix_fst_words, &common_prefix_fst_words, &del_prefix_fst_words, @@ -516,6 +518,32 @@ where } } +/// Run the word prefix docids update operation. +fn execute_word_prefix_docids( + txn: &mut heed::RwTxn, + reader: grenad::Reader>, + word_docids_db: Database, + word_prefix_docids_db: Database, + indexer_config: &IndexerConfig, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, +) -> Result<()> { + let cursor = reader.into_cursor()?; + let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db); + builder.chunk_compression_type = indexer_config.chunk_compression_type; + builder.chunk_compression_level = indexer_config.chunk_compression_level; + builder.max_nb_chunks = indexer_config.max_nb_chunks; + builder.max_memory = indexer_config.max_memory; + builder.execute( + cursor, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + Ok(()) +} + #[cfg(test)] mod tests { use std::io::Cursor; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index b166812a5..2887b5583 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -23,12 +23,12 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, word_docids: Database, - word_prefixes_docids: Database, + word_prefix_docids: Database, ) -> WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, word_docids, - word_prefix_docids: word_prefixes_docids, + word_prefix_docids, chunk_compression_type: CompressionType::None, chunk_compression_level: None, max_nb_chunks: None, @@ -39,7 +39,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixDocids::{}")] pub fn execute( self, - mut new_word_docids_iter: grenad::MergerIter, + mut new_word_docids_iter: grenad::ReaderCursor, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -57,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { if !common_prefix_fst_words.is_empty() { let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.next()? { + while let Some((word, data)) = new_word_docids_iter.move_on_next()? 
{ current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { From 21ae4143b177389dde584411107f6559a5fbe4aa Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 16:27:48 +0100 Subject: [PATCH 12/28] add exact_word_prefix to Context --- milli/src/search/criteria/mod.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index df9189239..3daa258bf 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -70,6 +70,7 @@ pub trait Context<'c> { fn word_docids(&self, word: &str) -> heed::Result>; fn exact_word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result>; fn word_pair_proximity_docids( &self, left: &str, @@ -127,6 +128,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.word_prefix_docids.get(self.rtxn, &word) } + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { + self.index.exact_word_prefix_docids.get(self.rtxn, &word) + } + fn word_pair_proximity_docids( &self, left: &str, @@ -522,6 +527,7 @@ pub mod test { word_docids: HashMap, exact_word_docids: HashMap, word_prefix_docids: HashMap, + exact_word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, docid_words: HashMap>, @@ -544,6 +550,10 @@ pub mod test { Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) } + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { + Ok(self.exact_word_prefix_docids.get(&word.to_string()).cloned()) + } + fn word_pair_proximity_docids( &self, left: &str, @@ -672,6 +682,8 @@ pub mod test { s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], }; + let exact_word_prefix_docids = HashMap::new(); + let mut word_pair_proximity_docids = HashMap::new(); let mut word_prefix_pair_proximity_docids = HashMap::new(); for (lword, lcandidates) in &word_docids { @@ -729,6 +741,7 @@ pub mod test { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, docid_words, From 56b4f5dce2a32505e6e25b973880b7d682e4d4be Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 16:30:18 +0100 Subject: [PATCH 13/28] add exact prefix to query_docids --- milli/src/search/criteria/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 3daa258bf..6ac076ea4 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -404,7 +404,9 @@ fn query_docids( match &query.kind { QueryKind::Exact { word, .. 
} => { if query.prefix && ctx.in_prefix_cache(&word) { - Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default()) + let doc_ids = ctx.word_prefix_docids(&word)?.unwrap_or_default(); + let exact_docids = ctx.exact_word_prefix_docids(&word)?.unwrap_or_default(); + Ok(doc_ids | exact_docids) } else if query.prefix { let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); From 6b2c2509b2e5bfcd5f522a3129f2c8c42bed2c07 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 30 Mar 2022 16:07:59 +0200 Subject: [PATCH 14/28] fix bug in exact search --- milli/src/index.rs | 1 + milli/src/search/criteria/mod.rs | 35 ++++++++++++------- .../extract/extract_word_docids.rs | 2 ++ 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 80f62f684..c7441c590 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -970,6 +970,7 @@ impl Index { .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? .unwrap_or_default()) } + pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result> { let attrs = self.exact_attributes(txn)?; let fid_map = self.fields_ids_map(txn)?; diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 6ac076ea4..05305d724 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -402,31 +402,42 @@ fn query_docids( wdcache: &mut WordDerivationsCache, ) -> Result { match &query.kind { - QueryKind::Exact { word, .. } => { + QueryKind::Exact { word, original_typo } => { if query.prefix && ctx.in_prefix_cache(&word) { - let doc_ids = ctx.word_prefix_docids(&word)?.unwrap_or_default(); - let exact_docids = ctx.exact_word_prefix_docids(&word)?.unwrap_or_default(); - Ok(doc_ids | exact_docids) + let mut docids = ctx.word_prefix_docids(&word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_prefix_docids(&word)?.unwrap_or_default(); + } + Ok(docids) } else if query.prefix { let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { - let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); - let exact_current_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); - docids |= current_docids | exact_current_docids; + docids |= ctx.word_docids(&word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_docids(&word)?.unwrap_or_default(); + } } Ok(docids) } else { - let word_docids = ctx.word_docids(&word)?.unwrap_or_default(); - let exact_word_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); - Ok(word_docids | exact_word_docids) + let mut docids = ctx.word_docids(&word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_docids(&word)?.unwrap_or_default(); + } + Ok(docids) } } QueryKind::Tolerant { typo, word } => { let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + for (word, typo) in words { + let mut current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + if *typo == 0 { + current_docids |= ctx.exact_word_docids(&word)?.unwrap_or_default() + } docids |= current_docids; } Ok(docids) 
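PATCH 14/28 above changes query_docids so that postings coming from the exact-word databases are merged into the result only when the matched word was not derived through typo tolerance (zero typos). As a reading aid, here is a minimal, self-contained Rust sketch of that rule; Ctx, DocidSet and docids_for are illustrative stand-ins, not milli's actual Context trait or its RoaringBitmap-backed LMDB databases.

use std::collections::{BTreeSet, HashMap};

type DocidSet = BTreeSet<u32>;

struct Ctx {
    // Stand-ins for the word_docids and exact_word_docids databases.
    word_docids: HashMap<String, DocidSet>,
    exact_word_docids: HashMap<String, DocidSet>,
}

impl Ctx {
    // `typos` is the number of typos that were needed to derive `word` from the query term.
    fn docids_for(&self, word: &str, typos: u8) -> DocidSet {
        let mut docids = self.word_docids.get(word).cloned().unwrap_or_default();
        // Exact-attribute postings only count for words that were not derived.
        if typos == 0 {
            if let Some(exact) = self.exact_word_docids.get(word) {
                docids.extend(exact.iter().copied());
            }
        }
        docids
    }
}

fn main() {
    let ctx = Ctx {
        word_docids: HashMap::from([("antebellum".to_string(), DocidSet::from([1]))]),
        exact_word_docids: HashMap::from([("antebellum".to_string(), DocidSet::from([2]))]),
    };
    // The word matched exactly: both databases contribute.
    assert_eq!(ctx.docids_for("antebellum", 0), DocidSet::from([1, 2]));
    // The word was reached through a typo: the exact-word database is ignored.
    assert_eq!(ctx.docids_for("antebellum", 1), DocidSet::from([1]));
}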
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 5f231e5aa..fbc9f6919 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -69,9 +69,11 @@ pub fn extract_word_docids( } let fid = field_id_from_position(position); if exact_attributes.contains(&fid) && !added_to_exact { + println!("is exact: {}", std::str::from_utf8(&word_bytes).unwrap()); exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_exact = true; } else if !added_to_word_docids { + println!("isnt exact: {}", std::str::from_utf8(&word_bytes).unwrap()); word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_word_docids = true; } From bfd81ce050c6f0723f7322300958a0834529bcf6 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 30 Mar 2022 16:08:20 +0200 Subject: [PATCH 15/28] add exact attributes to cli settings --- cli/src/main.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 503b02887..6523cef2e 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,4 +1,4 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashSet}; use std::fs::File; use std::io::{stdin, BufRead, BufReader, Cursor, Read, Write}; use std::path::PathBuf; @@ -99,8 +99,10 @@ impl Settings { }) .collect(); + let exact_attributes = index.exact_attributes(&txn)?; + println!( - "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\n", + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}", displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), filterable_attributes.join("\n\t"), @@ -109,6 +111,7 @@ impl Settings { stop_words.join("\n\t"), distinct_field.unwrap_or_default(), synonyms.into_iter().map(|(k, v)| format!("\n\t{}:\n{:?}", k, v)).collect::(), + exact_attributes.join("\n\t"), ); Ok(()) } @@ -463,6 +466,8 @@ struct SettingsUpdate { filterable_attributes: Option>, #[structopt(long)] criteria: Option>, + #[structopt(long)] + exact_attributes: Option>, } impl Performer for SettingsUpdate { @@ -489,6 +494,14 @@ impl Performer for SettingsUpdate { } } + if let Some(exact_attributes) = self.exact_attributes { + if !exact_attributes.is_empty() { + update.set_exact_attributes(exact_attributes.into_iter().collect()); + } else { + update.reset_exact_attributes(); + } + } + let mut bars = Vec::new(); let progesses = MultiProgress::new(); for _ in 0..4 { From c8d3a09af83d7f1cdfab65d45ac6173dfa1b31d3 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 14:47:07 +0200 Subject: [PATCH 16/28] add integration test for disabling typos on attributes --- milli/tests/search/typo_tolerance.rs | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index df15fb768..92d57c9b9 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -170,3 +170,40 @@ fn test_typo_disabled_on_word() { let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1); } + +#[test] +fn test_disable_typo_on_attribute() { + let criteria = [Typo]; + let index = super::setup_search_index_with_criteria(&criteria); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("antebelum"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); + builder.execute(|_| ()).unwrap(); + + // typo is now supported for 4 letters words + let mut search = Search::new(&txn, &index); + search.query("antebelum"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); +} From 9963f11172a06fa79ed06c3baf8cb4ae727c743b Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 20:38:45 +0200 Subject: [PATCH 17/28] fix infos crate compilation issue --- cli/src/main.rs | 2 +- infos/src/main.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 6523cef2e..cf1e85984 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashSet}; +use std::collections::BTreeMap; use std::fs::File; use std::io::{stdin, BufRead, BufReader, Cursor, Read, Write}; use std::path::PathBuf; diff --git a/infos/src/main.rs b/infos/src/main.rs index dc98d410d..6a270833b 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -387,6 +387,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho field_id_docid_facet_f64s: _, field_id_docid_facet_strings: _, documents, + .. } = index; let main_name = "main"; @@ -968,6 +969,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, + .. } = index; let names = if names.is_empty() { From 6cabd47c32bcf2ba53a3ebe94f254a7fe63de520 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 20:59:20 +0200 Subject: [PATCH 18/28] fix typo in comment --- milli/src/update/index_documents/extract/extract_word_docids.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index fbc9f6919..b577ef567 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -19,7 +19,7 @@ use crate::{field_id_from_position, FieldId, Result}; /// Returns a grenad reader with the list of extracted words and /// documents ids from the given chunk of docid word positions. 
/// -/// The first returned reader in the one for normal word_docids, and the second one is for +/// The first returned reader is the one for normal word_docids, and the second one is for /// exact_word_docids #[logging_timer::time] pub fn extract_word_docids( From b7694c34f53da8f3253236aff1a5b4a24503bf3c Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 21:00:07 +0200 Subject: [PATCH 19/28] remove println --- milli/src/update/index_documents/extract/extract_word_docids.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index b577ef567..5083bbd90 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -69,11 +69,9 @@ pub fn extract_word_docids( } let fid = field_id_from_position(position); if exact_attributes.contains(&fid) && !added_to_exact { - println!("is exact: {}", std::str::from_utf8(&word_bytes).unwrap()); exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_exact = true; } else if !added_to_word_docids { - println!("isnt exact: {}", std::str::from_utf8(&word_bytes).unwrap()); word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_word_docids = true; } From 1810927dbd5f23b85c7e6d9c01d4e68907e84a3f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 21:04:49 +0200 Subject: [PATCH 20/28] rephrase exact_attributes doc --- milli/src/update/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 3ed2a4152..7a26361d4 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -93,7 +93,7 @@ pub struct Settings<'a, 't, 'u, 'i> { min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, exact_words: Setting>, - /// attributes on which typo tolerance is not enabled. + /// Attributes on which typo tolerance is disabled. 
exact_attributes: Setting>, } From 59e41d98e303205fbb38b467d947c853d15f9ca8 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 21:17:06 +0200 Subject: [PATCH 21/28] add comments to integration test --- milli/tests/search/typo_tolerance.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 92d57c9b9..35cc4b4c2 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -181,6 +181,7 @@ fn test_disable_typo_on_attribute() { let txn = index.read_txn().unwrap(); let mut search = Search::new(&txn, &index); + // typo in `antebel(l)um` search.query("antebelum"); search.limit(10); search.authorize_typos(true); @@ -194,10 +195,10 @@ fn test_disable_typo_on_attribute() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); + // disable typos on `description` builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); builder.execute(|_| ()).unwrap(); - // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); search.query("antebelum"); search.limit(10); From ab185a59b5a969f76013670cb61c6892e435f32d Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 09:46:56 +0200 Subject: [PATCH 22/28] fix infos --- infos/src/main.rs | 26 ++++++++++++++++++++++++-- milli/src/update/delete_documents.rs | 2 -- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 6a270833b..05c168233 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -29,6 +29,8 @@ const ALL_DATABASE_NAMES: &[&str] = &[ FACET_ID_STRING_DOCIDS, FIELD_ID_DOCID_FACET_F64S, FIELD_ID_DOCID_FACET_STRINGS, + EXACT_WORD_DOCIDS, + EXACT_WORD_PREFIX_DOCIDS, DOCUMENTS, ]; @@ -384,10 +386,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, + exact_word_docids, + exact_word_prefix_docids, field_id_docid_facet_f64s: _, field_id_docid_facet_strings: _, documents, - .. } = index; let main_name = "main"; @@ -437,6 +440,14 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho } } + for result in exact_word_docids.remap_data_type::().iter(rtxn)? { + let (word, value) = result?; + heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); + if heap.len() > limit { + heap.pop(); + } + } + for result in word_prefix_docids.remap_data_type::().iter(rtxn)? { let (word, value) = result?; heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); @@ -445,6 +456,14 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho } } + for result in exact_word_prefix_docids.remap_data_type::().iter(rtxn)? { + let (word, value) = result?; + heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); + if heap.len() > limit { + heap.pop(); + } + } + for result in docid_word_positions.remap_data_type::().iter(rtxn)? { let ((docid, word), value) = result?; let key = format!("{} {}", docid, word); @@ -968,8 +987,9 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a facet_id_string_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, + exact_word_prefix_docids, + exact_word_docids, documents, - .. 
} = index; let names = if names.is_empty() { @@ -993,6 +1013,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(), FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(), FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(), + EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(), + EXACT_WORD_PREFIX_DOCIDS => exact_word_prefix_docids.as_polymorph(), DOCUMENTS => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 58c4d4f70..b347aae38 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -473,8 +473,6 @@ fn remove_from_word_prefix_docids( } } - drop(iter); - Ok(prefixes_to_delete.into_set()) } From dac81b2d44e479a838f20cc9bc14e37efa430d7f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 09:48:56 +0200 Subject: [PATCH 23/28] add missing \n in cli settings --- cli/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index cf1e85984..202c67707 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -102,7 +102,7 @@ impl Settings { let exact_attributes = index.exact_attributes(&txn)?; println!( - "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}", + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}\n", displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), filterable_attributes.join("\n\t"), From b85cd4983ea01b062ce5e3a2c79a8a3a06f7b0ed Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 09:50:34 +0200 Subject: [PATCH 24/28] remove field_id_from_position --- milli/src/lib.rs | 4 ---- .../src/update/index_documents/extract/extract_word_docids.rs | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index b68c76048..ba2bd9b0f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -74,10 +74,6 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi (field_id as u32) << 16 | (relative as u32) } -pub fn field_id_from_position(position: u32) -> FieldId { - (position >> 16 & 0xffff) as u16 -} - /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 5083bbd90..0f8b4c039 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -12,7 +12,7 @@ use super::helpers::{ use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::index_documents::helpers::read_u32_ne_bytes; -use crate::{field_id_from_position, FieldId, Result}; +use crate::{relative_from_absolute_position, FieldId, Result}; /// Extracts the word and the documents ids where this word appear. 
/// @@ -67,7 +67,7 @@ pub fn extract_word_docids( if added_to_exact && added_to_word_docids { break; } - let fid = field_id_from_position(position); + let (fid, _) = relative_from_absolute_position(position); if exact_attributes.contains(&fid) && !added_to_exact { exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_exact = true; From 5cfd3d8407bd2bc11f6771385436681726ea8e12 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 14:10:22 +0200 Subject: [PATCH 25/28] add exact attributes documentation --- milli/src/index.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index c7441c590..42170bc80 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -964,6 +964,7 @@ impl Index { Ok(()) } + /// Returns the exact attributes: attributes for which typo is disallowed. pub fn exact_attributes<'t>(&self, txn: &'t RoTxn) -> Result> { Ok(self .main @@ -971,17 +972,20 @@ impl Index { .unwrap_or_default()) } + /// Returns the list of exact attributes field ids. pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result> { let attrs = self.exact_attributes(txn)?; let fid_map = self.fields_ids_map(txn)?; Ok(attrs.iter().filter_map(|attr| fid_map.id(attr)).collect()) } + /// Writes the exact attributes to the database. pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; Ok(()) } + /// Clears the exact attributes from the store. pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result<()> { self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?; Ok(()) From 201fea0fdaae3a334936a8ad52e2c5de8f178a84 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 14:14:15 +0200 Subject: [PATCH 26/28] limit extract_word_docids memory usage --- milli/src/update/delete_documents.rs | 1 + .../src/update/index_documents/extract/extract_word_docids.rs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index b347aae38..77c32f0fb 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -502,6 +502,7 @@ fn remove_from_word_docids( } } } + Ok(()) } diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 0f8b4c039..f3a44162b 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -34,7 +34,7 @@ pub fn extract_word_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory, + max_memory.map(|x| x / 2), ); let mut exact_word_docids_sorter = create_sorter( @@ -42,7 +42,7 @@ pub fn extract_word_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory, + max_memory.map(|x| x / 2), ); let mut value_buffer = Vec::new(); From b799f3326b982e382f8f1b7a809f1abe1521c008 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 18:44:35 +0200 Subject: [PATCH 27/28] rename merge_nothing to merge_ignore_values --- milli/src/update/index_documents/helpers/grenad_helpers.rs | 2 +- milli/src/update/index_documents/helpers/mod.rs | 6 +++--- milli/src/update/index_documents/typed_chunk.rs | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index fb5242910..9d5a67d78 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -279,6 +279,6 @@ pub fn sorter_into_lmdb_database( } /// Used when trying to merge readers, but you don't actually care about the values. -pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { +pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { Ok(Cow::Owned(Vec::new())) } diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 4642bcf14..79d0d0466 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing, - sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, - GrenadParameters, MergeableReader, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, + merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, + writer_into_reader, GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index be440114f..26b97c3a0 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -9,8 +9,8 @@ use heed::{BytesDecode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ - self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, - CursorClonableMmap, + self, merge_ignore_values, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, + valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; @@ -226,7 +226,7 @@ fn merge_word_docids_reader_into_fst( word_docids_iter: grenad::Reader>, exact_word_docids_iter: grenad::Reader>, ) -> Result>> { - let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn); + let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn); merger_builder.push(word_docids_iter.into_cursor()?); merger_builder.push(exact_word_docids_iter.into_cursor()?); let mut iter = merger_builder.build().into_stream_merger_iter()?; From 86249e2ae43e5a2e9bbdc747435fc6938ce2abc5 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 21:35:06 +0200 Subject: [PATCH 28/28] add missing \t in cli update display MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- cli/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 202c67707..542b9d472 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -102,7 +102,7 @@ impl Settings { let exact_attributes = index.exact_attributes(&txn)?; println!( - "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable 
attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}\n", + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n\t{}\n", displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), filterable_attributes.join("\n\t"),
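For completeness, here is a hypothetical end-to-end usage sketch of the setting introduced by this series, modeled on the integration test added in PATCH 16/28. The helper name disable_typos_on_description and the exact import paths are assumptions made for illustration; the sketch presumes an already-built and populated milli::Index whose documents contain a `description` field.

use std::collections::HashSet;

use milli::update::{IndexerConfig, Settings};
use milli::{Index, Result, Search};

// Hypothetical helper, not part of the patch series: disable typo tolerance on
// every word indexed under `description`, then run a typo-containing query.
fn disable_typos_on_description(index: &Index) -> Result<()> {
    let mut wtxn = index.write_txn()?;
    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut wtxn, index, &config);
    builder.set_exact_attributes(HashSet::from(["description".to_string()]));
    builder.execute(|_| ())?;
    wtxn.commit()?;

    // As in the integration test, "antebelum" (a typo for "antebellum") should no
    // longer match documents whose only occurrence of the word is in `description`.
    let rtxn = index.read_txn()?;
    let mut search = Search::new(&rtxn, index);
    search.query("antebelum");
    search.authorize_typos(true);
    let _result = search.execute()?;
    Ok(())
}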