From d8354f6f026e8f0d7fd2d4daf4020eec10ab2ccd Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Sun, 27 Sep 2020 11:46:52 +0200
Subject: [PATCH] Fix the word_docids capacity limit detection

---
 src/bin/indexer.rs | 48 +++++++++++++++++++++---------------------------
 src/search.rs      | 12 ++++++++----
 2 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index e58935b98..74ef3fb37 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -91,7 +91,7 @@ struct IndexerOpt {
 
     /// Size of the linked hash map cache when indexing.
     /// The bigger it is, the faster the indexing is but the more memory it takes.
-    #[structopt(long, default_value = "4096")]
+    #[structopt(long, default_value = "1048576")]
     linked_hash_map_size: usize,
 
     /// The name of the compression algorithm to use when compressing intermediate
@@ -169,9 +169,10 @@ type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;
 
 struct Store {
     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
+    word_docids_limit: usize,
     documents_ids: RoaringBitmap,
     sorter: Sorter<MergeFn>,
-    documents_sorter: Sorter<MergeFn>,
+    documents_writer: Writer<File>,
     chunk_compression_type: CompressionType,
     chunk_compression_level: Option<u32>,
 }
@@ -183,7 +184,7 @@ impl Store {
         max_memory: Option<usize>,
         chunk_compression_type: CompressionType,
         chunk_compression_level: Option<u32>,
-    ) -> Store
+    ) -> anyhow::Result<Store>
     {
         let mut builder = Sorter::builder(merge as MergeFn);
         builder.chunk_compression_type(chunk_compression_type);
@@ -197,35 +198,36 @@ impl Store {
             builder.max_memory(memory);
         }
 
-        let mut documents_builder = Sorter::builder(docs_merge as MergeFn);
-        documents_builder.chunk_compression_type(chunk_compression_type);
+        let mut documents_builder = Writer::builder();
+        documents_builder.compression_type(chunk_compression_type);
         if let Some(level) = chunk_compression_level {
-            builder.chunk_compression_level(level);
+            documents_builder.compression_level(level);
         }
+        let documents_writer = tempfile::tempfile().map(|f| documents_builder.build(f))?;
 
-        Store {
-            // We overflow by one before poping the LRU element.
-            word_docids: LinkedHashMap::with_capacity(linked_hash_map_size + 1),
+        Ok(Store {
+            word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
+            word_docids_limit: linked_hash_map_size,
             documents_ids: RoaringBitmap::new(),
             sorter: builder.build(),
-            documents_sorter: documents_builder.build(),
+            documents_writer,
             chunk_compression_type,
             chunk_compression_level,
-        }
+        })
     }
 
     // Save the documents ids under the position and word we have seen it.
     fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> anyhow::Result<()> {
-        let word_vec = SmallVec32::from(word.as_bytes());
        // if get_refresh finds the element it is assured to be at the end of the linked hash map.
-        match self.word_docids.get_refresh(&word_vec) {
+        match self.word_docids.get_refresh(word.as_bytes()) {
            Some(old) => { old.insert(id); },
            None => {
+                let word_vec = SmallVec32::from(word.as_bytes());
                // A newly inserted element is append at the end of the linked hash map.
                self.word_docids.insert(word_vec, RoaringBitmap::from_iter(Some(id)));
                // If the word docids just reached it's capacity we must make sure to remove
                // one element, this way next time we insert we doesn't grow the capacity.
-                if self.word_docids.len() == self.word_docids.capacity() {
+                if self.word_docids.len() == self.word_docids_limit {
                    // Removing the front element is equivalent to removing the LRU element.
                    let lru = self.word_docids.pop_front();
                    Self::write_word_docids(&mut self.sorter, lru)?;
@@ -261,7 +263,7 @@ impl Store {
             .with_context(|| format!("could not encode CSV record"))?;
 
         self.documents_ids.insert(document_id);
-        self.documents_sorter.insert(document_id.to_be_bytes(), record)?;
+        self.documents_writer.insert(document_id.to_be_bytes(), record)?;
         Self::write_docid_word_positions(&mut self.sorter, document_id, words_positions)?;
 
         Ok(())
@@ -436,10 +438,7 @@ impl Store {
         let fst = builder.into_set();
         wtr.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?;
 
-        let docs_wtr_file = tempfile::tempfile()?;
-        let mut docs_wtr = create_writer(compression_type, compression_level, docs_wtr_file);
-        self.documents_sorter.write_into(&mut docs_wtr)?;
-        let docs_file = docs_wtr.into_inner()?;
+        let docs_file = self.documents_writer.into_inner()?;
         let docs_mmap = unsafe { Mmap::map(&docs_file)? };
         let docs_reader = Reader::new(docs_mmap)?;
 
@@ -451,12 +450,6 @@ impl Store {
     }
 }
 
-fn docs_merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
-    let key = key.try_into().unwrap();
-    let id = u32::from_be_bytes(key);
-    panic!("documents must not conflict ({} with {} values)!", id, values.len())
-}
-
 fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
     match key {
         WORDS_FST_KEY => {
@@ -633,13 +626,14 @@ fn main() -> anyhow::Result<()> {
         .into_par_iter()
         .enumerate()
         .map(|(i, rdr)| {
-            Store::new(
+            let store = Store::new(
                 linked_hash_map_size,
                 max_nb_chunks,
                 Some(max_memory),
                 chunk_compression_type,
                 chunk_compression_level,
-            ).index_csv(rdr, i, num_threads)
+            )?;
+            store.index_csv(rdr, i, num_threads)
         })
         .collect::<Result<Vec<_>, _>>()?;
 
diff --git a/src/search.rs b/src/search.rs
index a4cf5ead3..23daf9ad3 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -217,10 +217,14 @@ impl<'a> Search<'a> {
         eprintln!("found pairs {:?}", pairs);
 
         let mut pairs_union = RoaringBitmap::new();
-        for (w1, w2) in pairs {
-            let key = (w1, w2, 1);
-            if let Some(docids) = index.word_pair_proximity_docids.get(rtxn, &key)? {
-                pairs_union.union_with(&docids);
+        'pairs: for (w1, w2) in pairs {
+            for prox in 1..=7 {
+                let key = (w1, w2, prox);
+                eprintln!("{:?}", key);
+                if let Some(docids) = index.word_pair_proximity_docids.get(rtxn, &key)? {
+                    pairs_union.union_with(&docids);
+                    continue 'pairs;
+                }
             }
         }
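Why the patch drops the `len() == capacity()` test: `with_capacity(n)` only guarantees room for at least `n` entries and normally rounds the real capacity up, so the eviction threshold was whatever the allocator chose rather than the configured `linked_hash_map_size`. A minimal sketch of that rounding, using `std::collections::HashMap` as a stand-in and assuming the `LinkedHashMap` cache, which is built on a `HashMap`, rounds the same way:

use std::collections::HashMap;

fn main() {
    // `with_capacity(n)` reserves room for *at least* n entries; the real
    // capacity is normally rounded up to respect the load factor.
    let requested = 4096;
    let map: HashMap<u64, u64> = HashMap::with_capacity(requested);
    assert!(map.capacity() >= requested);
    println!("requested {}, actual capacity {}", requested, map.capacity());

    // Because the real capacity is larger than requested, an eviction guarded by
    // `len() == capacity()` only kicks in once the map already holds more entries
    // than were asked for. Tracking the requested size separately (the patch's
    // `word_docids_limit`) and checking `len() == limit` caps the cache at
    // exactly the configured size before flushing the LRU entry to the sorter.
}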