From 86c34a996b5713915215d107d5cc22735ef238ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 31 Oct 2022 13:33:49 +0100 Subject: [PATCH 1/7] Deduplicate matching words --- milli/src/search/matches/matching_words.rs | 21 ++-- milli/src/search/matches/mod.rs | 50 +++++---- milli/src/search/query_tree.rs | 124 ++++++++++++++++++--- 3 files changed, 151 insertions(+), 44 deletions(-) diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 25d447d0c..5bd6c222d 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -2,6 +2,7 @@ use std::cmp::{min, Reverse}; use std::collections::BTreeMap; use std::fmt; use std::ops::{Index, IndexMut}; +use std::rc::Rc; use charabia::Token; use levenshtein_automata::{Distance, DFA}; @@ -14,11 +15,11 @@ type IsPrefix = bool; /// referencing words that match the given query tree. #[derive(Default)] pub struct MatchingWords { - inner: Vec<(Vec, Vec)>, + inner: Vec<(Vec>, Vec)>, } impl MatchingWords { - pub fn new(mut matching_words: Vec<(Vec, Vec)>) -> Self { + pub fn new(mut matching_words: Vec<(Vec>, Vec)>) -> Self { // Sort word by len in DESC order prioritizing the longuest matches, // in order to highlight the longuest part of the matched word. matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len()))); @@ -35,7 +36,8 @@ impl MatchingWords { /// Iterator over terms that match the given token, /// This allow to lazily evaluate matches. pub struct MatchesIter<'a, 'b> { - inner: Box, Vec)> + 'a>, + #[allow(clippy::type_complexity)] + inner: Box>, Vec)> + 'a>, token: &'b Token<'b>, } @@ -126,7 +128,7 @@ pub enum MatchType<'a> { /// Structure helper to match several tokens in a row in order to complete a partial match. #[derive(Debug, PartialEq)] pub struct PartialMatch<'a> { - matching_words: &'a [MatchingWord], + matching_words: &'a [Rc], ids: &'a [PrimitiveWordId], char_len: usize, } @@ -332,10 +334,15 @@ mod tests { #[test] fn matching_words() { + let all = vec![ + Rc::new(MatchingWord::new("split".to_string(), 1, true)), + Rc::new(MatchingWord::new("this".to_string(), 0, false)), + Rc::new(MatchingWord::new("world".to_string(), 1, true)), + ]; let matching_words = vec![ - (vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]), - (vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]), - (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + (vec![all[0].clone()], vec![0]), + (vec![all[1].clone()], vec![1]), + (vec![all[2].clone()], vec![2]), ]; let matching_words = MatchingWords::new(matching_words); diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index ec47f848d..0e515fde6 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -494,16 +494,23 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { #[cfg(test)] mod tests { + use std::rc::Rc; + use charabia::TokenizerBuilder; use super::*; use crate::search::matches::matching_words::MatchingWord; fn matching_words() -> MatchingWords { + let all = vec![ + Rc::new(MatchingWord::new("split".to_string(), 0, false)), + Rc::new(MatchingWord::new("the".to_string(), 0, false)), + Rc::new(MatchingWord::new("world".to_string(), 1, true)), + ]; let matching_words = vec![ - (vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]), - (vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]), - (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + (vec![all[0].clone()], vec![0]), + (vec![all[1].clone()], vec![1]), + (vec![all[2].clone()], vec![2]), ]; MatchingWords::new(matching_words) @@ -587,10 +594,11 @@ mod tests { #[test] fn highlight_unicode() { - let matching_words = vec![ - (vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]), - (vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]), + let all = vec![ + Rc::new(MatchingWord::new("wessfali".to_string(), 1, true)), + Rc::new(MatchingWord::new("world".to_string(), 1, true)), ]; + let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])]; let matching_words = MatchingWords::new(matching_words); @@ -823,24 +831,20 @@ mod tests { #[test] fn partial_matches() { + let all = vec![ + Rc::new(MatchingWord::new("the".to_string(), 0, false)), + Rc::new(MatchingWord::new("t".to_string(), 0, false)), + Rc::new(MatchingWord::new("he".to_string(), 0, false)), + Rc::new(MatchingWord::new("door".to_string(), 0, false)), + Rc::new(MatchingWord::new("do".to_string(), 0, false)), + Rc::new(MatchingWord::new("or".to_string(), 0, false)), + ]; let matching_words = vec![ - (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), - ( - vec![ - MatchingWord::new("t".to_string(), 0, false), - MatchingWord::new("he".to_string(), 0, false), - ], - vec![0], - ), - (vec![MatchingWord::new("door".to_string(), 0, false)], vec![1]), - ( - vec![ - MatchingWord::new("do".to_string(), 0, false), - MatchingWord::new("or".to_string(), 0, false), - ], - vec![1], - ), - (vec![MatchingWord::new("do".to_string(), 0, false)], vec![2]), + (vec![all[0].clone()], vec![0]), + (vec![all[1].clone(), all[2].clone()], vec![0]), + (vec![all[3].clone()], vec![1]), + (vec![all[4].clone(), all[5].clone()], vec![1]), + (vec![all[4].clone()], vec![2]), ]; let matching_words = MatchingWords::new(matching_words); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index a9c1ac29f..acb326022 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,5 +1,9 @@ use std::borrow::Cow; use std::cmp::max; +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::hash::Hash; +use std::rc::Rc; use std::{fmt, mem}; use charabia::classifier::ClassifiedTokenIter; @@ -540,6 +544,30 @@ fn create_query_tree( Ok(Operation::or(true, operation_children)) } +#[derive(Default, Debug)] +struct MatchingWordCache { + all: Vec>, + map: HashMap<(String, u8, bool), Rc>, +} +impl MatchingWordCache { + fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Rc { + // Toggle the (un)commented code to switch between cached and non-cached + // implementations. + + // self.all.push(MatchingWord::new(word, typo, prefix)); + // self.all.len() - 1 + match self.map.entry((word.clone(), typo, prefix)) { + Entry::Occupied(idx) => idx.get().clone(), + Entry::Vacant(vacant) => { + let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)); + self.all.push(matching_word.clone()); + vacant.insert(matching_word.clone()); + matching_word + } + } + } +} + /// Main function that matchings words used for crop and highlight. fn create_matching_words( ctx: &impl Context, @@ -551,7 +579,8 @@ fn create_matching_words( ctx: &impl Context, authorize_typos: bool, part: PrimitiveQueryPart, - matching_words: &mut Vec<(Vec, Vec)>, + matching_words: &mut Vec<(Vec>, Vec)>, + matching_word_cache: &mut MatchingWordCache, id: PrimitiveWordId, ) -> Result<()> { match part { @@ -562,15 +591,15 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| MatchingWord::new(syn, 0, false)) + .map(|syn| matching_word_cache.insert(syn, 0, false)) .collect(); matching_words.push((synonym, vec![id])); } } if let Some((left, right)) = split_best_frequency(ctx, &word)? { - let left = MatchingWord::new(left.to_string(), 0, false); - let right = MatchingWord::new(right.to_string(), 0, false); + let left = matching_word_cache.insert(left.to_string(), 0, false); + let right = matching_word_cache.insert(right.to_string(), 0, false); matching_words.push((vec![left, right], vec![id])); } @@ -580,8 +609,10 @@ fn create_matching_words( TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; let matching_word = match typos(word, authorize_typos, config) { - QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix), - QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix), + QueryKind::Exact { word, .. } => matching_word_cache.insert(word, 0, prefix), + QueryKind::Tolerant { typo, word } => { + matching_word_cache.insert(word, typo, prefix) + } }; matching_words.push((vec![matching_word], vec![id])); } @@ -589,8 +620,11 @@ fn create_matching_words( PrimitiveQueryPart::Phrase(words) => { let ids: Vec<_> = (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); - let words = - words.into_iter().flatten().map(|w| MatchingWord::new(w, 0, false)).collect(); + let words = words + .into_iter() + .flatten() + .map(|w| matching_word_cache.insert(w, 0, false)) + .collect(); matching_words.push((words, ids)); } } @@ -603,7 +637,8 @@ fn create_matching_words( ctx: &impl Context, authorize_typos: bool, query: &[PrimitiveQueryPart], - matching_words: &mut Vec<(Vec, Vec)>, + matching_words: &mut Vec<(Vec>, Vec)>, + matching_word_cache: &mut MatchingWordCache, mut id: PrimitiveWordId, ) -> Result<()> { const MAX_NGRAM: usize = 3; @@ -621,6 +656,7 @@ fn create_matching_words( authorize_typos, part.clone(), matching_words, + matching_word_cache, id, )?; } @@ -645,7 +681,7 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| MatchingWord::new(syn, 0, false)) + .map(|syn| matching_word_cache.insert(syn, 0, false)) .collect(); matching_words.push((synonym, ids.clone())); } @@ -662,10 +698,10 @@ fn create_matching_words( }; let matching_word = match typos(word, authorize_typos, config) { QueryKind::Exact { word, .. } => { - MatchingWord::new(word, 0, is_prefix) + matching_word_cache.insert(word, 0, is_prefix) } QueryKind::Tolerant { typo, word } => { - MatchingWord::new(word, typo, is_prefix) + matching_word_cache.insert(word, typo, is_prefix) } }; matching_words.push((vec![matching_word], ids)); @@ -673,7 +709,14 @@ fn create_matching_words( } if !is_last { - ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?; + ngrams( + ctx, + authorize_typos, + tail, + matching_words, + matching_word_cache, + id + 1, + )?; } } } @@ -683,8 +726,9 @@ fn create_matching_words( Ok(()) } + let mut matching_word_cache = MatchingWordCache::default(); let mut matching_words = Vec::new(); - ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?; + ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?; Ok(MatchingWords::new(matching_words)) } @@ -806,7 +850,9 @@ pub fn maximum_proximity(operation: &Operation) -> usize { #[cfg(test)] mod test { + use std::alloc::{GlobalAlloc, System}; use std::collections::HashMap; + use std::sync::atomic::{self, AtomicI64}; use charabia::Tokenize; use maplit::hashmap; @@ -814,6 +860,7 @@ mod test { use rand::{Rng, SeedableRng}; use super::*; + use crate::index::tests::TempIndex; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; #[derive(Debug)] @@ -1310,4 +1357,53 @@ mod test { Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } }) )); } + + #[global_allocator] + static ALLOC: CountingAlloc = + CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) }; + + pub struct CountingAlloc { + pub resident: AtomicI64, + pub allocated: AtomicI64, + } + unsafe impl GlobalAlloc for CountingAlloc { + unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { + self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); + self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); + + System.alloc(layout) + } + + unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { + self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::SeqCst); + System.dealloc(ptr, layout) + } + } + + // This test must be run + #[test] + fn ten_words() { + let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); + let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); + + let index = TempIndex::new(); + let rtxn = index.read_txn().unwrap(); + let query = "a beautiful summer house by the beach overlooking what seems"; + let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); + builder.words_limit(10); + let x = builder.build(query.tokenize()).unwrap().unwrap(); + let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); + let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); + + insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4521710"); + insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7259092"); + + // Note, if the matching word cache is deactivated, the memory usage is: + // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91311265"); + // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125948410"); + // or about 20x more resident memory (90MB vs 4.5MB) + + // Use x + let _x = x; + } } From 8d0ace2d64aa37558c354249e6104e7d407f3a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 24 Nov 2022 09:00:53 +0100 Subject: [PATCH 2/7] Avoid creating a MatchingWord for words that exceed the length limit --- milli/src/lib.rs | 15 ++++ milli/src/search/matches/matching_words.rs | 25 ++++-- milli/src/search/matches/mod.rs | 22 ++--- milli/src/search/query_tree.rs | 82 +++++++++++++------ .../extract/extract_docid_word_positions.rs | 8 +- .../extract/extract_facet_string_docids.rs | 3 +- .../extract/extract_fid_docid_facet_values.rs | 3 +- .../src/update/index_documents/helpers/mod.rs | 15 +--- 8 files changed, 111 insertions(+), 62 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index c33aae9eb..40e36092f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -70,6 +70,21 @@ pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; /// expressed in term of latitude and longitude. pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>; +/// The maximum length a LMDB key can be. +/// +/// Note that the actual allowed length is a little bit higher, but +/// we keep a margin of safety. +const MAX_LMDB_KEY_LENGTH: usize = 500; + +/// The maximum length a field value can be when inserted in an LMDB key. +/// +/// This number is determined by the keys of the different facet databases +/// and adding a margin of safety. +pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20; + +/// The maximum length a word can be +pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; + pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; // Convert an absolute word position into a relative position. diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 5bd6c222d..22ba973b5 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -8,6 +8,7 @@ use charabia::Token; use levenshtein_automata::{Distance, DFA}; use crate::search::build_dfa; +use crate::MAX_WORD_LENGTH; type IsPrefix = bool; @@ -18,6 +19,17 @@ pub struct MatchingWords { inner: Vec<(Vec>, Vec)>, } +impl fmt::Debug for MatchingWords { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "[")?; + for (matching_words, primitive_word_id) in self.inner.iter() { + writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?; + } + writeln!(f, "]")?; + Ok(()) + } +} + impl MatchingWords { pub fn new(mut matching_words: Vec<(Vec>, Vec)>) -> Self { // Sort word by len in DESC order prioritizing the longuest matches, @@ -93,10 +105,13 @@ impl PartialEq for MatchingWord { } impl MatchingWord { - pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self { + pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option { + if word.len() > MAX_WORD_LENGTH { + return None; + } let dfa = build_dfa(&word, typo, prefix); - Self { dfa, word, typo, prefix } + Some(Self { dfa, word, typo, prefix }) } /// Returns the lenght in chars of the match in case of the token matches the term. @@ -335,9 +350,9 @@ mod tests { #[test] fn matching_words() { let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 1, true)), - Rc::new(MatchingWord::new("this".to_string(), 0, false)), - Rc::new(MatchingWord::new("world".to_string(), 1, true)), + Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()), + Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), ]; let matching_words = vec![ (vec![all[0].clone()], vec![0]), diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 0e515fde6..25ee52ab1 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -503,9 +503,9 @@ mod tests { fn matching_words() -> MatchingWords { let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 0, false)), - Rc::new(MatchingWord::new("the".to_string(), 0, false)), - Rc::new(MatchingWord::new("world".to_string(), 1, true)), + Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), ]; let matching_words = vec![ (vec![all[0].clone()], vec![0]), @@ -595,8 +595,8 @@ mod tests { #[test] fn highlight_unicode() { let all = vec![ - Rc::new(MatchingWord::new("wessfali".to_string(), 1, true)), - Rc::new(MatchingWord::new("world".to_string(), 1, true)), + Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), ]; let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])]; @@ -832,12 +832,12 @@ mod tests { #[test] fn partial_matches() { let all = vec![ - Rc::new(MatchingWord::new("the".to_string(), 0, false)), - Rc::new(MatchingWord::new("t".to_string(), 0, false)), - Rc::new(MatchingWord::new("he".to_string(), 0, false)), - Rc::new(MatchingWord::new("door".to_string(), 0, false)), - Rc::new(MatchingWord::new("do".to_string(), 0, false)), - Rc::new(MatchingWord::new("or".to_string(), 0, false)), + Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()), ]; let matching_words = vec![ (vec![all[0].clone()], vec![0]), diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index acb326022..74b244f9a 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -550,21 +550,20 @@ struct MatchingWordCache { map: HashMap<(String, u8, bool), Rc>, } impl MatchingWordCache { - fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Rc { - // Toggle the (un)commented code to switch between cached and non-cached - // implementations. - - // self.all.push(MatchingWord::new(word, typo, prefix)); - // self.all.len() - 1 + fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Option> { match self.map.entry((word.clone(), typo, prefix)) { - Entry::Occupied(idx) => idx.get().clone(), + Entry::Occupied(idx) => Some(idx.get().clone()), Entry::Vacant(vacant) => { - let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)); + let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); self.all.push(matching_word.clone()); vacant.insert(matching_word.clone()); - matching_word + Some(matching_word) } } + // To deactivate the cache, for testing purposes, use the following instead: + // let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); + // self.all.push(matching_word.clone()); + // Some(matching_word) } } @@ -591,16 +590,19 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| matching_word_cache.insert(syn, 0, false)) + .flat_map(|syn| matching_word_cache.insert(syn, 0, false)) .collect(); matching_words.push((synonym, vec![id])); } } if let Some((left, right)) = split_best_frequency(ctx, &word)? { - let left = matching_word_cache.insert(left.to_string(), 0, false); - let right = matching_word_cache.insert(right.to_string(), 0, false); - matching_words.push((vec![left, right], vec![id])); + if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) { + if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false) + { + matching_words.push((vec![left, right], vec![id])); + } + } } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; @@ -614,7 +616,9 @@ fn create_matching_words( matching_word_cache.insert(word, typo, prefix) } }; - matching_words.push((vec![matching_word], vec![id])); + if let Some(matching_word) = matching_word { + matching_words.push((vec![matching_word], vec![id])); + } } // create a CONSECUTIVE matchings words wrapping all word in the phrase PrimitiveQueryPart::Phrase(words) => { @@ -623,7 +627,7 @@ fn create_matching_words( let words = words .into_iter() .flatten() - .map(|w| matching_word_cache.insert(w, 0, false)) + .flat_map(|w| matching_word_cache.insert(w, 0, false)) .collect(); matching_words.push((words, ids)); } @@ -681,7 +685,7 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| matching_word_cache.insert(syn, 0, false)) + .flat_map(|syn| matching_word_cache.insert(syn, 0, false)) .collect(); matching_words.push((synonym, ids.clone())); } @@ -704,7 +708,9 @@ fn create_matching_words( matching_word_cache.insert(word, typo, is_prefix) } }; - matching_words.push((vec![matching_word], ids)); + if let Some(matching_word) = matching_word { + matching_words.push((vec![matching_word], ids)); + } } } @@ -1341,6 +1347,27 @@ mod test { ); } + #[test] + fn test_dont_create_matching_word_for_long_words() { + let index = TempIndex::new(); + let rtxn = index.read_txn().unwrap(); + let query = "what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house"; + let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); + builder.words_limit(10); + let (_, _, matching_words) = builder.build(query.tokenize()).unwrap().unwrap(); + insta::assert_snapshot!(format!("{matching_words:?}"), @r###" + [ + ([MatchingWord { word: "house", typo: 1, prefix: true }], [3]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) + ([MatchingWord { word: "whata", typo: 1, prefix: false }], [0, 1]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [1]) + ([MatchingWord { word: "what", typo: 0, prefix: false }], [0]) + ([MatchingWord { word: "a", typo: 0, prefix: false }], [1]) + ] + "###); + } + #[test] fn disable_typo_on_word() { let query = "goodbye"; @@ -1380,9 +1407,8 @@ mod test { } } - // This test must be run #[test] - fn ten_words() { + fn memory_usage_of_ten_word_query() { let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); @@ -1395,12 +1421,20 @@ mod test { let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); - insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4521710"); - insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7259092"); + // Weak check on the memory usage + // Don't keep more than 5MB. (Arguably 5MB is already too high) + assert!(resident_after - resident_before < 5_000_000); + // Don't allocate more than 10MB. + assert!(allocated_after - allocated_before < 10_000_000); - // Note, if the matching word cache is deactivated, the memory usage is: - // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91311265"); - // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125948410"); + // Use these snapshots to measure the exact memory usage. + // The values below were correct at the time I wrote them. + // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950"); + // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502"); + + // Note, with the matching word cache deactivated, the memory usage was: + // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697"); + // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588"); // or about 20x more resident memory (90MB vs 4.5MB) // Use x diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 8eae0caee..be9b479bb 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -7,11 +7,11 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; use roaring::RoaringBitmap; use serde_json::Value; -use super::helpers::{ - concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH, -}; +use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; -use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{ + absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, +}; /// Extracts the word and positions where this word appear and /// prefixes it by the document id. diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 3a0af3c96..0d9c0981e 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -6,9 +6,8 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::StrRefCodec; -use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH; use crate::update::index_documents::merge_cbo_roaring_bitmaps; -use crate::{FieldId, Result}; +use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet string and the documents ids where this facet string appear. /// diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index b37cd90d3..0a7dfbeb1 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -12,9 +12,8 @@ use serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH; use crate::update::index_documents::{create_writer, writer_into_reader}; -use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32}; +use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet values of each faceted field of each document. /// diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index e1f112858..a496ccd6e 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -18,20 +18,7 @@ pub use merge_functions::{ serialize_roaring_bitmap, MergeFn, }; -/// The maximum length a LMDB key can be. -/// -/// Note that the actual allowed length is a little bit higher, but -/// we keep a margin of safety. -const MAX_LMDB_KEY_LENGTH: usize = 500; - -/// The maximum length a field value can be when inserted in an LMDB key. -/// -/// This number is determined by the keys of the different facet databases -/// and adding a margin of safety. -pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20; - -/// The maximum length a word can be -pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; +use crate::MAX_WORD_LENGTH; pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty() From 8284bd760f601da6c22b76bea61dba9460fec4ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 24 Nov 2022 09:29:10 +0100 Subject: [PATCH 3/7] Relax memory ordering of operations within the test CountingAlloc --- milli/src/search/query_tree.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 74b244f9a..9d0ca5633 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1395,14 +1395,14 @@ mod test { } unsafe impl GlobalAlloc for CountingAlloc { unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { - self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); - self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); + self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); + self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); System.alloc(layout) } unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { - self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::SeqCst); + self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); System.dealloc(ptr, layout) } } From e2ebed62b1f20b36d1cf802938867ec3046e5a44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 28 Nov 2022 10:19:43 +0100 Subject: [PATCH 4/7] Don't create partial matching words for synonyms, split words, phrases --- milli/src/search/query_tree.rs | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 9d0ca5633..b218b48e2 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -588,15 +588,21 @@ fn create_matching_words( PrimitiveQueryPart::Word(word, prefix) => { if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? { for synonym in synonyms { - let synonym = synonym + // Require that all words of the synonym have a corresponding MatchingWord + // before adding any of its words to the matching_words result. + if let Some(synonym_matching_words) = synonym .into_iter() - .flat_map(|syn| matching_word_cache.insert(syn, 0, false)) - .collect(); - matching_words.push((synonym, vec![id])); + .map(|word| matching_word_cache.insert(word, 0, false)) + .collect() + { + matching_words.push((synonym_matching_words, vec![id])); + } } } if let Some((left, right)) = split_best_frequency(ctx, &word)? { + // Require that both left and right words have a corresponding MatchingWord + // before adding them to the matching_words result if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) { if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false) { @@ -624,12 +630,16 @@ fn create_matching_words( PrimitiveQueryPart::Phrase(words) => { let ids: Vec<_> = (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); - let words = words + // Require that all words of the phrase have a corresponding MatchingWord + // before adding any of them to the matching_words result + if let Some(phrase_matching_words) = words .into_iter() .flatten() - .flat_map(|w| matching_word_cache.insert(w, 0, false)) - .collect(); - matching_words.push((words, ids)); + .map(|w| matching_word_cache.insert(w, 0, false)) + .collect() + { + matching_words.push((phrase_matching_words, ids)); + } } } From 80588daae516127b8aa6b2db9f9f55014f357900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 28 Nov 2022 10:27:15 +0100 Subject: [PATCH 5/7] Fix compilation error in formatting benches --- benchmarks/benches/formatting.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs index 25c5a0ba8..3479029f4 100644 --- a/benchmarks/benches/formatting.rs +++ b/benchmarks/benches/formatting.rs @@ -1,3 +1,5 @@ +use std::rc::Rc; + use criterion::{criterion_group, criterion_main}; use milli::tokenizer::TokenizerBuilder; use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; @@ -18,14 +20,14 @@ fn bench_formatting(c: &mut criterion::Criterion) { name: "'the door d'", text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#, matching_words: MatcherBuilder::new(MatchingWords::new(vec![ - (vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]), - (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), - (vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]), - (vec![MatchingWord::new("do".to_string(), 0, false), MatchingWord::new("or".to_string(), 0, false)], vec![0]), - (vec![MatchingWord::new("thedoor".to_string(), 1, false)], vec![0, 1]), - (vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]), - (vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]), - (vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]), + (vec![Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap())], vec![0]), + (vec![Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap())], vec![0]), + (vec![Rc::new(MatchingWord::new("door".to_string(), 1, false).unwrap())], vec![1]), + (vec![Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap())], vec![0]), + (vec![Rc::new(MatchingWord::new("thedoor".to_string(), 1, false).unwrap())], vec![0, 1]), + (vec![Rc::new(MatchingWord::new("d".to_string(), 0, true).unwrap())], vec![2]), + (vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]), + (vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]), ] ), TokenizerBuilder::default().build()), }, From f70856bab1505b808261e6d889bcfb3dd9eaad61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 28 Nov 2022 12:39:43 +0100 Subject: [PATCH 6/7] Remove memory usage test that fails when many tests are run in parallel --- milli/src/search/query_tree.rs | 117 +++++++++++++++++---------------- 1 file changed, 61 insertions(+), 56 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index b218b48e2..6ea82f165 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -866,9 +866,7 @@ pub fn maximum_proximity(operation: &Operation) -> usize { #[cfg(test)] mod test { - use std::alloc::{GlobalAlloc, System}; use std::collections::HashMap; - use std::sync::atomic::{self, AtomicI64}; use charabia::Tokenize; use maplit::hashmap; @@ -1395,59 +1393,66 @@ mod test { )); } - #[global_allocator] - static ALLOC: CountingAlloc = - CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) }; + // The memory usage test below is disabled because `cargo test` runs multiple tests in parallel, + // which invalidates the measurements of memory usage. Nevertheless, it is a useful test to run + // manually from time to time, so I kept it here, commented-out. - pub struct CountingAlloc { - pub resident: AtomicI64, - pub allocated: AtomicI64, - } - unsafe impl GlobalAlloc for CountingAlloc { - unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { - self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); - self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); - - System.alloc(layout) - } - - unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { - self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); - System.dealloc(ptr, layout) - } - } - - #[test] - fn memory_usage_of_ten_word_query() { - let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); - let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); - - let index = TempIndex::new(); - let rtxn = index.read_txn().unwrap(); - let query = "a beautiful summer house by the beach overlooking what seems"; - let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); - builder.words_limit(10); - let x = builder.build(query.tokenize()).unwrap().unwrap(); - let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); - let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); - - // Weak check on the memory usage - // Don't keep more than 5MB. (Arguably 5MB is already too high) - assert!(resident_after - resident_before < 5_000_000); - // Don't allocate more than 10MB. - assert!(allocated_after - allocated_before < 10_000_000); - - // Use these snapshots to measure the exact memory usage. - // The values below were correct at the time I wrote them. - // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950"); - // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502"); - - // Note, with the matching word cache deactivated, the memory usage was: - // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697"); - // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588"); - // or about 20x more resident memory (90MB vs 4.5MB) - - // Use x - let _x = x; - } + // use std::alloc::{GlobalAlloc, System}; + // use std::sync::atomic::{self, AtomicI64}; + // + // #[global_allocator] + // static ALLOC: CountingAlloc = + // CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) }; + // + // pub struct CountingAlloc { + // pub resident: AtomicI64, + // pub allocated: AtomicI64, + // } + // unsafe impl GlobalAlloc for CountingAlloc { + // unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { + // self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); + // self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); + // + // System.alloc(layout) + // } + // + // unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { + // self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); + // System.dealloc(ptr, layout) + // } + // } + // + // #[test] + // fn memory_usage_of_ten_word_query() { + // let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); + // let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); + // + // let index = TempIndex::new(); + // let rtxn = index.read_txn().unwrap(); + // let query = "a beautiful summer house by the beach overlooking what seems"; + // let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); + // builder.words_limit(10); + // let x = builder.build(query.tokenize()).unwrap().unwrap(); + // let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); + // let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); + // + // // Weak check on the memory usage + // // Don't keep more than 5MB. (Arguably 5MB is already too high) + // assert!(resident_after - resident_before < 5_000_000); + // // Don't allocate more than 10MB. + // assert!(allocated_after - allocated_before < 10_000_000); + // + // // Use these snapshots to measure the exact memory usage. + // // The values below were correct at the time I wrote them. + // // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950"); + // // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502"); + // + // // Note, with the matching word cache deactivated, the memory usage was: + // // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697"); + // // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588"); + // // or about 20x more resident memory (90MB vs 4.5MB) + // + // // Use x + // let _x = x; + // } } From 61b58b115a5dcfe7b0dc701c722f4832abdc747a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 28 Nov 2022 16:32:28 +0100 Subject: [PATCH 7/7] Don't create partial matching words for synonyms in ngrams --- milli/src/search/query_tree.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 6ea82f165..e689ae440 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -693,11 +693,13 @@ fn create_matching_words( if let Some(synonyms) = ctx.synonyms(&words)? { for synonym in synonyms { - let synonym = synonym + if let Some(synonym) = synonym .into_iter() - .flat_map(|syn| matching_word_cache.insert(syn, 0, false)) - .collect(); - matching_words.push((synonym, ids.clone())); + .map(|syn| matching_word_cache.insert(syn, 0, false)) + .collect() + { + matching_words.push((synonym, ids.clone())); + } } } let word = words.concat();