From 0f4c0beffd0a25437c2e58cfb2f7686ab17da87f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Mar 2021 11:48:55 +0100 Subject: [PATCH 01/45] Introduce the Attribute criterion --- Cargo.lock | 7 ++ milli/Cargo.toml | 1 + milli/src/search/criteria/attribute.rs | 133 +++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 4 + 4 files changed, 145 insertions(+) create mode 100644 milli/src/search/criteria/attribute.rs diff --git a/Cargo.lock b/Cargo.lock index bbe86a2a7..065be362f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,12 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "big_s" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8" + [[package]] name = "bincode" version = "1.3.1" @@ -1251,6 +1257,7 @@ name = "milli" version = "0.1.1" dependencies = [ "anyhow", + "big_s", "bstr", "byteorder", "chrono", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b198131c1..eefdfa7d5 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -52,6 +52,7 @@ logging_timer = "1.0.0" tinytemplate = "=1.1.0" [dev-dependencies] +big_s = "1.0.2" criterion = "0.3.4" maplit = "1.0.2" rand = "0.8.3" diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs new file mode 100644 index 000000000..9c31740b1 --- /dev/null +++ b/milli/src/search/criteria/attribute.rs @@ -0,0 +1,133 @@ +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::criteria::Query; +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; +use super::{Criterion, CriterionResult, Context}; + +pub struct Attribute<'t> { + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + bucket_candidates: RoaringBitmap, + parent: Option>, +} + +impl<'t> Attribute<'t> { + pub fn 
initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> Self + { + Attribute { + ctx, + query_tree, + candidates, + bucket_candidates: RoaringBitmap::new(), + parent: None, + } + } + + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Attribute { + ctx, + query_tree: None, + candidates: None, + bucket_candidates: RoaringBitmap::new(), + parent: Some(parent), + } + } +} + +impl<'t> Criterion for Attribute<'t> { + #[logging_timer::time("Attribute::{}")] + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + todo!("Attribute") + } +} + +// TODO can we keep refs of Query +fn explode_query_tree(query_tree: &Operation) -> Vec> { + use crate::search::criteria::Operation::{And, Or, Consecutive}; + + fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec> { + match tail.split_first() { + Some((thead, tail)) => { + let tail = and_recurse(thead, tail); + let mut out = Vec::new(); + for array in recurse(head) { + for tail_array in &tail { + let mut array = array.clone(); + array.extend(tail_array.iter().cloned()); + out.push(array); + } + } + out + }, + None => recurse(head), + } + } + + fn recurse(op: &Operation) -> Vec> { + match op { + And(ops) | Consecutive(ops) => { + ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) + }, + Or(_, ops) => ops.into_iter().map(recurse).flatten().collect(), + Operation::Query(query) => vec![vec![query.clone()]], + } + } + + recurse(query_tree) +} + +#[cfg(test)] +mod tests { + use big_s::S; + + use crate::search::criteria::QueryKind; + use super::*; + + #[test] + fn simple_explode_query_tree() { + let query_tree = Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), + ]), + Operation::And(vec![ + 
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), + ]), + ]), + ]), + ]); + + let expected = vec![ + vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }], + vec![ + Query { prefix: false, kind: QueryKind::exact(S("manythe")) }, + Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + ], + vec![ + Query { prefix: false, kind: QueryKind::exact(S("many")) }, + Query { prefix: false, kind: QueryKind::exact(S("thefish")) }, + ], + vec![ + Query { prefix: false, kind: QueryKind::exact(S("many")) }, + Query { prefix: false, kind: QueryKind::exact(S("the")) }, + Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + ], + ]; + + let result = explode_query_tree(&query_tree); + assert_eq!(expected, result); + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 22f081871..8d9c21f6e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -12,12 +12,14 @@ use self::typo::Typo; use self::words::Words; use self::asc_desc::AscDesc; use self::proximity::Proximity; +use self::attribute::Attribute; use self::fetcher::Fetcher; mod typo; mod words; mod asc_desc; mod proximity; +mod attribute; pub mod fetcher; pub trait Criterion { @@ -139,6 +141,7 @@ impl<'t> CriteriaBuilder<'t> { Name::Typo => Box::new(Typo::new(self, father)), Name::Words => Box::new(Words::new(self, father)), Name::Proximity => Box::new(Proximity::new(self, father)), + Name::Attribute => Box::new(Attribute::new(self, father)), Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?), Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?), 
_otherwise => father, @@ -147,6 +150,7 @@ impl<'t> CriteriaBuilder<'t> { Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), + Name::Attribute => Box::new(Attribute::initial(self, query_tree.take(), facet_candidates.take())), Name::Asc(field) => { Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) }, From 4ff67ec2ee16d9b02362c85ab582dad9898b4a66 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 11 Mar 2021 17:31:02 +0100 Subject: [PATCH 02/45] Implement attribute criterion for small amounts of candidates --- milli/src/search/criteria/attribute.rs | 164 +++++++++++++++++++++++-- 1 file changed, 157 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 9c31740b1..7f8b5c622 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,10 +1,13 @@ -use log::debug; +use std::collections::{BTreeMap, HashMap, btree_map}; +use std::mem::take; + use roaring::RoaringBitmap; +use crate::{search::build_dfa}; use crate::search::criteria::Query; -use crate::search::query_tree::Operation; +use crate::search::query_tree::{Operation, QueryKind}; use crate::search::WordDerivationsCache; -use super::{Criterion, CriterionResult, Context}; +use super::{Criterion, CriterionResult, Context, resolve_query_tree}; pub struct Attribute<'t> { ctx: &'t dyn Context, @@ -12,6 +15,8 @@ pub struct Attribute<'t> { candidates: Option, bucket_candidates: RoaringBitmap, parent: Option>, + flattened_query_tree: Option>>, + current_buckets: Option>, } impl<'t> Attribute<'t> { @@ -27,6 +32,8 @@ impl<'t> Attribute<'t> { candidates, bucket_candidates: RoaringBitmap::new(), parent: None, + flattened_query_tree: None, 
+ current_buckets: None, } } @@ -37,6 +44,8 @@ impl<'t> Attribute<'t> { candidates: None, bucket_candidates: RoaringBitmap::new(), parent: Some(parent), + flattened_query_tree: None, + current_buckets: None, } } } @@ -44,12 +53,153 @@ impl<'t> Attribute<'t> { impl<'t> Criterion for Attribute<'t> { #[logging_timer::time("Attribute::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { - todo!("Attribute") + loop { + match (&self.query_tree, &mut self.candidates) { + (_, Some(candidates)) if candidates.is_empty() => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + (Some(qt), Some(candidates)) => { + let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| flatten_query_tree(&qt)); + let current_buckets = if let Some(current_buckets) = self.current_buckets.as_mut() { + current_buckets + } else { + let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + self.current_buckets.get_or_insert(new_buckets.into_iter()) + }; + + let found_candidates = if let Some((_score, candidates)) = current_buckets.next() { + candidates + } else { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }; + candidates.difference_with(&found_candidates); + + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => found_candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: Some(found_candidates), + bucket_candidates: bucket_candidates, + })); + }, + (Some(qt), None) => { + let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; + self.bucket_candidates.union_with(&query_tree_candidates); + self.candidates = 
Some(query_tree_candidates); + }, + (None, Some(_)) => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + (None, None) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next(wdcache)? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree; + self.candidates = candidates; + self.bucket_candidates.union_with(&bucket_candidates); + self.flattened_query_tree = None; + self.current_buckets = None; + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } } } +fn linear_compute_candidates( + ctx: &dyn Context, + branches: &Vec>, + allowed_candidates: &RoaringBitmap, +) -> anyhow::Result> +{ + fn compute_candidate_rank(branches: &Vec>, words_positions: HashMap) -> u64 { + let mut min_rank = u64::max_value(); + for branch in branches { + let mut branch_rank = 0; + for Query { prefix, kind } in branch { + // find the best position of the current word in the document. + let position = match kind { + QueryKind::Exact { word, .. } => { + if *prefix { + word_derivations(word, true, 0, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + } else { + words_positions.get(word) + .map(|positions| positions.iter().next()) + .flatten() + } + }, + QueryKind::Tolerant { typo, word } => { + word_derivations(word, *prefix, *typo, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + }, + }; + + // if a position is found, we add it to the branch score, + // otherwise the branch is considered as unfindable in this document and we break. 
+ if let Some(position) = position { + branch_rank += position as u64; + } else { + branch_rank = u64::max_value(); + break; + } + } + min_rank = min_rank.min(branch_rank); + } + + min_rank + } + + fn word_derivations<'a>( + word: &str, + is_prefix: bool, + max_typo: u8, + words_positions: &'a HashMap, + ) -> impl Iterator + { + let dfa = build_dfa(word, max_typo, is_prefix); + words_positions.iter().filter_map(move |(document_word, positions)| { + use levenshtein_automata::Distance; + match dfa.eval(document_word) { + Distance::Exact(_) => Some(positions), + Distance::AtLeast(_) => None, + } + }) + } + + let mut candidates = BTreeMap::new(); + for docid in allowed_candidates { + let words_positions = ctx.docid_words_positions(docid)?; + let rank = compute_candidate_rank(branches, words_positions); + candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid); + } + + Ok(candidates) +} + // TODO can we keep refs of Query -fn explode_query_tree(query_tree: &Operation) -> Vec> { +fn flatten_query_tree(query_tree: &Operation) -> Vec> { use crate::search::criteria::Operation::{And, Or, Consecutive}; fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec> { @@ -91,7 +241,7 @@ mod tests { use super::*; #[test] - fn simple_explode_query_tree() { + fn simple_flatten_query_tree() { let query_tree = Operation::Or(false, vec![ Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), Operation::And(vec![ @@ -127,7 +277,7 @@ mod tests { ], ]; - let result = explode_query_tree(&query_tree); + let result = flatten_query_tree(&query_tree); assert_eq!(expected, result); } } From 75e7b1e3dadb46c0761e07a14b3a70dfe6e3c01d Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Mar 2021 13:49:55 +0100 Subject: [PATCH 03/45] Implement test Context methods --- milli/src/search/criteria/mod.rs | 99 +++++++++++++++++++------------- 1 file changed, 59 insertions(+), 40 deletions(-) diff --git a/milli/src/search/criteria/mod.rs 
b/milli/src/search/criteria/mod.rs index 8d9c21f6e..1d7026d71 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -366,6 +366,7 @@ pub mod test { word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + docid_words: HashMap>, } impl<'a> Context for TestContext<'a> { @@ -399,8 +400,17 @@ pub mod test { self.word_prefix_docids.contains_key(&word.to_string()) } - fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result> { - todo!() + fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + if let Some(docid_words) = self.docid_words.get(&docid) { + Ok(docid_words + .iter() + .enumerate() + .map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))) + .collect() + ) + } else { + Ok(HashMap::new()) + } } } @@ -435,50 +445,58 @@ pub mod test { s("morning") => random_postings(rng, 125), }; + let mut docid_words = HashMap::new(); + for (word, docids) in word_docids.iter() { + for docid in docids { + let words = docid_words.entry(docid).or_insert(vec![]); + words.push(word.clone()); + } + } + let word_prefix_docids = hashmap!{ s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], }; - let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")]; - let hello_world_split = (hello_world.len() / 2) as usize; - let hello_world_1 = hello_world.iter().take(hello_world_split).collect(); - let hello_world_2 = hello_world.iter().skip(hello_world_split).collect(); - - let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")]; - let hello_word_split = (hello_word.len() / 2) as usize; - let hello_word_4 = hello_word.iter().take(hello_word_split).collect(); - let hello_word_6 = 
hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect(); - let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect(); - let word_pair_proximity_docids = hashmap!{ - (s("good"), s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")], - (s("hello"), s("world"), 1) => hello_world_1, - (s("hello"), s("world"), 4) => hello_world_2, - (s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")], - (s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], - (s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), - (s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], - (s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), - (s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")], - (s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")], - (s("hello"), s("word"), 4) => hello_word_4, - (s("hello"), s("word"), 6) => hello_word_6, - (s("hello"), s("word"), 7) => hello_word_7, - (s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")], - (s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], - (s("this"), s("ngrams"), 1) => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")], - (s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], - }; - - let word_prefix_pair_proximity_docids = hashmap!{ - (s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 
1)).unwrap().clone(), - (s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(), - (s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(), - (s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(), - (s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(), - (s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(), - }; + let mut word_pair_proximity_docids = HashMap::new(); + let mut word_prefix_pair_proximity_docids = HashMap::new(); + for (lword, lcandidates) in &word_docids { + for (rword, rcandidates) in &word_docids { + if lword == rword { continue } + let candidates = lcandidates & rcandidates; + for candidate in candidates { + if let Some(docid_words) = docid_words.get(&candidate) { + let lposition = docid_words.iter().position(|w| w == lword).unwrap(); + let rposition = docid_words.iter().position(|w| w == rword).unwrap(); + let key = if lposition < rposition { + (s(lword), s(rword), (rposition - lposition) as i32) + } else { + (s(lword), s(rword), (lposition - rposition + 1) as i32) + }; + let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + docids.push(candidate); + } + } + } + for (pword, pcandidates) in &word_prefix_docids { + if lword.starts_with(pword) { continue } + let candidates = lcandidates & pcandidates; + for candidate in candidates { + if let Some(docid_words) = docid_words.get(&candidate) { + let lposition = docid_words.iter().position(|w| w == lword).unwrap(); + let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); + let key = if lposition < rposition { + (s(lword), 
s(pword), (rposition - lposition) as i32) + } else { + (s(lword), s(pword), (lposition - rposition + 1) as i32) + }; + let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + docids.push(candidate); + } + } + } + } let mut keys = word_docids.keys().collect::>(); keys.sort_unstable(); @@ -490,6 +508,7 @@ pub mod test { word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + docid_words, } } } From b0a417f342de6afe8678c628b5e3be9c30f9c302 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Mar 2021 17:24:35 +0100 Subject: [PATCH 04/45] Introduce the word_level_position_docids Index database --- infos/src/main.rs | 1 + milli/src/heed_codec/mod.rs | 2 + .../heed_codec/str_level_position_codec.rs | 42 +++++++++++++++++++ milli/src/index.rs | 8 +++- milli/src/lib.rs | 2 +- milli/src/update/clear_documents.rs | 1 + milli/src/update/delete_documents.rs | 1 + 7 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 milli/src/heed_codec/str_level_position_codec.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index cc1727a68..356a5417c 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -319,6 +319,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, documents, diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index a070c66eb..cc73cdc65 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -2,6 +2,7 @@ mod beu32_str_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; +mod str_level_position_codec; mod str_str_u8_codec; pub mod facet; @@ -9,4 +10,5 @@ pub use self::beu32_str_codec::BEU32StrCodec; pub use self::obkv_codec::ObkvCodec; pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, 
RoaringBitmapCodec}; pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; +pub use self::str_level_position_codec::StrLevelPositionCodec; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs new file mode 100644 index 000000000..c421c04b5 --- /dev/null +++ b/milli/src/heed_codec/str_level_position_codec.rs @@ -0,0 +1,42 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::mem::size_of; +use std::str; + +pub struct StrLevelPositionCodec; + +impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { + type DItem = (&'a str, u8, u32, u32); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let footer_len = size_of::() + size_of::() * 2; + + if bytes.len() < footer_len { return None } + + let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let word = str::from_utf8(word).ok()?; + + let (level, bytes) = bytes.split_first()?; + let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?; + let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?; + + Some((word, *level, left, right)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { + type EItem = (&'a str, u8, u32, u32); + + fn bytes_encode((word, level, left, right): &Self::EItem) -> Option> { + let left = left.to_be_bytes(); + let right = right.to_be_bytes(); + + let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len()); + bytes.extend_from_slice(word.as_bytes()); + bytes.push(*level); + bytes.extend_from_slice(&left[..]); + bytes.extend_from_slice(&right[..]); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 045eabc3c..0659b207a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -12,7 +12,7 @@ use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, use crate::{BEU32, DocumentId, ExternalDocumentsIds, 
FieldId}; use crate::{ BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, - ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec, + ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, }; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; @@ -52,6 +52,8 @@ pub struct Index { pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, + /// Maps the word, level and position range with the docids that corresponds to it. + pub word_level_position_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. pub facet_field_id_value_docids: Database, /// Maps the document id, the facet field id and the globally ordered value. @@ -62,7 +64,7 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(9); + options.max_dbs(10); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -71,6 +73,7 @@ impl Index { let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; + let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; @@ -94,6 +97,7 @@ impl Index { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, 
field_id_docid_facet_values, documents, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index fe9bd828b..de5c6511e 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,7 @@ use serde_json::{Map, Value}; pub use self::criterion::{Criterion, default_criteria}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; -pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; +pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 2c24d9c07..250e4b13a 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -28,6 +28,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 8a2ba9bbf..b60b7bac2 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -88,6 +88,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, From 9242f2f1d451807e45f29462c0126c992d8950af Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Mar 2021 17:23:46 +0100 Subject: [PATCH 05/45] Store the first word positions levels --- .../update/index_documents/merge_function.rs | 4 + milli/src/update/index_documents/mod.rs | 42 +++- 
milli/src/update/index_documents/store.rs | 58 +++++- milli/src/update/mod.rs | 2 + milli/src/update/words_level_positions.rs | 184 ++++++++++++++++++ 5 files changed, 284 insertions(+), 6 deletions(-) create mode 100644 milli/src/update/words_level_positions.rs diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 6f24fcad9..54f994fc0 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -52,6 +52,10 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) - cbo_roaring_bitmap_merge(values) } +pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) +} + pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { cbo_roaring_bitmap_merge(values) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 52949c13c..8fc35b654 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -18,11 +18,12 @@ use rayon::prelude::*; use serde::{Serialize, Deserialize}; use crate::index::Index; -use crate::update::{Facets, WordsPrefixes, UpdateIndexingStep}; +use crate::update::{Facets, WordsLevelPositions, WordsPrefixes, UpdateIndexingStep}; use self::store::{Store, Readers}; pub use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, + docid_word_positions_merge, documents_merge, + word_level_position_docids_merge, facet_field_value_docids_merge, field_id_docid_facet_values_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -402,6 +403,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { enum DatabaseType { Main, WordDocids, + WordLevel0PositionDocids, FacetLevel0ValuesDocids, } @@ 
-467,6 +469,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut word_docids_readers = Vec::with_capacity(readers.len()); let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); + let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len()); let mut documents_readers = Vec::with_capacity(readers.len()); @@ -476,6 +479,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { word_docids, docid_word_positions, words_pairs_proximities_docids, + word_level_position_docids, facet_field_value_docids, field_id_docid_facet_values, documents @@ -484,6 +488,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { word_docids_readers.push(word_docids); docid_word_positions_readers.push(docid_word_positions); words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); + word_level_position_docids_readers.push(word_level_position_docids); facet_field_value_docids_readers.push(facet_field_value_docids); field_id_docid_facet_values_readers.push(field_id_docid_facet_values); documents_readers.push(documents); @@ -514,6 +519,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { facet_field_value_docids_readers, facet_field_value_docids_merge, ), + ( + DatabaseType::WordLevel0PositionDocids, + word_level_position_docids_readers, + word_level_position_docids_merge, + ), ] .into_par_iter() .for_each(|(dbtype, readers, merge)| { @@ -569,7 +579,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; - let total_databases = 7; + let total_databases = 8; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, @@ -661,7 +671,7 @@ impl<'t, 'u, 'i, 'a> 
IndexDocuments<'t, 'u, 'i, 'a> { )?; }, DatabaseType::FacetLevel0ValuesDocids => { - debug!("Writing the facet values docids into LMDB on disk..."); + debug!("Writing the facet level 0 values docids into LMDB on disk..."); let db = *self.index.facet_field_id_value_docids.as_polymorph(); write_into_lmdb_database( self.wtxn, @@ -671,6 +681,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; }, + DatabaseType::WordLevel0PositionDocids => { + debug!("Writing the word level 0 positions docids into LMDB on disk..."); + let db = *self.index.word_level_position_docids.as_polymorph(); + write_into_lmdb_database( + self.wtxn, + db, + content, + word_level_position_docids_merge, + write_method, + )?; + } } database_count += 1; @@ -693,6 +714,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the words positions update operation. + let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + if let Some(value) = self.facet_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.facet_min_level_size { + builder.min_level_size(value); + } + builder.execute()?; + // Run the words prefixes update operation. 
let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 0bd83b692..358552768 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -29,7 +29,8 @@ use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - facet_field_value_docids_merge, field_id_docid_facet_values_merge, + word_level_position_docids_merge, facet_field_value_docids_merge, + field_id_docid_facet_values_merge, }; const LMDB_MAX_KEY_LENGTH: usize = 511; @@ -43,6 +44,7 @@ pub struct Readers { pub word_docids: Reader, pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, + pub word_level_position_docids: Reader, pub facet_field_value_docids: Reader, pub field_id_docid_facet_values: Reader, pub documents: Reader, @@ -69,6 +71,7 @@ pub struct Store<'s, A> { main_sorter: Sorter, word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, + word_level_position_docids_sorter: Sorter, facet_field_value_docids_sorter: Sorter, field_id_docid_facet_values_sorter: Sorter, // MTBL writers @@ -94,7 +97,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { ) -> anyhow::Result { // We divide the max memory by the number of sorter the Store have. 
- let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4)); + let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); let main_sorter = create_sorter( @@ -121,6 +124,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_nb_chunks, max_memory, ); + let word_level_position_docids_sorter = create_sorter( + word_level_position_docids_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + max_nb_chunks, + max_memory, + ); let facet_field_value_docids_sorter = create_sorter( facet_field_value_docids_merge, chunk_compression_type, @@ -172,6 +183,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { main_sorter, word_docids_sorter, words_pairs_proximities_docids_sorter, + word_level_position_docids_sorter, facet_field_value_docids_sorter, field_id_docid_facet_values_sorter, // MTBL writers @@ -290,6 +302,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { self.documents_writer.insert(document_id.to_be_bytes(), record)?; Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; + Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?; words_positions.clear(); @@ -360,6 +373,42 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } + fn write_word_position_docids( + writer: &mut Sorter, + document_id: DocumentId, + words_positions: &HashMap>, + ) -> anyhow::Result<()> + { + let mut key_buffer = Vec::new(); + let mut data_buffer = Vec::new(); + + for (word, positions) in words_positions { + key_buffer.clear(); + key_buffer.extend_from_slice(word.as_bytes()); + key_buffer.push(0); // level 0 + + for position in positions { + key_buffer.truncate(word.len()); + let position_bytes = position.to_be_bytes(); + key_buffer.extend_from_slice(position_bytes.as_bytes()); + key_buffer.extend_from_slice(position_bytes.as_bytes()); + + data_buffer.clear(); + let positions = 
RoaringBitmap::from_iter(Some(document_id)); + // We serialize the positions into a buffer. + CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer) + .with_context(|| "could not serialize positions")?; + + // that we write under the generated key into MTBL + if lmdb_key_valid_size(&key_buffer) { + writer.insert(&key_buffer, &data_buffer)?; + } + } + } + + Ok(()) + } + fn write_facet_field_value_docids( sorter: &mut Sorter, iter: I, @@ -561,6 +610,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; + let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; + let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; @@ -570,6 +622,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; + let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; @@ -580,6 +633,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { word_docids, docid_word_positions, words_pairs_proximities_docids, + word_level_position_docids, facet_field_value_docids, 
field_id_docid_facet_values, documents, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index c2df94468..1fc4890fb 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,6 +6,7 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc pub use self::settings::{Setting, Settings}; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; +pub use self::words_level_positions::WordsLevelPositions; pub use self::words_prefixes::WordsPrefixes; mod available_documents_ids; @@ -16,5 +17,6 @@ mod index_documents; mod settings; mod update_builder; mod update_step; +mod words_level_positions; mod words_prefixes; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs new file mode 100644 index 000000000..983f82657 --- /dev/null +++ b/milli/src/update/words_level_positions.rs @@ -0,0 +1,184 @@ +use std::cmp; +use std::fs::File; +use std::num::NonZeroUsize; + +use grenad::{CompressionType, Reader, Writer, FileFuse}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesEncode, Error}; +use log::debug; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; +use crate::Index; +use crate::update::index_documents::WriteMethod; +use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; + +pub struct WordsLevelPositions<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + _update_id: u64, +} + +impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> WordsLevelPositions<'t, 'u, 'i> + { + WordsLevelPositions { 
+ wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + level_group_size: NonZeroUsize::new(4).unwrap(), + min_level_size: NonZeroUsize::new(5).unwrap(), + _update_id: update_id, + } + } + + pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { + self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); + self + } + + pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { + self.min_level_size = value; + self + } + + pub fn execute(self) -> anyhow::Result<()> { + debug!("Computing and writing the word levels positions docids into LMDB on disk..."); + + clear_non_zero_levels_positions(self.wtxn, self.index.word_level_position_docids)?; + + let entries = compute_positions_levels( + self.wtxn, + self.index.word_level_position_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + )?; + + write_into_lmdb_database( + self.wtxn, + *self.index.facet_field_id_value_docids.as_polymorph(), + entries, + |_, _| anyhow::bail!("invalid facet level merging"), + WriteMethod::GetMergePut, + )?; + + Ok(()) + } +} + +fn clear_non_zero_levels_positions( + wtxn: &mut heed::RwTxn, + db: heed::Database, +) -> heed::Result<()> +{ + let mut iter = db.iter_mut(wtxn)?.lazily_decode_data(); + while let Some(result) = iter.next() { + let ((_, level, _, _), _) = result?; + if level != 0 { + iter.del_current()?; + } + } + Ok(()) +} + +/// Generates all the words positions levels (including the level zero). +fn compute_positions_levels( + rtxn: &heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + shrink_size: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, +) -> anyhow::Result> +{ + // let first_level_size = db.prefix_iter(rtxn, &[field_id])? 
+ // .remap_types::() + // .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + // // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // // therefore we write the facet levels entries into a grenad file before transfering them. + // let mut writer = tempfile::tempfile().and_then(|file| { + // create_writer(compression_type, compression_level, file) + // })?; + + // let level_0_range = { + // let left = (field_id, 0, T::min_value(), T::min_value()); + // let right = (field_id, 0, T::max_value(), T::max_value()); + // left..=right + // }; + + // // Groups sizes are always a power of the original level_group_size and therefore a group + // // always maps groups of the previous level and never splits previous levels groups in half. + // let group_size_iter = (1u8..) + // .map(|l| (l, level_group_size.get().pow(l as u32))) + // .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + + // for (level, group_size) in group_size_iter { + // let mut left = T::zero(); + // let mut right = T::zero(); + // let mut group_docids = RoaringBitmap::new(); + + // let db = db.remap_key_type::(); + // for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + // let ((_field_id, _level, value, _right), docids) = result?; + + // if i == 0 { + // left = value; + // } else if i % group_size == 0 { + // // we found the first bound of the next group, we must store the left + // // and right bounds associated with the docids. + // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + + // // We save the left bound for the new group and also reset the docids. + // group_docids = RoaringBitmap::new(); + // left = value; + // } + + // // The right bound is always the bound we run through. 
+ // group_docids.union_with(&docids); + // right = value; + // } + + // if !group_docids.is_empty() { + // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + // } + // } + + // writer_into_reader(writer, shrink_size) + + todo!() +} + +fn write_entry( + writer: &mut Writer, + field_id: u8, + level: u8, + left: T, + right: T, + ids: &RoaringBitmap, +) -> anyhow::Result<()> +where + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, +{ + let key = (field_id, level, left, right); + let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) +} From c765f277a3328be0bae4e9ae173de2fa61f23962 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 14:34:21 +0100 Subject: [PATCH 06/45] Introduce the WordsLevelPositions update --- milli/src/update/words_level_positions.rs | 117 +++++++++++----------- 1 file changed, 61 insertions(+), 56 deletions(-) diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 983f82657..0a7bc484d 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -3,12 +3,11 @@ use std::fs::File; use std::num::NonZeroUsize; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::{ByteSlice, DecodeIgnore}; +use heed::types::DecodeIgnore; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; use crate::Index; use crate::update::index_documents::WriteMethod; @@ -69,12 +68,16 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.min_level_size, )?; + // The previously computed entries also defines the level 0 entries + // so we can clear the database and append all of these entries. 
+ self.index.word_level_position_docids.clear(self.wtxn)?; + write_into_lmdb_database( self.wtxn, *self.index.facet_field_id_value_docids.as_polymorph(), entries, |_, _| anyhow::bail!("invalid facet level merging"), - WriteMethod::GetMergePut, + WriteMethod::Append, )?; Ok(()) @@ -107,77 +110,79 @@ fn compute_positions_levels( min_level_size: NonZeroUsize, ) -> anyhow::Result> { - // let first_level_size = db.prefix_iter(rtxn, &[field_id])? - // .remap_types::() - // .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // therefore we write the facet levels entries into a grenad file before transfering them. + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(compression_type, compression_level, file) + })?; - // // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // // therefore we write the facet levels entries into a grenad file before transfering them. - // let mut writer = tempfile::tempfile().and_then(|file| { - // create_writer(compression_type, compression_level, file) - // })?; + for result in db.iter(rtxn)? { + let ((word, level, left, right), docids) = result?; - // let level_0_range = { - // let left = (field_id, 0, T::min_value(), T::min_value()); - // let right = (field_id, 0, T::max_value(), T::max_value()); - // left..=right - // }; + let first_level_size = db.remap_data_type::() + .prefix_iter(rtxn, &(word, level, u32::min_value(), u32::min_value()))? + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - // // Groups sizes are always a power of the original level_group_size and therefore a group - // // always maps groups of the previous level and never splits previous levels groups in half. - // let group_size_iter = (1u8..) 
- // .map(|l| (l, level_group_size.get().pow(l as u32))) - // .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + let level_0_range = { + let left = (word, 0, u32::min_value(), u32::min_value()); + let right = (word, 0, u32::max_value(), u32::max_value()); + left..=right + }; - // for (level, group_size) in group_size_iter { - // let mut left = T::zero(); - // let mut right = T::zero(); - // let mut group_docids = RoaringBitmap::new(); + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) + .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - // let db = db.remap_key_type::(); - // for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { - // let ((_field_id, _level, value, _right), docids) = result?; + // As specified in the documentation, we also write the level 0 entries. + write_level_entry(&mut writer, word, level, left, right, &docids)?; - // if i == 0 { - // left = value; - // } else if i % group_size == 0 { - // // we found the first bound of the next group, we must store the left - // // and right bounds associated with the docids. - // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + for (level, group_size) in group_size_iter { + let mut left = 0; + let mut right = 0; + let mut group_docids = RoaringBitmap::new(); - // // We save the left bound for the new group and also reset the docids. - // group_docids = RoaringBitmap::new(); - // left = value; - // } + for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + let ((_field_id, _level, value, _right), docids) = result?; - // // The right bound is always the bound we run through. 
- // group_docids.union_with(&docids); - // right = value; - // } + if i == 0 { + left = value; + } else if i % group_size == 0 { + // we found the first bound of the next group, we must store the left + // and right bounds associated with the docids. + write_level_entry(&mut writer, word, level, left, right, &group_docids)?; - // if !group_docids.is_empty() { - // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; - // } - // } + // We save the left bound for the new group and also reset the docids. + group_docids = RoaringBitmap::new(); + left = value; + } - // writer_into_reader(writer, shrink_size) + // The right bound is always the bound we run through. + group_docids.union_with(&docids); + right = value; + } - todo!() + if !group_docids.is_empty() { + write_level_entry(&mut writer, word, level, left, right, &group_docids)?; + } + } + } + + writer_into_reader(writer, shrink_size) } -fn write_entry( +fn write_level_entry( writer: &mut Writer, - field_id: u8, + word: &str, level: u8, - left: T, - right: T, + left: u32, + right: u32, ids: &RoaringBitmap, ) -> anyhow::Result<()> -where - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { - let key = (field_id, level, left, right); - let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?; + let key = (word, level, left, right); + let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) From 3a25137ee42d1f6d98db6f9e569baae40cb1949f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 13:55:24 +0100 Subject: [PATCH 07/45] Expose and use the WordsLevelPositions update --- milli/src/update/index_documents/mod.rs | 17 +++++++++++++++++ milli/src/update/update_builder.rs | 20 +++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 
8fc35b654..e7143bde0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -263,6 +263,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: Option, words_prefix_threshold: Option, max_prefix_length: Option, + words_positions_level_group_size: Option, + words_positions_min_level_size: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, @@ -290,6 +292,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: None, words_prefix_threshold: None, max_prefix_length: None, + words_positions_level_group_size: None, + words_positions_min_level_size: None, update_method: IndexDocumentsMethod::ReplaceDocuments, update_format: UpdateFormat::Json, autogenerate_docids: true, @@ -740,6 +744,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the words level positions update operation. + let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + if let Some(value) = self.words_positions_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.words_positions_min_level_size { + builder.min_level_size(value); + } + builder.execute()?; + debug_assert_eq!(database_count, total_databases); info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index c966f72d2..9a4fb850e 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -2,7 +2,10 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets, WordsPrefixes}; +use super::{ + 
ClearDocuments, DeleteDocuments, IndexDocuments, Settings, + Facets, WordsPrefixes, WordsLevelPositions, +}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -150,4 +153,19 @@ impl<'a> UpdateBuilder<'a> { builder } + + pub fn words_level_positions<'t, 'u, 'i>( + self, + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordsLevelPositions<'t, 'u, 'i> + { + let mut builder = WordsLevelPositions::new(wtxn, index, self.update_id); + + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + + builder + } } From e8cc7f9cee818ecc18fff3c65f7b7566fb75a836 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 14:32:00 +0100 Subject: [PATCH 08/45] Expose a route in the http-ui to update the WordsLevelPositions --- http-ui/src/main.rs | 46 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index b091985f3..dbf7aadce 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -229,6 +229,7 @@ enum UpdateMeta { Settings(Settings), Facets(Facets), WordsPrefixes(WordsPrefixes), + WordsLevelPositions(WordsLevelPositions), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -281,6 +282,22 @@ struct WordsPrefixes { max_prefix_length: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +struct WordsLevelPositions { + level_group_size: Option, + min_level_size: Option, +} + +// Any value that is present is considered Some value, including null. 
+fn deserialize_some<'de, T, D>(deserializer: D) -> Result, D::Error> +where T: Deserialize<'de>, + D: Deserializer<'de> +{ + Deserialize::deserialize(deserializer).map(Some) +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); @@ -493,6 +510,21 @@ async fn main() -> anyhow::Result<()> { Ok(()) => wtxn.commit().map_err(Into::into), Err(e) => Err(e) } + }, + UpdateMeta::WordsLevelPositions(levels) => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.words_level_positions(&mut wtxn, &index_cloned); + if let Some(value) = levels.level_group_size { + builder.level_group_size(value); + } + if let Some(value) = levels.min_level_size { + builder.min_level_size(value); + } + match builder.execute() { + Ok(()) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()) + } } }; @@ -923,6 +955,19 @@ async fn main() -> anyhow::Result<()> { warp::reply() }); + let update_store_cloned = update_store.clone(); + let update_status_sender_cloned = update_status_sender.clone(); + let change_words_level_positions_route = warp::filters::method::post() + .and(warp::path!("words-level-positions")) + .and(warp::body::json()) + .map(move |levels: WordsLevelPositions| { + let meta = UpdateMeta::WordsLevelPositions(levels); + let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); + let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); + eprintln!("update {} registered", update_id); + warp::reply() + }); + let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); let abort_update_id_route = warp::filters::method::delete() @@ -998,6 +1043,7 @@ async fn main() -> anyhow::Result<()> { .or(change_settings_route) .or(change_facet_levels_route) .or(change_words_prefixes_route) + .or(change_words_level_positions_route) .or(update_ws_route); let addr = 
SocketAddr::from_str(&opt.http_listen_addr)?; From 6b1b42b928685f468507f0c2fccd8ff6a2925e99 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 14:22:01 +0100 Subject: [PATCH 09/45] Introduce an infos wordsLevelPositionsDocids subcommand --- infos/src/main.rs | 61 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 356a5417c..e4d59c641 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -19,9 +19,10 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids"; const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; -const FACET_FIELD_ID_VALUE_DOCIDS_NAME: &str = "facet-field-id-value-docids"; -const FIELD_ID_DOCID_FACET_VALUES_NAME: &str = "field-id-docid-facet-values"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; +const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; +const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; +const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; const DOCUMENTS_DB_NAME: &str = "documents"; const ALL_DATABASE_NAMES: &[&str] = &[ @@ -31,8 +32,9 @@ const ALL_DATABASE_NAMES: &[&str] = &[ DOCID_WORD_POSITIONS_DB_NAME, WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, - FACET_FIELD_ID_VALUE_DOCIDS_NAME, - FIELD_ID_DOCID_FACET_VALUES_NAME, + WORD_LEVEL_POSITION_DOCIDS_DB_NAME, + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, + FIELD_ID_DOCID_FACET_VALUES_DB_NAME, DOCUMENTS_DB_NAME, ]; @@ -114,6 +116,16 @@ enum Command { field_name: String, }, + /// Outputs a CSV with the documents ids along with the word level positions where it appears. + WordsLevelPositionsDocids { + /// Display the whole documents ids in details. 
+ #[structopt(long)] + full_display: bool, + + /// The field name in the document. + words: Vec, + }, + /// Outputs a CSV with the documents ids, words and the positions where this word appears. DocidsWordsPositions { /// Display the whole positions in detail. @@ -221,6 +233,9 @@ fn main() -> anyhow::Result<()> { FacetValuesDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, field_name) }, + WordsLevelPositionsDocids { full_display, words } => { + words_level_positions_docids(&index, &rtxn, !full_display, words) + }, DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, @@ -525,6 +540,40 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam Ok(wtr.flush()?) } +fn words_level_positions_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + words: Vec, +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["word", "level", "position_range", "documents_count", "documents_ids"])?; + + for word in words.iter().map(AsRef::as_ref) { + let range = { + let left = (word, 0, u32::min_value(), u32::min_value()); + let right = (word, u8::max_value(), u32::max_value(), u32::max_value()); + left..=right + }; + for result in index.word_level_position_docids.range(rtxn, &range)? { + let ((word, level, left, right), docids) = result?; + let level = level.to_string(); + let count = docids.len().to_string(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + let position_range = format!("{:?}", left..=right); + wtr.write_record(&[word, &level, &position_range, &count, &docids])?; + } + } + + Ok(wtr.flush()?) 
+} + fn docids_words_positions( index: &Index, rtxn: &heed::RoTxn, @@ -730,8 +779,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), - FACET_FIELD_ID_VALUE_DOCIDS_NAME => index.facet_field_id_value_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_VALUES_NAME => index.field_id_docid_facet_values.as_polymorph(), + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => index.facet_field_id_value_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_VALUES_DB_NAME => index.field_id_docid_facet_values.as_polymorph(), DOCUMENTS_DB_NAME => index.documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; From 3069bf4f4a3ad50a89a1573b49dec92c61107678 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 15:40:38 +0100 Subject: [PATCH 10/45] Fix and improve the words-level-positions computation --- infos/src/main.rs | 6 ++-- milli/src/update/index_documents/store.rs | 2 +- milli/src/update/words_level_positions.rs | 42 ++++++++--------------- 3 files changed, 20 insertions(+), 30 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index e4d59c641..c219c5758 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -558,7 +558,9 @@ fn words_level_positions_docids( left..=right }; for result in index.word_level_position_docids.range(rtxn, &range)? 
{ - let ((word, level, left, right), docids) = result?; + let ((w, level, left, right), docids) = result?; + if word != w { break } + let level = level.to_string(); let count = docids.len().to_string(); let docids = if debug { @@ -567,7 +569,7 @@ fn words_level_positions_docids( format!("{:?}", docids.iter().collect::>()) }; let position_range = format!("{:?}", left..=right); - wtr.write_record(&[word, &level, &position_range, &count, &docids])?; + wtr.write_record(&[w, &level, &position_range, &count, &docids])?; } } diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 358552768..0f97476d9 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -388,7 +388,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { key_buffer.push(0); // level 0 for position in positions { - key_buffer.truncate(word.len()); + key_buffer.truncate(word.len() + 1); let position_bytes = position.to_be_bytes(); key_buffer.extend_from_slice(position_bytes.as_bytes()); key_buffer.extend_from_slice(position_bytes.as_bytes()); diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 0a7bc484d..77cec246a 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::num::NonZeroUsize; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::DecodeIgnore; +use heed::types::{DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; @@ -56,10 +56,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { pub fn execute(self) -> anyhow::Result<()> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); - clear_non_zero_levels_positions(self.wtxn, self.index.word_level_position_docids)?; - let entries = compute_positions_levels( self.wtxn, + self.index.word_docids.remap_data_type::(), 
self.index.word_level_position_docids, self.chunk_compression_type, self.chunk_compression_level, @@ -74,7 +73,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { write_into_lmdb_database( self.wtxn, - *self.index.facet_field_id_value_docids.as_polymorph(), + *self.index.word_level_position_docids.as_polymorph(), entries, |_, _| anyhow::bail!("invalid facet level merging"), WriteMethod::Append, @@ -84,25 +83,11 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { } } -fn clear_non_zero_levels_positions( - wtxn: &mut heed::RwTxn, - db: heed::Database, -) -> heed::Result<()> -{ - let mut iter = db.iter_mut(wtxn)?.lazily_decode_data(); - while let Some(result) = iter.next() { - let ((_, level, _, _), _) = result?; - if level != 0 { - iter.del_current()?; - } - } - Ok(()) -} - -/// Generates all the words positions levels (including the level zero). +/// Generates all the words positions levels based on the levels zero (including the level zero). fn compute_positions_levels( rtxn: &heed::RoTxn, - db: heed::Database, + words_db: heed::Database, + words_positions_db: heed::Database, compression_type: CompressionType, compression_level: Option, shrink_size: Option, @@ -116,11 +101,11 @@ fn compute_positions_levels( create_writer(compression_type, compression_level, file) })?; - for result in db.iter(rtxn)? { - let ((word, level, left, right), docids) = result?; + for result in words_db.iter(rtxn)? { + let (word, ()) = result?; - let first_level_size = db.remap_data_type::() - .prefix_iter(rtxn, &(word, level, u32::min_value(), u32::min_value()))? + let first_level_size = words_positions_db.remap_data_type::() + .prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))? .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; let level_0_range = { @@ -136,14 +121,17 @@ fn compute_positions_levels( .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); // As specified in the documentation, we also write the level 0 entries. 
- write_level_entry(&mut writer, word, level, left, right, &docids)?; + for result in words_positions_db.range(rtxn, &level_0_range)? { + let ((word, level, left, right), docids) = result?; + write_level_entry(&mut writer, word, level, left, right, &docids)?; + } for (level, group_size) in group_size_iter { let mut left = 0; let mut right = 0; let mut group_docids = RoaringBitmap::new(); - for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { let ((_field_id, _level, value, _right), docids) = result?; if i == 0 { From f7138284066887cf3bc610b35b48c8f2393bb448 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 15:47:41 +0100 Subject: [PATCH 11/45] Implement the clear and delete documents for the word-level-positions database --- milli/src/update/clear_documents.rs | 1 + milli/src/update/delete_documents.rs | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 250e4b13a..6d7dd72b8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -56,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; + word_level_position_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; field_id_docid_facet_values.clear(self.wtxn)?; documents.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index b60b7bac2..f9303d339 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -330,6 +330,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // We delete the documents ids that are under the word level position docids. 
+ let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; + } + } + + drop(iter); + Ok(self.documents_ids.len()) } } From 8bd4f5d93ec5e212197a8e662a37431bfdf0c865 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 16:09:18 +0100 Subject: [PATCH 12/45] Compute the biggest values of the words_level_positions_docids --- infos/src/main.rs | 18 +++++++++++++++--- milli/src/update/words_level_positions.rs | 10 +++++----- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index c219c5758..2c11d3783 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -346,6 +346,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let docid_word_positions_name = "docid_word_positions"; let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; + let word_level_position_docids_name = "word_level_position_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let documents_name = "documents"; @@ -402,6 +403,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in word_level_position_docids.remap_data_type::().iter(rtxn)? 
{ + let ((word, level, left, right), value) = result?; + let key = format!("{} {} {:?}", word, level, left..=right); + heap.push(Reverse((value.len(), key, word_level_position_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { @@ -549,7 +557,7 @@ fn words_level_positions_docids( { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["word", "level", "position_range", "documents_count", "documents_ids"])?; + wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; for word in words.iter().map(AsRef::as_ref) { let range = { @@ -561,14 +569,18 @@ fn words_level_positions_docids( let ((w, level, left, right), docids) = result?; if word != w { break } - let level = level.to_string(); let count = docids.len().to_string(); let docids = if debug { format!("{:?}", docids) } else { format!("{:?}", docids.iter().collect::>()) }; - let position_range = format!("{:?}", left..=right); + let position_range = if level == 0 { + format!("{:?}", left) + } else { + format!("{:?}", left..=right) + }; + let level = level.to_string(); wtr.write_record(&[w, &level, &position_range, &count, &docids])?; } } diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 77cec246a..a7be248b6 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -104,16 +104,16 @@ fn compute_positions_levels( for result in words_db.iter(rtxn)? { let (word, ()) = result?; - let first_level_size = words_positions_db.remap_data_type::() - .prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))? 
- .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_range = { let left = (word, 0, u32::min_value(), u32::min_value()); let right = (word, 0, u32::max_value(), u32::max_value()); left..=right }; + let first_level_size = words_positions_db.remap_data_type::() + .range(rtxn, &level_0_range)? + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. let group_size_iter = (1u8..) @@ -132,7 +132,7 @@ fn compute_positions_levels( let mut group_docids = RoaringBitmap::new(); for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { - let ((_field_id, _level, value, _right), docids) = result?; + let ((_word, _level, value, _right), docids) = result?; if i == 0 { left = value; From bd1a371c62cf7d1fb79b29c1b5ccde30d63aa0ca Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Mar 2021 15:41:44 +0100 Subject: [PATCH 13/45] Compute the WordsLevelPositions only once --- milli/src/update/index_documents/mod.rs | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e7143bde0..3a41a52ae 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -718,19 +718,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; - // Run the words positions update operation. 
- let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - if let Some(value) = self.facet_level_group_size { - builder.level_group_size(value); - } - if let Some(value) = self.facet_min_level_size { - builder.min_level_size(value); - } - builder.execute()?; - // Run the words prefixes update operation. let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; From 89ee2cf576858398ee160a0ed54d6494aedcecfc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Mar 2021 17:20:16 +0100 Subject: [PATCH 14/45] Introduce the TreeLevel struct --- infos/src/main.rs | 9 ++-- .../heed_codec/str_level_position_codec.rs | 13 +++-- milli/src/lib.rs | 2 + milli/src/tree_level.rs | 47 +++++++++++++++++++ milli/src/update/words_level_positions.rs | 11 +++-- 5 files changed, 67 insertions(+), 15 deletions(-) create mode 100644 milli/src/tree_level.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index 2c11d3783..0e6403d7b 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -5,7 +5,7 @@ use std::{str, io, fmt}; use anyhow::Context; use byte_unit::Byte; use heed::EnvOpenOptions; -use milli::Index; +use milli::{Index, TreeLevel}; use structopt::StructOpt; use Command::*; @@ -561,13 +561,12 @@ fn words_level_positions_docids( for word in words.iter().map(AsRef::as_ref) { let range = { - let left = (word, 0, u32::min_value(), u32::min_value()); - let right = (word, u8::max_value(), u32::max_value(), u32::max_value()); + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); left..=right }; for result in index.word_level_position_docids.range(rtxn, &range)? 
{ let ((w, level, left, right), docids) = result?; - if word != w { break } let count = docids.len().to_string(); let docids = if debug { @@ -575,7 +574,7 @@ fn words_level_positions_docids( } else { format!("{:?}", docids.iter().collect::>()) }; - let position_range = if level == 0 { + let position_range = if level == TreeLevel::min_value() { format!("{:?}", left) } else { format!("{:?}", left..=right) diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs index c421c04b5..810e91940 100644 --- a/milli/src/heed_codec/str_level_position_codec.rs +++ b/milli/src/heed_codec/str_level_position_codec.rs @@ -1,12 +1,14 @@ use std::borrow::Cow; -use std::convert::TryInto; +use std::convert::{TryFrom, TryInto}; use std::mem::size_of; use std::str; +use crate::TreeLevel; + pub struct StrLevelPositionCodec; impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { - type DItem = (&'a str, u8, u32, u32); + type DItem = (&'a str, TreeLevel, u32, u32); fn bytes_decode(bytes: &'a [u8]) -> Option { let footer_len = size_of::() + size_of::() * 2; @@ -19,13 +21,14 @@ impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { let (level, bytes) = bytes.split_first()?; let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?; let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?; + let level = TreeLevel::try_from(*level).ok()?; - Some((word, *level, left, right)) + Some((word, level, left, right)) } } impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { - type EItem = (&'a str, u8, u32, u32); + type EItem = (&'a str, TreeLevel, u32, u32); fn bytes_encode((word, level, left, right): &Self::EItem) -> Option> { let left = left.to_be_bytes(); @@ -33,7 +36,7 @@ impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len()); bytes.extend_from_slice(word.as_bytes()); - bytes.push(*level); + bytes.push((*level).into()); 
bytes.extend_from_slice(&left[..]); bytes.extend_from_slice(&right[..]); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index de5c6511e..03169bce7 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -9,6 +9,7 @@ pub mod facet; pub mod heed_codec; pub mod index; pub mod proximity; +pub mod tree_level; pub mod update; use std::borrow::Cow; @@ -27,6 +28,7 @@ pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringB pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords}; +pub use self::tree_level::TreeLevel; pub use self::update_store::UpdateStore; pub type FastMap4 = HashMap>; diff --git a/milli/src/tree_level.rs b/milli/src/tree_level.rs new file mode 100644 index 000000000..7ce2904e2 --- /dev/null +++ b/milli/src/tree_level.rs @@ -0,0 +1,47 @@ +use std::convert::TryFrom; +use std::fmt; + +/// This is just before the lowest printable character (space, sp, 32) +const MAX_VALUE: u8 = 31; + +#[derive(Debug, Copy, Clone)] +pub enum Error { + LevelTooHigh(u8), +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(transparent)] +pub struct TreeLevel(u8); + +impl TreeLevel { + pub const fn max_value() -> TreeLevel { + TreeLevel(MAX_VALUE) + } + + pub const fn min_value() -> TreeLevel { + TreeLevel(0) + } +} + +impl Into for TreeLevel { + fn into(self) -> u8 { + self.0 + } +} + +impl TryFrom for TreeLevel { + type Error = Error; + + fn try_from(value: u8) -> Result { + match value { + 0..=MAX_VALUE => Ok(TreeLevel(value)), + _ => Err(Error::LevelTooHigh(value)), + } + } +} + +impl fmt::Display for TreeLevel { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index a7be248b6..4286fc780 100644 --- 
a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,4 +1,5 @@ use std::cmp; +use std::convert::TryFrom; use std::fs::File; use std::num::NonZeroUsize; @@ -9,9 +10,9 @@ use log::debug; use roaring::RoaringBitmap; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; -use crate::Index; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::{Index, TreeLevel}; pub struct WordsLevelPositions<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -105,8 +106,8 @@ fn compute_positions_levels( let (word, ()) = result?; let level_0_range = { - let left = (word, 0, u32::min_value(), u32::min_value()); - let right = (word, 0, u32::max_value(), u32::max_value()); + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); left..=right }; @@ -117,7 +118,7 @@ fn compute_positions_levels( // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) + .map(|l| (TreeLevel::try_from(l).unwrap(), level_group_size.get().pow(l as u32))) .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); // As specified in the documentation, we also write the level 0 entries. 
@@ -163,7 +164,7 @@ fn compute_positions_levels( fn write_level_entry( writer: &mut Writer, word: &str, - level: u8, + level: TreeLevel, left: u32, right: u32, ids: &RoaringBitmap, From 658f316511faf6f87d4d7733236887e80d5eef79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 23 Mar 2021 15:25:46 +0100 Subject: [PATCH 15/45] Introduce the Initial Criterion --- milli/Cargo.toml | 3 - milli/src/search/criteria/asc_desc.rs | 161 +++++++------------------ milli/src/search/criteria/attribute.rs | 88 ++++++-------- milli/src/search/criteria/fetcher.rs | 135 --------------------- milli/src/search/criteria/final.rs | 57 +++++++++ milli/src/search/criteria/initial.rs | 28 +++++ milli/src/search/criteria/mod.rs | 65 ++++------ milli/src/search/criteria/proximity.rs | 151 ++++++++--------------- milli/src/search/criteria/typo.rs | 66 ++++------ milli/src/search/criteria/words.rs | 56 +++------ milli/src/search/mod.rs | 9 +- 11 files changed, 286 insertions(+), 533 deletions(-) delete mode 100644 milli/src/search/criteria/fetcher.rs create mode 100644 milli/src/search/criteria/final.rs create mode 100644 milli/src/search/criteria/initial.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index eefdfa7d5..ef9c64b7b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -57,9 +57,6 @@ criterion = "0.3.4" maplit = "1.0.2" rand = "0.8.3" -[build-dependencies] -fst = "0.4.5" - [features] default = [] diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 1dc186720..d2841d449 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -31,32 +31,10 @@ pub struct AscDesc<'t> { candidates: Box> + 't>, bucket_candidates: RoaringBitmap, faceted_candidates: RoaringBitmap, - parent: Option>, + parent: Box, } impl<'t> AscDesc<'t> { - pub fn initial_asc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ) -> 
anyhow::Result - { - Self::initial(index, rtxn, query_tree, candidates, field_name, true) - } - - pub fn initial_desc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ) -> anyhow::Result - { - Self::initial(index, rtxn, query_tree, candidates, field_name, false) - } - pub fn asc( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -77,47 +55,6 @@ impl<'t> AscDesc<'t> { Self::new(index, rtxn, parent, field_name, false) } - fn initial( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ascending: bool, - ) -> anyhow::Result - { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; - let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; - - let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; - let candidates = match &query_tree { - Some(qt) => { - let context = CriteriaBuilder::new(rtxn, index)?; - let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), &mut WordDerivationsCache::new())?; - if let Some(candidates) = candidates { - qt_candidates.intersect_with(&candidates); - } - qt_candidates - }, - None => candidates.unwrap_or(faceted_candidates.clone()), - }; - - Ok(AscDesc { - index, - rtxn, - field_name, - field_id, - facet_type, - ascending, - query_tree, - candidates: facet_ordered(index, rtxn, field_id, facet_type, ascending, candidates)?, - faceted_candidates, - bucket_candidates: RoaringBitmap::new(), - parent: None, - }) - } - fn new( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -141,7 +78,7 @@ impl<'t> AscDesc<'t> { candidates: Box::new(std::iter::empty()), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, }) } } @@ -156,64 +93,56 @@ impl<'t> Criterion for AscDesc<'t> { match self.candidates.next().transpose()? 
{ None => { - let query_tree = self.query_tree.take(); - let bucket_candidates = take(&mut self.bucket_candidates); - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; - let candidates = match (&self.query_tree, candidates) { - (_, Some(mut candidates)) => { - candidates.intersect_with(&self.faceted_candidates); - candidates - }, - (Some(qt), None) => { - let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; - candidates.intersect_with(&self.faceted_candidates); - candidates - }, - (None, None) => take(&mut self.faceted_candidates), - }; - if bucket_candidates.is_empty() { - self.bucket_candidates.union_with(&candidates); - } else { - self.bucket_candidates.union_with(&bucket_candidates); - } - self.candidates = facet_ordered( - self.index, - self.rtxn, - self.field_id, - self.facet_type, - self.ascending, - candidates, - )?; + match self.parent.next(wdcache)? 
{ + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates_is_some = candidates.is_some(); + self.query_tree = query_tree; + let candidates = match (&self.query_tree, candidates) { + (_, Some(mut candidates)) => { + candidates.intersect_with(&self.faceted_candidates); + candidates }, - None => return Ok(None), - } - }, - None => if query_tree.is_none() && bucket_candidates.is_empty() { - return Ok(None) - }, - } + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; + candidates.intersect_with(&self.faceted_candidates); + candidates + }, + (None, None) => take(&mut self.faceted_candidates), + }; - return Ok(Some(CriterionResult { - query_tree, - candidates: Some(RoaringBitmap::new()), - bucket_candidates, - })); + // If our parent returns candidates it means that the bucket + // candidates were already computed before and we can use them. + // + // If not, we must use the just computed candidates as our bucket + // candidates. 
+ if candidates_is_some { + self.bucket_candidates.union_with(&bucket_candidates); + } else { + self.bucket_candidates.union_with(&candidates); + } + + if candidates.is_empty() { + continue; + } + + self.candidates = facet_ordered( + self.index, + self.rtxn, + self.field_id, + self.facet_type, + self.ascending, + candidates, + )?; + }, + None => return Ok(None), + } }, Some(candidates) => { - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 7f8b5c622..6398c7d87 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -14,36 +14,19 @@ pub struct Attribute<'t> { query_tree: Option, candidates: Option, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, flattened_query_tree: Option>>, current_buckets: Option>, } impl<'t> Attribute<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Attribute { - ctx, - query_tree, - candidates, - bucket_candidates: RoaringBitmap::new(), - parent: None, - flattened_query_tree: None, - current_buckets: None, - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Attribute { ctx, query_tree: None, candidates: None, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, flattened_query_tree: None, current_buckets: None, } @@ -63,34 +46,35 @@ impl<'t> Criterion for Attribute<'t> { })); }, (Some(qt), Some(candidates)) => { - let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| flatten_query_tree(&qt)); - let current_buckets = if let Some(current_buckets) = self.current_buckets.as_mut() { - current_buckets - } else 
{ - let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; - self.current_buckets.get_or_insert(new_buckets.into_iter()) + let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| { + flatten_query_tree(&qt) + }); + + let current_buckets = match self.current_buckets.as_mut() { + Some(current_buckets) => current_buckets, + None => { + let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + self.current_buckets.get_or_insert(new_buckets.into_iter()) + }, }; - let found_candidates = if let Some((_score, candidates)) = current_buckets.next() { - candidates - } else { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); + let found_candidates = match current_buckets.next() { + Some((_score, candidates)) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, }; + candidates.difference_with(&found_candidates); - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => found_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(found_candidates), - bucket_candidates: bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (Some(qt), None) => { @@ -106,18 +90,20 @@ impl<'t> Criterion for Attribute<'t> { })); }, (None, None) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? 
{ - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; - self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); - self.flattened_query_tree = None; - self.current_buckets = None; - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree; + self.candidates = candidates; + self.bucket_candidates.union_with(&bucket_candidates); + self.flattened_query_tree = None; + self.current_buckets = None; }, None => return Ok(None), } diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs deleted file mode 100644 index fa204bdf2..000000000 --- a/milli/src/search/criteria/fetcher.rs +++ /dev/null @@ -1,135 +0,0 @@ -use std::collections::HashMap; -use std::mem::take; - -use log::debug; -use roaring::RoaringBitmap; - -use crate::search::query_tree::Operation; -use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; - -/// The result of a call to the fetcher. -#[derive(Debug, Clone, PartialEq)] -pub struct FetcherResult { - /// The query tree corresponding to the current bucket of the last criterion. - pub query_tree: Option, - /// The candidates of the current bucket of the last criterion. - pub candidates: RoaringBitmap, - /// Candidates that comes from the current bucket of the initial criterion. 
- pub bucket_candidates: RoaringBitmap, -} - -pub struct Fetcher<'t> { - ctx: &'t dyn Context, - query_tree: Option, - candidates: Candidates, - parent: Option>, - should_get_documents_ids: bool, - wdcache: WordDerivationsCache, -} - -impl<'t> Fetcher<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Fetcher { - ctx, - query_tree, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - parent: None, - should_get_documents_ids: true, - wdcache: WordDerivationsCache::new(), - } - } - - pub fn new( - ctx: &'t dyn Context, - parent: Box, - ) -> Self - { - Fetcher { - ctx, - query_tree: None, - candidates: Candidates::default(), - parent: Some(parent), - should_get_documents_ids: true, - wdcache: WordDerivationsCache::new(), - } - } - - #[logging_timer::time("Fetcher::{}")] - pub fn next(&mut self) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; - loop { - debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", - self.should_get_documents_ids, self.candidates, - ); - - let should_get_documents_ids = take(&mut self.should_get_documents_ids); - match &mut self.candidates { - Allowed(_) => { - let candidates = take(&mut self.candidates).into_inner(); - let candidates = match &self.query_tree { - Some(qt) if should_get_documents_ids => { - let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?; - docids.intersect_with(&candidates); - docids - }, - _ => candidates, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.take(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - Forbidden(_) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(&mut self.wdcache)? 
{ - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, - (None, None) => RoaringBitmap::new(), - }; - - return Ok(Some(FetcherResult { query_tree, candidates, bucket_candidates })) - }, - None => if should_get_documents_ids { - let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.clone(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - } - }, - None => if should_get_documents_ids { - let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.clone(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - } - return Ok(None); - }, - } - } - } -} diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs new file mode 100644 index 000000000..fe224ef94 --- /dev/null +++ b/milli/src/search/criteria/final.rs @@ -0,0 +1,57 @@ +use std::collections::HashMap; + +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; +use super::{resolve_query_tree, Criterion, CriterionResult, Context}; + +/// The result of a call to the fetcher. +#[derive(Debug, Clone, PartialEq)] +pub struct FinalResult { + /// The query tree corresponding to the current bucket of the last criterion. + pub query_tree: Option, + /// The candidates of the current bucket of the last criterion. 
+ pub candidates: RoaringBitmap, + /// Candidates that comes from the current bucket of the initial criterion. + pub bucket_candidates: RoaringBitmap, +} + +pub struct Final<'t> { + ctx: &'t dyn Context, + parent: Box, + wdcache: WordDerivationsCache, +} + +impl<'t> Final<'t> { + pub fn new(ctx: &'t dyn Context, parent: Box) -> Final<'t> { + Final { ctx, parent, wdcache: WordDerivationsCache::new() } + } + + #[logging_timer::time("Final::{}")] + pub fn next(&mut self) -> anyhow::Result> { + loop { + debug!("Final iteration"); + + match self.parent.next(&mut self.wdcache)? { + Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => { + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, + (None, None) => self.ctx.documents_ids()?, + }; + + bucket_candidates.union_with(&candidates); + + return Ok(Some(FinalResult { + query_tree, + candidates, + bucket_candidates, + })); + }, + None => return Ok(None), + } + } + } +} diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs new file mode 100644 index 000000000..d4b9e1379 --- /dev/null +++ b/milli/src/search/criteria/initial.rs @@ -0,0 +1,28 @@ +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; + +use super::{Criterion, CriterionResult}; + +pub struct Initial { + answer: Option +} + +impl Initial { + pub fn new(query_tree: Option, mut candidates: Option) -> Initial { + let answer = CriterionResult { + query_tree, + candidates: candidates.clone(), + bucket_candidates: candidates.take().unwrap_or_default(), + }; + Initial { answer: Some(answer) } + } +} + +impl Criterion for Initial { + #[logging_timer::time("Initial::{}")] + fn next(&mut self, _: &mut WordDerivationsCache) -> anyhow::Result> { + Ok(self.answer.take()) + } +} diff --git a/milli/src/search/criteria/mod.rs 
b/milli/src/search/criteria/mod.rs index 1d7026d71..5e75be6ce 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -8,19 +8,21 @@ use crate::search::{word_derivations, WordDerivationsCache}; use crate::{Index, DocumentId}; use super::query_tree::{Operation, Query, QueryKind}; +use self::asc_desc::AscDesc; +use self::attribute::Attribute; +use self::r#final::Final; +use self::initial::Initial; +use self::proximity::Proximity; use self::typo::Typo; use self::words::Words; -use self::asc_desc::AscDesc; -use self::proximity::Proximity; -use self::attribute::Attribute; -use self::fetcher::Fetcher; +mod asc_desc; +mod attribute; +mod initial; +mod proximity; mod typo; mod words; -mod asc_desc; -mod proximity; -mod attribute; -pub mod fetcher; +pub mod r#final; pub trait Criterion { fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result>; @@ -61,6 +63,7 @@ impl Default for Candidates { Self::Forbidden(RoaringBitmap::new()) } } + pub trait Context { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; @@ -128,44 +131,26 @@ impl<'t> CriteriaBuilder<'t> { pub fn build( &'t self, - mut query_tree: Option, - mut facet_candidates: Option, - ) -> anyhow::Result> + query_tree: Option, + facet_candidates: Option, + ) -> anyhow::Result> { use crate::criterion::Criterion as Name; - let mut criterion = None as Option>; + let mut criterion = Box::new(Initial::new(query_tree, facet_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? 
{ - criterion = Some(match criterion.take() { - Some(father) => match name { - Name::Typo => Box::new(Typo::new(self, father)), - Name::Words => Box::new(Words::new(self, father)), - Name::Proximity => Box::new(Proximity::new(self, father)), - Name::Attribute => Box::new(Attribute::new(self, father)), - Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?), - Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?), - _otherwise => father, - }, - None => match name { - Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), - Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), - Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), - Name::Attribute => Box::new(Attribute::initial(self, query_tree.take(), facet_candidates.take())), - Name::Asc(field) => { - Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) - }, - Name::Desc(field) => { - Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) 
- }, - _otherwise => continue, - }, - }); + criterion = match name { + Name::Typo => Box::new(Typo::new(self, criterion)), + Name::Words => Box::new(Words::new(self, criterion)), + Name::Proximity => Box::new(Proximity::new(self, criterion)), + Name::Attribute => Box::new(Attribute::new(self, criterion)), + Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), + Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), + _otherwise => criterion, + }; } - match criterion { - Some(criterion) => Ok(Fetcher::new(self, criterion)), - None => Ok(Fetcher::initial(self, query_tree, facet_candidates)), - } + Ok(Final::new(self, criterion)) } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index decd4c338..dc1daafb2 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -8,48 +8,29 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; +use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; pub struct Proximity<'t> { ctx: &'t dyn Context, - query_tree: Option<(usize, Operation)>, + /// ((max_proximity, query_tree), allowed_candidates) + state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, proximity: u8, - candidates: Candidates, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: Cache, plane_sweep_cache: Option>, } impl<'t> Proximity<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Proximity { - ctx, - 
query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), - proximity: 0, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: Cache::new(), - plane_sweep_cache: None, - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Proximity { ctx, - query_tree: None, + state: None, proximity: 0, - candidates: Candidates::default(), bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent: parent, candidates_cache: Cache::new(), plane_sweep_cache: None, } @@ -59,27 +40,20 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; loop { - debug!("Proximity at iteration {} (max {:?}) ({:?})", + debug!("Proximity at iteration {} (max prox {:?}) ({:?})", self.proximity, - self.query_tree.as_ref().map(|(mp, _)| mp), - self.candidates, + self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)), + self.state.as_ref().map(|(_, cd)| cd), ); - match (&mut self.query_tree, &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: Some(take(&mut self.candidates).into_inner()), - bucket_candidates: take(&mut self.bucket_candidates), - })); + match &mut self.state { + Some((_, candidates)) if candidates.is_empty() => { + self.state = None; // reset state }, - (Some((max_prox, query_tree)), Allowed(candidates)) => { + Some((Some((max_prox, query_tree)), candidates)) => { if self.proximity as usize > *max_prox { - // reset state to (None, Forbidden(_)) - self.query_tree = None; - self.candidates = Candidates::default(); + self.state = None; // reset state } else { let mut new_candidates = if candidates.len() <= 1000 { if let Some(cache) = 
self.plane_sweep_cache.as_mut() { @@ -89,9 +63,7 @@ impl<'t> Criterion for Proximity<'t> { candidates }, None => { - // reset state to (None, Forbidden(_)) - self.query_tree = None; - self.candidates = Candidates::default(); + self.state = None; // reset state continue }, } @@ -120,79 +92,54 @@ impl<'t> Criterion for Proximity<'t> { candidates.difference_with(&new_candidates); self.proximity += 1; - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), candidates: Some(new_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); } }, - (Some((max_prox, query_tree)), Forbidden(candidates)) => { - if self.proximity as usize > *max_prox { - self.query_tree = None; - self.candidates = Candidates::default(); - } else { - let mut new_candidates = resolve_candidates( - self.ctx, - &query_tree, - self.proximity, - &mut self.candidates_cache, - wdcache, - )?; - - new_candidates.difference_with(&candidates); - candidates.union_with(&new_candidates); - self.proximity += 1; - - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - - return Ok(Some(CriterionResult { - query_tree: Some(query_tree.clone()), - candidates: Some(new_candidates), - bucket_candidates, - })); - } - }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); + Some((None, candidates)) => { + let candidates = take(candidates); + self.state = None; // reset state return Ok(Some(CriterionResult { query_tree: None, candidates: Some(candidates.clone()), bucket_candidates: candidates, })); }, - (None, Forbidden(_)) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? 
{ - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, - (None, None) => RoaringBitmap::new(), - }; + None => { + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates_is_some = candidates.is_some(); + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, + (None, None) => RoaringBitmap::new(), + }; - if bucket_candidates.is_empty() { - self.bucket_candidates.union_with(&candidates); - } else { - self.bucket_candidates.union_with(&bucket_candidates); - } - - self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); - self.proximity = 0; - self.candidates = Candidates::Allowed(candidates); - self.plane_sweep_cache = None; - }, - None => return Ok(None), + // If our parent returns candidates it means that the bucket + // candidates were already computed before and we can use them. + // + // If not, we must use the just computed candidates as our bucket + // candidates. 
+ if candidates_is_some { + self.bucket_candidates.union_with(&bucket_candidates); + } else { + self.bucket_candidates.union_with(&candidates); } + + let query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); + self.state = Some((query_tree, candidates)); + self.proximity = 0; + self.plane_sweep_cache = None; }, None => return Ok(None), } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 3877f53ed..40b06afc4 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -14,28 +14,11 @@ pub struct Typo<'t> { number_typos: u8, candidates: Candidates, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } impl<'t> Typo<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Typo { - ctx, - query_tree: query_tree.map(|op| (maximum_typo(&op), op)), - number_typos: 0, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: HashMap::new(), - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Typo { ctx, @@ -43,7 +26,7 @@ impl<'t> Typo<'t> { number_typos: 0, candidates: Candidates::default(), bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, candidates_cache: HashMap::new(), } } @@ -90,15 +73,10 @@ impl<'t> Criterion for Typo<'t> { candidates.difference_with(&new_candidates); self.number_typos += 1; - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: Some(new_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); } }, @@ -145,17 +123,19 @@ impl<'t> Criterion for Typo<'t> { })); }, (None, Forbidden(_)) => { - match self.parent.as_mut() { 
- Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); - self.number_typos = 0; - self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); - self.bucket_candidates.union_with(&bucket_candidates); - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); + self.number_typos = 0; + self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } @@ -334,8 +314,8 @@ fn resolve_candidates<'t>( #[cfg(test)] mod test { - use super::*; + use super::super::initial::Initial; use super::super::test::TestContext; #[test] @@ -345,7 +325,8 @@ mod test { let facet_candidates = None; let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, query_tree, facet_candidates); + let parent = Initial::new(query_tree, facet_candidates); + let mut criteria = Typo::new(&context, Box::new(parent)); assert!(criteria.next(&mut wdcache).unwrap().is_none()); } @@ -364,7 +345,8 @@ mod test { let facet_candidates = None; let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates); + let parent = Initial::new(Some(query_tree), facet_candidates); + let mut criteria = Typo::new(&context, Box::new(parent)); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() @@ -413,7 +395,8 @@ mod test { let facet_candidates = 
context.word_docids("earth").unwrap().unwrap(); let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())); + let parent = Initial::new(query_tree, Some(facet_candidates.clone())); + let mut criteria = Typo::new(&context, Box::new(parent)); let expected = CriterionResult { query_tree: None, @@ -442,7 +425,8 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())); + let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone())); + let mut criteria = Typo::new(&context, Box::new(parent)); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 0aa3b483a..5bb9d8d90 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -12,34 +12,18 @@ pub struct Words<'t> { query_trees: Vec, candidates: Option, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } impl<'t> Words<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Words { - ctx, - query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), - candidates, - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: HashMap::default(), - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Words { ctx, query_trees: Vec::default(), candidates: None, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, candidates_cache: HashMap::default(), } } @@ -65,27 +49,17 @@ impl<'t> Criterion for Words<'t> { found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); - let 
bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => found_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(qt), candidates: Some(found_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (Some(qt), None) => { - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => RoaringBitmap::new(), - }; - return Ok(Some(CriterionResult { query_tree: Some(qt), candidates: None, - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (None, Some(_)) => { @@ -97,16 +71,18 @@ impl<'t> Criterion for Words<'t> { })); }, (None, None) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); - self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? 
{ + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); + self.candidates = candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 174fff35c..4f0bde422 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -13,9 +13,8 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct}; - -use crate::search::criteria::fetcher::{Fetcher, FetcherResult}; -use crate::{DocumentId, Index}; +use crate::search::criteria::r#final::{Final, FinalResult}; +use crate::{Index, DocumentId}; pub use self::facet::{ FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator, @@ -162,14 +161,14 @@ impl<'a> Search<'a> { &self, mut distinct: impl for<'c> Distinct<'c>, matching_words: MatchingWords, - mut criteria: Fetcher, + mut criteria: Final, ) -> anyhow::Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); let mut excluded_documents = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); - while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { + while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next()? 
{ debug!("Number of candidates found {}", candidates.len()); let excluded = take(&mut excluded_documents); From 7aa5753ed282afd2df90f1fae07beb2a1b8eeb68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 24 Mar 2021 15:06:54 +0100 Subject: [PATCH 16/45] Make the attribute positions range bounds to be fixed --- http-ui/src/main.rs | 6 +-- milli/src/update/index_documents/mod.rs | 6 +-- milli/src/update/words_level_positions.rs | 47 +++++++++++++++-------- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index dbf7aadce..c85bd9b15 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; use std::net::SocketAddr; -use std::num::NonZeroUsize; +use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -286,8 +286,8 @@ struct WordsPrefixes { #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] struct WordsLevelPositions { - level_group_size: Option, - min_level_size: Option, + level_group_size: Option, + min_level_size: Option, } // Any value that is present is considered Some value, including null. 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3a41a52ae..7a2196481 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; -use std::num::NonZeroUsize; +use std::num::{NonZeroU32, NonZeroUsize}; use std::sync::mpsc::sync_channel; use std::time::Instant; @@ -263,8 +263,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: Option, words_prefix_threshold: Option, max_prefix_length: Option, - words_positions_level_group_size: Option, - words_positions_min_level_size: Option, + words_positions_level_group_size: Option, + words_positions_min_level_size: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 4286fc780..eb8d3bb3c 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,7 +1,7 @@ use std::cmp; use std::convert::TryFrom; use std::fs::File; -use std::num::NonZeroUsize; +use std::num::NonZeroU32; use grenad::{CompressionType, Reader, Writer, FileFuse}; use heed::types::{DecodeIgnore, Str}; @@ -20,8 +20,8 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, _update_id: u64, } @@ -38,18 +38,18 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { chunk_compression_type: CompressionType::None, chunk_compression_level: None, chunk_fusing_shrink_size: None, - level_group_size: NonZeroUsize::new(4).unwrap(), - min_level_size: NonZeroUsize::new(5).unwrap(), + 
level_group_size: NonZeroU32::new(4).unwrap(), + min_level_size: NonZeroU32::new(5).unwrap(), _update_id: update_id, } } - pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); + pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { + self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); self } - pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { + pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { self.min_level_size = value; self } @@ -84,6 +84,20 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { } } +/// Returns the next number after or equal to `x` that is divisible by `d`. +fn next_divisible(x: u32, d: u32) -> u32 { + (x.saturating_sub(1) | (d - 1)) + 1 +} + +/// Returns the previous number after or equal to `x` that is divisible by `d`, +/// saturates on zero. +fn previous_divisible(x: u32, d: u32) -> u32 { + match x.checked_sub(d - 1) { + Some(0) | None => 0, + Some(x) => next_divisible(x, d), + } +} + /// Generates all the words positions levels based on the levels zero (including the level zero). fn compute_positions_levels( rtxn: &heed::RoTxn, @@ -92,8 +106,8 @@ fn compute_positions_levels( compression_type: CompressionType, compression_level: Option, shrink_size: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, ) -> anyhow::Result> { // It is forbidden to keep a cursor and write in a database at the same time with LMDB @@ -113,7 +127,7 @@ fn compute_positions_levels( let first_level_size = words_positions_db.remap_data_type::() .range(rtxn, &level_0_range)? 
- .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. @@ -136,20 +150,23 @@ fn compute_positions_levels( let ((_word, _level, value, _right), docids) = result?; if i == 0 { - left = value; - } else if i % group_size == 0 { + left = previous_divisible(value, group_size); + right = left + (group_size - 1); + } + + if value > right { // we found the first bound of the next group, we must store the left // and right bounds associated with the docids. write_level_entry(&mut writer, word, level, left, right, &group_docids)?; // We save the left bound for the new group and also reset the docids. group_docids = RoaringBitmap::new(); - left = value; + left = previous_divisible(value, group_size); + right = left + (group_size - 1); } // The right bound is always the bound we run through. 
group_docids.union_with(&docids); - right = value; } if !group_docids.is_empty() { From 0ad9499b935db85347323f14b2144c1c6a45d924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 24 Mar 2021 15:37:03 +0100 Subject: [PATCH 17/45] Fix an indexing bug in the words level positions --- milli/src/update/words_level_positions.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index eb8d3bb3c..70bc89860 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -121,7 +121,7 @@ fn compute_positions_levels( let level_0_range = { let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); - let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + let right = (word, TreeLevel::min_value(), u32::max_value(), u32::max_value()); left..=right }; From ab92c814c3247b03bf2447ef5a346688a80cef95 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 24 Mar 2021 18:20:13 +0100 Subject: [PATCH 18/45] Fix attributes score --- milli/src/search/criteria/attribute.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 6398c7d87..160807847 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -122,7 +122,8 @@ fn linear_compute_candidates( fn compute_candidate_rank(branches: &Vec>, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { - let mut branch_rank = 0; + let branch_len = branch.len(); + let mut branch_rank = Vec::with_capacity(branch_len); for Query { prefix, kind } in branch { // find the best position of the current word in the document. 
let position = match kind { @@ -145,13 +146,21 @@ fn linear_compute_candidates( // if a position is found, we add it to the branch score, // otherwise the branch is considered as unfindable in this document and we break. if let Some(position) = position { - branch_rank += position as u64; + branch_rank.push(position as u64); } else { - branch_rank = u64::max_value(); + branch_rank.clear(); break; } } - min_rank = min_rank.min(branch_rank); + + if !branch_rank.is_empty() { + branch_rank.sort_unstable(); + // because several words in same query can't match all a the position 0, + // we substract the word index to the position. + let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); + // here we do the means of the words of the branch + min_rank = min_rank.min(branch_rank / branch_len as u64); + } } min_rank From e65bad16ccd273625a624d44bde06e01eaf08bdb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Mar 2021 11:10:12 +0100 Subject: [PATCH 19/45] Compute the words prefixes at the end of an update --- http-ui/src/main.rs | 68 ------ infos/src/main.rs | 1 + milli/src/index.rs | 6 +- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 16 ++ .../update/index_documents/merge_function.rs | 4 + milli/src/update/index_documents/mod.rs | 37 +++- milli/src/update/mod.rs | 9 +- milli/src/update/update_builder.rs | 35 +--- milli/src/update/word_prefix_docids.rs | 75 +++++++ .../word_prefix_pair_proximity_docids.rs | 89 ++++++++ milli/src/update/words_level_positions.rs | 90 ++++++-- milli/src/update/words_prefixes.rs | 196 ------------------ milli/src/update/words_prefixes_fst.rs | 104 ++++++++++ 14 files changed, 409 insertions(+), 323 deletions(-) create mode 100644 milli/src/update/word_prefix_docids.rs create mode 100644 milli/src/update/word_prefix_pair_proximity_docids.rs delete mode 100644 milli/src/update/words_prefixes.rs create mode 100644 milli/src/update/words_prefixes_fst.rs diff --git 
a/http-ui/src/main.rs b/http-ui/src/main.rs index c85bd9b15..00618f58a 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -228,8 +228,6 @@ enum UpdateMeta { ClearDocuments, Settings(Settings), Facets(Facets), - WordsPrefixes(WordsPrefixes), - WordsLevelPositions(WordsLevelPositions), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -290,14 +288,6 @@ struct WordsLevelPositions { min_level_size: Option, } -// Any value that is present is considered Some value, including null. -fn deserialize_some<'de, T, D>(deserializer: D) -> Result, D::Error> -where T: Deserialize<'de>, - D: Deserializer<'de> -{ - Deserialize::deserialize(deserializer).map(Some) -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); @@ -496,36 +486,6 @@ async fn main() -> anyhow::Result<()> { Err(e) => Err(e) } } - UpdateMeta::WordsPrefixes(settings) => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.words_prefixes(&mut wtxn, &index_cloned); - if let Some(value) = settings.threshold { - builder.threshold(value); - } - if let Some(value) = settings.max_prefix_length { - builder.max_prefix_length(value); - } - match builder.execute() { - Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) - } - }, - UpdateMeta::WordsLevelPositions(levels) => { - // We must use the write transaction of the update here. 
- let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.words_level_positions(&mut wtxn, &index_cloned); - if let Some(value) = levels.level_group_size { - builder.level_group_size(value); - } - if let Some(value) = levels.min_level_size { - builder.min_level_size(value); - } - match builder.execute() { - Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()) - } - } }; let meta = match result { @@ -942,32 +902,6 @@ async fn main() -> anyhow::Result<()> { warp::reply() }); - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let change_words_prefixes_route = warp::filters::method::post() - .and(warp::path!("words-prefixes")) - .and(warp::body::json()) - .map(move |settings: WordsPrefixes| { - let meta = UpdateMeta::WordsPrefixes(settings); - let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); - let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - warp::reply() - }); - - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let change_words_level_positions_route = warp::filters::method::post() - .and(warp::path!("words-level-positions")) - .and(warp::body::json()) - .map(move |levels: WordsLevelPositions| { - let meta = UpdateMeta::WordsLevelPositions(levels); - let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); - let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - warp::reply() - }); - let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); let abort_update_id_route = warp::filters::method::delete() @@ -1042,8 +976,6 @@ async fn main() -> anyhow::Result<()> { .or(clearing_route) .or(change_settings_route) .or(change_facet_levels_route) - 
.or(change_words_prefixes_route) - .or(change_words_level_positions_route) .or(update_ws_route); let addr = SocketAddr::from_str(&opt.http_listen_addr)?; diff --git a/infos/src/main.rs b/infos/src/main.rs index 0e6403d7b..e730a8b43 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -338,6 +338,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho facet_field_id_value_docids, field_id_docid_facet_values: _, documents, + .. } = index; let main_name = "main"; diff --git a/milli/src/index.rs b/milli/src/index.rs index 0659b207a..ba7747250 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -54,6 +54,8 @@ pub struct Index { pub word_prefix_pair_proximity_docids: Database, /// Maps the word, level and position range with the docids that corresponds to it. pub word_level_position_docids: Database, + /// Maps the level positions of a word prefix with all the docids where this prefix appears. + pub word_prefix_level_position_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. pub facet_field_id_value_docids: Database, /// Maps the document id, the facet field id and the globally ordered value. 
@@ -64,7 +66,7 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(10); + options.max_dbs(11); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -74,6 +76,7 @@ impl Index { let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; + let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; @@ -98,6 +101,7 @@ impl Index { word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 6d7dd72b8..f89c2d00c 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -29,6 +29,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -57,6 +58,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; word_level_position_docids.clear(self.wtxn)?; + word_prefix_level_position_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; field_id_docid_facet_values.clear(self.wtxn)?; 
documents.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index f9303d339..4c5f8d61a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -89,6 +89,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -345,6 +346,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // We delete the documents ids that are under the word prefix level position docids. + let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; + } + } + + drop(iter); + Ok(self.documents_ids.len()) } } diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 54f994fc0..a6d008513 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -52,6 +52,10 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) - cbo_roaring_bitmap_merge(values) } +pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) +} + pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { cbo_roaring_bitmap_merge(values) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7a2196481..8ebdf1634 100644 --- a/milli/src/update/index_documents/mod.rs +++ 
b/milli/src/update/index_documents/mod.rs @@ -3,6 +3,7 @@ use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::num::{NonZeroU32, NonZeroUsize}; +use std::str; use std::sync::mpsc::sync_channel; use std::time::Instant; @@ -13,18 +14,21 @@ use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionTy use heed::types::ByteSlice; use log::{debug, info, error}; use memmap::Mmap; -use rayon::ThreadPool; use rayon::prelude::*; +use rayon::ThreadPool; use serde::{Serialize, Deserialize}; use crate::index::Index; -use crate::update::{Facets, WordsLevelPositions, WordsPrefixes, UpdateIndexingStep}; +use crate::update::{ + Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep, + WordPrefixPairProximityDocids, +}; use self::store::{Store, Readers}; pub use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, docid_word_positions_merge, documents_merge, - word_level_position_docids_merge, facet_field_value_docids_merge, - field_id_docid_facet_values_merge, + word_level_position_docids_merge, word_prefix_level_positions_docids_merge, + facet_field_value_docids_merge, field_id_docid_facet_values_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -719,10 +723,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { builder.execute()?; // Run the words prefixes update operation. 
- let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id); if let Some(value) = self.words_prefix_threshold { builder.threshold(value); } @@ -731,8 +732,26 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the word prefix docids update operation. + let mut builder = WordPrefixDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + builder.max_nb_chunks = self.max_nb_chunks; + builder.max_memory = self.max_memory; + builder.execute()?; + + // Run the word prefix pair proximity docids update operation. + let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + builder.max_nb_chunks = self.max_nb_chunks; + builder.max_memory = self.max_memory; + builder.execute()?; + // Run the words level positions update operation. 
- let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); + let mut builder = WordsLevelPositions::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 1fc4890fb..203937e2f 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,8 +6,10 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc pub use self::settings::{Setting, Settings}; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; +pub use self::word_prefix_docids::WordPrefixDocids; +pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; pub use self::words_level_positions::WordsLevelPositions; -pub use self::words_prefixes::WordsPrefixes; +pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; @@ -17,6 +19,7 @@ mod index_documents; mod settings; mod update_builder; mod update_step; +mod word_prefix_docids; +mod word_prefix_pair_proximity_docids; mod words_level_positions; -mod words_prefixes; - +mod words_prefixes_fst; diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 9a4fb850e..8d6eb034d 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -2,10 +2,7 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ - ClearDocuments, DeleteDocuments, IndexDocuments, Settings, - Facets, WordsPrefixes, WordsLevelPositions, -}; +use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -138,34 +135,4 @@ impl<'a> UpdateBuilder<'a> { builder } - - pub fn words_prefixes<'t, 'u, 'i>( - self, - wtxn: &'t mut 
heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordsPrefixes<'t, 'u, 'i> - { - let mut builder = WordsPrefixes::new(wtxn, index, self.update_id); - - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - - builder - } - - pub fn words_level_positions<'t, 'u, 'i>( - self, - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordsLevelPositions<'t, 'u, 'i> - { - let mut builder = WordsLevelPositions::new(wtxn, index, self.update_id); - - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - - builder - } } diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs new file mode 100644 index 000000000..58c984212 --- /dev/null +++ b/milli/src/update/word_prefix_docids.rs @@ -0,0 +1,75 @@ +use std::str; + +use crate::Index; +use fst::Streamer; +use grenad::CompressionType; +use heed::types::ByteSlice; + +use crate::update::index_documents::WriteMethod; +use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database}; + +pub struct WordPrefixDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, +} + +impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> { + WordPrefixDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, + } + } + + pub fn execute(self) -> 
anyhow::Result<()> { + // Clear the word prefix docids database. + self.index.word_prefix_docids.clear(self.wtxn)?; + + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // It is forbidden to keep a mutable reference into the database + // and write into it at the same time, therefore we write into another file. + let mut prefix_docids_sorter = create_sorter( + word_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We iterate over all the prefixes and retrieve the corresponding docids. + let mut prefix_stream = prefix_fst.stream(); + while let Some(bytes) = prefix_stream.next() { + let prefix = str::from_utf8(bytes)?; + let db = self.index.word_docids.remap_data_type::(); + for result in db.prefix_iter(self.wtxn, prefix)? { + let (_word, data) = result?; + prefix_docids_sorter.insert(prefix, data)?; + } + } + + drop(prefix_fst); + + // We finally write the word prefix docids into the LMDB database. 
+ sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_docids.as_polymorph(), + prefix_docids_sorter, + word_docids_merge, + WriteMethod::Append, + )?; + + Ok(()) + } +} diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs new file mode 100644 index 000000000..c972efc4f --- /dev/null +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -0,0 +1,89 @@ +use std::str; + +use fst::automaton::{Automaton, Str}; +use fst::{Streamer, IntoStreamer}; +use grenad::CompressionType; +use heed::BytesEncode; +use heed::types::ByteSlice; +use log::debug; + +use crate::Index; +use crate::heed_codec::StrStrU8Codec; +use crate::update::index_documents::{ + WriteMethod, create_sorter, sorter_into_lmdb_database, + words_pairs_proximities_docids_merge, +}; + +pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, +} + +impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> + { + WordPrefixPairProximityDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, + } + } + + pub fn execute(self) -> anyhow::Result<()> { + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; + + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // Here we create a sorter akin to the previous one. 
+ let mut word_prefix_pair_proximity_docids_sorter = create_sorter( + words_pairs_proximities_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We insert all the word pairs corresponding to the word-prefix pairs + // where the prefixes appears in the prefix FST previously constructed. + let db = self.index.word_pair_proximity_docids.remap_data_type::(); + for result in db.iter(self.wtxn)? { + let ((word1, word2, prox), data) = result?; + let automaton = Str::new(word2).starts_with(); + let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); + while let Some(prefix) = matching_prefixes.next() { + let prefix = str::from_utf8(prefix)?; + let pair = (word1, prefix, prox); + let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); + word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; + } + } + + drop(prefix_fst); + + // We finally write the word prefix pair proximity docids into the LMDB database. 
+ sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + word_prefix_pair_proximity_docids_sorter, + words_pairs_proximities_docids_merge, + WriteMethod::Append, + )?; + + Ok(()) + } +} diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 70bc89860..1b772c37d 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,17 +1,22 @@ -use std::cmp; +use std::{cmp, str}; use std::convert::TryFrom; use std::fs::File; use std::num::NonZeroU32; +use fst::automaton::{self, Automaton}; +use fst::{Streamer, IntoStreamer}; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::{DecodeIgnore, Str}; +use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::update::index_documents::{ + create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, + word_prefix_level_positions_docids_merge, sorter_into_lmdb_database +}; use crate::{Index, TreeLevel}; pub struct WordsLevelPositions<'t, 'u, 'i> { @@ -20,27 +25,24 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, level_group_size: NonZeroU32, min_level_size: NonZeroU32, - _update_id: u64, } impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - update_id: u64, - ) -> WordsLevelPositions<'t, 'u, 'i> - { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> 
WordsLevelPositions<'t, 'u, 'i> { WordsLevelPositions { wtxn, index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, level_group_size: NonZeroU32::new(4).unwrap(), min_level_size: NonZeroU32::new(5).unwrap(), - _update_id: update_id, } } @@ -76,7 +78,71 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_level_position_docids.as_polymorph(), entries, - |_, _| anyhow::bail!("invalid facet level merging"), + |_, _| anyhow::bail!("invalid word level position merging"), + WriteMethod::Append, + )?; + + // We compute the word prefix level positions database. + self.index.word_prefix_level_position_docids.clear(self.wtxn)?; + + let mut word_prefix_level_positions_docids_sorter = create_sorter( + word_prefix_level_positions_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We insert the word prefix level positions where the level is equal to 0 and + // corresponds to the word-prefix level positions where the prefixes appears + // in the prefix FST previously constructed. + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + let db = self.index.word_level_position_docids.remap_data_type::(); + for result in db.iter(self.wtxn)? 
{ + let ((word, level, left, right), data) = result?; + if level == TreeLevel::min_value() { + let automaton = automaton::Str::new(word).starts_with(); + let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); + while let Some(prefix) = matching_prefixes.next() { + let prefix = str::from_utf8(prefix)?; + let key = (prefix, level, left, right); + let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap(); + word_prefix_level_positions_docids_sorter.insert(bytes, data)?; + } + } + } + + // We finally write all the word prefix level positions docids with + // a level equal to 0 into the LMDB database. + sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_level_position_docids.as_polymorph(), + word_prefix_level_positions_docids_sorter, + word_prefix_level_positions_docids_merge, + WriteMethod::Append, + )?; + + let entries = compute_positions_levels( + self.wtxn, + self.index.word_prefix_docids.remap_data_type::(), + self.index.word_prefix_level_position_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + )?; + + // The previously computed entries also defines the level 0 entries + // so we can clear the database and append all of these entries. 
+ self.index.word_prefix_level_position_docids.clear(self.wtxn)?; + + write_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_level_position_docids.as_polymorph(), + entries, + |_, _| anyhow::bail!("invalid word prefix level position merging"), WriteMethod::Append, )?; diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs deleted file mode 100644 index f2fe526a2..000000000 --- a/milli/src/update/words_prefixes.rs +++ /dev/null @@ -1,196 +0,0 @@ -use std::iter::FromIterator; -use std::str; - -use chrono::Utc; -use fst::automaton::Str; -use fst::{Automaton, Streamer, IntoStreamer}; -use grenad::CompressionType; -use heed::BytesEncode; -use heed::types::ByteSlice; - -use crate::heed_codec::StrStrU8Codec; -use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database}; -use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge}; -use crate::{Index, SmallString32}; - -pub struct WordsPrefixes<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, - threshold: f64, - max_prefix_length: usize, - _update_id: u64, -} - -impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - update_id: u64, - ) -> WordsPrefixes<'t, 'u, 'i> - { - WordsPrefixes { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - chunk_fusing_shrink_size: None, - max_nb_chunks: None, - max_memory: None, - threshold: 0.1 / 100.0, // .01% - max_prefix_length: 4, - _update_id: update_id, - } - } - - /// Set the ratio of concerned words required to make a prefix be part of the words prefixes - /// database. 
If a word prefix is supposed to match more than this number of words in the - /// dictionnary, therefore this prefix is added to the words prefixes datastructures. - /// - /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped - /// to these bounds otherwise. - pub fn threshold(&mut self, value: f64) -> &mut Self { - self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] - self - } - - /// Set the maximum length of prefixes in bytes. - /// - /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped - /// to these bounds, otherwise. - pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value.min(25).max(1); // clamp [1, 25] - self - } - - pub fn execute(self) -> anyhow::Result<()> { - self.index.set_updated_at(self.wtxn, &Utc::now())?; - // Clear the words prefixes datastructures. - self.index.word_prefix_docids.clear(self.wtxn)?; - self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; - - let words_fst = self.index.words_fst(&self.wtxn)?; - let number_of_words = words_fst.len(); - let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; - - // It is forbidden to keep a mutable reference into the database - // and write into it at the same time, therefore we write into another file. - let mut prefix_docids_sorter = create_sorter( - word_docids_merge, - self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.max_nb_chunks, - self.max_memory, - ); - - let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); - for n in 1..=self.max_prefix_length { - - let mut current_prefix = SmallString32::new(); - let mut current_prefix_count = 0; - let mut builder = fst::SetBuilder::memory(); - - let mut stream = words_fst.stream(); - while let Some(bytes) = stream.next() { - // We try to get the first n bytes out of this string but we only want - // to split at valid characters bounds. 
If we try to split in the middle of - // a character we ignore this word and go to the next one. - let word = str::from_utf8(bytes)?; - let prefix = match word.get(..n) { - Some(prefix) => prefix, - None => continue, - }; - - // This is the first iteration of the loop, - // or the current word doesn't starts with the current prefix. - if current_prefix_count == 0 || prefix != current_prefix.as_str() { - current_prefix = SmallString32::from(prefix); - current_prefix_count = 0; - } - - current_prefix_count += 1; - - // There is enough words corresponding to this prefix to add it to the cache. - if current_prefix_count == min_number_of_words { - builder.insert(prefix)?; - } - } - - // We construct the final set for prefixes of size n. - prefix_fsts.push(builder.into_set()); - } - - // We merge all of the previously computed prefixes into on final set. - let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); - let mut builder = fst::SetBuilder::memory(); - builder.extend_stream(op.r#union())?; - let prefix_fst = builder.into_set(); - - // We iterate over all the prefixes and retrieve the corresponding docids. - let mut prefix_stream = prefix_fst.stream(); - while let Some(bytes) = prefix_stream.next() { - let prefix = str::from_utf8(bytes)?; - let db = self.index.word_docids.remap_data_type::(); - for result in db.prefix_iter(self.wtxn, prefix)? { - let (_word, data) = result?; - prefix_docids_sorter.insert(prefix, data)?; - } - } - - // Set the words prefixes FST in the dtabase. - self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; - - // We finally write the word prefix docids into the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_docids.as_polymorph(), - prefix_docids_sorter, - word_docids_merge, - WriteMethod::Append, - )?; - - // We compute the word prefix pair proximity database. - - // Here we create a sorter akin to the previous one. 
- let mut word_prefix_pair_proximity_docids_sorter = create_sorter( - words_pairs_proximities_docids_merge, - self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.max_nb_chunks, - self.max_memory, - ); - - // We insert all the word pairs corresponding to the word-prefix pairs - // where the prefixes appears in the prefix FST previously constructed. - let db = self.index.word_pair_proximity_docids.remap_data_type::(); - for result in db.iter(self.wtxn)? { - let ((word1, word2, prox), data) = result?; - let automaton = Str::new(word2).starts_with(); - let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); - while let Some(prefix) = matching_prefixes.next() { - let prefix = str::from_utf8(prefix)?; - let pair = (word1, prefix, prox); - let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); - word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; - } - } - - // We finally write the word prefix pair proximity docids into the LMDB database. 
- sorter_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - word_prefix_pair_proximity_docids_sorter, - words_pairs_proximities_docids_merge, - WriteMethod::Append, - )?; - - Ok(()) - } -} diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs new file mode 100644 index 000000000..f53b0ee00 --- /dev/null +++ b/milli/src/update/words_prefixes_fst.rs @@ -0,0 +1,104 @@ +use std::iter::FromIterator; +use std::str; + +use fst::Streamer; +use crate::{Index, SmallString32}; + +pub struct WordsPrefixesFst<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + threshold: f64, + max_prefix_length: usize, + _update_id: u64, +} + +impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> WordsPrefixesFst<'t, 'u, 'i> + { + WordsPrefixesFst { + wtxn, + index, + threshold: 0.1 / 100.0, // .01% + max_prefix_length: 4, + _update_id: update_id, + } + } + + /// Set the ratio of concerned words required to make a prefix be part of the words prefixes + /// database. If a word prefix is supposed to match more than this number of words in the + /// dictionnary, therefore this prefix is added to the words prefixes datastructures. + /// + /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped + /// to these bounds otherwise. + pub fn threshold(&mut self, value: f64) -> &mut Self { + self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] + self + } + + /// Set the maximum length of prefixes in bytes. + /// + /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped + /// to these bounds, otherwise. 
+ pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value.min(25).max(1); // clamp [1, 25] + self + } + + pub fn execute(self) -> anyhow::Result<()> { + let words_fst = self.index.words_fst(&self.wtxn)?; + let number_of_words = words_fst.len(); + let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; + + let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); + for n in 1..=self.max_prefix_length { + + let mut current_prefix = SmallString32::new(); + let mut current_prefix_count = 0; + let mut builder = fst::SetBuilder::memory(); + + let mut stream = words_fst.stream(); + while let Some(bytes) = stream.next() { + // We try to get the first n bytes out of this string but we only want + // to split at valid characters bounds. If we try to split in the middle of + // a character we ignore this word and go to the next one. + let word = str::from_utf8(bytes)?; + let prefix = match word.get(..n) { + Some(prefix) => prefix, + None => continue, + }; + + // This is the first iteration of the loop, + // or the current word doesn't starts with the current prefix. + if current_prefix_count == 0 || prefix != current_prefix.as_str() { + current_prefix = SmallString32::from(prefix); + current_prefix_count = 0; + } + + current_prefix_count += 1; + + // There is enough words corresponding to this prefix to add it to the cache. + if current_prefix_count == min_number_of_words { + builder.insert(prefix)?; + } + } + + // We construct the final set for prefixes of size n. + prefix_fsts.push(builder.into_set()); + } + + // We merge all of the previously computed prefixes into on final set. + let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); + let mut builder = fst::SetBuilder::memory(); + builder.extend_stream(op.r#union())?; + let prefix_fst = builder.into_set(); + + // Set the words prefixes FST in the dtabase. 
+ self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; + + Ok(()) + } +} From 1aad66bdaafcf29428f30c3cf7463c0635396a7e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Mar 2021 11:17:32 +0100 Subject: [PATCH 20/45] Compute stats about the word prefix level positions database in the infos crate --- infos/src/main.rs | 101 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index e730a8b43..81b753084 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -21,6 +21,7 @@ const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; +const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; const DOCUMENTS_DB_NAME: &str = "documents"; @@ -33,6 +34,7 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_LEVEL_POSITION_DOCIDS_DB_NAME, + WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, FIELD_ID_DOCID_FACET_VALUES_DB_NAME, DOCUMENTS_DB_NAME, @@ -122,10 +124,21 @@ enum Command { #[structopt(long)] full_display: bool, - /// The field name in the document. + /// Words appearing in the documents. words: Vec, }, + /// Outputs a CSV with the documents ids along with + /// the word prefix level positions where it appears. + WordPrefixesLevelPositionsDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// Prefixes of words appearing in the documents. 
+ prefixes: Vec, + }, + /// Outputs a CSV with the documents ids, words and the positions where this word appears. DocidsWordsPositions { /// Display the whole positions in detail. @@ -236,6 +249,9 @@ fn main() -> anyhow::Result<()> { WordsLevelPositionsDocids { full_display, words } => { words_level_positions_docids(&index, &rtxn, !full_display, words) }, + WordPrefixesLevelPositionsDocids { full_display, prefixes } => { + word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) + }, DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, @@ -335,6 +351,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, documents, @@ -348,6 +365,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let word_level_position_docids_name = "word_level_position_docids"; + let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let documents_name = "documents"; @@ -411,6 +429,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in word_prefix_level_position_docids.remap_data_type::().iter(rtxn)? 
{ + let ((word, level, left, right), value) = result?; + let key = format!("{} {} {:?}", word, level, left..=right); + heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { @@ -588,6 +613,45 @@ fn words_level_positions_docids( Ok(wtr.flush()?) } +fn word_prefixes_level_positions_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + prefixes: Vec, +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?; + + for word in prefixes.iter().map(AsRef::as_ref) { + let range = { + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + left..=right + }; + for result in index.word_prefix_level_position_docids.range(rtxn, &range)? { + let ((w, level, left, right), docids) = result?; + + let count = docids.len().to_string(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + let position_range = if level == TreeLevel::min_value() { + format!("{:?}", left) + } else { + format!("{:?}", left..=right) + }; + let level = level.to_string(); + wtr.write_record(&[w, &level, &position_range, &count, &docids])?; + } + } + + Ok(wtr.flush()?) 
+} + fn docids_words_positions( index: &Index, rtxn: &heed::RoTxn, @@ -779,6 +843,21 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> anyhow::Result<()> { use heed::types::ByteSlice; + let Index { + env: _, + main, + word_docids, + word_prefix_docids, + docid_word_positions, + word_pair_proximity_docids, + word_prefix_pair_proximity_docids, + word_level_position_docids, + word_prefix_level_position_docids, + facet_field_id_value_docids, + field_id_docid_facet_values, + documents, + } = index; + let names = if names.is_empty() { ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect() } else { @@ -787,15 +866,17 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a for name in names { let database = match name.as_str() { - MAIN_DB_NAME => &index.main, - WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(), - WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(), - DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), - FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => index.facet_field_id_value_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_VALUES_DB_NAME => index.field_id_docid_facet_values.as_polymorph(), - DOCUMENTS_DB_NAME => index.documents.as_polymorph(), + MAIN_DB_NAME => &main, + WORD_PREFIX_DOCIDS_DB_NAME => word_prefix_docids.as_polymorph(), + WORD_DOCIDS_DB_NAME => word_docids.as_polymorph(), + DOCID_WORD_POSITIONS_DB_NAME => docid_word_positions.as_polymorph(), + WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_pair_proximity_docids.as_polymorph(), + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), + WORD_LEVEL_POSITION_DOCIDS_DB_NAME => 
word_level_position_docids.as_polymorph(), + WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(), + DOCUMENTS_DB_NAME => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; From 7ff4a2a708d4d08a25fd800348316ee361108c5d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Mar 2021 23:45:06 +0100 Subject: [PATCH 21/45] Display the number of entries in the infos crate --- infos/src/main.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/infos/src/main.rs b/infos/src/main.rs index 81b753084..5a12a9d4d 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -882,16 +882,19 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a let mut key_size: u64 = 0; let mut val_size: u64 = 0; + let mut number_entries: u64 = 0; for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? 
{ let (k, v) = result?; key_size += k.len() as u64; val_size += v.len() as u64; + number_entries += 1; } println!("The {} database weigh:", name); println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true)); println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true)); println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true)); + println!("\tnumber of entries: {}", number_entries); } Ok(()) From 361193099fdeedb8b4b6fb5bf450bc9baa07f5cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 29 Mar 2021 16:25:14 +0200 Subject: [PATCH 22/45] Reduce the amount of branches when query tree flattened --- milli/src/search/criteria/attribute.rs | 83 +++++++++++++++----------- 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 160807847..31c11e7bb 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,3 +1,4 @@ +use std::cmp; use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; @@ -15,7 +16,7 @@ pub struct Attribute<'t> { candidates: Option, bucket_candidates: RoaringBitmap, parent: Box, - flattened_query_tree: Option>>, + flattened_query_tree: Option>>>, current_buckets: Option>, } @@ -115,33 +116,43 @@ impl<'t> Criterion for Attribute<'t> { fn linear_compute_candidates( ctx: &dyn Context, - branches: &Vec>, + branches: &Vec>>, allowed_candidates: &RoaringBitmap, ) -> anyhow::Result> { - fn compute_candidate_rank(branches: &Vec>, words_positions: HashMap) -> u64 { + fn compute_candidate_rank(branches: &Vec>>, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { + let branch_len = branch.len(); let mut branch_rank = Vec::with_capacity(branch_len); - for Query { prefix, kind } in branch { - // find the best position of the current word in the document. 
- let position = match kind { - QueryKind::Exact { word, .. } => { - if *prefix { - word_derivations(word, true, 0, &words_positions) - .flat_map(|positions| positions.iter().next()).min() - } else { - words_positions.get(word) - .map(|positions| positions.iter().next()) - .flatten() - } - }, - QueryKind::Tolerant { typo, word } => { - word_derivations(word, *prefix, *typo, &words_positions) - .flat_map(|positions| positions.iter().next()).min() - }, - }; + for derivates in branch { + let mut position = None; + for Query { prefix, kind } in derivates { + // find the best position of the current word in the document. + let current_position = match kind { + QueryKind::Exact { word, .. } => { + if *prefix { + word_derivations(word, true, 0, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + } else { + words_positions.get(word) + .map(|positions| positions.iter().next()) + .flatten() + } + }, + QueryKind::Tolerant { typo, word } => { + word_derivations(word, *prefix, *typo, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + }, + }; + + match (position, current_position) { + (Some(p), Some(cp)) => position = Some(cmp::min(p, cp)), + (None, Some(cp)) => position = Some(cp), + _ => (), + } + } // if a position is found, we add it to the branch score, // otherwise the branch is considered as unfindable in this document and we break. 
@@ -194,10 +205,10 @@ fn linear_compute_candidates( } // TODO can we keep refs of Query -fn flatten_query_tree(query_tree: &Operation) -> Vec> { +fn flatten_query_tree(query_tree: &Operation) -> Vec>> { use crate::search::criteria::Operation::{And, Or, Consecutive}; - fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec> { + fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec>> { match tail.split_first() { Some((thead, tail)) => { let tail = and_recurse(thead, tail); @@ -215,13 +226,17 @@ fn flatten_query_tree(query_tree: &Operation) -> Vec> { } } - fn recurse(op: &Operation) -> Vec> { + fn recurse(op: &Operation) -> Vec>> { match op { And(ops) | Consecutive(ops) => { ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) }, - Or(_, ops) => ops.into_iter().map(recurse).flatten().collect(), - Operation::Query(query) => vec![vec![query.clone()]], + Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { + vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] + } else { + ops.into_iter().map(recurse).flatten().collect() + }, + Operation::Query(query) => vec![vec![vec![query.clone()]]], } } @@ -256,19 +271,19 @@ mod tests { ]); let expected = vec![ - vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }], + vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], vec![ - Query { prefix: false, kind: QueryKind::exact(S("manythe")) }, - Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + vec![Query { prefix: false, kind: QueryKind::exact(S("manythe")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], ], vec![ - Query { prefix: false, kind: QueryKind::exact(S("many")) }, - Query { prefix: false, kind: QueryKind::exact(S("thefish")) }, + vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("thefish")) }], ], vec![ - Query { prefix: false, kind: QueryKind::exact(S("many")) }, - Query { 
prefix: false, kind: QueryKind::exact(S("the")) }, - Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("the")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], ], ]; From 59f58c15f7cedf16acdbb3a89bae17247ffcc3ab Mon Sep 17 00:00:00 2001 From: many Date: Wed, 31 Mar 2021 19:23:02 +0200 Subject: [PATCH 23/45] Implement attribute criterion * Implement WordLevelIterator * Implement QueryLevelIterator * Implement set algorithm based on iterators Not tested + Some TODO to fix --- milli/src/search/criteria/attribute.rs | 354 +++++++++++++++++++++++-- milli/src/search/criteria/final.rs | 4 +- milli/src/search/criteria/mod.rs | 52 +++- milli/src/search/criteria/proximity.rs | 4 +- milli/src/search/criteria/typo.rs | 4 +- milli/src/search/criteria/words.rs | 4 +- milli/src/tree_level.rs | 4 + 7 files changed, 394 insertions(+), 32 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 31c11e7bb..af336c21f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,17 +1,17 @@ -use std::cmp; +use std::{cmp::{self, Ordering}, collections::BinaryHeap}; use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; use roaring::RoaringBitmap; -use crate::{search::build_dfa}; +use crate::{TreeLevel, search::build_dfa}; use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; use crate::search::WordDerivationsCache; use super::{Criterion, CriterionResult, Context, resolve_query_tree}; pub struct Attribute<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, query_tree: Option, candidates: Option, bucket_candidates: RoaringBitmap, @@ -21,7 +21,7 @@ pub struct Attribute<'t> { } impl<'t> Attribute<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t 
dyn Context<'t>, parent: Box) -> Self { Attribute { ctx, query_tree: None, @@ -51,23 +51,27 @@ impl<'t> Criterion for Attribute<'t> { flatten_query_tree(&qt) }); - let current_buckets = match self.current_buckets.as_mut() { - Some(current_buckets) => current_buckets, - None => { - let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; - self.current_buckets.get_or_insert(new_buckets.into_iter()) - }, - }; + let found_candidates = if candidates.len() < 1000 { + let current_buckets = match self.current_buckets.as_mut() { + Some(current_buckets) => current_buckets, + None => { + let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + self.current_buckets.get_or_insert(new_buckets.into_iter()) + }, + }; - let found_candidates = match current_buckets.next() { - Some((_score, candidates)) => candidates, - None => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, + match current_buckets.next() { + Some((_score, candidates)) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + } + } else { + set_compute_candidates(self.ctx, flattened_query_tree, candidates)? 
}; candidates.difference_with(&found_candidates); @@ -114,6 +118,316 @@ impl<'t> Criterion for Attribute<'t> { } } +struct WordLevelIterator<'t, 'q> { + inner: Box> + 't>, + level: TreeLevel, + interval_size: u32, + word: &'q str, + in_prefix_cache: bool, + inner_next: Option<(u32, u32, RoaringBitmap)>, + current_interval: Option<(u32, u32)>, +} + +impl<'t, 'q> WordLevelIterator<'t, 'q> { + fn new(ctx: &'t dyn Context<'t>, query: &'q Query) -> heed::Result> { + // TODO make it typo/prefix tolerant + let word = query.kind.word(); + let in_prefix_cache = query.prefix && ctx.in_prefix_cache(word); + match ctx.word_position_last_level(word, in_prefix_cache)? { + Some(level) => { + let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); + let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) + }, + None => Ok(None), + } + } + + fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel) -> heed::Result { + let level = level.min(&self.level).clone(); + let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); + let word = self.word; + let in_prefix_cache = self.in_prefix_cache; + // TODO try to dig starting from the current interval + // let left = self.current_interval.map(|(left, _)| left); + let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + + Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) + } + + fn next(&mut self) -> heed::Result> { + fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left } + + let inner_next = match self.inner_next.take() { + Some(inner_next) => Some(inner_next), + None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)), + }; + + match inner_next { + Some((left, right, docids)) => { + match self.current_interval { + 
Some((last_left, last_right)) if !is_next_interval(last_right, left) => { + let blank_left = last_left + self.interval_size; + let blank_right = last_right + self.interval_size; + self.current_interval = Some((blank_left, blank_right)); + self.inner_next = Some((left, right, docids)); + Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) + }, + _ => { + self.current_interval = Some((left, right)); + Ok(Some((left, right, docids))) + } + } + }, + None => Ok(None), + } + } +} + +struct QueryLevelIterator<'t, 'q> { + previous: Option>>, + inner: Vec>, + level: TreeLevel, + accumulator: Vec>, + previous_accumulator: Vec>, +} + +impl<'t, 'q> QueryLevelIterator<'t, 'q> { + fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec) -> heed::Result> { + let mut inner = Vec::with_capacity(queries.len()); + for query in queries { + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, query)? { + inner.push(word_level_iterator); + } + } + + let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone()); + match highest { + Some(level) => Ok(Some(Self { + previous: None, + inner, + level, + accumulator: vec![], + previous_accumulator: vec![], + })), + None => Ok(None), + } + } + + fn previous(&mut self, previous: QueryLevelIterator<'t, 'q>) -> &Self { + self.previous = Some(Box::new(previous)); + self + } + + fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result { + let (level, previous) = match &self.previous { + Some(previous) => { + let previous = previous.dig(ctx)?; + (previous.level.min(self.level), Some(Box::new(previous))) + }, + None => (self.level.saturating_sub(1), None), + }; + + let mut inner = Vec::with_capacity(self.inner.len()); + for word_level_iterator in self.inner.iter() { + inner.push(word_level_iterator.dig(ctx, &level)?); + } + + Ok(Self {previous, inner, level, accumulator: vec![], previous_accumulator: vec![]}) + } + + + + fn inner_next(&mut self, level: TreeLevel) -> heed::Result> { + let mut accumulated: Option<(u32, 
u32, RoaringBitmap)> = None; + let u8_level = Into::::into(level); + let interval_size = 4u32.pow(u8_level as u32); + for wli in self.inner.iter_mut() { + let wli_u8_level = Into::::into(wli.level.clone()); + let accumulated_count = 4u32.pow((u8_level - wli_u8_level) as u32); + for _ in 0..accumulated_count { + if let Some((next_left, _, next_docids)) = wli.next()? { + accumulated = accumulated.take().map( + |(acc_left, acc_right, mut acc_docids)| { + acc_docids.union_with(&next_docids); + (acc_left, acc_right, acc_docids) + } + ).or_else(|| Some((next_left, next_left + interval_size, next_docids))); + } + } + } + + Ok(accumulated) + } + + fn next(&mut self) -> heed::Result<(TreeLevel, Option<(u32, u32, RoaringBitmap)>)> { + let previous_result = match self.previous.as_mut() { + Some(previous) => { + Some(previous.next()?) + }, + None => None, + }; + + match previous_result { + Some((previous_level, previous_next)) => { + let inner_next = self.inner_next(previous_level)?; + self.accumulator.push(inner_next); + self.previous_accumulator.push(previous_next); + // TODO @many clean firsts intervals of both accumulators when both RoaringBitmap are empty, + // WARNING the cleaned intervals count needs to be kept to skip at the end + let mut merged_interval = None; + for current in self.accumulator.iter().rev().zip(self.previous_accumulator.iter()) { + if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { + let (_, _, merged_docids) = merged_interval.get_or_insert_with(|| (left_a + left_b, right_a + right_b, RoaringBitmap::new())); + merged_docids.union_with(&(a & b)); + } + } + Ok((previous_level, merged_interval)) + }, + None => { + let level = self.level.clone(); + let next_interval = self.inner_next(level.clone())?; + self.accumulator = vec![next_interval.clone()]; + Ok((level, next_interval)) + } + } + } +} + +struct Branch<'t, 'q> { + query_level_iterator: QueryLevelIterator<'t, 'q>, + last_result: Option<(u32, u32, RoaringBitmap)>, + 
tree_level: TreeLevel, + branch_size: u32, +} + +impl<'t, 'q> Branch<'t, 'q> { + fn cmp(&self, other: &Self) -> Ordering { + fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((1..branch_size).sum()) / branch_size } + match (&self.last_result, &other.last_result) { + (Some((s_left, _, _)), Some((o_left, _, _))) => { + // we compute a rank from the left interval. + let self_rank = compute_rank(*s_left, self.branch_size); + let other_rank = compute_rank(*o_left, other.branch_size); + let left_cmp = self_rank.cmp(&other_rank).reverse(); + // on level: higher is better, + // we want to reduce highest levels first. + let level_cmp = self.tree_level.cmp(&other.tree_level); + + left_cmp.then(level_cmp) + }, + (Some(_), None) => Ordering::Greater, + (None, Some(_)) => Ordering::Less, + (None, None) => Ordering::Equal, + } + } +} + +impl<'t, 'q> Ord for Branch<'t, 'q> { + fn cmp(&self, other: &Self) -> Ordering { + self.cmp(other) + } +} + +impl<'t, 'q> PartialOrd for Branch<'t, 'q> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'t, 'q> PartialEq for Branch<'t, 'q> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl<'t, 'q> Eq for Branch<'t, 'q> {} + +fn initialize_query_level_iterators<'t, 'q>( + ctx: &'t dyn Context<'t>, + branches: &'q Vec>>, +) -> heed::Result>> { + + let mut positions = BinaryHeap::with_capacity(branches.len()); + for branch in branches { + let mut branch_positions = Vec::with_capacity(branch.len()); + for query in branch { + match QueryLevelIterator::new(ctx, query)? { + Some(qli) => branch_positions.push(qli), + None => { + // the branch seems to be invalid, so we skip it. + branch_positions.clear(); + break; + }, + } + } + // QueryLevelIterators need to be sorted by level and folded in descending order. 
+ branch_positions.sort_unstable_by_key(|qli| qli.level); + let folded_query_level_iterators = branch_positions + .into_iter() + .rev() + .fold(None, |fold: Option, qli| match fold { + Some(mut fold) => { + fold.previous(qli); + Some(fold) + }, + None => Some(qli), + }); + + if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { + let (tree_level, last_result) = folded_query_level_iterators.next()?; + let branch = Branch { + last_result, + tree_level, + query_level_iterator: folded_query_level_iterators, + branch_size: branch.len() as u32, + }; + positions.push(branch); + } + } + + Ok(positions) +} + +fn set_compute_candidates<'t>( + ctx: &'t dyn Context<'t>, + branches: &Vec>>, + allowed_candidates: &RoaringBitmap, +) -> anyhow::Result +{ + let mut branches_heap = initialize_query_level_iterators(ctx, branches)?; + let lowest_level = TreeLevel::min_value(); + + while let Some(mut branch) = branches_heap.peek_mut() { + let is_lowest_level = branch.tree_level == lowest_level; + match branch.last_result.as_mut() { + Some((_, _, candidates)) => { + candidates.intersect_with(&allowed_candidates); + if candidates.len() > 0 && is_lowest_level { + // we have candidates, but we can't dig deeper, return candidates. + return Ok(std::mem::take(candidates)); + } else if candidates.len() > 0 { + // we have candidates, lets dig deeper in levels. + let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; + let (tree_level, last_result) = query_level_iterator.next()?; + branch.query_level_iterator = query_level_iterator; + branch.tree_level = tree_level; + branch.last_result = last_result; + } else { + // we don't have candidates, get next interval. + let (_, last_result) = branch.query_level_iterator.next()?; + branch.last_result = last_result; + } + }, + // None = no candidates to find. + None => return Ok(RoaringBitmap::new()), + } + } + + // we made all iterations without finding anything. 
+ Ok(RoaringBitmap::new()) +} + fn linear_compute_candidates( ctx: &dyn Context, branches: &Vec>>, diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index fe224ef94..d3c394467 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -19,13 +19,13 @@ pub struct FinalResult { } pub struct Final<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, parent: Box, wdcache: WordDerivationsCache, } impl<'t> Final<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Final<'t> { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Final<'t> { Final { ctx, parent, wdcache: WordDerivationsCache::new() } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 5e75be6ce..b972a0b2c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -4,7 +4,7 @@ use std::borrow::Cow; use anyhow::bail; use roaring::RoaringBitmap; -use crate::search::{word_derivations, WordDerivationsCache}; +use crate::{TreeLevel, search::{word_derivations, WordDerivationsCache}}; use crate::{Index, DocumentId}; use super::query_tree::{Operation, Query, QueryKind}; @@ -64,7 +64,7 @@ impl Default for Candidates { } } -pub trait Context { +pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; @@ -73,6 +73,8 @@ pub trait Context { fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; + fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; + fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; } pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -81,7 +83,7 @@ pub struct CriteriaBuilder<'t> { words_prefixes_fst: 
fst::Set>, } -impl<'a> Context for CriteriaBuilder<'a> { +impl<'c> Context<'c> for CriteriaBuilder<'c> { fn documents_ids(&self) -> heed::Result { self.index.documents_ids(self.rtxn) } @@ -120,6 +122,40 @@ impl<'a> Context for CriteriaBuilder<'a> { } Ok(words_positions) } + + fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>> { + let range = { + let left = left.unwrap_or(u32::min_value()); + let right = right.unwrap_or(u32::max_value()); + let left = (word, level, left, left); + let right = (word, level, right, right); + left..=right + }; + let db = match in_prefix_cache { + true => self.index.word_prefix_level_position_docids, + false => self.index.word_level_position_docids, + }; + + Ok(Box::new(db.range(self.rtxn, &range)?)) + } + + fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result> { + let range = { + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + left..=right + }; + let db = match in_prefix_cache { + true => self.index.word_prefix_level_position_docids, + false => self.index.word_level_position_docids, + }; + let last_level = db + .remap_data_type::() + .range(self.rtxn, &range)?.last().transpose()? 
+ .map(|((_, level, _, _), _)| level); + + Ok(last_level) + } } impl<'t> CriteriaBuilder<'t> { @@ -354,7 +390,7 @@ pub mod test { docid_words: HashMap>, } - impl<'a> Context for TestContext<'a> { + impl<'c> Context<'c> for TestContext<'c> { fn documents_ids(&self) -> heed::Result { Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) } @@ -397,6 +433,14 @@ pub mod test { Ok(HashMap::new()) } } + + fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option, _right: Option) -> heed::Result> + 'c>> { + todo!() + } + + fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result> { + todo!() + } } impl<'a> Default for TestContext<'a> { diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index dc1daafb2..ca412bf28 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -13,7 +13,7 @@ use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proxim type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; pub struct Proximity<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, /// ((max_proximity, query_tree), allowed_candidates) state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, proximity: u8, @@ -24,7 +24,7 @@ pub struct Proximity<'t> { } impl<'t> Proximity<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Proximity { ctx, state: None, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 40b06afc4..bf58fa258 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -9,7 +9,7 @@ use crate::search::{word_derivations, WordDerivationsCache}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; pub struct Typo<'t> { - ctx: &'t dyn Context, + ctx: 
&'t dyn Context<'t>, query_tree: Option<(usize, Operation)>, number_typos: u8, candidates: Candidates, @@ -19,7 +19,7 @@ pub struct Typo<'t> { } impl<'t> Typo<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Typo { ctx, query_tree: None, diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 5bb9d8d90..047b3c5f0 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -8,7 +8,7 @@ use crate::search::query_tree::Operation; use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache}; pub struct Words<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, query_trees: Vec, candidates: Option, bucket_candidates: RoaringBitmap, @@ -17,7 +17,7 @@ pub struct Words<'t> { } impl<'t> Words<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Words { ctx, query_trees: Vec::default(), diff --git a/milli/src/tree_level.rs b/milli/src/tree_level.rs index 7ce2904e2..b69316cf6 100644 --- a/milli/src/tree_level.rs +++ b/milli/src/tree_level.rs @@ -21,6 +21,10 @@ impl TreeLevel { pub const fn min_value() -> TreeLevel { TreeLevel(0) } + + pub fn saturating_sub(&self, lhs: u8) -> TreeLevel { + TreeLevel(self.0.saturating_sub(lhs)) + } } impl Into for TreeLevel { From 1eee0029a8d3633f42a045d654c94048cd9f4e40 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 1 Apr 2021 14:42:23 +0200 Subject: [PATCH 24/45] Make attribute criterion typo/prefix tolerant --- milli/src/search/criteria/attribute.rs | 57 ++++++++++++++++++-------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index af336c21f..87f9d4dde 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,4 +1,4 @@ -use std::{cmp::{self, Ordering}, 
collections::BinaryHeap}; +use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use crate::{TreeLevel, search::build_dfa}; use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; -use crate::search::WordDerivationsCache; +use crate::search::{word_derivations, WordDerivationsCache}; use super::{Criterion, CriterionResult, Context, resolve_query_tree}; pub struct Attribute<'t> { @@ -71,7 +71,7 @@ impl<'t> Criterion for Attribute<'t> { }, } } else { - set_compute_candidates(self.ctx, flattened_query_tree, candidates)? + set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? }; candidates.difference_with(&found_candidates); @@ -122,21 +122,18 @@ struct WordLevelIterator<'t, 'q> { inner: Box> + 't>, level: TreeLevel, interval_size: u32, - word: &'q str, + word: Cow<'q, str>, in_prefix_cache: bool, inner_next: Option<(u32, u32, RoaringBitmap)>, current_interval: Option<(u32, u32)>, } impl<'t, 'q> WordLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, query: &'q Query) -> heed::Result> { - // TODO make it typo/prefix tolerant - let word = query.kind.word(); - let in_prefix_cache = query.prefix && ctx.in_prefix_cache(word); - match ctx.word_position_last_level(word, in_prefix_cache)? { + fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result> { + match ctx.word_position_last_level(&word, in_prefix_cache)? 
{ Some(level) => { let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); - let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) }, None => Ok(None), @@ -146,11 +143,11 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel) -> heed::Result { let level = level.min(&self.level).clone(); let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); - let word = self.word; + let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; // TODO try to dig starting from the current interval // let left = self.current_interval.map(|(left, _)| left); - let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) } @@ -193,11 +190,33 @@ struct QueryLevelIterator<'t, 'q> { } impl<'t, 'q> QueryLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec) -> heed::Result> { + fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { let mut inner = Vec::with_capacity(queries.len()); for query in queries { - if let Some(word_level_iterator) = WordLevelIterator::new(ctx, query)? { - inner.push(word_level_iterator); + match &query.kind { + QueryKind::Exact { word, .. } => { + if !query.prefix || ctx.in_prefix_cache(&word) { + let word = Cow::Borrowed(query.kind.word()); + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? { + inner.push(word_level_iterator); + } + } else { + for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? 
{ + let word = Cow::Owned(word.to_owned()); + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { + inner.push(word_level_iterator); + } + } + } + }, + QueryKind::Tolerant { typo, word } => { + for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { + let word = Cow::Owned(word.to_owned()); + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { + inner.push(word_level_iterator); + } + } + } } } @@ -346,13 +365,14 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {} fn initialize_query_level_iterators<'t, 'q>( ctx: &'t dyn Context<'t>, branches: &'q Vec>>, -) -> heed::Result>> { + wdcache: &mut WordDerivationsCache, +) -> anyhow::Result>> { let mut positions = BinaryHeap::with_capacity(branches.len()); for branch in branches { let mut branch_positions = Vec::with_capacity(branch.len()); for query in branch { - match QueryLevelIterator::new(ctx, query)? { + match QueryLevelIterator::new(ctx, query, wdcache)? { Some(qli) => branch_positions.push(qli), None => { // the branch seems to be invalid, so we skip it. 
@@ -393,9 +413,10 @@ fn set_compute_candidates<'t>( ctx: &'t dyn Context<'t>, branches: &Vec>>, allowed_candidates: &RoaringBitmap, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { - let mut branches_heap = initialize_query_level_iterators(ctx, branches)?; + let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); while let Some(mut branch) = branches_heap.peek_mut() { From b3e2280bb93fa4806229c8ee4188c4903b654887 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 1 Apr 2021 19:02:13 +0200 Subject: [PATCH 25/45] Debug attribute criterion * debug folding when initializing iterators --- milli/src/search/criteria/attribute.rs | 28 ++++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 87f9d4dde..d96ec493f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -51,7 +51,7 @@ impl<'t> Criterion for Attribute<'t> { flatten_query_tree(&qt) }); - let found_candidates = if candidates.len() < 1000 { + let found_candidates = if candidates.len() < 1_000 { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { @@ -322,10 +322,10 @@ struct Branch<'t, 'q> { impl<'t, 'q> Branch<'t, 'q> { fn cmp(&self, other: &Self) -> Ordering { - fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((1..branch_size).sum()) / branch_size } + fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((0..branch_size).sum()) / branch_size } match (&self.last_result, &other.last_result) { (Some((s_left, _, _)), Some((o_left, _, _))) => { - // we compute a rank form the left interval. + // we compute a rank from the left interval. 
let self_rank = compute_rank(*s_left, self.branch_size); let other_rank = compute_rank(*o_left, other.branch_size); let left_cmp = self_rank.cmp(&other_rank).reverse(); @@ -371,8 +371,8 @@ fn initialize_query_level_iterators<'t, 'q>( let mut positions = BinaryHeap::with_capacity(branches.len()); for branch in branches { let mut branch_positions = Vec::with_capacity(branch.len()); - for query in branch { - match QueryLevelIterator::new(ctx, query, wdcache)? { + for queries in branch { + match QueryLevelIterator::new(ctx, queries, wdcache)? { Some(qli) => branch_positions.push(qli), None => { // the branch seems to be invalid, so we skip it. @@ -386,10 +386,10 @@ fn initialize_query_level_iterators<'t, 'q>( let folded_query_level_iterators = branch_positions .into_iter() .rev() - .fold(None, |fold: Option, qli| match fold { - Some(mut fold) => { - fold.previous(qli); - Some(fold) + .fold(None, |fold: Option, mut qli| match fold { + Some(fold) => { + qli.previous(fold); + Some(qli) }, None => Some(qli), }); @@ -418,6 +418,7 @@ fn set_compute_candidates<'t>( { let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); + let mut final_candidates = RoaringBitmap::new(); while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; @@ -426,7 +427,8 @@ fn set_compute_candidates<'t>( candidates.intersect_with(&allowed_candidates); if candidates.len() > 0 && is_lowest_level { // we have candidates, but we can't dig deeper, return candidates. - return Ok(std::mem::take(candidates)); + final_candidates = std::mem::take(candidates); + break; } else if candidates.len() > 0 { // we have candidates, lets dig deeper in levels. let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; @@ -441,12 +443,12 @@ fn set_compute_candidates<'t>( } }, // None = no candidates to find. 
- None => return Ok(RoaringBitmap::new()), + None => break, } + } - // we made all iterations without finding anything. - Ok(RoaringBitmap::new()) + Ok(final_candidates) } fn linear_compute_candidates( From 17c8c6f945bdffebdf6a935160566f9e6deaa8be Mon Sep 17 00:00:00 2001 From: many Date: Tue, 6 Apr 2021 15:03:41 +0200 Subject: [PATCH 26/45] Make set algorithm return None when nothing can be returned --- milli/src/search/criteria/attribute.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index d96ec493f..12c6b36b8 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -71,7 +71,18 @@ impl<'t> Criterion for Attribute<'t> { }, } } else { - set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? + let found_candidates = set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)?; + + match found_candidates { + Some(candidates) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + } }; candidates.difference_with(&found_candidates); @@ -414,11 +425,11 @@ fn set_compute_candidates<'t>( branches: &Vec>>, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> anyhow::Result> { let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); - let mut final_candidates = RoaringBitmap::new(); + let mut final_candidates = None; while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; @@ -427,7 +438,7 @@ fn set_compute_candidates<'t>( candidates.intersect_with(&allowed_candidates); if candidates.len() > 0 && is_lowest_level { // we have candidates, but we can't dig 
deeper, return candidates. - final_candidates = std::mem::take(candidates); + final_candidates = Some(std::mem::take(candidates)); break; } else if candidates.len() > 0 { // we have candidates, lets dig deeper in levels. From 0efa011e0965fa6e6a1da630d6a3c1cead9ba0e4 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 12 Apr 2021 11:19:25 +0200 Subject: [PATCH 27/45] Make a small code clean-up --- milli/src/search/criteria/attribute.rs | 90 ++++++++++++++------------ 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 12c6b36b8..af3e08af1 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -326,30 +326,25 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { struct Branch<'t, 'q> { query_level_iterator: QueryLevelIterator<'t, 'q>, - last_result: Option<(u32, u32, RoaringBitmap)>, + last_result: (u32, u32, RoaringBitmap), tree_level: TreeLevel, branch_size: u32, } impl<'t, 'q> Branch<'t, 'q> { fn cmp(&self, other: &Self) -> Ordering { - fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((0..branch_size).sum()) / branch_size } - match (&self.last_result, &other.last_result) { - (Some((s_left, _, _)), Some((o_left, _, _))) => { - // we compute a rank from the left interval. - let self_rank = compute_rank(*s_left, self.branch_size); - let other_rank = compute_rank(*o_left, other.branch_size); - let left_cmp = self_rank.cmp(&other_rank).reverse(); - // on level: higher is better, - // we want to reduce highest levels first. - let level_cmp = self.tree_level.cmp(&other.tree_level); + let compute_rank = |left: u32, branch_size: u32| left.saturating_sub((0..branch_size).sum()) / branch_size; + let (s_left, _, _) = self.last_result; + let (o_left, _, _) = other.last_result; + // we compute a rank from the left interval. 
+ let self_rank = compute_rank(s_left, self.branch_size); + let other_rank = compute_rank(o_left, other.branch_size); + let left_cmp = self_rank.cmp(&other_rank).reverse(); + // on level: higher is better, + // we want to reduce highest levels first. + let level_cmp = self.tree_level.cmp(&other.tree_level); - left_cmp.then(level_cmp) - }, - (Some(_), None) => Ordering::Greater, - (None, Some(_)) => Ordering::Less, - (None, None) => Ordering::Equal, - } + left_cmp.then(level_cmp) } } @@ -407,13 +402,15 @@ fn initialize_query_level_iterators<'t, 'q>( if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { let (tree_level, last_result) = folded_query_level_iterators.next()?; - let branch = Branch { - last_result, - tree_level, - query_level_iterator: folded_query_level_iterators, - branch_size: branch.len() as u32, - }; - positions.push(branch); + if let Some(last_result) = last_result { + let branch = Branch { + last_result, + tree_level, + query_level_iterator: folded_query_level_iterators, + branch_size: branch.len() as u32, + }; + positions.push(branch); + } } } @@ -433,28 +430,35 @@ fn set_compute_candidates<'t>( while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; - match branch.last_result.as_mut() { - Some((_, _, candidates)) => { - candidates.intersect_with(&allowed_candidates); - if candidates.len() > 0 && is_lowest_level { - // we have candidates, but we can't dig deeper, return candidates. - final_candidates = Some(std::mem::take(candidates)); - break; - } else if candidates.len() > 0 { - // we have candidates, lets dig deeper in levels. - let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; - let (tree_level, last_result) = query_level_iterator.next()?; + let (_, _, candidates) = &mut branch.last_result; + candidates.intersect_with(&allowed_candidates); + if candidates.is_empty() { + // we don't have candidates, get next interval. 
+ match branch.query_level_iterator.next()? { + (_, Some(last_result)) => { + branch.last_result = last_result; + }, + // TODO clean up this + (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, + } + + } + else if is_lowest_level { + // we have candidates, but we can't dig deeper, return candidates. + final_candidates = Some(take(candidates)); + break; + } else { + // we have candidates, lets dig deeper in levels. + let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; + match query_level_iterator.next()? { + (tree_level, Some(last_result)) => { branch.query_level_iterator = query_level_iterator; branch.tree_level = tree_level; branch.last_result = last_result; - } else { - // we don't have candidates, get next interval. - let (_, last_result) = branch.query_level_iterator.next()?; - branch.last_result = last_result; - } - }, - // None = no candidates to find. - None => break, + }, + // TODO clean up this + (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, + } } } From 2b036449be4f2c4a1ca15d5b4d1cfed3a6828e07 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 13 Apr 2021 15:06:12 +0200 Subject: [PATCH 28/45] Fix the return of equal candidates in different pages --- milli/src/search/criteria/attribute.rs | 79 +++++++++++++++++--------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index af3e08af1..8d150730f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,5 +1,6 @@ use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; use std::collections::{BTreeMap, HashMap, btree_map}; +use std::collections::binary_heap::PeekMut; use std::mem::take; use roaring::RoaringBitmap; @@ -332,13 +333,26 @@ struct Branch<'t, 'q> { } impl<'t, 'q> Branch<'t, 'q> { - fn cmp(&self, other: &Self) -> Ordering { - let compute_rank = 
|left: u32, branch_size: u32| left.saturating_sub((0..branch_size).sum()) / branch_size; - let (s_left, _, _) = self.last_result; - let (o_left, _, _) = other.last_result; + fn next(&mut self) -> heed::Result { + match self.query_level_iterator.next()? { + (tree_level, Some(last_result)) => { + self.last_result = last_result; + self.tree_level = tree_level; + Ok(true) + }, + (_, None) => Ok(false), + } + } + + fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. - let self_rank = compute_rank(s_left, self.branch_size); - let other_rank = compute_rank(o_left, other.branch_size); + let (left, _, _) = self.last_result; + left.saturating_sub((0..self.branch_size).sum()) * 60 / self.branch_size + } + + fn cmp(&self, other: &Self) -> Ordering { + let self_rank = self.compute_rank(); + let other_rank = other.compute_rank(); let left_cmp = self_rank.cmp(&other_rank).reverse(); // on level: higher is better, // we want to reduce highest levels first. @@ -426,44 +440,53 @@ fn set_compute_candidates<'t>( { let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); - let mut final_candidates = None; + let mut final_candidates: Option<(u32, RoaringBitmap)> = None; while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; + let branch_rank = branch.compute_rank(); let (_, _, candidates) = &mut branch.last_result; candidates.intersect_with(&allowed_candidates); if candidates.is_empty() { // we don't have candidates, get next interval. - match branch.query_level_iterator.next()? { - (_, Some(last_result)) => { - branch.last_result = last_result; - }, - // TODO clean up this - (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, - } - + if !branch.next()? { PeekMut::pop(branch); } } else if is_lowest_level { // we have candidates, but we can't dig deeper, return candidates. 
- final_candidates = Some(take(candidates)); - break; + final_candidates = match final_candidates.take() { + Some((best_rank, mut best_candidates)) => { + // if current is worst than best we break to return + // candidates that correspond to the best rank + if branch_rank > best_rank { + final_candidates = Some((best_rank, best_candidates)); + break; + // else we add current candidates to best candidates + // and we fetch the next page + } else { + best_candidates.union_with(candidates); + if !branch.next()? { PeekMut::pop(branch); } + Some((best_rank, best_candidates)) + } + }, + // we take current candidates as best candidates + // and we fetch the next page + None => { + let candidates = take(candidates); + if !branch.next()? { PeekMut::pop(branch); } + Some((branch_rank, candidates)) + }, + }; } else { // we have candidates, lets dig deeper in levels. - let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; - match query_level_iterator.next()? { - (tree_level, Some(last_result)) => { - branch.query_level_iterator = query_level_iterator; - branch.tree_level = tree_level; - branch.last_result = last_result; - }, - // TODO clean up this - (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, - } + branch.query_level_iterator = branch.query_level_iterator.dig(ctx)?; + if !branch.next()? 
{ PeekMut::pop(branch); } } } - Ok(final_candidates) + Ok(final_candidates.map(|(_rank, candidates)| { + candidates + })) } fn linear_compute_candidates( From f8537900168265841993c4eb0bbd3fcc539a76b4 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 13 Apr 2021 18:25:38 +0200 Subject: [PATCH 29/45] Use the LCM of 10 first numbers to compute attribute rank --- milli/src/search/criteria/attribute.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 8d150730f..5ab60c58d 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -11,6 +11,10 @@ use crate::search::query_tree::{Operation, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; use super::{Criterion, CriterionResult, Context, resolve_query_tree}; +/// To be able to divide integers by the number of words in the query +/// we want to find a multiplier that allow us to divide by any number between 1 and 10. +/// We Choosed the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). +const LCM_10_FIRST_NUMBERS: u32 = 2520; pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, @@ -347,7 +351,7 @@ impl<'t, 'q> Branch<'t, 'q> { fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. let (left, _, _) = self.last_result; - left.saturating_sub((0..self.branch_size).sum()) * 60 / self.branch_size + left.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size } fn cmp(&self, other: &Self) -> Ordering { @@ -545,7 +549,7 @@ fn linear_compute_candidates( // we substract the word index to the position. 
let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); // here we do the means of the words of the branch - min_rank = min_rank.min(branch_rank / branch_len as u64); + min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); } } From 716c8e22b0bb82a65f2d9320af8d3d68ffc9a79f Mon Sep 17 00:00:00 2001 From: many Date: Thu, 15 Apr 2021 10:44:27 +0200 Subject: [PATCH 30/45] Add style and comments --- milli/src/search/criteria/attribute.rs | 51 +++++++++++++++----------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 5ab60c58d..2672169de 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -15,6 +15,7 @@ use super::{Criterion, CriterionResult, Context, resolve_query_tree}; /// we want to find a multiplier that allow us to divide by any number between 1 and 10. /// We Choosed the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). const LCM_10_FIRST_NUMBERS: u32 = 2520; + pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, @@ -134,6 +135,9 @@ impl<'t> Criterion for Attribute<'t> { } } +/// WordLevelIterator is an pseudo-Iterator over intervals of word-position for one word, +/// it will begin at the first non-empty interval and will return every interval without +/// jumping over empty intervals. struct WordLevelIterator<'t, 'q> { inner: Box> + 't>, level: TreeLevel, @@ -197,12 +201,14 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { } } +/// QueryLevelIterator is an pseudo-Iterator for a Query, +/// It contains WordLevelIterators and is chainned with other QueryLevelIterator. 
struct QueryLevelIterator<'t, 'q> { - previous: Option>>, + parent: Option>>, inner: Vec>, level: TreeLevel, accumulator: Vec>, - previous_accumulator: Vec>, + parent_accumulator: Vec>, } impl<'t, 'q> QueryLevelIterator<'t, 'q> { @@ -239,26 +245,27 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone()); match highest { Some(level) => Ok(Some(Self { - previous: None, + parent: None, inner, level, accumulator: vec![], - previous_accumulator: vec![], + parent_accumulator: vec![], })), None => Ok(None), } } - fn previous(&mut self, previous: QueryLevelIterator<'t, 'q>) -> &Self { - self.previous = Some(Box::new(previous)); + fn parent(&mut self, parent: QueryLevelIterator<'t, 'q>) -> &Self { + self.parent = Some(Box::new(parent)); self } + /// create a new QueryLevelIterator with a lower level than the current one. fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result { - let (level, previous) = match &self.previous { - Some(previous) => { - let previous = previous.dig(ctx)?; - (previous.level.min(self.level), Some(Box::new(previous))) + let (level, parent) = match &self.parent { + Some(parent) => { + let parent = parent.dig(ctx)?; + (parent.level.min(self.level), Some(Box::new(parent))) }, None => (self.level.saturating_sub(1), None), }; @@ -268,7 +275,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { inner.push(word_level_iterator.dig(ctx, &level)?); } - Ok(Self {previous, inner, level, accumulator: vec![], previous_accumulator: vec![]}) + Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![]}) } @@ -295,29 +302,31 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { Ok(accumulated) } + /// return the next meta-interval created from inner WordLevelIterators, + /// and from eventual chainned QueryLevelIterator. 
fn next(&mut self) -> heed::Result<(TreeLevel, Option<(u32, u32, RoaringBitmap)>)> { - let previous_result = match self.previous.as_mut() { - Some(previous) => { - Some(previous.next()?) + let parent_result = match self.parent.as_mut() { + Some(parent) => { + Some(parent.next()?) }, None => None, }; - match previous_result { - Some((previous_level, previous_next)) => { - let inner_next = self.inner_next(previous_level)?; + match parent_result { + Some((parent_level, parent_next)) => { + let inner_next = self.inner_next(parent_level)?; self.accumulator.push(inner_next); - self.previous_accumulator.push(previous_next); + self.parent_accumulator.push(parent_next); // TODO @many clean firsts intervals of both accumulators when both RoaringBitmap are empty, // WARNING the cleaned intervals count needs to be kept to skip at the end let mut merged_interval = None; - for current in self.accumulator.iter().rev().zip(self.previous_accumulator.iter()) { + for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()) { if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { let (_, _, merged_docids) = merged_interval.get_or_insert_with(|| (left_a + left_b, right_a + right_b, RoaringBitmap::new())); merged_docids.union_with(&(a & b)); } } - Ok((previous_level, merged_interval)) + Ok((parent_level, merged_interval)) }, None => { let level = self.level.clone(); @@ -412,7 +421,7 @@ fn initialize_query_level_iterators<'t, 'q>( .rev() .fold(None, |fold: Option, mut qli| match fold { Some(fold) => { - qli.previous(fold); + qli.parent(fold); Some(qli) }, None => Some(qli), From e77291a6f3065824779d630c9fd4449869b338ee Mon Sep 17 00:00:00 2001 From: many Date: Thu, 15 Apr 2021 12:22:44 +0200 Subject: [PATCH 31/45] Optimize Atrribute criterion on big requests --- milli/src/search/criteria/attribute.rs | 160 +++++++++++++++---------- 1 file changed, 97 insertions(+), 63 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs 
b/milli/src/search/criteria/attribute.rs index 2672169de..745d8cdb0 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -101,7 +101,7 @@ impl<'t> Criterion for Attribute<'t> { }, (Some(qt), None) => { let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; - self.bucket_candidates.union_with(&query_tree_candidates); + self.bucket_candidates |= &query_tree_candidates; self.candidates = Some(query_tree_candidates); }, (None, Some(_)) => { @@ -123,7 +123,7 @@ impl<'t> Criterion for Attribute<'t> { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); + self.bucket_candidates |= bucket_candidates; self.flattened_query_tree = None; self.current_buckets = None; }, @@ -160,14 +160,12 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { } } - fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel) -> heed::Result { + fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option) -> heed::Result { let level = level.min(&self.level).clone(); let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; - // TODO try to dig starting from the current interval - // let left = self.current_interval.map(|(left, _)| left); - let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; + let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) } @@ -209,6 +207,7 @@ struct QueryLevelIterator<'t, 'q> { level: TreeLevel, accumulator: Vec>, parent_accumulator: Vec>, + interval_to_skip: usize, } impl<'t, 'q> QueryLevelIterator<'t, 'q> { @@ -250,6 +249,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { level, accumulator: 
vec![], parent_accumulator: vec![], + interval_to_skip: 0, })), None => Ok(None), } @@ -270,16 +270,15 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { None => (self.level.saturating_sub(1), None), }; + let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten(); let mut inner = Vec::with_capacity(self.inner.len()); for word_level_iterator in self.inner.iter() { - inner.push(word_level_iterator.dig(ctx, &level)?); + inner.push(word_level_iterator.dig(ctx, &level, left_interval)?); } - Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![]}) + Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0}) } - - fn inner_next(&mut self, level: TreeLevel) -> heed::Result> { let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None; let u8_level = Into::::into(level); @@ -289,12 +288,13 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { let accumulated_count = 4u32.pow((u8_level - wli_u8_level) as u32); for _ in 0..accumulated_count { if let Some((next_left, _, next_docids)) = wli.next()? { - accumulated = accumulated.take().map( - |(acc_left, acc_right, mut acc_docids)| { - acc_docids.union_with(&next_docids); - (acc_left, acc_right, acc_docids) - } - ).or_else(|| Some((next_left, next_left + interval_size, next_docids))); + accumulated = match accumulated.take(){ + Some((acc_left, acc_right, mut acc_docids)) => { + acc_docids |= next_docids; + Some((acc_left, acc_right, acc_docids)) + }, + None => Some((next_left, next_left + interval_size, next_docids)), + }; } } } @@ -304,35 +304,59 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { /// return the next meta-interval created from inner WordLevelIterators, /// and from eventual chainned QueryLevelIterator. 
- fn next(&mut self) -> heed::Result<(TreeLevel, Option<(u32, u32, RoaringBitmap)>)> { + fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result> { let parent_result = match self.parent.as_mut() { Some(parent) => { - Some(parent.next()?) + Some(parent.next(allowed_candidates, tree_level)?) }, None => None, }; match parent_result { - Some((parent_level, parent_next)) => { - let inner_next = self.inner_next(parent_level)?; + Some(parent_next) => { + let inner_next = self.inner_next(tree_level)?; + self.interval_to_skip += self.accumulator.iter().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip).take_while(|current| { + match current { + (Some((_, _, inner)), Some((_, _, parent))) => { + inner.is_disjoint(allowed_candidates) && parent.is_empty() + }, + (Some((_, _, inner)), None) => { + inner.is_disjoint(allowed_candidates) + }, + (None, Some((_, _, parent))) => { + parent.is_empty() + }, + (None, None) => true, + } + }).count(); self.accumulator.push(inner_next); self.parent_accumulator.push(parent_next); - // TODO @many clean firsts intervals of both accumulators when both RoaringBitmap are empty, - // WARNING the cleaned intervals count needs to be kept to skip at the end - let mut merged_interval = None; - for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()) { + let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; + + for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) { if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { - let (_, _, merged_docids) = merged_interval.get_or_insert_with(|| (left_a + left_b, right_a + right_b, RoaringBitmap::new())); - merged_docids.union_with(&(a & b)); + match merged_interval.as_mut() { + Some((_, _, merged_docids)) => *merged_docids |= a & b, + None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)), + } } } - Ok((parent_level, 
merged_interval)) + Ok(merged_interval) }, None => { - let level = self.level.clone(); - let next_interval = self.inner_next(level.clone())?; - self.accumulator = vec![next_interval.clone()]; - Ok((level, next_interval)) + let level = self.level; + match self.inner_next(level)? { + Some((left, right, mut candidates)) => { + self.accumulator = vec![Some((left, right, RoaringBitmap::new()))]; + candidates &= allowed_candidates; + Ok(Some((left, right, candidates))) + + }, + None => { + self.accumulator = vec![None]; + Ok(None) + }, + } } } } @@ -346,17 +370,31 @@ struct Branch<'t, 'q> { } impl<'t, 'q> Branch<'t, 'q> { - fn next(&mut self) -> heed::Result { - match self.query_level_iterator.next()? { - (tree_level, Some(last_result)) => { + fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result { + let tree_level = self.query_level_iterator.level; + match self.query_level_iterator.next(allowed_candidates, tree_level)? { + Some(last_result) => { self.last_result = last_result; self.tree_level = tree_level; Ok(true) }, - (_, None) => Ok(false), + None => Ok(false), } } + fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> { + self.query_level_iterator = self.query_level_iterator.dig(ctx)?; + Ok(()) + } + + fn lazy_next(&mut self) { + let u8_level = Into::::into(self.tree_level.clone()); + let interval_size = 4u32.pow(u8_level as u32); + let (left, right, _) = self.last_result; + + self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); + } + fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. let (left, _, _) = self.last_result; @@ -367,11 +405,11 @@ impl<'t, 'q> Branch<'t, 'q> { let self_rank = self.compute_rank(); let other_rank = other.compute_rank(); let left_cmp = self_rank.cmp(&other_rank).reverse(); - // on level: higher is better, - // we want to reduce highest levels first. 
- let level_cmp = self.tree_level.cmp(&other.tree_level); + // on level: lower is better, + // we want to dig faster into levels on interesting branches. + let level_cmp = self.tree_level.cmp(&other.tree_level).reverse(); - left_cmp.then(level_cmp) + left_cmp.then(level_cmp).then(self.last_result.2.len().cmp(&other.last_result.2.len())) } } @@ -398,6 +436,7 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {} fn initialize_query_level_iterators<'t, 'q>( ctx: &'t dyn Context<'t>, branches: &'q Vec>>, + allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result>> { @@ -418,7 +457,6 @@ fn initialize_query_level_iterators<'t, 'q>( branch_positions.sort_unstable_by_key(|qli| qli.level); let folded_query_level_iterators = branch_positions .into_iter() - .rev() .fold(None, |fold: Option, mut qli| match fold { Some(fold) => { qli.parent(fold); @@ -428,7 +466,8 @@ fn initialize_query_level_iterators<'t, 'q>( }); if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { - let (tree_level, last_result) = folded_query_level_iterators.next()?; + let tree_level = folded_query_level_iterators.level; + let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; if let Some(last_result) = last_result { let branch = Branch { last_result, @@ -451,48 +490,43 @@ fn set_compute_candidates<'t>( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { - let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; + let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; let lowest_level = TreeLevel::min_value(); let mut final_candidates: Option<(u32, RoaringBitmap)> = None; + let mut allowed_candidates = allowed_candidates.clone(); while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; let branch_rank = branch.compute_rank(); - let (_, _, candidates) = &mut branch.last_result; - 
candidates.intersect_with(&allowed_candidates); + // if current is worst than best we break to return + // candidates that correspond to the best rank + if let Some((best_rank, _)) = final_candidates { if branch_rank > best_rank { break; } } + let _left = branch.last_result.0; + let candidates = take(&mut branch.last_result.2); if candidates.is_empty() { // we don't have candidates, get next interval. - if !branch.next()? { PeekMut::pop(branch); } + if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } } else if is_lowest_level { - // we have candidates, but we can't dig deeper, return candidates. + // we have candidates, but we can't dig deeper. + allowed_candidates -= &candidates; final_candidates = match final_candidates.take() { + // we add current candidates to best candidates Some((best_rank, mut best_candidates)) => { - // if current is worst than best we break to return - // candidates that correspond to the best rank - if branch_rank > best_rank { - final_candidates = Some((best_rank, best_candidates)); - break; - // else we add current candidates to best candidates - // and we fetch the next page - } else { - best_candidates.union_with(candidates); - if !branch.next()? { PeekMut::pop(branch); } - Some((best_rank, best_candidates)) - } + best_candidates |= candidates; + branch.lazy_next(); + Some((best_rank, best_candidates)) }, // we take current candidates as best candidates - // and we fetch the next page None => { - let candidates = take(candidates); - if !branch.next()? { PeekMut::pop(branch); } + branch.lazy_next(); Some((branch_rank, candidates)) }, }; } else { // we have candidates, lets dig deeper in levels. - branch.query_level_iterator = branch.query_level_iterator.dig(ctx)?; - if !branch.next()? { PeekMut::pop(branch); } + branch.dig(ctx)?; + if !branch.next(&allowed_candidates)? 
{ PeekMut::pop(branch); } } } From 71740805a7c2f45eff9db63f5ae4e4705352d189 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 21 Apr 2021 11:44:29 +0200 Subject: [PATCH 32/45] Fix forgotten typo tests --- Cargo.lock | 14 ++++++++++++-- milli/Cargo.toml | 2 +- milli/src/search/criteria/typo.rs | 5 +++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 065be362f..0e42f60f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1005,7 +1005,7 @@ dependencies = [ "heed", "jemallocator", "milli", - "roaring", + "roaring 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json", "stderrlog", "structopt", @@ -1287,7 +1287,7 @@ dependencies = [ "rand 0.8.3", "rayon", "regex", - "roaring", + "roaring 0.6.5 (git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops)", "serde", "serde_json", "slice-group-by", @@ -1973,6 +1973,16 @@ dependencies = [ "retain_mut", ] +[[package]] +name = "roaring" +version = "0.6.5" +source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops#6689f8c9dd2efdbfde4442d4d803e87169780593" +dependencies = [ + "bytemuck", + "byteorder", + "retain_mut", +] + [[package]] name = "rustc_version" version = "0.2.3" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ef9c64b7b..b54c0d768 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -27,7 +27,7 @@ once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" regex = "1.4.3" -roaring = "0.6.5" +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "optimize-ops" } serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } slice-group-by = "0.2.6" diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index bf58fa258..5a3c93ac8 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -328,6 +328,7 @@ mod test { let parent = Initial::new(query_tree, facet_candidates); let mut 
criteria = Typo::new(&context, Box::new(parent)); + assert!(criteria.next(&mut wdcache).unwrap().unwrap().candidates.is_none()); assert!(criteria.next(&mut wdcache).unwrap().is_none()); } @@ -440,7 +441,7 @@ mod test { ]), ])), candidates: Some(&candidates_1 & &facet_candidates), - bucket_candidates: candidates_1 & &facet_candidates, + bucket_candidates: facet_candidates.clone(), }; assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); @@ -462,7 +463,7 @@ mod test { ]), ])), candidates: Some(&candidates_2 & &facet_candidates), - bucket_candidates: candidates_2 & &facet_candidates, + bucket_candidates: RoaringBitmap::new(), }; assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); From 0d7d3ce802d4e1ef5226bb90d1bc65f140fdf104 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 21 Apr 2021 11:53:07 +0200 Subject: [PATCH 33/45] Update roaring package --- Cargo.lock | 18 ++++-------------- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e42f60f4..6a30891ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1005,7 +1005,7 @@ dependencies = [ "heed", "jemallocator", "milli", - "roaring 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", + "roaring", "serde_json", "stderrlog", "structopt", @@ -1287,7 +1287,7 @@ dependencies = [ "rand 0.8.3", "rayon", "regex", - "roaring 0.6.5 (git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops)", + "roaring", "serde", "serde_json", "slice-group-by", @@ -1964,19 +1964,9 @@ checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1" [[package]] name = "roaring" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6744a4a918e91359ad1d356a91e2e943a86d9fb9ae77f715d617032ea2af88f" -dependencies = [ - "bytemuck", - "byteorder", - "retain_mut", -] - -[[package]] -name = "roaring" -version = "0.6.5" -source = 
"git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops#6689f8c9dd2efdbfde4442d4d803e87169780593" +checksum = "a4b2e7ab0bbb2d144558ae3f4761a0db06d21463b45756fc64c3393cdba3d447" dependencies = [ "bytemuck", "byteorder", diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 59cfbd661..8b5867fde 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -11,7 +11,7 @@ csv = "1.1.5" heed = "0.10.6" jemallocator = "0.3.2" milli = { path = "../milli" } -roaring = "0.6.5" +roaring = "0.6.6" serde_json = "1.0.62" stderrlog = "0.5.1" structopt = { version = "0.3.21", default-features = false } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b54c0d768..8b359a09b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -27,7 +27,7 @@ once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" regex = "1.4.3" -roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "optimize-ops" } +roaring = "0.6.6" serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } slice-group-by = "0.2.6" From 0daa0e170ac5f81d517aca2384ec4fcb237fe76e Mon Sep 17 00:00:00 2001 From: Many Date: Mon, 26 Apr 2021 11:30:42 +0200 Subject: [PATCH 34/45] Fix PR comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- infos/src/main.rs | 3 +-- milli/src/search/criteria/attribute.rs | 2 +- milli/src/search/criteria/final.rs | 6 +----- milli/src/search/criteria/mod.rs | 10 +++++++++- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 5a12a9d4d..902394af8 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -354,8 +354,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, - documents, - .. 
+ documents } = index; let main_name = "main"; diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 745d8cdb0..18a18816c 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -13,7 +13,7 @@ use super::{Criterion, CriterionResult, Context, resolve_query_tree}; /// To be able to divide integers by the number of words in the query /// we want to find a multiplier that allow us to divide by any number between 1 and 10. -/// We Choosed the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). +/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). const LCM_10_FIRST_NUMBERS: u32 = 2520; pub struct Attribute<'t> { diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index d3c394467..f8bc43204 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -44,11 +44,7 @@ impl<'t> Final<'t> { bucket_candidates.union_with(&candidates); - return Ok(Some(FinalResult { - query_tree, - candidates, - bucket_candidates, - })); + return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); }, None => return Ok(None), } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index b972a0b2c..d3eac94fd 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -123,7 +123,15 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { Ok(words_positions) } - fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>> { + fn word_position_iterator( + &self, + word: &str, + level: TreeLevel, + in_prefix_cache: bool, + left: Option, + right: Option + ) -> heed::Result> + 'c>> + { let range = { let left = left.unwrap_or(u32::min_value()); let right = 
right.unwrap_or(u32::max_value()); From 47d780b8ce43fad2631efb473e25b5ea12992476 Mon Sep 17 00:00:00 2001 From: Many Date: Mon, 26 Apr 2021 14:51:52 +0200 Subject: [PATCH 35/45] Update milli/src/search/criteria/mod.rs Co-authored-by: Irevoire --- milli/src/search/criteria/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d3eac94fd..01af1ffbd 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -130,7 +130,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { in_prefix_cache: bool, left: Option, right: Option - ) -> heed::Result> + 'c>> + ) -> heed::Result> + 'c>> { let range = { let left = left.unwrap_or(u32::min_value()); From 0e4e6dfada834728644df9a5ab5afb8698a6c85f Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:29:52 +0200 Subject: [PATCH 36/45] Update milli/src/search/criteria/proximity.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/proximity.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index ca412bf28..4c73d7459 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -30,7 +30,7 @@ impl<'t> Proximity<'t> { state: None, proximity: 0, bucket_candidates: RoaringBitmap::new(), - parent: parent, + parent, candidates_cache: Cache::new(), plane_sweep_cache: None, } From 498c2b298c795810726c18f8c95ef16ce490c02d Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:30:02 +0200 Subject: [PATCH 37/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 18a18816c..820085c31 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -57,7 +57,7 @@ impl<'t> Criterion for Attribute<'t> { flatten_query_tree(&qt) }); - let found_candidates = if candidates.len() < 1_000 { + let found_candidates = if candidates.len() < 1000 { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { From b3d6c6a9a0e8447daefa92b022c03fe39d5b08e3 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:31:13 +0200 Subject: [PATCH 38/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 820085c31..31725e221 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -77,9 +77,7 @@ impl<'t> Criterion for Attribute<'t> { }, } } else { - let found_candidates = set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)?; - - match found_candidates { + match set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? 
{ Some(candidates) => candidates, None => { return Ok(Some(CriterionResult { From e92d13767667f7ee5c4bfca6fca339d8c26c5e85 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:31:42 +0200 Subject: [PATCH 39/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 31725e221..a1a31247b 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -453,7 +453,7 @@ fn initialize_query_level_iterators<'t, 'q>( } // QueryLevelIterator need to be sorted by level and folded in descending order. branch_positions.sort_unstable_by_key(|qli| qli.level); - let folded_query_level_iterators = branch_positions + let folded_query_level_iterators = branch_positions .into_iter() .fold(None, |fold: Option, mut qli| match fold { Some(fold) => { From c862b1bc6be8c364a474104139f90272ccd1787f Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:32:10 +0200 Subject: [PATCH 40/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index a1a31247b..c7d10e431 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -465,7 +465,7 @@ fn initialize_query_level_iterators<'t, 'q>( if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { let tree_level = folded_query_level_iterators.level; - let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; 
+ let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; if let Some(last_result) = last_result { let branch = Branch { last_result, From 3b1358b62f539ec6f74a74deb40850dbff6ba34f Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:32:19 +0200 Subject: [PATCH 41/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index c7d10e431..8f2e34ca9 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -554,7 +554,7 @@ fn linear_compute_candidates( QueryKind::Exact { word, .. } => { if *prefix { word_derivations(word, true, 0, &words_positions) - .flat_map(|positions| positions.iter().next()).min() + .flat_map(|positions| positions.iter().next()).min() } else { words_positions.get(word) .map(|positions| positions.iter().next()) From 329bd4a1bbe4ddfe408fb33611f7d7d8e6d91661 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:39:03 +0200 Subject: [PATCH 42/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 8f2e34ca9..62e992fad 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -304,9 +304,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { /// and from eventual chainned QueryLevelIterator. 
fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result> { let parent_result = match self.parent.as_mut() { - Some(parent) => { - Some(parent.next(allowed_candidates, tree_level)?) - }, + Some(parent) => Some(parent.next(allowed_candidates, tree_level)?), None => None, }; From 3794ffc9529d89bf6e965ad0c99eb477a5881bc6 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:39:23 +0200 Subject: [PATCH 43/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 62e992fad..3d7132e77 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -496,7 +496,9 @@ fn set_compute_candidates<'t>( let branch_rank = branch.compute_rank(); // if current is worst than best we break to return // candidates that correspond to the best rank - if let Some((best_rank, _)) = final_candidates { if branch_rank > best_rank { break; } } + if let Some((best_rank, _)) = final_candidates { + if branch_rank > best_rank { break } + } let _left = branch.last_result.0; let candidates = take(&mut branch.last_result.2); if candidates.is_empty() { From 0add4d735c95ed8bcddeb2e2afa853bab7dcf62e Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:40:34 +0200 Subject: [PATCH 44/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 3d7132e77..e1069b5f5 100644 --- 
a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -529,9 +529,7 @@ fn set_compute_candidates<'t>( } - Ok(final_candidates.map(|(_rank, candidates)| { - candidates - })) + Ok(final_candidates.map(|(_rank, candidates)| candidates)) } fn linear_compute_candidates( From 3b7e6afb55e76749bb69b11e5ab15d488bc8924f Mon Sep 17 00:00:00 2001 From: many Date: Wed, 28 Apr 2021 13:53:27 +0200 Subject: [PATCH 45/45] Make some refacto and add documentation --- milli/src/search/criteria/attribute.rs | 64 ++++++++++++++++++-------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index e1069b5f5..bbbc0de1a 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -16,6 +16,10 @@ use super::{Criterion, CriterionResult, Context, resolve_query_tree}; /// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). const LCM_10_FIRST_NUMBERS: u32 = 2520; +/// To compute the interval size of a level, +/// we use 4 as the exponentiation base and the level as the exponent. +const LEVEL_EXPONENTIATION_BASE: u32 = 4; + pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, @@ -150,7 +154,7 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result> { match ctx.word_position_last_level(&word, in_prefix_cache)? 
{ Some(level) => { - let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level.clone()) as u32); let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) }, @@ -160,7 +164,7 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option) -> heed::Result { let level = level.min(&self.level).clone(); - let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level.clone()) as u32); let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; @@ -280,10 +284,10 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { fn inner_next(&mut self, level: TreeLevel) -> heed::Result> { let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None; let u8_level = Into::::into(level); - let interval_size = 4u32.pow(u8_level as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); for wli in self.inner.iter_mut() { let wli_u8_level = Into::::into(wli.level.clone()); - let accumulated_count = 4u32.pow((u8_level - wli_u8_level) as u32); + let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); for _ in 0..accumulated_count { if let Some((next_left, _, next_docids)) = wli.next()? 
{
+                    accumulated = match accumulated.take(){
@@ -311,20 +315,12 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
         match parent_result {
             Some(parent_next) => {
                 let inner_next = self.inner_next(tree_level)?;
-                self.interval_to_skip += self.accumulator.iter().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip).take_while(|current| {
-                    match current {
-                        (Some((_, _, inner)), Some((_, _, parent))) => {
-                            inner.is_disjoint(allowed_candidates) && parent.is_empty()
-                        },
-                        (Some((_, _, inner)), None) => {
-                            inner.is_disjoint(allowed_candidates)
-                        },
-                        (None, Some((_, _, parent))) => {
-                            parent.is_empty()
-                        },
-                        (None, None) => true,
-                    }
-                }).count();
+                self.interval_to_skip += interval_to_skip(
+                    &self.parent_accumulator,
+                    &self.accumulator,
+                    self.interval_to_skip,
+                    allowed_candidates
+                );
                 self.accumulator.push(inner_next);
                 self.parent_accumulator.push(parent_next);
                 let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None;
@@ -358,6 +354,29 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
     }
 }
 
+/// Count the number of intervals that can be skipped when we make the cross-intersections
+/// in order to compute the next meta-interval.
+/// A pair of intervals is skipped when both intervals don't contain any allowed docids.
+fn interval_to_skip(
+    parent_accumulator: &[Option<(u32, u32, RoaringBitmap)>],
+    current_accumulator: &[Option<(u32, u32, RoaringBitmap)>],
+    already_skiped: usize,
+    allowed_candidates: &RoaringBitmap,
+) -> usize {
+    parent_accumulator.into_iter()
+        .zip(current_accumulator.into_iter())
+        .skip(already_skiped)
+        .take_while(|(parent, current)| {
+            let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty());
+            let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates));
+            skip_parent && skip_current
+        })
+        .count()
+
+}
+
+/// A Branch represents a possible alternative of the original query and is built with the Query Tree,
+/// This branch allows us to iterate over meta-intervals of positions and to dig into them if they contain interesting candidates.
 struct Branch<'t, 'q> {
     query_level_iterator: QueryLevelIterator<'t, 'q>,
     last_result: (u32, u32, RoaringBitmap),
@@ -366,6 +385,8 @@ struct Branch<'t, 'q> {
 }
 
 impl<'t, 'q> Branch<'t, 'q> {
+    /// return the next meta-interval of the branch,
+    /// and update inner interval in order to be ranked by the BinaryHeap.
     fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result {
         let tree_level = self.query_level_iterator.level;
         match self.query_level_iterator.next(allowed_candidates, tree_level)? {
@@ -378,19 +399,24 @@ impl<'t, 'q> Branch<'t, 'q> {
         }
     }
 
+    /// make the current Branch iterate over smaller intervals.
     fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> {
         self.query_level_iterator = self.query_level_iterator.dig(ctx)?;
         Ok(())
     }
 
+    /// because the next() method could be time consuming,
+    /// update inner interval in order to be ranked by the binary_heap without computing it,
+    /// the next() method should be called when the real interval is needed.
fn lazy_next(&mut self) { let u8_level = Into::::into(self.tree_level.clone()); - let interval_size = 4u32.pow(u8_level as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); let (left, right, _) = self.last_result; self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); } + /// return the score of the current inner interval. fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. let (left, _, _) = self.last_result;