From 6c659dc12fc0b68c0b1f909ebf7e6c0fba936ce1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 20 Mar 2023 09:41:37 +0100 Subject: [PATCH 001/234] Use MiMalloc in milli tests --- Cargo.lock | 1 + milli/Cargo.toml | 1 + milli/src/lib.rs | 8 ++++++++ 3 files changed, 10 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 853d1a896..0bdad9131 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2645,6 +2645,7 @@ dependencies = [ "maplit", "md5", "memmap2", + "mimalloc", "obkv", "once_cell", "ordered-float", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 04591e8fd..224878cd1 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -58,6 +58,7 @@ logging_timer = "1.1.0" csv = "1.1.6" [dev-dependencies] +mimalloc = { version = "0.1.29", default-features = false } big_s = "1.0.2" insta = "1.21.0" maplit = "1.0.2" diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 865195df5..6de737042 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,7 +1,15 @@ #![cfg_attr(all(test, fuzzing), feature(no_coverage))] +#![allow(unused, clippy::type_complexity)] + +#[cfg(test)] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + #[macro_use] pub mod documents; +pub use search::new; + mod asc_desc; mod criterion; mod error; From 1d937f831b5b471c616414e4dc5d46bb50eb9fe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 20 Mar 2023 09:41:55 +0100 Subject: [PATCH 002/234] Temporarily remove codegen-units - 1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index b1f475410..628c6bcbb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ edition = "2021" license = "MIT" [profile.release] -codegen-units = 1 +# codegen-units = 1 [profile.dev.package.flate2] opt-level = 3 From 2d88089129a89f56e6906710359323a560a0d831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:40:41 +0100 Subject: [PATCH 003/234] Remove unused term matching strategies --- milli/src/search/mod.rs | 9 +-------- milli/src/search/query_tree.rs | 37 ---------------------------------- 2 files changed, 1 insertion(+), 45 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index bd140284d..5e741c7f3 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -36,6 +36,7 @@ mod distinct; pub mod facet; mod fst_utils; mod matches; +pub mod new; mod query_tree; pub struct Search<'a> { @@ -344,14 +345,6 @@ pub enum CriterionImplementationStrategy { pub enum TermsMatchingStrategy { // remove last word first Last, - // remove first word first - First, - // remove more frequent word first - Frequency, - // remove smallest word first - Size, - // only one of the word is mandatory - Any, // all words are mandatory All, } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 541dd8f7a..24e33bdd8 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -487,49 +487,12 @@ fn create_query_tree( for _ in 0..=remove_count { let pos = match terms_matching_strategy { TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false), - TermsMatchingStrategy::Any => { - let operation = Operation::Or( - true, - vec![ - // branch allowing matching documents to contains any query word. - ngrams(ctx, authorize_typos, &query, true)?, - // branch forcing matching documents to contains all the query words, - // keeping this documents of the top of the resulted list. 
- ngrams(ctx, authorize_typos, &query, false)?, - ], - ); - - return Ok(operation); - } TermsMatchingStrategy::Last => query .iter() .enumerate() .filter(|(_, part)| !part.is_phrase()) .last() .map(|(pos, _)| pos), - TermsMatchingStrategy::First => { - query.iter().enumerate().find(|(_, part)| !part.is_phrase()).map(|(pos, _)| pos) - } - TermsMatchingStrategy::Size => query - .iter() - .enumerate() - .filter(|(_, part)| !part.is_phrase()) - .min_by_key(|(_, part)| match part { - PrimitiveQueryPart::Word(s, _) => s.len(), - _ => unreachable!(), - }) - .map(|(pos, _)| pos), - TermsMatchingStrategy::Frequency => query - .iter() - .enumerate() - .filter(|(_, part)| !part.is_phrase()) - .max_by_key(|(_, part)| match part { - PrimitiveQueryPart::Word(s, _) => { - ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value()) - } - _ => unreachable!(), - }) - .map(|(pos, _)| pos), }; // compute and push the current branch on the front From 79e0a6dd4e8fe6a8b0f5eb3f38cf24d444e10c01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:41:58 +0100 Subject: [PATCH 004/234] Introduce a new search module, eventually meant to replace the old one The code here does not compile, because I am merely splitting one giant commit into smaller ones where each commit explains a single file. --- milli/src/search/new/mod.rs | 55 +++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 milli/src/search/new/mod.rs diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs new file mode 100644 index 000000000..17e74e70e --- /dev/null +++ b/milli/src/search/new/mod.rs @@ -0,0 +1,55 @@ +pub mod db_cache; +pub mod graph_based_ranking_rule; +pub mod query_graph; +pub mod query_term; +pub mod ranking_rule_graph; +pub mod ranking_rules; +pub mod resolve_query_graph; +pub mod sort; +pub mod words; + +use charabia::Tokenize; +use heed::RoTxn; +pub use query_graph::*; +pub use ranking_rules::*; +use roaring::RoaringBitmap; + +use self::{ + db_cache::DatabaseCache, + query_term::{word_derivations, LocatedQueryTerm}, +}; +use crate::{Index, Result}; + +pub enum BitmapOrAllRef<'s> { + Bitmap(&'s RoaringBitmap), + All, +} + +pub fn make_query_graph<'transaction>( + index: &Index, + txn: &RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + query: &str, +) -> Result { + assert!(!query.is_empty()); + let fst = index.words_fst(txn).unwrap(); + let query = LocatedQueryTerm::from_query(query.tokenize(), None, |word, is_prefix| { + word_derivations( + index, + txn, + word, + if word.len() < 4 { + 0 + } else if word.len() < 100 { + 1 + } else { + 2 + }, + is_prefix, + &fst, + ) + }) + .unwrap(); + let graph = QueryGraph::from_query(index, txn, db_cache, query)?; + Ok(graph) +} From a83007c0136f5f7011ad1f9669eec1c78f4e568d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:42:54 +0100 Subject: [PATCH 005/234] Introduce structure to represent search queries as graphs --- milli/src/search/new/query_graph.rs | 401 ++++++++++++++++++++++++++++ milli/src/search/new/query_term.rs | 305 +++++++++++++++++++++ 2 files changed, 706 insertions(+) create mode 100644 milli/src/search/new/query_graph.rs create mode 100644 milli/src/search/new/query_term.rs diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs new file mode 100644 index 000000000..726a1460c --- /dev/null +++ b/milli/src/search/new/query_graph.rs @@ -0,0 +1,401 @@ +use std::collections::HashSet; +use std::fmt::Debug; + 
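+// (Rough orientation, inferred from the types defined below: a `QueryGraph` owns
+// the query's `nodes`, a parallel `edges` vector holding each node's `incoming`
+// and `outgoing` neighbour sets, and fixed `root_node`/`end_node` indices into
+// `nodes`.)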
+use heed::RoTxn; + +use super::{ + db_cache::DatabaseCache, + query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}, +}; +use crate::{Index, Result}; + +#[derive(Clone)] +pub enum QueryNode { + Term(LocatedQueryTerm), + Deleted, + Start, + End, +} + +#[derive(Debug, Clone)] +pub struct Edges { + pub incoming: HashSet, + pub outgoing: HashSet, +} + +#[derive(Debug, Clone)] +pub struct QueryGraph { + pub root_node: usize, + pub end_node: usize, + pub nodes: Vec, + pub edges: Vec, +} + +fn _assert_sizes() { + let _: [u8; 112] = [0; std::mem::size_of::()]; + let _: [u8; 96] = [0; std::mem::size_of::()]; +} + +impl Default for QueryGraph { + /// Create a new QueryGraph with two disconnected nodes: the root and end nodes. + fn default() -> Self { + let nodes = vec![QueryNode::Start, QueryNode::End]; + let edges = vec![ + Edges { incoming: HashSet::new(), outgoing: HashSet::new() }, + Edges { incoming: HashSet::new(), outgoing: HashSet::new() }, + ]; + + Self { root_node: 0, end_node: 1, nodes, edges } + } +} + +impl QueryGraph { + fn connect_to_node(&mut self, from_nodes: &[usize], end_node: usize) { + for &from_node in from_nodes { + self.edges[from_node].outgoing.insert(end_node); + self.edges[end_node].incoming.insert(from_node); + } + } + fn add_node(&mut self, from_nodes: &[usize], node: QueryNode) -> usize { + let new_node_idx = self.nodes.len(); + self.nodes.push(node); + self.edges.push(Edges { + incoming: from_nodes.iter().copied().collect(), + outgoing: HashSet::new(), + }); + for from_node in from_nodes { + self.edges[*from_node].outgoing.insert(new_node_idx); + } + new_node_idx + } +} + +impl QueryGraph { + // TODO: return the list of all matching words here as well + + pub fn from_query<'transaction>( + index: &Index, + txn: &RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, + query: Vec, + ) -> Result { + // TODO: maybe empty nodes should not be removed here, to compute + // the score of the `words` ranking rule correctly + // it is very easy to traverse the graph and remove afterwards anyway + // Still, I'm keeping this here as a demo + let mut empty_nodes = vec![]; + + let word_set = index.words_fst(txn)?; + let mut graph = QueryGraph::default(); + + let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = + (vec![], vec![], vec![graph.root_node]); + + // TODO: add all the word derivations found in the fst + // and add split words / support phrases + + for length in 1..=query.len() { + let query = &query[..length]; + + let term0 = query.last().unwrap(); + + let mut new_nodes = vec![]; + let new_node_idx = graph.add_node(&prev0, QueryNode::Term(term0.clone())); + new_nodes.push(new_node_idx); + if term0.is_empty() { + empty_nodes.push(new_node_idx); + } + + if !prev1.is_empty() { + if let Some((ngram2_str, ngram2_pos)) = + LocatedQueryTerm::ngram2(&query[length - 2], &query[length - 1]) + { + if word_set.contains(ngram2_str.as_bytes()) { + let ngram2 = LocatedQueryTerm { + value: QueryTerm::Word { + derivations: WordDerivations { + original: ngram2_str.clone(), + // TODO: could add a typo if it's an ngram? 
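+                                // (Note, as constructed just below: the concatenated
+                                // bigram is registered only as its own zero-typo
+                                // derivation; its one-/two-typo lists stay empty and
+                                // the prefix DB is not consulted for it.)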
+ zero_typo: vec![ngram2_str], + one_typo: vec![], + two_typos: vec![], + use_prefix_db: false, + }, + }, + positions: ngram2_pos, + }; + let ngram2_idx = graph.add_node(&prev1, QueryNode::Term(ngram2)); + new_nodes.push(ngram2_idx); + } + } + } + if !prev2.is_empty() { + if let Some((ngram3_str, ngram3_pos)) = LocatedQueryTerm::ngram3( + &query[length - 3], + &query[length - 2], + &query[length - 1], + ) { + if word_set.contains(ngram3_str.as_bytes()) { + let ngram3 = LocatedQueryTerm { + value: QueryTerm::Word { + derivations: WordDerivations { + original: ngram3_str.clone(), + // TODO: could add a typo if it's an ngram? + zero_typo: vec![ngram3_str], + one_typo: vec![], + two_typos: vec![], + use_prefix_db: false, + }, + }, + positions: ngram3_pos, + }; + let ngram3_idx = graph.add_node(&prev2, QueryNode::Term(ngram3)); + new_nodes.push(ngram3_idx); + } + } + } + (prev0, prev1, prev2) = (new_nodes, prev0, prev1); + } + graph.connect_to_node(&prev0, graph.end_node); + + graph.remove_nodes_keep_edges(&empty_nodes); + + Ok(graph) + } + pub fn remove_nodes(&mut self, nodes: &[usize]) { + for &node in nodes { + self.nodes[node] = QueryNode::Deleted; + let edges = self.edges[node].clone(); + for &pred in edges.incoming.iter() { + self.edges[pred].outgoing.remove(&node); + } + for succ in edges.outgoing { + self.edges[succ].incoming.remove(&node); + } + self.edges[node] = Edges { incoming: HashSet::new(), outgoing: HashSet::new() }; + } + } + pub fn remove_nodes_keep_edges(&mut self, nodes: &[usize]) { + for &node in nodes { + self.nodes[node] = QueryNode::Deleted; + let edges = self.edges[node].clone(); + for &pred in edges.incoming.iter() { + self.edges[pred].outgoing.remove(&node); + self.edges[pred].outgoing.extend(edges.outgoing.iter()); + } + for succ in edges.outgoing { + self.edges[succ].incoming.remove(&node); + self.edges[succ].incoming.extend(edges.incoming.iter()); + } + self.edges[node] = Edges { incoming: HashSet::new(), outgoing: HashSet::new() }; + } + } + pub fn remove_words_at_position(&mut self, position: i8) { + let mut nodes_to_remove_keeping_edges = vec![]; + let mut nodes_to_remove = vec![]; + for (node_idx, node) in self.nodes.iter().enumerate() { + let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue }; + if positions.contains(&position) { + nodes_to_remove_keeping_edges.push(node_idx) + } else if positions.contains(&position) { + nodes_to_remove.push(node_idx) + } + } + + self.remove_nodes(&nodes_to_remove); + self.remove_nodes_keep_edges(&nodes_to_remove_keeping_edges); + + self.simplify(); + } + + fn simplify(&mut self) { + loop { + let mut nodes_to_remove = vec![]; + for (node_idx, node) in self.nodes.iter().enumerate() { + if (!matches!(node, QueryNode::End | QueryNode::Deleted) + && self.edges[node_idx].outgoing.is_empty()) + || (!matches!(node, QueryNode::Start | QueryNode::Deleted) + && self.edges[node_idx].incoming.is_empty()) + { + nodes_to_remove.push(node_idx); + } + } + if nodes_to_remove.is_empty() { + break; + } else { + self.remove_nodes(&nodes_to_remove); + } + } + } +} +impl Debug for QueryNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + QueryNode::Term(term @ LocatedQueryTerm { value, positions: _ }) => match value { + QueryTerm::Word { + derivations: + WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db }, + } => { + if term.is_empty() { + write!(f, "\"{original} (∅)\"") + } else { + let derivations = std::iter::once(original.clone()) + 
.chain(zero_typo.iter().map(|s| format!("T0 .. {s}"))) + .chain(one_typo.iter().map(|s| format!("T1 .. {s}"))) + .chain(two_typos.iter().map(|s| format!("T2 .. {s}"))) + .collect::>() + .join(" | "); + + write!(f, "\"{derivations}")?; + if *use_prefix_db { + write!(f, " | +prefix_db")?; + } + write!(f, " | pos:{}..={}", term.positions.start(), term.positions.end())?; + write!(f, "\"")?; + /* + "beautiful" [label = " beautiful | beauiful | beautifol"] + */ + Ok(()) + } + } + QueryTerm::Phrase(ws) => { + let joined = + ws.iter().filter_map(|x| x.clone()).collect::>().join(" "); + let in_quotes = format!("\"{joined}\""); + let escaped = in_quotes.escape_default().collect::(); + write!(f, "\"{escaped}\"") + } + }, + QueryNode::Start => write!(f, "\"START\""), + QueryNode::End => write!(f, "\"END\""), + QueryNode::Deleted => write!(f, "\"_deleted_\""), + } + } +} + +/* +TODO: + +1. Find the minimum number of words to check to resolve the 10 query trees at once. + (e.g. just 0 | 01 | 012 ) +2. Simplify the query tree after removal of a node ✅ +3. Create the proximity graph ✅ +4. Assign different proximities for the ngrams ✅ +5. Walk the proximity graph, finding all the potential paths of weight N from START to END ✅ +(without checking the bitmaps) + +*/ +impl QueryGraph { + pub fn graphviz(&self) -> String { + let mut desc = String::new(); + desc.push_str( + r#" +digraph G { +rankdir = LR; +node [shape = "record"] +"#, + ); + + for node in 0..self.nodes.len() { + if matches!(self.nodes[node], QueryNode::Deleted) { + continue; + } + desc.push_str(&format!("{node} [label = {:?}]", &self.nodes[node],)); + if node == self.root_node { + desc.push_str("[color = blue]"); + } else if node == self.end_node { + desc.push_str("[color = red]"); + } + desc.push_str(";\n"); + + for edge in self.edges[node].outgoing.iter() { + desc.push_str(&format!("{node} -> {edge};\n")); + } + // for edge in self.edges[node].incoming.iter() { + // desc.push_str(&format!("{node} -> {edge} [color = grey];\n")); + // } + } + + desc.push('}'); + desc + } +} + +#[cfg(test)] +mod tests { + use charabia::Tokenize; + + use super::{LocatedQueryTerm, QueryGraph, QueryNode}; + use crate::index::tests::TempIndex; + use crate::new::db_cache::DatabaseCache; + use crate::search::new::query_term::word_derivations; + + #[test] + fn build_graph() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + index + .update_settings(|s| { + s.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + index + .add_documents(documents!({ + "text": "0 1 2 3 4 5 6 7 01 23 234 56 79 709 7356", + })) + .unwrap(); + + // let fst = fst::Set::from_iter(["01", "23", "234", "56"]).unwrap(); + let txn = index.read_txn().unwrap(); + let mut db_cache = DatabaseCache::default(); + + let fst = index.words_fst(&txn).unwrap(); + let query = LocatedQueryTerm::from_query( + "0 no 1 2 3 4 5 6 7".tokenize(), + None, + |word, is_prefix| { + word_derivations( + &index, + &txn, + word, + if word.len() < 3 { + 0 + } else if word.len() < 6 { + 1 + } else { + 2 + }, + is_prefix, + &fst, + ) + }, + ) + .unwrap(); + + let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); + println!("{}", graph.graphviz()); + + // let positions_to_remove = vec![3, 6, 0, 4]; + // for p in positions_to_remove { + // graph.remove_words_at_position(p); + // println!("{}", graph.graphviz()); + // } + + // let proximities = |w1: &str, w2: &str| -> Vec { + // if matches!((w1, w2), ("56", "7")) { + // vec![] + // } else { + 
// vec![1, 2] + // } + // }; + + // let prox_graph = ProximityGraph::from_query_graph(graph, proximities); + + // println!("{}", prox_graph.graphviz()); + } +} + +// fn remove_element_from_vector(v: &mut Vec, el: usize) { +// let position = v.iter().position(|&x| x == el).unwrap(); +// v.swap_remove(position); +// } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs new file mode 100644 index 000000000..4d2b22264 --- /dev/null +++ b/milli/src/search/new/query_term.rs @@ -0,0 +1,305 @@ +// TODO: put primitive query part in here + +use std::borrow::Cow; +use std::mem; +use std::ops::RangeInclusive; + +use charabia::normalizer::NormalizedTokenIter; +use charabia::{SeparatorKind, TokenKind}; +use fst::automaton::Str; +use fst::{Automaton, IntoStreamer, Streamer}; +use heed::types::DecodeIgnore; +use heed::RoTxn; + +use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; +use crate::search::{build_dfa, get_first}; +use crate::{Index, Result}; + +#[derive(Debug, Clone)] +pub struct WordDerivations { + // TODO: should have a list for the words corresponding to the prefix as well! + // This is to implement the `exactness` ranking rule. + // However, we could also consider every term in `zero_typo` (except first one) to + // be words of that the original word is a prefix of + pub original: String, + pub zero_typo: Vec, + pub one_typo: Vec, + pub two_typos: Vec, + pub use_prefix_db: bool, +} +impl WordDerivations { + pub fn all_derivations_except_prefix_db(&self) -> impl Iterator + Clone { + self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter()) + } + fn is_empty(&self) -> bool { + self.zero_typo.is_empty() + && self.one_typo.is_empty() + && self.two_typos.is_empty() + && !self.use_prefix_db + } +} + +pub fn word_derivations( + index: &Index, + txn: &RoTxn, + word: &str, + max_typo: u8, + is_prefix: bool, + fst: &fst::Set>, +) -> Result { + let use_prefix_db = is_prefix + && index.word_prefix_docids.remap_data_type::().get(txn, word)?.is_some(); + + let mut zero_typo = vec![]; + let mut one_typo = vec![]; + let mut two_typos = vec![]; + + if max_typo == 0 { + if is_prefix { + let prefix = Str::new(word).starts_with(); + let mut stream = fst.search(prefix).into_stream(); + + while let Some(word) = stream.next() { + let word = std::str::from_utf8(word)?; + zero_typo.push(word.to_string()); + } + } else if fst.contains(word) { + zero_typo.push(word.to_string()); + } + } else if max_typo == 1 { + let dfa = build_dfa(word, 1, is_prefix); + let starts = StartsWith(Str::new(get_first(word))); + let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); + + while let Some((word, state)) = stream.next() { + let word = std::str::from_utf8(word)?; + let d = dfa.distance(state.1); + match d.to_u8() { + 0 => { + zero_typo.push(word.to_string()); + } + 1 => { + one_typo.push(word.to_string()); + } + _ => panic!(), + } + } + } else { + let starts = StartsWith(Str::new(get_first(word))); + let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); + let second_dfa = build_dfa(word, 2, is_prefix); + let second = Intersection(&second_dfa, &starts); + let automaton = Union(first, &second); + + let mut stream = fst.search_with_state(automaton).into_stream(); + + while let Some((found_word, state)) = stream.next() { + let found_word = std::str::from_utf8(found_word)?; + // in the case the typo is on the first letter, we know the number of typo + // is two + if get_first(found_word) != get_first(word) 
{ + two_typos.push(found_word.to_string()); + } else { + // Else, we know that it is the second dfa that matched and compute the + // correct distance + let d = second_dfa.distance((state.1).0); + match d.to_u8() { + 0 => { + zero_typo.push(found_word.to_string()); + } + 1 => { + one_typo.push(found_word.to_string()); + } + 2 => { + two_typos.push(found_word.to_string()); + } + _ => panic!(), + } + } + } + } + + Ok(WordDerivations { original: word.to_owned(), zero_typo, one_typo, two_typos, use_prefix_db }) +} + +#[derive(Debug, Clone)] +pub enum QueryTerm { + Phrase(Vec>), + Word { derivations: WordDerivations }, +} +impl QueryTerm { + pub fn original_single_word(&self) -> Option<&str> { + match self { + QueryTerm::Phrase(_) => None, + QueryTerm::Word { derivations } => { + if derivations.is_empty() { + None + } else { + Some(derivations.original.as_str()) + } + } + } + } +} + +#[derive(Debug, Clone)] +pub struct LocatedQueryTerm { + pub value: QueryTerm, // value should be able to contain the word derivations as well + pub positions: RangeInclusive, +} + +impl LocatedQueryTerm { + pub fn is_empty(&self) -> bool { + match &self.value { + QueryTerm::Phrase(_) => false, + QueryTerm::Word { derivations, .. } => derivations.is_empty(), + } + } + /// Create primitive query from tokenized query string, + /// the primitive query is an intermediate state to build the query tree. + pub fn from_query( + query: NormalizedTokenIter>, + words_limit: Option, + derivations: impl Fn(&str, bool) -> Result, + ) -> Result> { + let mut primitive_query = Vec::new(); + let mut phrase = Vec::new(); + + let mut quoted = false; + + let parts_limit = words_limit.unwrap_or(usize::MAX); + + let mut position = -1i8; + let mut phrase_start = -1i8; + let mut phrase_end = -1i8; + + let mut peekable = query.peekable(); + while let Some(token) = peekable.next() { + // early return if word limit is exceeded + if primitive_query.len() >= parts_limit { + return Ok(primitive_query); + } + + match token.kind { + TokenKind::Word | TokenKind::StopWord => { + position += 1; + // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, + // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, + // 3. if the word is the last token of the query we push it as a prefix word. 
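+                    // For instance (an assumed illustration, supposing `red` and
+                    // `fox` are not stop words): the query `red "quick brown" fox`
+                    // would yield `red` as a non-prefix word (rule 2), `quick brown`
+                    // as a phrase (rule 1), and `fox`, the last token, as a prefix
+                    // word (rule 3).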
+ if quoted { + phrase_end = position; + if phrase.is_empty() { + phrase_start = position; + } + if let TokenKind::StopWord = token.kind { + phrase.push(None); + } else { + // TODO: in a phrase, check that every word exists + // otherwise return WordDerivations::Empty + phrase.push(Some(token.lemma().to_string())); + } + } else if peekable.peek().is_some() { + if let TokenKind::StopWord = token.kind { + } else { + let derivations = derivations(token.lemma(), false)?; + let located_term = LocatedQueryTerm { + value: QueryTerm::Word { derivations }, + positions: position..=position, + }; + primitive_query.push(located_term); + } + } else { + let derivations = derivations(token.lemma(), true)?; + let located_term = LocatedQueryTerm { + value: QueryTerm::Word { derivations }, + positions: position..=position, + }; + primitive_query.push(located_term); + } + } + TokenKind::Separator(separator_kind) => { + match separator_kind { + SeparatorKind::Hard => { + position += 1; + } + SeparatorKind::Soft => { + position += 0; + } + } + let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + // swap quoted state if we encounter a double quote + if quote_count % 2 != 0 { + quoted = !quoted; + } + // if there is a quote or a hard separator we close the phrase. + if !phrase.is_empty() + && (quote_count > 0 || separator_kind == SeparatorKind::Hard) + { + let located_query_term = LocatedQueryTerm { + value: QueryTerm::Phrase(mem::take(&mut phrase)), + positions: phrase_start..=phrase_end, + }; + primitive_query.push(located_query_term); + } + } + _ => (), + } + } + + // If a quote is never closed, we consider all of the end of the query as a phrase. + if !phrase.is_empty() { + let located_query_term = LocatedQueryTerm { + value: QueryTerm::Phrase(mem::take(&mut phrase)), + positions: phrase_start..=phrase_end, + }; + primitive_query.push(located_query_term); + } + + Ok(primitive_query) + } +} + +impl LocatedQueryTerm { + pub fn ngram2( + x: &LocatedQueryTerm, + y: &LocatedQueryTerm, + ) -> Option<(String, RangeInclusive)> { + if *x.positions.end() != y.positions.start() - 1 { + println!( + "x positions end: {}, y positions start: {}", + *x.positions.end(), + y.positions.start() + ); + return None; + } + match (&x.value.original_single_word(), &y.value.original_single_word()) { + (Some(w1), Some(w2)) => { + let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end()); + Some(term) + } + _ => None, + } + } + pub fn ngram3( + x: &LocatedQueryTerm, + y: &LocatedQueryTerm, + z: &LocatedQueryTerm, + ) -> Option<(String, RangeInclusive)> { + if *x.positions.end() != y.positions.start() - 1 + || *y.positions.end() != z.positions.start() - 1 + { + return None; + } + match ( + &x.value.original_single_word(), + &y.value.original_single_word(), + &z.value.original_single_word(), + ) { + (Some(w1), Some(w2), Some(w3)) => { + let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end()); + Some(term) + } + _ => None, + } + } +} From 5065d8b0c163ad741ec41b0af330055748ff2bc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:43:23 +0100 Subject: [PATCH 006/234] Introduce a DatabaseCache to memorize the addresses of LMDB values --- milli/src/search/new/db_cache.rs | 119 +++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 milli/src/search/new/db_cache.rs diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs new file mode 100644 index 000000000..15f9f7873 --- /dev/null +++ 
b/milli/src/search/new/db_cache.rs
@@ -0,0 +1,119 @@
+use std::collections::{hash_map::Entry, HashMap};
+
+use heed::{types::ByteSlice, RoTxn};
+
+use crate::{Index, Result};
+
+#[derive(Default)]
+pub struct DatabaseCache<'transaction> {
+    pub word_pair_proximity_docids: HashMap<(u8, String, String), Option<&'transaction [u8]>>,
+    pub word_prefix_pair_proximity_docids:
+        HashMap<(u8, String, String), Option<&'transaction [u8]>>,
+    pub word_docids: HashMap<String, Option<&'transaction [u8]>>,
+    pub exact_word_docids: HashMap<String, Option<&'transaction [u8]>>,
+    pub word_prefix_docids: HashMap<String, Option<&'transaction [u8]>>,
+}
+impl<'transaction> DatabaseCache<'transaction> {
+    pub fn get_word_docids(
+        &mut self,
+        index: &Index,
+        txn: &'transaction RoTxn,
+        word: &str,
+    ) -> Result<Option<&'transaction [u8]>> {
+        let bitmap_ptr = match self.word_docids.entry(word.to_owned()) {
+            Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
+            Entry::Vacant(entry) => {
+                let bitmap_ptr = index.word_docids.remap_data_type::<ByteSlice>().get(txn, word)?;
+                entry.insert(bitmap_ptr);
+                bitmap_ptr
+            }
+        };
+        Ok(bitmap_ptr)
+    }
+    pub fn get_prefix_docids(
+        &mut self,
+        index: &Index,
+        txn: &'transaction RoTxn,
+        prefix: &str,
+    ) -> Result<Option<&'transaction [u8]>> {
+        // In the future, this will be a frozen roaring bitmap
+        let bitmap_ptr = match self.word_prefix_docids.entry(prefix.to_owned()) {
+            Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
+            Entry::Vacant(entry) => {
+                let bitmap_ptr =
+                    index.word_prefix_docids.remap_data_type::<ByteSlice>().get(txn, prefix)?;
+                entry.insert(bitmap_ptr);
+                bitmap_ptr
+            }
+        };
+        Ok(bitmap_ptr)
+    }
+
+    pub fn get_word_pair_proximity_docids(
+        &mut self,
+        index: &Index,
+        txn: &'transaction RoTxn,
+        word1: &str,
+        word2: &str,
+        proximity: u8,
+    ) -> Result<Option<&'transaction [u8]>> {
+        let key = (proximity, word1.to_owned(), word2.to_owned());
+        match self.word_pair_proximity_docids.entry(key.clone()) {
+            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
+            Entry::Vacant(entry) => {
+                // Note that now, we really want to do a prefix iter over (w1, w2) to get all the possible proximities
+                // but oh well
+                //
+                // Actually, we shouldn't greedily access this DB at all
+                // a DB (w1, w2) -> [proximities] would be much better
+                // We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity
+                // And if we worked with words encoded as integers, the set of words could be a roaring bitmap
+                // Then, to find all the proximities between two list of words, we'd do:
+
+                // inputs:
+                //    - words1 (roaring bitmap)
+                //    - words2 (roaring bitmap)
+                // output:
+                //    - [(word1, word2, [proximities])]
+                // algo:
+                //  let mut output = vec![];
+                //  for word1 in words1 {
+                //      let all_words_in_proximity_of_w1 = pair_words_db.get(word1);
+                //      let words_in_proximity_of_w1 = all_words_in_proximity_of_w1 & words2;
+                //      for word2 in words_in_proximity_of_w1 {
+                //          let proximities = prox_db.get(word1, word2);
+                //          output.push(word1, word2, proximities);
+                //      }
+                //  }
+                let bitmap_ptr = index
+                    .word_pair_proximity_docids
+                    .remap_data_type::<ByteSlice>()
+                    .get(txn, &(key.0, key.1.as_str(), key.2.as_str()))?;
+                entry.insert(bitmap_ptr);
+                Ok(bitmap_ptr)
+            }
+        }
+    }
+
+    pub fn get_word_prefix_pair_proximity_docids(
+        &mut self,
+        index: &Index,
+        txn: &'transaction RoTxn,
+        word1: &str,
+        prefix2: &str,
+        proximity: u8,
+    ) -> Result<Option<&'transaction [u8]>> {
+        let key = (proximity, word1.to_owned(), prefix2.to_owned());
+        match self.word_prefix_pair_proximity_docids.entry(key.clone()) {
+            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
+            Entry::Vacant(entry) => {
+                let bitmap_ptr = index
+                    .word_prefix_pair_proximity_docids
+                    .remap_data_type::<ByteSlice>()
+                    .get(txn, &(key.0,
key.1.as_str(), key.2.as_str()))?; + entry.insert(bitmap_ptr); + Ok(bitmap_ptr) + } + } + } +} From ce0d1e0e137940e45c7646d443182ef180b74a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:44:03 +0100 Subject: [PATCH 007/234] Introduce a common way to manage the coordination between ranking rules --- milli/src/search/new/ranking_rules.rs | 523 ++++++++++++++++++++++++++ 1 file changed, 523 insertions(+) create mode 100644 milli/src/search/new/ranking_rules.rs diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs new file mode 100644 index 000000000..ce883ad6a --- /dev/null +++ b/milli/src/search/new/ranking_rules.rs @@ -0,0 +1,523 @@ +use heed::RoTxn; +use roaring::RoaringBitmap; + +use super::db_cache::DatabaseCache; +use super::resolve_query_graph::resolve_query_graph; +use super::QueryGraph; +use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; +use crate::new::ranking_rule_graph::proximity::ProximityGraph; +use crate::new::words::Words; +// use crate::search::new::sort::Sort; +use crate::{Index, Result, TermsMatchingStrategy}; + +pub trait RankingRuleOutputIter<'transaction, Query> { + fn next_bucket(&mut self) -> Result>>; +} + +pub struct RankingRuleOutputIterWrapper<'transaction, Query> { + iter: Box>> + 'transaction>, +} +impl<'transaction, Query> RankingRuleOutputIterWrapper<'transaction, Query> { + pub fn new( + iter: Box>> + 'transaction>, + ) -> Self { + Self { iter } + } +} +impl<'transaction, Query> RankingRuleOutputIter<'transaction, Query> + for RankingRuleOutputIterWrapper<'transaction, Query> +{ + fn next_bucket(&mut self) -> Result>> { + match self.iter.next() { + Some(x) => x.map(Some), + None => Ok(None), + } + } +} + +pub trait RankingRuleQueryTrait: Sized + Clone + 'static {} +#[derive(Clone)] +pub struct PlaceholderQuery; +impl RankingRuleQueryTrait for PlaceholderQuery {} +impl RankingRuleQueryTrait for QueryGraph {} + +pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { + // TODO: add an update_candidates function to deal with distinct + // attributes? + + fn start_iteration( + &mut self, + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + universe: &RoaringBitmap, + query: &Query, + ) -> Result<()>; + + fn next_bucket( + &mut self, + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + universe: &RoaringBitmap, + ) -> Result>>; + + fn end_iteration( + &mut self, + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + ); +} + +#[derive(Debug)] +pub struct RankingRuleOutput { + /// The query tree that must be used by the child ranking rule to fetch candidates. + pub query: Q, + /// The allowed candidates for the child ranking rule + pub candidates: RoaringBitmap, +} + +#[allow(unused)] +pub fn get_start_universe<'transaction>( + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + query_graph: &QueryGraph, + term_matching_strategy: TermsMatchingStrategy, + // filters: Filters, + // mut distinct: Option, +) -> Result { + // NOTE: + // + // There is a performance problem when using `distinct` + exhaustive number of hits, + // especially for search that yield many results (many ~= almost all of the + // dataset). + // + // We'll solve it later. Maybe there are smart ways to go about it. 
+    //
+    // For example, if there are millions of possible values for the distinct attribute,
+    // then we could just look at the documents which share any distinct attribute with
+    // another one, and remove the latter docids from the universe.
+    // => NO! because we don't know which one to remove, only after the sorting is done can we know it
+    // => this kind of computation can be done, but only in the evaluation of the number
+    // of hits for the documents that aren't returned by the search.
+    //
+    // `Distinct` otherwise should always be computed during
+
+    let universe = index.documents_ids(txn).unwrap();
+
+    // resolve the whole query tree to retrieve an exhaustive list of documents matching the query.
+    // NOTE: this is wrong
+    // Instead, we should only compute the documents corresponding to the last remaining
+    // word, 2-gram, and 3-gram.
+    // let candidates = resolve_query_graph(index, txn, db_cache, query_graph, &universe)?;
+
+    // Distinct should be lazy if placeholder?
+    //
+    // // because the initial_candidates should be an exhaustive count of the matching documents,
+    // // we precompute the distinct attributes.
+    // let initial_candidates = match &mut distinct {
+    //     Some(distinct) => {
+    //         let mut initial_candidates = RoaringBitmap::new();
+    //         for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) {
+    //             initial_candidates.insert(c?);
+    //         }
+    //         initial_candidates
+    //     }
+    //     None => candidates.clone(),
+    // };
+
+    Ok(/*candidates*/ universe)
+}
+
+pub fn execute_search<'transaction>(
+    index: &Index,
+    txn: &'transaction heed::RoTxn,
+    // TODO: ranking rules parameter
+    db_cache: &mut DatabaseCache<'transaction>,
+    universe: &RoaringBitmap,
+    query_graph: &QueryGraph,
+    // _from: usize,
+    // _length: usize,
+) -> Result<Vec<u32>> {
+    let words = Words::new(TermsMatchingStrategy::Last);
+    // let sort = Sort::new(index, txn, "sort1".to_owned(), true)?;
+    let proximity = GraphBasedRankingRule::<ProximityGraph>::default();
+    // TODO: ranking rules given as argument
+    let mut ranking_rules: Vec<Box<dyn RankingRule<'transaction, QueryGraph>>> =
+        vec![Box::new(words), Box::new(proximity) /* Box::new(sort) */];
+
+    let ranking_rules_len = ranking_rules.len();
+    ranking_rules[0].start_iteration(index, txn, db_cache, universe, query_graph)?;
+
+    // TODO: parent_candidates could be used only during debugging?
+    let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
+    candidates[0] = universe.clone();
+
+    let mut cur_ranking_rule_index = 0;
+
+    macro_rules! back {
+        () => {
+            candidates[cur_ranking_rule_index].clear();
+            ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache);
+            if cur_ranking_rule_index == 0 {
+                break;
+            } else {
+                cur_ranking_rule_index -= 1;
+            }
+        };
+    }
+
+    let mut results = vec![];
+    // TODO: skip buckets when we want to start from an offset
+    while results.len() < 20 {
+        // The universe for this bucket is zero or one element, so we don't need to sort
+        // anything, just extend the results and go back to the parent ranking rule.
+        if candidates[cur_ranking_rule_index].len() <= 1 {
+            results.extend(&candidates[cur_ranking_rule_index]);
+            back!();
+            continue;
+        }
+        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, &candidates[cur_ranking_rule_index])? else {
+            back!();
+            continue;
+        };
+
+        candidates[cur_ranking_rule_index] -= &next_bucket.candidates;
+
+        if next_bucket.candidates.len() <= 1 {
+            // Only zero or one candidate, no need to sort through the child ranking rule.
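+            // (A hypothetical trace of this loop: with the rules [words, proximity]
+            // and a `words` bucket of {4, 7, 9}, control descends into `proximity`
+            // to split that bucket further; whenever a rule returns `None`, `back!`
+            // pops control back up to the parent rule, which yields its next bucket.)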
+ results.extend(next_bucket.candidates); + continue; + } else { + // many candidates, give to next ranking rule, if any + if cur_ranking_rule_index == ranking_rules_len - 1 { + // TODO: don't extend too much, up to the limit only + results.extend(next_bucket.candidates); + } else { + cur_ranking_rule_index += 1; + candidates[cur_ranking_rule_index] = next_bucket.candidates.clone(); + ranking_rules[cur_ranking_rule_index].start_iteration( + index, + txn, + db_cache, + &next_bucket.candidates, + &next_bucket.query, + )?; + } + } + } + + Ok(results) +} + +#[cfg(test)] +mod tests { + use std::fs::File; + use std::io::{BufRead, BufReader, Cursor, Seek}; + use std::time::Instant; + + use heed::EnvOpenOptions; + + use super::{execute_search, get_start_universe}; + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + use crate::index::tests::TempIndex; + use crate::new::db_cache::DatabaseCache; + use crate::new::make_query_graph; + use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; + use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy}; + + #[test] + fn execute_new_search() { + let index = TempIndex::new(); + index + .add_documents(documents!([ + { + "id": 7, + "text": "the super quick super brown fox jumps over", + }, + { + "id": 8, + "text": "the super quick brown fox jumps over", + }, + { + "id": 9, + "text": "the quick super brown fox jumps over", + }, + { + "id": 10, + "text": "the quick brown fox jumps over", + }, + { + "id": 11, + "text": "the quick brown fox jumps over the lazy dog", + }, + { + "id": 12, + "text": "the quick brown cat jumps over the lazy dog", + }, + ])) + .unwrap(); + let txn = index.read_txn().unwrap(); + + let mut db_cache = DatabaseCache::default(); + + let query_graph = + make_query_graph(&index, &txn, &mut db_cache, "the quick brown fox jumps over") + .unwrap(); + println!("{}", query_graph.graphviz()); + + // TODO: filters + maybe distinct attributes? + let universe = get_start_universe( + &index, + &txn, + &mut db_cache, + &query_graph, + TermsMatchingStrategy::Last, + ) + .unwrap(); + println!("universe: {universe:?}"); + + let results = + execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */) + .unwrap(); + println!("{results:?}") + } + + #[test] + fn search_movies_new() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_movies").unwrap(); + let txn = index.read_txn().unwrap(); + + let primary_key = index.primary_key(&txn).unwrap().unwrap(); + let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + + // loop { + // let start = Instant::now(); + + // let mut db_cache = DatabaseCache::default(); + + // let query_graph = make_query_graph( + // &index, + // &txn, + // &mut db_cache, + // "released from prison by the government", + // ) + // .unwrap(); + // // println!("{}", query_graph.graphviz()); + + // // TODO: filters + maybe distinct attributes? 
+ // let universe = get_start_universe( + // &index, + // &txn, + // &mut db_cache, + // &query_graph, + // TermsMatchingStrategy::Last, + // ) + // .unwrap(); + // // println!("universe: {universe:?}"); + + // let results = execute_search( + // &index, + // &txn, + // &mut db_cache, + // &universe, + // &query_graph, /* 0, 20 */ + // ) + // .unwrap(); + + // let elapsed = start.elapsed(); + // println!("{}us: {results:?}", elapsed.as_micros()); + // } + let start = Instant::now(); + + let mut db_cache = DatabaseCache::default(); + + let query_graph = + make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government") + .unwrap(); + // println!("{}", query_graph.graphviz()); + + // TODO: filters + maybe distinct attributes? + let universe = get_start_universe( + &index, + &txn, + &mut db_cache, + &query_graph, + TermsMatchingStrategy::Last, + ) + .unwrap(); + // println!("universe: {universe:?}"); + + let results = + execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */) + .unwrap(); + + let elapsed = start.elapsed(); + + let ids = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|x| { + let obkv = &x.1; + let id = obkv.get(primary_key).unwrap(); + let id: serde_json::Value = serde_json::from_slice(id).unwrap(); + id.as_str().unwrap().to_owned() + }) + .collect::>(); + + println!("{}us: {results:?}", elapsed.as_micros()); + println!("external ids: {ids:?}"); + } + + #[test] + fn search_movies_old() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_movies").unwrap(); + let txn = index.read_txn().unwrap(); + + let start = Instant::now(); + + let mut s = Search::new(&txn, &index); + s.query("released from prison by the government"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); + let docs = s.execute().unwrap(); + + let elapsed = start.elapsed(); + + println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); + } + + #[test] + fn _settings_movies() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_movies").unwrap(); + let mut wtxn = index.write_txn().unwrap(); + + // let primary_key = "id"; + // let searchable_fields = vec!["title", "overview"]; + // let filterable_fields = vec!["release_date", "genres"]; + // let sortable_fields = vec[]; + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_min_word_len_one_typo(5); + builder.set_min_word_len_two_typos(100); + + // builder.set_primary_key(primary_key.to_owned()); + + // let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); + // builder.set_searchable_fields(searchable_fields); + + // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + // builder.set_filterable_fields(filterable_fields); + + builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + + // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); + // builder.set_sortable_fields(sortable_fields); + + builder.execute(|_| (), || false).unwrap(); + } + + // #[test] + fn _index_movies() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_movies").unwrap(); + 
let mut wtxn = index.write_txn().unwrap(); + + let primary_key = "id"; + let searchable_fields = vec!["title", "overview"]; + let filterable_fields = vec!["release_date", "genres"]; + // let sortable_fields = vec[]; + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_primary_key(primary_key.to_owned()); + + let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(filterable_fields); + + builder.set_criteria(vec![Criterion::Words]); + + // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); + // builder.set_sortable_fields(sortable_fields); + + builder.execute(|_| (), || false).unwrap(); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) + .unwrap(); + + let documents = documents_from( + "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json", + "json", + ); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + } + + fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { + let reader = File::open(filename) + .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename)); + let reader = BufReader::new(reader); + let documents = match filetype { + "csv" => documents_from_csv(reader).unwrap(), + "json" => documents_from_json(reader).unwrap(), + "jsonl" => documents_from_jsonl(reader).unwrap(), + otherwise => panic!("invalid update format {:?}", otherwise), + }; + DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() + } + + fn documents_from_jsonl(reader: impl BufRead) -> crate::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { + let object = result.unwrap(); + documents.append_json_object(&object)?; + } + + documents.into_inner().map_err(Into::into) + } + + fn documents_from_json(reader: impl BufRead) -> crate::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + documents.append_json_array(reader)?; + + documents.into_inner().map_err(Into::into) + } + + fn documents_from_csv(reader: impl BufRead) -> crate::Result> { + let csv = csv::Reader::from_reader(reader); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) + } +} From 46249ea90185ac36c1d3a21ff26e48016162b293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:45:17 +0100 Subject: [PATCH 008/234] Implement a function to find a QueryGraph's docids --- milli/src/search/new/resolve_query_graph.rs | 193 ++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 milli/src/search/new/resolve_query_graph.rs diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs new file mode 100644 index 000000000..748524492 --- /dev/null +++ b/milli/src/search/new/resolve_query_graph.rs @@ -0,0 +1,193 @@ +use heed::{BytesDecode, RoTxn}; +use roaring::{MultiOps, RoaringBitmap}; +use std::collections::{HashMap, HashSet, 
VecDeque};
+
+use super::db_cache::DatabaseCache;
+use super::query_term::{QueryTerm, WordDerivations};
+use super::QueryGraph;
+use crate::{Index, Result, RoaringBitmapCodec};
+
+// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
+
+#[derive(Default)]
+pub struct NodeDocIdsCache {
+    pub cache: HashMap<usize, RoaringBitmap>,
+}
+
+pub fn resolve_query_graph<'transaction>(
+    index: &Index,
+    txn: &'transaction RoTxn,
+    db_cache: &mut DatabaseCache<'transaction>,
+    q: &QueryGraph,
+    universe: &RoaringBitmap,
+) -> Result<RoaringBitmap> {
+    // TODO: there is definitely a faster way to compute this big
+    // roaring bitmap expression
+
+    // resolve_query_graph_rec(index, txn, q, q.root_node, &mut docids, &mut cache)?;
+
+    let mut nodes_resolved = HashSet::new();
+    // TODO: should be given as an argument and kept between invocations of resolve query graph
+    let mut nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()];
+
+    let mut next_nodes_to_visit = VecDeque::new();
+    next_nodes_to_visit.push_front(q.root_node);
+
+    while let Some(node) = next_nodes_to_visit.pop_front() {
+        let predecessors = &q.edges[node].incoming;
+        if !predecessors.is_subset(&nodes_resolved) {
+            next_nodes_to_visit.push_back(node);
+            continue;
+        }
+        // Take union of all predecessors
+        let predecessors_iter = predecessors.iter().map(|p| &nodes_docids[*p]);
+        let predecessors_docids = MultiOps::union(predecessors_iter);
+
+        let n = &q.nodes[node];
+        // println!("resolving {node} {n:?}, predecessors: {predecessors:?}, their docids: {predecessors_docids:?}");
+        let node_docids = match n {
+            super::QueryNode::Term(located_term) => {
+                let term = &located_term.value;
+                match term {
+                    QueryTerm::Phrase(_) => todo!("resolve phrase"),
+                    QueryTerm::Word {
+                        derivations:
+                            WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db },
+                    } => {
+                        let derivations_docids = {
+                            let mut or_docids = vec![];
+                            for word in
+                                zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter())
+                            {
+                                if let Some(word_docids) =
+                                    db_cache.get_word_docids(index, txn, word)?
+                                {
+                                    or_docids.push(word_docids);
+                                }
+                            }
+                            if *use_prefix_db {
+                                if let Some(prefix_docids) =
+                                    db_cache.get_prefix_docids(index, txn, original.as_str())?
+ { + or_docids.push(prefix_docids); + } + } + or_docids + }; + let derivations_iter = derivations_docids + .into_iter() + .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()); + let derivations_docids = MultiOps::union(derivations_iter); + // TODO: if `or` is empty, register that somewhere, and immediately return an empty bitmap + // On the other hand, `or` *cannot* be empty, only its intersection with the universe can + // + // TODO: Or we don't do anything and accumulate all these operations in a tree of operations + // between frozen roaring bitmap that is resolved only at the very end + predecessors_docids & derivations_docids + } + } + } + super::QueryNode::Deleted => { + todo!() + } + super::QueryNode::Start => universe.clone(), + super::QueryNode::End => { + return Ok(predecessors_docids); + } + }; + nodes_resolved.insert(node); + nodes_docids[node] = node_docids; + + for &succ in q.edges[node].outgoing.iter() { + if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(&succ) { + next_nodes_to_visit.push_back(succ); + } + } + // This is currently slow but could easily be implemented very efficiently + for &prec in q.edges[node].incoming.iter() { + if q.edges[prec].outgoing.is_subset(&nodes_resolved) { + nodes_docids[prec].clear(); + } + } + // println!("cached docids: {nodes_docids:?}"); + } + + panic!() +} + +#[cfg(test)] +mod tests { + use charabia::Tokenize; + + use super::resolve_query_graph; + use crate::db_snap; + use crate::index::tests::TempIndex; + use crate::new::db_cache::DatabaseCache; + use crate::search::new::query_term::{word_derivations, LocatedQueryTerm}; + use crate::search::new::QueryGraph; + + #[test] + fn test_resolve_query_graph() { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + index + .add_documents(documents!([ + {"id": 0, "text": "0"}, + {"id": 1, "text": "1"}, + {"id": 2, "text": "2"}, + {"id": 3, "text": "3"}, + {"id": 4, "text": "4"}, + {"id": 5, "text": "5"}, + {"id": 6, "text": "6"}, + {"id": 7, "text": "7"}, + {"id": 8, "text": "0 1 2 3 4 5 6 7"}, + {"id": 9, "text": "7 6 5 4 3 2 1 0"}, + {"id": 10, "text": "01 234 56 7"}, + {"id": 11, "text": "7 56 0 1 23 5 4"}, + {"id": 12, "text": "0 1 2 3 4 5 6"}, + {"id": 13, "text": "01 23 4 5 7"}, + ])) + .unwrap(); + db_snap!(index, word_docids, @"7512d0b80659f6bf37d98b374ada8098"); + + let txn = index.read_txn().unwrap(); + let mut db_cache = DatabaseCache::default(); + let fst = index.words_fst(&txn).unwrap(); + let query = LocatedQueryTerm::from_query( + "no 0 1 2 3 no 4 5 6 7".tokenize(), + None, + |word, is_prefix| { + word_derivations( + &index, + &txn, + word, + if word.len() < 3 { + 0 + } else if word.len() < 6 { + 1 + } else { + 2 + }, + is_prefix, + &fst, + ) + }, + ) + .unwrap(); + let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); + println!("{}", graph.graphviz()); + + let universe = index.documents_ids(&txn).unwrap(); + insta::assert_debug_snapshot!(universe, @"RoaringBitmap<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]>"); + let docids = resolve_query_graph(&index, &txn, &mut db_cache, &graph, &universe).unwrap(); + insta::assert_debug_snapshot!(docids, @"RoaringBitmap<[8, 9, 11]>"); + + // TODO: test with a reduced universe + } +} From c9bf6bb2fa861a9ff5a60088bc0eab8ea1763c86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:46:00 +0100 Subject: [PATCH 009/234] Introduce a structure to implement ranking rules 
with graph algorithms --- .../search/new/ranking_rule_graph/build.rs | 43 ++++ .../src/search/new/ranking_rule_graph/mod.rs | 207 ++++++++++++++++++ 2 files changed, 250 insertions(+) create mode 100644 milli/src/search/new/ranking_rule_graph/build.rs create mode 100644 milli/src/search/new/ranking_rule_graph/mod.rs diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs new file mode 100644 index 000000000..605fe82d1 --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -0,0 +1,43 @@ +use std::collections::{BTreeSet, HashMap, HashSet}; + +use heed::RoTxn; + +use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::new::db_cache::DatabaseCache; +use crate::new::QueryGraph; +use crate::{Index, Result}; + +impl RankingRuleGraph { + pub fn build<'db_cache, 'transaction: 'db_cache>( + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + query_graph: QueryGraph, + ) -> Result { + let mut ranking_rule_graph = Self { query_graph, all_edges: vec![], node_edges: vec![] }; + + for (node_idx, node) in ranking_rule_graph.query_graph.nodes.iter().enumerate() { + ranking_rule_graph.node_edges.push(BTreeSet::new()); + let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap(); + + let Some(from_node_data) = G::build_visit_from_node(index, txn, db_cache, node)? else { continue }; + + for &successor_idx in ranking_rule_graph.query_graph.edges[node_idx].outgoing.iter() { + let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx]; + let Some(edges) = G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)? else { continue }; + for (cost, details) in edges { + ranking_rule_graph.all_edges.push(Some(Edge { + from_node: node_idx, + to_node: successor_idx, + cost, + details, + })); + new_edges.insert(ranking_rule_graph.all_edges.len() - 1); + } + } + } + ranking_rule_graph.simplify(); + + Ok(ranking_rule_graph) + } +} diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs new file mode 100644 index 000000000..12f397df3 --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -0,0 +1,207 @@ +pub mod build; +pub mod cheapest_paths; +pub mod edge_docids_cache; +pub mod empty_paths_cache; +pub mod paths_map; +pub mod proximity; +pub mod resolve_paths; + +use std::collections::{BTreeSet, HashMap, HashSet}; +use std::ops::ControlFlow; + +use heed::RoTxn; +use roaring::RoaringBitmap; + +use super::db_cache::DatabaseCache; +use super::{QueryGraph, QueryNode}; +use crate::{Index, Result}; + +#[derive(Debug, Clone)] +pub enum EdgeDetails { + Unconditional, + Data(E), +} + +#[derive(Debug, Clone)] +pub struct Edge { + from_node: usize, + to_node: usize, + cost: u8, + details: EdgeDetails, +} + +#[derive(Debug, Clone)] +pub struct EdgePointer<'graph, E> { + pub index: EdgeIndex, + pub edge: &'graph Edge, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct EdgeIndex(pub usize); +// { +// // TODO: they could all be u16 instead +// // There may be a way to store all the edge indices in a u32 as well, +// // if the edges are in a vector +// // then we can store sets of edges in a bitmap efficiently +// pub from: usize, +// pub to: usize, +// pub edge_idx: usize, +// } + +pub trait RankingRuleGraphTrait { + type EdgeDetails: Sized; + type BuildVisitedFromNode; + + fn edge_details_dot_label(edge: &Self::EdgeDetails) -> String; + + fn compute_docids<'transaction>( + index: &Index, + txn: 
&'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+        edge_details: &Self::EdgeDetails,
+    ) -> Result<RoaringBitmap>;
+
+    fn build_visit_from_node<'transaction>(
+        index: &Index,
+        txn: &'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+        from_node: &QueryNode,
+    ) -> Result<Option<Self::BuildVisitedFromNode>>;
+
+    fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
+        index: &Index,
+        txn: &'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+        to_node: &QueryNode,
+        from_node_data: &'from_data Self::BuildVisitedFromNode,
+    ) -> Result<Option<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>>>;
+}
+
+pub struct RankingRuleGraph<G: RankingRuleGraphTrait> {
+    pub query_graph: QueryGraph,
+    // pub edges: Vec<Vec<HashSet<Edge<G::EdgeDetails>>>>,
+    pub all_edges: Vec<Option<Edge<G::EdgeDetails>>>,
+    pub node_edges: Vec<BTreeSet<usize>>,
+    // pub removed_edges: HashSet<EdgeIndex>,
+    // pub tmp_removed_edges: HashSet<EdgeIndex>,
+}
+impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
+    // NOTE: returns the edge even if it was removed
+    pub fn get_edge(&self, edge_index: EdgeIndex) -> &Option<Edge<G::EdgeDetails>> {
+        &self.all_edges[edge_index.0]
+    }
+    pub fn visit_edges<'graph, O>(
+        &'graph self,
+        from: usize,
+        to: usize,
+        mut visit: impl FnMut(EdgeIndex, &'graph Edge<G::EdgeDetails>) -> ControlFlow<O>,
+    ) -> Option<O> {
+        let from_edges = &self.node_edges[from];
+        for &edge_idx in from_edges {
+            let edge = self.all_edges[edge_idx].as_ref().unwrap();
+            if edge.to_node == to {
+                let cf = visit(EdgeIndex(edge_idx), edge);
+                match cf {
+                    ControlFlow::Continue(_) => continue,
+                    ControlFlow::Break(o) => return Some(o),
+                }
+            }
+        }
+
+        None
+    }
+
+    fn remove_edge(&mut self, edge_index: EdgeIndex) {
+        let edge_opt = &mut self.all_edges[edge_index.0];
+        let Some(Edge { from_node, to_node, cost, details }) = &edge_opt else { return };
+
+        let node_edges = &mut self.node_edges[*from_node];
+        node_edges.remove(&edge_index.0);
+
+        *edge_opt = None;
+    }
+    pub fn remove_nodes(&mut self, nodes: &[usize]) {
+        for &node in nodes {
+            let edge_indices = &mut self.node_edges[node];
+            for edge_index in edge_indices.iter() {
+                self.all_edges[*edge_index] = None;
+            }
+            edge_indices.clear();
+
+            let preds = &self.query_graph.edges[node].incoming;
+            for pred in preds {
+                let edge_indices = &mut self.node_edges[*pred];
+                for edge_index in edge_indices.iter() {
+                    let edge_opt = &mut self.all_edges[*edge_index];
+                    let Some(edge) = edge_opt else { continue; };
+                    if edge.to_node == node {
+                        *edge_opt = None;
+                    }
+                }
+                panic!("remove nodes is incorrect at the moment");
+                edge_indices.clear();
+            }
+        }
+        self.query_graph.remove_nodes(nodes);
+    }
+    pub fn simplify(&mut self) {
+        loop {
+            let mut nodes_to_remove = vec![];
+            for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
+                if !matches!(node, QueryNode::End | QueryNode::Deleted)
+                    && self.node_edges[node_idx].is_empty()
+                {
+                    nodes_to_remove.push(node_idx);
+                }
+            }
+            if nodes_to_remove.is_empty() {
+                break;
+            } else {
+                self.remove_nodes(&nodes_to_remove);
+            }
+        }
+    }
+    // fn is_removed_edge(&self, edge: EdgeIndex) -> bool {
+    //     self.removed_edges.contains(&edge) || self.tmp_removed_edges.contains(&edge)
+    // }
+
+    pub fn graphviz(&self) -> String {
+        let mut desc = String::new();
+        desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n");
+
+        for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
+            if matches!(node, QueryNode::Deleted) {
+                continue;
+            }
+            desc.push_str(&format!("{node_idx} [label = {:?}]", node));
+            if node_idx == self.query_graph.root_node {
+                desc.push_str("[color = blue]");
+            } else if node_idx == self.query_graph.end_node {
+                desc.push_str("[color = red]");
+            }
+            desc.push_str(";\n");
+        }
+        for edge in self.all_edges.iter().flatten() {
+            let Edge { from_node, to_node, cost, details } = edge;
+
+            match &details {
+                EdgeDetails::Unconditional => {
+                    desc.push_str(&format!(
+                        "{from_node} -> {to_node} [label = \"always cost {cost}\"];\n",
+                        cost = edge.cost,
+                    ));
+                }
+                EdgeDetails::Data(details) => {
+                    desc.push_str(&format!(
+                        "{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\"];\n",
+                        cost = edge.cost,
+                        edge_label = G::edge_details_dot_label(details)
+                    ));
+                }
+            }
+        }
+
+        desc.push('}');
+        desc
+    }
+}

From 864f6410ed492a315838717c4e46b23a626556f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 21 Feb 2023 09:46:49 +0100
Subject: [PATCH 010/234] Introduce a structure to represent a set of graph
 paths efficiently

---
 .../new/ranking_rule_graph/paths_map.rs       | 427 ++++++++++++++++++
 1 file changed, 427 insertions(+)
 create mode 100644 milli/src/search/new/ranking_rule_graph/paths_map.rs

diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs
new file mode 100644
index 000000000..589a1a52f
--- /dev/null
+++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs
@@ -0,0 +1,427 @@
+use std::collections::hash_map::DefaultHasher;
+use std::collections::HashSet;
+use std::fmt::Write;
+use std::hash::{Hash, Hasher};
+
+use super::cheapest_paths::Path;
+use super::{EdgeDetails, EdgeIndex, RankingRuleGraph, RankingRuleGraphTrait, Edge};
+use crate::new::QueryNode;
+
+
+#[derive(Debug)]
+pub struct PathsMap<V> {
+    nodes: Vec<(EdgeIndex, PathsMap<V>)>,
+    value: Option<V>
+}
+impl<V> Default for PathsMap<V> {
+    fn default() -> Self {
+        Self { nodes: vec![], value: None }
+    }
+}
+
+impl PathsMap<u64> {
+    pub fn from_paths(paths: &[Path]) -> Self {
+        let mut result = Self::default();
+        for p in paths {
+            result.add_path(p);
+        }
+        result
+    }
+    pub fn add_path(&mut self, path: &Path) {
+        self.insert(path.edges.iter().copied(), path.cost);
+    }
+}
+impl<V> PathsMap<V> {
+    pub fn is_empty(&self) -> bool {
+        self.nodes.is_empty() && self.value.is_none()
+    }
+
+    pub fn insert(&mut self, mut edges: impl Iterator<Item = EdgeIndex>, value: V) {
+        match edges.next() {
+            None => {
+                self.value = Some(value);
+            }
+            Some(first_edge) => {
+                // if a child node already starts with `first_edge`,
+                // insert the rest of the path inside it
+                for (edge, next_node) in &mut self.nodes {
+                    if edge == &first_edge {
+                        return next_node.insert(edges, value);
+                    }
+                }
+                let mut rest = PathsMap::default();
+                rest.insert(edges, value);
+                self.nodes.push((first_edge, rest));
+            }
+        }
+    }
+    fn remove_first_rec(&mut self, cur: &mut Vec<EdgeIndex>) -> (bool, V) {
+        let Some((first_edge, rest)) = self.nodes.first_mut() else {
+            // The PathsMap has to be correct by construction here, otherwise
+            // the unwrap() will crash
+            return (true, self.value.take().unwrap())
+        };
+        cur.push(*first_edge);
+        let (rest_is_empty, value) = rest.remove_first_rec(cur);
+        if rest_is_empty {
+            self.nodes.remove(0);
+            (self.nodes.is_empty(), value)
+        } else {
+            (false, value)
+        }
+    }
+    pub fn remove_first(&mut self) -> Option<(Vec<EdgeIndex>, V)> {
+        if self.is_empty() {
+            return None
+        }
+
+        let mut result = vec![];
+        let (_, value) = self.remove_first_rec(&mut result);
+        Some((result, value))
+    }
+    pub fn iterate_rec(&self, cur: &mut Vec<EdgeIndex>, visit: &mut impl FnMut(&Vec<EdgeIndex>, &V)) {
+        if let Some(value) = &self.value {
+            visit(cur, value);
+        }
+        for (first_edge, rest) in self.nodes.iter() {
+            cur.push(*first_edge);
+            rest.iterate_rec(cur, visit);
+            cur.pop();
+        }
+    }
+    pub fn iterate(&self, mut visit: impl FnMut(&Vec<EdgeIndex>, &V)) {
+        self.iterate_rec(&mut vec![], &mut visit)
+    }
+
+    pub fn remove_prefixes<U>(&mut self, prefixes: &PathsMap<U>) {
+
prefixes.iterate(|prefix, _v| { + self.remove_prefix(prefix); + }); + } + pub fn remove_edges(&mut self, forbidden_edges: &HashSet) { + let mut i = 0; + while i < self.nodes.len() { + let should_remove = if forbidden_edges.contains(&self.nodes[i].0) { + true + } else if !self.nodes[i].1.nodes.is_empty() { + self.nodes[i].1.remove_edges(forbidden_edges); + self.nodes[i].1.nodes.is_empty() + } else { + false + }; + if should_remove { + self.nodes.remove(i); + } else { + i += 1; + } + } + } + pub fn remove_edge(&mut self, forbidden_edge: &EdgeIndex) { + let mut i = 0; + while i < self.nodes.len() { + let should_remove = if &self.nodes[i].0 == forbidden_edge { + true + } else if !self.nodes[i].1.nodes.is_empty() { + self.nodes[i].1.remove_edge(forbidden_edge); + self.nodes[i].1.nodes.is_empty() + } else { + false + }; + if should_remove { + self.nodes.remove(i); + } else { + i += 1; + } + } + } + pub fn remove_prefix(&mut self, forbidden_prefix: &[EdgeIndex]) { + let [first_edge, remaining_prefix @ ..] = forbidden_prefix else { + self.nodes.clear(); + self.value = None; + return; + }; + + let mut i = 0; + while i < self.nodes.len() { + let edge = self.nodes[i].0; + let should_remove = if edge == *first_edge { + self.nodes[i].1.remove_prefix(remaining_prefix); + self.nodes[i].1.nodes.is_empty() + } else { + false + }; + if should_remove { + self.nodes.remove(i); + } else { + i += 1; + } + } + } + + pub fn edge_indices_after_prefix(&self, prefix: &[EdgeIndex]) -> Vec { + let [first_edge, remaining_prefix @ ..] = prefix else { + return self.nodes.iter().map(|n| n.0).collect(); + }; + for (edge, rest) in self.nodes.iter(){ + if edge == first_edge { + return rest.edge_indices_after_prefix(remaining_prefix); + } + } + vec![] + } + + pub fn contains_prefix_of_path(&self, path: &[EdgeIndex]) -> bool { + if self.value.is_some() { + return true + } + match path { + [] => { + false + } + [first_edge, remaining_path @ ..] 
=> { + for (edge, rest) in self.nodes.iter(){ + if edge == first_edge { + return rest.contains_prefix_of_path(remaining_path); + } + } + false + } + } + } + + pub fn graphviz(&self, graph: &RankingRuleGraph) -> String { + let mut desc = String::new(); + desc.push_str("digraph G {\n"); + self.graphviz_rec(&mut desc, vec![], graph); + desc.push_str("\n}\n"); + desc + } + fn graphviz_rec(&self, desc: &mut String, path_from: Vec, graph: &RankingRuleGraph) { + let id_from = { + let mut h = DefaultHasher::new(); + path_from.hash(&mut h); + h.finish() + }; + for (edge_idx, rest) in self.nodes.iter() { + let Some(Edge { from_node, to_node, cost, details }) = graph.get_edge(*edge_idx).as_ref() else { + continue; + }; + let mut path_to = path_from.clone(); + path_to.push({ + let mut h = DefaultHasher::new(); + edge_idx.hash(&mut h); + h.finish() + }); + let id_to = { + let mut h = DefaultHasher::new(); + path_to.hash(&mut h); + h.finish() + }; + writeln!(desc, "{id_to} [label = \"{from_node}→{to_node} [{cost}]\"];").unwrap(); + writeln!(desc, "{id_from} -> {id_to};").unwrap(); + + rest.graphviz_rec(desc, path_to, graph); + } + } +} + +impl RankingRuleGraph { + + pub fn graphviz_with_path(&self, path: &Path) -> String { + let mut desc = String::new(); + desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n"); + + for (node_idx, node) in self.query_graph.nodes.iter().enumerate() { + if matches!(node, QueryNode::Deleted) { + continue; + } + desc.push_str(&format!("{node_idx} [label = {:?}]", node)); + if node_idx == self.query_graph.root_node { + desc.push_str("[color = blue]"); + } else if node_idx == self.query_graph.end_node { + desc.push_str("[color = red]"); + } + desc.push_str(";\n"); + } + + for (edge_idx, edge) in self.all_edges.iter().enumerate() { + let Some(edge) = edge else { continue }; + let Edge { from_node, to_node, cost, details } = edge; + let color = if path.edges.contains(&EdgeIndex(edge_idx)) { + "red" + } else { + "green" + }; + match &edge.details { + EdgeDetails::Unconditional => { + desc.push_str(&format!( + "{from_node} -> {to_node} [label = \"cost {cost}\", color = {color}];\n", + cost = edge.cost, + )); + } + EdgeDetails::Data(details) => { + desc.push_str(&format!( + "{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n", + cost = edge.cost, + edge_label = G::edge_details_dot_label(details), + )); + } + } + } + + desc.push('}'); + desc + } + +} + +#[cfg(test)] +mod tests { + use super::PathsMap; + use crate::db_snap; + use crate::index::tests::TempIndex; + use crate::new::db_cache::DatabaseCache; + use crate::new::ranking_rule_graph::cheapest_paths::KCheapestPathsState; + use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; + use crate::new::ranking_rule_graph::proximity::ProximityGraph; + use crate::new::ranking_rule_graph::{RankingRuleGraph, EdgeIndex}; + use crate::search::new::query_term::{word_derivations, LocatedQueryTerm}; + use crate::search::new::QueryGraph; + use charabia::Tokenize; + + #[test] + fn paths_tree() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + index + .update_settings(|s| { + s.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "text": "0 1 2 3 4 5" + }, + { + "text": "0 a 1 b 2 3 4 5" + }, + { + "text": "0 a 1 b 3 a 4 b 5" + }, + { + "text": "0 a a 1 b 2 3 4 5" + }, + { + "text": "0 a a a a 1 b 3 45" + }, + ])) + .unwrap(); + + db_snap!(index, word_pair_proximity_docids, 
@"679d1126b569b3e8b10dd937c3faedf9"); + + let txn = index.read_txn().unwrap(); + let mut db_cache = DatabaseCache::default(); + let fst = index.words_fst(&txn).unwrap(); + let query = + LocatedQueryTerm::from_query("0 1 2 3 4 5".tokenize(), None, |word, is_prefix| { + word_derivations(&index, &txn, word, if word.len() < 3 { + 0 + } else if word.len() < 6 { + 1 + } else { + 2 + },is_prefix, &fst) + }) + .unwrap(); + let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); + let empty_paths_cache = EmptyPathsCache::default(); + let mut db_cache = DatabaseCache::default(); + + let mut prox_graph = + RankingRuleGraph::::build(&index, &txn, &mut db_cache, graph).unwrap(); + + println!("{}", prox_graph.graphviz()); + + let mut state = KCheapestPathsState::new(&prox_graph).unwrap(); + + let mut path_tree = PathsMap::default(); + while state.next_cost() <= 6 { + let next_state = state.compute_paths_of_next_lowest_cost(&mut prox_graph, &empty_paths_cache, &mut path_tree); + if let Some(next_state) = next_state { + state = next_state; + } else { + break; + } + } + + let desc = path_tree.graphviz(&prox_graph); + println!("{desc}"); + + // let path = vec![EdgeIndex { from: 0, to: 2, edge_idx: 0 }, EdgeIndex { from: 2, to: 3, edge_idx: 0 }, EdgeIndex { from: 3, to: 4, edge_idx: 0 }, EdgeIndex { from: 4, to: 5, edge_idx: 0 }, EdgeIndex { from: 5, to: 8, edge_idx: 0 }, EdgeIndex { from: 8, to: 1, edge_idx: 0 }, EdgeIndex { from: 1, to: 10, edge_idx: 0 }]; + // println!("{}", psath_tree.contains_prefix_of_path(&path)); + + + // let path = vec![EdgeIndex { from: 0, to: 2, edge_idx: 0 }, EdgeIndex { from: 2, to: 3, edge_idx: 0 }, EdgeIndex { from: 3, to: 4, edge_idx: 0 }, EdgeIndex { from: 4, to: 5, edge_idx: 0 }, EdgeIndex { from: 5, to: 6, edge_idx: 0 }, EdgeIndex { from: 6, to: 7, edge_idx: 0 }, EdgeIndex { from: 7, to: 1, edge_idx: 0 }]; + + + // path_tree.iterate(|path, cost| { + // println!("cost {cost} for path: {path:?}"); + // }); + + // path_tree.remove_forbidden_prefix(&[ + // EdgeIndex { from: 0, to: 2, edge_idx: 0 }, + // EdgeIndex { from: 2, to: 3, edge_idx: 2 }, + // ]); + // let desc = path_tree.graphviz(); + // println!("{desc}"); + + // path_tree.remove_forbidden_edge(&EdgeIndex { from: 5, to: 6, cost: 1 }); + + // let desc = path_tree.graphviz(); + // println!("AFTER REMOVING 5-6 [1]:\n{desc}"); + + // path_tree.remove_forbidden_edge(&EdgeIndex { from: 3, to: 4, cost: 1 }); + + // let desc = path_tree.graphviz(); + // println!("AFTER REMOVING 3-4 [1]:\n{desc}"); + + // let p = path_tree.remove_first(); + // println!("PATH: {p:?}"); + // let desc = path_tree.graphviz(); + // println!("AFTER REMOVING: {desc}"); + + // let p = path_tree.remove_first(); + // println!("PATH: {p:?}"); + // let desc = path_tree.graphviz(); + // println!("AFTER REMOVING: {desc}"); + + // path_tree.remove_all_containing_edge(&EdgeIndex { from: 5, to: 6, cost: 2 }); + + // let desc = path_tree.graphviz(); + // println!("{desc}"); + + // let first_edges = path_tree.remove_first().unwrap(); + // println!("{first_edges:?}"); + // let desc = path_tree.graphviz(); + // println!("{desc}"); + + // let first_edges = path_tree.remove_first().unwrap(); + // println!("{first_edges:?}"); + // let desc = path_tree.graphviz(); + // println!("{desc}"); + + // let first_edges = path_tree.remove_first().unwrap(); + // println!("{first_edges:?}"); + // let desc = path_tree.graphviz(); + // println!("{desc}"); + + // println!("{path_tree:?}"); + } + + + #[test] + fn test_contains_prefix_of_path() { + + } +} 
From 23bf572dea9dfbac3bf5d0bdd4672851c4dd8c4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 21 Feb 2023 09:47:23 +0100
Subject: [PATCH 011/234] Introduce cache structures used with ranking rule
 graphs

---
 .../ranking_rule_graph/edge_docids_cache.rs   | 55 +++++++++++++++++++
 .../ranking_rule_graph/empty_paths_cache.rs   | 23 ++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
 create mode 100644 milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs

diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
new file mode 100644
index 000000000..301810847
--- /dev/null
+++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
@@ -0,0 +1,55 @@
+use std::collections::HashMap;
+use std::marker::PhantomData;
+
+use heed::RoTxn;
+use roaring::RoaringBitmap;
+
+use super::{EdgeDetails, EdgeIndex, RankingRuleGraph, RankingRuleGraphTrait};
+use crate::new::db_cache::DatabaseCache;
+use crate::new::BitmapOrAllRef;
+use crate::{Index, Result};
+
+pub struct EdgeDocidsCache<G: RankingRuleGraphTrait> {
+    pub cache: HashMap<EdgeIndex, RoaringBitmap>,
+
+    // TODO: There is a big difference between `cache`, which is always valid, and
+    // `empty_path_prefixes`, which is only accurate for a particular universe
+    // ALSO, we should have a universe-specific `empty_edge` to use
+    // pub empty_path_prefixes: HashSet<Vec<EdgeIndex>>,
+    _phantom: PhantomData<G>,
+}
+impl<G: RankingRuleGraphTrait> Default for EdgeDocidsCache<G> {
+    fn default() -> Self {
+        Self {
+            cache: Default::default(),
+            // empty_path_prefixes: Default::default(),
+            _phantom: Default::default(),
+        }
+    }
+}
+impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
+    pub fn get_edge_docids<'s, 'transaction>(
+        &'s mut self,
+        index: &Index,
+        txn: &'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+        edge_index: &EdgeIndex,
+        graph: &RankingRuleGraph<G>,
+    ) -> Result<BitmapOrAllRef<'s>> {
+        if self.cache.contains_key(edge_index) {
+            return Ok(BitmapOrAllRef::Bitmap(&self.cache[edge_index]));
+        }
+        let edge = graph.get_edge(*edge_index).as_ref().unwrap();
+
+        match &edge.details {
+            EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All),
+            EdgeDetails::Data(details) => {
+                let docids = G::compute_docids(index, txn, db_cache, details)?;
+
+                let _ = self.cache.insert(*edge_index, docids);
+                let docids = &self.cache[edge_index];
+                Ok(BitmapOrAllRef::Bitmap(docids))
+            }
+        }
+    }
+}
diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs
new file mode 100644
index 000000000..989a08a0d
--- /dev/null
+++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs
@@ -0,0 +1,23 @@
+use std::collections::HashSet;
+
+use super::{paths_map::PathsMap, EdgeIndex};
+
+#[derive(Default)]
+pub struct EmptyPathsCache {
+    pub empty_edges: HashSet<EdgeIndex>,
+    pub empty_prefixes: PathsMap<()>,
+}
+impl EmptyPathsCache {
+    pub fn path_is_empty(&self, path: &[EdgeIndex]) -> bool {
+        for edge in path {
+            // TODO: should be a bitmap intersection
+            if self.empty_edges.contains(edge) {
+                return true;
+            }
+        }
+        if self.empty_prefixes.contains_prefix_of_path(path) {
+            return true;
+        }
+        false
+    }
+}

From 48aae76b154d6e7014dc0f2fb3b689c28ae45990 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 21 Feb 2023 09:48:05 +0100
Subject: [PATCH 012/234] Introduce a function to find the docids of a set of
 paths in a graph

---
 .../new/ranking_rule_graph/resolve_paths.rs   | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 milli/src/search/new/ranking_rule_graph/resolve_paths.rs

diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs
new file mode 100644
index 000000000..76823a32a
--- /dev/null
+++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs
@@ -0,0 +1,81 @@
+#![allow(clippy::too_many_arguments)]
+
+use heed::RoTxn;
+use roaring::{MultiOps, RoaringBitmap};
+
+use super::edge_docids_cache::EdgeDocidsCache;
+use super::empty_paths_cache::EmptyPathsCache;
+use super::paths_map::PathsMap;
+use super::{RankingRuleGraph, RankingRuleGraphTrait};
+use crate::new::db_cache::DatabaseCache;
+use crate::new::ranking_rule_graph::Edge;
+use crate::new::BitmapOrAllRef;
+use crate::{Index, Result};
+
+impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
+    pub fn resolve_paths<'transaction>(
+        &mut self,
+        index: &Index,
+        txn: &'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+        edge_docids_cache: &mut EdgeDocidsCache<G>,
+        empty_paths_cache: &mut EmptyPathsCache,
+        universe: &RoaringBitmap,
+        mut paths: PathsMap<u64>,
+    ) -> Result<RoaringBitmap> {
+        let mut path_bitmaps = vec![];
+
+        paths.remove_edges(&empty_paths_cache.empty_edges);
+        paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
+
+        'path_loop: while let Some((edge_indexes, _)) = paths.remove_first() {
+            // if path is excluded, continue...
+            let mut processed_edges = vec![];
+            let mut path_bitmap = universe.clone();
+            'edge_loop: for edge_index in edge_indexes {
+                processed_edges.push(edge_index);
+                let edge_docids =
+                    edge_docids_cache.get_edge_docids(index, txn, db_cache, &edge_index, self)?;
+                match edge_docids {
+                    BitmapOrAllRef::Bitmap(edge_docids) => {
+                        if edge_docids.is_disjoint(universe) {
+                            // 1. Store in the cache that this edge is empty for this universe
+                            empty_paths_cache.empty_edges.insert(edge_index);
+                            // 2. remove all the paths that contain this edge for this universe
+                            paths.remove_edge(&edge_index);
+                            // 3. remove this edge from the proximity graph
+
+                            self.remove_edge(edge_index);
+
+                            // 4. continue executing this function again on the remaining paths
+                            continue 'path_loop;
+                        } else {
+                            path_bitmap &= edge_docids;
+                            if path_bitmap.is_disjoint(universe) {
+                                // 1. Store in the cache that this prefix is empty for this universe
+                                empty_paths_cache
+                                    .empty_prefixes
+                                    .insert(processed_edges.iter().copied(), ());
+                                // 2. remove all the paths beginning with this prefix
+                                paths.remove_prefix(&processed_edges);
+                                // 3. continue executing this function again on the remaining paths?
+                                continue 'path_loop;
+                            }
+                        }
+                    }
+                    BitmapOrAllRef::All => continue 'edge_loop,
+                }
+            }
+            path_bitmaps.push(path_bitmap);
+        }
+        let docids = MultiOps::union(path_bitmaps);
+        Ok(docids)
+        // for each path, translate it to an intersection of cached roaring bitmaps
+        // then do a union for all paths
+
+        // get the docids of the given paths in the proximity graph
+        // in the fastest possible way
+        // 1. roaring MultiOps (before we can do the Frozen+AST thing)
+        // 2. minimize number of operations
+    }
+}

From a70ab8b0724d129d0c389fb854118dc316d7c07d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 21 Feb 2023 09:48:30 +0100
Subject: [PATCH 013/234] Introduce a function to find the K shortest paths in
 a graph

---
 .../new/ranking_rule_graph/cheapest_paths.rs  | 251 ++++++++++++++++++
 1 file changed, 251 insertions(+)
 create mode 100644 milli/src/search/new/ranking_rule_graph/cheapest_paths.rs

diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
new file mode 100644
index 000000000..3bd43fd6f
--- /dev/null
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -0,0 +1,251 @@
+use std::collections::{BTreeMap, HashSet};
+
+use itertools::Itertools;
+
+use super::{
+    empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, EdgeIndex, RankingRuleGraph,
+    RankingRuleGraphTrait,
+};
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct Path {
+    pub edges: Vec<EdgeIndex>,
+    pub cost: u64,
+}
+
+struct DijkstraState {
+    unvisited: HashSet<usize>, // should be a small bitset
+    distances: Vec<u64>,       // or binary heap (f64, usize)
+    edges: Vec<EdgeIndex>,
+    edge_costs: Vec<u8>,
+    paths: Vec<Option<usize>>,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct PathEdgeId<Id> {
+    pub from: usize,
+    pub to: usize,
+    pub id: Id,
+}
+
+pub struct KCheapestPathsState {
+    cheapest_paths: PathsMap<u64>,
+    potential_cheapest_paths: BTreeMap<u64, PathsMap<u64>>,
+    pub kth_cheapest_path: Path,
+}
+
+impl KCheapestPathsState {
+    pub fn next_cost(&self) -> u64 {
+        self.kth_cheapest_path.cost
+    }
+
+    pub fn new<G: RankingRuleGraphTrait>(
+        graph: &RankingRuleGraph<G>,
+    ) -> Option<Self> {
+        let Some(cheapest_path) = graph.cheapest_path_to_end(graph.query_graph.root_node) else {
+            return None
+        };
+        let cheapest_paths = PathsMap::from_paths(&[cheapest_path.clone()]);
+        let potential_cheapest_paths = BTreeMap::new();
+        Some(KCheapestPathsState {
+            cheapest_paths,
+            potential_cheapest_paths,
+            kth_cheapest_path: cheapest_path,
+        })
+    }
+
+    pub fn remove_empty_paths(mut self, empty_paths_cache: &EmptyPathsCache) -> Option<Self> {
+        self.cheapest_paths.remove_edges(&empty_paths_cache.empty_edges);
+        self.cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
+
+        let mut costs_to_delete = HashSet::new();
+        for (cost, potential_cheapest_paths) in self.potential_cheapest_paths.iter_mut() {
+            potential_cheapest_paths.remove_edges(&empty_paths_cache.empty_edges);
+            potential_cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
+            if potential_cheapest_paths.is_empty() {
+                costs_to_delete.insert(*cost);
+            }
+        }
+        for cost in costs_to_delete {
+            self.potential_cheapest_paths.remove(&cost);
+        }
+
+        if self.cheapest_paths.is_empty() {}
+
+        todo!()
+    }
+
+    pub fn compute_paths_of_next_lowest_cost<G: RankingRuleGraphTrait>(
+        mut self,
+        graph: &mut RankingRuleGraph<G>,
+        empty_paths_cache: &EmptyPathsCache,
+        into_map: &mut PathsMap<u64>,
+    ) -> Option<Self> {
+        into_map.add_path(&self.kth_cheapest_path);
+        let cur_cost = self.kth_cheapest_path.cost;
+        while self.kth_cheapest_path.cost <= cur_cost {
+            if let Some(next_self) = self.compute_next_cheapest_paths(graph, empty_paths_cache) {
+                self = next_self;
+                if self.kth_cheapest_path.cost == cur_cost {
+                    into_map.add_path(&self.kth_cheapest_path);
+                }
+            } else {
+                return None;
+            }
+        }
+        Some(self)
+    }
+
+    // TODO: use the cache to potentially remove edges that return an empty RoaringBitmap
+    // TODO: return an Option<&'self Path>?
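+    // Note: the routine below follows the general shape of Yen's algorithm
+    // for the K shortest paths. Every node of the current kth cheapest path
+    // is taken as a "spur node"; the edges used by already-found paths that
+    // share the same root prefix are removed temporarily, the cheapest "spur
+    // path" from the spur node to the end node is computed, and root path +
+    // spur path is recorded as a candidate for the (k+1)th cheapest path.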
+    fn compute_next_cheapest_paths<G: RankingRuleGraphTrait>(
+        mut self,
+        graph: &mut RankingRuleGraph<G>,
+        empty_paths_cache: &EmptyPathsCache,
+    ) -> Option<Self> {
+        // for all nodes in the last cheapest path (called spur_node), except last one...
+        for (i, edge_idx) in self.kth_cheapest_path.edges[..self.kth_cheapest_path.edges.len() - 1]
+            .iter()
+            .enumerate()
+        {
+            let Some(edge) = graph.all_edges[edge_idx.0].as_ref() else { continue; };
+            let Edge { from_node: spur_node, .. } = edge;
+
+            // TODO:
+            // Here, check that the root path is not discarded by the empty_paths_cache
+            // If it is, then continue to the next spur_node
+            let root_path = &self.kth_cheapest_path.edges[..i];
+            if empty_paths_cache.path_is_empty(root_path) {
+                continue;
+            }
+
+            let root_cost = root_path
+                .iter()
+                .fold(0, |sum, next| sum + graph.get_edge(*next).as_ref().unwrap().cost as u64);
+
+            let mut tmp_removed_edges = vec![];
+            // for all the paths already found that share a common prefix with the root path
+            // we delete the edge from the spur node to the next one
+            for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) {
+                let was_removed = graph.node_edges[*spur_node].remove(&edge_index_to_remove.0);
+                if was_removed {
+                    tmp_removed_edges.push(edge_index_to_remove.0);
+                }
+            }
+
+            // Compute the cheapest path from the spur node to the destination
+            // we will combine it with the root path to get a potential kth cheapest path
+            let spur_path = graph.cheapest_path_to_end(*spur_node);
+            // restore the temporarily removed edges
+            graph.node_edges[*spur_node].extend(tmp_removed_edges);
+
+            let Some(spur_path) = spur_path else { continue; };
+            let total_cost = root_cost + spur_path.cost;
+            let total_path = Path {
+                edges: root_path.iter().chain(spur_path.edges.iter()).cloned().collect(),
+                cost: total_cost,
+            };
+            let entry = self.potential_cheapest_paths.entry(total_cost).or_default();
+            entry.add_path(&total_path);
+        }
+        while let Some(mut next_cheapest_paths_entry) = self.potential_cheapest_paths.first_entry()
+        {
+            // This could be implemented faster
+            // Here, maybe I should filter the potential cheapest paths so that they
+            // don't contain any removed edge?
+
+            let cost = *next_cheapest_paths_entry.key();
+            let next_cheapest_paths = next_cheapest_paths_entry.get_mut();
+
+            while let Some((next_cheapest_path, cost2)) = next_cheapest_paths.remove_first() {
+                assert_eq!(cost, cost2);
+                if next_cheapest_path
+                    .iter()
+                    .any(|edge_index| graph.all_edges.get(edge_index.0).is_none())
+                {
+                    continue;
+                } else {
+                    self.cheapest_paths.insert(next_cheapest_path.iter().copied(), cost);
+
+                    if next_cheapest_paths.is_empty() {
+                        next_cheapest_paths_entry.remove();
+                    }
+                    self.kth_cheapest_path = Path { edges: next_cheapest_path, cost };
+
+                    return Some(self);
+                }
+            }
+            let _ = next_cheapest_paths_entry.remove_entry();
+        }
+        None
+    }
+}
+
+impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
+    fn cheapest_path_to_end(&self, from: usize) -> Option<Path> {
+        let mut dijkstra = DijkstraState {
+            unvisited: (0..self.query_graph.nodes.len()).collect(),
+            distances: vec![u64::MAX; self.query_graph.nodes.len()],
+            edges: vec![EdgeIndex(usize::MAX); self.query_graph.nodes.len()],
+            edge_costs: vec![u8::MAX; self.query_graph.nodes.len()],
+            paths: vec![None; self.query_graph.nodes.len()],
+        };
+        dijkstra.distances[from] = 0;
+
+        // TODO: could use a binary heap here to store the distances
+        while let Some(&cur_node) =
+            dijkstra.unvisited.iter().min_by_key(|&&n| dijkstra.distances[n])
+        {
+            let cur_node_dist = dijkstra.distances[cur_node];
+            if cur_node_dist == u64::MAX {
+                return None;
+            }
+            if cur_node == self.query_graph.end_node {
+                break;
+            }
+
+            let succ_cur_node: HashSet<_> = self.node_edges[cur_node]
+                .iter()
+                .map(|e| self.all_edges[*e].as_ref().unwrap().to_node)
+                .collect();
+            // TODO: this intersection may be slow but shouldn't be,
+            // can use a bitmap intersection instead
+            let unvisited_succ_cur_node = succ_cur_node.intersection(&dijkstra.unvisited);
+            for &succ in unvisited_succ_cur_node {
+                let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(cur_node, succ) else {
+                    continue
+                };
+
+                // println!("cur node dist {cur_node_dist}");
+                let old_dist_succ = &mut dijkstra.distances[succ];
+                let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64;
+                if new_potential_distance < *old_dist_succ {
+                    *old_dist_succ = new_potential_distance;
+                    dijkstra.edges[succ] = cheapest_edge;
+                    dijkstra.edge_costs[succ] = cheapest_edge_cost;
+                    dijkstra.paths[succ] = Some(cur_node);
+                }
+            }
+            dijkstra.unvisited.remove(&cur_node);
+        }
+
+        let mut cur = self.query_graph.end_node;
+        // let mut edge_costs = vec![];
+        // let mut distances = vec![];
+        let mut path_edges = vec![];
+        while let Some(n) = dijkstra.paths[cur] {
+            path_edges.push(dijkstra.edges[cur]);
+            cur = n;
+        }
+        path_edges.reverse();
+        Some(Path { edges: path_edges, cost: dijkstra.distances[self.query_graph.end_node] })
+    }
+
+    // TODO: this implementation is VERY fragile, as we assume that the edges are ordered by cost
+    // already. Change it.
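+    // In the meantime, note that `visit_edges` breaks on the first matching
+    // edge, so the function below only returns the cheapest edge as long as
+    // `node_edges` yields the edges between two nodes in increasing cost order.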
+ pub fn cheapest_edge(&self, cur_node: usize, succ: usize) -> Option<(EdgeIndex, u8)> { + self.visit_edges(cur_node, succ, |edge_idx, edge| { + std::ops::ControlFlow::Break((edge_idx, edge.cost)) + }) + } +} From c64585352940fae26a6c75de5750459bf53fd499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:48:49 +0100 Subject: [PATCH 014/234] Introduce a generic graph-based ranking rule --- .../search/new/graph_based_ranking_rule.rs | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 milli/src/search/new/graph_based_ranking_rule.rs diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs new file mode 100644 index 000000000..0f72b9d5d --- /dev/null +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -0,0 +1,166 @@ +use heed::RoTxn; +use roaring::RoaringBitmap; + +use crate::{ + new::ranking_rule_graph::cheapest_paths::{self, Path}, + Index, Result, +}; + +use super::{ + db_cache::DatabaseCache, + ranking_rule_graph::{ + cheapest_paths::KCheapestPathsState, edge_docids_cache::EdgeDocidsCache, + empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, RankingRuleGraph, + RankingRuleGraphTrait, + }, + QueryGraph, RankingRule, RankingRuleOutput, +}; + +pub struct GraphBasedRankingRule { + state: Option>, +} +impl Default for GraphBasedRankingRule { + fn default() -> Self { + Self { state: None } + } +} + +pub struct GraphBasedRankingRuleState { + graph: RankingRuleGraph, + cheapest_paths_state: Option, + edge_docids_cache: EdgeDocidsCache, + empty_paths_cache: EmptyPathsCache, +} + +impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGraph> + for GraphBasedRankingRule +{ + fn start_iteration( + &mut self, + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + universe: &RoaringBitmap, + query_graph: &QueryGraph, + ) -> Result<()> { + // if let Some(state) = &mut self.state { + // // TODO: update the previous state + // // TODO: update the existing graph incrementally, based on a diff + + // } else { + let graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?; + // println!("Initialized Proximity Ranking Rule."); + // println!("GRAPH:"); + // let graphviz = graph.graphviz(); + // println!("{graphviz}"); + + let cheapest_paths_state = KCheapestPathsState::new(&graph); + let state = GraphBasedRankingRuleState { + graph, + cheapest_paths_state, + edge_docids_cache: <_>::default(), + empty_paths_cache: <_>::default(), + }; + + // let desc = state.graph.graphviz_with_path( + // &state.cheapest_paths_state.as_ref().unwrap().kth_cheapest_path.clone(), + // ); + // println!("Cheapest path: {desc}"); + + self.state = Some(state); + // } + + Ok(()) + } + + fn next_bucket( + &mut self, + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + universe: &RoaringBitmap, + ) -> Result>> { + assert!(universe.len() > 1); + let mut state = self.state.take().unwrap(); + + let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else { + return Ok(None); + }; + // println!("Proximity: Next Bucket"); + + let mut paths = PathsMap::default(); + + // let desc = state.graph.dot_description_with_path(&cheapest_paths_state.kth_cheapest_path); + // println!("CHeapest Path: {desc}"); + // TODO: when does it return None? -> when there is no cheapest path + // How to handle it? -> ... return all document ids from the universe? 
+ // + // TODO: Give an empty_edge and empty_prefix argument to the + // compute_paths_of_next_lowest_cost function + if let Some(next_cheapest_paths_state) = cheapest_paths_state + .compute_paths_of_next_lowest_cost( + &mut state.graph, + &state.empty_paths_cache, + &mut paths, + ) + { + state.cheapest_paths_state = Some(next_cheapest_paths_state); + } else { + state.cheapest_paths_state = None; + // If returns None if there are no longer any paths to compute + // BUT! paths_map may not be empty, and we need to compute the current bucket still + } + + // println!("PATHS: {}", paths.graphviz(&state.graph)); + + // paths.iterate(|path, cost| { + // let desc = state.graph.graphviz_with_path(&Path { edges: path.clone(), cost: *cost }); + // println!("Path to resolve of cost {cost}: {desc}"); + // }); + + // let desc = state.graph.dot_description_with_path( + // &state.cheapest_paths_state.as_ref().unwrap().kth_cheapest_path.clone(), + // ); + // println!("Cheapest path: {desc}"); + + // TODO: verify that this is correct + // If the paths are empty, we should probably return the universe? + // BUT! Is there a case where the paths are empty AND the universe is + // not empty? + if paths.is_empty() { + self.state = None; + return Ok(None); + } + // Here, log all the paths? + + let bucket = state.graph.resolve_paths( + index, + txn, + db_cache, + &mut state.edge_docids_cache, + &mut state.empty_paths_cache, + universe, + paths, + )?; + // The call above also updated the graph such that it doesn't contain the empty edges anymore. + // println!("Resolved all the paths: {bucket:?} from universe {:?}", state.universe); + // let graphviz = state.graph.graphviz(); + // println!("{graphviz}"); + + let next_query_graph = state.graph.query_graph.clone(); + + self.state = Some(state); + + Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket })) + } + + fn end_iteration( + &mut self, + _index: &Index, + _txn: &'transaction RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, + ) { + // println!("PROXIMITY: end iteration"); + self.state = None; + } +} From 89d696c1e309c136d29d6c5afe57d84f8cd44ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:49:05 +0100 Subject: [PATCH 015/234] Introduce the proximity ranking rule as a graph-based ranking rule --- .../new/ranking_rule_graph/proximity/build.rs | 165 ++++++++++++++++++ .../proximity/compute_docids.rs | 31 ++++ .../new/ranking_rule_graph/proximity/mod.rs | 61 +++++++ 3 files changed, 257 insertions(+) create mode 100644 milli/src/search/new/ranking_rule_graph/proximity/build.rs create mode 100644 milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs create mode 100644 milli/src/search/new/ranking_rule_graph/proximity/mod.rs diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs new file mode 100644 index 000000000..07ec3bb5e --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -0,0 +1,165 @@ +use std::collections::BTreeMap; + +use super::ProximityEdge; +use crate::new::db_cache::DatabaseCache; +use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use crate::new::ranking_rule_graph::proximity::WordPair; +use crate::new::ranking_rule_graph::{Edge, EdgeDetails}; +use crate::new::QueryNode; +use crate::{Index, Result}; +use heed::RoTxn; +use itertools::Itertools; + +pub fn visit_from_node(from_node: &QueryNode) -> Result> { + Ok(Some(match from_node { + 
QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => { + match value1 { + QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()), + QueryTerm::Phrase(phrase1) => { + // TODO: remove second unwrap + let original = phrase1.last().unwrap().as_ref().unwrap().clone(); + ( + WordDerivations { + original: original.clone(), + zero_typo: vec![original], + one_typo: vec![], + two_typos: vec![], + use_prefix_db: false, + }, + *pos1.end(), + ) + } + } + } + QueryNode::Start => ( + WordDerivations { + original: String::new(), + zero_typo: vec![], + one_typo: vec![], + two_typos: vec![], + use_prefix_db: false, + }, + -100, + ), + _ => return Ok(None), + })) +} + +pub fn visit_to_node<'transaction, 'from_data>( + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + to_node: &QueryNode, + from_node_data: &'from_data (WordDerivations, i8), +) -> Result)>>> { + let (derivations1, pos1) = from_node_data; + let term2 = match &to_node { + QueryNode::End => return Ok(Some(vec![(0, EdgeDetails::Unconditional)])), + QueryNode::Deleted | QueryNode::Start => return Ok(None), + QueryNode::Term(term) => term, + }; + let LocatedQueryTerm { value: value2, positions: pos2 } = term2; + + let (derivations2, pos2, ngram_len2) = match value2 { + QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()), + QueryTerm::Phrase(phrase2) => { + // TODO: remove second unwrap + let original = phrase2.last().unwrap().as_ref().unwrap().clone(); + ( + WordDerivations { + original: original.clone(), + zero_typo: vec![original], + one_typo: vec![], + two_typos: vec![], + use_prefix_db: false, + }, + *pos2.start(), + 1, + ) + } + }; + + // TODO: here we would actually do it for each combination of word1 and word2 + // and take the union of them + if pos1 + 1 != pos2 { + // TODO: how should this actually be handled? + // We want to effectively ignore this pair of terms + // Unconditionally walk through the edge without computing the docids + // But also what should the cost be? + return Ok(Some(vec![(0, EdgeDetails::Unconditional)])); + } + + let updb1 = derivations1.use_prefix_db; + let updb2 = derivations2.use_prefix_db; + + // left term cannot be a prefix + assert!(!updb1); + + let derivations1 = derivations1.all_derivations_except_prefix_db(); + let original_word_2 = derivations2.original.clone(); + let mut cost_proximity_word_pairs = BTreeMap::>>::new(); + + if updb2 { + for word1 in derivations1.clone() { + for proximity in 0..(7 - ngram_len2) { + let cost = (proximity + ngram_len2 - 1) as u8; + if db_cache + .get_word_prefix_pair_proximity_docids( + index, + txn, + word1, + original_word_2.as_str(), + proximity as u8, + )? + .is_some() + { + cost_proximity_word_pairs + .entry(cost) + .or_default() + .entry(proximity as u8) + .or_default() + .push(WordPair::WordPrefix { + left: word1.to_owned(), + right_prefix: original_word_2.to_owned(), + }); + } + } + } + } + + let derivations2 = derivations2.all_derivations_except_prefix_db(); + // TODO: safeguard in case the cartesian product is too large? + let product_derivations = derivations1.cartesian_product(derivations2); + + for (word1, word2) in product_derivations { + for proximity in 0..(7 - ngram_len2) { + let cost = (proximity + ngram_len2 - 1) as u8; + // TODO: do the opposite way with a proximity penalty as well! + // search for (word2, word1, proximity-1), I guess? + if db_cache + .get_word_pair_proximity_docids(index, txn, word1, word2, proximity as u8)? 
+ .is_some() + { + cost_proximity_word_pairs + .entry(cost) + .or_default() + .entry(proximity as u8) + .or_default() + .push(WordPair::Words { left: word1.to_owned(), right: word2.to_owned() }); + } + } + } + let mut new_edges = cost_proximity_word_pairs + .into_iter() + .flat_map(|(cost, proximity_word_pairs)| { + let mut edges = vec![]; + for (proximity, word_pairs) in proximity_word_pairs { + edges + .push((cost, EdgeDetails::Data(ProximityEdge { pairs: word_pairs, proximity }))) + } + edges + }) + .collect::>(); + new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeDetails::Unconditional)); + Ok(Some(new_edges)) +} diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs new file mode 100644 index 000000000..325042761 --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -0,0 +1,31 @@ +use roaring::MultiOps; + +use super::{ProximityEdge, WordPair}; +use crate::new::db_cache::DatabaseCache; +use crate::CboRoaringBitmapCodec; + +pub fn compute_docids<'transaction>( + index: &crate::Index, + txn: &'transaction heed::RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + edge: &ProximityEdge, +) -> crate::Result { + let ProximityEdge { pairs, proximity } = edge; + // TODO: we should know already which pair of words to look for + let mut pair_docids = vec![]; + for pair in pairs.iter() { + let bytes = match pair { + WordPair::Words { left, right } => { + db_cache.get_word_pair_proximity_docids(index, txn, left, right, *proximity) + } + WordPair::WordPrefix { left, right_prefix } => db_cache + .get_word_prefix_pair_proximity_docids(index, txn, left, right_prefix, *proximity), + }?; + let bitmap = + bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); + pair_docids.push(bitmap); + } + pair_docids.sort_by_key(|rb| rb.len()); + let docids = MultiOps::union(pair_docids); + Ok(docids) +} diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs new file mode 100644 index 000000000..199a5eb4a --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -0,0 +1,61 @@ +pub mod build; +pub mod compute_docids; + +use super::{Edge, EdgeDetails, RankingRuleGraphTrait}; +use crate::new::db_cache::DatabaseCache; +use crate::new::query_term::WordDerivations; +use crate::new::QueryNode; +use crate::{Index, Result}; +use heed::RoTxn; + +#[derive(Debug, Clone)] +pub enum WordPair { + // TODO: add WordsSwapped and WordPrefixSwapped case + Words { left: String, right: String }, + WordPrefix { left: String, right_prefix: String }, +} + +pub struct ProximityEdge { + pairs: Vec, + proximity: u8, +} + +pub enum ProximityGraph {} + +impl RankingRuleGraphTrait for ProximityGraph { + type EdgeDetails = ProximityEdge; + type BuildVisitedFromNode = (WordDerivations, i8); + + fn edge_details_dot_label(edge: &Self::EdgeDetails) -> String { + let ProximityEdge { pairs, proximity } = edge; + format!(", prox {proximity}, {} pairs", pairs.len()) + } + + fn compute_docids<'db_cache, 'transaction>( + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + edge: &Self::EdgeDetails, + ) -> Result { + compute_docids::compute_docids(index, txn, db_cache, edge) + } + + fn build_visit_from_node<'transaction>( + _index: &Index, + _txn: &'transaction RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, + from_node: &QueryNode, + ) -> Result> { + 
build::visit_from_node(from_node) + } + + fn build_visit_to_node<'from_data, 'transaction: 'from_data>( + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + to_node: &QueryNode, + from_node_data: &'from_data Self::BuildVisitedFromNode, + ) -> Result)>>> { + build::visit_to_node(index, txn, db_cache, to_node, from_node_data) + } +} From 345c99d5bd5cd098133e627c6d8f1714bcfcd9f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:49:25 +0100 Subject: [PATCH 016/234] Introduce the words ranking rule working with the new search structures --- milli/src/search/new/words.rs | 149 ++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 milli/src/search/new/words.rs diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs new file mode 100644 index 000000000..4d812d9ff --- /dev/null +++ b/milli/src/search/new/words.rs @@ -0,0 +1,149 @@ +use std::collections::BTreeSet; + +use heed::RoTxn; +use roaring::RoaringBitmap; + +use super::db_cache::DatabaseCache; +use super::resolve_query_graph::resolve_query_graph; +use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput}; +use crate::{Index, Result, TermsMatchingStrategy}; + +pub struct Words { + exhausted: bool, + query_graph: Option, + iterating: bool, + positions_to_remove: Vec, + terms_matching_strategy: TermsMatchingStrategy, +} +impl Words { + pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self { + Self { + exhausted: true, + query_graph: None, + iterating: false, + positions_to_remove: vec![], + terms_matching_strategy, + } + } +} + +impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { + fn start_iteration( + &mut self, + _index: &Index, + _txn: &'transaction RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, + parent_candidates: &RoaringBitmap, + parent_query_graph: &QueryGraph, + ) -> Result<()> { + // println!("Words: start iteration"); + self.exhausted = false; + self.query_graph = Some(parent_query_graph.clone()); + + // TODO: a phrase can contain many positions, but represents a single node. + // That's a problem. 
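+        // With `TermsMatchingStrategy::Last`, every term position is collected
+        // into a sorted set; `next_bucket` later pops positions from the back,
+        // so the words at the end of the query are the first ones removed.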
+ let positions_to_remove = match self.terms_matching_strategy { + TermsMatchingStrategy::Last => { + let mut all_positions = BTreeSet::new(); + for n in parent_query_graph.nodes.iter() { + match n { + QueryNode::Term(term) => { + all_positions.extend(term.positions.clone().into_iter()); + } + QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} + } + } + all_positions.into_iter().collect() + } + TermsMatchingStrategy::All => vec![], + }; + // println!("positions to remove: {positions_to_remove:?}"); + self.positions_to_remove = positions_to_remove; + self.iterating = true; + Ok(()) + } + + fn next_bucket( + &mut self, + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + universe: &RoaringBitmap, + ) -> Result>> { + // println!("Words: next bucket"); + assert!(self.iterating); + assert!(universe.len() > 1); + if self.exhausted { + return Ok(None); + } + let Some(query_graph) = &mut self.query_graph else { panic!() }; + // let graphviz = query_graph.graphviz(); + // println!("\n===={graphviz}\n===="); + let this_bucket = resolve_query_graph(index, txn, db_cache, query_graph, universe)?; + // println!("WORDS: this bucket: {this_bucket:?}"); + let child_query_graph = query_graph.clone(); + // this_bucket is the one that must be returned now + // self.cur_bucket is set to the next bucket + // TODO: Check whether a position exists in the graph before removing it and + // returning the next bucket. + // while graph.does_not_contain(positions_to_remove.last()) { positions_to_remove.pop() } + if self.positions_to_remove.is_empty() { + self.exhausted = true; + } else { + let position_to_remove = self.positions_to_remove.pop().unwrap(); + query_graph.remove_words_at_position(position_to_remove); + } + + Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket })) + } + + fn end_iteration( + &mut self, + _index: &Index, + _txn: &'transaction RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, + ) { + // println!("Words: end iteration"); + self.iterating = false; + self.exhausted = true; + self.positions_to_remove = vec![]; + } +} + +#[cfg(test)] +mod tests { + // use charabia::Tokenize; + // use roaring::RoaringBitmap; + + // use crate::{ + // index::tests::TempIndex, + // search::{criteria::CriteriaBuilder, new::QueryGraphOrPlaceholder}, + // }; + + // use super::Words; + + // fn placeholder() { + // let qt = QueryGraphOrPlaceholder::Placeholder; + // let index = TempIndex::new(); + // let rtxn = index.read_txn().unwrap(); + + // let query = "a beautiful summer house by the beach overlooking what seems"; + // // let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); + // // let (qt, parts, matching_words) = builder.build(query.tokenize()).unwrap().unwrap(); + + // // let cb = CriteriaBuilder::new(&rtxn, &index).unwrap(); + // // let x = cb + // // .build( + // // Some(qt), + // // Some(parts), + // // None, + // // None, + // // false, + // // None, + // // crate::CriterionImplementationStrategy::OnlySetBased, + // // ) + // // .unwrap(); + + // // let rr = Words::new(&index, &RoaringBitmap::from_sorted_iter(0..1000)).unwrap(); + // } +} From 132191360b96c4b9249442637a85d13302f17696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:49:43 +0100 Subject: [PATCH 017/234] Introduce the sort ranking rule working with the new search structures --- milli/src/search/new/sort.rs | 118 +++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 
milli/src/search/new/sort.rs diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs new file mode 100644 index 000000000..9a48a49e7 --- /dev/null +++ b/milli/src/search/new/sort.rs @@ -0,0 +1,118 @@ +use heed::RoTxn; +use roaring::RoaringBitmap; + +use super::{ + db_cache::DatabaseCache, RankingRule, RankingRuleOutput, RankingRuleOutputIter, + RankingRuleOutputIterWrapper, RankingRuleQueryTrait, +}; +use crate::{ + // facet::FacetType, + heed_codec::{facet::FacetGroupKeyCodec, ByteSliceRefCodec}, + search::facet::{ascending_facet_sort, descending_facet_sort}, + FieldId, + Index, + Result, +}; + +// TODO: The implementation of Sort is not correct: +// (1) it should not return documents it has already returned (does the current implementation have the same bug?) +// (2) at the end, it should return all the remaining documents (this could be ensured at the trait level?) + +pub struct Sort<'transaction, Query> { + field_id: Option, + is_ascending: bool, + iter: Option>, +} +impl<'transaction, Query> Sort<'transaction, Query> { + pub fn new( + index: &'transaction Index, + rtxn: &'transaction heed::RoTxn, + field_name: String, + is_ascending: bool, + ) -> Result { + let fields_ids_map = index.fields_ids_map(rtxn)?; + let field_id = fields_ids_map.id(&field_name); + + // TODO: What is this, why? + // let faceted_candidates = match field_id { + // Some(field_id) => { + // let number_faceted = + // index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?; + // let string_faceted = + // index.faceted_documents_ids(rtxn, field_id, FacetType::String)?; + // number_faceted | string_faceted + // } + // None => RoaringBitmap::default(), + // }; + + Ok(Self { field_id, is_ascending, iter: None }) + } +} + +impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query> + for Sort<'transaction, Query> +{ + fn start_iteration( + &mut self, + index: &Index, + txn: &'transaction RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, + parent_candidates: &RoaringBitmap, + parent_query_graph: &Query, + ) -> Result<()> { + let iter: RankingRuleOutputIterWrapper = match self.field_id { + Some(field_id) => { + let make_iter = + if self.is_ascending { ascending_facet_sort } else { descending_facet_sort }; + + let number_iter = make_iter( + txn, + index + .facet_id_f64_docids + .remap_key_type::>(), + field_id, + parent_candidates.clone(), + )?; + + let string_iter = make_iter( + txn, + index + .facet_id_string_docids + .remap_key_type::>(), + field_id, + parent_candidates.clone(), + )?; + let query_graph = parent_query_graph.clone(); + RankingRuleOutputIterWrapper::new(Box::new(number_iter.chain(string_iter).map( + move |docids| { + Ok(RankingRuleOutput { query: query_graph.clone(), candidates: docids? 
}) + }, + ))) + } + None => RankingRuleOutputIterWrapper::new(Box::new(std::iter::empty())), + }; + self.iter = Some(iter); + Ok(()) + } + + fn next_bucket( + &mut self, + _index: &Index, + _txn: &'transaction RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, + _universe: &RoaringBitmap, + ) -> Result>> { + let iter = self.iter.as_mut().unwrap(); + // TODO: we should make use of the universe in the function below + iter.next_bucket() + } + + fn end_iteration( + &mut self, + _index: &Index, + _txn: &'transaction RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, + ) { + self.iter = None; + } +} From 66d0c636944e7c260f34ca023e80c526c05ab104 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 12:33:32 +0100 Subject: [PATCH 018/234] Add some documentation and use bitmaps instead of hashmaps when possible --- milli/src/search/new/query_graph.rs | 100 ++++++----- .../search/new/ranking_rule_graph/build.rs | 30 ++-- .../new/ranking_rule_graph/cheapest_paths.rs | 80 +++++---- .../ranking_rule_graph/edge_docids_cache.rs | 6 + .../src/search/new/ranking_rule_graph/mod.rs | 167 ++++++++++-------- .../new/ranking_rule_graph/paths_map.rs | 6 +- .../new/ranking_rule_graph/proximity/build.rs | 10 +- .../new/ranking_rule_graph/proximity/mod.rs | 4 +- milli/src/search/new/ranking_rules.rs | 100 ++++++----- milli/src/search/new/resolve_query_graph.rs | 27 +-- 10 files changed, 298 insertions(+), 232 deletions(-) diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 726a1460c..821c1a226 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,7 +1,8 @@ -use std::collections::HashSet; use std::fmt::Debug; +use std::{collections::HashSet, fmt}; use heed::RoTxn; +use roaring::RoaringBitmap; use super::{ db_cache::DatabaseCache, @@ -19,21 +20,31 @@ pub enum QueryNode { #[derive(Debug, Clone)] pub struct Edges { - pub incoming: HashSet, - pub outgoing: HashSet, + // TODO: use a tiny bitset instead + // something like a simple Vec where most queries will see a vector of one element + pub predecessors: RoaringBitmap, + pub successors: RoaringBitmap, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct NodeIndex(pub u32); +impl fmt::Display for NodeIndex { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } } #[derive(Debug, Clone)] pub struct QueryGraph { - pub root_node: usize, - pub end_node: usize, + pub root_node: NodeIndex, + pub end_node: NodeIndex, pub nodes: Vec, pub edges: Vec, } fn _assert_sizes() { let _: [u8; 112] = [0; std::mem::size_of::()]; - let _: [u8; 96] = [0; std::mem::size_of::()]; + let _: [u8; 48] = [0; std::mem::size_of::()]; } impl Default for QueryGraph { @@ -41,32 +52,32 @@ impl Default for QueryGraph { fn default() -> Self { let nodes = vec![QueryNode::Start, QueryNode::End]; let edges = vec![ - Edges { incoming: HashSet::new(), outgoing: HashSet::new() }, - Edges { incoming: HashSet::new(), outgoing: HashSet::new() }, + Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }, + Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }, ]; - Self { root_node: 0, end_node: 1, nodes, edges } + Self { root_node: NodeIndex(0), end_node: NodeIndex(1), nodes, edges } } } impl QueryGraph { - fn connect_to_node(&mut self, from_nodes: &[usize], end_node: usize) { + fn connect_to_node(&mut self, from_nodes: &[NodeIndex], to_node: NodeIndex) { for &from_node in from_nodes { - 
self.edges[from_node].outgoing.insert(end_node); - self.edges[end_node].incoming.insert(from_node); + self.edges[from_node.0 as usize].successors.insert(to_node.0); + self.edges[to_node.0 as usize].predecessors.insert(from_node.0); } } - fn add_node(&mut self, from_nodes: &[usize], node: QueryNode) -> usize { - let new_node_idx = self.nodes.len(); + fn add_node(&mut self, from_nodes: &[NodeIndex], node: QueryNode) -> NodeIndex { + let new_node_idx = self.nodes.len() as u32; self.nodes.push(node); self.edges.push(Edges { - incoming: from_nodes.iter().copied().collect(), - outgoing: HashSet::new(), + predecessors: from_nodes.iter().map(|x| x.0).collect(), + successors: RoaringBitmap::new(), }); for from_node in from_nodes { - self.edges[*from_node].outgoing.insert(new_node_idx); + self.edges[from_node.0 as usize].successors.insert(new_node_idx); } - new_node_idx + NodeIndex(new_node_idx) } } @@ -88,7 +99,7 @@ impl QueryGraph { let word_set = index.words_fst(txn)?; let mut graph = QueryGraph::default(); - let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = + let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![graph.root_node]); // TODO: add all the word derivations found in the fst @@ -162,38 +173,41 @@ impl QueryGraph { Ok(graph) } - pub fn remove_nodes(&mut self, nodes: &[usize]) { + pub fn remove_nodes(&mut self, nodes: &[NodeIndex]) { for &node in nodes { - self.nodes[node] = QueryNode::Deleted; - let edges = self.edges[node].clone(); - for &pred in edges.incoming.iter() { - self.edges[pred].outgoing.remove(&node); + self.nodes[node.0 as usize] = QueryNode::Deleted; + let edges = self.edges[node.0 as usize].clone(); + for pred in edges.predecessors.iter() { + self.edges[pred as usize].successors.remove(node.0); } - for succ in edges.outgoing { - self.edges[succ].incoming.remove(&node); + for succ in edges.successors { + self.edges[succ as usize].predecessors.remove(node.0); } - self.edges[node] = Edges { incoming: HashSet::new(), outgoing: HashSet::new() }; + self.edges[node.0 as usize] = + Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }; } } - pub fn remove_nodes_keep_edges(&mut self, nodes: &[usize]) { + pub fn remove_nodes_keep_edges(&mut self, nodes: &[NodeIndex]) { for &node in nodes { - self.nodes[node] = QueryNode::Deleted; - let edges = self.edges[node].clone(); - for &pred in edges.incoming.iter() { - self.edges[pred].outgoing.remove(&node); - self.edges[pred].outgoing.extend(edges.outgoing.iter()); + self.nodes[node.0 as usize] = QueryNode::Deleted; + let edges = self.edges[node.0 as usize].clone(); + for pred in edges.predecessors.iter() { + self.edges[pred as usize].successors.remove(node.0); + self.edges[pred as usize].successors |= &edges.successors; } - for succ in edges.outgoing { - self.edges[succ].incoming.remove(&node); - self.edges[succ].incoming.extend(edges.incoming.iter()); + for succ in edges.successors { + self.edges[succ as usize].predecessors.remove(node.0); + self.edges[succ as usize].predecessors |= &edges.predecessors; } - self.edges[node] = Edges { incoming: HashSet::new(), outgoing: HashSet::new() }; + self.edges[node.0 as usize] = + Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }; } } pub fn remove_words_at_position(&mut self, position: i8) { let mut nodes_to_remove_keeping_edges = vec![]; let mut nodes_to_remove = vec![]; for (node_idx, node) in self.nodes.iter().enumerate() { + let node_idx = NodeIndex(node_idx as u32); let QueryNode::Term(LocatedQueryTerm { 
value: _, positions }) = node else { continue }; if positions.contains(&position) { nodes_to_remove_keeping_edges.push(node_idx) @@ -213,11 +227,11 @@ impl QueryGraph { let mut nodes_to_remove = vec![]; for (node_idx, node) in self.nodes.iter().enumerate() { if (!matches!(node, QueryNode::End | QueryNode::Deleted) - && self.edges[node_idx].outgoing.is_empty()) + && self.edges[node_idx].successors.is_empty()) || (!matches!(node, QueryNode::Start | QueryNode::Deleted) - && self.edges[node_idx].incoming.is_empty()) + && self.edges[node_idx].predecessors.is_empty()) { - nodes_to_remove.push(node_idx); + nodes_to_remove.push(NodeIndex(node_idx as u32)); } } if nodes_to_remove.is_empty() { @@ -301,14 +315,14 @@ node [shape = "record"] continue; } desc.push_str(&format!("{node} [label = {:?}]", &self.nodes[node],)); - if node == self.root_node { + if node == self.root_node.0 as usize { desc.push_str("[color = blue]"); - } else if node == self.end_node { + } else if node == self.end_node.0 as usize { desc.push_str("[color = red]"); } desc.push_str(";\n"); - for edge in self.edges[node].outgoing.iter() { + for edge in self.edges[node].successors.iter() { desc.push_str(&format!("{node} -> {edge};\n")); } // for edge in self.edges[node].incoming.iter() { diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 605fe82d1..45dda3c1f 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,10 +1,11 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use heed::RoTxn; +use roaring::RoaringBitmap; use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; -use crate::new::QueryGraph; +use crate::new::{NodeIndex, QueryGraph}; use crate::{Index, Result}; impl RankingRuleGraph { @@ -14,29 +15,38 @@ impl RankingRuleGraph { db_cache: &mut DatabaseCache<'transaction>, query_graph: QueryGraph, ) -> Result { - let mut ranking_rule_graph = Self { query_graph, all_edges: vec![], node_edges: vec![] }; + let mut ranking_rule_graph = + Self { query_graph, all_edges: vec![], node_edges: vec![], successors: vec![] }; for (node_idx, node) in ranking_rule_graph.query_graph.nodes.iter().enumerate() { - ranking_rule_graph.node_edges.push(BTreeSet::new()); + ranking_rule_graph.node_edges.push(RoaringBitmap::new()); + ranking_rule_graph.successors.push(RoaringBitmap::new()); let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap(); + let new_successors = ranking_rule_graph.successors.last_mut().unwrap(); let Some(from_node_data) = G::build_visit_from_node(index, txn, db_cache, node)? else { continue }; - for &successor_idx in ranking_rule_graph.query_graph.edges[node_idx].outgoing.iter() { - let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx]; - let Some(edges) = G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)? 
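// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series: the shape of the
// two-phase edge construction used above, with the milli-specific types
// stripped away. For each source node some data is computed once
// (`visit_from`), then reused to price the edges towards every successor
// (`visit_to`). All names here are hypothetical.
trait EdgeBuilder {
    type FromData;
    fn visit_from(&self, node: u32) -> Option<Self::FromData>;
    /// Returns (cost, label) pairs; may be empty.
    fn visit_to(&self, from_data: &Self::FromData, to: u32) -> Vec<(u8, String)>;
}

fn build_edges<B: EdgeBuilder>(
    builder: &B,
    nodes: &[u32],
    successors: &[Vec<u32>],
) -> Vec<(u32, u32, u8, String)> {
    let mut all_edges = Vec::new();
    for &node in nodes {
        let Some(from_data) = builder.visit_from(node) else { continue };
        for &succ in &successors[node as usize] {
            let mut edges = builder.visit_to(&from_data, succ);
            // Sorting by cost is what lets a later `cheapest_edge` lookup
            // assume the first matching edge is the cheapest one.
            edges.sort_by_key(|e| e.0);
            for (cost, label) in edges {
                all_edges.push((node, succ, cost, label));
            }
        }
    }
    all_edges
}
// ---------------------------------------------------------------------------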
else { continue }; + for successor_idx in ranking_rule_graph.query_graph.edges[node_idx].successors.iter() { + let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx as usize]; + let mut edges = + G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)?; + if edges.is_empty() { + continue; + } + edges.sort_by_key(|e| e.0); for (cost, details) in edges { ranking_rule_graph.all_edges.push(Some(Edge { - from_node: node_idx, - to_node: successor_idx, + from_node: NodeIndex(node_idx as u32), + to_node: NodeIndex(successor_idx), cost, details, })); - new_edges.insert(ranking_rule_graph.all_edges.len() - 1); + new_edges.insert(ranking_rule_graph.all_edges.len() as u32 - 1); + new_successors.insert(successor_idx); } } } - ranking_rule_graph.simplify(); + // ranking_rule_graph.simplify(); Ok(ranking_rule_graph) } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 3bd43fd6f..f1c1035a3 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -1,6 +1,9 @@ use std::collections::{BTreeMap, HashSet}; use itertools::Itertools; +use roaring::RoaringBitmap; + +use crate::new::NodeIndex; use super::{ empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, EdgeIndex, RankingRuleGraph, @@ -14,18 +17,11 @@ pub struct Path { } struct DijkstraState { - unvisited: HashSet, // should be a small bitset - distances: Vec, // or binary heap (f64, usize) + unvisited: RoaringBitmap, // should be a small bitset? + distances: Vec, // or binary heap, or btreemap? (f64, usize) edges: Vec, edge_costs: Vec, - paths: Vec>, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct PathEdgeId { - pub from: usize, - pub to: usize, - pub id: Id, + paths: Vec>, } pub struct KCheapestPathsState { @@ -127,9 +123,10 @@ impl KCheapestPathsState { // for all the paths already found that share a common prefix with the root path // we delete the edge from the spur node to the next one for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) { - let was_removed = graph.node_edges[*spur_node].remove(&edge_index_to_remove.0); + let was_removed = + graph.node_edges[spur_node.0 as usize].remove(edge_index_to_remove.0 as u32); if was_removed { - tmp_removed_edges.push(edge_index_to_remove.0); + tmp_removed_edges.push(edge_index_to_remove.0 as u32); } } @@ -137,7 +134,7 @@ impl KCheapestPathsState { // we will combine it with the root path to get a potential kth cheapest path let spur_path = graph.cheapest_path_to_end(*spur_node); // restore the temporarily removed edges - graph.node_edges[*spur_node].extend(tmp_removed_edges); + graph.node_edges[spur_node.0 as usize].extend(tmp_removed_edges); let Some(spur_path) = spur_path else { continue; }; let total_cost = root_cost + spur_path.cost; @@ -182,68 +179,73 @@ impl KCheapestPathsState { } impl RankingRuleGraph { - fn cheapest_path_to_end(&self, from: usize) -> Option { + fn cheapest_path_to_end(&self, from: NodeIndex) -> Option { let mut dijkstra = DijkstraState { - unvisited: (0..self.query_graph.nodes.len()).collect(), + unvisited: (0..self.query_graph.nodes.len() as u32).collect(), distances: vec![u64::MAX; self.query_graph.nodes.len()], edges: vec![EdgeIndex(usize::MAX); self.query_graph.nodes.len()], edge_costs: vec![u8::MAX; self.query_graph.nodes.len()], paths: vec![None; self.query_graph.nodes.len()], }; - dijkstra.distances[from] = 0; + 
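// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series: the Dijkstra loop above,
// reduced to its essentials. The unvisited set is a RoaringBitmap, and the
// next node to settle is the unvisited node with the smallest tentative
// distance -- found by a linear scan here, which is what the TODO about a
// binary heap / btreemap refers to. The flat edge list is a hypothetical
// stand-in for the graph's real edge storage.
use roaring::RoaringBitmap;

fn shortest_distance(n: u32, edges: &[(u32, u32, u64)], from: u32, to: u32) -> Option<u64> {
    let mut unvisited: RoaringBitmap = (0..n).collect();
    let mut dist = vec![u64::MAX; n as usize];
    dist[from as usize] = 0;
    while let Some(cur) = unvisited.iter().min_by_key(|&node| dist[node as usize]) {
        if dist[cur as usize] == u64::MAX {
            return None; // every remaining node is unreachable
        }
        if cur == to {
            return Some(dist[cur as usize]);
        }
        // Relax the edges towards the still-unvisited successors.
        for &(a, b, cost) in edges {
            if a == cur && unvisited.contains(b) {
                let candidate = dist[cur as usize] + cost;
                if candidate < dist[b as usize] {
                    dist[b as usize] = candidate;
                }
            }
        }
        unvisited.remove(cur);
    }
    None
}
// ---------------------------------------------------------------------------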
dijkstra.distances[from.0 as usize] = 0; - // TODO: could use a binary heap here to store the distances - while let Some(&cur_node) = - dijkstra.unvisited.iter().min_by_key(|&&n| dijkstra.distances[n]) + // TODO: could use a binary heap here to store the distances, or a btreemap + while let Some(cur_node) = + dijkstra.unvisited.iter().min_by_key(|&n| dijkstra.distances[n as usize]) { - let cur_node_dist = dijkstra.distances[cur_node]; + let cur_node_dist = dijkstra.distances[cur_node as usize]; if cur_node_dist == u64::MAX { return None; } - if cur_node == self.query_graph.end_node { + if cur_node == self.query_graph.end_node.0 { break; } - let succ_cur_node: HashSet<_> = self.node_edges[cur_node] - .iter() - .map(|e| self.all_edges[*e].as_ref().unwrap().to_node) - .collect(); + // this is expensive, but shouldn't + // ideally I could quickly get a bitmap of all a node's successors + // then take the intersection with unvisited + let succ_cur_node: &RoaringBitmap = &self.successors[cur_node as usize]; + // .iter() + // .map(|e| self.all_edges[e as usize].as_ref().unwrap().to_node.0) + // .collect(); // TODO: this intersection may be slow but shouldn't be, // can use a bitmap intersection instead - let unvisited_succ_cur_node = succ_cur_node.intersection(&dijkstra.unvisited); - for &succ in unvisited_succ_cur_node { - let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(cur_node, succ) else { + let unvisited_succ_cur_node = succ_cur_node & &dijkstra.unvisited; + for succ in unvisited_succ_cur_node { + // cheapest_edge() is also potentially too expensive + let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(NodeIndex(cur_node), NodeIndex(succ)) else { continue }; // println!("cur node dist {cur_node_dist}"); - let old_dist_succ = &mut dijkstra.distances[succ]; + let old_dist_succ = &mut dijkstra.distances[succ as usize]; let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64; if new_potential_distance < *old_dist_succ { *old_dist_succ = new_potential_distance; - dijkstra.edges[succ] = cheapest_edge; - dijkstra.edge_costs[succ] = cheapest_edge_cost; - dijkstra.paths[succ] = Some(cur_node); + dijkstra.edges[succ as usize] = cheapest_edge; + dijkstra.edge_costs[succ as usize] = cheapest_edge_cost; + dijkstra.paths[succ as usize] = Some(NodeIndex(cur_node)); } } - dijkstra.unvisited.remove(&cur_node); + dijkstra.unvisited.remove(cur_node); } let mut cur = self.query_graph.end_node; // let mut edge_costs = vec![]; // let mut distances = vec![]; let mut path_edges = vec![]; - while let Some(n) = dijkstra.paths[cur] { - path_edges.push(dijkstra.edges[cur]); + while let Some(n) = dijkstra.paths[cur.0 as usize] { + path_edges.push(dijkstra.edges[cur.0 as usize]); cur = n; } path_edges.reverse(); - Some(Path { edges: path_edges, cost: dijkstra.distances[self.query_graph.end_node] }) + Some(Path { + edges: path_edges, + cost: dijkstra.distances[self.query_graph.end_node.0 as usize], + }) } - // TODO: this implementation is VERY fragile, as we assume that the edges are ordered by cost - // already. Change it. 
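// ---------------------------------------------------------------------------
// Illustrative note, not part of the patch series: `KCheapestPathsState`
// above is structured like Yen's k-shortest-paths algorithm. One iteration,
// sketched over the names used in this file:
//
//   for each prefix `root_path` of the (k-1)-th cheapest path, ending at a
//   node `spur_node`:
//       if `empty_paths_cache` already proved `root_path` matches nothing,
//           skip it;
//       temporarily remove from `node_edges[spur_node]` every edge that a
//           previously found path takes after this same prefix
//           (`edge_indices_after_prefix`);
//       compute `spur_path`, the cheapest path from `spur_node` to the end
//           node (`cheapest_path_to_end`, i.e. the Dijkstra above);
//       restore the removed edges;
//       record `root_path + spur_path`, costed as the sum of both parts, in
//           the candidate map (`cheapest_paths`).
//
//   The k-th cheapest path is then the pending candidate of minimal cost.
// ---------------------------------------------------------------------------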
- pub fn cheapest_edge(&self, cur_node: usize, succ: usize) -> Option<(EdgeIndex, u8)> { + pub fn cheapest_edge(&self, cur_node: NodeIndex, succ: NodeIndex) -> Option<(EdgeIndex, u8)> { self.visit_edges(cur_node, succ, |edge_idx, edge| { std::ops::ControlFlow::Break((edge_idx, edge.cost)) }) diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 301810847..0c9768f04 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -9,6 +9,12 @@ use crate::new::db_cache::DatabaseCache; use crate::new::BitmapOrAllRef; use crate::{Index, Result}; +// TODO: the cache should have a G::EdgeDetails as key +// but then it means that we should have a quick way of +// computing their hash and comparing them +// which can be done... +// by using a pointer (real, Rc, bumpalo, or in a vector)??? + pub struct EdgeDocidsCache { pub cache: HashMap, diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 12f397df3..f7a312240 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -13,7 +13,7 @@ use heed::RoTxn; use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; -use super::{QueryGraph, QueryNode}; +use super::{NodeIndex, QueryGraph, QueryNode}; use crate::{Index, Result}; #[derive(Debug, Clone)] @@ -24,8 +24,8 @@ pub enum EdgeDetails { #[derive(Debug, Clone)] pub struct Edge { - from_node: usize, - to_node: usize, + from_node: NodeIndex, + to_node: NodeIndex, cost: u8, details: EdgeDetails, } @@ -38,22 +38,20 @@ pub struct EdgePointer<'graph, E> { #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct EdgeIndex(pub usize); -// { -// // TODO: they could all be u16 instead -// // There may be a way to store all the edge indices in a u32 as well, -// // if the edges are in a vector -// // then we can store sets of edges in a bitmap efficiently -// pub from: usize, -// pub to: usize, -// pub edge_idx: usize, -// } pub trait RankingRuleGraphTrait { + /// The details of an edge connecting two query nodes. These details + /// should be sufficient to compute the edge's cost and associated document ids + /// in [`compute_docids`](RankingRuleGraphTrait). type EdgeDetails: Sized; + type BuildVisitedFromNode; - fn edge_details_dot_label(edge: &Self::EdgeDetails) -> String; + /// Return the label of the given edge details, to be used when visualising + /// the ranking rule graph using GraphViz. + fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String; + /// Compute the document ids associated with the given edge. fn compute_docids<'transaction>( index: &Index, txn: &'transaction RoTxn, @@ -61,6 +59,10 @@ pub trait RankingRuleGraphTrait { edge_details: &Self::EdgeDetails, ) -> Result; + /// Prepare to build the edges outgoing from `from_node`. + /// + /// This call is followed by zero, one or more calls to [`build_visit_to_node`](RankingRuleGraphTrait::build_visit_to_node), + /// which builds the actual edges. fn build_visit_from_node<'transaction>( index: &Index, txn: &'transaction RoTxn, @@ -68,39 +70,59 @@ pub trait RankingRuleGraphTrait { from_node: &QueryNode, ) -> Result>; + /// Return the cost and details of the edges going from the previously visited node + /// (with [`build_visit_from_node`](RankingRuleGraphTrait::build_visit_from_node)) to `to_node`. 
fn build_visit_to_node<'from_data, 'transaction: 'from_data>( index: &Index, txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, - ) -> Result)>>>; + ) -> Result)>>; } pub struct RankingRuleGraph { pub query_graph: QueryGraph, // pub edges: Vec>>>, pub all_edges: Vec>>, - pub node_edges: Vec>, + + pub node_edges: Vec, + + pub successors: Vec, + // to get the edges between two nodes: + // 1. get node_outgoing_edges[from] + // 2. get node_incoming_edges[to] + // 3. take intersection betweem the two + + // TODO: node edges could be different I guess + // something like: + // pub node_edges: Vec + // where each index is the result of: + // the successor index in the top 16 bits, the edge index in the bottom 16 bits + + // TODO: + // node_successors? + // pub removed_edges: HashSet, // pub tmp_removed_edges: HashSet, } impl RankingRuleGraph { - // NOTE: returns the edge even if it was removed pub fn get_edge(&self, edge_index: EdgeIndex) -> &Option> { &self.all_edges[edge_index.0] } + + // Visit all edges between the two given nodes in order of increasing cost. pub fn visit_edges<'graph, O>( &'graph self, - from: usize, - to: usize, + from: NodeIndex, + to: NodeIndex, mut visit: impl FnMut(EdgeIndex, &'graph Edge) -> ControlFlow, ) -> Option { - let from_edges = &self.node_edges[from]; - for &edge_idx in from_edges { - let edge = self.all_edges[edge_idx].as_ref().unwrap(); + let from_edges = &self.node_edges[from.0 as usize]; + for edge_idx in from_edges { + let edge = self.all_edges[edge_idx as usize].as_ref().unwrap(); if edge.to_node == to { - let cf = visit(EdgeIndex(edge_idx), edge); + let cf = visit(EdgeIndex(edge_idx as usize), edge); match cf { ControlFlow::Continue(_) => continue, ControlFlow::Break(o) => return Some(o), @@ -113,54 +135,61 @@ impl RankingRuleGraph { fn remove_edge(&mut self, edge_index: EdgeIndex) { let edge_opt = &mut self.all_edges[edge_index.0]; - let Some(Edge { from_node, to_node, cost, details }) = &edge_opt else { return }; - - let node_edges = &mut self.node_edges[*from_node]; - node_edges.remove(&edge_index.0); - + let Some(edge) = &edge_opt else { return }; + let (from_node, to_node) = (edge.from_node, edge.to_node); *edge_opt = None; - } - pub fn remove_nodes(&mut self, nodes: &[usize]) { - for &node in nodes { - let edge_indices = &mut self.node_edges[node]; - for edge_index in edge_indices.iter() { - self.all_edges[*edge_index] = None; - } - edge_indices.clear(); - let preds = &self.query_graph.edges[node].incoming; - for pred in preds { - let edge_indices = &mut self.node_edges[*pred]; - for edge_index in edge_indices.iter() { - let edge_opt = &mut self.all_edges[*edge_index]; - let Some(edge) = edge_opt else { continue; }; - if edge.to_node == node { - *edge_opt = None; - } - } - panic!("remove nodes is incorrect at the moment"); - edge_indices.clear(); - } - } - self.query_graph.remove_nodes(nodes); - } - pub fn simplify(&mut self) { - loop { - let mut nodes_to_remove = vec![]; - for (node_idx, node) in self.query_graph.nodes.iter().enumerate() { - if !matches!(node, QueryNode::End | QueryNode::Deleted) - && self.node_edges[node_idx].is_empty() - { - nodes_to_remove.push(node_idx); - } - } - if nodes_to_remove.is_empty() { - break; - } else { - self.remove_nodes(&nodes_to_remove); - } + let from_node_edges = &mut self.node_edges[from_node.0 as usize]; + from_node_edges.remove(edge_index.0 as u32); + + let mut new_successors_from_node = RoaringBitmap::new(); + 
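// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series: the invariant maintained
// by `remove_edge` here is that `successors[n]` always equals the set of
// `to_node`s of the remaining live edges out of `n`. A tiny recomputation
// helper in the same shape, with hypothetical types:
use roaring::RoaringBitmap;

fn recompute_successors(
    node_edges: &RoaringBitmap,           // ids of the edges leaving the node
    all_edges: &[Option<(u32, u32, u8)>], // (from, to, cost); None if removed
) -> RoaringBitmap {
    let mut successors = RoaringBitmap::new();
    for edge_id in node_edges.iter() {
        if let Some((_, to, _)) = all_edges[edge_id as usize] {
            successors.insert(to);
        }
    }
    successors
}
// ---------------------------------------------------------------------------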
for edge in from_node_edges.iter() { + let Edge { to_node, .. } = &self.all_edges[edge as usize].as_ref().unwrap(); + new_successors_from_node.insert(to_node.0); } + self.successors[from_node.0 as usize] = new_successors_from_node; } + // pub fn remove_nodes(&mut self, nodes: &[usize]) { + // for &node in nodes { + // let edge_indices = &mut self.node_edges[node]; + // for edge_index in edge_indices.iter() { + // self.all_edges[*edge_index] = None; + // } + // edge_indices.clear(); + + // let preds = &self.query_graph.edges[node].incoming; + // for pred in preds { + // let edge_indices = &mut self.node_edges[*pred]; + // for edge_index in edge_indices.iter() { + // let edge_opt = &mut self.all_edges[*edge_index]; + // let Some(edge) = edge_opt else { continue; }; + // if edge.to_node == node { + // *edge_opt = None; + // } + // } + // panic!("remove nodes is incorrect at the moment"); + // edge_indices.clear(); + // } + // } + // self.query_graph.remove_nodes(nodes); + // } + // pub fn simplify(&mut self) { + // loop { + // let mut nodes_to_remove = vec![]; + // for (node_idx, node) in self.query_graph.nodes.iter().enumerate() { + // if !matches!(node, QueryNode::End | QueryNode::Deleted) + // && self.node_edges[node_idx].is_empty() + // { + // nodes_to_remove.push(node_idx); + // } + // } + // if nodes_to_remove.is_empty() { + // break; + // } else { + // self.remove_nodes(&nodes_to_remove); + // } + // } + // } // fn is_removed_edge(&self, edge: EdgeIndex) -> bool { // self.removed_edges.contains(&edge) || self.tmp_removed_edges.contains(&edge) // } @@ -174,9 +203,9 @@ impl RankingRuleGraph { continue; } desc.push_str(&format!("{node_idx} [label = {:?}]", node)); - if node_idx == self.query_graph.root_node { + if node_idx == self.query_graph.root_node.0 as usize { desc.push_str("[color = blue]"); - } else if node_idx == self.query_graph.end_node { + } else if node_idx == self.query_graph.end_node.0 as usize { desc.push_str("[color = red]"); } desc.push_str(";\n"); @@ -195,7 +224,7 @@ impl RankingRuleGraph { desc.push_str(&format!( "{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\"];\n", cost = edge.cost, - edge_label = G::edge_details_dot_label(details) + edge_label = G::graphviz_edge_details_label(details) )); } } diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 589a1a52f..b1e4bb451 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -235,9 +235,9 @@ impl RankingRuleGraph { continue; } desc.push_str(&format!("{node_idx} [label = {:?}]", node)); - if node_idx == self.query_graph.root_node { + if node_idx == self.query_graph.root_node.0 as usize { desc.push_str("[color = blue]"); - } else if node_idx == self.query_graph.end_node { + } else if node_idx == self.query_graph.end_node.0 as usize { desc.push_str("[color = red]"); } desc.push_str(";\n"); @@ -262,7 +262,7 @@ impl RankingRuleGraph { desc.push_str(&format!( "{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n", cost = edge.cost, - edge_label = G::edge_details_dot_label(details), + edge_label = G::graphviz_edge_details_label(details), )); } } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 07ec3bb5e..7149f8bf6 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ 
-51,11 +51,11 @@ pub fn visit_to_node<'transaction, 'from_data>( db_cache: &mut DatabaseCache<'transaction>, to_node: &QueryNode, from_node_data: &'from_data (WordDerivations, i8), -) -> Result)>>> { +) -> Result)>> { let (derivations1, pos1) = from_node_data; let term2 = match &to_node { - QueryNode::End => return Ok(Some(vec![(0, EdgeDetails::Unconditional)])), - QueryNode::Deleted | QueryNode::Start => return Ok(None), + QueryNode::End => return Ok(vec![(0, EdgeDetails::Unconditional)]), + QueryNode::Deleted | QueryNode::Start => return Ok(vec![]), QueryNode::Term(term) => term, }; let LocatedQueryTerm { value: value2, positions: pos2 } = term2; @@ -86,7 +86,7 @@ pub fn visit_to_node<'transaction, 'from_data>( // We want to effectively ignore this pair of terms // Unconditionally walk through the edge without computing the docids // But also what should the cost be? - return Ok(Some(vec![(0, EdgeDetails::Unconditional)])); + return Ok(vec![(0, EdgeDetails::Unconditional)]); } let updb1 = derivations1.use_prefix_db; @@ -161,5 +161,5 @@ pub fn visit_to_node<'transaction, 'from_data>( }) .collect::>(); new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeDetails::Unconditional)); - Ok(Some(new_edges)) + Ok(new_edges) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 199a5eb4a..e4905ead9 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -26,7 +26,7 @@ impl RankingRuleGraphTrait for ProximityGraph { type EdgeDetails = ProximityEdge; type BuildVisitedFromNode = (WordDerivations, i8); - fn edge_details_dot_label(edge: &Self::EdgeDetails) -> String { + fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String { let ProximityEdge { pairs, proximity } = edge; format!(", prox {proximity}, {} pairs", pairs.len()) } @@ -55,7 +55,7 @@ impl RankingRuleGraphTrait for ProximityGraph { db_cache: &mut DatabaseCache<'transaction>, to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, - ) -> Result)>>> { + ) -> Result)>> { build::visit_to_node(index, txn, db_cache, to_node, from_node_data) } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index ce883ad6a..b980c1dc4 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -36,15 +36,17 @@ impl<'transaction, Query> RankingRuleOutputIter<'transaction, Query> } pub trait RankingRuleQueryTrait: Sized + Clone + 'static {} + #[derive(Clone)] pub struct PlaceholderQuery; impl RankingRuleQueryTrait for PlaceholderQuery {} impl RankingRuleQueryTrait for QueryGraph {} pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { - // TODO: add an update_candidates function to deal with distinct - // attributes? - + /// Prepare the ranking rule such that it can start iterating over its + /// buckets using [`next_bucket`](RankingRule::next_bucket). + /// + /// The given universe is the universe that will be given to [`next_bucket`](RankingRule::next_bucket). fn start_iteration( &mut self, index: &Index, @@ -54,6 +56,13 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { query: &Query, ) -> Result<()>; + /// Return the next bucket of this ranking rule. + /// + /// The returned candidates MUST be a subset of the given universe. 
+ /// + /// The universe given as argument is either: + /// - a subset of the universe given to the previous call to [`next_bucket`](RankingRule::next_bucket); OR + /// - the universe given to [`start_iteration`](RankingRule::start_iteration) fn next_bucket( &mut self, index: &Index, @@ -62,6 +71,8 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { universe: &RoaringBitmap, ) -> Result>>; + /// Finish iterating over the buckets, which yields control to the parent ranking rule + /// The next call to this ranking rule, if any, will be [`start_iteration`](RankingRule::start_iteration). fn end_iteration( &mut self, index: &Index, @@ -72,7 +83,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { #[derive(Debug)] pub struct RankingRuleOutput { - /// The query tree that must be used by the child ranking rule to fetch candidates. + /// The query corresponding to the current bucket for the child ranking rule pub query: Q, /// The allowed candidates for the child ranking rule pub candidates: RoaringBitmap, @@ -151,7 +162,6 @@ pub fn execute_search<'transaction>( let ranking_rules_len = ranking_rules.len(); ranking_rules[0].start_iteration(index, txn, db_cache, universe, query_graph)?; - // TODO: parent_candidates could be used only during debugging? let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len]; candidates[0] = universe.clone(); @@ -296,43 +306,43 @@ mod tests { let primary_key = index.primary_key(&txn).unwrap().unwrap(); let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); - // loop { - // let start = Instant::now(); + loop { + let start = Instant::now(); - // let mut db_cache = DatabaseCache::default(); + let mut db_cache = DatabaseCache::default(); - // let query_graph = make_query_graph( - // &index, - // &txn, - // &mut db_cache, - // "released from prison by the government", - // ) - // .unwrap(); - // // println!("{}", query_graph.graphviz()); + let query_graph = make_query_graph( + &index, + &txn, + &mut db_cache, + "released from prison by the government", + ) + .unwrap(); + // println!("{}", query_graph.graphviz()); - // // TODO: filters + maybe distinct attributes? - // let universe = get_start_universe( - // &index, - // &txn, - // &mut db_cache, - // &query_graph, - // TermsMatchingStrategy::Last, - // ) - // .unwrap(); - // // println!("universe: {universe:?}"); + // TODO: filters + maybe distinct attributes? 
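// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series: the control flow that
// the start_iteration / next_bucket / end_iteration contract documented
// above enables, with milli's types replaced by a toy trait. The driver
// descends into the child rule while a bucket is still too coarse, and
// yields control back to the parent when a rule is exhausted. Hypothetical
// names throughout.
trait Rule {
    fn start(&mut self, universe: &[u32]);
    fn next_bucket(&mut self, universe: &[u32]) -> Option<Vec<u32>>;
    fn end(&mut self);
}

fn bucket_sort(rules: &mut [Box<dyn Rule>], universe: Vec<u32>, limit: usize) -> Vec<u32> {
    let mut results = Vec::new();
    let mut universes = vec![universe];
    let mut cur = 0usize;
    rules[cur].start(&universes[0]);
    loop {
        match rules[cur].next_bucket(universes.last().unwrap()) {
            // A bucket with several candidates: refine it with the child rule.
            Some(bucket) if cur + 1 < rules.len() && bucket.len() > 1 => {
                rules[cur + 1].start(&bucket);
                universes.push(bucket);
                cur += 1;
            }
            // Leaf rule, or a singleton bucket: its order is final.
            Some(bucket) => results.extend(bucket),
            None if cur == 0 => break,
            // This rule is exhausted: hand control back to its parent.
            None => {
                rules[cur].end();
                universes.pop();
                cur -= 1;
            }
        }
        if results.len() >= limit {
            break;
        }
    }
    results
}
// ---------------------------------------------------------------------------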
+ let universe = get_start_universe( + &index, + &txn, + &mut db_cache, + &query_graph, + TermsMatchingStrategy::Last, + ) + .unwrap(); + // println!("universe: {universe:?}"); - // let results = execute_search( - // &index, - // &txn, - // &mut db_cache, - // &universe, - // &query_graph, /* 0, 20 */ - // ) - // .unwrap(); + let results = execute_search( + &index, + &txn, + &mut db_cache, + &universe, + &query_graph, /* 0, 20 */ + ) + .unwrap(); - // let elapsed = start.elapsed(); - // println!("{}us: {results:?}", elapsed.as_micros()); - // } + let elapsed = start.elapsed(); + println!("{}us: {results:?}", elapsed.as_micros()); + } let start = Instant::now(); let mut db_cache = DatabaseCache::default(); @@ -388,7 +398,7 @@ mod tests { let mut s = Search::new(&txn, &index); s.query("released from prison by the government"); s.terms_matching_strategy(TermsMatchingStrategy::Last); - // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); + s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); let elapsed = start.elapsed(); @@ -431,7 +441,7 @@ mod tests { builder.execute(|_| (), || false).unwrap(); } - // #[test] + #[test] fn _index_movies() { let mut options = EnvOpenOptions::new(); options.map_size(100 * 1024 * 1024 * 1024); // 100 GB @@ -446,20 +456,14 @@ mod tests { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(primary_key.to_owned()); - let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); builder.set_searchable_fields(searchable_fields); - let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); builder.set_filterable_fields(filterable_fields); - - builder.set_criteria(vec![Criterion::Words]); - - // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); - // builder.set_sortable_fields(sortable_fields); - + builder.set_min_word_len_one_typo(5); + builder.set_min_word_len_two_typos(100); + builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); builder.execute(|_| (), || false).unwrap(); let config = IndexerConfig::default(); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 748524492..8bc56bb23 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -4,11 +4,12 @@ use std::collections::{HashMap, HashSet, VecDeque}; use super::db_cache::DatabaseCache; use super::query_term::{QueryTerm, WordDerivations}; -use super::QueryGraph; +use super::{NodeIndex, QueryGraph}; use crate::{Index, Result, RoaringBitmapCodec}; // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. 
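// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series: the traversal pattern
// `resolve_query_graph` implements, on a toy DAG. A node is processed only
// once every predecessor is resolved; its document set is the union of its
// predecessors' sets intersected with what the node matches on its own.
// Hypothetical names; `matches[root]` stands in for "all documents".
use roaring::RoaringBitmap;
use std::collections::VecDeque;

fn resolve(
    preds: &[RoaringBitmap],   // predecessors of each node
    succs: &[RoaringBitmap],   // successors of each node
    matches: &[RoaringBitmap], // docids matched by each node in isolation
    root: u32,
    end: u32,
) -> RoaringBitmap {
    let mut resolved = RoaringBitmap::new();
    let mut docids = vec![RoaringBitmap::new(); matches.len()];
    let mut queue = VecDeque::new();
    queue.push_front(root);
    while let Some(node) = queue.pop_front() {
        // Defer this node until all of its predecessors are resolved.
        if !preds[node as usize].is_subset(&resolved) {
            queue.push_back(node);
            continue;
        }
        let mut acc = RoaringBitmap::new();
        for p in preds[node as usize].iter() {
            acc |= &docids[p as usize];
        }
        let result = if node == root {
            matches[root as usize].clone()
        } else {
            acc & &matches[node as usize]
        };
        if node == end {
            return result;
        }
        resolved.insert(node);
        docids[node as usize] = result;
        for s in succs[node as usize].iter() {
            if !resolved.contains(s) && !queue.contains(&s) {
                queue.push_back(s);
            }
        }
    }
    RoaringBitmap::new()
}
// ---------------------------------------------------------------------------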
+// TODO: reuse NodeDocidsCache in between calls to resolve_query_graph #[derive(Default)] pub struct NodeDocIdsCache { pub cache: HashMap, @@ -26,7 +27,7 @@ pub fn resolve_query_graph<'transaction>( // resolve_query_graph_rec(index, txn, q, q.root_node, &mut docids, &mut cache)?; - let mut nodes_resolved = HashSet::new(); + let mut nodes_resolved = RoaringBitmap::new(); // TODO: should be given as an argument and kept between invocations of resolve query graph let mut nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()]; @@ -34,16 +35,16 @@ pub fn resolve_query_graph<'transaction>( next_nodes_to_visit.push_front(q.root_node); while let Some(node) = next_nodes_to_visit.pop_front() { - let predecessors = &q.edges[node].incoming; + let predecessors = &q.edges[node.0 as usize].predecessors; if !predecessors.is_subset(&nodes_resolved) { next_nodes_to_visit.push_back(node); continue; } // Take union of all predecessors - let predecessors_iter = predecessors.iter().map(|p| &nodes_docids[*p]); + let predecessors_iter = predecessors.iter().map(|p| &nodes_docids[p as usize]); let predecessors_docids = MultiOps::union(predecessors_iter); - let n = &q.nodes[node]; + let n = &q.nodes[node.0 as usize]; // println!("resolving {node} {n:?}, predecessors: {predecessors:?}, their docids: {predecessors_docids:?}"); let node_docids = match n { super::QueryNode::Term(located_term) => { @@ -95,18 +96,18 @@ pub fn resolve_query_graph<'transaction>( return Ok(predecessors_docids); } }; - nodes_resolved.insert(node); - nodes_docids[node] = node_docids; + nodes_resolved.insert(node.0); + nodes_docids[node.0 as usize] = node_docids; - for &succ in q.edges[node].outgoing.iter() { - if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(&succ) { - next_nodes_to_visit.push_back(succ); + for succ in q.edges[node.0 as usize].successors.iter() { + if !next_nodes_to_visit.contains(&NodeIndex(succ)) && !nodes_resolved.contains(succ) { + next_nodes_to_visit.push_back(NodeIndex(succ)); } } // This is currently slow but could easily be implemented very efficiently - for &prec in q.edges[node].incoming.iter() { - if q.edges[prec].outgoing.is_subset(&nodes_resolved) { - nodes_docids[prec].clear(); + for prec in q.edges[node.0 as usize].predecessors.iter() { + if q.edges[prec as usize].successors.is_subset(&nodes_resolved) { + nodes_docids[prec as usize].clear(); } } // println!("cached docids: {nodes_docids:?}"); From dcf3f1d18a3afad62cc52e99a5c359e4c610f1e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 12:55:44 +0100 Subject: [PATCH 019/234] Remove EdgeIndex and NodeIndex types, prefer u32 instead --- milli/src/search/new/db_cache.rs | 13 ++-- milli/src/search/new/query_graph.rs | 62 ++++++++---------- .../search/new/ranking_rule_graph/build.rs | 8 +-- .../new/ranking_rule_graph/cheapest_paths.rs | 63 +++++++------------ .../ranking_rule_graph/edge_docids_cache.rs | 19 +++--- .../ranking_rule_graph/empty_paths_cache.rs | 11 ++-- .../src/search/new/ranking_rule_graph/mod.rs | 53 +++++++--------- .../new/ranking_rule_graph/paths_map.rs | 52 +++++++-------- .../new/ranking_rule_graph/resolve_paths.rs | 2 +- milli/src/search/new/resolve_query_graph.rs | 21 ++++--- 10 files changed, 139 insertions(+), 165 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 15f9f7873..0a058d339 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -1,17 +1,18 @@ -use std::collections::{hash_map::Entry, 
HashMap}; +use std::collections::hash_map::Entry; +use fxhash::FxHashMap; use heed::{types::ByteSlice, RoTxn}; use crate::{Index, Result}; #[derive(Default)] pub struct DatabaseCache<'transaction> { - pub word_pair_proximity_docids: HashMap<(u8, String, String), Option<&'transaction [u8]>>, + pub word_pair_proximity_docids: FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, pub word_prefix_pair_proximity_docids: - HashMap<(u8, String, String), Option<&'transaction [u8]>>, - pub word_docids: HashMap>, - pub exact_word_docids: HashMap>, - pub word_prefix_docids: HashMap>, + FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, + pub word_docids: FxHashMap>, + pub exact_word_docids: FxHashMap>, + pub word_prefix_docids: FxHashMap>, } impl<'transaction> DatabaseCache<'transaction> { pub fn get_word_docids( diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 821c1a226..cbe319af7 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -26,18 +26,10 @@ pub struct Edges { pub successors: RoaringBitmap, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct NodeIndex(pub u32); -impl fmt::Display for NodeIndex { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(&self.0, f) - } -} - #[derive(Debug, Clone)] pub struct QueryGraph { - pub root_node: NodeIndex, - pub end_node: NodeIndex, + pub root_node: u32, + pub end_node: u32, pub nodes: Vec, pub edges: Vec, } @@ -56,28 +48,28 @@ impl Default for QueryGraph { Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }, ]; - Self { root_node: NodeIndex(0), end_node: NodeIndex(1), nodes, edges } + Self { root_node: 0, end_node: 1, nodes, edges } } } impl QueryGraph { - fn connect_to_node(&mut self, from_nodes: &[NodeIndex], to_node: NodeIndex) { + fn connect_to_node(&mut self, from_nodes: &[u32], to_node: u32) { for &from_node in from_nodes { - self.edges[from_node.0 as usize].successors.insert(to_node.0); - self.edges[to_node.0 as usize].predecessors.insert(from_node.0); + self.edges[from_node as usize].successors.insert(to_node); + self.edges[to_node as usize].predecessors.insert(from_node); } } - fn add_node(&mut self, from_nodes: &[NodeIndex], node: QueryNode) -> NodeIndex { + fn add_node(&mut self, from_nodes: &[u32], node: QueryNode) -> u32 { let new_node_idx = self.nodes.len() as u32; self.nodes.push(node); self.edges.push(Edges { - predecessors: from_nodes.iter().map(|x| x.0).collect(), + predecessors: from_nodes.iter().collect(), successors: RoaringBitmap::new(), }); for from_node in from_nodes { - self.edges[from_node.0 as usize].successors.insert(new_node_idx); + self.edges[*from_node as usize].successors.insert(new_node_idx); } - NodeIndex(new_node_idx) + new_node_idx } } @@ -99,7 +91,7 @@ impl QueryGraph { let word_set = index.words_fst(txn)?; let mut graph = QueryGraph::default(); - let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = + let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![graph.root_node]); // TODO: add all the word derivations found in the fst @@ -173,33 +165,33 @@ impl QueryGraph { Ok(graph) } - pub fn remove_nodes(&mut self, nodes: &[NodeIndex]) { + pub fn remove_nodes(&mut self, nodes: &[u32]) { for &node in nodes { - self.nodes[node.0 as usize] = QueryNode::Deleted; - let edges = self.edges[node.0 as usize].clone(); + self.nodes[node as usize] = QueryNode::Deleted; + let edges = self.edges[node as usize].clone(); for pred in 
edges.predecessors.iter() { - self.edges[pred as usize].successors.remove(node.0); + self.edges[pred as usize].successors.remove(node); } for succ in edges.successors { - self.edges[succ as usize].predecessors.remove(node.0); + self.edges[succ as usize].predecessors.remove(node); } - self.edges[node.0 as usize] = + self.edges[node as usize] = Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }; } } - pub fn remove_nodes_keep_edges(&mut self, nodes: &[NodeIndex]) { + pub fn remove_nodes_keep_edges(&mut self, nodes: &[u32]) { for &node in nodes { - self.nodes[node.0 as usize] = QueryNode::Deleted; - let edges = self.edges[node.0 as usize].clone(); + self.nodes[node as usize] = QueryNode::Deleted; + let edges = self.edges[node as usize].clone(); for pred in edges.predecessors.iter() { - self.edges[pred as usize].successors.remove(node.0); + self.edges[pred as usize].successors.remove(node); self.edges[pred as usize].successors |= &edges.successors; } for succ in edges.successors { - self.edges[succ as usize].predecessors.remove(node.0); + self.edges[succ as usize].predecessors.remove(node); self.edges[succ as usize].predecessors |= &edges.predecessors; } - self.edges[node.0 as usize] = + self.edges[node as usize] = Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }; } } @@ -207,7 +199,7 @@ impl QueryGraph { let mut nodes_to_remove_keeping_edges = vec![]; let mut nodes_to_remove = vec![]; for (node_idx, node) in self.nodes.iter().enumerate() { - let node_idx = NodeIndex(node_idx as u32); + let node_idx = node_idx as u32; let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue }; if positions.contains(&position) { nodes_to_remove_keeping_edges.push(node_idx) @@ -231,7 +223,7 @@ impl QueryGraph { || (!matches!(node, QueryNode::Start | QueryNode::Deleted) && self.edges[node_idx].predecessors.is_empty()) { - nodes_to_remove.push(NodeIndex(node_idx as u32)); + nodes_to_remove.push(node_idx as u32); } } if nodes_to_remove.is_empty() { @@ -315,9 +307,9 @@ node [shape = "record"] continue; } desc.push_str(&format!("{node} [label = {:?}]", &self.nodes[node],)); - if node == self.root_node.0 as usize { + if node == self.root_node as usize { desc.push_str("[color = blue]"); - } else if node == self.end_node.0 as usize { + } else if node == self.end_node as usize { desc.push_str("[color = red]"); } desc.push_str(";\n"); diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 45dda3c1f..6978491cd 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,11 +1,11 @@ -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeSet, HashSet}; use heed::RoTxn; use roaring::RoaringBitmap; use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; -use crate::new::{NodeIndex, QueryGraph}; +use crate::new::QueryGraph; use crate::{Index, Result}; impl RankingRuleGraph { @@ -36,8 +36,8 @@ impl RankingRuleGraph { edges.sort_by_key(|e| e.0); for (cost, details) in edges { ranking_rule_graph.all_edges.push(Some(Edge { - from_node: NodeIndex(node_idx as u32), - to_node: NodeIndex(successor_idx), + from_node: node_idx as u32, + to_node: successor_idx, cost, details, })); diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index f1c1035a3..00babb560 100644 --- 
a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -3,25 +3,23 @@ use std::collections::{BTreeMap, HashSet}; use itertools::Itertools; use roaring::RoaringBitmap; -use crate::new::NodeIndex; - use super::{ - empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, EdgeIndex, RankingRuleGraph, + empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, RankingRuleGraph, RankingRuleGraphTrait, }; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Path { - pub edges: Vec, + pub edges: Vec, pub cost: u64, } struct DijkstraState { unvisited: RoaringBitmap, // should be a small bitset? distances: Vec, // or binary heap, or btreemap? (f64, usize) - edges: Vec, + edges: Vec, edge_costs: Vec, - paths: Vec>, + paths: Vec>, } pub struct KCheapestPathsState { @@ -104,29 +102,26 @@ impl KCheapestPathsState { .iter() .enumerate() { - let Some(edge) = graph.all_edges[edge_idx.0].as_ref() else { continue; }; + let Some(edge) = graph.all_edges[*edge_idx as usize].as_ref() else { continue; }; let Edge { from_node: spur_node, .. } = edge; - // TODO: - // Here, check that the root path is not dicarded by the empty_paths_cache - // If it is, then continue to the next spur_node let root_path = &self.kth_cheapest_path.edges[..i]; if empty_paths_cache.path_is_empty(root_path) { continue; } - let root_cost = root_path - .iter() - .fold(0, |sum, next| sum + graph.get_edge(*next).as_ref().unwrap().cost as u64); + let root_cost = root_path.iter().fold(0, |sum, next| { + sum + graph.all_edges[*next as usize].as_ref().unwrap().cost as u64 + }); let mut tmp_removed_edges = vec![]; // for all the paths already found that share a common prefix with the root path // we delete the edge from the spur node to the next one for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) { let was_removed = - graph.node_edges[spur_node.0 as usize].remove(edge_index_to_remove.0 as u32); + graph.node_edges[*spur_node as usize].remove(edge_index_to_remove); if was_removed { - tmp_removed_edges.push(edge_index_to_remove.0 as u32); + tmp_removed_edges.push(edge_index_to_remove); } } @@ -134,7 +129,7 @@ impl KCheapestPathsState { // we will combine it with the root path to get a potential kth cheapest path let spur_path = graph.cheapest_path_to_end(*spur_node); // restore the temporarily removed edges - graph.node_edges[spur_node.0 as usize].extend(tmp_removed_edges); + graph.node_edges[*spur_node as usize].extend(tmp_removed_edges); let Some(spur_path) = spur_path else { continue; }; let total_cost = root_cost + spur_path.cost; @@ -158,7 +153,7 @@ impl KCheapestPathsState { assert_eq!(cost, cost2); if next_cheapest_path .iter() - .any(|edge_index| graph.all_edges.get(edge_index.0).is_none()) + .any(|edge_index| graph.all_edges[*edge_index as usize].is_none()) { continue; } else { @@ -179,15 +174,15 @@ impl KCheapestPathsState { } impl RankingRuleGraph { - fn cheapest_path_to_end(&self, from: NodeIndex) -> Option { + fn cheapest_path_to_end(&self, from: u32) -> Option { let mut dijkstra = DijkstraState { unvisited: (0..self.query_graph.nodes.len() as u32).collect(), distances: vec![u64::MAX; self.query_graph.nodes.len()], - edges: vec![EdgeIndex(usize::MAX); self.query_graph.nodes.len()], + edges: vec![u32::MAX; self.query_graph.nodes.len()], edge_costs: vec![u8::MAX; self.query_graph.nodes.len()], paths: vec![None; self.query_graph.nodes.len()], }; - dijkstra.distances[from.0 as usize] = 0; + dijkstra.distances[from as 
usize] = 0; // TODO: could use a binary heap here to store the distances, or a btreemap while let Some(cur_node) = @@ -197,55 +192,43 @@ impl RankingRuleGraph { if cur_node_dist == u64::MAX { return None; } - if cur_node == self.query_graph.end_node.0 { + if cur_node == self.query_graph.end_node { break; } - // this is expensive, but shouldn't - // ideally I could quickly get a bitmap of all a node's successors - // then take the intersection with unvisited - let succ_cur_node: &RoaringBitmap = &self.successors[cur_node as usize]; - // .iter() - // .map(|e| self.all_edges[e as usize].as_ref().unwrap().to_node.0) - // .collect(); - // TODO: this intersection may be slow but shouldn't be, - // can use a bitmap intersection instead + let succ_cur_node = &self.successors[cur_node as usize]; let unvisited_succ_cur_node = succ_cur_node & &dijkstra.unvisited; for succ in unvisited_succ_cur_node { - // cheapest_edge() is also potentially too expensive - let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(NodeIndex(cur_node), NodeIndex(succ)) else { + let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(cur_node, succ) else { continue }; - // println!("cur node dist {cur_node_dist}"); let old_dist_succ = &mut dijkstra.distances[succ as usize]; let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64; if new_potential_distance < *old_dist_succ { *old_dist_succ = new_potential_distance; dijkstra.edges[succ as usize] = cheapest_edge; dijkstra.edge_costs[succ as usize] = cheapest_edge_cost; - dijkstra.paths[succ as usize] = Some(NodeIndex(cur_node)); + dijkstra.paths[succ as usize] = Some(cur_node); } } dijkstra.unvisited.remove(cur_node); } let mut cur = self.query_graph.end_node; - // let mut edge_costs = vec![]; - // let mut distances = vec![]; let mut path_edges = vec![]; - while let Some(n) = dijkstra.paths[cur.0 as usize] { - path_edges.push(dijkstra.edges[cur.0 as usize]); + while let Some(n) = dijkstra.paths[cur as usize] { + path_edges.push(dijkstra.edges[cur as usize]); cur = n; } path_edges.reverse(); Some(Path { edges: path_edges, - cost: dijkstra.distances[self.query_graph.end_node.0 as usize], + cost: dijkstra.distances[self.query_graph.end_node as usize], }) } - pub fn cheapest_edge(&self, cur_node: NodeIndex, succ: NodeIndex) -> Option<(EdgeIndex, u8)> { + pub fn cheapest_edge(&self, cur_node: u32, succ: u32) -> Option<(u32, u8)> { self.visit_edges(cur_node, succ, |edge_idx, edge| { std::ops::ControlFlow::Break((edge_idx, edge.cost)) }) diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 0c9768f04..263b78d6a 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -1,10 +1,11 @@ use std::collections::HashMap; use std::marker::PhantomData; +use fxhash::FxHashMap; use heed::RoTxn; use roaring::RoaringBitmap; -use super::{EdgeDetails, EdgeIndex, RankingRuleGraph, RankingRuleGraphTrait}; +use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; use crate::new::BitmapOrAllRef; use crate::{Index, Result}; @@ -16,12 +17,12 @@ use crate::{Index, Result}; // by using a pointer (real, Rc, bumpalo, or in a vector)??? 
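// ---------------------------------------------------------------------------
// Illustrative note, not part of the patch series: `get_edge_docids` below
// uses the "contains_key, insert, then re-index" shape rather than the Entry
// API. A minimal sketch of why: the value is computed fallibly, and the
// method hands back a shared reference into the map afterwards. Hypothetical
// types; the `fxhash` and `roaring` crates are assumed.
use fxhash::FxHashMap;
use roaring::RoaringBitmap;

struct Cache {
    cache: FxHashMap<u32, RoaringBitmap>,
}

impl Cache {
    fn get_or_compute(
        &mut self,
        key: u32,
        compute: impl FnOnce() -> Result<RoaringBitmap, String>,
    ) -> Result<&RoaringBitmap, String> {
        if !self.cache.contains_key(&key) {
            // The fallible computation runs while no reference into the map
            // is held, so the early `?` return cannot conflict with a borrow.
            let value = compute()?;
            self.cache.insert(key, value);
        }
        Ok(&self.cache[&key])
    }
}
// ---------------------------------------------------------------------------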
pub struct EdgeDocidsCache { - pub cache: HashMap, + pub cache: FxHashMap, // TODO: There is a big difference between `cache`, which is always valid, and // `empty_path_prefixes`, which is only accurate for a particular universe // ALSO, we should have a universe-specific `empty_edge` to use - // pub empty_path_prefixes: HashSet>, + // pub empty_path_prefixes: HashSet>, _phantom: PhantomData, } impl Default for EdgeDocidsCache { @@ -39,21 +40,21 @@ impl EdgeDocidsCache { index: &Index, txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, - edge_index: &EdgeIndex, + edge_index: u32, graph: &RankingRuleGraph, ) -> Result> { - if self.cache.contains_key(edge_index) { - return Ok(BitmapOrAllRef::Bitmap(&self.cache[edge_index])); + if self.cache.contains_key(&edge_index) { + return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } - let edge = graph.get_edge(*edge_index).as_ref().unwrap(); + let edge = graph.all_edges[edge_index as usize].as_ref().unwrap(); match &edge.details { EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All), EdgeDetails::Data(details) => { let docids = G::compute_docids(index, txn, db_cache, details)?; - let _ = self.cache.insert(*edge_index, docids); - let docids = &self.cache[edge_index]; + let _ = self.cache.insert(edge_index, docids); + let docids = &self.cache[&edge_index]; Ok(BitmapOrAllRef::Bitmap(docids)) } } diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index 989a08a0d..5748dce3c 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,17 +1,18 @@ use std::collections::HashSet; -use super::{paths_map::PathsMap, EdgeIndex}; +use roaring::RoaringBitmap; + +use super::paths_map::PathsMap; #[derive(Default)] pub struct EmptyPathsCache { - pub empty_edges: HashSet, + pub empty_edges: RoaringBitmap, pub empty_prefixes: PathsMap<()>, } impl EmptyPathsCache { - pub fn path_is_empty(&self, path: &[EdgeIndex]) -> bool { + pub fn path_is_empty(&self, path: &[u32]) -> bool { for edge in path { - // TODO: should be a bitmap intersection - if self.empty_edges.contains(edge) { + if self.empty_edges.contains(*edge) { return true; } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index f7a312240..b5c928ffa 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -6,14 +6,14 @@ pub mod paths_map; pub mod proximity; pub mod resolve_paths; -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeSet, HashSet}; use std::ops::ControlFlow; use heed::RoTxn; use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; -use super::{NodeIndex, QueryGraph, QueryNode}; +use super::{QueryGraph, QueryNode}; use crate::{Index, Result}; #[derive(Debug, Clone)] @@ -24,21 +24,18 @@ pub enum EdgeDetails { #[derive(Debug, Clone)] pub struct Edge { - from_node: NodeIndex, - to_node: NodeIndex, + from_node: u32, + to_node: u32, cost: u8, details: EdgeDetails, } #[derive(Debug, Clone)] pub struct EdgePointer<'graph, E> { - pub index: EdgeIndex, + pub index: u32, pub edge: &'graph Edge, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct EdgeIndex(pub usize); - pub trait RankingRuleGraphTrait { /// The details of an edge connecting two query nodes. 
These details /// should be sufficient to compute the edge's cost and associated document ids @@ -103,26 +100,22 @@ pub struct RankingRuleGraph { // TODO: // node_successors? - // pub removed_edges: HashSet, - // pub tmp_removed_edges: HashSet, + // pub removed_edges: HashSet, + // pub tmp_removed_edges: HashSet, } impl RankingRuleGraph { - pub fn get_edge(&self, edge_index: EdgeIndex) -> &Option> { - &self.all_edges[edge_index.0] - } - // Visit all edges between the two given nodes in order of increasing cost. pub fn visit_edges<'graph, O>( &'graph self, - from: NodeIndex, - to: NodeIndex, - mut visit: impl FnMut(EdgeIndex, &'graph Edge) -> ControlFlow, + from: u32, + to: u32, + mut visit: impl FnMut(u32, &'graph Edge) -> ControlFlow, ) -> Option { - let from_edges = &self.node_edges[from.0 as usize]; + let from_edges = &self.node_edges[from as usize]; for edge_idx in from_edges { let edge = self.all_edges[edge_idx as usize].as_ref().unwrap(); if edge.to_node == to { - let cf = visit(EdgeIndex(edge_idx as usize), edge); + let cf = visit(edge_idx, edge); match cf { ControlFlow::Continue(_) => continue, ControlFlow::Break(o) => return Some(o), @@ -133,21 +126,21 @@ impl RankingRuleGraph { None } - fn remove_edge(&mut self, edge_index: EdgeIndex) { - let edge_opt = &mut self.all_edges[edge_index.0]; + fn remove_edge(&mut self, edge_index: u32) { + let edge_opt = &mut self.all_edges[edge_index as usize]; let Some(edge) = &edge_opt else { return }; let (from_node, to_node) = (edge.from_node, edge.to_node); *edge_opt = None; - let from_node_edges = &mut self.node_edges[from_node.0 as usize]; - from_node_edges.remove(edge_index.0 as u32); + let from_node_edges = &mut self.node_edges[from_node as usize]; + from_node_edges.remove(edge_index); let mut new_successors_from_node = RoaringBitmap::new(); - for edge in from_node_edges.iter() { - let Edge { to_node, .. } = &self.all_edges[edge as usize].as_ref().unwrap(); - new_successors_from_node.insert(to_node.0); + for from_node_edge in from_node_edges.iter() { + let Edge { to_node, .. 
} = &self.all_edges[from_node_edge as usize].as_ref().unwrap(); + new_successors_from_node.insert(*to_node); } - self.successors[from_node.0 as usize] = new_successors_from_node; + self.successors[from_node as usize] = new_successors_from_node; } // pub fn remove_nodes(&mut self, nodes: &[usize]) { // for &node in nodes { @@ -190,7 +183,7 @@ impl RankingRuleGraph { // } // } // } - // fn is_removed_edge(&self, edge: EdgeIndex) -> bool { + // fn is_removed_edge(&self, edge: u32) -> bool { // self.removed_edges.contains(&edge) || self.tmp_removed_edges.contains(&edge) // } @@ -203,9 +196,9 @@ impl RankingRuleGraph { continue; } desc.push_str(&format!("{node_idx} [label = {:?}]", node)); - if node_idx == self.query_graph.root_node.0 as usize { + if node_idx == self.query_graph.root_node as usize { desc.push_str("[color = blue]"); - } else if node_idx == self.query_graph.end_node.0 as usize { + } else if node_idx == self.query_graph.end_node as usize { desc.push_str("[color = red]"); } desc.push_str(";\n"); diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index b1e4bb451..572cb975f 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -3,14 +3,16 @@ use std::collections::HashSet; use std::fmt::Write; use std::hash::{Hash, Hasher}; +use roaring::RoaringBitmap; + use super::cheapest_paths::Path; -use super::{EdgeDetails, EdgeIndex, RankingRuleGraph, RankingRuleGraphTrait, Edge}; +use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait, Edge}; use crate::new::QueryNode; #[derive(Debug)] pub struct PathsMap { - nodes: Vec<(EdgeIndex, PathsMap)>, + nodes: Vec<(u32, PathsMap)>, value: Option } impl Default for PathsMap { @@ -36,7 +38,7 @@ impl PathsMap { self.nodes.is_empty() && self.value.is_none() } - pub fn insert(&mut self, mut edges: impl Iterator, value: V) { + pub fn insert(&mut self, mut edges: impl Iterator, value: V) { match edges.next() { None => { self.value = Some(value); @@ -54,7 +56,7 @@ impl PathsMap { } } } - fn remove_first_rec(&mut self, cur: &mut Vec) -> (bool, V) { + fn remove_first_rec(&mut self, cur: &mut Vec) -> (bool, V) { let Some((first_edge, rest)) = self.nodes.first_mut() else { // The PathsMap has to be correct by construction here, otherwise // the unwrap() will crash @@ -69,7 +71,7 @@ impl PathsMap { (false, value) } } - pub fn remove_first(&mut self) -> Option<(Vec, V)> { + pub fn remove_first(&mut self) -> Option<(Vec, V)> { if self.is_empty() { return None } @@ -78,7 +80,7 @@ impl PathsMap { let (_, value) = self.remove_first_rec(&mut result); Some((result, value)) } - pub fn iterate_rec(&self, cur: &mut Vec, visit: &mut impl FnMut(&Vec, &V)) { + pub fn iterate_rec(&self, cur: &mut Vec, visit: &mut impl FnMut(&Vec, &V)) { if let Some(value) = &self.value { visit(cur, value); } @@ -88,7 +90,7 @@ impl PathsMap { cur.pop(); } } - pub fn iterate(&self, mut visit: impl FnMut(&Vec, &V)) { + pub fn iterate(&self, mut visit: impl FnMut(&Vec, &V)) { self.iterate_rec(&mut vec![], &mut visit) } @@ -97,10 +99,10 @@ impl PathsMap { self.remove_prefix(prefix); }); } - pub fn remove_edges(&mut self, forbidden_edges: &HashSet) { + pub fn remove_edges(&mut self, forbidden_edges: &RoaringBitmap) { let mut i = 0; while i < self.nodes.len() { - let should_remove = if forbidden_edges.contains(&self.nodes[i].0) { + let should_remove = if forbidden_edges.contains(self.nodes[i].0) { true } else if !self.nodes[i].1.nodes.is_empty() { 
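// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series: `PathsMap` below is a
// trie keyed by sequences of edge indices. A stripped-down version of its
// insert and of the prefix test that the empty-paths cache relies on:
struct Trie {
    children: Vec<(u32, Trie)>,
    terminal: bool,
}

impl Trie {
    fn new() -> Self {
        Trie { children: Vec::new(), terminal: false }
    }

    fn insert(&mut self, path: &[u32]) {
        let [first, rest @ ..] = path else {
            self.terminal = true;
            return;
        };
        match self.children.iter().position(|(e, _)| e == first) {
            Some(i) => self.children[i].1.insert(rest),
            None => {
                let mut child = Trie::new();
                child.insert(rest);
                self.children.push((*first, child));
            }
        }
    }

    /// True if some stored path is a prefix of `path`, i.e. `path` extends a
    /// path already known to be empty and can be skipped outright.
    fn contains_prefix_of(&self, path: &[u32]) -> bool {
        if self.terminal {
            return true;
        }
        let [first, rest @ ..] = path else { return false };
        self.children
            .iter()
            .find(|(e, _)| e == first)
            .map_or(false, |(_, child)| child.contains_prefix_of(rest))
    }
}
// ---------------------------------------------------------------------------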
self.nodes[i].1.remove_edges(forbidden_edges); @@ -115,7 +117,7 @@ impl PathsMap { } } } - pub fn remove_edge(&mut self, forbidden_edge: &EdgeIndex) { + pub fn remove_edge(&mut self, forbidden_edge: &u32) { let mut i = 0; while i < self.nodes.len() { let should_remove = if &self.nodes[i].0 == forbidden_edge { @@ -133,7 +135,7 @@ impl PathsMap { } } } - pub fn remove_prefix(&mut self, forbidden_prefix: &[EdgeIndex]) { + pub fn remove_prefix(&mut self, forbidden_prefix: &[u32]) { let [first_edge, remaining_prefix @ ..] = forbidden_prefix else { self.nodes.clear(); self.value = None; @@ -157,7 +159,7 @@ impl PathsMap { } } - pub fn edge_indices_after_prefix(&self, prefix: &[EdgeIndex]) -> Vec { + pub fn edge_indices_after_prefix(&self, prefix: &[u32]) -> Vec { let [first_edge, remaining_prefix @ ..] = prefix else { return self.nodes.iter().map(|n| n.0).collect(); }; @@ -169,7 +171,7 @@ impl PathsMap { vec![] } - pub fn contains_prefix_of_path(&self, path: &[EdgeIndex]) -> bool { + pub fn contains_prefix_of_path(&self, path: &[u32]) -> bool { if self.value.is_some() { return true } @@ -202,7 +204,7 @@ impl PathsMap { h.finish() }; for (edge_idx, rest) in self.nodes.iter() { - let Some(Edge { from_node, to_node, cost, details }) = graph.get_edge(*edge_idx).as_ref() else { + let Some(Edge { from_node, to_node, cost, details }) = graph.all_edges[*edge_idx as usize].as_ref() else { continue; }; let mut path_to = path_from.clone(); @@ -235,9 +237,9 @@ impl RankingRuleGraph { continue; } desc.push_str(&format!("{node_idx} [label = {:?}]", node)); - if node_idx == self.query_graph.root_node.0 as usize { + if node_idx == self.query_graph.root_node as usize { desc.push_str("[color = blue]"); - } else if node_idx == self.query_graph.end_node.0 as usize { + } else if node_idx == self.query_graph.end_node as usize { desc.push_str("[color = red]"); } desc.push_str(";\n"); @@ -246,7 +248,7 @@ impl RankingRuleGraph { for (edge_idx, edge) in self.all_edges.iter().enumerate() { let Some(edge) = edge else { continue }; let Edge { from_node, to_node, cost, details } = edge; - let color = if path.edges.contains(&EdgeIndex(edge_idx)) { + let color = if path.edges.contains(&(edge_idx as u32)) { "red" } else { "green" @@ -283,7 +285,7 @@ mod tests { use crate::new::ranking_rule_graph::cheapest_paths::KCheapestPathsState; use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; use crate::new::ranking_rule_graph::proximity::ProximityGraph; - use crate::new::ranking_rule_graph::{RankingRuleGraph, EdgeIndex}; + use crate::new::ranking_rule_graph::RankingRuleGraph; use crate::search::new::query_term::{word_derivations, LocatedQueryTerm}; use crate::search::new::QueryGraph; use charabia::Tokenize; @@ -358,11 +360,11 @@ mod tests { let desc = path_tree.graphviz(&prox_graph); println!("{desc}"); - // let path = vec![EdgeIndex { from: 0, to: 2, edge_idx: 0 }, EdgeIndex { from: 2, to: 3, edge_idx: 0 }, EdgeIndex { from: 3, to: 4, edge_idx: 0 }, EdgeIndex { from: 4, to: 5, edge_idx: 0 }, EdgeIndex { from: 5, to: 8, edge_idx: 0 }, EdgeIndex { from: 8, to: 1, edge_idx: 0 }, EdgeIndex { from: 1, to: 10, edge_idx: 0 }]; + // let path = vec![u32 { from: 0, to: 2, edge_idx: 0 }, u32 { from: 2, to: 3, edge_idx: 0 }, u32 { from: 3, to: 4, edge_idx: 0 }, u32 { from: 4, to: 5, edge_idx: 0 }, u32 { from: 5, to: 8, edge_idx: 0 }, u32 { from: 8, to: 1, edge_idx: 0 }, u32 { from: 1, to: 10, edge_idx: 0 }]; // println!("{}", psath_tree.contains_prefix_of_path(&path)); - // let path = vec![EdgeIndex { from: 0, to: 2, edge_idx: 
0 }, EdgeIndex { from: 2, to: 3, edge_idx: 0 }, EdgeIndex { from: 3, to: 4, edge_idx: 0 }, EdgeIndex { from: 4, to: 5, edge_idx: 0 }, EdgeIndex { from: 5, to: 6, edge_idx: 0 }, EdgeIndex { from: 6, to: 7, edge_idx: 0 }, EdgeIndex { from: 7, to: 1, edge_idx: 0 }]; + // let path = vec![u32 { from: 0, to: 2, edge_idx: 0 }, u32 { from: 2, to: 3, edge_idx: 0 }, u32 { from: 3, to: 4, edge_idx: 0 }, u32 { from: 4, to: 5, edge_idx: 0 }, u32 { from: 5, to: 6, edge_idx: 0 }, u32 { from: 6, to: 7, edge_idx: 0 }, u32 { from: 7, to: 1, edge_idx: 0 }]; // path_tree.iterate(|path, cost| { @@ -370,18 +372,18 @@ mod tests { // }); // path_tree.remove_forbidden_prefix(&[ - // EdgeIndex { from: 0, to: 2, edge_idx: 0 }, - // EdgeIndex { from: 2, to: 3, edge_idx: 2 }, + // u32 { from: 0, to: 2, edge_idx: 0 }, + // u32 { from: 2, to: 3, edge_idx: 2 }, // ]); // let desc = path_tree.graphviz(); // println!("{desc}"); - // path_tree.remove_forbidden_edge(&EdgeIndex { from: 5, to: 6, cost: 1 }); + // path_tree.remove_forbidden_edge(&u32 { from: 5, to: 6, cost: 1 }); // let desc = path_tree.graphviz(); // println!("AFTER REMOVING 5-6 [1]:\n{desc}"); - // path_tree.remove_forbidden_edge(&EdgeIndex { from: 3, to: 4, cost: 1 }); + // path_tree.remove_forbidden_edge(&u32 { from: 3, to: 4, cost: 1 }); // let desc = path_tree.graphviz(); // println!("AFTER REMOVING 3-4 [1]:\n{desc}"); @@ -396,7 +398,7 @@ mod tests { // let desc = path_tree.graphviz(); // println!("AFTER REMOVING: {desc}"); - // path_tree.remove_all_containing_edge(&EdgeIndex { from: 5, to: 6, cost: 2 }); + // path_tree.remove_all_containing_edge(&u32 { from: 5, to: 6, cost: 2 }); // let desc = path_tree.graphviz(); // println!("{desc}"); diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs index 76823a32a..95cf4629b 100644 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs @@ -35,7 +35,7 @@ impl RankingRuleGraph { 'edge_loop: for edge_index in edge_indexes { processed_edges.push(edge_index); let edge_docids = - edge_docids_cache.get_edge_docids(index, txn, db_cache, &edge_index, self)?; + edge_docids_cache.get_edge_docids(index, txn, db_cache, edge_index, self)?; match edge_docids { BitmapOrAllRef::Bitmap(edge_docids) => { if edge_docids.is_disjoint(universe) { diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 8bc56bb23..e0e3c5321 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -1,10 +1,11 @@ +use fxhash::FxHashMap; use heed::{BytesDecode, RoTxn}; use roaring::{MultiOps, RoaringBitmap}; use std::collections::{HashMap, HashSet, VecDeque}; use super::db_cache::DatabaseCache; use super::query_term::{QueryTerm, WordDerivations}; -use super::{NodeIndex, QueryGraph}; +use super::QueryGraph; use crate::{Index, Result, RoaringBitmapCodec}; // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. 
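The hunk below swaps the standard library HashMap for FxHashMap in NodeDocIdsCache: Fx is a fast non-cryptographic hash, much cheaper than the default SipHash for small integer keys such as node indices. The cache itself is plain per-key memoization, the same shape the `get_docids` method introduced later in this series takes. A minimal sketch of that pattern — hypothetical names (`MemoCache`, `get_or_compute`), only the fxhash crate assumed, not milli code:

use fxhash::FxHashMap;

#[derive(Default)]
struct MemoCache {
    cache: FxHashMap<u32, Vec<u32>>,
}

impl MemoCache {
    // Compute the value for `key` at most once, then hand out shared references.
    // The fallible computation rules out `Entry::or_insert_with`, which is why
    // this sketch (and the real `get_docids`) uses a `contains_key` check
    // followed by an index lookup instead of the entry API.
    fn get_or_compute(
        &mut self,
        key: u32,
        compute: impl FnOnce() -> Result<Vec<u32>, String>,
    ) -> Result<&Vec<u32>, String> {
        if !self.cache.contains_key(&key) {
            let value = compute()?;
            self.cache.insert(key, value);
        }
        Ok(&self.cache[&key])
    }
}
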
@@ -12,7 +13,7 @@ use crate::{Index, Result, RoaringBitmapCodec}; // TODO: reuse NodeDocidsCache in between calls to resolve_query_graph #[derive(Default)] pub struct NodeDocIdsCache { - pub cache: HashMap, + pub cache: FxHashMap, } pub fn resolve_query_graph<'transaction>( @@ -35,7 +36,7 @@ pub fn resolve_query_graph<'transaction>( next_nodes_to_visit.push_front(q.root_node); while let Some(node) = next_nodes_to_visit.pop_front() { - let predecessors = &q.edges[node.0 as usize].predecessors; + let predecessors = &q.edges[node as usize].predecessors; if !predecessors.is_subset(&nodes_resolved) { next_nodes_to_visit.push_back(node); continue; @@ -44,7 +45,7 @@ pub fn resolve_query_graph<'transaction>( let predecessors_iter = predecessors.iter().map(|p| &nodes_docids[p as usize]); let predecessors_docids = MultiOps::union(predecessors_iter); - let n = &q.nodes[node.0 as usize]; + let n = &q.nodes[node as usize]; // println!("resolving {node} {n:?}, predecessors: {predecessors:?}, their docids: {predecessors_docids:?}"); let node_docids = match n { super::QueryNode::Term(located_term) => { @@ -96,16 +97,16 @@ pub fn resolve_query_graph<'transaction>( return Ok(predecessors_docids); } }; - nodes_resolved.insert(node.0); - nodes_docids[node.0 as usize] = node_docids; + nodes_resolved.insert(node); + nodes_docids[node as usize] = node_docids; - for succ in q.edges[node.0 as usize].successors.iter() { - if !next_nodes_to_visit.contains(&NodeIndex(succ)) && !nodes_resolved.contains(succ) { - next_nodes_to_visit.push_back(NodeIndex(succ)); + for succ in q.edges[node as usize].successors.iter() { + if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) { + next_nodes_to_visit.push_back(succ); } } // This is currently slow but could easily be implemented very efficiently - for prec in q.edges[node.0 as usize].predecessors.iter() { + for prec in q.edges[node as usize].predecessors.iter() { if q.edges[prec as usize].successors.is_subset(&nodes_resolved) { nodes_docids[prec as usize].clear(); } From a938fbde4a9046c059b4d29d21b642840eb1e140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 13:21:41 +0100 Subject: [PATCH 020/234] Use a cache when resolving the query graph --- milli/src/search/new/resolve_query_graph.rs | 121 ++++++++++++-------- milli/src/search/new/words.rs | 13 ++- 2 files changed, 85 insertions(+), 49 deletions(-) diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index e0e3c5321..6110bae49 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -4,7 +4,7 @@ use roaring::{MultiOps, RoaringBitmap}; use std::collections::{HashMap, HashSet, VecDeque}; use super::db_cache::DatabaseCache; -use super::query_term::{QueryTerm, WordDerivations}; +use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use super::QueryGraph; use crate::{Index, Result, RoaringBitmapCodec}; @@ -13,13 +13,66 @@ use crate::{Index, Result, RoaringBitmapCodec}; // TODO: reuse NodeDocidsCache in between calls to resolve_query_graph #[derive(Default)] pub struct NodeDocIdsCache { - pub cache: FxHashMap, + pub cache: FxHashMap, +} +impl NodeDocIdsCache { + fn get_docids<'cache, 'transaction>( + &'cache mut self, + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + term: &QueryTerm, + node_idx: u32, + ) -> Result<&'cache RoaringBitmap> { + if self.cache.contains_key(&node_idx) { + return 
Ok(&self.cache[&node_idx]); + }; + let docids = match term { + QueryTerm::Phrase(_) => { + todo!("resolve phrase") + } + QueryTerm::Word { + derivations: + WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db }, + } => { + let derivations_docids = { + let mut or_docids = vec![]; + for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) { + if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? { + or_docids.push(word_docids); + } + } + if *use_prefix_db { + if let Some(prefix_docids) = + db_cache.get_prefix_docids(index, txn, original.as_str())? + { + or_docids.push(prefix_docids); + } + } + or_docids + }; + let derivations_iter = derivations_docids + .into_iter() + .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()); + MultiOps::union(derivations_iter) + // TODO: if `or` is empty, register that somewhere, and immediately return an empty bitmap + // On the other hand, `or` *cannot* be empty, only its intersection with the universe can + // + // TODO: Or we don't do anything and accumulate all these operations in a tree of operations + // between frozen roaring bitmap that is resolved only at the very end + } + }; + let _ = self.cache.insert(node_idx, docids); + let docids = &self.cache[&node_idx]; + Ok(docids) + } } pub fn resolve_query_graph<'transaction>( index: &Index, txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, + node_docids_cache: &mut NodeDocIdsCache, q: &QueryGraph, universe: &RoaringBitmap, ) -> Result { @@ -30,7 +83,7 @@ pub fn resolve_query_graph<'transaction>( let mut nodes_resolved = RoaringBitmap::new(); // TODO: should be given as an argument and kept between invocations of resolve query graph - let mut nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()]; + let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()]; let mut next_nodes_to_visit = VecDeque::new(); next_nodes_to_visit.push_front(q.root_node); @@ -42,7 +95,7 @@ pub fn resolve_query_graph<'transaction>( continue; } // Take union of all predecessors - let predecessors_iter = predecessors.iter().map(|p| &nodes_docids[p as usize]); + let predecessors_iter = predecessors.iter().map(|p| &path_nodes_docids[p as usize]); let predecessors_docids = MultiOps::union(predecessors_iter); let n = &q.nodes[node as usize]; @@ -50,47 +103,12 @@ pub fn resolve_query_graph<'transaction>( let node_docids = match n { super::QueryNode::Term(located_term) => { let term = &located_term.value; - match term { - QueryTerm::Phrase(_) => todo!("resolve phrase"), - QueryTerm::Word { - derivations: - WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db }, - } => { - let derivations_docids = { - let mut or_docids = vec![]; - for word in - zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) - { - if let Some(word_docids) = - db_cache.get_word_docids(index, txn, word)? - { - or_docids.push(word_docids); - } - } - if *use_prefix_db { - if let Some(prefix_docids) = - db_cache.get_prefix_docids(index, txn, original.as_str())? 
- { - or_docids.push(prefix_docids); - } - } - or_docids - }; - let derivations_iter = derivations_docids - .into_iter() - .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()); - let derivations_docids = MultiOps::union(derivations_iter); - // TODO: if `or` is empty, register that somewhere, and immediately return an empty bitmap - // On the other hand, `or` *cannot* be empty, only its intersection with the universe can - // - // TODO: Or we don't do anything and accumulate all these operations in a tree of operations - // between frozen roaring bitmap that is resolved only at the very end - predecessors_docids & derivations_docids - } - } + let derivations_docids = + node_docids_cache.get_docids(index, txn, db_cache, term, node)?; + predecessors_docids & derivations_docids } super::QueryNode::Deleted => { - todo!() + panic!() } super::QueryNode::Start => universe.clone(), super::QueryNode::End => { @@ -98,7 +116,7 @@ pub fn resolve_query_graph<'transaction>( } }; nodes_resolved.insert(node); - nodes_docids[node as usize] = node_docids; + path_nodes_docids[node as usize] = node_docids; for succ in q.edges[node as usize].successors.iter() { if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) { @@ -108,7 +126,7 @@ pub fn resolve_query_graph<'transaction>( // This is currently slow but could easily be implemented very efficiently for prec in q.edges[node as usize].predecessors.iter() { if q.edges[prec as usize].successors.is_subset(&nodes_resolved) { - nodes_docids[prec as usize].clear(); + path_nodes_docids[prec as usize].clear(); } } // println!("cached docids: {nodes_docids:?}"); @@ -125,6 +143,7 @@ mod tests { use crate::db_snap; use crate::index::tests::TempIndex; use crate::new::db_cache::DatabaseCache; + use crate::new::resolve_query_graph::NodeDocIdsCache; use crate::search::new::query_term::{word_derivations, LocatedQueryTerm}; use crate::search::new::QueryGraph; @@ -184,10 +203,18 @@ mod tests { .unwrap(); let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); println!("{}", graph.graphviz()); - + let mut node_docids_cache = NodeDocIdsCache::default(); let universe = index.documents_ids(&txn).unwrap(); insta::assert_debug_snapshot!(universe, @"RoaringBitmap<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]>"); - let docids = resolve_query_graph(&index, &txn, &mut db_cache, &graph, &universe).unwrap(); + let docids = resolve_query_graph( + &index, + &txn, + &mut db_cache, + &mut node_docids_cache, + &graph, + &universe, + ) + .unwrap(); insta::assert_debug_snapshot!(docids, @"RoaringBitmap<[8, 9, 11]>"); // TODO: test with a reduced universe diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 4d812d9ff..5852d137a 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -4,7 +4,7 @@ use heed::RoTxn; use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; -use super::resolve_query_graph::resolve_query_graph; +use super::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}; use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput}; use crate::{Index, Result, TermsMatchingStrategy}; @@ -14,6 +14,7 @@ pub struct Words { iterating: bool, positions_to_remove: Vec, terms_matching_strategy: TermsMatchingStrategy, + node_docids_cache: NodeDocIdsCache, } impl Words { pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self { @@ -23,6 +24,7 @@ impl Words { iterating: false, positions_to_remove: vec![], terms_matching_strategy, + node_docids_cache: 
<_>::default(), } } } @@ -79,7 +81,14 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { let Some(query_graph) = &mut self.query_graph else { panic!() }; // let graphviz = query_graph.graphviz(); // println!("\n===={graphviz}\n===="); - let this_bucket = resolve_query_graph(index, txn, db_cache, query_graph, universe)?; + let this_bucket = resolve_query_graph( + index, + txn, + db_cache, + &mut self.node_docids_cache, + query_graph, + universe, + )?; // println!("WORDS: this bucket: {this_bucket:?}"); let child_query_graph = query_graph.clone(); // this_bucket is the one that must be returned now From c8e251bf24e66a67d9e0483ce79045f393263575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 13:57:34 +0100 Subject: [PATCH 021/234] Remove noise in codebase --- milli/src/search/new/db_cache.rs | 8 +- .../search/new/graph_based_ranking_rule.rs | 71 +------ milli/src/search/new/mod.rs | 6 +- milli/src/search/new/query_graph.rs | 111 +---------- milli/src/search/new/query_term.rs | 4 - .../search/new/ranking_rule_graph/build.rs | 2 - .../new/ranking_rule_graph/cheapest_paths.rs | 7 +- .../ranking_rule_graph/edge_docids_cache.rs | 6 - .../src/search/new/ranking_rule_graph/mod.rs | 59 +----- .../new/ranking_rule_graph/paths_map.rs | 188 ++---------------- .../new/ranking_rule_graph/proximity/build.rs | 5 +- .../proximity/compute_docids.rs | 11 +- .../new/ranking_rule_graph/proximity/mod.rs | 3 +- .../new/ranking_rule_graph/resolve_paths.rs | 10 +- milli/src/search/new/ranking_rules.rs | 101 +--------- milli/src/search/new/resolve_query_graph.rs | 101 +--------- milli/src/search/new/sort.rs | 17 +- milli/src/search/new/words.rs | 45 +---- 18 files changed, 63 insertions(+), 692 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 0a058d339..ae7cb9b91 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -1,7 +1,8 @@ use std::collections::hash_map::Entry; use fxhash::FxHashMap; -use heed::{types::ByteSlice, RoTxn}; +use heed::types::ByteSlice; +use heed::RoTxn; use crate::{Index, Result}; @@ -62,10 +63,7 @@ impl<'transaction> DatabaseCache<'transaction> { match self.word_pair_proximity_docids.entry(key.clone()) { Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()), Entry::Vacant(entry) => { - // Note that now, we really want to do a prefix iter over (w1, w2) to get all the possible proximities - // but oh well - // - // Actually, we shouldn'transaction greedily access this DB at all + // We shouldn't greedily access this DB at all // a DB (w1, w2) -> [proximities] would be much better // We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity // And if we worked with words encoded as integers, the set of words could be a roaring bitmap diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 0f72b9d5d..43de6a531 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -1,20 +1,15 @@ use heed::RoTxn; use roaring::RoaringBitmap; -use crate::{ - new::ranking_rule_graph::cheapest_paths::{self, Path}, - Index, Result, -}; - -use super::{ - db_cache::DatabaseCache, - ranking_rule_graph::{ - cheapest_paths::KCheapestPathsState, edge_docids_cache::EdgeDocidsCache, - empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, RankingRuleGraph, - RankingRuleGraphTrait, - }, - QueryGraph, RankingRule, RankingRuleOutput, 
-}; +use super::db_cache::DatabaseCache; +use super::ranking_rule_graph::cheapest_paths::KCheapestPathsState; +use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache; +use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; +use super::ranking_rule_graph::paths_map::PathsMap; +use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::{QueryGraph, RankingRule, RankingRuleOutput}; +use crate::new::ranking_rule_graph::cheapest_paths::{self, Path}; +use crate::{Index, Result}; pub struct GraphBasedRankingRule { state: Option>, @@ -43,16 +38,8 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { - // if let Some(state) = &mut self.state { - // // TODO: update the previous state - // // TODO: update the existing graph incrementally, based on a diff - - // } else { + // TODO: update old state instead of starting from scratch let graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?; - // println!("Initialized Proximity Ranking Rule."); - // println!("GRAPH:"); - // let graphviz = graph.graphviz(); - // println!("{graphviz}"); let cheapest_paths_state = KCheapestPathsState::new(&graph); let state = GraphBasedRankingRuleState { @@ -62,13 +49,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap empty_paths_cache: <_>::default(), }; - // let desc = state.graph.graphviz_with_path( - // &state.cheapest_paths_state.as_ref().unwrap().kth_cheapest_path.clone(), - // ); - // println!("Cheapest path: {desc}"); - self.state = Some(state); - // } Ok(()) } @@ -86,17 +67,9 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else { return Ok(None); }; - // println!("Proximity: Next Bucket"); let mut paths = PathsMap::default(); - // let desc = state.graph.dot_description_with_path(&cheapest_paths_state.kth_cheapest_path); - // println!("CHeapest Path: {desc}"); - // TODO: when does it return None? -> when there is no cheapest path - // How to handle it? -> ... return all document ids from the universe? - // - // TODO: Give an empty_edge and empty_prefix argument to the - // compute_paths_of_next_lowest_cost function if let Some(next_cheapest_paths_state) = cheapest_paths_state .compute_paths_of_next_lowest_cost( &mut state.graph, @@ -107,31 +80,12 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap state.cheapest_paths_state = Some(next_cheapest_paths_state); } else { state.cheapest_paths_state = None; - // If returns None if there are no longer any paths to compute - // BUT! paths_map may not be empty, and we need to compute the current bucket still } - // println!("PATHS: {}", paths.graphviz(&state.graph)); - - // paths.iterate(|path, cost| { - // let desc = state.graph.graphviz_with_path(&Path { edges: path.clone(), cost: *cost }); - // println!("Path to resolve of cost {cost}: {desc}"); - // }); - - // let desc = state.graph.dot_description_with_path( - // &state.cheapest_paths_state.as_ref().unwrap().kth_cheapest_path.clone(), - // ); - // println!("Cheapest path: {desc}"); - - // TODO: verify that this is correct - // If the paths are empty, we should probably return the universe? - // BUT! Is there a case where the paths are empty AND the universe is - // not empty? if paths.is_empty() { self.state = None; return Ok(None); } - // Here, log all the paths? 
let bucket = state.graph.resolve_paths( index, @@ -142,10 +96,6 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap universe, paths, )?; - // The call above also updated the graph such that it doesn't contain the empty edges anymore. - // println!("Resolved all the paths: {bucket:?} from universe {:?}", state.universe); - // let graphviz = state.graph.graphviz(); - // println!("{graphviz}"); let next_query_graph = state.graph.query_graph.clone(); @@ -160,7 +110,6 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap _txn: &'transaction RoTxn, _db_cache: &mut DatabaseCache<'transaction>, ) { - // println!("PROXIMITY: end iteration"); self.state = None; } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 17e74e70e..01b466d20 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -14,10 +14,8 @@ pub use query_graph::*; pub use ranking_rules::*; use roaring::RoaringBitmap; -use self::{ - db_cache::DatabaseCache, - query_term::{word_derivations, LocatedQueryTerm}, -}; +use self::db_cache::DatabaseCache; +use self::query_term::{word_derivations, LocatedQueryTerm}; use crate::{Index, Result}; pub enum BitmapOrAllRef<'s> { diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index cbe319af7..5ed746c27 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,13 +1,12 @@ +use std::collections::HashSet; +use std::fmt; use std::fmt::Debug; -use std::{collections::HashSet, fmt}; use heed::RoTxn; use roaring::RoaringBitmap; -use super::{ - db_cache::DatabaseCache, - query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}, -}; +use super::db_cache::DatabaseCache; +use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::{Index, Result}; #[derive(Clone)] @@ -20,8 +19,7 @@ pub enum QueryNode { #[derive(Debug, Clone)] pub struct Edges { - // TODO: use a tiny bitset instead - // something like a simple Vec where most queries will see a vector of one element + // TODO: use a tiny bitset instead, something like a simple Vec where most queries will see a vector of one element pub predecessors: RoaringBitmap, pub successors: RoaringBitmap, } @@ -75,7 +73,6 @@ impl QueryGraph { impl QueryGraph { // TODO: return the list of all matching words here as well - pub fn from_query<'transaction>( index: &Index, txn: &RoTxn, @@ -94,9 +91,7 @@ impl QueryGraph { let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![graph.root_node]); - // TODO: add all the word derivations found in the fst - // and add split words / support phrases - + // TODO: split words / synonyms for length in 1..=query.len() { let query = &query[..length]; @@ -279,18 +274,6 @@ impl Debug for QueryNode { } } -/* -TODO: - -1. Find the minimum number of words to check to resolve the 10 query trees at once. - (e.g. just 0 | 01 | 012 ) -2. Simplify the query tree after removal of a node ✅ -3. Create the proximity graph ✅ -4. Assign different proximities for the ngrams ✅ -5. 
Walk the proximity graph, finding all the potential paths of weight N from START to END ✅ -(without checking the bitmaps) - -*/ impl QueryGraph { pub fn graphviz(&self) -> String { let mut desc = String::new(); @@ -317,91 +300,9 @@ node [shape = "record"] for edge in self.edges[node].successors.iter() { desc.push_str(&format!("{node} -> {edge};\n")); } - // for edge in self.edges[node].incoming.iter() { - // desc.push_str(&format!("{node} -> {edge} [color = grey];\n")); - // } } desc.push('}'); desc } } - -#[cfg(test)] -mod tests { - use charabia::Tokenize; - - use super::{LocatedQueryTerm, QueryGraph, QueryNode}; - use crate::index::tests::TempIndex; - use crate::new::db_cache::DatabaseCache; - use crate::search::new::query_term::word_derivations; - - #[test] - fn build_graph() { - let mut index = TempIndex::new(); - index.index_documents_config.autogenerate_docids = true; - index - .update_settings(|s| { - s.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - index - .add_documents(documents!({ - "text": "0 1 2 3 4 5 6 7 01 23 234 56 79 709 7356", - })) - .unwrap(); - - // let fst = fst::Set::from_iter(["01", "23", "234", "56"]).unwrap(); - let txn = index.read_txn().unwrap(); - let mut db_cache = DatabaseCache::default(); - - let fst = index.words_fst(&txn).unwrap(); - let query = LocatedQueryTerm::from_query( - "0 no 1 2 3 4 5 6 7".tokenize(), - None, - |word, is_prefix| { - word_derivations( - &index, - &txn, - word, - if word.len() < 3 { - 0 - } else if word.len() < 6 { - 1 - } else { - 2 - }, - is_prefix, - &fst, - ) - }, - ) - .unwrap(); - - let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); - println!("{}", graph.graphviz()); - - // let positions_to_remove = vec![3, 6, 0, 4]; - // for p in positions_to_remove { - // graph.remove_words_at_position(p); - // println!("{}", graph.graphviz()); - // } - - // let proximities = |w1: &str, w2: &str| -> Vec { - // if matches!((w1, w2), ("56", "7")) { - // vec![] - // } else { - // vec![1, 2] - // } - // }; - - // let prox_graph = ProximityGraph::from_query_graph(graph, proximities); - - // println!("{}", prox_graph.graphviz()); - } -} - -// fn remove_element_from_vector(v: &mut Vec, el: usize) { -// let position = v.iter().position(|&x| x == el).unwrap(); -// v.swap_remove(position); -// } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 4d2b22264..52943755a 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -17,10 +17,6 @@ use crate::{Index, Result}; #[derive(Debug, Clone)] pub struct WordDerivations { - // TODO: should have a list for the words corresponding to the prefix as well! - // This is to implement the `exactness` ranking rule. 
- // However, we could also consider every term in `zero_typo` (except first one) to - // be words of that the original word is a prefix of pub original: String, pub zero_typo: Vec, pub one_typo: Vec, diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 6978491cd..8e7dd7a04 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -46,8 +46,6 @@ impl RankingRuleGraph { } } } - // ranking_rule_graph.simplify(); - Ok(ranking_rule_graph) } } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 00babb560..fdce85159 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -3,10 +3,9 @@ use std::collections::{BTreeMap, HashSet}; use itertools::Itertools; use roaring::RoaringBitmap; -use super::{ - empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, RankingRuleGraph, - RankingRuleGraphTrait, -}; +use super::empty_paths_cache::EmptyPathsCache; +use super::paths_map::PathsMap; +use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Path { diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 263b78d6a..dddbda6af 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -18,18 +18,12 @@ use crate::{Index, Result}; pub struct EdgeDocidsCache { pub cache: FxHashMap, - - // TODO: There is a big difference between `cache`, which is always valid, and - // `empty_path_prefixes`, which is only accurate for a particular universe - // ALSO, we should have a universe-specific `empty_edge` to use - // pub empty_path_prefixes: HashSet>, _phantom: PhantomData, } impl Default for EdgeDocidsCache { fn default() -> Self { Self { cache: Default::default(), - // empty_path_prefixes: Default::default(), _phantom: Default::default(), } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index b5c928ffa..52b685d08 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -5,7 +5,6 @@ pub mod empty_paths_cache; pub mod paths_map; pub mod proximity; pub mod resolve_paths; - use std::collections::{BTreeSet, HashSet}; use std::ops::ControlFlow; @@ -86,22 +85,10 @@ pub struct RankingRuleGraph { pub node_edges: Vec, pub successors: Vec, - // to get the edges between two nodes: + // TODO: to get the edges between two nodes: // 1. get node_outgoing_edges[from] // 2. get node_incoming_edges[to] // 3. take intersection betweem the two - - // TODO: node edges could be different I guess - // something like: - // pub node_edges: Vec - // where each index is the result of: - // the successor index in the top 16 bits, the edge index in the bottom 16 bits - - // TODO: - // node_successors? - - // pub removed_edges: HashSet, - // pub tmp_removed_edges: HashSet, } impl RankingRuleGraph { // Visit all edges between the two given nodes in order of increasing cost. 
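The TODO comment above describes edge lookup between two nodes as a set intersection: the outgoing edges of `from` intersected with the incoming edges of `to`. Assuming both per-node edge sets were kept as RoaringBitmaps of edge indices (the representation `node_edges` and `successors` already use), the recipe could look like the sketch below; `node_incoming_edges` is a hypothetical field that the graph does not store at this point in the series.

use roaring::RoaringBitmap;

// Every edge index connecting `from` to `to` is in the intersection of the
// outgoing-edge set of `from` and the incoming-edge set of `to`.
fn edges_between(
    node_outgoing_edges: &[RoaringBitmap],
    node_incoming_edges: &[RoaringBitmap],
    from: u32,
    to: u32,
) -> RoaringBitmap {
    &node_outgoing_edges[from as usize] & &node_incoming_edges[to as usize]
}
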
@@ -142,50 +129,6 @@ impl RankingRuleGraph { } self.successors[from_node as usize] = new_successors_from_node; } - // pub fn remove_nodes(&mut self, nodes: &[usize]) { - // for &node in nodes { - // let edge_indices = &mut self.node_edges[node]; - // for edge_index in edge_indices.iter() { - // self.all_edges[*edge_index] = None; - // } - // edge_indices.clear(); - - // let preds = &self.query_graph.edges[node].incoming; - // for pred in preds { - // let edge_indices = &mut self.node_edges[*pred]; - // for edge_index in edge_indices.iter() { - // let edge_opt = &mut self.all_edges[*edge_index]; - // let Some(edge) = edge_opt else { continue; }; - // if edge.to_node == node { - // *edge_opt = None; - // } - // } - // panic!("remove nodes is incorrect at the moment"); - // edge_indices.clear(); - // } - // } - // self.query_graph.remove_nodes(nodes); - // } - // pub fn simplify(&mut self) { - // loop { - // let mut nodes_to_remove = vec![]; - // for (node_idx, node) in self.query_graph.nodes.iter().enumerate() { - // if !matches!(node, QueryNode::End | QueryNode::Deleted) - // && self.node_edges[node_idx].is_empty() - // { - // nodes_to_remove.push(node_idx); - // } - // } - // if nodes_to_remove.is_empty() { - // break; - // } else { - // self.remove_nodes(&nodes_to_remove); - // } - // } - // } - // fn is_removed_edge(&self, edge: u32) -> bool { - // self.removed_edges.contains(&edge) || self.tmp_removed_edges.contains(&edge) - // } pub fn graphviz(&self) -> String { let mut desc = String::new(); diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 572cb975f..6f6512ae4 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -6,14 +6,13 @@ use std::hash::{Hash, Hasher}; use roaring::RoaringBitmap; use super::cheapest_paths::Path; -use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait, Edge}; +use super::{Edge, EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::QueryNode; - #[derive(Debug)] pub struct PathsMap { nodes: Vec<(u32, PathsMap)>, - value: Option + value: Option, } impl Default for PathsMap { fn default() -> Self { @@ -73,11 +72,11 @@ impl PathsMap { } pub fn remove_first(&mut self) -> Option<(Vec, V)> { if self.is_empty() { - return None + return None; } let mut result = vec![]; - let (_, value) = self.remove_first_rec(&mut result); + let (_, value) = self.remove_first_rec(&mut result); Some((result, value)) } pub fn iterate_rec(&self, cur: &mut Vec, visit: &mut impl FnMut(&Vec, &V)) { @@ -85,7 +84,7 @@ impl PathsMap { visit(cur, value); } for (first_edge, rest) in self.nodes.iter() { - cur.push(*first_edge); + cur.push(*first_edge); rest.iterate_rec(cur, visit); cur.pop(); } @@ -163,7 +162,7 @@ impl PathsMap { let [first_edge, remaining_prefix @ ..] = prefix else { return self.nodes.iter().map(|n| n.0).collect(); }; - for (edge, rest) in self.nodes.iter(){ + for (edge, rest) in self.nodes.iter() { if edge == first_edge { return rest.edge_indices_after_prefix(remaining_prefix); } @@ -173,14 +172,12 @@ impl PathsMap { pub fn contains_prefix_of_path(&self, path: &[u32]) -> bool { if self.value.is_some() { - return true + return true; } match path { - [] => { - false - } + [] => false, [first_edge, remaining_path @ ..] 
=> { - for (edge, rest) in self.nodes.iter(){ + for (edge, rest) in self.nodes.iter() { if edge == first_edge { return rest.contains_prefix_of_path(remaining_path); } @@ -197,7 +194,12 @@ impl PathsMap { desc.push_str("\n}\n"); desc } - fn graphviz_rec(&self, desc: &mut String, path_from: Vec, graph: &RankingRuleGraph) { + fn graphviz_rec( + &self, + desc: &mut String, + path_from: Vec, + graph: &RankingRuleGraph, + ) { let id_from = { let mut h = DefaultHasher::new(); path_from.hash(&mut h); @@ -227,7 +229,6 @@ impl PathsMap { } impl RankingRuleGraph { - pub fn graphviz_with_path(&self, path: &Path) -> String { let mut desc = String::new(); desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n"); @@ -248,11 +249,7 @@ impl RankingRuleGraph { for (edge_idx, edge) in self.all_edges.iter().enumerate() { let Some(edge) = edge else { continue }; let Edge { from_node, to_node, cost, details } = edge; - let color = if path.edges.contains(&(edge_idx as u32)) { - "red" - } else { - "green" - }; + let color = if path.edges.contains(&(edge_idx as u32)) { "red" } else { "green" }; match &edge.details { EdgeDetails::Unconditional => { desc.push_str(&format!( @@ -273,157 +270,4 @@ impl RankingRuleGraph { desc.push('}'); desc } - -} - -#[cfg(test)] -mod tests { - use super::PathsMap; - use crate::db_snap; - use crate::index::tests::TempIndex; - use crate::new::db_cache::DatabaseCache; - use crate::new::ranking_rule_graph::cheapest_paths::KCheapestPathsState; - use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; - use crate::new::ranking_rule_graph::proximity::ProximityGraph; - use crate::new::ranking_rule_graph::RankingRuleGraph; - use crate::search::new::query_term::{word_derivations, LocatedQueryTerm}; - use crate::search::new::QueryGraph; - use charabia::Tokenize; - - #[test] - fn paths_tree() { - let mut index = TempIndex::new(); - index.index_documents_config.autogenerate_docids = true; - index - .update_settings(|s| { - s.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - index - .add_documents(documents!([ - { - "text": "0 1 2 3 4 5" - }, - { - "text": "0 a 1 b 2 3 4 5" - }, - { - "text": "0 a 1 b 3 a 4 b 5" - }, - { - "text": "0 a a 1 b 2 3 4 5" - }, - { - "text": "0 a a a a 1 b 3 45" - }, - ])) - .unwrap(); - - db_snap!(index, word_pair_proximity_docids, @"679d1126b569b3e8b10dd937c3faedf9"); - - let txn = index.read_txn().unwrap(); - let mut db_cache = DatabaseCache::default(); - let fst = index.words_fst(&txn).unwrap(); - let query = - LocatedQueryTerm::from_query("0 1 2 3 4 5".tokenize(), None, |word, is_prefix| { - word_derivations(&index, &txn, word, if word.len() < 3 { - 0 - } else if word.len() < 6 { - 1 - } else { - 2 - },is_prefix, &fst) - }) - .unwrap(); - let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); - let empty_paths_cache = EmptyPathsCache::default(); - let mut db_cache = DatabaseCache::default(); - - let mut prox_graph = - RankingRuleGraph::::build(&index, &txn, &mut db_cache, graph).unwrap(); - - println!("{}", prox_graph.graphviz()); - - let mut state = KCheapestPathsState::new(&prox_graph).unwrap(); - - let mut path_tree = PathsMap::default(); - while state.next_cost() <= 6 { - let next_state = state.compute_paths_of_next_lowest_cost(&mut prox_graph, &empty_paths_cache, &mut path_tree); - if let Some(next_state) = next_state { - state = next_state; - } else { - break; - } - } - - let desc = path_tree.graphviz(&prox_graph); - println!("{desc}"); - - // let path = vec![u32 { from: 0, to: 2, 
edge_idx: 0 }, u32 { from: 2, to: 3, edge_idx: 0 }, u32 { from: 3, to: 4, edge_idx: 0 }, u32 { from: 4, to: 5, edge_idx: 0 }, u32 { from: 5, to: 8, edge_idx: 0 }, u32 { from: 8, to: 1, edge_idx: 0 }, u32 { from: 1, to: 10, edge_idx: 0 }]; - // println!("{}", psath_tree.contains_prefix_of_path(&path)); - - - // let path = vec![u32 { from: 0, to: 2, edge_idx: 0 }, u32 { from: 2, to: 3, edge_idx: 0 }, u32 { from: 3, to: 4, edge_idx: 0 }, u32 { from: 4, to: 5, edge_idx: 0 }, u32 { from: 5, to: 6, edge_idx: 0 }, u32 { from: 6, to: 7, edge_idx: 0 }, u32 { from: 7, to: 1, edge_idx: 0 }]; - - - // path_tree.iterate(|path, cost| { - // println!("cost {cost} for path: {path:?}"); - // }); - - // path_tree.remove_forbidden_prefix(&[ - // u32 { from: 0, to: 2, edge_idx: 0 }, - // u32 { from: 2, to: 3, edge_idx: 2 }, - // ]); - // let desc = path_tree.graphviz(); - // println!("{desc}"); - - // path_tree.remove_forbidden_edge(&u32 { from: 5, to: 6, cost: 1 }); - - // let desc = path_tree.graphviz(); - // println!("AFTER REMOVING 5-6 [1]:\n{desc}"); - - // path_tree.remove_forbidden_edge(&u32 { from: 3, to: 4, cost: 1 }); - - // let desc = path_tree.graphviz(); - // println!("AFTER REMOVING 3-4 [1]:\n{desc}"); - - // let p = path_tree.remove_first(); - // println!("PATH: {p:?}"); - // let desc = path_tree.graphviz(); - // println!("AFTER REMOVING: {desc}"); - - // let p = path_tree.remove_first(); - // println!("PATH: {p:?}"); - // let desc = path_tree.graphviz(); - // println!("AFTER REMOVING: {desc}"); - - // path_tree.remove_all_containing_edge(&u32 { from: 5, to: 6, cost: 2 }); - - // let desc = path_tree.graphviz(); - // println!("{desc}"); - - // let first_edges = path_tree.remove_first().unwrap(); - // println!("{first_edges:?}"); - // let desc = path_tree.graphviz(); - // println!("{desc}"); - - // let first_edges = path_tree.remove_first().unwrap(); - // println!("{first_edges:?}"); - // let desc = path_tree.graphviz(); - // println!("{desc}"); - - // let first_edges = path_tree.remove_first().unwrap(); - // println!("{first_edges:?}"); - // let desc = path_tree.graphviz(); - // println!("{desc}"); - - // println!("{path_tree:?}"); - } - - - #[test] - fn test_contains_prefix_of_path() { - - } } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 7149f8bf6..bfcac57ee 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -1,5 +1,8 @@ use std::collections::BTreeMap; +use heed::RoTxn; +use itertools::Itertools; + use super::ProximityEdge; use crate::new::db_cache::DatabaseCache; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; @@ -7,8 +10,6 @@ use crate::new::ranking_rule_graph::proximity::WordPair; use crate::new::ranking_rule_graph::{Edge, EdgeDetails}; use crate::new::QueryNode; use crate::{Index, Result}; -use heed::RoTxn; -use itertools::Itertools; pub fn visit_from_node(from_node: &QueryNode) -> Result> { Ok(Some(match from_node { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 325042761..51c6d6ad5 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,17 +1,17 @@ -use roaring::MultiOps; +use heed::RoTxn; +use roaring::{MultiOps, RoaringBitmap}; use super::{ProximityEdge, WordPair}; 
use crate::new::db_cache::DatabaseCache; -use crate::CboRoaringBitmapCodec; +use crate::{CboRoaringBitmapCodec, Result}; pub fn compute_docids<'transaction>( index: &crate::Index, - txn: &'transaction heed::RoTxn, + txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, edge: &ProximityEdge, -) -> crate::Result { +) -> Result { let ProximityEdge { pairs, proximity } = edge; - // TODO: we should know already which pair of words to look for let mut pair_docids = vec![]; for pair in pairs.iter() { let bytes = match pair { @@ -25,7 +25,6 @@ pub fn compute_docids<'transaction>( bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); pair_docids.push(bitmap); } - pair_docids.sort_by_key(|rb| rb.len()); let docids = MultiOps::union(pair_docids); Ok(docids) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index e4905ead9..16c2acf1f 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -1,12 +1,13 @@ pub mod build; pub mod compute_docids; +use heed::RoTxn; + use super::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; use crate::new::query_term::WordDerivations; use crate::new::QueryNode; use crate::{Index, Result}; -use heed::RoTxn; #[derive(Debug, Clone)] pub enum WordPair { diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs index 95cf4629b..d21ddcd86 100644 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs @@ -68,14 +68,6 @@ impl RankingRuleGraph { } path_bitmaps.push(path_bitmap); } - let docids = MultiOps::union(path_bitmaps); - Ok(docids) - // for each path, translate it to an intersection of cached roaring bitmaps - // then do a union for all paths - - // get the docids of the given paths in the proximity graph - // in the fastest possible way - // 1. roaring MultiOps (before we can do the Frozen+AST thing) - // 2. minimize number of operations + Ok(MultiOps::union(path_bitmaps)) } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index b980c1dc4..4ceaddb8a 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -97,49 +97,10 @@ pub fn get_start_universe<'transaction>( query_graph: &QueryGraph, term_matching_strategy: TermsMatchingStrategy, // filters: Filters, - // mut distinct: Option, ) -> Result { - // NOTE: - // - // There is a performance problem when using `distinct` + exhaustive number of hits, - // especially for search that yield many results (many ~= almost all of the - // dataset). - // - // We'll solve it later. Maybe there are smart ways to go about it. - // - // For example, if there are millions of possible values for the distinct attribute, - // then we could just look at the documents which share any distinct attribute with - // another one, and remove the later docids them from the universe. - // => NO! because we don't know which one to remove, only after the sorting is done can we know it - // => this kind of computation can be done, but only in the evaluation of the number - // of hits for the documents that aren't returned by the search. 
- // - // `Distinct` otherwise should always be computed during - + // TODO: actually compute the universe from the query graph let universe = index.documents_ids(txn).unwrap(); - - // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. - // NOTE: this is wrong - // Instead, we should only compute the documents corresponding to the last remaining - // word, 2-gram, and 3-gran. - // let candidates = resolve_query_graph(index, txn, db_cache, query_graph, &universe)?; - - // Distinct should be lazy if placeholder? - // - // // because the initial_candidates should be an exhaustive count of the matching documents, - // // we precompute the distinct attributes. - // let initial_candidates = match &mut distinct { - // Some(distinct) => { - // let mut initial_candidates = RoaringBitmap::new(); - // for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { - // initial_candidates.insert(c?); - // } - // initial_candidates - // } - // None => candidates.clone(), - // }; - - Ok(/*candidates*/ universe) + Ok(universe) } pub fn execute_search<'transaction>( @@ -306,43 +267,6 @@ mod tests { let primary_key = index.primary_key(&txn).unwrap().unwrap(); let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); - loop { - let start = Instant::now(); - - let mut db_cache = DatabaseCache::default(); - - let query_graph = make_query_graph( - &index, - &txn, - &mut db_cache, - "released from prison by the government", - ) - .unwrap(); - // println!("{}", query_graph.graphviz()); - - // TODO: filters + maybe distinct attributes? - let universe = get_start_universe( - &index, - &txn, - &mut db_cache, - &query_graph, - TermsMatchingStrategy::Last, - ) - .unwrap(); - // println!("universe: {universe:?}"); - - let results = execute_search( - &index, - &txn, - &mut db_cache, - &universe, - &query_graph, /* 0, 20 */ - ) - .unwrap(); - - let elapsed = start.elapsed(); - println!("{}us: {results:?}", elapsed.as_micros()); - } let start = Instant::now(); let mut db_cache = DatabaseCache::default(); @@ -350,7 +274,6 @@ mod tests { let query_graph = make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government") .unwrap(); - // println!("{}", query_graph.graphviz()); // TODO: filters + maybe distinct attributes? 
let universe = get_start_universe( @@ -361,7 +284,6 @@ mod tests { TermsMatchingStrategy::Last, ) .unwrap(); - // println!("universe: {universe:?}"); let results = execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */) @@ -396,7 +318,7 @@ mod tests { let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("released from prison by the government"); + s.query("b b b b b b b b b b"); s.terms_matching_strategy(TermsMatchingStrategy::Last); s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); @@ -414,30 +336,14 @@ mod tests { let index = Index::new(options, "data_movies").unwrap(); let mut wtxn = index.write_txn().unwrap(); - // let primary_key = "id"; - // let searchable_fields = vec!["title", "overview"]; - // let filterable_fields = vec!["release_date", "genres"]; - // let sortable_fields = vec[]; - let config = IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_min_word_len_one_typo(5); builder.set_min_word_len_two_typos(100); - // builder.set_primary_key(primary_key.to_owned()); - - // let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); - // builder.set_searchable_fields(searchable_fields); - - // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); - // builder.set_filterable_fields(filterable_fields); - builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); - // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); - // builder.set_sortable_fields(sortable_fields); - builder.execute(|_| (), || false).unwrap(); } @@ -452,7 +358,6 @@ mod tests { let primary_key = "id"; let searchable_fields = vec!["title", "overview"]; let filterable_fields = vec!["release_date", "genres"]; - // let sortable_fields = vec[]; let config = IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 6110bae49..e752358a7 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -1,7 +1,8 @@ +use std::collections::{HashMap, HashSet, VecDeque}; + use fxhash::FxHashMap; use heed::{BytesDecode, RoTxn}; use roaring::{MultiOps, RoaringBitmap}; -use std::collections::{HashMap, HashSet, VecDeque}; use super::db_cache::DatabaseCache; use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; @@ -9,8 +10,6 @@ use super::QueryGraph; use crate::{Index, Result, RoaringBitmapCodec}; // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. 
- -// TODO: reuse NodeDocidsCache in between calls to resolve_query_graph #[derive(Default)] pub struct NodeDocIdsCache { pub cache: FxHashMap, @@ -55,11 +54,6 @@ impl NodeDocIdsCache { .into_iter() .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()); MultiOps::union(derivations_iter) - // TODO: if `or` is empty, register that somewhere, and immediately return an empty bitmap - // On the other hand, `or` *cannot* be empty, only its intersection with the universe can - // - // TODO: Or we don't do anything and accumulate all these operations in a tree of operations - // between frozen roaring bitmap that is resolved only at the very end } }; let _ = self.cache.insert(node_idx, docids); @@ -79,10 +73,7 @@ pub fn resolve_query_graph<'transaction>( // TODO: there is definitely a faster way to compute this big // roaring bitmap expression - // resolve_query_graph_rec(index, txn, q, q.root_node, &mut docids, &mut cache)?; - let mut nodes_resolved = RoaringBitmap::new(); - // TODO: should be given as an argument and kept between invocations of resolve query graph let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()]; let mut next_nodes_to_visit = VecDeque::new(); @@ -123,100 +114,14 @@ pub fn resolve_query_graph<'transaction>( next_nodes_to_visit.push_back(succ); } } + // This is currently slow but could easily be implemented very efficiently for prec in q.edges[node as usize].predecessors.iter() { if q.edges[prec as usize].successors.is_subset(&nodes_resolved) { path_nodes_docids[prec as usize].clear(); } } - // println!("cached docids: {nodes_docids:?}"); } panic!() } - -#[cfg(test)] -mod tests { - use charabia::Tokenize; - - use super::resolve_query_graph; - use crate::db_snap; - use crate::index::tests::TempIndex; - use crate::new::db_cache::DatabaseCache; - use crate::new::resolve_query_graph::NodeDocIdsCache; - use crate::search::new::query_term::{word_derivations, LocatedQueryTerm}; - use crate::search::new::QueryGraph; - - #[test] - fn test_resolve_query_graph() { - let index = TempIndex::new(); - - index - .update_settings(|s| { - s.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - index - .add_documents(documents!([ - {"id": 0, "text": "0"}, - {"id": 1, "text": "1"}, - {"id": 2, "text": "2"}, - {"id": 3, "text": "3"}, - {"id": 4, "text": "4"}, - {"id": 5, "text": "5"}, - {"id": 6, "text": "6"}, - {"id": 7, "text": "7"}, - {"id": 8, "text": "0 1 2 3 4 5 6 7"}, - {"id": 9, "text": "7 6 5 4 3 2 1 0"}, - {"id": 10, "text": "01 234 56 7"}, - {"id": 11, "text": "7 56 0 1 23 5 4"}, - {"id": 12, "text": "0 1 2 3 4 5 6"}, - {"id": 13, "text": "01 23 4 5 7"}, - ])) - .unwrap(); - db_snap!(index, word_docids, @"7512d0b80659f6bf37d98b374ada8098"); - - let txn = index.read_txn().unwrap(); - let mut db_cache = DatabaseCache::default(); - let fst = index.words_fst(&txn).unwrap(); - let query = LocatedQueryTerm::from_query( - "no 0 1 2 3 no 4 5 6 7".tokenize(), - None, - |word, is_prefix| { - word_derivations( - &index, - &txn, - word, - if word.len() < 3 { - 0 - } else if word.len() < 6 { - 1 - } else { - 2 - }, - is_prefix, - &fst, - ) - }, - ) - .unwrap(); - let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); - println!("{}", graph.graphviz()); - let mut node_docids_cache = NodeDocIdsCache::default(); - let universe = index.documents_ids(&txn).unwrap(); - insta::assert_debug_snapshot!(universe, @"RoaringBitmap<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]>"); - let docids = resolve_query_graph( - &index, - &txn, - &mut 
db_cache, - &mut node_docids_cache, - &graph, - &universe, - ) - .unwrap(); - insta::assert_debug_snapshot!(docids, @"RoaringBitmap<[8, 9, 11]>"); - - // TODO: test with a reduced universe - } -} diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index 9a48a49e7..3e3fe0faf 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -1,9 +1,10 @@ use heed::RoTxn; use roaring::RoaringBitmap; +use super::db_cache::DatabaseCache; use super::{ - db_cache::DatabaseCache, RankingRule, RankingRuleOutput, RankingRuleOutputIter, - RankingRuleOutputIterWrapper, RankingRuleQueryTrait, + RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper, + RankingRuleQueryTrait, }; use crate::{ // facet::FacetType, @@ -33,18 +34,6 @@ impl<'transaction, Query> Sort<'transaction, Query> { let fields_ids_map = index.fields_ids_map(rtxn)?; let field_id = fields_ids_map.id(&field_name); - // TODO: What is this, why? - // let faceted_candidates = match field_id { - // Some(field_id) => { - // let number_faceted = - // index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?; - // let string_faceted = - // index.faceted_documents_ids(rtxn, field_id, FacetType::String)?; - // number_faceted | string_faceted - // } - // None => RoaringBitmap::default(), - // }; - Ok(Self { field_id, is_ascending, iter: None }) } } diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 5852d137a..1c03586fd 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -79,8 +79,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { return Ok(None); } let Some(query_graph) = &mut self.query_graph else { panic!() }; - // let graphviz = query_graph.graphviz(); - // println!("\n===={graphviz}\n===="); + let this_bucket = resolve_query_graph( index, txn, @@ -89,10 +88,8 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { query_graph, universe, )?; - // println!("WORDS: this bucket: {this_bucket:?}"); + let child_query_graph = query_graph.clone(); - // this_bucket is the one that must be returned now - // self.cur_bucket is set to the next bucket // TODO: Check whether a position exists in the graph before removing it and // returning the next bucket. 
// while graph.does_not_contain(positions_to_remove.last()) { positions_to_remove.pop() } @@ -118,41 +115,3 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { self.positions_to_remove = vec![]; } } - -#[cfg(test)] -mod tests { - // use charabia::Tokenize; - // use roaring::RoaringBitmap; - - // use crate::{ - // index::tests::TempIndex, - // search::{criteria::CriteriaBuilder, new::QueryGraphOrPlaceholder}, - // }; - - // use super::Words; - - // fn placeholder() { - // let qt = QueryGraphOrPlaceholder::Placeholder; - // let index = TempIndex::new(); - // let rtxn = index.read_txn().unwrap(); - - // let query = "a beautiful summer house by the beach overlooking what seems"; - // // let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); - // // let (qt, parts, matching_words) = builder.build(query.tokenize()).unwrap().unwrap(); - - // // let cb = CriteriaBuilder::new(&rtxn, &index).unwrap(); - // // let x = cb - // // .build( - // // Some(qt), - // // Some(parts), - // // None, - // // None, - // // false, - // // None, - // // crate::CriterionImplementationStrategy::OnlySetBased, - // // ) - // // .unwrap(); - - // // let rr = Words::new(&index, &RoaringBitmap::from_sorted_iter(0..1000)).unwrap(); - // } -} From a61495d6605e0b86680560d4dd076848100063e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 15:08:45 +0100 Subject: [PATCH 022/234] Update Cargo.toml (commit to be deleted later) --- Cargo.toml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 628c6bcbb..b85ee4e2d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,12 +13,15 @@ members = [ "filter-parser", "flatten-serde-json", "json-depth-checker", - "benchmarks" + "benchmarks", ] [workspace.package] -version = "1.1.0" -authors = ["Quentin de Quelen ", "Clément Renault "] +version = "1.0.0" +authors = [ + "Quentin de Quelen ", + "Clément Renault ", +] description = "Meilisearch HTTP server" homepage = "https://meilisearch.com" readme = "README.md" @@ -27,6 +30,7 @@ license = "MIT" [profile.release] # codegen-units = 1 +debug = true [profile.dev.package.flate2] opt-level = 3 From dd12d441343f7b2627da4cfe7377c428e0e182f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 15:10:22 +0100 Subject: [PATCH 023/234] Support swapped word pairs in new proximity ranking rule impl --- milli/src/search/new/db_cache.rs | 23 +++++++++++++++++++ .../proximity/compute_docids.rs | 5 ++++ .../new/ranking_rule_graph/proximity/mod.rs | 2 ++ 3 files changed, 30 insertions(+) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index ae7cb9b91..4232cadaa 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -11,6 +11,8 @@ pub struct DatabaseCache<'transaction> { pub word_pair_proximity_docids: FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, pub word_prefix_pair_proximity_docids: FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, + pub prefix_word_pair_proximity_docids: + FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, pub word_docids: FxHashMap>, pub exact_word_docids: FxHashMap>, pub word_prefix_docids: FxHashMap>, @@ -115,4 +117,25 @@ impl<'transaction> DatabaseCache<'transaction> { } } } + pub fn get_prefix_word_pair_proximity_docids( + &mut self, + index: &Index, + txn: &'transaction RoTxn, + word1: &str, + prefix2: &str, + proximity: u8, + ) -> Result> { + let key = (proximity, prefix2.to_owned(), 
word1.to_owned()); + match self.prefix_word_pair_proximity_docids.entry(key) { + Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()), + Entry::Vacant(entry) => { + let bitmap_ptr = index + .prefix_word_pair_proximity_docids + .remap_data_type::() + .get(txn, &(proximity, prefix2, word1))?; + entry.insert(bitmap_ptr); + Ok(bitmap_ptr) + } + } + } } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 51c6d6ad5..908f50ef6 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -20,6 +20,11 @@ pub fn compute_docids<'transaction>( } WordPair::WordPrefix { left, right_prefix } => db_cache .get_word_prefix_pair_proximity_docids(index, txn, left, right_prefix, *proximity), + WordPair::WordsSwapped { left, right } => { + db_cache.get_word_pair_proximity_docids(index, txn, left, right, *proximity) + } + WordPair::WordPrefixSwapped { left, right_prefix } => db_cache + .get_prefix_word_pair_proximity_docids(index, txn, left, right_prefix, *proximity), }?; let bitmap = bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 16c2acf1f..3b9470be2 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -13,7 +13,9 @@ use crate::{Index, Result}; pub enum WordPair { // TODO: add WordsSwapped and WordPrefixSwapped case Words { left: String, right: String }, + WordsSwapped { left: String, right: String }, WordPrefix { left: String, right_prefix: String }, + WordPrefixSwapped { left: String, right_prefix: String }, } pub struct ProximityEdge { From 6ba4d5e987e7e4458edf82d93215da3d61b2a8db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 22 Feb 2023 15:34:37 +0100 Subject: [PATCH 024/234] Add a search logger --- .../search/new/graph_based_ranking_rule.rs | 14 +- milli/src/search/new/logger/detailed.rs | 238 ++++++++++++++++++ milli/src/search/new/logger/mod.rs | 72 ++++++ milli/src/search/new/mod.rs | 1 + milli/src/search/new/ranking_rules.rs | 76 +++++- milli/src/search/new/sort.rs | 11 +- milli/src/search/new/words.rs | 10 + 7 files changed, 406 insertions(+), 16 deletions(-) create mode 100644 milli/src/search/new/logger/detailed.rs create mode 100644 milli/src/search/new/logger/mod.rs diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 43de6a531..6c061cd4c 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -2,6 +2,7 @@ use heed::RoTxn; use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; +use super::logger::SearchLogger; use super::ranking_rule_graph::cheapest_paths::KCheapestPathsState; use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache; use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; @@ -12,11 +13,12 @@ use crate::new::ranking_rule_graph::cheapest_paths::{self, Path}; use crate::{Index, Result}; pub struct GraphBasedRankingRule { + id: String, state: Option>, } -impl Default for GraphBasedRankingRule { - fn default() -> Self { - Self { state: None } +impl GraphBasedRankingRule { + pub fn new(id: String) -> Self { + Self { id, state: None } } } @@ -30,11 
+32,15 @@ pub struct GraphBasedRankingRuleState { impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGraph> for GraphBasedRankingRule { + fn id(&self) -> String { + self.id.clone() + } fn start_iteration( &mut self, index: &Index, txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, + logger: &mut dyn SearchLogger, universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { @@ -59,6 +65,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap index: &Index, txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, + logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { assert!(universe.len() > 1); @@ -109,6 +116,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap _index: &Index, _txn: &'transaction RoTxn, _db_cache: &mut DatabaseCache<'transaction>, + logger: &mut dyn SearchLogger, ) { self.state = None; } diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs new file mode 100644 index 000000000..a108a3a7f --- /dev/null +++ b/milli/src/search/new/logger/detailed.rs @@ -0,0 +1,238 @@ +use rand::random; +use roaring::RoaringBitmap; +use std::fs::File; +use std::path::Path; +use std::{io::Write, path::PathBuf}; + +use crate::new::QueryNode; +use crate::new::ranking_rule_graph::{ + paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph, +}; + +use super::{QueryGraph, RankingRule, RankingRuleQueryTrait, SearchLogger}; + +pub enum SearchEvents { + RankingRuleStartIteration { + ranking_rule_idx: usize, + query: QueryGraph, + universe: RoaringBitmap, + }, + RankingRuleNextBucket { + ranking_rule_idx: usize, + universe: RoaringBitmap, + }, + RankingRuleEndIteration { + ranking_rule_idx: usize, + universe: RoaringBitmap, + }, + ExtendResults { + new: RoaringBitmap, + }, + WordsState { + query_graph: QueryGraph, + }, + ProximityState { + graph: RankingRuleGraph, + paths: PathsMap, + }, +} + +pub struct DetailedSearchLogger { + folder_path: PathBuf, + initial_query: Option, + initial_universe: Option, + ranking_rules_ids: Option>, + events: Vec, +} +impl DetailedSearchLogger { + pub fn new(folder_path: &str) -> Self { + Self { + folder_path: PathBuf::new().join(folder_path), + initial_query: <_>::default(), + initial_universe: <_>::default(), + ranking_rules_ids: <_>::default(), + events: <_>::default(), + } + } +} + +impl SearchLogger for DetailedSearchLogger { + fn initial_query(&mut self, query: &QueryGraph) { + self.initial_query = Some(query.clone()); + } + + fn initial_universe(&mut self, universe: &RoaringBitmap) { + self.initial_universe = Some(universe.clone()); + } + fn ranking_rules(&mut self, rr: &[Box>]) { + self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect()); + } + + fn start_iteration_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, + query: &QueryGraph, + universe: &RoaringBitmap, + ) { + self.events.push(SearchEvents::RankingRuleStartIteration { + ranking_rule_idx, + query: query.clone(), + universe: universe.clone(), + }) + } + + fn next_bucket_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, + universe: &RoaringBitmap, + ) { + self.events.push(SearchEvents::RankingRuleNextBucket { + ranking_rule_idx, + universe: universe.clone(), + }) + } + + fn end_iteration_ranking_rule<'transaction>( + &mut self, + 
ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, + universe: &RoaringBitmap, + ) { + self.events.push(SearchEvents::RankingRuleEndIteration { + ranking_rule_idx, + universe: universe.clone(), + }) + } + fn add_to_results(&mut self, docids: &RoaringBitmap) { + self.events.push(SearchEvents::ExtendResults { new: docids.clone() }); + } + + fn log_words_state(&mut self, query_graph: &QueryGraph) { + self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); + } + +} + +impl DetailedSearchLogger { + pub fn write_d2_description(&self) { + let mut timestamp_idx = 0; + let mut timestamp = vec![]; + fn activated_id(timestamp: &[usize]) -> String { + let mut s = String::new(); + s.push('0'); + for t in timestamp.iter() { + s.push_str(&format!("{t}")); + } + s + } + + let index_path = self.folder_path.join("index.d2"); + let mut file = std::fs::File::create(&index_path).unwrap(); + writeln!(&mut file, "Control Flow Between Ranking Rules: {{").unwrap(); + writeln!(&mut file, "shape: sequence_diagram"); + for (idx, rr_id) in self.ranking_rules_ids.as_ref().unwrap().iter().enumerate() { + writeln!(&mut file, "{idx}: {rr_id}").unwrap(); + } + writeln!(&mut file, "results"); + for event in self.events.iter() { + match event { + SearchEvents::RankingRuleStartIteration { query, universe, ranking_rule_idx } => { + + let parent_activated_id = activated_id(×tamp); + timestamp.push(0); + let self_activated_id = activated_id(×tamp); + if *ranking_rule_idx != 0 { + let parent_ranking_rule_idx = ranking_rule_idx - 1; + writeln!( + &mut file, + "{parent_ranking_rule_idx}.{parent_activated_id} -> {ranking_rule_idx}.{self_activated_id} : start iteration", + ) + .unwrap(); + } + writeln!(&mut file, + "{ranking_rule_idx}.{self_activated_id} {{ + style {{ + fill: \"#D8A7B1\" + }} +}}").unwrap(); + } + SearchEvents::RankingRuleNextBucket { universe, ranking_rule_idx } => { + let old_activated_id = activated_id(×tamp); + *timestamp.last_mut().unwrap() += 1; + let next_activated_id = activated_id(×tamp); + writeln!(&mut file, + "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket",) + .unwrap(); + } + SearchEvents::RankingRuleEndIteration { universe, ranking_rule_idx } => { + let cur_activated_id = activated_id(×tamp); + timestamp.pop(); + let parent_activated_id = activated_id(×tamp); + let parent_ranking_rule = if *ranking_rule_idx == 0 { + "start".to_owned() + } else { + format!("{}.{parent_activated_id}", ranking_rule_idx - 1) + }; + writeln!( + &mut file, + "{ranking_rule_idx}.{cur_activated_id} -> {parent_ranking_rule} : end iteration", + ) + .unwrap(); + } + SearchEvents::ExtendResults { new } => { + if new.is_empty() { + continue + } + let cur_ranking_rule = timestamp.len() - 1; + let cur_activated_id = activated_id(×tamp); + let docids = new.iter().collect::>(); + let len = new.len(); + let random = random::(); + + writeln!( + &mut file, + "{cur_ranking_rule}.{cur_activated_id} -> results.{random} : \"add {len}\" +results.{random} {{ + tooltip: \"{docids:?}\" + style {{ + fill: \"#B6E2D3\" + }} +}} +" + ) + .unwrap(); + }, + SearchEvents::WordsState { query_graph } => { + let cur_ranking_rule = timestamp.len() - 1; + let cur_activated_id = activated_id(×tamp); + let id = format!("{cur_ranking_rule}.{cur_activated_id}"); + let mut new_file_path = self.folder_path.join(format!("{id}.d2")); + let mut new_file = std::fs::File::create(new_file_path).unwrap(); + Self::query_graph_d2_description(&query_graph, &mut 
new_file); + writeln!( + &mut file, + "{id} {{ + link: \"{id}.d2.svg\" +}}").unwrap(); + }, + SearchEvents::ProximityState { graph, paths } => todo!(), + } + } + writeln!(&mut file, "}}"); + } + fn query_graph_d2_description(query_graph: &QueryGraph, file: &mut File) { + writeln!(file,"direction: right"); + for node in 0..query_graph.nodes.len() { + if matches!(query_graph.nodes[node], QueryNode::Deleted) { + continue; + } + writeln!(file,"{node}"); + + for edge in query_graph.edges[node].successors.iter() { + writeln!(file, "{node} -> {edge};\n").unwrap(); + } + } + } +} diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs new file mode 100644 index 000000000..94c2de713 --- /dev/null +++ b/milli/src/search/new/logger/mod.rs @@ -0,0 +1,72 @@ +pub mod detailed; + +use roaring::RoaringBitmap; + +use super::{query_graph, QueryGraph, RankingRule, RankingRuleQueryTrait}; + +pub struct DefaultSearchLogger; +impl SearchLogger for DefaultSearchLogger { + fn initial_query(&mut self, query: &Q) {} + + fn initial_universe(&mut self, universe: &RoaringBitmap) {} + + fn ranking_rules(&mut self, rr: &[Box>]) {} + fn start_iteration_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, Q>, + query: &Q, + universe: &RoaringBitmap, + ) { + } + + fn next_bucket_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, Q>, + universe: &RoaringBitmap, + ) { + } + + fn end_iteration_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, Q>, + universe: &RoaringBitmap, + ) { + } + + fn add_to_results(&mut self, docids: &RoaringBitmap) {} + + fn log_words_state(&mut self, query_graph: &Q) {} +} + +pub trait SearchLogger { + fn initial_query(&mut self, query: &Q); + fn initial_universe(&mut self, universe: &RoaringBitmap); + + fn ranking_rules(&mut self, rr: &[Box>]); + + fn start_iteration_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, Q>, + query: &Q, + universe: &RoaringBitmap, + ); + fn next_bucket_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, Q>, + universe: &RoaringBitmap, + ); + fn end_iteration_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, Q>, + universe: &RoaringBitmap, + ); + fn add_to_results(&mut self, docids: &RoaringBitmap); + + fn log_words_state(&mut self, query_graph: &Q); +} diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 01b466d20..7b82fc6e9 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,5 +1,6 @@ pub mod db_cache; pub mod graph_based_ranking_rule; +pub mod logger; pub mod query_graph; pub mod query_term; pub mod ranking_rule_graph; diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 4ceaddb8a..66f5b9d69 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -1,7 +1,10 @@ +use std::fmt::Display; + use heed::RoTxn; use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; +use super::logger::SearchLogger; use super::resolve_query_graph::resolve_query_graph; use super::QueryGraph; use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; @@ -43,6 +46,8 @@ impl RankingRuleQueryTrait for PlaceholderQuery {} impl 
RankingRuleQueryTrait for QueryGraph {} pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { + fn id(&self) -> String; + /// Prepare the ranking rule such that it can start iterating over its /// buckets using [`next_bucket`](RankingRule::next_bucket). /// @@ -52,6 +57,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { index: &Index, txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, + logger: &mut dyn SearchLogger, universe: &RoaringBitmap, query: &Query, ) -> Result<()>; @@ -68,6 +74,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { index: &Index, txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, + logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>>; @@ -78,6 +85,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { index: &Index, txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, + logger: &mut dyn SearchLogger, ); } @@ -110,28 +118,36 @@ pub fn execute_search<'transaction>( db_cache: &mut DatabaseCache<'transaction>, universe: &RoaringBitmap, query_graph: &QueryGraph, + logger: &mut dyn SearchLogger, // _from: usize, // _length: usize, ) -> Result> { let words = Words::new(TermsMatchingStrategy::Last); // let sort = Sort::new(index, txn, "sort1".to_owned(), true)?; - let proximity = GraphBasedRankingRule::::default(); + let proximity = GraphBasedRankingRule::::new("proximity".to_owned()); // TODO: ranking rules given as argument let mut ranking_rules: Vec>> = vec![Box::new(words), Box::new(proximity) /* Box::new(sort) */]; + logger.ranking_rules(&ranking_rules); + let ranking_rules_len = ranking_rules.len(); - ranking_rules[0].start_iteration(index, txn, db_cache, universe, query_graph)?; + logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query_graph, universe); + ranking_rules[0].start_iteration(index, txn, db_cache, logger, universe, query_graph)?; let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len]; candidates[0] = universe.clone(); let mut cur_ranking_rule_index = 0; - macro_rules! back { () => { + logger.end_iteration_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &candidates[cur_ranking_rule_index], + ); candidates[cur_ranking_rule_index].clear(); - ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache); + ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger); if cur_ranking_rule_index == 0 { break; } else { @@ -146,11 +162,19 @@ pub fn execute_search<'transaction>( // The universe for this bucket is zero or one element, so we don't need to sort // anything, just extend the results and go back to the parent ranking rule. if candidates[cur_ranking_rule_index].len() <= 1 { + logger.add_to_results(&candidates[cur_ranking_rule_index]); results.extend(&candidates[cur_ranking_rule_index]); back!(); continue; } - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, &candidates[cur_ranking_rule_index])? else { + + logger.next_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &candidates[cur_ranking_rule_index], + ); + + let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? 
else {
             back!();
             continue;
         };
@@ -159,20 +183,29 @@ pub fn execute_search<'transaction>(
 
         if next_bucket.candidates.len() <= 1 {
             // Only zero or one candidate, no need to sort through the child ranking rule.
+            logger.add_to_results(&next_bucket.candidates);
             results.extend(next_bucket.candidates);
             continue;
         } else {
             // many candidates, give to next ranking rule, if any
             if cur_ranking_rule_index == ranking_rules_len - 1 {
                 // TODO: don't extend too much, up to the limit only
+                logger.add_to_results(&next_bucket.candidates);
                 results.extend(next_bucket.candidates);
             } else {
                 cur_ranking_rule_index += 1;
                 candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();
+                logger.start_iteration_ranking_rule(
+                    cur_ranking_rule_index,
+                    ranking_rules[cur_ranking_rule_index].as_ref(),
+                    &next_bucket.query,
+                    &candidates[cur_ranking_rule_index],
+                );
                 ranking_rules[cur_ranking_rule_index].start_iteration(
                     index,
                     txn,
                     db_cache,
+                    logger,
                     &next_bucket.candidates,
                     &next_bucket.query,
                 )?;
@@ -195,6 +228,8 @@ mod tests {
     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
     use crate::index::tests::TempIndex;
     use crate::new::db_cache::DatabaseCache;
+    use crate::new::logger::detailed::DetailedSearchLogger;
+    use crate::new::logger::{DefaultSearchLogger, SearchLogger};
     use crate::new::make_query_graph;
     use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
     use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};
@@ -231,13 +266,14 @@ mod tests {
         ]))
         .unwrap();
         let txn = index.read_txn().unwrap();
-
+        let mut logger = DefaultSearchLogger;
         let mut db_cache = DatabaseCache::default();
 
         let query_graph =
             make_query_graph(&index, &txn, &mut db_cache, "the quick brown fox jumps over")
                 .unwrap();
 
         println!("{}", query_graph.graphviz());
+        logger.initial_query(&query_graph);
 
         // TODO: filters + maybe distinct attributes?
         let universe = get_start_universe(
@@ -250,9 +286,15 @@ mod tests {
         .unwrap();
         println!("universe: {universe:?}");
 
-        let results =
-            execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */)
-                .unwrap();
+        let results = execute_search(
+            &index,
+            &txn,
+            &mut db_cache,
+            &universe,
+            &query_graph,
+            &mut logger, /* 0, 20 */
+        )
+        .unwrap();
 
         println!("{results:?}")
     }
@@ -285,9 +327,19 @@ mod tests {
         )
         .unwrap();
 
-        let results =
-            execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */)
-                .unwrap();
+        let mut logger = DetailedSearchLogger::new("log");
+
+        let results = execute_search(
+            &index,
+            &txn,
+            &mut db_cache,
+            &universe,
+            &query_graph,
+            &mut logger, /* 0, 20 */
+        )
+        .unwrap();
+
+        logger.write_d2_description();
 
         let elapsed = start.elapsed();
 
diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs
index 3e3fe0faf..29d244383 100644
--- a/milli/src/search/new/sort.rs
+++ b/milli/src/search/new/sort.rs
@@ -2,6 +2,7 @@ use heed::RoTxn;
 use roaring::RoaringBitmap;
 
 use super::db_cache::DatabaseCache;
+use super::logger::SearchLogger;
 use super::{
     RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper,
     RankingRuleQueryTrait,
@@ -20,6 +21,7 @@ use crate::{
 
 // (2) at the end, it should return all the remaining documents (this could be ensured at the trait level?)
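// As an illustration of this contract (hypothetical field and values, not
// part of this patch), iterating an ascending sort on a `price` field over
// some universe could yield:
//
//     next_bucket() -> Some(docids where price == 10)
//     next_bucket() -> Some(docids where price == 25)
//     next_bucket() -> Some(docids with no price at all)  // property (2)
//     next_bucket() -> None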
pub struct Sort<'transaction, Query> {
+    field_name: String,
     field_id: Option<FieldId>,
     is_ascending: bool,
     iter: Option<RankingRuleOutputIterWrapper<'transaction, Query>>,
@@ -34,18 +36,23 @@ impl<'transaction, Query> Sort<'transaction, Query> {
         let fields_ids_map = index.fields_ids_map(rtxn)?;
         let field_id = fields_ids_map.id(&field_name);
 
-        Ok(Self { field_id, is_ascending, iter: None })
+        Ok(Self { field_name, field_id, is_ascending, iter: None })
     }
 }
 
 impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query>
     for Sort<'transaction, Query>
 {
+    fn id(&self) -> String {
+        let Self { field_name, is_ascending, .. } = self;
+        format!("{field_name}:{}", if *is_ascending { "asc" } else { "desc " })
+    }
     fn start_iteration(
         &mut self,
         index: &Index,
         txn: &'transaction RoTxn,
         _db_cache: &mut DatabaseCache<'transaction>,
+        _logger: &mut dyn SearchLogger<Query>,
         parent_candidates: &RoaringBitmap,
         parent_query_graph: &Query,
     ) -> Result<()> {
@@ -89,6 +96,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
         _index: &Index,
         _txn: &'transaction RoTxn,
         _db_cache: &mut DatabaseCache<'transaction>,
+        _logger: &mut dyn SearchLogger<Query>,
         _universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<Query>>> {
         let iter = self.iter.as_mut().unwrap();
@@ -101,6 +109,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
         _index: &Index,
         _txn: &'transaction RoTxn,
         _db_cache: &mut DatabaseCache<'transaction>,
+        _logger: &mut dyn SearchLogger<Query>,
     ) {
         self.iter = None;
     }
diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs
index 1c03586fd..4beb5994a 100644
--- a/milli/src/search/new/words.rs
+++ b/milli/src/search/new/words.rs
@@ -4,6 +4,7 @@ use heed::RoTxn;
 use roaring::RoaringBitmap;
 
 use super::db_cache::DatabaseCache;
+use super::logger::SearchLogger;
 use super::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache};
 use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput};
 use crate::{Index, Result, TermsMatchingStrategy};
@@ -30,11 +31,15 @@ impl Words {
 }
 
 impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
+    fn id(&self) -> String {
+        "words".to_owned()
+    }
     fn start_iteration(
         &mut self,
         _index: &Index,
         _txn: &'transaction RoTxn,
         _db_cache: &mut DatabaseCache<'transaction>,
+        logger: &mut dyn SearchLogger<QueryGraph>,
         parent_candidates: &RoaringBitmap,
         parent_query_graph: &QueryGraph,
     ) -> Result<()> {
@@ -42,6 +47,8 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
         self.exhausted = false;
         self.query_graph = Some(parent_query_graph.clone());
 
+        logger.log_words_state(parent_query_graph);
+
         // TODO: a phrase can contain many positions, but represents a single node.
         // That's a problem.
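// As a rough illustration (hypothetical query, not part of this patch):
// with the query "the quick brown fox" and `TermsMatchingStrategy::Last`,
// each bucket pops one more entry from `positions_to_remove` and strips
// the words at that position, so successive buckets effectively require
// "the quick brown fox", then "the quick brown", then "the quick", ...
// With `TermsMatchingStrategy::All`, no position is ever removed and a
// single bucket requires every word of the query.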
let positions_to_remove = match self.terms_matching_strategy { @@ -70,6 +77,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { index: &Index, txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, + logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { // println!("Words: next bucket"); @@ -99,6 +107,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { let position_to_remove = self.positions_to_remove.pop().unwrap(); query_graph.remove_words_at_position(position_to_remove); } + logger.log_words_state(query_graph); Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket })) } @@ -108,6 +117,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { _index: &Index, _txn: &'transaction RoTxn, _db_cache: &mut DatabaseCache<'transaction>, + _logger: &mut dyn SearchLogger, ) { // println!("Words: end iteration"); self.iterating = false; From 173e37584c3a59fe6238a94dcfb370fa75202348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 23 Feb 2023 13:13:19 +0100 Subject: [PATCH 025/234] Improve the visual/detailed search logger --- .../search/new/graph_based_ranking_rule.rs | 2 + milli/src/search/new/logger/detailed.rs | 131 +++++++++++++++++- milli/src/search/new/logger/mod.rs | 20 ++- milli/src/search/new/query_term.rs | 2 +- .../src/search/new/ranking_rule_graph/mod.rs | 31 ++++- .../new/ranking_rule_graph/paths_map.rs | 6 +- .../new/ranking_rule_graph/proximity/mod.rs | 10 ++ milli/src/search/new/ranking_rules.rs | 8 +- milli/src/search/new/words.rs | 6 +- 9 files changed, 192 insertions(+), 24 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 6c061cd4c..a9bb31682 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -94,6 +94,8 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap return Ok(None); } + G::log_state(&state.graph, &paths, logger); + let bucket = state.graph.resolve_paths( index, txn, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index a108a3a7f..36072af4d 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -5,6 +5,8 @@ use std::path::Path; use std::{io::Write, path::PathBuf}; use crate::new::QueryNode; +use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::ranking_rule_graph::{ paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph, }; @@ -112,6 +114,11 @@ impl SearchLogger for DetailedSearchLogger { fn log_words_state(&mut self, query_graph: &QueryGraph) { self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); } + + fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap,) { + self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone() }) + } + } @@ -129,7 +136,7 @@ impl DetailedSearchLogger { } let index_path = self.folder_path.join("index.d2"); - let mut file = std::fs::File::create(&index_path).unwrap(); + let mut file = std::fs::File::create(index_path).unwrap(); writeln!(&mut file, "Control Flow Between Ranking Rules: {{").unwrap(); writeln!(&mut file, "shape: sequence_diagram"); for (idx, rr_id) in 
self.ranking_rules_ids.as_ref().unwrap().iter().enumerate() { @@ -210,29 +217,143 @@ results.{random} {{ let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let mut new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::query_graph_d2_description(&query_graph, &mut new_file); + Self::query_graph_d2_description(query_graph, &mut new_file); + writeln!( + &mut file, + "{id} {{ + link: \"{id}.d2.svg\" +}}").unwrap(); + }, + SearchEvents::ProximityState { graph, paths } => { + let cur_ranking_rule = timestamp.len() - 1; + let cur_activated_id = activated_id(×tamp); + let id = format!("{cur_ranking_rule}.{cur_activated_id}"); + let mut new_file_path = self.folder_path.join(format!("{id}.d2")); + let mut new_file = std::fs::File::create(new_file_path).unwrap(); + Self::proximity_graph_d2_description(graph, paths, &mut new_file); writeln!( &mut file, "{id} {{ link: \"{id}.d2.svg\" }}").unwrap(); }, - SearchEvents::ProximityState { graph, paths } => todo!(), } } writeln!(&mut file, "}}"); } + + fn query_node_d2_desc(node_idx: usize, node: &QueryNode, file: &mut File) { + match &node { + QueryNode::Term(LocatedQueryTerm { value, positions }) => { + match value { + QueryTerm::Phrase(_) => todo!(), + QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db } } => { + writeln!(file,"{node_idx} : \"{original}\" {{ +shape: class").unwrap(); + for w in zero_typo { + writeln!(file, "\"{w}\" : 0").unwrap(); + } + for w in one_typo { + writeln!(file, "\"{w}\" : 1").unwrap(); + } + for w in two_typos { + writeln!(file, "\"{w}\" : 2").unwrap(); + } + if *use_prefix_db { + writeln!(file, "use prefix DB : true").unwrap(); + } + writeln!(file, "}}").unwrap(); + }, + } + }, + QueryNode::Deleted => panic!(), + QueryNode::Start => { + writeln!(file,"{node_idx} : START").unwrap(); + }, + QueryNode::End => { + writeln!(file,"{node_idx} : END").unwrap(); + }, + } + } fn query_graph_d2_description(query_graph: &QueryGraph, file: &mut File) { writeln!(file,"direction: right"); for node in 0..query_graph.nodes.len() { if matches!(query_graph.nodes[node], QueryNode::Deleted) { continue; } - writeln!(file,"{node}"); - + Self::query_node_d2_desc(node, &query_graph.nodes[node], file); + for edge in query_graph.edges[node].successors.iter() { writeln!(file, "{node} -> {edge};\n").unwrap(); } } } + fn proximity_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, file: &mut File) { + writeln!(file,"direction: right").unwrap(); + + writeln!(file, "Proximity Graph {{").unwrap(); + for (node_idx, node) in graph.query_graph.nodes.iter().enumerate() { + if matches!(node, QueryNode::Deleted) { + continue; + } + Self::query_node_d2_desc(node_idx, node, file); + } + for edge in graph.all_edges.iter().flatten() { + let Edge { from_node, to_node, cost, details } = edge; + + match &details { + EdgeDetails::Unconditional => { + writeln!(file, + "{from_node} -> {to_node} : \"always cost {cost}\"", + cost = edge.cost, + ); + } + EdgeDetails::Data(details) => { + writeln!(file, + "{from_node} -> {to_node} : \"cost {cost} {edge_label}\"", + cost = edge.cost, + edge_label = ProximityGraph::graphviz_edge_details_label(details) + ); + } + } + } + writeln!(file, "}}").unwrap(); + + writeln!(file, "Shortest Paths {{").unwrap(); + Self::paths_d2_description(graph, paths, file); + writeln!(file, "}}").unwrap(); + } + fn paths_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, file: &mut File) { + for 
(edge_idx, rest) in paths.nodes.iter() { + let Edge { from_node, to_node, cost, .. } = graph.all_edges[*edge_idx as usize].as_ref().unwrap(); + let from_node = &graph.query_graph.nodes[*from_node as usize]; + let from_node_desc = match from_node { + QueryNode::Term(term) => match &term.value { + QueryTerm::Phrase(_) => todo!(), + QueryTerm::Word { derivations } => derivations.original.clone(), + }, + QueryNode::Deleted => panic!(), + QueryNode::Start => "START".to_owned(), + QueryNode::End => "END".to_owned(), + }; + let to_node = &graph.query_graph.nodes[*to_node as usize]; + let to_node_desc = match to_node { + QueryNode::Term(term) => match &term.value { + QueryTerm::Phrase(_) => todo!(), + QueryTerm::Word { derivations } => derivations.original.clone(), + }, + QueryNode::Deleted => panic!(), + QueryNode::Start => "START".to_owned(), + QueryNode::End => "END".to_owned(), + }; + writeln!(file, "{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ + shape: class + }}").unwrap(); + + for (dest_edge_idx, _) in rest.nodes.iter() { + writeln!(file, "{edge_idx} -> {dest_edge_idx}").unwrap(); + } + Self::paths_d2_description(graph, rest, file); + } + } } diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 94c2de713..ccafc7f11 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -1,8 +1,13 @@ +#[cfg(test)] pub mod detailed; use roaring::RoaringBitmap; -use super::{query_graph, QueryGraph, RankingRule, RankingRuleQueryTrait}; +use super::{ + query_graph, + ranking_rule_graph::{paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph}, + QueryGraph, RankingRule, RankingRuleQueryTrait, +}; pub struct DefaultSearchLogger; impl SearchLogger for DefaultSearchLogger { @@ -39,6 +44,13 @@ impl SearchLogger for DefaultSearchLogger { fn add_to_results(&mut self, docids: &RoaringBitmap) {} fn log_words_state(&mut self, query_graph: &Q) {} + + fn log_proximity_state( + &mut self, + query_graph: &RankingRuleGraph, + paths_map: &PathsMap, + ) { + } } pub trait SearchLogger { @@ -69,4 +81,10 @@ pub trait SearchLogger { fn add_to_results(&mut self, docids: &RoaringBitmap); fn log_words_state(&mut self, query_graph: &Q); + + fn log_proximity_state( + &mut self, + query_graph: &RankingRuleGraph, + paths: &PathsMap, + ); } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 52943755a..537857bf2 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -51,7 +51,7 @@ pub fn word_derivations( let mut two_typos = vec![]; if max_typo == 0 { - if is_prefix { + if is_prefix && !use_prefix_db { let prefix = Str::new(word).starts_with(); let mut stream = fst.search(prefix).into_stream(); diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 52b685d08..e677be1d9 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -11,7 +11,10 @@ use std::ops::ControlFlow; use heed::RoTxn; use roaring::RoaringBitmap; +use self::paths_map::PathsMap; + use super::db_cache::DatabaseCache; +use super::logger::SearchLogger; use super::{QueryGraph, QueryNode}; use crate::{Index, Result}; @@ -23,10 +26,10 @@ pub enum EdgeDetails { #[derive(Debug, Clone)] pub struct Edge { - from_node: u32, - to_node: u32, - cost: u8, - details: EdgeDetails, + pub from_node: u32, + pub to_node: u32, + pub cost: u8, + pub details: EdgeDetails, } #[derive(Debug, Clone)] @@ -35,11 
+38,11 @@ pub struct EdgePointer<'graph, E> { pub edge: &'graph Edge, } -pub trait RankingRuleGraphTrait { +pub trait RankingRuleGraphTrait: Sized { /// The details of an edge connecting two query nodes. These details /// should be sufficient to compute the edge's cost and associated document ids /// in [`compute_docids`](RankingRuleGraphTrait). - type EdgeDetails: Sized; + type EdgeDetails: Sized + Clone; type BuildVisitedFromNode; @@ -75,6 +78,12 @@ pub trait RankingRuleGraphTrait { to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>>; + + fn log_state( + graph: &RankingRuleGraph, + paths: &PathsMap, + logger: &mut dyn SearchLogger, + ); } pub struct RankingRuleGraph { @@ -90,6 +99,16 @@ pub struct RankingRuleGraph { // 2. get node_incoming_edges[to] // 3. take intersection betweem the two } +impl Clone for RankingRuleGraph { + fn clone(&self) -> Self { + Self { + query_graph: self.query_graph.clone(), + all_edges: self.all_edges.clone(), + node_edges: self.node_edges.clone(), + successors: self.successors.clone(), + } + } +} impl RankingRuleGraph { // Visit all edges between the two given nodes in order of increasing cost. pub fn visit_edges<'graph, O>( diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 6f6512ae4..8360b1975 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -9,10 +9,10 @@ use super::cheapest_paths::Path; use super::{Edge, EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::QueryNode; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct PathsMap { - nodes: Vec<(u32, PathsMap)>, - value: Option, + pub nodes: Vec<(u32, PathsMap)>, + pub value: Option, } impl Default for PathsMap { fn default() -> Self { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 3b9470be2..66e6bad98 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -3,6 +3,7 @@ pub mod compute_docids; use heed::RoTxn; +use super::paths_map::PathsMap; use super::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; use crate::new::query_term::WordDerivations; @@ -18,6 +19,7 @@ pub enum WordPair { WordPrefixSwapped { left: String, right_prefix: String }, } +#[derive(Clone)] pub struct ProximityEdge { pairs: Vec, proximity: u8, @@ -61,4 +63,12 @@ impl RankingRuleGraphTrait for ProximityGraph { ) -> Result)>> { build::visit_to_node(index, txn, db_cache, to_node, from_node_data) } + + fn log_state( + graph: &super::RankingRuleGraph, + paths: &PathsMap, + logger: &mut dyn crate::new::logger::SearchLogger, + ) { + logger.log_proximity_state(graph, paths); + } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 66f5b9d69..6126676e4 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -270,8 +270,7 @@ mod tests { let mut db_cache = DatabaseCache::default(); let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "the quick brown fox jumps over") - .unwrap(); + make_query_graph(&index, &txn, &mut db_cache, "b b b b b b b b b b").unwrap(); println!("{}", query_graph.graphviz()); logger.initial_query(&query_graph); @@ -314,8 +313,7 @@ mod tests { let mut db_cache = DatabaseCache::default(); let query_graph = - 
make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government") - .unwrap(); + make_query_graph(&index, &txn, &mut db_cache, "b b b b b b b b b b").unwrap(); // TODO: filters + maybe distinct attributes? let universe = get_start_universe( @@ -335,7 +333,7 @@ mod tests { &mut db_cache, &universe, &query_graph, - &mut logger, /* 0, 20 */ + &mut logger, //&mut DefaultSearchLogger, /* 0, 20 */ ) .unwrap(); diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 4beb5994a..63df03f93 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -47,8 +47,6 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { self.exhausted = false; self.query_graph = Some(parent_query_graph.clone()); - logger.log_words_state(parent_query_graph); - // TODO: a phrase can contain many positions, but represents a single node. // That's a problem. let positions_to_remove = match self.terms_matching_strategy { @@ -83,11 +81,14 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { // println!("Words: next bucket"); assert!(self.iterating); assert!(universe.len() > 1); + if self.exhausted { return Ok(None); } let Some(query_graph) = &mut self.query_graph else { panic!() }; + logger.log_words_state(query_graph); + let this_bucket = resolve_query_graph( index, txn, @@ -107,7 +108,6 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { let position_to_remove = self.positions_to_remove.pop().unwrap(); query_graph.remove_words_at_position(position_to_remove); } - logger.log_words_state(query_graph); Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket })) } From 6806640ef0f34a1a914484b72590da4c933b8ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 27 Feb 2023 10:17:03 +0100 Subject: [PATCH 026/234] Fix d2 description of paths map --- milli/src/search/new/logger/detailed.rs | 16 +++++++++------- milli/src/search/new/ranking_rules.rs | 3 ++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 36072af4d..86fcacb3e 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -320,12 +320,13 @@ shape: class").unwrap(); writeln!(file, "}}").unwrap(); writeln!(file, "Shortest Paths {{").unwrap(); - Self::paths_d2_description(graph, paths, file); + Self::paths_d2_description(graph, "", paths, file); writeln!(file, "}}").unwrap(); } - fn paths_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, file: &mut File) { + fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) { + for (edge_idx, rest) in paths.nodes.iter() { - let Edge { from_node, to_node, cost, .. } = graph.all_edges[*edge_idx as usize].as_ref().unwrap(); + let Some(Edge { from_node, to_node, cost, .. 
}) = graph.all_edges[*edge_idx as usize].as_ref() else { continue }; let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { QueryNode::Term(term) => match &term.value { @@ -346,14 +347,15 @@ shape: class").unwrap(); QueryNode::Start => "START".to_owned(), QueryNode::End => "END".to_owned(), }; - writeln!(file, "{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ + let edge_id = format!("{paths_idx}{edge_idx}"); + writeln!(file, "{edge_id}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ shape: class }}").unwrap(); - for (dest_edge_idx, _) in rest.nodes.iter() { - writeln!(file, "{edge_idx} -> {dest_edge_idx}").unwrap(); + let dest_edge_id = format!("{paths_idx}{edge_idx}{dest_edge_idx}"); + writeln!(file, "{edge_id} -> {dest_edge_id}").unwrap(); } - Self::paths_d2_description(graph, rest, file); + Self::paths_d2_description(graph, &format!("{paths_idx}{edge_idx}"), rest, file); } } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 6126676e4..d8d754f21 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -313,7 +313,8 @@ mod tests { let mut db_cache = DatabaseCache::default(); let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "b b b b b b b b b b").unwrap(); + make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government") + .unwrap(); // TODO: filters + maybe distinct attributes? let universe = get_start_universe( From 0e1fbbf7c60e9b79a35edb66737bc2f9707d6490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 27 Feb 2023 14:16:39 +0100 Subject: [PATCH 027/234] Fix bugs in query graph's "remove word" and "cheapest paths" algos --- milli/src/search/new/logger/detailed.rs | 2 +- milli/src/search/new/query_graph.rs | 6 +----- .../new/ranking_rule_graph/cheapest_paths.rs | 17 ++++++----------- milli/src/search/new/ranking_rules.rs | 2 +- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 86fcacb3e..b3f5bbcce 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -326,7 +326,7 @@ shape: class").unwrap(); fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) { for (edge_idx, rest) in paths.nodes.iter() { - let Some(Edge { from_node, to_node, cost, .. }) = graph.all_edges[*edge_idx as usize].as_ref() else { continue }; + let Edge { from_node, to_node, cost, .. 
} = graph.all_edges[*edge_idx as usize].as_ref().unwrap() ;
             let from_node = &graph.query_graph.nodes[*from_node as usize];
             let from_node_desc = match from_node {
                 QueryNode::Term(term) => match &term.value {
diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index 5ed746c27..c07343c9b 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -192,18 +192,14 @@ impl QueryGraph {
     }
     pub fn remove_words_at_position(&mut self, position: i8) {
         let mut nodes_to_remove_keeping_edges = vec![];
-        let mut nodes_to_remove = vec![];
         for (node_idx, node) in self.nodes.iter().enumerate() {
             let node_idx = node_idx as u32;
             let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue };
-            if positions.contains(&position) {
+            if positions.start() == &position {
                 nodes_to_remove_keeping_edges.push(node_idx)
-            } else if positions.contains(&position) {
-                nodes_to_remove.push(node_idx)
             }
         }
-        self.remove_nodes(&nodes_to_remove);
         self.remove_nodes_keep_edges(&nodes_to_remove_keeping_edges);
 
         self.simplify();
diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
index fdce85159..759780200 100644
--- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -74,13 +74,17 @@ impl<G: RankingRuleGraphTrait> KCheapestPathsState<G> {
         empty_paths_cache: &EmptyPathsCache,
         into_map: &mut PathsMap<u64>,
     ) -> Option<Self> {
-        into_map.add_path(&self.kth_cheapest_path);
+        if !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges) {
+            into_map.add_path(&self.kth_cheapest_path);
+        }
         let cur_cost = self.kth_cheapest_path.cost;
         while self.kth_cheapest_path.cost <= cur_cost {
             if let Some(next_self) = self.compute_next_cheapest_paths(graph, empty_paths_cache) {
                 self = next_self;
                 if self.kth_cheapest_path.cost == cur_cost {
                     into_map.add_path(&self.kth_cheapest_path);
+                } else {
+                    break;
                 }
             } else {
                 return None;
@@ -89,8 +93,6 @@ impl<G: RankingRuleGraphTrait> KCheapestPathsState<G> {
         Some(self)
     }
 
-    // TODO: use the cache to potentially remove edges that return an empty RoaringBitmap
-    // TODO: return an Option<&'self Path>?
     fn compute_next_cheapest_paths(
         mut self,
         graph: &mut RankingRuleGraph<G>,
@@ -141,19 +143,12 @@ impl<G: RankingRuleGraphTrait> KCheapestPathsState<G> {
         }
 
         while let Some(mut next_cheapest_paths_entry) = self.potential_cheapest_paths.first_entry() {
-            // This could be implemented faster
-            // Here, maybe I should filter the potential cheapest paths so that they
-            // don't contain any removed edge?
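// (Illustration, not part of this patch: the cache-based check introduced
// below rejects a candidate path such as [2, 5, 7] as soon as one of its
// edges is in `empty_edges` or one of its prefixes is recorded in
// `empty_prefixes`, instead of only testing whether an edge was removed
// from the graph.)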
- let cost = *next_cheapest_paths_entry.key(); let next_cheapest_paths = next_cheapest_paths_entry.get_mut(); while let Some((next_cheapest_path, cost2)) = next_cheapest_paths.remove_first() { assert_eq!(cost, cost2); - if next_cheapest_path - .iter() - .any(|edge_index| graph.all_edges[*edge_index as usize].is_none()) - { + if empty_paths_cache.path_is_empty(&next_cheapest_path) { continue; } else { self.cheapest_paths.insert(next_cheapest_path.iter().copied(), cost); diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index d8d754f21..74ded6d97 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -313,7 +313,7 @@ mod tests { let mut db_cache = DatabaseCache::default(); let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government") + make_query_graph(&index, &txn, &mut db_cache, "the sun flower is facing the su") .unwrap(); // TODO: filters + maybe distinct attributes? From 6c85c0d95ea99dc88a571a4290d5493b5afa3ef2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 27 Feb 2023 15:04:40 +0100 Subject: [PATCH 028/234] Fix more bugs + visual empty path cache logging --- .../search/new/graph_based_ranking_rule.rs | 33 ++++---- milli/src/search/new/logger/detailed.rs | 83 +++++++++++-------- milli/src/search/new/logger/mod.rs | 11 ++- .../new/ranking_rule_graph/cheapest_paths.rs | 12 ++- .../ranking_rule_graph/empty_paths_cache.rs | 6 +- .../src/search/new/ranking_rule_graph/mod.rs | 2 + .../new/ranking_rule_graph/proximity/mod.rs | 4 +- .../new/ranking_rule_graph/resolve_paths.rs | 2 +- milli/src/search/new/ranking_rules.rs | 22 +++-- 9 files changed, 107 insertions(+), 68 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index a9bb31682..a466714e3 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -71,30 +71,29 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap assert!(universe.len() > 1); let mut state = self.state.take().unwrap(); - let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else { + let Some(mut cheapest_paths_state) = state.cheapest_paths_state.take() else { return Ok(None); }; let mut paths = PathsMap::default(); - if let Some(next_cheapest_paths_state) = cheapest_paths_state - .compute_paths_of_next_lowest_cost( - &mut state.graph, - &state.empty_paths_cache, - &mut paths, - ) - { - state.cheapest_paths_state = Some(next_cheapest_paths_state); - } else { - state.cheapest_paths_state = None; + while paths.is_empty() { + if let Some(next_cheapest_paths_state) = cheapest_paths_state + .compute_paths_of_next_lowest_cost( + &mut state.graph, + &state.empty_paths_cache, + &mut paths, + ) + { + cheapest_paths_state = next_cheapest_paths_state; + } else { + self.state = None; + return Ok(None); + } } + state.cheapest_paths_state = Some(cheapest_paths_state); - if paths.is_empty() { - self.state = None; - return Ok(None); - } - - G::log_state(&state.graph, &paths, logger); + G::log_state(&state.graph, &paths, &state.empty_paths_cache, logger); let bucket = state.graph.resolve_paths( index, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index b3f5bbcce..b59b30e6e 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -6,6 +6,7 @@ use std::{io::Write, 
path::PathBuf}; use crate::new::QueryNode; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::ranking_rule_graph::{ paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph, @@ -36,6 +37,7 @@ pub enum SearchEvents { ProximityState { graph: RankingRuleGraph, paths: PathsMap, + empty_paths_cache: EmptyPathsCache, }, } @@ -107,16 +109,16 @@ impl SearchLogger for DetailedSearchLogger { universe: universe.clone(), }) } - fn add_to_results(&mut self, docids: &RoaringBitmap) { - self.events.push(SearchEvents::ExtendResults { new: docids.clone() }); + fn add_to_results(&mut self, docids: &mut dyn Iterator) { + self.events.push(SearchEvents::ExtendResults { new: docids.collect() }); } fn log_words_state(&mut self, query_graph: &QueryGraph) { self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); } - fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap,) { - self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone() }) + fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap, empty_paths_cache: &EmptyPathsCache) { + self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() }) } @@ -224,13 +226,13 @@ results.{random} {{ link: \"{id}.d2.svg\" }}").unwrap(); }, - SearchEvents::ProximityState { graph, paths } => { + SearchEvents::ProximityState { graph, paths, empty_paths_cache } => { let cur_ranking_rule = timestamp.len() - 1; let cur_activated_id = activated_id(×tamp); let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let mut new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::proximity_graph_d2_description(graph, paths, &mut new_file); + Self::proximity_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file); writeln!( &mut file, "{id} {{ @@ -288,7 +290,7 @@ shape: class").unwrap(); } } } - fn proximity_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, file: &mut File) { + fn proximity_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, empty_paths_cache: &EmptyPathsCache, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -322,35 +324,48 @@ shape: class").unwrap(); writeln!(file, "Shortest Paths {{").unwrap(); Self::paths_d2_description(graph, "", paths, file); writeln!(file, "}}").unwrap(); - } - fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) { + writeln!(file, "Empty Path Prefixes {{").unwrap(); + Self::paths_d2_description(graph, "", &empty_paths_cache.empty_prefixes, file); + writeln!(file, "}}").unwrap(); + + writeln!(file, "Removed Edges {{").unwrap(); + for edge_idx in empty_paths_cache.empty_edges.iter() { + writeln!(file, "{edge_idx}").unwrap(); + } + writeln!(file, "}}").unwrap(); + } + fn edge_d2_description(graph: &RankingRuleGraph, paths_idx: &str, edge_idx: u32, file: &mut File) -> String { + let Edge { from_node, to_node, cost, .. 
} = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; + let from_node = &graph.query_graph.nodes[*from_node as usize]; + let from_node_desc = match from_node { + QueryNode::Term(term) => match &term.value { + QueryTerm::Phrase(_) => todo!(), + QueryTerm::Word { derivations } => derivations.original.clone(), + }, + QueryNode::Deleted => panic!(), + QueryNode::Start => "START".to_owned(), + QueryNode::End => "END".to_owned(), + }; + let to_node = &graph.query_graph.nodes[*to_node as usize]; + let to_node_desc = match to_node { + QueryNode::Term(term) => match &term.value { + QueryTerm::Phrase(_) => todo!(), + QueryTerm::Word { derivations } => derivations.original.clone(), + }, + QueryNode::Deleted => panic!(), + QueryNode::Start => "START".to_owned(), + QueryNode::End => "END".to_owned(), + }; + let edge_id = format!("{paths_idx}{edge_idx}"); + writeln!(file, "{edge_id}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ + shape: class + }}").unwrap(); + edge_id + } + fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) { for (edge_idx, rest) in paths.nodes.iter() { - let Edge { from_node, to_node, cost, .. } = graph.all_edges[*edge_idx as usize].as_ref().unwrap() ; - let from_node = &graph.query_graph.nodes[*from_node as usize]; - let from_node_desc = match from_node { - QueryNode::Term(term) => match &term.value { - QueryTerm::Phrase(_) => todo!(), - QueryTerm::Word { derivations } => derivations.original.clone(), - }, - QueryNode::Deleted => panic!(), - QueryNode::Start => "START".to_owned(), - QueryNode::End => "END".to_owned(), - }; - let to_node = &graph.query_graph.nodes[*to_node as usize]; - let to_node_desc = match to_node { - QueryNode::Term(term) => match &term.value { - QueryTerm::Phrase(_) => todo!(), - QueryTerm::Word { derivations } => derivations.original.clone(), - }, - QueryNode::Deleted => panic!(), - QueryNode::Start => "START".to_owned(), - QueryNode::End => "END".to_owned(), - }; - let edge_id = format!("{paths_idx}{edge_idx}"); - writeln!(file, "{edge_id}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ - shape: class - }}").unwrap(); + let edge_id = Self::edge_d2_description(graph, paths_idx, *edge_idx, file); for (dest_edge_idx, _) in rest.nodes.iter() { let dest_edge_id = format!("{paths_idx}{edge_idx}{dest_edge_idx}"); writeln!(file, "{edge_id} -> {dest_edge_id}").unwrap(); diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index ccafc7f11..d1a94f7e5 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -5,7 +5,10 @@ use roaring::RoaringBitmap; use super::{ query_graph, - ranking_rule_graph::{paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph}, + ranking_rule_graph::{ + empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph, + RankingRuleGraph, + }, QueryGraph, RankingRule, RankingRuleQueryTrait, }; @@ -41,7 +44,7 @@ impl SearchLogger for DefaultSearchLogger { ) { } - fn add_to_results(&mut self, docids: &RoaringBitmap) {} + fn add_to_results(&mut self, docids: &mut dyn Iterator) {} fn log_words_state(&mut self, query_graph: &Q) {} @@ -49,6 +52,7 @@ impl SearchLogger for DefaultSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap, + empty_paths_cache: &EmptyPathsCache, ) { } } @@ -78,7 +82,7 @@ pub trait SearchLogger { ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, ); - fn add_to_results(&mut self, docids: &RoaringBitmap); + fn add_to_results(&mut 
+    fn add_to_results(&mut self, docids: &mut dyn Iterator);
     fn log_words_state(&mut self, query_graph: &Q);
@@ -86,5 +90,6 @@ pub trait SearchLogger {
         &mut self,
         query_graph: &RankingRuleGraph,
         paths: &PathsMap,
+        empty_paths_cache: &EmptyPathsCache,
     );
 }
diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
index 759780200..e58950c98 100644
--- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -81,7 +81,9 @@ impl KCheapestPathsState {
         while self.kth_cheapest_path.cost <= cur_cost {
             if let Some(next_self) = self.compute_next_cheapest_paths(graph, empty_paths_cache) {
                 self = next_self;
-                if self.kth_cheapest_path.cost == cur_cost {
+                if self.kth_cheapest_path.cost == cur_cost
+                    && !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges)
+                {
                     into_map.add_path(&self.kth_cheapest_path);
                 } else {
                     break;
@@ -148,7 +150,13 @@ impl KCheapestPathsState {
         while let Some((next_cheapest_path, cost2)) = next_cheapest_paths.remove_first() {
             assert_eq!(cost, cost2);
-            if empty_paths_cache.path_is_empty(&next_cheapest_path) {
+            // NOTE: it is important not to discard the paths that are forbidden due to a
+            // forbidden prefix, because the cheapest path algorithm (Dijkstra) cannot take
+            // this property into account.
+            if next_cheapest_path
+                .iter()
+                .any(|edge_index| graph.all_edges[*edge_index as usize].is_none())
+            {
                 continue;
             } else {
                 self.cheapest_paths.insert(next_cheapest_path.iter().copied(), cost);
diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs
index 5748dce3c..d8d645092 100644
--- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs
+++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs
@@ -4,12 +4,16 @@ use roaring::RoaringBitmap;
 use super::paths_map::PathsMap;
-#[derive(Default)]
+#[derive(Default, Clone)]
 pub struct EmptyPathsCache {
     pub empty_edges: RoaringBitmap,
     pub empty_prefixes: PathsMap<()>,
 }
 impl EmptyPathsCache {
+    pub fn forbid_edge(&mut self, edge_idx: u32) {
+        self.empty_edges.insert(edge_idx);
+        self.empty_prefixes.remove_edge(&edge_idx);
+    }
     pub fn path_is_empty(&self, path: &[u32]) -> bool {
         for edge in path {
             if self.empty_edges.contains(*edge) {
diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs
index e677be1d9..d939b6923 100644
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@@ -11,6 +11,7 @@ use std::ops::ControlFlow;
 use heed::RoTxn;
 use roaring::RoaringBitmap;
+use self::empty_paths_cache::EmptyPathsCache;
 use self::paths_map::PathsMap;
 use super::db_cache::DatabaseCache;
@@ -82,6 +83,7 @@ pub trait RankingRuleGraphTrait: Sized {
     fn log_state(
         graph: &RankingRuleGraph,
         paths: &PathsMap,
+        empty_paths_cache: &EmptyPathsCache,
         logger: &mut dyn SearchLogger,
     );
 }
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
index 66e6bad98..c823cbf9c 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@@ -3,6 +3,7 @@ pub mod compute_docids;
 use heed::RoTxn;
+use super::empty_paths_cache::EmptyPathsCache;
 use super::paths_map::PathsMap;
 use super::{Edge, EdgeDetails, RankingRuleGraphTrait};
 use crate::new::db_cache::DatabaseCache;
@@ -67,8 +68,9 @@ impl RankingRuleGraphTrait for ProximityGraph {
     fn log_state(
         graph: &super::RankingRuleGraph,
         paths: &PathsMap,
+        empty_paths_cache: &EmptyPathsCache,
         logger: &mut dyn crate::new::logger::SearchLogger,
     ) {
-        logger.log_proximity_state(graph, paths);
+        logger.log_proximity_state(graph, paths, empty_paths_cache);
     }
 }
diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs
index d21ddcd86..1a97dc485 100644
--- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs
@@ -40,7 +40,7 @@ impl RankingRuleGraph {
             BitmapOrAllRef::Bitmap(edge_docids) => {
                 if edge_docids.is_disjoint(universe) {
                     // 1. Store in the cache that this edge is empty for this universe
-                    empty_paths_cache.empty_edges.insert(edge_index);
+                    empty_paths_cache.forbid_edge(edge_index);
                     // 2. remove all the paths that contain this edge for this universe
                     paths.remove_edge(&edge_index);
                     // 3. remove this edge from the proximity graph
diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
index 74ded6d97..70682a561 100644
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@@ -139,6 +139,7 @@ pub fn execute_search<'transaction>(
     candidates[0] = universe.clone();
     let mut cur_ranking_rule_index = 0;
+
     macro_rules! back {
         () => {
             logger.end_iteration_ranking_rule(
@@ -157,13 +158,20 @@ pub fn execute_search<'transaction>(
     }
     let mut results = vec![];
+
+    macro_rules! add_to_results {
+        ($candidates:expr) => {
+            logger.add_to_results(&mut $candidates.iter().take(20 - results.len()));
+            let iter = $candidates.iter().take(20 - results.len());
+            results.extend(iter);
+        };
+    }
+    // TODO: skip buckets when we want to start from an offset
     while results.len() < 20 {
         // The universe for this bucket is zero or one element, so we don't need to sort
         // anything, just extend the results and go back to the parent ranking rule.
         if candidates[cur_ranking_rule_index].len() <= 1 {
-            logger.add_to_results(&candidates[cur_ranking_rule_index]);
-            results.extend(&candidates[cur_ranking_rule_index]);
+            add_to_results!(candidates[cur_ranking_rule_index]);
             back!();
             continue;
         }
@@ -183,15 +191,12 @@ pub fn execute_search<'transaction>(
         if next_bucket.candidates.len() <= 1 {
             // Only zero or one candidate, no need to sort through the child ranking rule.
-            logger.add_to_results(&next_bucket.candidates);
-            results.extend(next_bucket.candidates);
+            add_to_results!(next_bucket.candidates);
             continue;
         } else {
             // many candidates, give to next ranking rule, if any
             if cur_ranking_rule_index == ranking_rules_len - 1 {
-                // TODO: don't extend too much, up to the limit only
-                logger.add_to_results(&next_bucket.candidates);
-                results.extend(next_bucket.candidates);
+                add_to_results!(next_bucket.candidates);
             } else {
                 cur_ranking_rule_index += 1;
                 candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();
@@ -313,8 +318,7 @@ mod tests {
         let mut db_cache = DatabaseCache::default();
         let query_graph =
-            make_query_graph(&index, &txn, &mut db_cache, "the sun flower is facing the su")
-                .unwrap();
+            make_query_graph(&index, &txn, &mut db_cache, "a a a a a a a a a a").unwrap();
         // TODO: filters + maybe distinct attributes?
         let universe = get_start_universe(

From 998d46ac1033e36496cfe7349a6312bc5ea03531 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Mon, 27 Feb 2023 16:14:53 +0100
Subject: [PATCH 029/234] Add support for search offset and limit

---
 milli/src/search/new/logger/detailed.rs | 27 ++++++++-
 milli/src/search/new/logger/mod.rs      | 17 +++++-
 milli/src/search/new/ranking_rules.rs   | 77 +++++++++++++++++++++----
 3 files changed, 105 insertions(+), 16 deletions(-)

diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs
index b59b30e6e..dc79a8d29 100644
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@@ -29,7 +29,7 @@ pub enum SearchEvents {
         universe: RoaringBitmap,
     },
     ExtendResults {
-        new: RoaringBitmap,
+        new: Vec,
     },
     WordsState {
         query_graph: QueryGraph,
@@ -39,6 +39,7 @@ pub enum SearchEvents {
         paths: PathsMap,
         empty_paths_cache: EmptyPathsCache,
     },
+    RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap },
 }
 pub struct DetailedSearchLogger {
@@ -97,6 +98,17 @@ impl SearchLogger for DetailedSearchLogger {
             universe: universe.clone(),
         })
     }
+    fn skip_bucket_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
+        candidates: &RoaringBitmap,
+    ) {
+        self.events.push(SearchEvents::RankingRuleSkipBucket {
+            ranking_rule_idx,
+            candidates: candidates.clone(),
+        })
+    }
     fn end_iteration_ranking_rule<'transaction>(
         &mut self,
@@ -109,8 +121,8 @@ impl SearchLogger for DetailedSearchLogger {
             universe: universe.clone(),
         })
     }
-    fn add_to_results(&mut self, docids: &mut dyn Iterator) {
-        self.events.push(SearchEvents::ExtendResults { new: docids.collect() });
+    fn add_to_results(&mut self, docids: &[u32]) {
+        self.events.push(SearchEvents::ExtendResults { new: docids.to_vec() });
     }
     fn log_words_state(&mut self, query_graph: &QueryGraph) {
@@ -175,6 +187,15 @@ impl DetailedSearchLogger {
                     "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket",)
                     .unwrap();
                 }
+                SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates } => {
+                    let old_activated_id = activated_id(&timestamp);
+                    *timestamp.last_mut().unwrap() += 1;
+                    let next_activated_id = activated_id(&timestamp);
+                    let len = candidates.len();
+                    writeln!(&mut file,
+                    "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : skip bucket ({len})",)
+                    .unwrap();
+                }
                 SearchEvents::RankingRuleEndIteration { universe, ranking_rule_idx } => {
                     let cur_activated_id = activated_id(&timestamp);
                     timestamp.pop();
diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs
index d1a94f7e5..fd39819ed 100644
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@@ -35,6 +35,13 @@ impl SearchLogger for DefaultSearchLogger {
         universe: &RoaringBitmap,
     ) {
     }
+    fn skip_bucket_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, Q>,
+        candidates: &RoaringBitmap,
+    ) {
+    }
     fn end_iteration_ranking_rule<'transaction>(
         &mut self,
@@ -44,7 +51,7 @@ impl SearchLogger for DefaultSearchLogger {
     ) {
     }
-    fn add_to_results(&mut self, docids: &mut dyn Iterator) {}
+    fn add_to_results(&mut self, docids: &[u32]) {}
     fn log_words_state(&mut self, query_graph: &Q) {}
@@ -76,13 +83,19 @@ pub trait SearchLogger {
         ranking_rule: &dyn RankingRule<'transaction, Q>,
         universe: &RoaringBitmap,
     );
+    fn skip_bucket_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, Q>,
+        candidates: &RoaringBitmap,
+    );
     fn end_iteration_ranking_rule<'transaction>(
         &mut self,
         ranking_rule_idx: usize,
         ranking_rule: &dyn RankingRule<'transaction, Q>,
         universe: &RoaringBitmap,
     );
-    fn add_to_results(&mut self, docids: &mut dyn Iterator);
+    fn add_to_results(&mut self, docids: &[u32]);
     fn log_words_state(&mut self, query_graph: &Q);
diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
index 70682a561..ed51d3345 100644
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@@ -119,8 +119,8 @@ pub fn execute_search<'transaction>(
     universe: &RoaringBitmap,
     query_graph: &QueryGraph,
     logger: &mut dyn SearchLogger,
-    // _from: usize,
-    // _length: usize,
+    from: usize,
+    length: usize,
 ) -> Result> {
     let words = Words::new(TermsMatchingStrategy::Last);
     // let sort = Sort::new(index, txn, "sort1".to_owned(), true)?;
@@ -158,20 +158,58 @@ pub fn execute_search<'transaction>(
     }
     let mut results = vec![];
+    let mut cur_offset = 0usize;
+
     macro_rules! add_to_results {
         ($candidates:expr) => {
-            logger.add_to_results(&mut $candidates.iter().take(20 - results.len()));
-            let iter = $candidates.iter().take(20 - results.len());
-            results.extend(iter);
+            let candidates = $candidates;
+            let len = candidates.len();
+            if !candidates.is_empty() {
+                println!("cur_offset: {}, candidates_len: {}", cur_offset, candidates.len());
+                if cur_offset < from {
+                    println!("  cur_offset < from");
+                    if cur_offset + (candidates.len() as usize) < from {
+                        println!("    cur_offset + candidates_len < from");
+                        logger.skip_bucket_ranking_rule(
+                            cur_ranking_rule_index,
+                            ranking_rules[cur_ranking_rule_index].as_ref(),
+                            &candidates,
+                        );
+                    } else {
+                        println!("    cur_offset + candidates_len >= from");
+                        let all_candidates = candidates.iter().collect::>();
+                        let (skipped_candidates, candidates) =
+                            all_candidates.split_at(from - cur_offset);
+                        logger.skip_bucket_ranking_rule(
+                            cur_ranking_rule_index,
+                            ranking_rules[cur_ranking_rule_index].as_ref(),
+                            &skipped_candidates.into_iter().collect(),
+                        );
+                        let candidates = candidates
+                            .iter()
+                            .take(length - results.len())
+                            .copied()
+                            .collect::>();
+                        logger.add_to_results(&candidates);
+                        results.extend(&candidates);
+                    }
+                } else {
+                    let candidates =
+                        candidates.iter().take(length - results.len()).collect::>();
+                    logger.add_to_results(&candidates);
+                    results.extend(&candidates);
+                }
+            }
+            cur_offset += len as usize;
         };
     }
     // TODO: skip buckets when we want to start from an offset
-    while results.len() < 20 {
+    while results.len() < length {
         // The universe for this bucket is zero or one element, so we don't need to sort
         // anything, just extend the results and go back to the parent ranking rule.
         if candidates[cur_ranking_rule_index].len() <= 1 {
-            add_to_results!(candidates[cur_ranking_rule_index]);
+            add_to_results!(&candidates[cur_ranking_rule_index]);
             back!();
             continue;
         }
@@ -197,6 +235,14 @@ pub fn execute_search<'transaction>(
             // many candidates, give to next ranking rule, if any
             if cur_ranking_rule_index == ranking_rules_len - 1 {
                 add_to_results!(next_bucket.candidates);
+            } else if cur_offset + (next_bucket.candidates.len() as usize) < from {
+                cur_offset += next_bucket.candidates.len() as usize;
+                logger.skip_bucket_ranking_rule(
+                    cur_ranking_rule_index,
+                    ranking_rules[cur_ranking_rule_index].as_ref(),
+                    &next_bucket.candidates,
+                );
+                continue;
             } else {
                 cur_ranking_rule_index += 1;
                 candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();
@@ -296,7 +342,9 @@ mod tests {
             &mut db_cache,
             &universe,
             &query_graph,
-            &mut logger, /* 0, 20 */
+            &mut logger,
+            0,
+            20,
         )
         .unwrap();
         println!("{results:?}")
@@ -317,8 +365,13 @@ mod tests {
         let mut db_cache = DatabaseCache::default();
-        let query_graph =
-            make_query_graph(&index, &txn, &mut db_cache, "a a a a a a a a a a").unwrap();
+        let query_graph = make_query_graph(
+            &index,
+            &txn,
+            &mut db_cache,
+            "and he was released from prison by the government",
+        )
+        .unwrap();
         // TODO: filters + maybe distinct attributes?
         let universe = get_start_universe(
@@ -338,7 +391,9 @@ mod tests {
             &mut db_cache,
             &universe,
             &query_graph,
-            &mut logger, //&mut DefaultSearchLogger, /* 0, 20 */
+            &mut logger, //&mut DefaultSearchLogger,
+            500,
+            100,
         )
         .unwrap();

From 362eb0de86e860612b6776b712ed41057f2df504 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Mon, 27 Feb 2023 16:45:07 +0100
Subject: [PATCH 030/234] Add support for filters

---
 milli/src/search/new/ranking_rules.rs | 117 +++++++++++---------------
 1 file changed, 48 insertions(+), 69 deletions(-)

diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
index ed51d3345..c7c9d5c97 100644
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@@ -11,7 +11,7 @@ use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
 use crate::new::ranking_rule_graph::proximity::ProximityGraph;
 use crate::new::words::Words;
 // use crate::search::new::sort::Sort;
-use crate::{Index, Result, TermsMatchingStrategy};
+use crate::{Filter, Index, Result, TermsMatchingStrategy};
 pub trait RankingRuleOutputIter<'transaction, Query> {
     fn next_bucket(&mut self) -> Result>>;
@@ -111,16 +111,18 @@ pub fn get_start_universe<'transaction>(
     Ok(universe)
 }
+// TODO: can make it generic over the query type (either query graph or placeholder) fairly easily
+#[allow(clippy::too_many_arguments)]
 pub fn execute_search<'transaction>(
     index: &Index,
     txn: &'transaction heed::RoTxn,
     // TODO: ranking rules parameter
     db_cache: &mut DatabaseCache<'transaction>,
-    universe: &RoaringBitmap,
     query_graph: &QueryGraph,
-    logger: &mut dyn SearchLogger,
+    filters: Option,
     from: usize,
     length: usize,
+    logger: &mut dyn SearchLogger,
 ) -> Result> {
     let words = Words::new(TermsMatchingStrategy::Last);
     // let sort = Sort::new(index, txn, "sort1".to_owned(), true)?;
@@ -131,9 +133,19 @@ pub fn execute_search<'transaction>(
     logger.ranking_rules(&ranking_rules);
+    let universe = if let Some(filters) = filters {
+        filters.evaluate(txn, index)?
+    } else {
+        index.documents_ids(txn)?
+    };
+
+    if universe.len() < from as u64 {
+        return Ok(vec![]);
+    }
+
     let ranking_rules_len = ranking_rules.len();
-    logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query_graph, universe);
-    ranking_rules[0].start_iteration(index, txn, db_cache, logger, universe, query_graph)?;
+    logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query_graph, &universe);
+    ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?;
     let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
     candidates[0] = universe.clone();
@@ -160,23 +172,21 @@ pub fn execute_search<'transaction>(
     let mut results = vec![];
     let mut cur_offset = 0usize;
-    macro_rules! add_to_results {
+    // Add the candidates to the results. Take the `from`, `limit`, and `cur_offset` into account.
+    macro_rules! maybe_add_to_results {
         ($candidates:expr) => {
             let candidates = $candidates;
             let len = candidates.len();
+            // if the candidates are empty, there is nothing to do;
             if !candidates.is_empty() {
-                println!("cur_offset: {}, candidates_len: {}", cur_offset, candidates.len());
                 if cur_offset < from {
-                    println!("  cur_offset < from");
                     if cur_offset + (candidates.len() as usize) < from {
-                        println!("    cur_offset + candidates_len < from");
                         logger.skip_bucket_ranking_rule(
                             cur_ranking_rule_index,
                             ranking_rules[cur_ranking_rule_index].as_ref(),
                             &candidates,
                         );
                     } else {
-                        println!("    cur_offset + candidates_len >= from");
                         let all_candidates = candidates.iter().collect::>();
                         let (skipped_candidates, candidates) =
                             all_candidates.split_at(from - cur_offset);
@@ -203,13 +213,12 @@ pub fn execute_search<'transaction>(
             cur_offset += len as usize;
         };
     }
-    // TODO: skip buckets when we want to start from an offset
     while results.len() < length {
         // The universe for this bucket is zero or one element, so we don't need to sort
         // anything, just extend the results and go back to the parent ranking rule.
         if candidates[cur_ranking_rule_index].len() <= 1 {
-            add_to_results!(&candidates[cur_ranking_rule_index]);
+            maybe_add_to_results!(&candidates[cur_ranking_rule_index]);
             back!();
             continue;
         }
@@ -227,41 +236,30 @@ pub fn execute_search<'transaction>(
         candidates[cur_ranking_rule_index] -= &next_bucket.candidates;
-        if next_bucket.candidates.len() <= 1 {
-            // Only zero or one candidate, no need to sort through the child ranking rule.
-            add_to_results!(next_bucket.candidates);
+        if cur_ranking_rule_index == ranking_rules_len - 1
+            || next_bucket.candidates.len() <= 1
+            || cur_offset + (next_bucket.candidates.len() as usize) < from
+        {
+            maybe_add_to_results!(&next_bucket.candidates);
             continue;
-        } else {
-            // many candidates, give to next ranking rule, if any
-            if cur_ranking_rule_index == ranking_rules_len - 1 {
-                add_to_results!(next_bucket.candidates);
-            } else if cur_offset + (next_bucket.candidates.len() as usize) < from {
-                cur_offset += next_bucket.candidates.len() as usize;
-                logger.skip_bucket_ranking_rule(
-                    cur_ranking_rule_index,
-                    ranking_rules[cur_ranking_rule_index].as_ref(),
-                    &next_bucket.candidates,
-                );
-                continue;
-            } else {
-                cur_ranking_rule_index += 1;
-                candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();
-                logger.start_iteration_ranking_rule(
-                    cur_ranking_rule_index,
-                    ranking_rules[cur_ranking_rule_index].as_ref(),
-                    &next_bucket.query,
-                    &candidates[cur_ranking_rule_index],
-                );
-                ranking_rules[cur_ranking_rule_index].start_iteration(
-                    index,
-                    txn,
-                    db_cache,
-                    logger,
-                    &next_bucket.candidates,
-                    &next_bucket.query,
-                )?;
-            }
         }
+
+        cur_ranking_rule_index += 1;
+        candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();
+        logger.start_iteration_ranking_rule(
+            cur_ranking_rule_index,
+            ranking_rules[cur_ranking_rule_index].as_ref(),
+            &next_bucket.query,
+            &candidates[cur_ranking_rule_index],
+        );
+        ranking_rules[cur_ranking_rule_index].start_iteration(
+            index,
+            txn,
+            db_cache,
+            logger,
+            &next_bucket.candidates,
+            &next_bucket.query,
+        )?;
     }
     Ok(results)
@@ -325,28 +323,9 @@ mod tests {
         println!("{}", query_graph.graphviz());
         logger.initial_query(&query_graph);
-        // TODO: filters + maybe distinct attributes?
-        let universe = get_start_universe(
-            &index,
-            &txn,
-            &mut db_cache,
-            &query_graph,
-            TermsMatchingStrategy::Last,
-        )
-        .unwrap();
-        println!("universe: {universe:?}");
-
-        let results = execute_search(
-            &index,
-            &txn,
-            &mut db_cache,
-            &universe,
-            &query_graph,
-            &mut logger,
-            0,
-            20,
-        )
-        .unwrap();
+        let results =
+            execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 20, &mut logger)
+                .unwrap();
         println!("{results:?}")
     }
@@ -389,11 +368,11 @@ mod tests {
             &index,
             &txn,
             &mut db_cache,
-            &universe,
             &query_graph,
-            &mut logger, //&mut DefaultSearchLogger,
+            None,
             500,
             100,
+            &mut logger, //&mut DefaultSearchLogger,
         )
         .unwrap();

From 600e3dd1c56ebbfe06b99c3e9ddc3de35948cbd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 28 Feb 2023 11:49:24 +0100
Subject: [PATCH 031/234] Remove warnings

---
 milli/src/lib.rs                              |  2 +-
 .../search/new/graph_based_ranking_rule.rs    |  8 ++--
 milli/src/search/new/logger/detailed.rs       | 40 ++++++++---------
 milli/src/search/new/logger/mod.rs            | 45 +++++++++----------
 milli/src/search/new/query_graph.rs           |  2 -
 .../search/new/ranking_rule_graph/build.rs    |  2 -
 .../new/ranking_rule_graph/cheapest_paths.rs  |  1 -
 .../ranking_rule_graph/edge_docids_cache.rs   |  6 +--
 .../ranking_rule_graph/empty_paths_cache.rs   |  2 -
 .../src/search/new/ranking_rule_graph/mod.rs  |  6 +--
 .../new/ranking_rule_graph/paths_map.rs       |  5 +--
 .../new/ranking_rule_graph/proximity/build.rs |  2 +-
 .../new/ranking_rule_graph/proximity/mod.rs   |  2 +-
 .../new/ranking_rule_graph/resolve_paths.rs   |  2 +-
 milli/src/search/new/ranking_rules.rs         | 34 ++++----------
 milli/src/search/new/resolve_query_graph.rs   |  4 +-
 milli/src/search/new/words.rs                 |  4 +-
 17 files changed, 68 insertions(+), 99 deletions(-)
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 6de737042..66dd33036 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -1,5 +1,5 @@
 #![cfg_attr(all(test, fuzzing), feature(no_coverage))]
-#![allow(unused, clippy::type_complexity)]
+#![allow(clippy::type_complexity)]
 #[cfg(test)]
 #[global_allocator]
diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs
index a466714e3..2bf7885bd 100644
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@@ -9,7 +9,7 @@ use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
 use super::ranking_rule_graph::paths_map::PathsMap;
 use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait};
 use super::{QueryGraph, RankingRule, RankingRuleOutput};
-use crate::new::ranking_rule_graph::cheapest_paths::{self, Path};
+
 use crate::{Index, Result};
 pub struct GraphBasedRankingRule {
@@ -40,8 +40,8 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
         index: &Index,
         txn: &'transaction RoTxn,
         db_cache: &mut DatabaseCache<'transaction>,
-        logger: &mut dyn SearchLogger,
-        universe: &RoaringBitmap,
+        _logger: &mut dyn SearchLogger,
+        _universe: &RoaringBitmap,
         query_graph: &QueryGraph,
     ) -> Result<()> {
         // TODO: update old state instead of starting from scratch
@@ -117,7 +117,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
         _index: &Index,
         _txn: &'transaction RoTxn,
         _db_cache: &mut DatabaseCache<'transaction>,
-        logger: &mut dyn SearchLogger,
+        _logger: &mut dyn SearchLogger,
     ) {
         self.state = None;
     }
diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs
index dc79a8d29..81571c14a 100644
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@@ -1,10 +1,9 @@
 use rand::random;
 use roaring::RoaringBitmap;
 use std::fs::File;
-use std::path::Path;
 use std::{io::Write, path::PathBuf};
-use crate::new::QueryNode;
+use crate::new::{QueryNode, QueryGraph};
 use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
 use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
 use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait};
 use crate::new::ranking_rule_graph::{
     paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph,
 };
-use super::{QueryGraph, RankingRule, RankingRuleQueryTrait, SearchLogger};
+use super::{RankingRule, SearchLogger};
 pub enum SearchEvents {
@@ -76,7 +75,7 @@ impl SearchLogger for DetailedSearchLogger {
     fn start_iteration_ranking_rule<'transaction>(
         &mut self,
         ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
+        _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
         query: &QueryGraph,
         universe: &RoaringBitmap,
     ) {
@@ -90,7 +89,7 @@ impl SearchLogger for DetailedSearchLogger {
     fn next_bucket_ranking_rule<'transaction>(
         &mut self,
         ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
+        _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
         universe: &RoaringBitmap,
     ) {
         self.events.push(SearchEvents::RankingRuleNextBucket {
@@ -101,7 +100,7 @@ impl SearchLogger for DetailedSearchLogger {
     fn skip_bucket_ranking_rule<'transaction>(
         &mut self,
         ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
+        _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
         candidates: &RoaringBitmap,
     ) {
         self.events.push(SearchEvents::RankingRuleSkipBucket {
@@ -113,7 +112,7 @@ impl SearchLogger for DetailedSearchLogger {
     fn end_iteration_ranking_rule<'transaction>(
         &mut self,
         ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
+        _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
         universe: &RoaringBitmap,
     ) {
         self.events.push(SearchEvents::RankingRuleEndIteration {
@@ -138,7 +137,6 @@ impl SearchLogger for DetailedSearchLogger {
 impl DetailedSearchLogger {
     pub fn write_d2_description(&self) {
-        let mut timestamp_idx = 0;
         let mut timestamp = vec![];
         fn activated_id(timestamp: &[usize]) -> String {
             let mut s = String::new();
@@ -152,14 +150,14 @@ impl DetailedSearchLogger {
         let index_path = self.folder_path.join("index.d2");
         let mut file = std::fs::File::create(index_path).unwrap();
         writeln!(&mut file, "Control Flow Between Ranking Rules: {{").unwrap();
-        writeln!(&mut file, "shape: sequence_diagram");
+        writeln!(&mut file, "shape: sequence_diagram").unwrap();
         for (idx, rr_id) in self.ranking_rules_ids.as_ref().unwrap().iter().enumerate() {
             writeln!(&mut file, "{idx}: {rr_id}").unwrap();
         }
-        writeln!(&mut file, "results");
+        writeln!(&mut file, "results").unwrap();
         for event in self.events.iter() {
             match event {
-                SearchEvents::RankingRuleStartIteration { query, universe, ranking_rule_idx } => {
+                SearchEvents::RankingRuleStartIteration { ranking_rule_idx, .. } => {
                     let parent_activated_id = activated_id(&timestamp);
                     timestamp.push(0);
@@ -179,7 +177,7 @@ impl DetailedSearchLogger {
     }}
 }}").unwrap();
                 }
-                SearchEvents::RankingRuleNextBucket { universe, ranking_rule_idx } => {
+                SearchEvents::RankingRuleNextBucket { ranking_rule_idx, .. } => {
                     let old_activated_id = activated_id(&timestamp);
                     *timestamp.last_mut().unwrap() += 1;
                     let next_activated_id = activated_id(&timestamp);
@@ -196,7 +194,7 @@ impl DetailedSearchLogger {
                     "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : skip bucket ({len})",)
                     .unwrap();
                 }
-                SearchEvents::RankingRuleEndIteration { universe, ranking_rule_idx } => {
+                SearchEvents::RankingRuleEndIteration { ranking_rule_idx, .. } => {
                     let cur_activated_id = activated_id(&timestamp);
                     timestamp.pop();
                     let parent_activated_id = activated_id(&timestamp);
@@ -238,7 +236,7 @@ results.{random} {{
                     let cur_ranking_rule = timestamp.len() - 1;
                     let cur_activated_id = activated_id(&timestamp);
                     let id = format!("{cur_ranking_rule}.{cur_activated_id}");
-                    let mut new_file_path = self.folder_path.join(format!("{id}.d2"));
+                    let new_file_path = self.folder_path.join(format!("{id}.d2"));
                     let mut new_file = std::fs::File::create(new_file_path).unwrap();
                     Self::query_graph_d2_description(query_graph, &mut new_file);
                     writeln!(
@@ -251,7 +249,7 @@ results.{random} {{
                     let cur_ranking_rule = timestamp.len() - 1;
                     let cur_activated_id = activated_id(&timestamp);
                     let id = format!("{cur_ranking_rule}.{cur_activated_id}");
-                    let mut new_file_path = self.folder_path.join(format!("{id}.d2"));
+                    let new_file_path = self.folder_path.join(format!("{id}.d2"));
                     let mut new_file = std::fs::File::create(new_file_path).unwrap();
                     Self::proximity_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file);
                     writeln!(
@@ -262,12 +260,12 @@ results.{random} {{
                 },
             }
         }
-        writeln!(&mut file, "}}");
+        writeln!(&mut file, "}}").unwrap();
     }
     fn query_node_d2_desc(node_idx: usize, node: &QueryNode, file: &mut File) {
         match &node {
-            QueryNode::Term(LocatedQueryTerm { value, positions }) => {
+            QueryNode::Term(LocatedQueryTerm { value, .. }) => {
                 match value {
                     QueryTerm::Phrase(_) => todo!(),
                     QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db } } => {
@@ -299,7 +297,7 @@ shape: class").unwrap();
         }
     }
     fn query_graph_d2_description(query_graph: &QueryGraph, file: &mut File) {
-        writeln!(file,"direction: right");
+        writeln!(file,"direction: right").unwrap();
         for node in 0..query_graph.nodes.len() {
             if matches!(query_graph.nodes[node], QueryNode::Deleted) {
                 continue;
@@ -322,21 +320,21 @@ shape: class").unwrap();
             Self::query_node_d2_desc(node_idx, node, file);
         }
         for edge in graph.all_edges.iter().flatten() {
-            let Edge { from_node, to_node, cost, details } = edge;
+            let Edge { from_node, to_node, details, .. } = edge;
             match &details {
                 EdgeDetails::Unconditional => {
                     writeln!(file,
                         "{from_node} -> {to_node} : \"always cost {cost}\"",
                         cost = edge.cost,
-                    );
+                    ).unwrap();
                 }
                 EdgeDetails::Data(details) => {
                     writeln!(file,
                         "{from_node} -> {to_node} : \"cost {cost} {edge_label}\"",
                         cost = edge.cost,
                         edge_label = ProximityGraph::graphviz_edge_details_label(details)
-                    );
+                    ).unwrap();
                 }
             }
         }
diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs
index fd39819ed..6b1f95152 100644
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@@ -4,62 +4,61 @@ pub mod detailed;
 use roaring::RoaringBitmap;
 use super::{
-    query_graph,
     ranking_rule_graph::{
         empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph,
         RankingRuleGraph,
     },
-    QueryGraph, RankingRule, RankingRuleQueryTrait,
+    RankingRule, RankingRuleQueryTrait,
 };
 pub struct DefaultSearchLogger;
 impl SearchLogger for DefaultSearchLogger {
-    fn initial_query(&mut self, query: &Q) {}
+    fn initial_query(&mut self, _query: &Q) {}
-    fn initial_universe(&mut self, universe: &RoaringBitmap) {}
+    fn initial_universe(&mut self, _universe: &RoaringBitmap) {}
-    fn ranking_rules(&mut self, rr: &[Box>]) {}
+    fn ranking_rules(&mut self, _rr: &[Box>]) {}
     fn start_iteration_ranking_rule<'transaction>(
         &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        query: &Q,
-        universe: &RoaringBitmap,
+        _ranking_rule_idx: usize,
+        _ranking_rule: &dyn RankingRule<'transaction, Q>,
+        _query: &Q,
+        _universe: &RoaringBitmap,
     ) {
     }
     fn next_bucket_ranking_rule<'transaction>(
         &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        universe: &RoaringBitmap,
+        _ranking_rule_idx: usize,
+        _ranking_rule: &dyn RankingRule<'transaction, Q>,
+        _universe: &RoaringBitmap,
     ) {
     }
     fn skip_bucket_ranking_rule<'transaction>(
         &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        candidates: &RoaringBitmap,
+        _ranking_rule_idx: usize,
+        _ranking_rule: &dyn RankingRule<'transaction, Q>,
+        _candidates: &RoaringBitmap,
     ) {
     }
     fn end_iteration_ranking_rule<'transaction>(
         &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        universe: &RoaringBitmap,
+        _ranking_rule_idx: usize,
+        _ranking_rule: &dyn RankingRule<'transaction, Q>,
+        _universe: &RoaringBitmap,
     ) {
     }
-    fn add_to_results(&mut self, docids: &[u32]) {}
+    fn add_to_results(&mut self, _docids: &[u32]) {}
-    fn log_words_state(&mut self, query_graph: &Q) {}
+    fn log_words_state(&mut self, _query_graph: &Q) {}
     fn log_proximity_state(
         &mut self,
-        query_graph: &RankingRuleGraph,
-        paths_map: &PathsMap,
-        empty_paths_cache: &EmptyPathsCache,
+        _query_graph: &RankingRuleGraph,
+        _paths_map: &PathsMap,
+        _empty_paths_cache: &EmptyPathsCache,
     ) {
     }
 }
diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index c07343c9b..422896068 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -1,5 +1,3 @@
-use std::collections::HashSet;
-use std::fmt;
 use std::fmt::Debug;
 use heed::RoTxn;
diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs
index 8e7dd7a04..a0fdd79c6 100644
--- a/milli/src/search/new/ranking_rule_graph/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/build.rs
@@ -1,5 +1,3 @@
-use std::collections::{BTreeSet, HashSet};
-
 use heed::RoTxn;
 use roaring::RoaringBitmap;
diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
index e58950c98..e46f6ce66 100644
--- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -1,6 +1,5 @@
 use std::collections::{BTreeMap, HashSet};
-use itertools::Itertools;
 use roaring::RoaringBitmap;
 use super::empty_paths_cache::EmptyPathsCache;
diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
index dddbda6af..cb3e3da38 100644
--- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
+++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
@@ -1,4 +1,3 @@
-use std::collections::HashMap;
 use std::marker::PhantomData;
 use fxhash::FxHashMap;
@@ -22,10 +21,7 @@ pub struct EdgeDocidsCache {
 }
 impl Default for EdgeDocidsCache {
     fn default() -> Self {
-        Self {
-            cache: Default::default(),
-            _phantom: Default::default(),
-        }
+        Self { cache: Default::default(), _phantom: Default::default() }
     }
 }
 impl EdgeDocidsCache {
diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs
index d8d645092..db68838b5 100644
--- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs
+++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs
@@ -1,5 +1,3 @@
-use std::collections::HashSet;
-
 use roaring::RoaringBitmap;
 use super::paths_map::PathsMap;
diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs
index d939b6923..3a396f3dc 100644
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@@ -5,7 +5,7 @@ pub mod empty_paths_cache;
 pub mod paths_map;
 pub mod proximity;
 pub mod resolve_paths;
-use std::collections::{BTreeSet, HashSet};
+
 use std::ops::ControlFlow;
 use heed::RoTxn;
 use roaring::RoaringBitmap;
@@ -137,7 +137,7 @@ impl RankingRuleGraph {
     fn remove_edge(&mut self, edge_index: u32) {
         let edge_opt = &mut self.all_edges[edge_index as usize];
         let Some(edge) = &edge_opt else { return };
-        let (from_node, to_node) = (edge.from_node, edge.to_node);
+        let (from_node, _to_node) = (edge.from_node, edge.to_node);
         *edge_opt = None;
         let from_node_edges = &mut self.node_edges[from_node as usize];
@@ -168,7 +168,7 @@ impl RankingRuleGraph {
             desc.push_str(";\n");
         }
         for edge in self.all_edges.iter().flatten() {
-            let Edge { from_node, to_node, cost, details } = edge;
+            let Edge { from_node, to_node, details, .. } = edge;
             match &details {
                 EdgeDetails::Unconditional => {
diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs
index 8360b1975..111b55140 100644
--- a/milli/src/search/new/ranking_rule_graph/paths_map.rs
+++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs
@@ -1,5 +1,4 @@
 use std::collections::hash_map::DefaultHasher;
-use std::collections::HashSet;
 use std::fmt::Write;
 use std::hash::{Hash, Hasher};
@@ -206,7 +205,7 @@ impl PathsMap {
             h.finish()
         };
         for (edge_idx, rest) in self.nodes.iter() {
-            let Some(Edge { from_node, to_node, cost, details }) = graph.all_edges[*edge_idx as usize].as_ref() else {
+            let Some(Edge { from_node, to_node, cost, .. }) = graph.all_edges[*edge_idx as usize].as_ref() else {
                 continue;
             };
             let mut path_to = path_from.clone();
@@ -248,7 +247,7 @@ impl RankingRuleGraph {
         for (edge_idx, edge) in self.all_edges.iter().enumerate() {
             let Some(edge) = edge else { continue };
-            let Edge { from_node, to_node, cost, details } = edge;
+            let Edge { from_node, to_node, .. } = edge;
             let color = if path.edges.contains(&(edge_idx as u32)) { "red" } else { "green" };
             match &edge.details {
                 EdgeDetails::Unconditional => {
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
index bfcac57ee..6d2fefa65 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
@@ -7,7 +7,7 @@ use super::ProximityEdge;
 use crate::new::db_cache::DatabaseCache;
 use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
 use crate::new::ranking_rule_graph::proximity::WordPair;
-use crate::new::ranking_rule_graph::{Edge, EdgeDetails};
+use crate::new::ranking_rule_graph::EdgeDetails;
 use crate::new::QueryNode;
 use crate::{Index, Result};
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
index c823cbf9c..fc1a44310 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@@ -5,7 +5,7 @@ use heed::RoTxn;
 use super::empty_paths_cache::EmptyPathsCache;
 use super::paths_map::PathsMap;
-use super::{Edge, EdgeDetails, RankingRuleGraphTrait};
+use super::{EdgeDetails, RankingRuleGraphTrait};
 use crate::new::db_cache::DatabaseCache;
 use crate::new::query_term::WordDerivations;
 use crate::new::QueryNode;
diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs
index 1a97dc485..90650340f 100644
--- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs
@@ -8,7 +8,7 @@ use super::empty_paths_cache::EmptyPathsCache;
 use super::paths_map::PathsMap;
 use super::{RankingRuleGraph, RankingRuleGraphTrait};
 use crate::new::db_cache::DatabaseCache;
-use crate::new::ranking_rule_graph::Edge;
+
 use crate::new::BitmapOrAllRef;
 use crate::{Index, Result};
diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
index c7c9d5c97..e78bdff0c 100644
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@@ -1,11 +1,9 @@
-use std::fmt::Display;
-
 use heed::RoTxn;
 use roaring::RoaringBitmap;
 use super::db_cache::DatabaseCache;
 use super::logger::SearchLogger;
-use super::resolve_query_graph::resolve_query_graph;
+
 use super::QueryGraph;
 use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
 use crate::new::ranking_rule_graph::proximity::ProximityGraph;
@@ -172,7 +170,8 @@ pub fn execute_search<'transaction>(
     let mut results = vec![];
     let mut cur_offset = 0usize;
-    // Add the candidates to the results. Take the `from`, `limit`, and `cur_offset` into account.
+    // Add the candidates to the results. Take the `from`, `limit`, and `cur_offset`
+    // into account and inform the logger.
     macro_rules! maybe_add_to_results {
         ($candidates:expr) => {
             let candidates = $candidates;
@@ -213,7 +212,6 @@ pub fn execute_search<'transaction>(
             cur_offset += len as usize;
         };
     }
-    // TODO: skip buckets when we want to start from an offset
     while results.len() < length {
         // The universe for this bucket is zero or one element, so we don't need to sort
         // anything, just extend the results and go back to the parent ranking rule.
@@ -273,7 +271,7 @@ mod tests {
     use heed::EnvOpenOptions;
-    use super::{execute_search, get_start_universe};
+    use super::execute_search;
     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
     use crate::index::tests::TempIndex;
     use crate::new::db_cache::DatabaseCache;
@@ -344,23 +342,9 @@ mod tests {
         let mut db_cache = DatabaseCache::default();
-        let query_graph = make_query_graph(
-            &index,
-            &txn,
-            &mut db_cache,
-            "and he was released from prison by the government",
-        )
-        .unwrap();
-
-        // TODO: filters + maybe distinct attributes?
-        let universe = get_start_universe(
-            &index,
-            &txn,
-            &mut db_cache,
-            &query_graph,
-            TermsMatchingStrategy::Last,
-        )
-        .unwrap();
+        let query_graph =
+            make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government")
+                .unwrap();
         let mut logger = DetailedSearchLogger::new("log");
@@ -370,8 +354,8 @@ mod tests {
             &mut db_cache,
             &query_graph,
             None,
-            500,
-            100,
+            5,
+            20,
             &mut logger, //&mut DefaultSearchLogger,
         )
         .unwrap();
diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs
index e752358a7..4da853e7c 100644
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -1,11 +1,11 @@
-use std::collections::{HashMap, HashSet, VecDeque};
+use std::collections::VecDeque;
 use fxhash::FxHashMap;
 use heed::{BytesDecode, RoTxn};
 use roaring::{MultiOps, RoaringBitmap};
 use super::db_cache::DatabaseCache;
-use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
+use super::query_term::{QueryTerm, WordDerivations};
 use super::QueryGraph;
 use crate::{Index, Result, RoaringBitmapCodec};
diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs
index 63df03f93..e4513eea0 100644
--- a/milli/src/search/new/words.rs
+++ b/milli/src/search/new/words.rs
@@ -39,8 +39,8 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
         _index: &Index,
         _txn: &'transaction RoTxn,
         _db_cache: &mut DatabaseCache<'transaction>,
-        logger: &mut dyn SearchLogger,
-        parent_candidates: &RoaringBitmap,
+        _logger: &mut dyn SearchLogger,
+        _parent_candidates: &RoaringBitmap,
         parent_query_graph: &QueryGraph,
     ) -> Result<()> {
         // println!("Words: start iteration");

From 71f18e43799697996eed93b7bcbc3af10d3aadcd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 28 Feb 2023 12:42:29 +0100
Subject: [PATCH 032/234] Add sort ranking rule to new search impl

---
 milli/src/search/new/logger/detailed.rs |  2 +-
 milli/src/search/new/logger/mod.rs      |  4 ++--
 milli/src/search/new/ranking_rules.rs   | 27 ++++++++++++-----------
 milli/src/search/new/sort.rs            | 21 +++++++++++--------
 4 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs
index 81571c14a..a85d20ccc 100644
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@@ -68,7 +68,7 @@ impl SearchLogger for DetailedSearchLogger {
     fn initial_universe(&mut self, universe: &RoaringBitmap) {
         self.initial_universe = Some(universe.clone());
     }
-    fn ranking_rules(&mut self, rr: &[Box>]) {
+    fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]) {
         self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect());
     }
diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs
index 6b1f95152..3b828f7cb 100644
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@@ -17,7 +17,7 @@ impl SearchLogger for DefaultSearchLogger {
     fn initial_universe(&mut self, _universe: &RoaringBitmap) {}
-    fn ranking_rules(&mut self, _rr: &[Box>]) {}
+    fn ranking_rules(&mut self, _rr: &[&mut dyn RankingRule]) {}
     fn start_iteration_ranking_rule<'transaction>(
         &mut self,
         _ranking_rule_idx: usize,
@@ -67,7 +67,7 @@ pub trait SearchLogger {
     fn initial_query(&mut self, query: &Q);
     fn initial_universe(&mut self, universe: &RoaringBitmap);
-    fn ranking_rules(&mut self, rr: &[Box>]);
+    fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]);
     fn start_iteration_ranking_rule<'transaction>(
         &mut self,
diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
index e78bdff0c..f3f71ab4b 100644
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@@ -8,7 +8,7 @@ use super::QueryGraph;
 use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
 use crate::new::ranking_rule_graph::proximity::ProximityGraph;
 use crate::new::words::Words;
-// use crate::search::new::sort::Sort;
+use crate::search::new::sort::Sort;
 use crate::{Filter, Index, Result, TermsMatchingStrategy};
 pub trait RankingRuleOutputIter<'transaction, Query> {
@@ -122,12 +122,12 @@ pub fn execute_search<'transaction>(
     length: usize,
     logger: &mut dyn SearchLogger,
 ) -> Result> {
-    let words = Words::new(TermsMatchingStrategy::Last);
-    // let sort = Sort::new(index, txn, "sort1".to_owned(), true)?;
-    let proximity = GraphBasedRankingRule::::new("proximity".to_owned());
+    let words = &mut Words::new(TermsMatchingStrategy::Last);
+    let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
+    let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned());
     // TODO: ranking rules given as argument
-    let mut ranking_rules: Vec>> =
-        vec![Box::new(words), Box::new(proximity) /* Box::new(sort) */];
+    let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> =
+        vec![words, proximity, sort];
     logger.ranking_rules(&ranking_rules);
@@ -142,7 +142,7 @@ pub fn execute_search<'transaction>(
     }
     let ranking_rules_len = ranking_rules.len();
-    logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query_graph, &universe);
+    logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, &universe);
     ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?;
     let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
     candidates[0] = universe.clone();
@@ -152,9 +152,10 @@ pub fn execute_search<'transaction>(
     macro_rules! back {
         () => {
+            assert!(candidates[cur_ranking_rule_index].is_empty());
             logger.end_iteration_ranking_rule(
                 cur_ranking_rule_index,
-                ranking_rules[cur_ranking_rule_index].as_ref(),
+                ranking_rules[cur_ranking_rule_index],
                 &candidates[cur_ranking_rule_index],
             );
             candidates[cur_ranking_rule_index].clear();
@@ -182,7 +183,7 @@ pub fn execute_search<'transaction>(
                     if cur_offset + (candidates.len() as usize) < from {
                         logger.skip_bucket_ranking_rule(
                             cur_ranking_rule_index,
-                            ranking_rules[cur_ranking_rule_index].as_ref(),
+                            ranking_rules[cur_ranking_rule_index],
                             &candidates,
                         );
                     } else {
@@ -191,7 +192,7 @@ pub fn execute_search<'transaction>(
                             all_candidates.split_at(from - cur_offset);
                         logger.skip_bucket_ranking_rule(
                             cur_ranking_rule_index,
-                            ranking_rules[cur_ranking_rule_index].as_ref(),
+                            ranking_rules[cur_ranking_rule_index],
                             &skipped_candidates.into_iter().collect(),
                         );
                         let candidates = candidates
@@ -216,6 +217,7 @@ pub fn execute_search<'transaction>(
         // The universe for this bucket is zero or one element, so we don't need to sort
         // anything, just extend the results and go back to the parent ranking rule.
         if candidates[cur_ranking_rule_index].len() <= 1 {
+            candidates[cur_ranking_rule_index].clear();
             maybe_add_to_results!(&candidates[cur_ranking_rule_index]);
             back!();
             continue;
@@ -223,7 +225,7 @@ pub fn execute_search<'transaction>(
         logger.next_bucket_ranking_rule(
             cur_ranking_rule_index,
-            ranking_rules[cur_ranking_rule_index].as_ref(),
+            ranking_rules[cur_ranking_rule_index],
             &candidates[cur_ranking_rule_index],
         );
@@ -232,6 +234,7 @@ pub fn execute_search<'transaction>(
             continue;
         };
+        assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates));
         candidates[cur_ranking_rule_index] -= &next_bucket.candidates;
         if cur_ranking_rule_index == ranking_rules_len - 1
@@ -246,7 +249,7 @@ pub fn execute_search<'transaction>(
         candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();
         logger.start_iteration_ranking_rule(
             cur_ranking_rule_index,
-            ranking_rules[cur_ranking_rule_index].as_ref(),
+            ranking_rules[cur_ranking_rule_index],
             &next_bucket.query,
             &candidates[cur_ranking_rule_index],
         );
diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs
index 29d244383..9ef01bd95 100644
--- a/milli/src/search/new/sort.rs
+++ b/milli/src/search/new/sort.rs
@@ -16,19 +16,16 @@ use crate::{
     Result,
 };
-// TODO: The implementation of Sort is not correct:
-// (1) it should not return documents it has already returned (does the current implementation have the same bug?)
-// (2) at the end, it should return all the remaining documents (this could be ensured at the trait level?)
-
 pub struct Sort<'transaction, Query> {
     field_name: String,
     field_id: Option,
     is_ascending: bool,
+    original_query: Option,
     iter: Option>,
 }
 impl<'transaction, Query> Sort<'transaction, Query> {
     pub fn new(
-        index: &'transaction Index,
+        index: &Index,
         rtxn: &'transaction heed::RoTxn,
         field_name: String,
         is_ascending: bool,
@@ -36,7 +33,7 @@ impl<'transaction, Query> Sort<'transaction, Query> {
         let fields_ids_map = index.fields_ids_map(rtxn)?;
         let field_id = fields_ids_map.id(&field_name);
-        Ok(Self { field_name, field_id, is_ascending, iter: None })
+        Ok(Self { field_name, field_id, is_ascending, original_query: None, iter: None })
     }
 }
@@ -87,6 +84,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
             }
             None => RankingRuleOutputIterWrapper::new(Box::new(std::iter::empty())),
         };
+        self.original_query = Some(parent_query_graph.clone());
         self.iter = Some(iter);
         Ok(())
     }
@@ -97,11 +95,17 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
         _txn: &'transaction RoTxn,
         _db_cache: &mut DatabaseCache<'transaction>,
         _logger: &mut dyn SearchLogger,
-        _universe: &RoaringBitmap,
+        universe: &RoaringBitmap,
     ) -> Result>> {
         let iter = self.iter.as_mut().unwrap();
         // TODO: we should make use of the universe in the function below
-        iter.next_bucket()
+        if let Some(mut bucket) = iter.next_bucket()? {
+            bucket.candidates &= universe;
+            Ok(Some(bucket))
+        } else {
+            let query = self.original_query.as_ref().unwrap().clone();
+            Ok(Some(RankingRuleOutput { query, candidates: universe.clone() }))
+        }
     }
     fn end_iteration(
@@ -111,6 +115,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
         _db_cache: &mut DatabaseCache<'transaction>,
         _logger: &mut dyn SearchLogger,
     ) {
+        self.original_query = None;
         self.iter = None;
     }
 }

From caa1e1b9234932765f6a752269c12d2f6864e050 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 28 Feb 2023 14:19:57 +0100
Subject: [PATCH 033/234] Add typo ranking rule to new search impl

---
 .../search/new/graph_based_ranking_rule.rs    |  18 ++-
 milli/src/search/new/logger/detailed.rs       |  34 ++++-
 milli/src/search/new/logger/mod.rs            |  17 ++-
 .../src/search/new/ranking_rule_graph/mod.rs  |   1 +
 .../search/new/ranking_rule_graph/typo/mod.rs | 131 ++++++++++++++++++
 milli/src/search/new/ranking_rules.rs         |   9 +-
 6 files changed, 193 insertions(+), 17 deletions(-)
 create mode 100644 milli/src/search/new/ranking_rule_graph/typo/mod.rs

diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs
index 2bf7885bd..264686b0a 100644
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@@ -70,14 +70,16 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
     ) -> Result>> {
         assert!(universe.len() > 1);
         let mut state = self.state.take().unwrap();
-
-        let Some(mut cheapest_paths_state) = state.cheapest_paths_state.take() else {
+        if state.cheapest_paths_state.is_none() {
             return Ok(None);
-        };
+        }
         let mut paths = PathsMap::default();
         while paths.is_empty() {
+            let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else {
+                break;
+            };
             if let Some(next_cheapest_paths_state) = cheapest_paths_state
                 .compute_paths_of_next_lowest_cost(
                     &mut state.graph,
@@ -85,13 +87,15 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
                     &mut paths,
                 )
             {
-                cheapest_paths_state = next_cheapest_paths_state;
+                state.cheapest_paths_state = Some(next_cheapest_paths_state);
             } else {
-                self.state = None;
-                return Ok(None);
+                break;
             }
         }
-        state.cheapest_paths_state = Some(cheapest_paths_state);
+
+        if paths.is_empty() && state.cheapest_paths_state.is_none() {
+            return Ok(None);
+        }
         G::log_state(&state.graph, &paths, &state.empty_paths_cache, logger);
diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs
index a85d20ccc..a7a3f8793 100644
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@@ -3,6 +3,7 @@ use roaring::RoaringBitmap;
 use std::fs::File;
 use std::{io::Write, path::PathBuf};
+use crate::new::ranking_rule_graph::typo::TypoGraph;
 use crate::new::{QueryNode, QueryGraph};
 use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
 use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
 use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait};
 use crate::new::ranking_rule_graph::{
     paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph,
 };
-use super::{RankingRule, SearchLogger};
+use super::{RankingRule, SearchLogger};
@@ -38,6 +39,11 @@ pub enum SearchEvents {
         paths: PathsMap,
         empty_paths_cache: EmptyPathsCache,
     },
+    TypoState {
+        graph: RankingRuleGraph,
+        paths: PathsMap,
+        empty_paths_cache: EmptyPathsCache,
+    },
     RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap },
 }
@@ -132,7 +138,10 @@ impl SearchLogger for DetailedSearchLogger {
         self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() })
     }
-
+    fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap, empty_paths_cache: &EmptyPathsCache) {
+        self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() })
+    }
+
 }
 impl DetailedSearchLogger {
@@ -251,7 +260,20 @@ results.{random} {{
                     let id = format!("{cur_ranking_rule}.{cur_activated_id}");
                     let new_file_path = self.folder_path.join(format!("{id}.d2"));
                     let mut new_file = std::fs::File::create(new_file_path).unwrap();
-                    Self::proximity_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file);
+                    Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file);
                     writeln!(
                         &mut file,
                         "{id} {{
    link: \"{id}.d2.svg\"
}}").unwrap();
                 },
+                SearchEvents::TypoState { graph, paths, empty_paths_cache } => {
+                    let cur_ranking_rule = timestamp.len() - 1;
+                    let cur_activated_id = activated_id(&timestamp);
+                    let id = format!("{cur_ranking_rule}.{cur_activated_id}");
+                    let new_file_path = self.folder_path.join(format!("{id}.d2"));
+                    let mut new_file = std::fs::File::create(new_file_path).unwrap();
+                    Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file);
+                    writeln!(
+                        &mut file,
+                        "{id} {{
    link: \"{id}.d2.svg\"
}}").unwrap();
+                },
             }
         }
@@ -309,7 +331,7 @@ shape: class").unwrap();
             }
         }
     }
-    fn proximity_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, empty_paths_cache: &EmptyPathsCache, file: &mut File) {
+    fn ranking_rule_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, empty_paths_cache: &EmptyPathsCache, file: &mut File) {
         writeln!(file,"direction: right").unwrap();
         writeln!(file, "Proximity Graph {{").unwrap();
@@ -333,7 +355,7 @@ shape: class").unwrap();
                     writeln!(file,
                         "{from_node} -> {to_node} : \"cost {cost} {edge_label}\"",
                         cost = edge.cost,
-                        edge_label = ProximityGraph::graphviz_edge_details_label(details)
+                        edge_label = R::graphviz_edge_details_label(details)
                     ).unwrap();
                 }
             }
@@ -354,7 +376,7 @@ shape: class").unwrap();
         }
         writeln!(file, "}}").unwrap();
     }
-    fn edge_d2_description(graph: &RankingRuleGraph, paths_idx: &str, edge_idx: u32, file: &mut File) -> String {
+    fn edge_d2_description(graph: &RankingRuleGraph, paths_idx: &str, edge_idx: u32, file: &mut File) -> String {
         let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ;
         let from_node = &graph.query_graph.nodes[*from_node as usize];
         let from_node_desc = match from_node {
@@ -382,7 +404,7 @@ shape: class").unwrap();
         }}").unwrap();
         edge_id
     }
-    fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) {
+    fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) {
         for (edge_idx, rest) in paths.nodes.iter() {
             let edge_id = Self::edge_d2_description(graph, paths_idx, *edge_idx, file);
             for (dest_edge_idx, _) in rest.nodes.iter() {
diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs
index 3b828f7cb..4e119ae42 100644
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@@ -6,7 +6,7 @@ use roaring::RoaringBitmap;
 use super::{
     ranking_rule_graph::{
         empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph,
-        RankingRuleGraph,
+        typo::TypoGraph, RankingRuleGraph,
     },
     RankingRule, RankingRuleQueryTrait,
 };
@@ -61,6 +61,14 @@ impl SearchLogger for DefaultSearchLogger {
         _empty_paths_cache: &EmptyPathsCache,
     ) {
     }
+
+    fn log_typo_state(
+        &mut self,
+        query_graph: &RankingRuleGraph,
+        paths: &PathsMap,
+        empty_paths_cache: &EmptyPathsCache,
+    ) {
+    }
 }
@@ -104,4 +112,11 @@ pub trait SearchLogger {
         paths: &PathsMap,
         empty_paths_cache: &EmptyPathsCache,
     );
+
+    fn log_typo_state(
+        &mut self,
+        query_graph: &RankingRuleGraph,
+        paths: &PathsMap,
+        empty_paths_cache: &EmptyPathsCache,
+    );
 }
diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs
index 3a396f3dc..b1adb80fc 100644
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@@ -5,6 +5,7 @@ pub mod empty_paths_cache;
 pub mod paths_map;
 pub mod proximity;
 pub mod resolve_paths;
+pub mod typo;
 use std::ops::ControlFlow;
diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
new file mode 100644
index 000000000..55a45e3c3
--- /dev/null
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -0,0 +1,131 @@
+use heed::{BytesDecode, RoTxn};
+use roaring::RoaringBitmap;
+
+use super::empty_paths_cache::EmptyPathsCache;
+use super::paths_map::PathsMap;
+use super::{EdgeDetails, RankingRuleGraphTrait};
+use crate::new::db_cache::DatabaseCache;
+use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
+use crate::new::QueryNode;
+use crate::{Index, Result, RoaringBitmapCodec};
+
+#[derive(Clone)]
+pub enum TypoEdge {
+    Phrase,
+    Word { derivations: WordDerivations, nbr_typos: u8 },
+}
+
+pub enum TypoGraph {}
+
+impl RankingRuleGraphTrait for TypoGraph {
+    type EdgeDetails = TypoEdge;
+    type BuildVisitedFromNode = ();
+
+    fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String {
+        match edge {
+            TypoEdge::Phrase => format!(", 0 typos"),
+            TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"),
+        }
+    }
+
+    fn compute_docids<'db_cache, 'transaction>(
+        index: &Index,
+        txn: &'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+        edge: &Self::EdgeDetails,
+    ) -> Result {
+        match edge {
+            TypoEdge::Phrase => todo!(),
+            TypoEdge::Word { derivations, nbr_typos } => {
+                let words = match nbr_typos {
+                    0 => &derivations.zero_typo,
+                    1 => &derivations.one_typo,
+                    2 => &derivations.two_typos,
+                    _ => panic!(),
+                };
+                let mut docids = RoaringBitmap::new();
+                for word in words.iter() {
+                    let Some(bytes) = db_cache.get_word_docids(index, txn, word)? else { continue };
+                    let bitmap =
+                        RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
+                    docids |= bitmap;
+                }
+                if *nbr_typos == 0 {
+                    if let Some(bytes) =
+                        db_cache.get_prefix_docids(index, txn, &derivations.original)?
+                    {
+                        let bitmap =
+                            RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
+                        docids |= bitmap;
+                    }
+                }
+                Ok(docids)
+            }
+        }
+    }
+
+    fn build_visit_from_node<'transaction>(
+        _index: &Index,
+        _txn: &'transaction RoTxn,
+        _db_cache: &mut DatabaseCache<'transaction>,
+        from_node: &QueryNode,
+    ) -> Result> {
+        Ok(Some(()))
+    }
+
+    fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
+        index: &Index,
+        txn: &'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+        to_node: &QueryNode,
+        from_node_data: &'from_data Self::BuildVisitedFromNode,
+    ) -> Result)>> {
+        match to_node {
+            QueryNode::Term(LocatedQueryTerm { value, .. }) => match value {
+                QueryTerm::Phrase(_) => Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase))]),
+                QueryTerm::Word { derivations } => {
+                    let mut edges = vec![];
+                    if !derivations.zero_typo.is_empty() || derivations.use_prefix_db {
+                        edges.push((
+                            0,
+                            EdgeDetails::Data(TypoEdge::Word {
+                                derivations: derivations.clone(),
+                                nbr_typos: 0,
+                            }),
+                        ))
+                    }
+                    if !derivations.one_typo.is_empty() {
+                        edges.push((
+                            1,
+                            EdgeDetails::Data(TypoEdge::Word {
+                                derivations: derivations.clone(),
+                                nbr_typos: 1,
+                            }),
+                        ))
+                    }
+                    if !derivations.two_typos.is_empty() {
+                        edges.push((
+                            2,
+                            EdgeDetails::Data(TypoEdge::Word {
+                                derivations: derivations.clone(),
+                                nbr_typos: 2,
+                            }),
+                        ))
+                    }
+                    Ok(edges)
+                }
+            },
+            QueryNode::End => Ok(vec![(0, EdgeDetails::Unconditional)]),
+            QueryNode::Deleted | QueryNode::Start => panic!(),
+        }
+    }
+
+    fn log_state(
+        graph: &super::RankingRuleGraph,
+        paths: &PathsMap,
+        empty_paths_cache: &EmptyPathsCache,
+        logger: &mut dyn crate::new::logger::SearchLogger,
+    ) {
+        logger.log_typo_state(graph, paths, empty_paths_cache);
+    }
+}
diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
index f3f71ab4b..f023f94d1 100644
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@@ -7,6 +7,7 @@ use super::logger::SearchLogger;
 use super::QueryGraph;
 use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
 use crate::new::ranking_rule_graph::proximity::ProximityGraph;
+use crate::new::ranking_rule_graph::typo::TypoGraph;
 use crate::new::words::Words;
 use crate::search::new::sort::Sort;
 use crate::{Filter, Index, Result, TermsMatchingStrategy};
@@ -125,9 +126,10 @@ pub fn execute_search<'transaction>(
     let words = &mut Words::new(TermsMatchingStrategy::Last);
     let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
     let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned());
+    let typo = &mut GraphBasedRankingRule::::new("typo".to_owned());
     // TODO: ranking rules given as argument
ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> = - vec![words, proximity, sort]; + vec![words, typo, proximity, sort]; logger.ranking_rules(&ranking_rules); @@ -152,7 +154,7 @@ pub fn execute_search<'transaction>( macro_rules! back { () => { - assert!(candidates[cur_ranking_rule_index].is_empty()); + // assert!(candidates[cur_ranking_rule_index].is_empty()); logger.end_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], @@ -230,6 +232,7 @@ pub fn execute_search<'transaction>( ); let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else { + // TODO: add remaining candidates automatically here? back!(); continue; }; @@ -346,7 +349,7 @@ mod tests { let mut db_cache = DatabaseCache::default(); let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government") + make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") .unwrap(); let mut logger = DetailedSearchLogger::new("log"); From c27ea2677f60ac50093ba2d1f43aabd31e6da1a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 2 Mar 2023 21:27:42 +0100 Subject: [PATCH 034/234] Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. --- milli/src/lib.rs | 48 ++- .../search/new/graph_based_ranking_rule.rs | 119 +++++-- milli/src/search/new/logger/detailed.rs | 175 +++++++--- milli/src/search/new/logger/mod.rs | 44 ++- .../new/ranking_rule_graph/cheapest_paths.rs | 309 ++++++------------ .../ranking_rule_graph/edge_docids_cache.rs | 11 +- .../ranking_rule_graph/empty_paths_cache.rs | 46 ++- .../src/search/new/ranking_rule_graph/mod.rs | 48 +-- .../new/ranking_rule_graph/paths_map.rs | 111 ++----- .../new/ranking_rule_graph/proximity/build.rs | 14 +- .../new/ranking_rule_graph/proximity/mod.rs | 26 +- .../new/ranking_rule_graph/resolve_paths.rs | 68 ++-- .../search/new/ranking_rule_graph/typo/mod.rs | 43 ++- milli/src/search/new/ranking_rules.rs | 250 ++++++++++++-- 14 files changed, 782 insertions(+), 530 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 66dd33036..594405891 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -3,7 +3,53 @@ #[cfg(test)] #[global_allocator] -static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; +pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +// #[cfg(test)] +// pub mod allocator { +// use std::alloc::{GlobalAlloc, System}; +// use std::sync::atomic::{self, AtomicI64}; + +// #[global_allocator] +// pub static ALLOC: CountingAlloc = CountingAlloc { +// max_resident: AtomicI64::new(0), +// resident: AtomicI64::new(0), +// allocated: AtomicI64::new(0), +// }; + +// pub struct CountingAlloc { +// pub max_resident: AtomicI64, +// pub resident: AtomicI64, +// pub allocated: AtomicI64, +// } +// unsafe impl GlobalAlloc for CountingAlloc { +// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { +// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); +// let old_resident = +// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); + +// let resident = old_resident + layout.size() as i64; +// self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst); + +// // if layout.size() > 1_000_000 { +// // eprintln!( +// // "allocating {} with new resident size: {resident}", +// // layout.size() / 1_000_000 +// // ); +// // // let trace = 
std::backtrace::Backtrace::capture(); +// // // let t = trace.to_string(); +// // // eprintln!("{t}"); +// // } + +// System.alloc(layout) +// } + +// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { +// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); +// System.dealloc(ptr, layout) +// } +// } +// } #[macro_use] pub mod documents; diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 264686b0a..e5a0fbad6 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -3,12 +3,11 @@ use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; use super::logger::SearchLogger; -use super::ranking_rule_graph::cheapest_paths::KCheapestPathsState; use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache; use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; -use super::ranking_rule_graph::paths_map::PathsMap; + use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; -use super::{QueryGraph, RankingRule, RankingRuleOutput}; +use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput}; use crate::{Index, Result}; @@ -24,9 +23,40 @@ impl GraphBasedRankingRule { pub struct GraphBasedRankingRuleState { graph: RankingRuleGraph, - cheapest_paths_state: Option, edge_docids_cache: EdgeDocidsCache, empty_paths_cache: EmptyPathsCache, + all_distances: Vec>, + cur_distance_idx: usize, +} + +fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>( + graph: &mut RankingRuleGraph, + edge_docids_cache: &mut EdgeDocidsCache, + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + universe: &RoaringBitmap, + empty_paths_cache: &mut EmptyPathsCache, +) -> Result<()> { + for edge_index in 0..graph.all_edges.len() as u32 { + if graph.all_edges[edge_index as usize].is_none() { + continue; + } + let docids = edge_docids_cache + .get_edge_docids(index, txn, db_cache, edge_index, &*graph, universe)?; + match docids { + BitmapOrAllRef::Bitmap(bitmap) => { + if bitmap.is_disjoint(universe) { + graph.remove_edge(edge_index); + empty_paths_cache.forbid_edge(edge_index); + edge_docids_cache.cache.remove(&edge_index); + continue; + } + } + BitmapOrAllRef::All => continue, + } + } + Ok(()) } impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGraph> @@ -41,18 +71,31 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, _logger: &mut dyn SearchLogger, - _universe: &RoaringBitmap, + universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { // TODO: update old state instead of starting from scratch - let graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?; + let mut graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?; + let mut edge_docids_cache = EdgeDocidsCache::default(); + let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len()); + + remove_empty_edges( + &mut graph, + &mut edge_docids_cache, + index, + txn, + db_cache, + universe, + &mut empty_paths_cache, + )?; + let all_distances = graph.initialize_distances_cheapest(); - let cheapest_paths_state = KCheapestPathsState::new(&graph); let state = GraphBasedRankingRuleState { graph, - cheapest_paths_state, - edge_docids_cache: <_>::default(), - empty_paths_cache: <_>::default(), + edge_docids_cache, + empty_paths_cache, 
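// `all_distances[root]` ends up holding every achievable total path
// cost in increasing order, and `cur_distance_idx` walks through them:
// each call to `next_bucket` takes the next cheapest cost and resolves
// all paths of exactly that cost. A sketch of the iteration, assuming
// for illustration that the root's distances are [0, 1, 3]:
//
//     // bucket 1: all paths of total cost 0
//     // bucket 2: all paths of total cost 1
//     // bucket 3: all paths of total cost 3 (no path costs exactly 2)
//
// Edges whose docids are disjoint from the universe were already
// removed by `remove_empty_edges` above, so these costs never involve
// dead edges.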
+ all_distances, + cur_distance_idx: 0, }; self.state = Some(state); @@ -70,34 +113,42 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap ) -> Result>> { assert!(universe.len() > 1); let mut state = self.state.take().unwrap(); - if state.cheapest_paths_state.is_none() { + remove_empty_edges( + &mut state.graph, + &mut state.edge_docids_cache, + index, + txn, + db_cache, + universe, + &mut state.empty_paths_cache, + )?; + + if state.cur_distance_idx + >= state.all_distances[state.graph.query_graph.root_node as usize].len() + { + self.state = None; return Ok(None); } + let cost = + state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx]; + state.cur_distance_idx += 1; - let mut paths = PathsMap::default(); + let paths = state.graph.paths_of_cost( + state.graph.query_graph.root_node as usize, + cost, + &state.all_distances, + &state.empty_paths_cache, + ); - while paths.is_empty() { - let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else { - break; - }; - if let Some(next_cheapest_paths_state) = cheapest_paths_state - .compute_paths_of_next_lowest_cost( - &mut state.graph, - &state.empty_paths_cache, - &mut paths, - ) - { - state.cheapest_paths_state = Some(next_cheapest_paths_state); - } else { - break; - } - } - - if paths.is_empty() && state.cheapest_paths_state.is_none() { - return Ok(None); - } - - G::log_state(&state.graph, &paths, &state.empty_paths_cache, logger); + G::log_state( + &state.graph, + &paths, + &state.empty_paths_cache, + universe, + &state.all_distances, + cost, + logger, + ); let bucket = state.graph.resolve_paths( index, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index a7a3f8793..d2ce627dc 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -1,6 +1,8 @@ + use rand::random; use roaring::RoaringBitmap; use std::fs::File; +use std::time::Instant; use std::{io::Write, path::PathBuf}; use crate::new::ranking_rule_graph::typo::TypoGraph; @@ -9,7 +11,7 @@ use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::ranking_rule_graph::{ - paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph, + proximity::ProximityGraph, RankingRuleGraph, }; use super::{RankingRule, SearchLogger}; @@ -19,14 +21,18 @@ pub enum SearchEvents { ranking_rule_idx: usize, query: QueryGraph, universe: RoaringBitmap, + time: Instant, }, RankingRuleNextBucket { ranking_rule_idx: usize, universe: RoaringBitmap, + candidates: RoaringBitmap, + time: Instant, }, RankingRuleEndIteration { ranking_rule_idx: usize, universe: RoaringBitmap, + time: Instant, }, ExtendResults { new: Vec, @@ -36,20 +42,27 @@ pub enum SearchEvents { }, ProximityState { graph: RankingRuleGraph, - paths: PathsMap, + paths: Vec>, empty_paths_cache: EmptyPathsCache, + universe: RoaringBitmap, + distances: Vec>, + cost: u64, }, TypoState { graph: RankingRuleGraph, - paths: PathsMap, + paths: Vec>, empty_paths_cache: EmptyPathsCache, + universe: RoaringBitmap, + distances: Vec>, + cost: u64, }, - RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap }, + RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant, }, } pub struct DetailedSearchLogger { folder_path: PathBuf, initial_query: Option, + 
initial_query_time: Option, initial_universe: Option, ranking_rules_ids: Option>, events: Vec, @@ -58,17 +71,19 @@ impl DetailedSearchLogger { pub fn new(folder_path: &str) -> Self { Self { folder_path: PathBuf::new().join(folder_path), - initial_query: <_>::default(), - initial_universe: <_>::default(), - ranking_rules_ids: <_>::default(), - events: <_>::default(), + initial_query: None, + initial_query_time: None, + initial_universe: None, + ranking_rules_ids: None, + events: vec![], } } } impl SearchLogger for DetailedSearchLogger { - fn initial_query(&mut self, query: &QueryGraph) { + fn initial_query(&mut self, query: &QueryGraph, time: Instant) { self.initial_query = Some(query.clone()); + self.initial_query_time = Some(time); } fn initial_universe(&mut self, universe: &RoaringBitmap) { @@ -84,11 +99,13 @@ impl SearchLogger for DetailedSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, query: &QueryGraph, universe: &RoaringBitmap, + time: Instant, ) { self.events.push(SearchEvents::RankingRuleStartIteration { ranking_rule_idx, query: query.clone(), universe: universe.clone(), + time, }) } @@ -97,10 +114,14 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, + candidates: &RoaringBitmap, + time: Instant, ) { self.events.push(SearchEvents::RankingRuleNextBucket { ranking_rule_idx, universe: universe.clone(), + candidates: candidates.clone(), + time, }) } fn skip_bucket_ranking_rule<'transaction>( @@ -108,10 +129,12 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, candidates: &RoaringBitmap, + time: Instant, ) { self.events.push(SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates: candidates.clone(), + time }) } @@ -120,10 +143,12 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, + time: Instant, ) { self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe: universe.clone(), + time }) } fn add_to_results(&mut self, docids: &[u32]) { @@ -134,18 +159,19 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); } - fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap, empty_paths_cache: &EmptyPathsCache) { - self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() }) + fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u64,) { + self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } - fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap, empty_paths_cache: &EmptyPathsCache) { - self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() }) + fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u64,) { + self.events.push(SearchEvents::TypoState { graph: 
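// Design note: the detailed logger clones full snapshots (graph, paths,
// universe, distances) into `events` rather than rendering on the fly,
// so the whole search can be replayed into d2 files afterwards. The
// cost is memory proportional to the number of logged steps; a rough,
// assumed order of magnitude:
//
//     // ~100 steps * a few KB per snapshot => well under 1 MB of trace
//
// which is acceptable for a debugging-only logger.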
query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } } impl DetailedSearchLogger { pub fn write_d2_description(&self) { + let mut prev_time = self.initial_query_time.unwrap(); let mut timestamp = vec![]; fn activated_id(timestamp: &[usize]) -> String { let mut s = String::new(); @@ -164,13 +190,16 @@ impl DetailedSearchLogger { writeln!(&mut file, "{idx}: {rr_id}").unwrap(); } writeln!(&mut file, "results").unwrap(); + // writeln!(&mut file, "time").unwrap(); for event in self.events.iter() { match event { - SearchEvents::RankingRuleStartIteration { ranking_rule_idx, .. } => { - + SearchEvents::RankingRuleStartIteration { ranking_rule_idx, time, .. } => { + let elapsed = time.duration_since(prev_time); + prev_time = *time; let parent_activated_id = activated_id(×tamp); timestamp.push(0); let self_activated_id = activated_id(×tamp); + // writeln!(&mut file, "time.{self_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); if *ranking_rule_idx != 0 { let parent_ranking_rule_idx = ranking_rule_idx - 1; writeln!( @@ -186,16 +215,22 @@ impl DetailedSearchLogger { }} }}").unwrap(); } - SearchEvents::RankingRuleNextBucket { ranking_rule_idx, .. } => { + SearchEvents::RankingRuleNextBucket { ranking_rule_idx, time, universe, candidates } => { + let elapsed = time.duration_since(prev_time); + prev_time = *time; let old_activated_id = activated_id(×tamp); + // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); *timestamp.last_mut().unwrap() += 1; let next_activated_id = activated_id(×tamp); writeln!(&mut file, - "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket",) + "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket {}/{}", candidates.len(), universe.len()) .unwrap(); } - SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates } => { + SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates, time } => { + let elapsed = time.duration_since(prev_time); + prev_time = *time; let old_activated_id = activated_id(×tamp); + // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); *timestamp.last_mut().unwrap() += 1; let next_activated_id = activated_id(×tamp); let len = candidates.len(); @@ -203,8 +238,12 @@ impl DetailedSearchLogger { "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : skip bucket ({len})",) .unwrap(); } - SearchEvents::RankingRuleEndIteration { ranking_rule_idx, .. } => { + SearchEvents::RankingRuleEndIteration { ranking_rule_idx, time, .. 
} => { + let elapsed = time.duration_since(prev_time); + prev_time = *time; let cur_activated_id = activated_id(×tamp); + // writeln!(&mut file, "time.{cur_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); + timestamp.pop(); let parent_activated_id = activated_id(×tamp); let parent_ranking_rule = if *ranking_rule_idx == 0 { @@ -254,43 +293,48 @@ results.{random} {{ link: \"{id}.d2.svg\" }}").unwrap(); }, - SearchEvents::ProximityState { graph, paths, empty_paths_cache } => { + SearchEvents::ProximityState { graph, paths, empty_paths_cache, universe, distances, cost } => { let cur_ranking_rule = timestamp.len() - 1; let cur_activated_id = activated_id(×tamp); let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file); + Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file); writeln!( &mut file, "{id} {{ link: \"{id}.d2.svg\" -}}").unwrap(); + tooltip: \"cost {cost}, universe len: {}\" +}}", universe.len()).unwrap(); }, - SearchEvents::TypoState { graph, paths, empty_paths_cache } => { + SearchEvents::TypoState { graph, paths, empty_paths_cache, universe, distances, cost } => { let cur_ranking_rule = timestamp.len() - 1; let cur_activated_id = activated_id(×tamp); let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file); + Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file); writeln!( &mut file, "{id} {{ link: \"{id}.d2.svg\" -}}").unwrap(); + tooltip: \"cost {cost}, universe len: {}\" +}}", universe.len()).unwrap(); }, } } writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc(node_idx: usize, node: &QueryNode, file: &mut File) { + fn query_node_d2_desc(node_idx: usize, node: &QueryNode, distances: &[u64], file: &mut File) { match &node { QueryNode::Term(LocatedQueryTerm { value, .. 
}) => { match value { - QueryTerm::Phrase(_) => todo!(), - QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db } } => { + QueryTerm::Phrase { phrase } => { + let phrase_str = phrase.description(); + writeln!(file,"{node_idx} : \"{phrase_str}\"").unwrap(); + }, + QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db, synonyms, split_words } } => { writeln!(file,"{node_idx} : \"{original}\" {{ shape: class").unwrap(); for w in zero_typo { @@ -302,9 +346,19 @@ shape: class").unwrap(); for w in two_typos { writeln!(file, "\"{w}\" : 2").unwrap(); } + if let Some((left, right)) = split_words { + writeln!(file, "\"{left} {right}\" : split_words").unwrap(); + } + for synonym in synonyms { + writeln!(file, "\"{}\" : synonym", synonym.description()).unwrap(); + } if *use_prefix_db { writeln!(file, "use prefix DB : true").unwrap(); } + // for (i, d) in distances.iter().enumerate() { + // writeln!(file, "\"distances\" : {d}").unwrap(); + // } + writeln!(file, "}}").unwrap(); }, } @@ -324,14 +378,14 @@ shape: class").unwrap(); if matches!(query_graph.nodes[node], QueryNode::Deleted) { continue; } - Self::query_node_d2_desc(node, &query_graph.nodes[node], file); + Self::query_node_d2_desc(node, &query_graph.nodes[node], &[], file); for edge in query_graph.edges[node].successors.iter() { writeln!(file, "{node} -> {edge};\n").unwrap(); } } } - fn ranking_rule_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, empty_paths_cache: &EmptyPathsCache, file: &mut File) { + fn ranking_rule_graph_d2_description(graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -339,7 +393,8 @@ shape: class").unwrap(); if matches!(node, QueryNode::Deleted) { continue; } - Self::query_node_d2_desc(node_idx, node, file); + let distances = &distances[node_idx]; + Self::query_node_d2_desc(node_idx, node, distances.as_slice(), file); } for edge in graph.all_edges.iter().flatten() { let Edge { from_node, to_node, details, .. 
} = edge; @@ -362,26 +417,39 @@ shape: class").unwrap(); } writeln!(file, "}}").unwrap(); + // writeln!(file, "Distances {{").unwrap(); + // Self::paths_d2_description(graph, paths, file); + // writeln!(file, "}}").unwrap(); + + writeln!(file, "Shortest Paths {{").unwrap(); - Self::paths_d2_description(graph, "", paths, file); + Self::paths_d2_description(graph, paths, file); writeln!(file, "}}").unwrap(); - writeln!(file, "Empty Path Prefixes {{").unwrap(); - Self::paths_d2_description(graph, "", &empty_paths_cache.empty_prefixes, file); - writeln!(file, "}}").unwrap(); + // writeln!(file, "Empty Edge Couples {{").unwrap(); + // for (i, (e1, e2)) in empty_paths_cache.empty_couple_edges.iter().enumerate() { + // writeln!(file, "{i} : \"\" {{").unwrap(); + // Self::edge_d2_description(graph, *e1, file); + // Self::edge_d2_description(graph, *e2, file); + // writeln!(file, "{e1} -- {e2}").unwrap(); + // writeln!(file, "}}").unwrap(); + // } + // writeln!(file, "}}").unwrap(); - writeln!(file, "Removed Edges {{").unwrap(); - for edge_idx in empty_paths_cache.empty_edges.iter() { - writeln!(file, "{edge_idx}").unwrap(); - } - writeln!(file, "}}").unwrap(); + // writeln!(file, "Removed Edges {{").unwrap(); + // for edge_idx in empty_paths_cache.empty_edges.iter() { + // writeln!(file, "{edge_idx}").unwrap(); + // } + // writeln!(file, "}}").unwrap(); } - fn edge_d2_description(graph: &RankingRuleGraph, paths_idx: &str, edge_idx: u32, file: &mut File) -> String { + fn edge_d2_description(graph: &RankingRuleGraph, edge_idx: u32, file: &mut File) { let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { QueryNode::Term(term) => match &term.value { - QueryTerm::Phrase(_) => todo!(), + QueryTerm::Phrase { phrase } => { + phrase.description() + }, QueryTerm::Word { derivations } => derivations.original.clone(), }, QueryNode::Deleted => panic!(), @@ -391,27 +459,28 @@ shape: class").unwrap(); let to_node = &graph.query_graph.nodes[*to_node as usize]; let to_node_desc = match to_node { QueryNode::Term(term) => match &term.value { - QueryTerm::Phrase(_) => todo!(), + QueryTerm::Phrase { phrase } => phrase.description(), QueryTerm::Word { derivations } => derivations.original.clone(), }, QueryNode::Deleted => panic!(), QueryNode::Start => "START".to_owned(), QueryNode::End => "END".to_owned(), }; - let edge_id = format!("{paths_idx}{edge_idx}"); - writeln!(file, "{edge_id}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ + writeln!(file, "{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ shape: class }}").unwrap(); - edge_id } - fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) { - for (edge_idx, rest) in paths.nodes.iter() { - let edge_id = Self::edge_d2_description(graph, paths_idx, *edge_idx, file); - for (dest_edge_idx, _) in rest.nodes.iter() { - let dest_edge_id = format!("{paths_idx}{edge_idx}{dest_edge_idx}"); - writeln!(file, "{edge_id} -> {dest_edge_id}").unwrap(); + fn paths_d2_description(graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { + for (path_idx, edge_indexes) in paths.iter().enumerate() { + writeln!(file, "{path_idx} {{").unwrap(); + for edge_idx in edge_indexes.iter() { + Self::edge_d2_description(graph, *edge_idx, file); } - Self::paths_d2_description(graph, &format!("{paths_idx}{edge_idx}"), rest, file); + for couple_edges in edge_indexes.windows(2) { + let 
[src_edge_idx, dest_edge_idx] = couple_edges else { panic!() }; + writeln!(file, "{src_edge_idx} -> {dest_edge_idx}").unwrap(); + } + writeln!(file, "}}").unwrap(); } } } diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 4e119ae42..079bb892c 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -2,28 +2,31 @@ pub mod detailed; use roaring::RoaringBitmap; +use std::time::Instant; use super::{ ranking_rule_graph::{ - empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph, - typo::TypoGraph, RankingRuleGraph, + empty_paths_cache::EmptyPathsCache, proximity::ProximityGraph, typo::TypoGraph, + RankingRuleGraph, }, RankingRule, RankingRuleQueryTrait, }; pub struct DefaultSearchLogger; impl SearchLogger for DefaultSearchLogger { - fn initial_query(&mut self, _query: &Q) {} + fn initial_query(&mut self, _query: &Q, _time: Instant) {} fn initial_universe(&mut self, _universe: &RoaringBitmap) {} fn ranking_rules(&mut self, _rr: &[&mut dyn RankingRule]) {} + fn start_iteration_ranking_rule<'transaction>( &mut self, _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _query: &Q, _universe: &RoaringBitmap, + _time: Instant, ) { } @@ -32,6 +35,8 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _universe: &RoaringBitmap, + _candidates: &RoaringBitmap, + _time: Instant, ) { } fn skip_bucket_ranking_rule<'transaction>( @@ -39,6 +44,7 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _candidates: &RoaringBitmap, + _time: Instant, ) { } @@ -47,6 +53,7 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _universe: &RoaringBitmap, + _time: Instant, ) { } @@ -57,22 +64,28 @@ impl SearchLogger for DefaultSearchLogger { fn log_proximity_state( &mut self, _query_graph: &RankingRuleGraph, - _paths_map: &PathsMap, + _paths_map: &[Vec], _empty_paths_cache: &EmptyPathsCache, + _universe: &RoaringBitmap, + _distances: Vec>, + _cost: u64, ) { } fn log_typo_state( &mut self, - query_graph: &RankingRuleGraph, - paths: &PathsMap, - empty_paths_cache: &EmptyPathsCache, + _query_graph: &RankingRuleGraph, + _paths: &[Vec], + _empty_paths_cache: &EmptyPathsCache, + _universe: &RoaringBitmap, + _distances: Vec>, + _cost: u64, ) { } } pub trait SearchLogger { - fn initial_query(&mut self, query: &Q); + fn initial_query(&mut self, query: &Q, time: Instant); fn initial_universe(&mut self, universe: &RoaringBitmap); fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]); @@ -83,24 +96,29 @@ pub trait SearchLogger { ranking_rule: &dyn RankingRule<'transaction, Q>, query: &Q, universe: &RoaringBitmap, + time: Instant, ); fn next_bucket_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, + candidates: &RoaringBitmap, + time: Instant, ); fn skip_bucket_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, candidates: &RoaringBitmap, + time: Instant, ); fn end_iteration_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, + time: Instant, ); fn add_to_results(&mut self, docids: &[u32]); @@ -109,14 +127,20 @@ pub trait SearchLogger { fn log_proximity_state( &mut 
self, query_graph: &RankingRuleGraph, - paths: &PathsMap, + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, + universe: &RoaringBitmap, + _distances: Vec>, + cost: u64, ); fn log_typo_state( &mut self, query_graph: &RankingRuleGraph, - paths: &PathsMap, + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, + universe: &RoaringBitmap, + _distances: Vec>, + cost: u64, ); } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index e46f6ce66..2377f1c84 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -1,10 +1,8 @@ -use std::collections::{BTreeMap, HashSet}; - -use roaring::RoaringBitmap; +#![allow(clippy::too_many_arguments)] use super::empty_paths_cache::EmptyPathsCache; -use super::paths_map::PathsMap; -use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; +use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use std::collections::VecDeque; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Path { @@ -12,226 +10,119 @@ pub struct Path { pub cost: u64, } -struct DijkstraState { - unvisited: RoaringBitmap, // should be a small bitset? - distances: Vec, // or binary heap, or btreemap? (f64, usize) - edges: Vec, - edge_costs: Vec, - paths: Vec>, -} - -pub struct KCheapestPathsState { - cheapest_paths: PathsMap, - potential_cheapest_paths: BTreeMap>, - pub kth_cheapest_path: Path, -} - -impl KCheapestPathsState { - pub fn next_cost(&self) -> u64 { - self.kth_cheapest_path.cost - } - - pub fn new( - graph: &RankingRuleGraph, - ) -> Option { - let Some(cheapest_path) = graph.cheapest_path_to_end(graph.query_graph.root_node) else { - return None - }; - let cheapest_paths = PathsMap::from_paths(&[cheapest_path.clone()]); - let potential_cheapest_paths = BTreeMap::new(); - Some(KCheapestPathsState { - cheapest_paths, - potential_cheapest_paths, - kth_cheapest_path: cheapest_path, - }) - } - - pub fn remove_empty_paths(mut self, empty_paths_cache: &EmptyPathsCache) -> Option { - self.cheapest_paths.remove_edges(&empty_paths_cache.empty_edges); - self.cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes); - - let mut costs_to_delete = HashSet::new(); - for (cost, potential_cheapest_paths) in self.potential_cheapest_paths.iter_mut() { - potential_cheapest_paths.remove_edges(&empty_paths_cache.empty_edges); - potential_cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes); - if potential_cheapest_paths.is_empty() { - costs_to_delete.insert(*cost); - } - } - for cost in costs_to_delete { - self.potential_cheapest_paths.remove(&cost); - } - - if self.cheapest_paths.is_empty() {} - - todo!() - } - - pub fn compute_paths_of_next_lowest_cost( - mut self, - graph: &mut RankingRuleGraph, +impl RankingRuleGraph { + pub fn paths_of_cost( + &self, + from: usize, + cost: u64, + all_distances: &[Vec], empty_paths_cache: &EmptyPathsCache, - into_map: &mut PathsMap, - ) -> Option { - if !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges) { - into_map.add_path(&self.kth_cheapest_path); + ) -> Vec> { + let mut paths = vec![]; + self.paths_of_cost_rec( + from, + all_distances, + cost, + &mut vec![], + &mut paths, + &vec![false; self.all_edges.len()], + empty_paths_cache, + ); + paths + } + pub fn paths_of_cost_rec( + &self, + from: usize, + all_distances: &[Vec], + cost: u64, + prev_edges: &mut Vec, + paths: &mut Vec>, + forbidden_edges: &[bool], + empty_paths_cache: &EmptyPathsCache, + ) { + let 
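// Recursive enumeration: an outgoing edge of cost `c` from `from` can
// belong to a path of total cost `cost` only if the destination can
// still reach the end with the remaining budget, i.e. only if
// `all_distances[to]` contains `cost - c`. A worked example, assuming
// two edges of cost 1 and 2 and `all_distances[to] == [0, 2]`:
//
//     // budget 2, edge cost 1 -> remainder 1, not in [0, 2]: pruned
//     // budget 2, edge cost 2 -> remainder 0, in [0, 2]: recurse
//
// Together with the forbidden-edge bitmask, this keeps the enumeration
// exact without ever exploring dead branches.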
distances = &all_distances[from]; + if !distances.contains(&cost) { + panic!(); } - let cur_cost = self.kth_cheapest_path.cost; - while self.kth_cheapest_path.cost <= cur_cost { - if let Some(next_self) = self.compute_next_cheapest_paths(graph, empty_paths_cache) { - self = next_self; - if self.kth_cheapest_path.cost == cur_cost - && !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges) + let tos = &self.query_graph.edges[from].successors; + let mut valid_edges = vec![]; + for to in tos { + self.visit_edges::<()>(from as u32, to, |edge_idx, edge| { + if cost >= edge.cost as u64 + && all_distances[to as usize].contains(&(cost - edge.cost as u64)) + && !forbidden_edges[edge_idx as usize] { - into_map.add_path(&self.kth_cheapest_path); - } else { - break; + valid_edges.push((edge_idx, edge.cost, to)); } - } else { - return None; - } + std::ops::ControlFlow::Continue(()) + }); } - Some(self) - } - fn compute_next_cheapest_paths( - mut self, - graph: &mut RankingRuleGraph, - empty_paths_cache: &EmptyPathsCache, - ) -> Option { - // for all nodes in the last cheapest path (called spur_node), except last one... - for (i, edge_idx) in self.kth_cheapest_path.edges[..self.kth_cheapest_path.edges.len() - 1] - .iter() - .enumerate() - { - let Some(edge) = graph.all_edges[*edge_idx as usize].as_ref() else { continue; }; - let Edge { from_node: spur_node, .. } = edge; - - let root_path = &self.kth_cheapest_path.edges[..i]; - if empty_paths_cache.path_is_empty(root_path) { + for (edge_idx, edge_cost, to) in valid_edges { + prev_edges.push(edge_idx); + if empty_paths_cache.empty_prefixes.contains_prefix_of_path(prev_edges) { continue; } - - let root_cost = root_path.iter().fold(0, |sum, next| { - sum + graph.all_edges[*next as usize].as_ref().unwrap().cost as u64 - }); - - let mut tmp_removed_edges = vec![]; - // for all the paths already found that share a common prefix with the root path - // we delete the edge from the spur node to the next one - for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) { - let was_removed = - graph.node_edges[*spur_node as usize].remove(edge_index_to_remove); - if was_removed { - tmp_removed_edges.push(edge_index_to_remove); - } + let mut new_forbidden_edges = forbidden_edges.to_vec(); + for edge_idx in empty_paths_cache.empty_couple_edges[edge_idx as usize].iter() { + new_forbidden_edges[*edge_idx as usize] = true; + } + for edge_idx in empty_paths_cache.empty_prefixes.final_edges_ater_prefix(prev_edges) { + new_forbidden_edges[edge_idx as usize] = true; } - // Compute the cheapest path from the spur node to the destination - // we will combine it with the root path to get a potential kth cheapest path - let spur_path = graph.cheapest_path_to_end(*spur_node); - // restore the temporarily removed edges - graph.node_edges[*spur_node as usize].extend(tmp_removed_edges); - - let Some(spur_path) = spur_path else { continue; }; - let total_cost = root_cost + spur_path.cost; - let total_path = Path { - edges: root_path.iter().chain(spur_path.edges.iter()).cloned().collect(), - cost: total_cost, - }; - let entry = self.potential_cheapest_paths.entry(total_cost).or_default(); - entry.add_path(&total_path); + if to == self.query_graph.end_node { + paths.push(prev_edges.clone()); + } else { + self.paths_of_cost_rec( + to as usize, + all_distances, + cost - edge_cost as u64, + prev_edges, + paths, + &new_forbidden_edges, + empty_paths_cache, + ) + } + prev_edges.pop(); } - while let Some(mut next_cheapest_paths_entry) = 
self.potential_cheapest_paths.first_entry() + } + + pub fn initialize_distances_cheapest(&self) -> Vec> { + let mut distances_to_end: Vec> = vec![vec![]; self.query_graph.nodes.len()]; + let mut enqueued = vec![false; self.query_graph.nodes.len()]; + + let mut node_stack = VecDeque::new(); + + distances_to_end[self.query_graph.end_node as usize] = vec![0]; + for prev_node in + self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter() { - let cost = *next_cheapest_paths_entry.key(); - let next_cheapest_paths = next_cheapest_paths_entry.get_mut(); + node_stack.push_back(prev_node as usize); + enqueued[prev_node as usize] = true; + } - while let Some((next_cheapest_path, cost2)) = next_cheapest_paths.remove_first() { - assert_eq!(cost, cost2); - // NOTE: it is important not to discard the paths that are forbidden due to a - // forbidden prefix, because the cheapest path algorithm (Dijkstra) cannot take - // this property into account. - if next_cheapest_path - .iter() - .any(|edge_index| graph.all_edges[*edge_index as usize].is_none()) - { - continue; - } else { - self.cheapest_paths.insert(next_cheapest_path.iter().copied(), cost); - - if next_cheapest_paths.is_empty() { - next_cheapest_paths_entry.remove(); + while let Some(cur_node) = node_stack.pop_front() { + let mut self_distances = vec![]; + for succ_node in self.query_graph.edges[cur_node].successors.iter() { + let succ_distances = &distances_to_end[succ_node as usize]; + let _ = self.visit_edges::<()>(cur_node as u32, succ_node, |_, edge| { + for succ_distance in succ_distances { + self_distances.push(edge.cost as u64 + succ_distance); } - self.kth_cheapest_path = Path { edges: next_cheapest_path, cost }; - - return Some(self); + std::ops::ControlFlow::Continue(()) + }); + } + self_distances.sort_unstable(); + self_distances.dedup(); + distances_to_end[cur_node] = self_distances; + for prev_node in self.query_graph.edges[cur_node].predecessors.iter() { + if !enqueued[prev_node as usize] { + node_stack.push_back(prev_node as usize); + enqueued[prev_node as usize] = true; } } - let _ = next_cheapest_paths_entry.remove_entry(); } - None - } -} - -impl RankingRuleGraph { - fn cheapest_path_to_end(&self, from: u32) -> Option { - let mut dijkstra = DijkstraState { - unvisited: (0..self.query_graph.nodes.len() as u32).collect(), - distances: vec![u64::MAX; self.query_graph.nodes.len()], - edges: vec![u32::MAX; self.query_graph.nodes.len()], - edge_costs: vec![u8::MAX; self.query_graph.nodes.len()], - paths: vec![None; self.query_graph.nodes.len()], - }; - dijkstra.distances[from as usize] = 0; - - // TODO: could use a binary heap here to store the distances, or a btreemap - while let Some(cur_node) = - dijkstra.unvisited.iter().min_by_key(|&n| dijkstra.distances[n as usize]) - { - let cur_node_dist = dijkstra.distances[cur_node as usize]; - if cur_node_dist == u64::MAX { - return None; - } - if cur_node == self.query_graph.end_node { - break; - } - - let succ_cur_node = &self.successors[cur_node as usize]; - let unvisited_succ_cur_node = succ_cur_node & &dijkstra.unvisited; - for succ in unvisited_succ_cur_node { - let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(cur_node, succ) else { - continue - }; - - let old_dist_succ = &mut dijkstra.distances[succ as usize]; - let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64; - if new_potential_distance < *old_dist_succ { - *old_dist_succ = new_potential_distance; - dijkstra.edges[succ as usize] = cheapest_edge; - dijkstra.edge_costs[succ as 
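// The distance sets are built backwards from the end node:
// `distances_to_end[end] = [0]`, and every other node collects
// `edge.cost + d` for each successor distance `d`, then sorts and
// dedups. A small worked example, assuming edges `n -> end` of costs
// 1 and 2, and one edge `m -> n` of cost 1:
//
//     // distances_to_end[end] = [0]
//     // distances_to_end[n]   = [1, 2]   (0+1, 0+2)
//     // distances_to_end[m]   = [2, 3]   (1+1, 1+2)
//
// Unlike the Dijkstra-based version being removed here, this keeps
// every achievable cost per node, not just the cheapest one.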
usize] = cheapest_edge_cost; - dijkstra.paths[succ as usize] = Some(cur_node); - } - } - dijkstra.unvisited.remove(cur_node); - } - - let mut cur = self.query_graph.end_node; - let mut path_edges = vec![]; - while let Some(n) = dijkstra.paths[cur as usize] { - path_edges.push(dijkstra.edges[cur as usize]); - cur = n; - } - path_edges.reverse(); - Some(Path { - edges: path_edges, - cost: dijkstra.distances[self.query_graph.end_node as usize], - }) - } - - pub fn cheapest_edge(&self, cur_node: u32, succ: u32) -> Option<(u32, u8)> { - self.visit_edges(cur_node, succ, |edge_idx, edge| { - std::ops::ControlFlow::Break((edge_idx, edge.cost)) - }) + distances_to_end } } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index cb3e3da38..ef2eba895 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -32,16 +32,19 @@ impl EdgeDocidsCache { db_cache: &mut DatabaseCache<'transaction>, edge_index: u32, graph: &RankingRuleGraph, + // TODO: maybe universe doesn't belong here + universe: &RoaringBitmap, ) -> Result> { - if self.cache.contains_key(&edge_index) { - return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); - } let edge = graph.all_edges[edge_index as usize].as_ref().unwrap(); match &edge.details { EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All), EdgeDetails::Data(details) => { - let docids = G::compute_docids(index, txn, db_cache, details)?; + if self.cache.contains_key(&edge_index) { + return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); + } + // TODO: maybe universe doesn't belong here + let docids = universe & G::compute_docids(index, txn, db_cache, details)?; let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index db68838b5..bbfe2eedd 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,26 +1,60 @@ -use roaring::RoaringBitmap; - use super::paths_map::PathsMap; -#[derive(Default, Clone)] +#[derive(Clone)] pub struct EmptyPathsCache { - pub empty_edges: RoaringBitmap, + pub empty_edges: Vec, pub empty_prefixes: PathsMap<()>, + pub empty_couple_edges: Vec>, } impl EmptyPathsCache { + pub fn new(all_edges_len: usize) -> Self { + Self { + empty_edges: vec![false; all_edges_len], + empty_prefixes: PathsMap::default(), + empty_couple_edges: vec![vec![]; all_edges_len], + } + } pub fn forbid_edge(&mut self, edge_idx: u32) { - self.empty_edges.insert(edge_idx); + self.empty_edges[edge_idx as usize] = true; + self.empty_couple_edges[edge_idx as usize] = vec![]; self.empty_prefixes.remove_edge(&edge_idx); + for edges2 in self.empty_couple_edges.iter_mut() { + if let Some(edge2_pos) = edges2.iter().position(|e| *e == edge_idx) { + edges2.swap_remove(edge2_pos); + } + } + } + pub fn forbid_prefix(&mut self, prefix: &[u32]) { + self.empty_prefixes.insert(prefix.iter().copied(), ()); + } + pub fn forbid_couple_edges(&mut self, edge1: u32, edge2: u32) { + assert!(!self.empty_couple_edges[edge1 as usize].contains(&edge2)); + self.empty_couple_edges[edge1 as usize].push(edge2); } pub fn path_is_empty(&self, path: &[u32]) -> bool { for edge in path { - if self.empty_edges.contains(*edge) { + if self.empty_edges[*edge as usize] { return true; } } if 
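// The cache rules out paths at three granularities: single edges
// (`empty_edges`), path prefixes (`empty_prefixes`), and ordered edge
// pairs (`empty_couple_edges`: the second edge appearing after the
// first always yields an empty intersection). Illustrative calls, with
// made-up edge indices:
//
//     // cache.forbid_edge(3);            // edge 3 matches nothing
//     // cache.forbid_prefix(&[0, 2]);    // no path starting 0 -> 2 survives
//     // cache.forbid_couple_edges(1, 4); // 1 followed by 4 is always empty
//
// `path_is_empty` rejects a candidate path if it trips any of these
// checks.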
self.empty_prefixes.contains_prefix_of_path(path) { return true; } + for (edge1, edges2) in self.empty_couple_edges.iter().enumerate() { + if let Some(pos_edge1) = path.iter().position(|e| *e == edge1 as u32) { + if path[pos_edge1..].iter().any(|e| edges2.contains(e)) { + return true; + } + } + } + // for (edge1, edge2) in self.empty_couple_edges.iter() { + // if path.contains(edge1) && path.contains(edge2) { + // return true; + // } + // } + // if self.empty_prefixes.contains_prefix_of_path(path) { + // return true; + // } false } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index b1adb80fc..ac5e1f46b 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -13,7 +13,6 @@ use heed::RoTxn; use roaring::RoaringBitmap; use self::empty_paths_cache::EmptyPathsCache; -use self::paths_map::PathsMap; use super::db_cache::DatabaseCache; use super::logger::SearchLogger; @@ -83,8 +82,11 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, - paths: &PathsMap, + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, + universe: &RoaringBitmap, + distances: &[Vec], + cost: u64, logger: &mut dyn SearchLogger, ); } @@ -135,7 +137,7 @@ impl RankingRuleGraph { None } - fn remove_edge(&mut self, edge_index: u32) { + pub fn remove_edge(&mut self, edge_index: u32) { let edge_opt = &mut self.all_edges[edge_index as usize]; let Some(edge) = &edge_opt else { return }; let (from_node, _to_node) = (edge.from_node, edge.to_node); @@ -151,44 +153,4 @@ impl RankingRuleGraph { } self.successors[from_node as usize] = new_successors_from_node; } - - pub fn graphviz(&self) -> String { - let mut desc = String::new(); - desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n"); - - for (node_idx, node) in self.query_graph.nodes.iter().enumerate() { - if matches!(node, QueryNode::Deleted) { - continue; - } - desc.push_str(&format!("{node_idx} [label = {:?}]", node)); - if node_idx == self.query_graph.root_node as usize { - desc.push_str("[color = blue]"); - } else if node_idx == self.query_graph.end_node as usize { - desc.push_str("[color = red]"); - } - desc.push_str(";\n"); - } - for edge in self.all_edges.iter().flatten() { - let Edge { from_node, to_node, details, .. } = edge; - - match &details { - EdgeDetails::Unconditional => { - desc.push_str(&format!( - "{from_node} -> {to_node} [label = \"always cost {cost}\"];\n", - cost = edge.cost, - )); - } - EdgeDetails::Data(details) => { - desc.push_str(&format!( - "{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\"];\n", - cost = edge.cost, - edge_label = G::graphviz_edge_details_label(details) - )); - } - } - } - - desc.push('}'); - desc - } } diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 111b55140..3b01508c9 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -1,12 +1,11 @@ -use std::collections::hash_map::DefaultHasher; -use std::fmt::Write; -use std::hash::{Hash, Hasher}; + + + use roaring::RoaringBitmap; use super::cheapest_paths::Path; -use super::{Edge, EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::QueryNode; + #[derive(Debug, Clone)] pub struct PathsMap { @@ -157,6 +156,24 @@ impl PathsMap { } } + pub fn final_edges_ater_prefix(&self, prefix: &[u32]) -> Vec { + let [first_edge, remaining_prefix @ ..] 
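// `PathsMap` is a trie keyed by edge indices; this walks the trie along
// `prefix` and returns the edges that terminate a stored path just
// after it. With made-up stored paths [0, 1, 2] and [0, 1, 3]:
//
//     // final_edges_ater_prefix(&[0, 1]) -> [2, 3]
//
// An exhausted slice pattern means the prefix has been fully consumed,
// so the final edges are exactly the children whose node holds a value.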
= prefix else { + return self.nodes.iter().filter_map(|n| { + if n.1.value.is_some() { + Some(n.0) + } else { + None + } + }).collect(); + }; + for (edge, rest) in self.nodes.iter() { + if edge == first_edge { + return rest.final_edges_ater_prefix(remaining_prefix); + } + } + vec![] + } + pub fn edge_indices_after_prefix(&self, prefix: &[u32]) -> Vec { let [first_edge, remaining_prefix @ ..] = prefix else { return self.nodes.iter().map(|n| n.0).collect(); @@ -185,88 +202,4 @@ impl PathsMap { } } } - - pub fn graphviz(&self, graph: &RankingRuleGraph) -> String { - let mut desc = String::new(); - desc.push_str("digraph G {\n"); - self.graphviz_rec(&mut desc, vec![], graph); - desc.push_str("\n}\n"); - desc - } - fn graphviz_rec( - &self, - desc: &mut String, - path_from: Vec, - graph: &RankingRuleGraph, - ) { - let id_from = { - let mut h = DefaultHasher::new(); - path_from.hash(&mut h); - h.finish() - }; - for (edge_idx, rest) in self.nodes.iter() { - let Some(Edge { from_node, to_node, cost, .. }) = graph.all_edges[*edge_idx as usize].as_ref() else { - continue; - }; - let mut path_to = path_from.clone(); - path_to.push({ - let mut h = DefaultHasher::new(); - edge_idx.hash(&mut h); - h.finish() - }); - let id_to = { - let mut h = DefaultHasher::new(); - path_to.hash(&mut h); - h.finish() - }; - writeln!(desc, "{id_to} [label = \"{from_node}→{to_node} [{cost}]\"];").unwrap(); - writeln!(desc, "{id_from} -> {id_to};").unwrap(); - - rest.graphviz_rec(desc, path_to, graph); - } - } -} - -impl RankingRuleGraph { - pub fn graphviz_with_path(&self, path: &Path) -> String { - let mut desc = String::new(); - desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n"); - - for (node_idx, node) in self.query_graph.nodes.iter().enumerate() { - if matches!(node, QueryNode::Deleted) { - continue; - } - desc.push_str(&format!("{node_idx} [label = {:?}]", node)); - if node_idx == self.query_graph.root_node as usize { - desc.push_str("[color = blue]"); - } else if node_idx == self.query_graph.end_node as usize { - desc.push_str("[color = red]"); - } - desc.push_str(";\n"); - } - - for (edge_idx, edge) in self.all_edges.iter().enumerate() { - let Some(edge) = edge else { continue }; - let Edge { from_node, to_node, .. 
} = edge; - let color = if path.edges.contains(&(edge_idx as u32)) { "red" } else { "green" }; - match &edge.details { - EdgeDetails::Unconditional => { - desc.push_str(&format!( - "{from_node} -> {to_node} [label = \"cost {cost}\", color = {color}];\n", - cost = edge.cost, - )); - } - EdgeDetails::Data(details) => { - desc.push_str(&format!( - "{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n", - cost = edge.cost, - edge_label = G::graphviz_edge_details_label(details), - )); - } - } - } - - desc.push('}'); - desc - } } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 6d2fefa65..9b4fa8edf 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -16,9 +16,9 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result { match value1 { QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()), - QueryTerm::Phrase(phrase1) => { + QueryTerm::Phrase { phrase: phrase1 } => { // TODO: remove second unwrap - let original = phrase1.last().unwrap().as_ref().unwrap().clone(); + let original = phrase1.words.last().unwrap().as_ref().unwrap().clone(); ( WordDerivations { original: original.clone(), @@ -26,6 +26,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result Result( let (derivations2, pos2, ngram_len2) = match value2 { QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()), - QueryTerm::Phrase(phrase2) => { + QueryTerm::Phrase { phrase: phrase2 } => { // TODO: remove second unwrap - let original = phrase2.last().unwrap().as_ref().unwrap().clone(); + let original = phrase2.words.last().unwrap().as_ref().unwrap().clone(); ( WordDerivations { original: original.clone(), @@ -73,6 +77,8 @@ pub fn visit_to_node<'transaction, 'from_data>( one_typo: vec![], two_typos: vec![], use_prefix_db: false, + synonyms: vec![], + split_words: None, }, *pos2.start(), 1, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index fc1a44310..5b3869ea8 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -2,18 +2,21 @@ pub mod build; pub mod compute_docids; use heed::RoTxn; +use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; -use super::paths_map::PathsMap; + use super::{EdgeDetails, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; +use crate::new::logger::SearchLogger; use crate::new::query_term::WordDerivations; -use crate::new::QueryNode; +use crate::new::{QueryGraph, QueryNode}; use crate::{Index, Result}; +// TODO: intern the strings, refer to them by their pointer? + #[derive(Debug, Clone)] pub enum WordPair { - // TODO: add WordsSwapped and WordPrefixSwapped case Words { left: String, right: String }, WordsSwapped { left: String, right: String }, WordPrefix { left: String, right_prefix: String }, @@ -22,6 +25,7 @@ pub enum WordPair { #[derive(Clone)] pub struct ProximityEdge { + // TODO: use a list of pointers to the word pairs instead? 
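// A proximity edge of cost `p` between two term nodes is backed by the
// concrete word pairs whose documents must contain the two words within
// proximity `p`; resolving the edge presumably unions the docids of
// each pair at that proximity. Illustrative pairs for "new york",
// assuming a prefix query "yo":
//
//     // WordPair::Words { left: "new", right: "york" }
//     // WordPair::WordsSwapped { left: "new", right: "york" } // "york ... new"
//     // WordPair::WordPrefix { left: "new", right_prefix: "yo" }
//
// The single `proximity` field below is shared by every pair on the
// edge.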
pairs: Vec, proximity: u8, } @@ -67,10 +71,20 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &super::RankingRuleGraph, - paths: &PathsMap, + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, - logger: &mut dyn crate::new::logger::SearchLogger, + universe: &RoaringBitmap, + distances: &[Vec], + cost: u64, + logger: &mut dyn SearchLogger, ) { - logger.log_proximity_state(graph, paths, empty_paths_cache); + logger.log_proximity_state( + graph, + paths, + empty_paths_cache, + universe, + distances.to_vec(), + cost, + ); } } diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs index 90650340f..f3394206b 100644 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs @@ -5,7 +5,7 @@ use roaring::{MultiOps, RoaringBitmap}; use super::edge_docids_cache::EdgeDocidsCache; use super::empty_paths_cache::EmptyPathsCache; -use super::paths_map::PathsMap; + use super::{RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; @@ -21,44 +21,65 @@ impl RankingRuleGraph { edge_docids_cache: &mut EdgeDocidsCache, empty_paths_cache: &mut EmptyPathsCache, universe: &RoaringBitmap, - mut paths: PathsMap, + mut paths: Vec>, ) -> Result { + paths.sort_unstable(); + let mut needs_filtering = false; let mut path_bitmaps = vec![]; + 'path_loop: loop { + if needs_filtering { + for path in paths.iter_mut() { + if empty_paths_cache.path_is_empty(path) { + path.clear(); + } + } + needs_filtering = false; + } + let Some(edge_indexes) = paths.pop() else { + break; + }; - paths.remove_edges(&empty_paths_cache.empty_edges); - paths.remove_prefixes(&empty_paths_cache.empty_prefixes); + if edge_indexes.is_empty() { + continue; + } - 'path_loop: while let Some((edge_indexes, _)) = paths.remove_first() { - // if path is excluded, continue... - let mut processed_edges = vec![]; let mut path_bitmap = universe.clone(); + let mut visited_edges = vec![]; + let mut cached_edge_docids = vec![]; 'edge_loop: for edge_index in edge_indexes { - processed_edges.push(edge_index); - let edge_docids = - edge_docids_cache.get_edge_docids(index, txn, db_cache, edge_index, self)?; + visited_edges.push(edge_index); + let edge_docids = edge_docids_cache + .get_edge_docids(index, txn, db_cache, edge_index, self, universe)?; match edge_docids { BitmapOrAllRef::Bitmap(edge_docids) => { + cached_edge_docids.push((edge_index, edge_docids.clone())); + let (_, edge_docids) = cached_edge_docids.last().unwrap(); if edge_docids.is_disjoint(universe) { // 1. Store in the cache that this edge is empty for this universe empty_paths_cache.forbid_edge(edge_index); - // 2. remove all the paths that contain this edge for this universe - paths.remove_edge(&edge_index); - // 3. remove this edge from the proximity graph - + // 2. remove this edge from the proximity graph self.remove_edge(edge_index); - - // 4. continue executing this function again on the remaining paths + edge_docids_cache.cache.remove(&edge_index); + needs_filtering = true; + // 3. continue executing this function again on the remaining paths continue 'path_loop; } else { path_bitmap &= edge_docids; if path_bitmap.is_disjoint(universe) { - // 1. Store in the cache that this prefix is empty for this universe - empty_paths_cache - .empty_prefixes - .insert(processed_edges.iter().copied(), ()); - // 2. remove all the paths beginning with this prefix - paths.remove_prefix(&processed_edges); - // 3. 
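// The rewritten loop filters lazily: `needs_filtering` is set only when
// a new empty edge, prefix, or couple is discovered, and the next
// iteration clears any queued path that has become impossible. The
// couple-edge discovery works off `cached_edge_docids`: when the
// running `path_bitmap` dies, each earlier edge on the path is
// re-intersected with the current one, e.g. (indices made up):
//
//     // edge_docids(2) & edge_docids(5) disjoint from universe
//     //   -> empty_paths_cache.forbid_couple_edges(2, 5)
//
// so any future path containing "2 then 5" is skipped without touching
// the index again.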
continue executing this function again on the remaining paths? + needs_filtering = true; + empty_paths_cache.forbid_prefix(&visited_edges); + // if the intersection between this edge and any + // previous one is disjoint with the universe, + // then we add these two edges to the empty_path_cache + for (edge_index2, edge_docids2) in + cached_edge_docids[..cached_edge_docids.len() - 1].iter() + { + let intersection = edge_docids & edge_docids2; + if intersection.is_disjoint(universe) { + empty_paths_cache + .forbid_couple_edges(*edge_index2, edge_index); + } + } continue 'path_loop; } } @@ -68,6 +89,7 @@ impl RankingRuleGraph { } path_bitmaps.push(path_bitmap); } + Ok(MultiOps::union(path_bitmaps)) } } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 55a45e3c3..c9ca7c229 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -2,16 +2,18 @@ use heed::{BytesDecode, RoTxn}; use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; -use super::paths_map::PathsMap; -use super::{EdgeDetails, RankingRuleGraphTrait}; + +use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; -use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; -use crate::new::QueryNode; +use crate::new::logger::SearchLogger; +use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; +use crate::new::resolve_query_graph::resolve_phrase; +use crate::new::{QueryGraph, QueryNode}; use crate::{Index, Result, RoaringBitmapCodec}; #[derive(Clone)] pub enum TypoEdge { - Phrase, + Phrase { phrase: Phrase }, Word { derivations: WordDerivations, nbr_typos: u8 }, } @@ -23,7 +25,7 @@ impl RankingRuleGraphTrait for TypoGraph { fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String { match edge { - TypoEdge::Phrase => format!(", 0 typos"), + TypoEdge::Phrase { .. } => ", 0 typos".to_owned(), TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"), } } @@ -33,9 +35,9 @@ impl RankingRuleGraphTrait for TypoGraph { txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, edge: &Self::EdgeDetails, - ) -> Result { + ) -> Result { match edge { - TypoEdge::Phrase => todo!(), + TypoEdge::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase), TypoEdge::Word { derivations, nbr_typos } => { let words = match nbr_typos { 0 => &derivations.zero_typo, @@ -68,21 +70,23 @@ impl RankingRuleGraphTrait for TypoGraph { _index: &Index, _txn: &'transaction RoTxn, _db_cache: &mut DatabaseCache<'transaction>, - from_node: &QueryNode, + _from_node: &QueryNode, ) -> Result> { Ok(Some(())) } fn build_visit_to_node<'from_data, 'transaction: 'from_data>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + _index: &Index, + _txn: &'transaction RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, to_node: &QueryNode, - from_node_data: &'from_data Self::BuildVisitedFromNode, + _from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { match to_node { QueryNode::Term(LocatedQueryTerm { value, .. 
}) => match value { - QueryTerm::Phrase(_) => Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase))]), + QueryTerm::Phrase { phrase } => { + Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase: phrase.clone() }))]) + } QueryTerm::Word { derivations } => { let mut edges = vec![]; if !derivations.zero_typo.is_empty() || derivations.use_prefix_db { @@ -121,11 +125,14 @@ impl RankingRuleGraphTrait for TypoGraph { } fn log_state( - graph: &super::RankingRuleGraph, - paths: &PathsMap, + graph: &RankingRuleGraph, + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, - logger: &mut dyn crate::new::logger::SearchLogger, + universe: &RoaringBitmap, + distances: &[Vec], + cost: u64, + logger: &mut dyn SearchLogger, ) { - logger.log_typo_state(graph, paths, empty_paths_cache); + logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances.to_vec(), cost); } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index f023f94d1..9b3bcb38c 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -1,3 +1,5 @@ +use std::time::Instant; + use heed::RoTxn; use roaring::RoaringBitmap; @@ -9,7 +11,7 @@ use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; use crate::new::ranking_rule_graph::proximity::ProximityGraph; use crate::new::ranking_rule_graph::typo::TypoGraph; use crate::new::words::Words; -use crate::search::new::sort::Sort; +// use crate::search::new::sort::Sort; use crate::{Filter, Index, Result, TermsMatchingStrategy}; pub trait RankingRuleOutputIter<'transaction, Query> { @@ -123,13 +125,14 @@ pub fn execute_search<'transaction>( length: usize, logger: &mut dyn SearchLogger, ) -> Result> { + logger.initial_query(query_graph, Instant::now()); let words = &mut Words::new(TermsMatchingStrategy::Last); - let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; + // let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned()); let typo = &mut GraphBasedRankingRule::::new("typo".to_owned()); // TODO: ranking rules given as argument let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> = - vec![words, typo, proximity, sort]; + vec![words, typo, proximity /*sort*/]; logger.ranking_rules(&ranking_rules); @@ -144,7 +147,13 @@ pub fn execute_search<'transaction>( } let ranking_rules_len = ranking_rules.len(); - logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, &universe); + logger.start_iteration_ranking_rule( + 0, + ranking_rules[0], + query_graph, + &universe, + Instant::now(), + ); ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?; let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len]; @@ -154,11 +163,12 @@ pub fn execute_search<'transaction>( macro_rules! 
back { () => { - // assert!(candidates[cur_ranking_rule_index].is_empty()); + assert!(candidates[cur_ranking_rule_index].is_empty()); logger.end_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates[cur_ranking_rule_index], + Instant::now(), ); candidates[cur_ranking_rule_index].clear(); ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger); @@ -187,6 +197,7 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates, + Instant::now(), ); } else { let all_candidates = candidates.iter().collect::>(); @@ -196,6 +207,7 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &skipped_candidates.into_iter().collect(), + Instant::now(), ); let candidates = candidates .iter() @@ -219,24 +231,26 @@ pub fn execute_search<'transaction>( // The universe for this bucket is zero or one element, so we don't need to sort // anything, just extend the results and go back to the parent ranking rule. if candidates[cur_ranking_rule_index].len() <= 1 { - candidates[cur_ranking_rule_index].clear(); maybe_add_to_results!(&candidates[cur_ranking_rule_index]); + candidates[cur_ranking_rule_index].clear(); back!(); continue; } - logger.next_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index], - &candidates[cur_ranking_rule_index], - ); - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else { // TODO: add remaining candidates automatically here? back!(); continue; }; + logger.next_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index], + &candidates[cur_ranking_rule_index], + &next_bucket.candidates, + Instant::now(), + ); + assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates)); candidates[cur_ranking_rule_index] -= &next_bucket.candidates; @@ -255,6 +269,7 @@ pub fn execute_search<'transaction>( ranking_rules[cur_ranking_rule_index], &next_bucket.query, &candidates[cur_ranking_rule_index], + Instant::now(), ); ranking_rules[cur_ranking_rule_index].start_iteration( index, @@ -271,17 +286,18 @@ pub fn execute_search<'transaction>( #[cfg(test)] mod tests { - use std::fs::File; - use std::io::{BufRead, BufReader, Cursor, Seek}; - use std::time::Instant; - - use heed::EnvOpenOptions; - use super::execute_search; + // use crate::allocator::ALLOC; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; use crate::new::db_cache::DatabaseCache; - use crate::new::logger::detailed::DetailedSearchLogger; + use big_s::S; + use heed::EnvOpenOptions; + use maplit::hashset; + use std::fs::File; + use std::io::{BufRead, BufReader, Cursor, Seek}; + use std::time::Instant; + // use crate::new::logger::detailed::DetailedSearchLogger; use crate::new::logger::{DefaultSearchLogger, SearchLogger}; use crate::new::make_query_graph; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; @@ -323,16 +339,119 @@ mod tests { let mut db_cache = DatabaseCache::default(); let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "b b b b b b b b b b").unwrap(); - println!("{}", query_graph.graphviz()); - logger.initial_query(&query_graph); + make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") + .unwrap(); + logger.initial_query(&query_graph, Instant::now()); let results = 
- execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 20, &mut logger) + execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 50, &mut logger) .unwrap(); println!("{results:?}") } + #[test] + fn search_wiki_new() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_wiki").unwrap(); + let txn = index.read_txn().unwrap(); + + println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); + + let primary_key = index.primary_key(&txn).unwrap().unwrap(); + let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + // loop { + let start = Instant::now(); + + let mut db_cache = DatabaseCache::default(); + + let query_graph = make_query_graph( + &index, + &txn, + &mut db_cache, + "which a the releases from poison by the government", + ) + .unwrap(); + + // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + + let results = execute_search( + &index, + &txn, + &mut db_cache, + &query_graph, + None, + 0, + 20, + &mut DefaultSearchLogger, + // &mut logger, + ) + .unwrap(); + + // logger.write_d2_description(); + + let elapsed = start.elapsed(); + + let ids = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|x| { + let obkv = &x.1; + let id = obkv.get(primary_key).unwrap(); + let id: serde_json::Value = serde_json::from_slice(id).unwrap(); + id.as_str().unwrap().to_owned() + }) + .collect::>(); + + println!("{}us: {results:?}", elapsed.as_micros()); + println!("external ids: {ids:?}"); + // println!("max_resident: {}", ALLOC.max_resident.load(std::sync::atomic::Ordering::SeqCst)); + // println!("allocated: {}", ALLOC.allocated.load(std::sync::atomic::Ordering::SeqCst)); + // } + } + + #[test] + fn search_wiki_old() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_wiki").unwrap(); + + let txn = index.read_txn().unwrap(); + + let rr = index.criteria(&txn).unwrap(); + println!("{rr:?}"); + + let primary_key = index.primary_key(&txn).unwrap().unwrap(); + let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + + let start = Instant::now(); + + let mut s = Search::new(&txn, &index); + s.query("releases from poison by the government"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); + let docs = s.execute().unwrap(); + + let elapsed = start.elapsed(); + + let ids = index + .documents(&txn, docs.documents_ids.iter().copied()) + .unwrap() + .into_iter() + .map(|x| { + let obkv = &x.1; + let id = obkv.get(primary_key).unwrap(); + let id: serde_json::Value = serde_json::from_slice(id).unwrap(); + id.as_str().unwrap().to_owned() + }) + .collect::>(); + + println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); + println!("external ids: {ids:?}"); + } #[test] fn search_movies_new() { let mut options = EnvOpenOptions::new(); @@ -343,7 +462,7 @@ mod tests { let primary_key = index.primary_key(&txn).unwrap().unwrap(); let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); - + // loop { let start = Instant::now(); let mut db_cache = DatabaseCache::default(); @@ -352,7 +471,7 @@ mod tests { make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") .unwrap(); - let mut logger = DetailedSearchLogger::new("log"); + let mut logger = 
crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( &index, @@ -360,9 +479,10 @@ mod tests { &mut db_cache, &query_graph, None, - 5, + 0, 20, - &mut logger, //&mut DefaultSearchLogger, + // &mut DefaultSearchLogger, + &mut logger, ) .unwrap(); @@ -384,6 +504,7 @@ mod tests { println!("{}us: {results:?}", elapsed.as_micros()); println!("external ids: {ids:?}"); + // } } #[test] @@ -392,19 +513,39 @@ mod tests { options.map_size(100 * 1024 * 1024 * 1024); // 100 GB let index = Index::new(options, "data_movies").unwrap(); + let txn = index.read_txn().unwrap(); + let rr = index.criteria(&txn).unwrap(); + println!("{rr:?}"); + + let primary_key = index.primary_key(&txn).unwrap().unwrap(); + let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("b b b b b b b b b b"); + s.query("releases from poison by the government"); s.terms_matching_strategy(TermsMatchingStrategy::Last); s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); let elapsed = start.elapsed(); + let ids = index + .documents(&txn, docs.documents_ids.iter().copied()) + .unwrap() + .into_iter() + .map(|x| { + let obkv = &x.1; + let id = obkv.get(primary_key).unwrap(); + let id: serde_json::Value = serde_json::from_slice(id).unwrap(); + id.as_str().unwrap().to_owned() + }) + .collect::>(); + println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); + println!("external ids: {ids:?}"); } #[test] @@ -420,10 +561,16 @@ mod tests { builder.set_min_word_len_one_typo(5); builder.set_min_word_len_two_typos(100); - - builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + builder.set_sortable_fields(hashset! 
{ S("release_date") }); + builder.set_criteria(vec![ + Criterion::Words, + Criterion::Typo, + Criterion::Proximity, + Criterion::Asc("release_date".to_owned()), + ]); builder.execute(|_| (), || false).unwrap(); + wtxn.commit().unwrap(); } #[test] @@ -445,6 +592,7 @@ mod tests { builder.set_searchable_fields(searchable_fields); let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); builder.set_filterable_fields(filterable_fields); + builder.set_min_word_len_one_typo(5); builder.set_min_word_len_two_typos(100); builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); @@ -467,6 +615,48 @@ mod tests { index.prepare_for_closing().wait(); } + #[test] + fn _index_wiki() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_wiki").unwrap(); + let mut wtxn = index.write_txn().unwrap(); + + // let primary_key = "id"; + let searchable_fields = vec!["body", "title", "url"]; + // let filterable_fields = vec![]; + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + // builder.set_primary_key(primary_key.to_owned()); + let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + // builder.set_filterable_fields(filterable_fields); + + // builder.set_min_word_len_one_typo(5); + // builder.set_min_word_len_two_typos(100); + builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]); + builder.execute(|_| (), || false).unwrap(); + + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) + .unwrap(); + + let documents = documents_from( + "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv", + "csv", + ); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + } fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { let reader = File::open(filename) From 1db152046e374e0ee532872ccc294cb376cad3d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 2 Mar 2023 21:27:57 +0100 Subject: [PATCH 035/234] WIP on split words and synonyms support --- milli/src/search/new/mod.rs | 37 ++--- milli/src/search/new/query_graph.rs | 90 ++---------- milli/src/search/new/query_term.rs | 77 ++++++++-- milli/src/search/new/resolve_query_graph.rs | 152 ++++++++++++++++---- milli/src/search/new/words.rs | 19 +-- 5 files changed, 233 insertions(+), 142 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 7b82fc6e9..e09fe2300 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -31,22 +31,27 @@ pub fn make_query_graph<'transaction>( query: &str, ) -> Result { assert!(!query.is_empty()); - let fst = index.words_fst(txn).unwrap(); - let query = LocatedQueryTerm::from_query(query.tokenize(), None, |word, is_prefix| { - word_derivations( - index, - txn, - word, - if word.len() < 4 { - 0 - } else if word.len() < 100 { - 1 - } else { - 2 - }, - is_prefix, - &fst, - ) + let authorize_typos = index.authorize_typos(txn)?; + let 
min_len_one_typo = index.min_word_len_one_typo(txn)?; + let min_len_two_typos = index.min_word_len_two_typos(txn)?; + + let exact_words = index.exact_words(txn)?; + let fst = index.words_fst(txn)?; + + // TODO: get rid of this closure + // also, ngrams can have one typo? + let query = LocatedQueryTerm::from_query(query.tokenize(), None, move |word, is_prefix| { + let typos = if !authorize_typos + || word.len() < min_len_one_typo as usize + || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) + { + 0 + } else if word.len() < min_len_two_typos as usize { + 1 + } else { + 2 + }; + word_derivations(index, txn, word, typos, is_prefix, &fst) }) .unwrap(); let graph = QueryGraph::from_query(index, txn, db_cache, query)?; diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 422896068..8178f8ded 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -7,7 +7,7 @@ use super::db_cache::DatabaseCache; use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::{Index, Result}; -#[derive(Clone)] +#[derive(Debug, Clone)] pub enum QueryNode { Term(LocatedQueryTerm), Deleted, @@ -31,7 +31,7 @@ pub struct QueryGraph { } fn _assert_sizes() { - let _: [u8; 112] = [0; std::mem::size_of::()]; + let _: [u8; 184] = [0; std::mem::size_of::()]; let _: [u8; 48] = [0; std::mem::size_of::()]; } @@ -116,6 +116,8 @@ impl QueryGraph { one_typo: vec![], two_typos: vec![], use_prefix_db: false, + synonyms: vec![], // TODO: ngram synonyms + split_words: None, // TODO: maybe ngram split words? }, }, positions: ngram2_pos, @@ -141,6 +143,8 @@ impl QueryGraph { one_typo: vec![], two_typos: vec![], use_prefix_db: false, + synonyms: vec![], // TODO: ngram synonyms + split_words: None, // TODO: maybe ngram split words? }, }, positions: ngram3_pos, @@ -188,19 +192,20 @@ impl QueryGraph { Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }; } } - pub fn remove_words_at_position(&mut self, position: i8) { + pub fn remove_words_at_position(&mut self, position: i8) -> bool { let mut nodes_to_remove_keeping_edges = vec![]; for (node_idx, node) in self.nodes.iter().enumerate() { let node_idx = node_idx as u32; let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue }; if positions.start() == &position { - nodes_to_remove_keeping_edges.push(node_idx) + nodes_to_remove_keeping_edges.push(node_idx); } } self.remove_nodes_keep_edges(&nodes_to_remove_keeping_edges); self.simplify(); + !nodes_to_remove_keeping_edges.is_empty() } fn simplify(&mut self) { @@ -223,80 +228,3 @@ impl QueryGraph { } } } -impl Debug for QueryNode { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - QueryNode::Term(term @ LocatedQueryTerm { value, positions: _ }) => match value { - QueryTerm::Word { - derivations: - WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db }, - } => { - if term.is_empty() { - write!(f, "\"{original} (∅)\"") - } else { - let derivations = std::iter::once(original.clone()) - .chain(zero_typo.iter().map(|s| format!("T0 .. {s}"))) - .chain(one_typo.iter().map(|s| format!("T1 .. {s}"))) - .chain(two_typos.iter().map(|s| format!("T2 .. 
{s}"))) - .collect::>() - .join(" | "); - - write!(f, "\"{derivations}")?; - if *use_prefix_db { - write!(f, " | +prefix_db")?; - } - write!(f, " | pos:{}..={}", term.positions.start(), term.positions.end())?; - write!(f, "\"")?; - /* - "beautiful" [label = " beautiful | beauiful | beautifol"] - */ - Ok(()) - } - } - QueryTerm::Phrase(ws) => { - let joined = - ws.iter().filter_map(|x| x.clone()).collect::>().join(" "); - let in_quotes = format!("\"{joined}\""); - let escaped = in_quotes.escape_default().collect::(); - write!(f, "\"{escaped}\"") - } - }, - QueryNode::Start => write!(f, "\"START\""), - QueryNode::End => write!(f, "\"END\""), - QueryNode::Deleted => write!(f, "\"_deleted_\""), - } - } -} - -impl QueryGraph { - pub fn graphviz(&self) -> String { - let mut desc = String::new(); - desc.push_str( - r#" -digraph G { -rankdir = LR; -node [shape = "record"] -"#, - ); - - for node in 0..self.nodes.len() { - if matches!(self.nodes[node], QueryNode::Deleted) { - continue; - } - desc.push_str(&format!("{node} [label = {:?}]", &self.nodes[node],)); - if node == self.root_node as usize { - desc.push_str("[color = blue]"); - } else if node == self.end_node as usize { - desc.push_str("[color = red]"); - } - desc.push_str(";\n"); - - for edge in self.edges[node].successors.iter() { - desc.push_str(&format!("{node} -> {edge};\n")); - } - } - - desc.push('}'); - desc - } -} diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 537857bf2..9ea72aa3a 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -10,14 +10,28 @@ use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use heed::types::DecodeIgnore; use heed::RoTxn; +use itertools::Itertools; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; -use crate::{Index, Result}; +use crate::{CboRoaringBitmapLenCodec, Index, Result}; + +#[derive(Debug, Default, Clone)] +pub struct Phrase { + pub words: Vec>, +} +impl Phrase { + pub fn description(&self) -> String { + self.words.iter().flatten().join(" ") + } +} #[derive(Debug, Clone)] pub struct WordDerivations { pub original: String, + // TODO: pub prefix_of: Vec, + pub synonyms: Vec, + pub split_words: Option<(String, String)>, pub zero_typo: Vec, pub one_typo: Vec, pub two_typos: Vec, @@ -114,19 +128,63 @@ pub fn word_derivations( } } } + let split_words = split_best_frequency(index, txn, word)?; - Ok(WordDerivations { original: word.to_owned(), zero_typo, one_typo, two_typos, use_prefix_db }) + let synonyms = index.synonyms(txn)?; + let synonyms = synonyms + .get(&vec![word.to_owned()]) + .cloned() + .unwrap_or_default() + .into_iter() + .map(|words| Phrase { words: words.into_iter().map(Some).collect() }) + .collect(); + + Ok(WordDerivations { + original: word.to_owned(), + synonyms, + split_words, + zero_typo, + one_typo, + two_typos, + use_prefix_db, + }) +} + +fn split_best_frequency( + index: &Index, + txn: &RoTxn, + original: &str, +) -> Result> { + let chars = original.char_indices().skip(1); + let mut best = None; + + for (i, _) in chars { + let (left, right) = original.split_at(i); + + let key = (1, left, right); + let frequency = index + .word_pair_proximity_docids + .remap_data_type::() + .get(txn, &key)? 
+ .unwrap_or(0); + + if frequency != 0 && best.map_or(true, |(old, _, _)| frequency > old) { + best = Some((frequency, left, right)); + } + } + + Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned()))) } #[derive(Debug, Clone)] pub enum QueryTerm { - Phrase(Vec>), + Phrase { phrase: Phrase }, Word { derivations: WordDerivations }, } impl QueryTerm { pub fn original_single_word(&self) -> Option<&str> { match self { - QueryTerm::Phrase(_) => None, + QueryTerm::Phrase { phrase: _ } => None, QueryTerm::Word { derivations } => { if derivations.is_empty() { None @@ -140,14 +198,14 @@ impl QueryTerm { #[derive(Debug, Clone)] pub struct LocatedQueryTerm { - pub value: QueryTerm, // value should be able to contain the word derivations as well + pub value: QueryTerm, pub positions: RangeInclusive, } impl LocatedQueryTerm { pub fn is_empty(&self) -> bool { match &self.value { - QueryTerm::Phrase(_) => false, + QueryTerm::Phrase { phrase: _ } => false, QueryTerm::Word { derivations, .. } => derivations.is_empty(), } } @@ -156,6 +214,7 @@ impl LocatedQueryTerm { pub fn from_query( query: NormalizedTokenIter>, words_limit: Option, + // TODO:` use index + txn + ? instead of closure derivations: impl Fn(&str, bool) -> Result, ) -> Result> { let mut primitive_query = Vec::new(); @@ -232,7 +291,9 @@ impl LocatedQueryTerm { && (quote_count > 0 || separator_kind == SeparatorKind::Hard) { let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase(mem::take(&mut phrase)), + value: QueryTerm::Phrase { + phrase: Phrase { words: mem::take(&mut phrase) }, + }, positions: phrase_start..=phrase_end, }; primitive_query.push(located_query_term); @@ -245,7 +306,7 @@ impl LocatedQueryTerm { // If a quote is never closed, we consider all of the end of the query as a phrase. if !phrase.is_empty() { let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase(mem::take(&mut phrase)), + value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } }, positions: phrase_start..=phrase_end, }; primitive_query.push(located_query_term); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 4da853e7c..93ebcf989 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -5,9 +5,10 @@ use heed::{BytesDecode, RoTxn}; use roaring::{MultiOps, RoaringBitmap}; use super::db_cache::DatabaseCache; -use super::query_term::{QueryTerm, WordDerivations}; -use super::QueryGraph; -use crate::{Index, Result, RoaringBitmapCodec}; +use super::query_term::{Phrase, QueryTerm, WordDerivations}; +use super::{QueryGraph, QueryNode}; + +use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. #[derive(Default)] @@ -27,33 +28,49 @@ impl NodeDocIdsCache { return Ok(&self.cache[&node_idx]); }; let docids = match term { - QueryTerm::Phrase(_) => { - todo!("resolve phrase") - } + QueryTerm::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase)?, QueryTerm::Word { derivations: - WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db }, + WordDerivations { + original, + zero_typo, + one_typo, + two_typos, + use_prefix_db, + synonyms, + split_words, + }, } => { - let derivations_docids = { - let mut or_docids = vec![]; - for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) { - if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? 
{ - or_docids.push(word_docids); - } + let mut or_docids = vec![]; + for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) { + if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? { + or_docids.push(word_docids); } - if *use_prefix_db { - if let Some(prefix_docids) = - db_cache.get_prefix_docids(index, txn, original.as_str())? - { - or_docids.push(prefix_docids); - } + } + if *use_prefix_db { + if let Some(prefix_docids) = + db_cache.get_prefix_docids(index, txn, original.as_str())? + { + or_docids.push(prefix_docids); } - or_docids - }; - let derivations_iter = derivations_docids + } + let mut docids = or_docids .into_iter() - .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()); - MultiOps::union(derivations_iter) + .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()) + .collect::>(); + for synonym in synonyms { + // TODO: cache resolve_phrase? + docids.push(resolve_phrase(index, txn, db_cache, synonym)?); + } + if let Some((left, right)) = split_words { + if let Some(split_word_docids) = + db_cache.get_word_pair_proximity_docids(index, txn, left, right, 1)? + { + docids.push(CboRoaringBitmapCodec::deserialize_from(split_word_docids)?); + } + } + + MultiOps::union(docids) } }; let _ = self.cache.insert(node_idx, docids); @@ -90,19 +107,19 @@ pub fn resolve_query_graph<'transaction>( let predecessors_docids = MultiOps::union(predecessors_iter); let n = &q.nodes[node as usize]; - // println!("resolving {node} {n:?}, predecessors: {predecessors:?}, their docids: {predecessors_docids:?}"); + let node_docids = match n { - super::QueryNode::Term(located_term) => { + QueryNode::Term(located_term) => { let term = &located_term.value; let derivations_docids = node_docids_cache.get_docids(index, txn, db_cache, term, node)?; predecessors_docids & derivations_docids } - super::QueryNode::Deleted => { + QueryNode::Deleted => { panic!() } - super::QueryNode::Start => universe.clone(), - super::QueryNode::End => { + QueryNode::Start => universe.clone(), + QueryNode::End => { return Ok(predecessors_docids); } }; @@ -125,3 +142,80 @@ pub fn resolve_query_graph<'transaction>( panic!() } + +pub fn resolve_phrase<'transaction>( + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + phrase: &Phrase, +) -> Result { + let Phrase { words } = phrase; + let mut candidates = RoaringBitmap::new(); + let mut first_iter = true; + let winsize = words.len().min(3); + + if words.is_empty() { + return Ok(candidates); + } + + for win in words.windows(winsize) { + // Get all the documents with the matching distance for each word pairs. + let mut bitmaps = Vec::with_capacity(winsize.pow(2)); + for (offset, s1) in win + .iter() + .enumerate() + .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) + { + for (dist, s2) in win + .iter() + .skip(offset + 1) + .enumerate() + .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) + { + if dist == 0 { + match db_cache.get_word_pair_proximity_docids(index, txn, s1, s2, 1)? { + Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?), + // If there are no documents for this pair, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } + } else { + let mut bitmap = RoaringBitmap::new(); + for dist in 0..=dist { + if let Some(m) = db_cache.get_word_pair_proximity_docids( + index, + txn, + s1, + s2, + dist as u8 + 1, + )? 
{ + bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?; + } + } + if bitmap.is_empty() { + return Ok(bitmap); + } else { + bitmaps.push(bitmap); + } + } + } + } + + // We sort the bitmaps so that we perform the small intersections first, which is faster. + bitmaps.sort_unstable_by_key(|a| a.len()); + + for bitmap in bitmaps { + if first_iter { + candidates = bitmap; + first_iter = false; + } else { + candidates &= bitmap; + } + // There will be no match, return early + if candidates.is_empty() { + break; + } + } + } + Ok(candidates) +} diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index e4513eea0..da4599ec5 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -99,14 +99,17 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { )?; let child_query_graph = query_graph.clone(); - // TODO: Check whether a position exists in the graph before removing it and - // returning the next bucket. - // while graph.does_not_contain(positions_to_remove.last()) { positions_to_remove.pop() } - if self.positions_to_remove.is_empty() { - self.exhausted = true; - } else { - let position_to_remove = self.positions_to_remove.pop().unwrap(); - query_graph.remove_words_at_position(position_to_remove); + loop { + if self.positions_to_remove.is_empty() { + self.exhausted = true; + break; + } else { + let position_to_remove = self.positions_to_remove.pop().unwrap(); + let did_delete_any_node = query_graph.remove_words_at_position(position_to_remove); + if did_delete_any_node { + break; + } + } } Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket })) From aa414565bb184e626dfb5ae64d00d1f2371ef91d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sun, 5 Mar 2023 19:49:39 +0100 Subject: [PATCH 036/234] Fix proximity graph edge builder to include all proximities --- milli/src/search/new/ranking_rule_graph/proximity/build.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 9b4fa8edf..0d7e68272 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -108,7 +108,7 @@ pub fn visit_to_node<'transaction, 'from_data>( if updb2 { for word1 in derivations1.clone() { - for proximity in 0..(7 - ngram_len2) { + for proximity in 1..=(8 - ngram_len2) { let cost = (proximity + ngram_len2 - 1) as u8; if db_cache .get_word_prefix_pair_proximity_docids( @@ -139,7 +139,7 @@ pub fn visit_to_node<'transaction, 'from_data>( let product_derivations = derivations1.cartesian_product(derivations2); for (word1, word2) in product_derivations { - for proximity in 0..(7 - ngram_len2) { + for proximity in 1..=(8 - ngram_len2) { let cost = (proximity + ngram_len2 - 1) as u8; // TODO: do the opposite way with a proximity penalty as well! // search for (word2, word1, proximity-1), I guess? 
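A quick check of the fix above: with the cost formula `(proximity + ngram_len2 - 1)`, the corrected range `1..=(8 - ngram_len2)` enumerates every proximity the pair databases can answer for, while the old `0..(7 - ngram_len2)` both queried proximity 0 and stopped short of the largest stored proximities. A minimal standalone Rust sketch of the mapping (not part of the patch, and assuming, as the surrounding code suggests, that word pairs are indexed for proximities 1 through 7):

fn edge_cost(proximity: u8, ngram_len2: u8) -> u8 {
    // same cost formula as in visit_to_node above
    proximity + ngram_len2 - 1
}

fn main() {
    for ngram_len2 in 1..=3u8 {
        // corrected range from the patch
        for proximity in 1..=(8 - ngram_len2) {
            println!("ngram_len2={ngram_len2} proximity={proximity} cost={}", edge_cost(proximity, ngram_len2));
        }
    }
}

For ngram_len2 = 1 this covers proximities 1..=7 (costs 1..=7), where the old range only reached proximities 0..=5.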
From 23931f8a4f1de0964de3eef8d1c285deb6d0d39c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Sun, 5 Mar 2023 19:50:01 +0100
Subject: [PATCH 037/234] Fix small bug in visual logger of search algo

---
 milli/src/search/new/logger/detailed.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs
index d2ce627dc..a9f4ee045 100644
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@@ -282,7 +282,9 @@ results.{random} {{
 },
 SearchEvents::WordsState { query_graph } => {
 let cur_ranking_rule = timestamp.len() - 1;
+ *timestamp.last_mut().unwrap() += 1;
 let cur_activated_id = activated_id(&timestamp);
+ *timestamp.last_mut().unwrap() -= 1;
 let id = format!("{cur_ranking_rule}.{cur_activated_id}");
 let new_file_path = self.folder_path.join(format!("{id}.d2"));
 let mut new_file = std::fs::File::create(new_file_path).unwrap();
@@ -295,7 +297,9 @@ results.{random} {{
 },
 SearchEvents::ProximityState { graph, paths, empty_paths_cache, universe, distances, cost } => {
 let cur_ranking_rule = timestamp.len() - 1;
+ *timestamp.last_mut().unwrap() += 1;
 let cur_activated_id = activated_id(&timestamp);
+ *timestamp.last_mut().unwrap() -= 1;
 let id = format!("{cur_ranking_rule}.{cur_activated_id}");
 let new_file_path = self.folder_path.join(format!("{id}.d2"));
 let mut new_file = std::fs::File::create(new_file_path).unwrap();
@@ -309,7 +313,9 @@ results.{random} {{
 },
 SearchEvents::TypoState { graph, paths, empty_paths_cache, universe, distances, cost } => {
 let cur_ranking_rule = timestamp.len() - 1;
+ *timestamp.last_mut().unwrap() += 1;
 let cur_activated_id = activated_id(&timestamp);
+ *timestamp.last_mut().unwrap() -= 1;
 let id = format!("{cur_ranking_rule}.{cur_activated_id}");
 let new_file_path = self.folder_path.join(format!("{id}.d2"));
 let mut new_file = std::fs::File::create(new_file_path).unwrap();

From c4979a2fda4fc31ac8cb6851dd28cbe7080425b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Sun, 5 Mar 2023 20:07:37 +0100
Subject: [PATCH 038/234] Fix code visibility issue + unimplemented detail in
 proximity rule

---
 milli/src/search/new/db_cache.rs | 8 +-
 milli/src/search/new/logger/detailed.rs | 10 +--
 milli/src/search/new/mod.rs | 28 +++---
 .../new/ranking_rule_graph/paths_map.rs | 4 +
 .../new/ranking_rule_graph/proximity/build.rs | 88 +++++++++++++------
 .../proximity/compute_docids.rs | 7 +-
 .../new/ranking_rule_graph/proximity/mod.rs | 3 +-
 .../new/ranking_rule_graph/resolve_paths.rs | 7 ++
 milli/src/search/new/ranking_rules.rs | 14 ---
 9 files changed, 99 insertions(+), 70 deletions(-)

diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index 4232cadaa..f2f8f12c5 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -121,18 +121,18 @@ impl<'transaction> DatabaseCache<'transaction> {
 &mut self,
 index: &Index,
 txn: &'transaction RoTxn,
- word1: &str,
- prefix2: &str,
+ left_prefix: &str,
+ right: &str,
 proximity: u8,
 ) -> Result<Option<&'transaction [u8]>> {
- let key = (proximity, prefix2.to_owned(), word1.to_owned());
+ let key = (proximity, left_prefix.to_owned(), right.to_owned());
 match self.prefix_word_pair_proximity_docids.entry(key) {
 Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
 Entry::Vacant(entry) => {
 let bitmap_ptr = index
 .prefix_word_pair_proximity_docids
 .remap_data_type::<ByteSlice>()
 .get(txn, &(proximity, left_prefix, right))?;
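 // note: the Option itself is cached, so a pair absent from the database is remembered as None and not fetched from LMDB again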
entry.insert(bitmap_ptr); Ok(bitmap_ptr) } diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index a9f4ee045..c2415837d 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -194,7 +194,7 @@ impl DetailedSearchLogger { for event in self.events.iter() { match event { SearchEvents::RankingRuleStartIteration { ranking_rule_idx, time, .. } => { - let elapsed = time.duration_since(prev_time); + let _elapsed = time.duration_since(prev_time); prev_time = *time; let parent_activated_id = activated_id(×tamp); timestamp.push(0); @@ -216,7 +216,7 @@ impl DetailedSearchLogger { }}").unwrap(); } SearchEvents::RankingRuleNextBucket { ranking_rule_idx, time, universe, candidates } => { - let elapsed = time.duration_since(prev_time); + let _elapsed = time.duration_since(prev_time); prev_time = *time; let old_activated_id = activated_id(×tamp); // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); @@ -227,7 +227,7 @@ impl DetailedSearchLogger { .unwrap(); } SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates, time } => { - let elapsed = time.duration_since(prev_time); + let _elapsed = time.duration_since(prev_time); prev_time = *time; let old_activated_id = activated_id(×tamp); // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); @@ -239,7 +239,7 @@ impl DetailedSearchLogger { .unwrap(); } SearchEvents::RankingRuleEndIteration { ranking_rule_idx, time, .. } => { - let elapsed = time.duration_since(prev_time); + let _elapsed = time.duration_since(prev_time); prev_time = *time; let cur_activated_id = activated_id(×tamp); // writeln!(&mut file, "time.{cur_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); @@ -332,7 +332,7 @@ results.{random} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc(node_idx: usize, node: &QueryNode, distances: &[u64], file: &mut File) { + fn query_node_d2_desc(node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) { match &node { QueryNode::Term(LocatedQueryTerm { value, .. }) => { match value { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index e09fe2300..94120cd8a 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,18 +1,22 @@ -pub mod db_cache; -pub mod graph_based_ranking_rule; -pub mod logger; -pub mod query_graph; -pub mod query_term; -pub mod ranking_rule_graph; -pub mod ranking_rules; -pub mod resolve_query_graph; -pub mod sort; -pub mod words; +mod db_cache; +mod graph_based_ranking_rule; +mod logger; +mod query_graph; +mod query_term; +mod ranking_rule_graph; +mod ranking_rules; +mod resolve_query_graph; +mod sort; +mod words; use charabia::Tokenize; use heed::RoTxn; -pub use query_graph::*; -pub use ranking_rules::*; + +use query_graph::{QueryGraph, QueryNode}; +pub use ranking_rules::{ + execute_search, RankingRule, RankingRuleOutput, RankingRuleOutputIter, + RankingRuleOutputIterWrapper, RankingRuleQueryTrait, +}; use roaring::RoaringBitmap; use self::db_cache::DatabaseCache; diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 3b01508c9..b9d089efc 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -6,6 +6,10 @@ use roaring::RoaringBitmap; use super::cheapest_paths::Path; +// What is PathsMap used for? 
+// For the empty_prefixes field in the EmptyPathsCache only :/ +// but it could be used for more, like efficient computing of a set of paths + #[derive(Debug, Clone)] pub struct PathsMap { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 0d7e68272..fbe3c8169 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -13,16 +13,14 @@ use crate::{Index, Result}; pub fn visit_from_node(from_node: &QueryNode) -> Result> { Ok(Some(match from_node { - QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => { - match value1 { - QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()), - QueryTerm::Phrase { phrase: phrase1 } => { - // TODO: remove second unwrap - let original = phrase1.words.last().unwrap().as_ref().unwrap().clone(); + QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => match value1 { + QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()), + QueryTerm::Phrase { phrase: phrase1 } => { + if let Some(original) = phrase1.words.last().unwrap().as_ref() { ( WordDerivations { original: original.clone(), - zero_typo: vec![original], + zero_typo: vec![original.to_owned()], one_typo: vec![], two_typos: vec![], use_prefix_db: false, @@ -31,9 +29,12 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result ( WordDerivations { original: String::new(), @@ -68,26 +69,27 @@ pub fn visit_to_node<'transaction, 'from_data>( let (derivations2, pos2, ngram_len2) = match value2 { QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()), QueryTerm::Phrase { phrase: phrase2 } => { - // TODO: remove second unwrap - let original = phrase2.words.last().unwrap().as_ref().unwrap().clone(); - ( - WordDerivations { - original: original.clone(), - zero_typo: vec![original], - one_typo: vec![], - two_typos: vec![], - use_prefix_db: false, - synonyms: vec![], - split_words: None, - }, - *pos2.start(), - 1, - ) + if let Some(original) = phrase2.words.first().unwrap().as_ref() { + ( + WordDerivations { + original: original.clone(), + zero_typo: vec![original.to_owned()], + one_typo: vec![], + two_typos: vec![], + use_prefix_db: false, + synonyms: vec![], + split_words: None, + }, + *pos2.start(), + 1, + ) + } else { + // No word pairs if the phrase does not have a regular word as its first term + return Ok(vec![]); + } } }; - // TODO: here we would actually do it for each combination of word1 and word2 - // and take the union of them if pos1 + 1 != pos2 { // TODO: how should this actually be handled? // We want to effectively ignore this pair of terms @@ -130,19 +132,37 @@ pub fn visit_to_node<'transaction, 'from_data>( right_prefix: original_word_2.to_owned(), }); } + if db_cache + .get_prefix_word_pair_proximity_docids( + index, + txn, + original_word_2.as_str(), + word1.as_str(), + proximity as u8 - 1, + )? + .is_some() + { + cost_proximity_word_pairs + .entry(cost) + .or_default() + .entry(proximity as u8) + .or_default() + .push(WordPair::WordPrefixSwapped { + left_prefix: original_word_2.to_owned(), + right: word1.to_owned(), + }); + } } } } let derivations2 = derivations2.all_derivations_except_prefix_db(); - // TODO: safeguard in case the cartesian product is too large? + // TODO: add safeguard in case the cartesian product is too large? 
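+ // (a possible safeguard, not implemented here: bound the number of
+ // (word1, word2) combinations considered, e.g. at most a few thousand pairs)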
let product_derivations = derivations1.cartesian_product(derivations2); for (word1, word2) in product_derivations { for proximity in 1..=(8 - ngram_len2) { let cost = (proximity + ngram_len2 - 1) as u8; - // TODO: do the opposite way with a proximity penalty as well! - // search for (word2, word1, proximity-1), I guess? if db_cache .get_word_pair_proximity_docids(index, txn, word1, word2, proximity as u8)? .is_some() @@ -154,6 +174,18 @@ pub fn visit_to_node<'transaction, 'from_data>( .or_default() .push(WordPair::Words { left: word1.to_owned(), right: word2.to_owned() }); } + if proximity > 1 + && db_cache + .get_word_pair_proximity_docids(index, txn, word2, word1, proximity as u8 - 1)? + .is_some() + { + cost_proximity_word_pairs + .entry(cost) + .or_default() + .entry(proximity as u8 - 1) + .or_default() + .push(WordPair::Words { left: word2.to_owned(), right: word1.to_owned() }); + } } } let mut new_edges = cost_proximity_word_pairs diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 908f50ef6..34f7deea1 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -20,11 +20,8 @@ pub fn compute_docids<'transaction>( } WordPair::WordPrefix { left, right_prefix } => db_cache .get_word_prefix_pair_proximity_docids(index, txn, left, right_prefix, *proximity), - WordPair::WordsSwapped { left, right } => { - db_cache.get_word_pair_proximity_docids(index, txn, left, right, *proximity) - } - WordPair::WordPrefixSwapped { left, right_prefix } => db_cache - .get_prefix_word_pair_proximity_docids(index, txn, left, right_prefix, *proximity), + WordPair::WordPrefixSwapped { left_prefix, right } => db_cache + .get_prefix_word_pair_proximity_docids(index, txn, left_prefix, right, *proximity), }?; let bitmap = bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 5b3869ea8..c0dbbefa9 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -18,9 +18,8 @@ use crate::{Index, Result}; #[derive(Debug, Clone)] pub enum WordPair { Words { left: String, right: String }, - WordsSwapped { left: String, right: String }, WordPrefix { left: String, right_prefix: String }, - WordPrefixSwapped { left: String, right_prefix: String }, + WordPrefixSwapped { left_prefix: String, right: String }, } #[derive(Clone)] diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs index f3394206b..94a51756e 100644 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs @@ -24,9 +24,13 @@ impl RankingRuleGraph { mut paths: Vec>, ) -> Result { paths.sort_unstable(); + // let mut needs_filtering_empty_edges = false; + // let mut needs_filtering_empty_prefix = false; + // let mut needs_filtering_empty_couple_edges = false; let mut needs_filtering = false; let mut path_bitmaps = vec![]; 'path_loop: loop { + // TODO: distinguish between empty_edges, empty_prefix, and empty_couple_edges filtering if needs_filtering { for path in paths.iter_mut() { if empty_paths_cache.path_is_empty(path) { @@ -61,11 +65,13 @@ impl RankingRuleGraph { 
self.remove_edge(edge_index); edge_docids_cache.cache.remove(&edge_index); needs_filtering = true; + // needs_filtering_empty_edges = true; // 3. continue executing this function again on the remaining paths continue 'path_loop; } else { path_bitmap &= edge_docids; if path_bitmap.is_disjoint(universe) { + // needs_filtering_empty_prefix = true; needs_filtering = true; empty_paths_cache.forbid_prefix(&visited_edges); // if the intersection between this edge and any @@ -76,6 +82,7 @@ impl RankingRuleGraph { { let intersection = edge_docids & edge_docids2; if intersection.is_disjoint(universe) { + // needs_filtering_empty_couple_edges = true; empty_paths_cache .forbid_couple_edges(*edge_index2, edge_index); } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 9b3bcb38c..bc0523b31 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -98,20 +98,6 @@ pub struct RankingRuleOutput { pub candidates: RoaringBitmap, } -#[allow(unused)] -pub fn get_start_universe<'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, - query_graph: &QueryGraph, - term_matching_strategy: TermsMatchingStrategy, - // filters: Filters, -) -> Result { - // TODO: actually compute the universe from the query graph - let universe = index.documents_ids(txn).unwrap(); - Ok(universe) -} - // TODO: can make it generic over the query type (either query graph or placeholder) fairly easily #[allow(clippy::too_many_arguments)] pub fn execute_search<'transaction>( From cab2b6bcdac8291c5a0ab71ec5becec488ad98a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 6 Mar 2023 08:35:01 +0100 Subject: [PATCH 039/234] Fix: computation of initial universe, code organisation --- .../search/new/graph_based_ranking_rule.rs | 4 +- milli/src/search/new/logger/detailed.rs | 54 ++-- milli/src/search/new/logger/mod.rs | 23 +- milli/src/search/new/mod.rs | 124 +++++--- milli/src/search/new/query_graph.rs | 22 +- milli/src/search/new/query_term.rs | 275 ++++++++++-------- .../ranking_rule_graph/edge_docids_cache.rs | 3 +- .../src/search/new/ranking_rule_graph/mod.rs | 23 +- .../new/ranking_rule_graph/proximity/build.rs | 1 + milli/src/search/new/ranking_rules.rs | 75 ++--- milli/src/search/new/words.rs | 12 +- 11 files changed, 341 insertions(+), 275 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index e5a0fbad6..d1f5864aa 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -3,8 +3,8 @@ use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; use super::logger::SearchLogger; -use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache; -use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; +use super::ranking_rule_graph::EdgeDocidsCache; +use super::ranking_rule_graph::EmptyPathsCache; use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput}; diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index c2415837d..4282db27f 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -5,13 +5,13 @@ use std::fs::File; use std::time::Instant; use std::{io::Write, path::PathBuf}; -use crate::new::ranking_rule_graph::typo::TypoGraph; +use 
crate::new::ranking_rule_graph::TypoGraph; use crate::new::{QueryNode, QueryGraph}; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; -use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; +use crate::new::ranking_rule_graph::EmptyPathsCache; use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::ranking_rule_graph::{ - proximity::ProximityGraph, RankingRuleGraph, + ProximityGraph, RankingRuleGraph, }; use super::{RankingRule, SearchLogger}; @@ -21,18 +21,18 @@ pub enum SearchEvents { ranking_rule_idx: usize, query: QueryGraph, universe: RoaringBitmap, - time: Instant, + time: Instant }, RankingRuleNextBucket { ranking_rule_idx: usize, universe: RoaringBitmap, candidates: RoaringBitmap, - time: Instant, + time: Instant }, RankingRuleEndIteration { ranking_rule_idx: usize, universe: RoaringBitmap, - time: Instant, + time: Instant }, ExtendResults { new: Vec, @@ -56,13 +56,14 @@ pub enum SearchEvents { distances: Vec>, cost: u64, }, - RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant, }, + RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant }, } pub struct DetailedSearchLogger { folder_path: PathBuf, initial_query: Option, initial_query_time: Option, + query_for_universe: Option, initial_universe: Option, ranking_rules_ids: Option>, events: Vec, @@ -73,6 +74,7 @@ impl DetailedSearchLogger { folder_path: PathBuf::new().join(folder_path), initial_query: None, initial_query_time: None, + query_for_universe: None, initial_universe: None, ranking_rules_ids: None, events: vec![], @@ -81,9 +83,13 @@ impl DetailedSearchLogger { } impl SearchLogger for DetailedSearchLogger { - fn initial_query(&mut self, query: &QueryGraph, time: Instant) { + fn initial_query(&mut self, query: &QueryGraph) { self.initial_query = Some(query.clone()); - self.initial_query_time = Some(time); + self.initial_query_time = Some(Instant::now()); + } + + fn query_for_universe(&mut self, query: &QueryGraph) { + self.query_for_universe = Some(query.clone()); } fn initial_universe(&mut self, universe: &RoaringBitmap) { @@ -99,13 +105,13 @@ impl SearchLogger for DetailedSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, query: &QueryGraph, universe: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleStartIteration { ranking_rule_idx, query: query.clone(), universe: universe.clone(), - time, + time: Instant::now(), }) } @@ -115,13 +121,13 @@ impl SearchLogger for DetailedSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, candidates: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleNextBucket { ranking_rule_idx, universe: universe.clone(), candidates: candidates.clone(), - time, + time: Instant::now(), }) } fn skip_bucket_ranking_rule<'transaction>( @@ -129,12 +135,12 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, candidates: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates: candidates.clone(), - time + time: Instant::now() }) } @@ -143,12 +149,12 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleEndIteration { 
ranking_rule_idx, universe: universe.clone(), - time + time: Instant::now() }) } fn add_to_results(&mut self, docids: &[u32]) { @@ -184,6 +190,20 @@ impl DetailedSearchLogger { let index_path = self.folder_path.join("index.d2"); let mut file = std::fs::File::create(index_path).unwrap(); + writeln!(&mut file, "direction: right").unwrap(); + writeln!(&mut file, "Initial Query Graph: {{").unwrap(); + let initial_query_graph = self.initial_query.as_ref().unwrap(); + Self::query_graph_d2_description(initial_query_graph, &mut file); + writeln!(&mut file, "}}").unwrap(); + + writeln!(&mut file, "Query Graph Used To Compute Universe: {{").unwrap(); + let query_graph_for_universe = self.query_for_universe.as_ref().unwrap(); + Self::query_graph_d2_description(query_graph_for_universe, &mut file); + writeln!(&mut file, "}}").unwrap(); + + let initial_universe = self.initial_universe.as_ref().unwrap(); + writeln!(&mut file, "Initial Universe Length {}", initial_universe.len()).unwrap(); + writeln!(&mut file, "Control Flow Between Ranking Rules: {{").unwrap(); writeln!(&mut file, "shape: sequence_diagram").unwrap(); for (idx, rr_id) in self.ranking_rules_ids.as_ref().unwrap().iter().enumerate() { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 079bb892c..9a141c1c6 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -2,19 +2,17 @@ pub mod detailed; use roaring::RoaringBitmap; -use std::time::Instant; use super::{ - ranking_rule_graph::{ - empty_paths_cache::EmptyPathsCache, proximity::ProximityGraph, typo::TypoGraph, - RankingRuleGraph, - }, + ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph}, RankingRule, RankingRuleQueryTrait, }; pub struct DefaultSearchLogger; impl SearchLogger for DefaultSearchLogger { - fn initial_query(&mut self, _query: &Q, _time: Instant) {} + fn initial_query(&mut self, _query: &Q) {} + + fn query_for_universe(&mut self, _query: &Q) {} fn initial_universe(&mut self, _universe: &RoaringBitmap) {} @@ -26,7 +24,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, Q>, _query: &Q, _universe: &RoaringBitmap, - _time: Instant, ) { } @@ -36,7 +33,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, Q>, _universe: &RoaringBitmap, _candidates: &RoaringBitmap, - _time: Instant, ) { } fn skip_bucket_ranking_rule<'transaction>( @@ -44,7 +40,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _candidates: &RoaringBitmap, - _time: Instant, ) { } @@ -53,7 +48,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _universe: &RoaringBitmap, - _time: Instant, ) { } @@ -85,7 +79,10 @@ impl SearchLogger for DefaultSearchLogger { } pub trait SearchLogger { - fn initial_query(&mut self, query: &Q, time: Instant); + fn initial_query(&mut self, query: &Q); + + fn query_for_universe(&mut self, query: &Q); + fn initial_universe(&mut self, universe: &RoaringBitmap); fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]); @@ -96,7 +93,6 @@ pub trait SearchLogger { ranking_rule: &dyn RankingRule<'transaction, Q>, query: &Q, universe: &RoaringBitmap, - time: Instant, ); fn next_bucket_ranking_rule<'transaction>( &mut self, @@ -104,21 +100,18 @@ pub trait SearchLogger { ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, candidates: 
&RoaringBitmap, - time: Instant, ); fn skip_bucket_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, candidates: &RoaringBitmap, - time: Instant, ); fn end_iteration_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, - time: Instant, ); fn add_to_results(&mut self, docids: &[u32]); diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 94120cd8a..3e9b43f1b 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -9,55 +9,113 @@ mod resolve_query_graph; mod sort; mod words; -use charabia::Tokenize; -use heed::RoTxn; +use std::collections::BTreeSet; -use query_graph::{QueryGraph, QueryNode}; pub use ranking_rules::{ - execute_search, RankingRule, RankingRuleOutput, RankingRuleOutputIter, + apply_ranking_rules, RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper, RankingRuleQueryTrait, }; + +use crate::{ + new::query_term::located_query_terms_from_string, Filter, Index, Result, TermsMatchingStrategy, +}; +use charabia::Tokenize; +use db_cache::DatabaseCache; +use heed::RoTxn; +use query_graph::{QueryGraph, QueryNode}; use roaring::RoaringBitmap; -use self::db_cache::DatabaseCache; -use self::query_term::{word_derivations, LocatedQueryTerm}; -use crate::{Index, Result}; +use self::{ + logger::SearchLogger, + resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}, +}; pub enum BitmapOrAllRef<'s> { Bitmap(&'s RoaringBitmap), All, } -pub fn make_query_graph<'transaction>( +#[allow(clippy::too_many_arguments)] +pub fn resolve_maximally_reduced_query_graph<'transaction>( index: &Index, - txn: &RoTxn, + txn: &'transaction heed::RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + universe: &RoaringBitmap, + query_graph: &QueryGraph, + node_docids_cache: &mut NodeDocIdsCache, + matching_strategy: TermsMatchingStrategy, + logger: &mut dyn SearchLogger, +) -> Result { + let mut graph = query_graph.clone(); + let mut positions_to_remove = match matching_strategy { + TermsMatchingStrategy::Last => { + let mut all_positions = BTreeSet::new(); + for n in query_graph.nodes.iter() { + match n { + QueryNode::Term(term) => { + all_positions.extend(term.positions.clone().into_iter()); + } + QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} + } + } + all_positions.into_iter().collect() + } + TermsMatchingStrategy::All => vec![], + }; + // don't remove the first term + positions_to_remove.remove(0); + loop { + if positions_to_remove.is_empty() { + break; + } else { + let position_to_remove = positions_to_remove.pop().unwrap(); + let _ = graph.remove_words_at_position(position_to_remove); + } + } + logger.query_for_universe(&graph); + let docids = resolve_query_graph(index, txn, db_cache, node_docids_cache, &graph, universe)?; + + Ok(docids) +} + +#[allow(clippy::too_many_arguments)] +pub fn execute_search<'transaction>( + index: &Index, + txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, query: &str, -) -> Result { + filters: Option, + from: usize, + length: usize, + logger: &mut dyn SearchLogger, +) -> Result> { assert!(!query.is_empty()); - let authorize_typos = index.authorize_typos(txn)?; - let min_len_one_typo = index.min_word_len_one_typo(txn)?; - let min_len_two_typos = index.min_word_len_two_typos(txn)?; + let query_terms = located_query_terms_from_string(index, txn, query.tokenize(), None).unwrap(); + let graph = QueryGraph::from_query(index, 
txn, db_cache, query_terms)?; - let exact_words = index.exact_words(txn)?; - let fst = index.words_fst(txn)?; + logger.initial_query(&graph); - // TODO: get rid of this closure - // also, ngrams can have one typo? - let query = LocatedQueryTerm::from_query(query.tokenize(), None, move |word, is_prefix| { - let typos = if !authorize_typos - || word.len() < min_len_one_typo as usize - || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) - { - 0 - } else if word.len() < min_len_two_typos as usize { - 1 - } else { - 2 - }; - word_derivations(index, txn, word, typos, is_prefix, &fst) - }) - .unwrap(); - let graph = QueryGraph::from_query(index, txn, db_cache, query)?; - Ok(graph) + let universe = if let Some(filters) = filters { + filters.evaluate(txn, index)? + } else { + index.documents_ids(txn)? + }; + + let mut node_docids_cache = NodeDocIdsCache::default(); + + let universe = resolve_maximally_reduced_query_graph( + index, + txn, + db_cache, + &universe, + &graph, + &mut node_docids_cache, + TermsMatchingStrategy::Last, + logger, + )?; + // TODO: create ranking rules here, reuse the node docids cache for the words ranking rule + + logger.initial_universe(&universe); + + apply_ranking_rules(index, txn, db_cache, &graph, &universe, from, length, logger) } diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 8178f8ded..e86c175af 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -4,7 +4,7 @@ use heed::RoTxn; use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; -use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::{Index, Result}; #[derive(Debug, Clone)] @@ -31,6 +31,7 @@ pub struct QueryGraph { } fn _assert_sizes() { + // TODO: QueryNodes are too big now, 184B is an unreasonable size let _: [u8; 184] = [0; std::mem::size_of::()]; let _: [u8; 48] = [0; std::mem::size_of::()]; } @@ -75,7 +76,7 @@ impl QueryGraph { index: &Index, txn: &RoTxn, _db_cache: &mut DatabaseCache<'transaction>, - query: Vec, + terms: Vec, ) -> Result { // TODO: maybe empty nodes should not be removed here, to compute // the score of the `words` ranking rule correctly @@ -90,8 +91,8 @@ impl QueryGraph { (vec![], vec![], vec![graph.root_node]); // TODO: split words / synonyms - for length in 1..=query.len() { - let query = &query[..length]; + for length in 1..=terms.len() { + let query = &terms[..length]; let term0 = query.last().unwrap(); @@ -104,7 +105,7 @@ impl QueryGraph { if !prev1.is_empty() { if let Some((ngram2_str, ngram2_pos)) = - LocatedQueryTerm::ngram2(&query[length - 2], &query[length - 1]) + query_term::ngram2(&query[length - 2], &query[length - 1]) { if word_set.contains(ngram2_str.as_bytes()) { let ngram2 = LocatedQueryTerm { @@ -128,11 +129,9 @@ impl QueryGraph { } } if !prev2.is_empty() { - if let Some((ngram3_str, ngram3_pos)) = LocatedQueryTerm::ngram3( - &query[length - 3], - &query[length - 2], - &query[length - 1], - ) { + if let Some((ngram3_str, ngram3_pos)) = + query_term::ngram3(&query[length - 3], &query[length - 2], &query[length - 1]) + { if word_set.contains(ngram3_str.as_bytes()) { let ngram3 = LocatedQueryTerm { value: QueryTerm::Word { @@ -143,8 +142,9 @@ impl QueryGraph { one_typo: vec![], two_typos: vec![], use_prefix_db: false, - synonyms: vec![], // TODO: ngram synonyms + synonyms: vec![], // TODO: ngram synonyms split_words: None, // TODO: maybe ngram split words? 
+ // would be nice for typos like su nflower }, }, positions: ngram3_pos, diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 9ea72aa3a..3820b8ed0 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -178,9 +178,15 @@ fn split_best_frequency( #[derive(Debug, Clone)] pub enum QueryTerm { + // TODO: should there be SplitWord, NGram2, and NGram3 variants? + // NGram2 can have 1 typo and synonyms + // NGram3 cannot have typos but can have synonyms + // SplitWords are a phrase + // Can NGrams be prefixes? Phrase { phrase: Phrase }, Word { derivations: WordDerivations }, } + impl QueryTerm { pub fn original_single_word(&self) -> Option<&str> { match self { @@ -209,53 +215,77 @@ impl LocatedQueryTerm { QueryTerm::Word { derivations, .. } => derivations.is_empty(), } } - /// Create primitive query from tokenized query string, - /// the primitive query is an intermediate state to build the query tree. - pub fn from_query( - query: NormalizedTokenIter>, - words_limit: Option, - // TODO:` use index + txn + ? instead of closure - derivations: impl Fn(&str, bool) -> Result, - ) -> Result> { - let mut primitive_query = Vec::new(); - let mut phrase = Vec::new(); +} - let mut quoted = false; +pub fn located_query_terms_from_string<'transaction>( + index: &Index, + txn: &'transaction RoTxn, + query: NormalizedTokenIter>, + words_limit: Option, +) -> Result> { + let authorize_typos = index.authorize_typos(txn)?; + let min_len_one_typo = index.min_word_len_one_typo(txn)?; + let min_len_two_typos = index.min_word_len_two_typos(txn)?; - let parts_limit = words_limit.unwrap_or(usize::MAX); + let exact_words = index.exact_words(txn)?; + let fst = index.words_fst(txn)?; - let mut position = -1i8; - let mut phrase_start = -1i8; - let mut phrase_end = -1i8; + let nbr_typos = |word: &str| { + if !authorize_typos + || word.len() < min_len_one_typo as usize + || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) + { + 0 + } else if word.len() < min_len_two_typos as usize { + 1 + } else { + 2 + } + }; - let mut peekable = query.peekable(); - while let Some(token) = peekable.next() { - // early return if word limit is exceeded - if primitive_query.len() >= parts_limit { - return Ok(primitive_query); - } + let derivations = |word: &str, is_prefix: bool| { + word_derivations(index, txn, word, nbr_typos(word), is_prefix, &fst) + }; - match token.kind { - TokenKind::Word | TokenKind::StopWord => { - position += 1; - // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, - // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, - // 3. if the word is the last token of the query we push it as a prefix word. 
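// Standalone sketch of the typo budget computed by the `nbr_typos` closure
// above; the thresholds used here are assumptions matching milli's default
// settings (one typo from 5 letters, two typos from 9), and `is_exact`
// stands in for the exact-words FST lookup.
fn typo_budget(word: &str, authorize_typos: bool, is_exact: bool) -> u8 {
    const MIN_LEN_ONE_TYPO: usize = 5;
    const MIN_LEN_TWO_TYPOS: usize = 9;
    if !authorize_typos || word.len() < MIN_LEN_ONE_TYPO || is_exact {
        0
    } else if word.len() < MIN_LEN_TWO_TYPOS {
        1
    } else {
        2
    }
}

fn main() {
    assert_eq!(typo_budget("sun", true, false), 0); // too short for typos
    assert_eq!(typo_budget("flower", true, false), 1); // 5 to 8 letters
    assert_eq!(typo_budget("sunflowers", true, false), 2); // 9+ letters
    assert_eq!(typo_budget("sunflowers", true, true), 0); // exact word
}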
- if quoted { - phrase_end = position; - if phrase.is_empty() { - phrase_start = position; - } - if let TokenKind::StopWord = token.kind { - phrase.push(None); - } else { - // TODO: in a phrase, check that every word exists - // otherwise return WordDerivations::Empty - phrase.push(Some(token.lemma().to_string())); - } - } else if peekable.peek().is_some() { - if let TokenKind::StopWord = token.kind { - } else { + let mut primitive_query = Vec::new(); + let mut phrase = Vec::new(); + + let mut quoted = false; + + let parts_limit = words_limit.unwrap_or(usize::MAX); + + let mut position = -1i8; + let mut phrase_start = -1i8; + let mut phrase_end = -1i8; + + let mut peekable = query.peekable(); + while let Some(token) = peekable.next() { + // early return if word limit is exceeded + if primitive_query.len() >= parts_limit { + return Ok(primitive_query); + } + + match token.kind { + TokenKind::Word | TokenKind::StopWord => { + position += 1; + // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, + // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, + // 3. if the word is the last token of the query we push it as a prefix word. + if quoted { + phrase_end = position; + if phrase.is_empty() { + phrase_start = position; + } + if let TokenKind::StopWord = token.kind { + phrase.push(None); + } else { + // TODO: in a phrase, check that every word exists + // otherwise return WordDerivations::Empty + phrase.push(Some(token.lemma().to_string())); + } + } else if peekable.peek().is_some() { + match token.kind { + TokenKind::Word => { let derivations = derivations(token.lemma(), false)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, @@ -263,100 +293,91 @@ impl LocatedQueryTerm { }; primitive_query.push(located_term); } - } else { - let derivations = derivations(token.lemma(), true)?; - let located_term = LocatedQueryTerm { - value: QueryTerm::Word { derivations }, - positions: position..=position, - }; - primitive_query.push(located_term); + TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} + } + } else { + let derivations = derivations(token.lemma(), true)?; + let located_term = LocatedQueryTerm { + value: QueryTerm::Word { derivations }, + positions: position..=position, + }; + primitive_query.push(located_term); + } + } + TokenKind::Separator(separator_kind) => { + match separator_kind { + SeparatorKind::Hard => { + position += 1; + } + SeparatorKind::Soft => { + position += 0; } } - TokenKind::Separator(separator_kind) => { - match separator_kind { - SeparatorKind::Hard => { - position += 1; - } - SeparatorKind::Soft => { - position += 0; - } - } - let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - // swap quoted state if we encounter a double quote - if quote_count % 2 != 0 { - quoted = !quoted; - } - // if there is a quote or a hard separator we close the phrase. - if !phrase.is_empty() - && (quote_count > 0 || separator_kind == SeparatorKind::Hard) - { - let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase { - phrase: Phrase { words: mem::take(&mut phrase) }, - }, - positions: phrase_start..=phrase_end, - }; - primitive_query.push(located_query_term); - } + let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + // swap quoted state if we encounter a double quote + if quote_count % 2 != 0 { + quoted = !quoted; + } + // if there is a quote or a hard separator we close the phrase. 
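// Small runnable sketch mirroring the separator handling above: an odd
// number of `"` characters in a separator token toggles the `quoted`
// state, while the position counter only advances on hard separators.
fn toggles_quoted(separator_lemma: &str) -> bool {
    separator_lemma.chars().filter(|&c| c == '"').count() % 2 != 0
}

fn main() {
    assert!(toggles_quoted("\"")); // an opening or closing quote
    assert!(!toggles_quoted("\"\"")); // a quote pair: state unchanged
    assert!(!toggles_quoted(", ")); // an ordinary separator
}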
+ if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) + { + let located_query_term = LocatedQueryTerm { + value: QueryTerm::Phrase { + phrase: Phrase { words: mem::take(&mut phrase) }, + }, + positions: phrase_start..=phrase_end, + }; + primitive_query.push(located_query_term); } - _ => (), } + _ => (), } - - // If a quote is never closed, we consider all of the end of the query as a phrase. - if !phrase.is_empty() { - let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } }, - positions: phrase_start..=phrase_end, - }; - primitive_query.push(located_query_term); - } - - Ok(primitive_query) } + + // If a quote is never closed, we consider all of the end of the query as a phrase. + if !phrase.is_empty() { + let located_query_term = LocatedQueryTerm { + value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } }, + positions: phrase_start..=phrase_end, + }; + primitive_query.push(located_query_term); + } + + Ok(primitive_query) } -impl LocatedQueryTerm { - pub fn ngram2( - x: &LocatedQueryTerm, - y: &LocatedQueryTerm, - ) -> Option<(String, RangeInclusive)> { - if *x.positions.end() != y.positions.start() - 1 { - println!( - "x positions end: {}, y positions start: {}", - *x.positions.end(), - y.positions.start() - ); - return None; - } - match (&x.value.original_single_word(), &y.value.original_single_word()) { - (Some(w1), Some(w2)) => { - let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end()); - Some(term) - } - _ => None, - } +// TODO: return a word derivations instead? +pub fn ngram2(x: &LocatedQueryTerm, y: &LocatedQueryTerm) -> Option<(String, RangeInclusive)> { + if *x.positions.end() != y.positions.start() - 1 { + return None; } - pub fn ngram3( - x: &LocatedQueryTerm, - y: &LocatedQueryTerm, - z: &LocatedQueryTerm, - ) -> Option<(String, RangeInclusive)> { - if *x.positions.end() != y.positions.start() - 1 - || *y.positions.end() != z.positions.start() - 1 - { - return None; - } - match ( - &x.value.original_single_word(), - &y.value.original_single_word(), - &z.value.original_single_word(), - ) { - (Some(w1), Some(w2), Some(w3)) => { - let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end()); - Some(term) - } - _ => None, + match (&x.value.original_single_word(), &y.value.original_single_word()) { + (Some(w1), Some(w2)) => { + let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end()); + Some(term) } + _ => None, + } +} +pub fn ngram3( + x: &LocatedQueryTerm, + y: &LocatedQueryTerm, + z: &LocatedQueryTerm, +) -> Option<(String, RangeInclusive)> { + if *x.positions.end() != y.positions.start() - 1 + || *y.positions.end() != z.positions.start() - 1 + { + return None; + } + match ( + &x.value.original_single_word(), + &y.value.original_single_word(), + &z.value.original_single_word(), + ) { + (Some(w1), Some(w2), Some(w3)) => { + let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end()); + Some(term) + } + _ => None, } } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index ef2eba895..3d48fd69c 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -41,11 +41,12 @@ impl EdgeDocidsCache { EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All), EdgeDetails::Data(details) => { if self.cache.contains_key(&edge_index) 
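// Illustrative sketch (plain strings instead of the query-term structs) of
// the adjacency rule enforced by the `ngram2`/`ngram3` functions above:
// terms may only be merged into an ngram when their positions are
// consecutive, and the candidate is the plain concatenation, e.g. "sun" +
// "flower" becoming "sunflower" (kept only if it exists in the words FST).
use std::ops::RangeInclusive;

fn ngram2_candidate(
    (w1, pos1): (&str, RangeInclusive<i8>),
    (w2, pos2): (&str, RangeInclusive<i8>),
) -> Option<(String, RangeInclusive<i8>)> {
    if *pos1.end() != pos2.start() - 1 {
        return None; // the terms are not adjacent: no ngram
    }
    Some((format!("{w1}{w2}"), *pos1.start()..=*pos2.end()))
}

fn main() {
    assert_eq!(
        ngram2_candidate(("sun", 0..=0), ("flower", 1..=1)),
        Some(("sunflower".to_string(), 0..=1))
    );
    assert_eq!(ngram2_candidate(("sun", 0..=0), ("flower", 2..=2)), None);
}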
{ + // TODO: should we update the bitmap in the cache if the new universe + // reduces it? return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here let docids = universe & G::compute_docids(index, txn, db_cache, details)?; - let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; Ok(BitmapOrAllRef::Bitmap(docids)) diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index ac5e1f46b..e65d5f70b 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -1,19 +1,22 @@ -pub mod build; -pub mod cheapest_paths; -pub mod edge_docids_cache; -pub mod empty_paths_cache; -pub mod paths_map; -pub mod proximity; -pub mod resolve_paths; -pub mod typo; +mod build; +mod cheapest_paths; +mod edge_docids_cache; +mod empty_paths_cache; +mod paths_map; +mod proximity; +mod resolve_paths; +mod typo; + +pub use edge_docids_cache::EdgeDocidsCache; +pub use empty_paths_cache::EmptyPathsCache; +pub use proximity::ProximityGraph; +pub use typo::TypoGraph; use std::ops::ControlFlow; use heed::RoTxn; use roaring::RoaringBitmap; -use self::empty_paths_cache::EmptyPathsCache; - use super::db_cache::DatabaseCache; use super::logger::SearchLogger; use super::{QueryGraph, QueryNode}; diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index fbe3c8169..06c860d7e 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -105,6 +105,7 @@ pub fn visit_to_node<'transaction, 'from_data>( assert!(!updb1); let derivations1 = derivations1.all_derivations_except_prefix_db(); + // TODO: eventually, we want to get rid of the uses from `orginal` let original_word_2 = derivations2.original.clone(); let mut cost_proximity_word_pairs = BTreeMap::>>::new(); diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index bc0523b31..9fa840ad5 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -1,5 +1,3 @@ -use std::time::Instant; - use heed::RoTxn; use roaring::RoaringBitmap; @@ -8,11 +6,11 @@ use super::logger::SearchLogger; use super::QueryGraph; use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; -use crate::new::ranking_rule_graph::proximity::ProximityGraph; -use crate::new::ranking_rule_graph::typo::TypoGraph; +use crate::new::ranking_rule_graph::ProximityGraph; +use crate::new::ranking_rule_graph::TypoGraph; use crate::new::words::Words; // use crate::search::new::sort::Sort; -use crate::{Filter, Index, Result, TermsMatchingStrategy}; +use crate::{Index, Result, TermsMatchingStrategy}; pub trait RankingRuleOutputIter<'transaction, Query> { fn next_bucket(&mut self) -> Result>>; @@ -100,18 +98,18 @@ pub struct RankingRuleOutput { // TODO: can make it generic over the query type (either query graph or placeholder) fairly easily #[allow(clippy::too_many_arguments)] -pub fn execute_search<'transaction>( +pub fn apply_ranking_rules<'transaction>( index: &Index, txn: &'transaction heed::RoTxn, // TODO: ranking rules parameter db_cache: &mut DatabaseCache<'transaction>, query_graph: &QueryGraph, - filters: Option, + universe: &RoaringBitmap, from: usize, length: usize, logger: &mut dyn SearchLogger, ) -> Result> { - logger.initial_query(query_graph, Instant::now()); + 
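// Sketch of the get-or-compute pattern used by
// `EdgeDocidsCache::get_edge_docids` above, with `compute` standing in for
// `G::compute_docids`. It also illustrates the TODO just above: the cached
// bitmap keeps the intersection with whichever universe was current on the
// first call, so a later, smaller universe does not shrink the entry.
use roaring::RoaringBitmap;
use std::collections::HashMap;

fn get_or_compute<'c>(
    cache: &'c mut HashMap<u32, RoaringBitmap>,
    edge_index: u32,
    universe: &RoaringBitmap,
    compute: impl FnOnce() -> RoaringBitmap,
) -> &'c RoaringBitmap {
    if !cache.contains_key(&edge_index) {
        // restrict the computed docids to the current universe, then cache
        cache.insert(edge_index, universe & compute());
    }
    &cache[&edge_index]
}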
logger.initial_query(query_graph); let words = &mut Words::new(TermsMatchingStrategy::Last); // let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned()); @@ -122,25 +120,13 @@ pub fn execute_search<'transaction>( logger.ranking_rules(&ranking_rules); - let universe = if let Some(filters) = filters { - filters.evaluate(txn, index)? - } else { - index.documents_ids(txn)? - }; - if universe.len() < from as u64 { return Ok(vec![]); } let ranking_rules_len = ranking_rules.len(); - logger.start_iteration_ranking_rule( - 0, - ranking_rules[0], - query_graph, - &universe, - Instant::now(), - ); - ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?; + logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe); + ranking_rules[0].start_iteration(index, txn, db_cache, logger, universe, query_graph)?; let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len]; candidates[0] = universe.clone(); @@ -154,7 +140,6 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates[cur_ranking_rule_index], - Instant::now(), ); candidates[cur_ranking_rule_index].clear(); ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger); @@ -183,7 +168,6 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates, - Instant::now(), ); } else { let all_candidates = candidates.iter().collect::>(); @@ -193,7 +177,6 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &skipped_candidates.into_iter().collect(), - Instant::now(), ); let candidates = candidates .iter() @@ -234,7 +217,6 @@ pub fn execute_search<'transaction>( ranking_rules[cur_ranking_rule_index], &candidates[cur_ranking_rule_index], &next_bucket.candidates, - Instant::now(), ); assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates)); @@ -255,7 +237,6 @@ pub fn execute_search<'transaction>( ranking_rules[cur_ranking_rule_index], &next_bucket.query, &candidates[cur_ranking_rule_index], - Instant::now(), ); ranking_rules[cur_ranking_rule_index].start_iteration( index, @@ -272,11 +253,11 @@ pub fn execute_search<'transaction>( #[cfg(test)] mod tests { - use super::execute_search; // use crate::allocator::ALLOC; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; use crate::new::db_cache::DatabaseCache; + use crate::new::execute_search; use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; @@ -284,8 +265,7 @@ mod tests { use std::io::{BufRead, BufReader, Cursor, Seek}; use std::time::Instant; // use crate::new::logger::detailed::DetailedSearchLogger; - use crate::new::logger::{DefaultSearchLogger, SearchLogger}; - use crate::new::make_query_graph; + use crate::new::logger::DefaultSearchLogger; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy}; @@ -321,17 +301,20 @@ mod tests { ])) .unwrap(); let txn = index.read_txn().unwrap(); - let mut logger = DefaultSearchLogger; let mut db_cache = DatabaseCache::default(); - let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") - .unwrap(); - logger.initial_query(&query_graph, Instant::now()); + let results = execute_search( + &index, + &txn, + &mut 
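// Very reduced sketch (the `Rule` trait is a stand-in for `RankingRule`,
// without the start/end-iteration lifecycle or the `from`/`length` window)
// of the control flow that `apply_ranking_rules` above drives iteratively:
// each rule carves its universe into successive buckets, every bucket is
// refined by the next rule, and docids are appended to the results once
// the rules are exhausted or a bucket holds a single document.
use roaring::RoaringBitmap;

trait Rule {
    /// Returns the next bucket of `universe`, or `None` when exhausted.
    fn next_bucket(&mut self, universe: &RoaringBitmap) -> Option<RoaringBitmap>;
}

fn bucket_sort(
    rules: &mut [Box<dyn Rule>],
    universe: &RoaringBitmap,
    results: &mut Vec<u32>,
    depth: usize,
) {
    if depth == rules.len() || universe.len() <= 1 {
        results.extend(universe.iter());
        return;
    }
    while let Some(bucket) = rules[depth].next_bucket(universe) {
        // the bucket produced by this rule becomes the next rule's universe
        bucket_sort(rules, &bucket, results, depth + 1);
    }
}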
db_cache, + "releases from poison by the government", + None, + 0, + 50, + &mut DefaultSearchLogger, + ) + .unwrap(); - let results = - execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 50, &mut logger) - .unwrap(); println!("{results:?}") } @@ -352,21 +335,13 @@ mod tests { let mut db_cache = DatabaseCache::default(); - let query_graph = make_query_graph( - &index, - &txn, - &mut db_cache, - "which a the releases from poison by the government", - ) - .unwrap(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( &index, &txn, &mut db_cache, - &query_graph, + "which a the releases from poison by the government", None, 0, 20, @@ -453,17 +428,13 @@ mod tests { let mut db_cache = DatabaseCache::default(); - let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") - .unwrap(); - let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( &index, &txn, &mut db_cache, - &query_graph, + "releases from poison by the government", None, 0, 20, diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index da4599ec5..10c0800ba 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -43,12 +43,9 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { _parent_candidates: &RoaringBitmap, parent_query_graph: &QueryGraph, ) -> Result<()> { - // println!("Words: start iteration"); self.exhausted = false; self.query_graph = Some(parent_query_graph.clone()); - // TODO: a phrase can contain many positions, but represents a single node. - // That's a problem. let positions_to_remove = match self.terms_matching_strategy { TermsMatchingStrategy::Last => { let mut all_positions = BTreeSet::new(); @@ -60,11 +57,13 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} } } - all_positions.into_iter().collect() + let mut r: Vec = all_positions.into_iter().collect(); + // don't remove the first term + r.remove(0); + r } TermsMatchingStrategy::All => vec![], }; - // println!("positions to remove: {positions_to_remove:?}"); self.positions_to_remove = positions_to_remove; self.iterating = true; Ok(()) @@ -78,7 +77,6 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { - // println!("Words: next bucket"); assert!(self.iterating); assert!(universe.len() > 1); @@ -122,9 +120,9 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { _db_cache: &mut DatabaseCache<'transaction>, _logger: &mut dyn SearchLogger, ) { - // println!("Words: end iteration"); self.iterating = false; self.exhausted = true; self.positions_to_remove = vec![]; + self.query_graph = None; } } From 3f1729a17fbc4d8ce679c5325c476e67a3f179fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 6 Mar 2023 10:21:28 +0100 Subject: [PATCH 040/234] Update new search test --- milli/src/search/new/ranking_rules.rs | 60 ++++++++++++++------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 9fa840ad5..cfa43c006 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -328,49 +328,50 @@ mod tests { println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - let 
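// Worked sketch of the `Words` rule behaviour above (plain strings instead
// of query-graph positions): with `TermsMatchingStrategy::Last`, the
// buckets correspond to progressively shorter prefixes of the query, and
// the first term is never dropped, which is what the `r.remove(0)` above
// guarantees.
fn words_buckets(terms: &[&str]) -> Vec<Vec<String>> {
    (1..=terms.len())
        .rev()
        .map(|len| terms[..len].iter().map(|s| s.to_string()).collect())
        .collect()
}

fn main() {
    assert_eq!(
        words_buckets(&["releases", "from", "poison"]),
        vec![
            vec!["releases".to_string(), "from".to_string(), "poison".to_string()],
            vec!["releases".to_string(), "from".to_string()],
            vec!["releases".to_string()],
        ]
    );
}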
primary_key = index.primary_key(&txn).unwrap().unwrap(); - let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); // loop { let start = Instant::now(); let mut db_cache = DatabaseCache::default(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( &index, &txn, &mut db_cache, - "which a the releases from poison by the government", + "releases from poison by the government", None, 0, 20, - &mut DefaultSearchLogger, - // &mut logger, + // &mut DefaultSearchLogger, + &mut logger, ) .unwrap(); - // logger.write_d2_description(); + logger.write_d2_description(); let elapsed = start.elapsed(); - let ids = index + let documents = index .documents(&txn, results.iter().copied()) .unwrap() .into_iter() - .map(|x| { - let obkv = &x.1; - let id = obkv.get(primary_key).unwrap(); - let id: serde_json::Value = serde_json::from_slice(id).unwrap(); - id.as_str().unwrap().to_owned() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) }) .collect::>(); - println!("{}us: {results:?}", elapsed.as_micros()); - println!("external ids: {ids:?}"); - // println!("max_resident: {}", ALLOC.max_resident.load(std::sync::atomic::Ordering::SeqCst)); - // println!("allocated: {}", ALLOC.allocated.load(std::sync::atomic::Ordering::SeqCst)); - // } + println!("{}us: {:?}", elapsed.as_micros(), results); + for (id, document) in documents { + println!("{id}:"); + println!("{document}"); + } } #[test] @@ -385,9 +386,6 @@ mod tests { let rr = index.criteria(&txn).unwrap(); println!("{rr:?}"); - let primary_key = index.primary_key(&txn).unwrap().unwrap(); - let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); - let start = Instant::now(); let mut s = Search::new(&txn, &index); @@ -398,20 +396,26 @@ mod tests { let elapsed = start.elapsed(); - let ids = index + let documents = index .documents(&txn, docs.documents_ids.iter().copied()) .unwrap() .into_iter() - .map(|x| { - let obkv = &x.1; - let id = obkv.get(primary_key).unwrap(); - let id: serde_json::Value = serde_json::from_slice(id).unwrap(); - id.as_str().unwrap().to_owned() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) }) .collect::>(); println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); - println!("external ids: {ids:?}"); + for (id, document) in documents { + println!("{id}:"); + println!("{document}"); + } } #[test] fn search_movies_new() { From e8c76cf7bfe54b7291793b24c72c9b4efdbfe48c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 6 Mar 2023 19:21:55 +0100 Subject: [PATCH 041/234] Intern all strings and phrases in the search logic --- milli/src/search/new/db_cache.rs | 122 ++++++------- .../search/new/graph_based_ranking_rule.rs | 47 ++--- milli/src/search/new/interner.rs | 78 ++++++++ milli/src/search/new/logger/detailed.rs | 71 
+++++--- milli/src/search/new/mod.rs | 56 +++--- milli/src/search/new/query_graph.rs | 69 ++++---- milli/src/search/new/query_term.rs | 166 +++++++++++------- .../search/new/ranking_rule_graph/build.rs | 21 +-- .../ranking_rule_graph/edge_docids_cache.rs | 19 +- .../src/search/new/ranking_rule_graph/mod.rs | 75 +++++--- .../new/ranking_rule_graph/proximity/build.rs | 95 +++++----- .../proximity/compute_docids.rs | 26 ++- .../new/ranking_rule_graph/proximity/mod.rs | 43 ++--- .../new/ranking_rule_graph/resolve_paths.rs | 23 +-- .../search/new/ranking_rule_graph/typo/mod.rs | 46 ++--- milli/src/search/new/ranking_rules.rs | 162 +++++------------ milli/src/search/new/resolve_query_graph.rs | 90 ++++------ milli/src/search/new/sort.rs | 41 ++--- milli/src/search/new/words.rs | 39 ++-- 19 files changed, 635 insertions(+), 654 deletions(-) create mode 100644 milli/src/search/new/interner.rs diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index f2f8f12c5..cfd69b04f 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -1,51 +1,48 @@ -use std::collections::hash_map::Entry; - +use super::{interner::Interned, SearchContext}; +use crate::Result; use fxhash::FxHashMap; use heed::types::ByteSlice; -use heed::RoTxn; - -use crate::{Index, Result}; +use std::collections::hash_map::Entry; #[derive(Default)] -pub struct DatabaseCache<'transaction> { - pub word_pair_proximity_docids: FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, +pub struct DatabaseCache<'search> { + // TODO: interner for all database cache keys + pub word_pair_proximity_docids: + FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, pub word_prefix_pair_proximity_docids: - FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, + FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, pub prefix_word_pair_proximity_docids: - FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, - pub word_docids: FxHashMap>, - pub exact_word_docids: FxHashMap>, - pub word_prefix_docids: FxHashMap>, + FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, + pub word_docids: FxHashMap, Option<&'search [u8]>>, + pub exact_word_docids: FxHashMap, Option<&'search [u8]>>, + pub word_prefix_docids: FxHashMap, Option<&'search [u8]>>, } -impl<'transaction> DatabaseCache<'transaction> { - pub fn get_word_docids( - &mut self, - index: &Index, - txn: &'transaction RoTxn, - word: &str, - ) -> Result> { - let bitmap_ptr = match self.word_docids.entry(word.to_owned()) { +impl<'search> SearchContext<'search> { + pub fn get_word_docids(&mut self, word: Interned) -> Result> { + let bitmap_ptr = match self.db_cache.word_docids.entry(word) { Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(), Entry::Vacant(entry) => { - let bitmap_ptr = index.word_docids.remap_data_type::().get(txn, word)?; + let bitmap_ptr = self + .index + .word_docids + .remap_data_type::() + .get(self.txn, self.word_interner.get(word))?; entry.insert(bitmap_ptr); bitmap_ptr } }; Ok(bitmap_ptr) } - pub fn get_prefix_docids( - &mut self, - index: &Index, - txn: &'transaction RoTxn, - prefix: &str, - ) -> Result> { + pub fn get_prefix_docids(&mut self, prefix: Interned) -> Result> { // In the future, this will be a frozen roaring bitmap - let bitmap_ptr = match self.word_prefix_docids.entry(prefix.to_owned()) { + let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) { Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(), Entry::Vacant(entry) => { - let bitmap_ptr = - 
index.word_prefix_docids.remap_data_type::().get(txn, prefix)?; + let bitmap_ptr = self + .index + .word_prefix_docids + .remap_data_type::() + .get(self.txn, self.word_interner.get(prefix))?; entry.insert(bitmap_ptr); bitmap_ptr } @@ -55,14 +52,12 @@ impl<'transaction> DatabaseCache<'transaction> { pub fn get_word_pair_proximity_docids( &mut self, - index: &Index, - txn: &'transaction RoTxn, - word1: &str, - word2: &str, + word1: Interned, + word2: Interned, proximity: u8, - ) -> Result> { - let key = (proximity, word1.to_owned(), word2.to_owned()); - match self.word_pair_proximity_docids.entry(key.clone()) { + ) -> Result> { + let key = (proximity, word1, word2); + match self.db_cache.word_pair_proximity_docids.entry(key) { Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()), Entry::Vacant(entry) => { // We shouldn't greedily access this DB at all @@ -86,10 +81,11 @@ impl<'transaction> DatabaseCache<'transaction> { // output.push(word1, word2, proximities); // } // } - let bitmap_ptr = index - .word_pair_proximity_docids - .remap_data_type::() - .get(txn, &(key.0, key.1.as_str(), key.2.as_str()))?; + let bitmap_ptr = + self.index.word_pair_proximity_docids.remap_data_type::().get( + self.txn, + &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)), + )?; entry.insert(bitmap_ptr); Ok(bitmap_ptr) } @@ -98,20 +94,22 @@ impl<'transaction> DatabaseCache<'transaction> { pub fn get_word_prefix_pair_proximity_docids( &mut self, - index: &Index, - txn: &'transaction RoTxn, - word1: &str, - prefix2: &str, + word1: Interned, + prefix2: Interned, proximity: u8, - ) -> Result> { - let key = (proximity, word1.to_owned(), prefix2.to_owned()); - match self.word_prefix_pair_proximity_docids.entry(key.clone()) { + ) -> Result> { + let key = (proximity, word1, prefix2); + match self.db_cache.word_prefix_pair_proximity_docids.entry(key) { Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()), Entry::Vacant(entry) => { - let bitmap_ptr = index + let bitmap_ptr = self + .index .word_prefix_pair_proximity_docids .remap_data_type::() - .get(txn, &(key.0, key.1.as_str(), key.2.as_str()))?; + .get( + self.txn, + &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)), + )?; entry.insert(bitmap_ptr); Ok(bitmap_ptr) } @@ -119,20 +117,26 @@ impl<'transaction> DatabaseCache<'transaction> { } pub fn get_prefix_word_pair_proximity_docids( &mut self, - index: &Index, - txn: &'transaction RoTxn, - left_prefix: &str, - right: &str, + left_prefix: Interned, + right: Interned, proximity: u8, - ) -> Result> { - let key = (proximity, left_prefix.to_owned(), right.to_owned()); - match self.prefix_word_pair_proximity_docids.entry(key) { + ) -> Result> { + let key = (proximity, left_prefix, right); + match self.db_cache.prefix_word_pair_proximity_docids.entry(key) { Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()), Entry::Vacant(entry) => { - let bitmap_ptr = index + let bitmap_ptr = self + .index .prefix_word_pair_proximity_docids .remap_data_type::() - .get(txn, &(proximity, left_prefix, right))?; + .get( + self.txn, + &( + proximity, + self.word_interner.get(left_prefix), + self.word_interner.get(right), + ), + )?; entry.insert(bitmap_ptr); Ok(bitmap_ptr) } diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index d1f5864aa..d51fb6920 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -1,15 +1,11 @@ -use heed::RoTxn; -use roaring::RoaringBitmap; - -use 
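// Sketch of why the cache keys above moved from `String` to
// `Interned<String>`: the handle is a `Copy` u32, so hashing and equality
// are allocation-free, and the `Entry` API gives a single-lookup
// get-or-load. Here `load` stands in for the heed database read.
use std::collections::hash_map::{Entry, HashMap};

fn get_or_load<'c, K, V>(
    cache: &'c mut HashMap<K, V>,
    key: K,
    load: impl FnOnce() -> V,
) -> &'c V
where
    K: std::hash::Hash + Eq + Copy,
{
    match cache.entry(key) {
        Entry::Occupied(entry) => entry.into_mut(),
        Entry::Vacant(entry) => entry.insert(load()),
    }
}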
super::db_cache::DatabaseCache; use super::logger::SearchLogger; use super::ranking_rule_graph::EdgeDocidsCache; use super::ranking_rule_graph::EmptyPathsCache; - use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::SearchContext; use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput}; - -use crate::{Index, Result}; +use crate::Result; +use roaring::RoaringBitmap; pub struct GraphBasedRankingRule { id: String, @@ -29,12 +25,10 @@ pub struct GraphBasedRankingRuleState { cur_distance_idx: usize, } -fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>( +fn remove_empty_edges<'search, G: RankingRuleGraphTrait>( + ctx: &mut SearchContext<'search>, graph: &mut RankingRuleGraph, edge_docids_cache: &mut EdgeDocidsCache, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, universe: &RoaringBitmap, empty_paths_cache: &mut EmptyPathsCache, ) -> Result<()> { @@ -42,8 +36,7 @@ fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>( if graph.all_edges[edge_index as usize].is_none() { continue; } - let docids = edge_docids_cache - .get_edge_docids(index, txn, db_cache, edge_index, &*graph, universe)?; + let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?; match docids { BitmapOrAllRef::Bitmap(bitmap) => { if bitmap.is_disjoint(universe) { @@ -59,7 +52,7 @@ fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>( Ok(()) } -impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGraph> +impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> for GraphBasedRankingRule { fn id(&self) -> String { @@ -67,24 +60,20 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap } fn start_iteration( &mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, _logger: &mut dyn SearchLogger, universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { // TODO: update old state instead of starting from scratch - let mut graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?; + let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; let mut edge_docids_cache = EdgeDocidsCache::default(); let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len()); remove_empty_edges( + ctx, &mut graph, &mut edge_docids_cache, - index, - txn, - db_cache, universe, &mut empty_paths_cache, )?; @@ -105,20 +94,16 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap fn next_bucket( &mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { assert!(universe.len() > 1); let mut state = self.state.take().unwrap(); remove_empty_edges( + ctx, &mut state.graph, &mut state.edge_docids_cache, - index, - txn, - db_cache, universe, &mut state.empty_paths_cache, )?; @@ -151,9 +136,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap ); let bucket = state.graph.resolve_paths( - index, - txn, - db_cache, + ctx, &mut state.edge_docids_cache, &mut state.empty_paths_cache, universe, @@ -169,9 +152,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap fn end_iteration( &mut self, - _index: &Index, - _txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + _ctx: &mut 
SearchContext<'search>, _logger: &mut dyn SearchLogger, ) { self.state = None; diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs new file mode 100644 index 000000000..ae0a4e9cb --- /dev/null +++ b/milli/src/search/new/interner.rs @@ -0,0 +1,78 @@ +use fxhash::FxHashMap; +use std::hash::Hash; +use std::marker::PhantomData; + +pub struct Interned { + idx: u32, + _phantom: PhantomData, +} + +impl Interned { + fn new(idx: u32) -> Self { + Self { idx, _phantom: PhantomData } + } +} + +pub struct Interner { + stable_store: Vec, + lookup: FxHashMap>, +} +impl Default for Interner { + fn default() -> Self { + Self { stable_store: Default::default(), lookup: Default::default() } + } +} + +impl Interner +where + T: Clone + Eq + Hash, +{ + pub fn insert(&mut self, s: T) -> Interned { + if let Some(interned) = self.lookup.get(&s) { + *interned + } else { + self.stable_store.push(s.clone()); + let interned = Interned::new(self.stable_store.len() as u32 - 1); + self.lookup.insert(s, interned); + interned + } + } + pub fn get(&self, interned: Interned) -> &T { + &self.stable_store[interned.idx as usize] + } +} + +// Interned boilerplate implementations + +impl Hash for Interned { + fn hash(&self, state: &mut H) { + self.idx.hash(state); + } +} + +impl Ord for Interned { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.idx.cmp(&other.idx) + } +} + +impl PartialOrd for Interned { + fn partial_cmp(&self, other: &Self) -> Option { + self.idx.partial_cmp(&other.idx) + } +} + +impl Eq for Interned {} + +impl PartialEq for Interned { + fn eq(&self, other: &Self) -> bool { + self.idx == other.idx + } +} +impl Clone for Interned { + fn clone(&self) -> Self { + Self { idx: self.idx, _phantom: PhantomData } + } +} + +impl Copy for Interned {} diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 4282db27f..76c3f8977 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -6,7 +6,7 @@ use std::time::Instant; use std::{io::Write, path::PathBuf}; use crate::new::ranking_rule_graph::TypoGraph; -use crate::new::{QueryNode, QueryGraph}; +use crate::new::{QueryNode, QueryGraph, SearchContext}; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::new::ranking_rule_graph::EmptyPathsCache; use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; @@ -176,7 +176,7 @@ impl SearchLogger for DetailedSearchLogger { } impl DetailedSearchLogger { - pub fn write_d2_description(&self) { + pub fn write_d2_description(&self,ctx: &mut SearchContext,) { let mut prev_time = self.initial_query_time.unwrap(); let mut timestamp = vec![]; fn activated_id(timestamp: &[usize]) -> String { @@ -193,12 +193,12 @@ impl DetailedSearchLogger { writeln!(&mut file, "direction: right").unwrap(); writeln!(&mut file, "Initial Query Graph: {{").unwrap(); let initial_query_graph = self.initial_query.as_ref().unwrap(); - Self::query_graph_d2_description(initial_query_graph, &mut file); + Self::query_graph_d2_description(ctx, initial_query_graph, &mut file); writeln!(&mut file, "}}").unwrap(); writeln!(&mut file, "Query Graph Used To Compute Universe: {{").unwrap(); let query_graph_for_universe = self.query_for_universe.as_ref().unwrap(); - Self::query_graph_d2_description(query_graph_for_universe, &mut file); + Self::query_graph_d2_description(ctx, query_graph_for_universe, &mut file); writeln!(&mut file, "}}").unwrap(); let initial_universe = 
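// Usage sketch for the `Interner` introduced above, assuming exactly the
// `Interner`/`Interned` definitions from this patch: equal values get the
// same handle, so an `Interned<String>` can serve as a cheap key while
// `get` recovers the original string.
#[test]
fn interner_dedupes_values() {
    let mut interner = Interner::<String>::default();
    let a = interner.insert("sunflower".to_string());
    let b = interner.insert("sunflower".to_string());
    let c = interner.insert("sun".to_string());
    assert!(a == b); // one handle per distinct value
    assert!(a != c);
    assert_eq!(interner.get(a), "sunflower");
}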
self.initial_universe.as_ref().unwrap(); @@ -308,7 +308,7 @@ results.{random} {{ let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::query_graph_d2_description(query_graph, &mut new_file); + Self::query_graph_d2_description(ctx, query_graph, &mut new_file); writeln!( &mut file, "{id} {{ @@ -323,7 +323,7 @@ results.{random} {{ let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file); + Self::ranking_rule_graph_d2_description(ctx, graph, paths, empty_paths_cache, distances.clone(), &mut new_file); writeln!( &mut file, "{id} {{ @@ -339,7 +339,7 @@ results.{random} {{ let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file); + Self::ranking_rule_graph_d2_description(ctx,graph, paths, empty_paths_cache, distances.clone(), &mut new_file); writeln!( &mut file, "{id} {{ @@ -352,31 +352,40 @@ results.{random} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc(node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) { + fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) { match &node { QueryNode::Term(LocatedQueryTerm { value, .. }) => { match value { QueryTerm::Phrase { phrase } => { - let phrase_str = phrase.description(); + let phrase = ctx.phrase_interner.get(*phrase); + let phrase_str = phrase.description(&ctx.word_interner); writeln!(file,"{node_idx} : \"{phrase_str}\"").unwrap(); }, QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db, synonyms, split_words } } => { + let original = ctx.word_interner.get(*original); writeln!(file,"{node_idx} : \"{original}\" {{ shape: class").unwrap(); - for w in zero_typo { + for w in zero_typo.iter().copied() { + let w = ctx.word_interner.get(w); writeln!(file, "\"{w}\" : 0").unwrap(); } - for w in one_typo { + for w in one_typo.iter().copied() { + let w = ctx.word_interner.get(w); writeln!(file, "\"{w}\" : 1").unwrap(); } - for w in two_typos { + for w in two_typos.iter().copied() { + let w = ctx.word_interner.get(w); writeln!(file, "\"{w}\" : 2").unwrap(); } - if let Some((left, right)) = split_words { - writeln!(file, "\"{left} {right}\" : split_words").unwrap(); + if let Some(split_words) = split_words { + let phrase = ctx.phrase_interner.get(*split_words); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); } - for synonym in synonyms { - writeln!(file, "\"{}\" : synonym", synonym.description()).unwrap(); + for synonym in synonyms.iter().copied() { + let phrase = ctx.phrase_interner.get(synonym); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); } if *use_prefix_db { writeln!(file, "use prefix DB : true").unwrap(); @@ -398,20 +407,20 @@ shape: class").unwrap(); }, } } - fn query_graph_d2_description(query_graph: &QueryGraph, file: &mut File) { + fn 
query_graph_d2_description(ctx: &mut SearchContext, query_graph: &QueryGraph, file: &mut File) { writeln!(file,"direction: right").unwrap(); for node in 0..query_graph.nodes.len() { if matches!(query_graph.nodes[node], QueryNode::Deleted) { continue; } - Self::query_node_d2_desc(node, &query_graph.nodes[node], &[], file); + Self::query_node_d2_desc(ctx, node, &query_graph.nodes[node], &[], file); for edge in query_graph.edges[node].successors.iter() { writeln!(file, "{node} -> {edge};\n").unwrap(); } } } - fn ranking_rule_graph_d2_description(graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { + fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -420,7 +429,7 @@ shape: class").unwrap(); continue; } let distances = &distances[node_idx]; - Self::query_node_d2_desc(node_idx, node, distances.as_slice(), file); + Self::query_node_d2_desc(ctx, node_idx, node, distances.as_slice(), file); } for edge in graph.all_edges.iter().flatten() { let Edge { from_node, to_node, details, .. } = edge; @@ -449,7 +458,7 @@ shape: class").unwrap(); writeln!(file, "Shortest Paths {{").unwrap(); - Self::paths_d2_description(graph, paths, file); + Self::paths_d2_description(ctx, graph, paths, file); writeln!(file, "}}").unwrap(); // writeln!(file, "Empty Edge Couples {{").unwrap(); @@ -468,15 +477,18 @@ shape: class").unwrap(); // } // writeln!(file, "}}").unwrap(); } - fn edge_d2_description(graph: &RankingRuleGraph, edge_idx: u32, file: &mut File) { + fn edge_d2_description(ctx: &mut SearchContext,graph: &RankingRuleGraph, edge_idx: u32, file: &mut File) { let Edge { from_node, to_node, cost, .. 
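// Hand-written example (illustrative, not captured output) of the D2 text
// that the description functions above aim to emit for one term node of a
// prefix query, following the `writeln!` calls in `query_node_d2_desc`:
//
//   direction: right
//   1 : "sun" {
//     shape: class
//     "sun" : 0
//     "suns" : 1
//     use prefix DB : true
//   }
//   1 -> 2;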
} = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { QueryNode::Term(term) => match &term.value { QueryTerm::Phrase { phrase } => { - phrase.description() + let phrase = ctx.phrase_interner.get(*phrase); + phrase.description(&ctx.word_interner) + }, + QueryTerm::Word { derivations } => { + ctx.word_interner.get(derivations.original).to_owned() }, - QueryTerm::Word { derivations } => derivations.original.clone(), }, QueryNode::Deleted => panic!(), QueryNode::Start => "START".to_owned(), @@ -485,8 +497,11 @@ shape: class").unwrap(); let to_node = &graph.query_graph.nodes[*to_node as usize]; let to_node_desc = match to_node { QueryNode::Term(term) => match &term.value { - QueryTerm::Phrase { phrase } => phrase.description(), - QueryTerm::Word { derivations } => derivations.original.clone(), + QueryTerm::Phrase { phrase } => { + let phrase = ctx.phrase_interner.get(*phrase); + phrase.description(&ctx.word_interner) + }, + QueryTerm::Word { derivations } => ctx.word_interner.get(derivations.original).to_owned(), }, QueryNode::Deleted => panic!(), QueryNode::Start => "START".to_owned(), @@ -496,11 +511,11 @@ shape: class").unwrap(); shape: class }}").unwrap(); } - fn paths_d2_description(graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { + fn paths_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { for (path_idx, edge_indexes) in paths.iter().enumerate() { writeln!(file, "{path_idx} {{").unwrap(); for edge_idx in edge_indexes.iter() { - Self::edge_d2_description(graph, *edge_idx, file); + Self::edge_d2_description(ctx, graph, *edge_idx, file); } for couple_edges in edge_indexes.windows(2) { let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() }; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 3e9b43f1b..0feef1f60 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,5 +1,6 @@ mod db_cache; mod graph_based_ranking_rule; +mod interner; mod logger; mod query_graph; mod query_term; @@ -26,7 +27,9 @@ use query_graph::{QueryGraph, QueryNode}; use roaring::RoaringBitmap; use self::{ + interner::Interner, logger::SearchLogger, + query_term::Phrase, resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}, }; @@ -35,14 +38,32 @@ pub enum BitmapOrAllRef<'s> { All, } +pub struct SearchContext<'search> { + pub index: &'search Index, + pub txn: &'search RoTxn<'search>, + pub db_cache: DatabaseCache<'search>, + pub word_interner: Interner, + pub phrase_interner: Interner, + pub node_docids_cache: NodeDocIdsCache, +} +impl<'search> SearchContext<'search> { + pub fn new(index: &'search Index, txn: &'search RoTxn<'search>) -> Self { + Self { + index, + txn, + db_cache: <_>::default(), + word_interner: <_>::default(), + phrase_interner: <_>::default(), + node_docids_cache: <_>::default(), + } + } +} + #[allow(clippy::too_many_arguments)] -pub fn resolve_maximally_reduced_query_graph<'transaction>( - index: &Index, - txn: &'transaction heed::RoTxn, - db_cache: &mut DatabaseCache<'transaction>, +pub fn resolve_maximally_reduced_query_graph<'search>( + ctx: &mut SearchContext<'search>, universe: &RoaringBitmap, query_graph: &QueryGraph, - node_docids_cache: &mut NodeDocIdsCache, matching_strategy: TermsMatchingStrategy, logger: &mut dyn SearchLogger, ) -> Result { @@ -73,16 +94,14 @@ pub fn resolve_maximally_reduced_query_graph<'transaction>( } } logger.query_for_universe(&graph); - 
let docids = resolve_query_graph(index, txn, db_cache, node_docids_cache, &graph, universe)?; + let docids = resolve_query_graph(ctx, &graph, universe)?; Ok(docids) } #[allow(clippy::too_many_arguments)] -pub fn execute_search<'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, +pub fn execute_search<'search>( + ctx: &mut SearchContext<'search>, query: &str, filters: Option, from: usize, @@ -90,26 +109,21 @@ pub fn execute_search<'transaction>( logger: &mut dyn SearchLogger, ) -> Result> { assert!(!query.is_empty()); - let query_terms = located_query_terms_from_string(index, txn, query.tokenize(), None).unwrap(); - let graph = QueryGraph::from_query(index, txn, db_cache, query_terms)?; + let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None).unwrap(); + let graph = QueryGraph::from_query(ctx, query_terms)?; logger.initial_query(&graph); let universe = if let Some(filters) = filters { - filters.evaluate(txn, index)? + filters.evaluate(ctx.txn, ctx.index)? } else { - index.documents_ids(txn)? + ctx.index.documents_ids(ctx.txn)? }; - let mut node_docids_cache = NodeDocIdsCache::default(); - let universe = resolve_maximally_reduced_query_graph( - index, - txn, - db_cache, + ctx, &universe, &graph, - &mut node_docids_cache, TermsMatchingStrategy::Last, logger, )?; @@ -117,5 +131,5 @@ pub fn execute_search<'transaction>( logger.initial_universe(&universe); - apply_ranking_rules(index, txn, db_cache, &graph, &universe, from, length, logger) + apply_ranking_rules(ctx, &graph, &universe, from, length, logger) } diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index e86c175af..449b6536c 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,13 +1,10 @@ +use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations}; +use super::SearchContext; +use crate::Result; +use roaring::RoaringBitmap; use std::fmt::Debug; -use heed::RoTxn; -use roaring::RoaringBitmap; - -use super::db_cache::DatabaseCache; -use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations}; -use crate::{Index, Result}; - -#[derive(Debug, Clone)] +#[derive(Clone)] pub enum QueryNode { Term(LocatedQueryTerm), Deleted, @@ -22,7 +19,7 @@ pub struct Edges { pub successors: RoaringBitmap, } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct QueryGraph { pub root_node: u32, pub end_node: u32, @@ -31,8 +28,8 @@ pub struct QueryGraph { } fn _assert_sizes() { - // TODO: QueryNodes are too big now, 184B is an unreasonable size - let _: [u8; 184] = [0; std::mem::size_of::()]; + // TODO: QueryNodes are too big now, 88B is a bit too big + let _: [u8; 88] = [0; std::mem::size_of::()]; let _: [u8; 48] = [0; std::mem::size_of::()]; } @@ -72,19 +69,14 @@ impl QueryGraph { impl QueryGraph { // TODO: return the list of all matching words here as well - pub fn from_query<'transaction>( - index: &Index, - txn: &RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, - terms: Vec, - ) -> Result { + pub fn from_query(ctx: &mut SearchContext, terms: Vec) -> Result { // TODO: maybe empty nodes should not be removed here, to compute // the score of the `words` ranking rule correctly // it is very easy to traverse the graph and remove afterwards anyway // Still, I'm keeping this here as a demo let mut empty_nodes = vec![]; - let word_set = index.words_fst(txn)?; + let word_set = ctx.index.words_fst(ctx.txn)?; let mut graph = QueryGraph::default(); let (mut prev2, mut prev1, 
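// Call-site sketch for the new `SearchContext` (the setup code here is
// assumed, not part of the patch): the context now bundles the index, the
// transaction, the database cache and both interners that previously
// travelled as separate parameters through every search function.
//
//   let txn = index.read_txn()?;
//   let mut ctx = SearchContext::new(&index, &txn);
//   let docids = execute_search(
//       &mut ctx,
//       "releases from poison",
//       None, // no filter
//       0,
//       20,
//       &mut DefaultSearchLogger,
//   )?;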
mut prev0): (Vec, Vec, Vec) = @@ -105,20 +97,20 @@ impl QueryGraph { if !prev1.is_empty() { if let Some((ngram2_str, ngram2_pos)) = - query_term::ngram2(&query[length - 2], &query[length - 1]) + query_term::ngram2(ctx, &query[length - 2], &query[length - 1]) { - if word_set.contains(ngram2_str.as_bytes()) { + if word_set.contains(ctx.word_interner.get(ngram2_str)) { let ngram2 = LocatedQueryTerm { value: QueryTerm::Word { derivations: WordDerivations { - original: ngram2_str.clone(), + original: ngram2_str, // TODO: could add a typo if it's an ngram? - zero_typo: vec![ngram2_str], - one_typo: vec![], - two_typos: vec![], + zero_typo: Box::new([ngram2_str]), + one_typo: Box::new([]), + two_typos: Box::new([]), use_prefix_db: false, - synonyms: vec![], // TODO: ngram synonyms - split_words: None, // TODO: maybe ngram split words? + synonyms: Box::new([]), // TODO: ngram synonyms + split_words: None, // TODO: maybe ngram split words? }, }, positions: ngram2_pos, @@ -129,22 +121,25 @@ impl QueryGraph { } } if !prev2.is_empty() { - if let Some((ngram3_str, ngram3_pos)) = - query_term::ngram3(&query[length - 3], &query[length - 2], &query[length - 1]) - { - if word_set.contains(ngram3_str.as_bytes()) { + if let Some((ngram3_str, ngram3_pos)) = query_term::ngram3( + ctx, + &query[length - 3], + &query[length - 2], + &query[length - 1], + ) { + if word_set.contains(ctx.word_interner.get(ngram3_str)) { let ngram3 = LocatedQueryTerm { value: QueryTerm::Word { derivations: WordDerivations { - original: ngram3_str.clone(), + original: ngram3_str, // TODO: could add a typo if it's an ngram? - zero_typo: vec![ngram3_str], - one_typo: vec![], - two_typos: vec![], + zero_typo: Box::new([ngram3_str]), + one_typo: Box::new([]), + two_typos: Box::new([]), use_prefix_db: false, - synonyms: vec![], // TODO: ngram synonyms - split_words: None, // TODO: maybe ngram split words? - // would be nice for typos like su nflower + synonyms: Box::new([]), // TODO: ngram synonyms + split_words: None, // TODO: maybe ngram split words? 
+ // would be nice for typos like su nflower }, }, positions: ngram3_pos, diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 3820b8ed0..b5e29bffc 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -16,30 +16,35 @@ use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; use crate::{CboRoaringBitmapLenCodec, Index, Result}; -#[derive(Debug, Default, Clone)] +use super::interner::{Interned, Interner}; +use super::SearchContext; + +#[derive(Default, Clone, PartialEq, Eq, Hash)] pub struct Phrase { - pub words: Vec>, + pub words: Vec>>, } impl Phrase { - pub fn description(&self) -> String { - self.words.iter().flatten().join(" ") + pub fn description(&self, interner: &Interner) -> String { + self.words.iter().flatten().map(|w| interner.get(*w)).join(" ") } } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct WordDerivations { - pub original: String, + pub original: Interned, // TODO: pub prefix_of: Vec, - pub synonyms: Vec, - pub split_words: Option<(String, String)>, - pub zero_typo: Vec, - pub one_typo: Vec, - pub two_typos: Vec, + pub synonyms: Box<[Interned]>, + pub split_words: Option>, + pub zero_typo: Box<[Interned]>, + pub one_typo: Box<[Interned]>, + pub two_typos: Box<[Interned]>, pub use_prefix_db: bool, } impl WordDerivations { - pub fn all_derivations_except_prefix_db(&self) -> impl Iterator + Clone { - self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter()) + pub fn all_derivations_except_prefix_db( + &'_ self, + ) -> impl Iterator> + Clone + '_ { + self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter()).copied() } fn is_empty(&self) -> bool { self.zero_typo.is_empty() @@ -50,15 +55,21 @@ impl WordDerivations { } pub fn word_derivations( - index: &Index, - txn: &RoTxn, + ctx: &mut SearchContext, word: &str, max_typo: u8, is_prefix: bool, fst: &fst::Set>, ) -> Result { + let word_interned = ctx.word_interner.insert(word.to_owned()); + let use_prefix_db = is_prefix - && index.word_prefix_docids.remap_data_type::().get(txn, word)?.is_some(); + && ctx + .index + .word_prefix_docids + .remap_data_type::() + .get(ctx.txn, word)? 
+ .is_some(); let mut zero_typo = vec![]; let mut one_typo = vec![]; @@ -70,11 +81,12 @@ pub fn word_derivations( let mut stream = fst.search(prefix).into_stream(); while let Some(word) = stream.next() { - let word = std::str::from_utf8(word)?; - zero_typo.push(word.to_string()); + let word = std::str::from_utf8(word)?.to_owned(); + let word_interned = ctx.word_interner.insert(word); + zero_typo.push(word_interned); } } else if fst.contains(word) { - zero_typo.push(word.to_string()); + zero_typo.push(word_interned); } } else if max_typo == 1 { let dfa = build_dfa(word, 1, is_prefix); @@ -83,13 +95,14 @@ pub fn word_derivations( while let Some((word, state)) = stream.next() { let word = std::str::from_utf8(word)?; + let word_interned = ctx.word_interner.insert(word.to_owned()); let d = dfa.distance(state.1); match d.to_u8() { 0 => { - zero_typo.push(word.to_string()); + zero_typo.push(word_interned); } 1 => { - one_typo.push(word.to_string()); + one_typo.push(word_interned); } _ => panic!(), } @@ -105,47 +118,56 @@ pub fn word_derivations( while let Some((found_word, state)) = stream.next() { let found_word = std::str::from_utf8(found_word)?; + let found_word_interned = ctx.word_interner.insert(found_word.to_owned()); // in the case the typo is on the first letter, we know the number of typo // is two if get_first(found_word) != get_first(word) { - two_typos.push(found_word.to_string()); + two_typos.push(found_word_interned); } else { // Else, we know that it is the second dfa that matched and compute the // correct distance let d = second_dfa.distance((state.1).0); match d.to_u8() { 0 => { - zero_typo.push(found_word.to_string()); + zero_typo.push(found_word_interned); } 1 => { - one_typo.push(found_word.to_string()); + one_typo.push(found_word_interned); } 2 => { - two_typos.push(found_word.to_string()); + two_typos.push(found_word_interned); } _ => panic!(), } } } } - let split_words = split_best_frequency(index, txn, word)?; + let split_words = split_best_frequency(ctx.index, ctx.txn, word)?.map(|(l, r)| { + ctx.phrase_interner.insert(Phrase { + words: vec![Some(ctx.word_interner.insert(l)), Some(ctx.word_interner.insert(r))], + }) + }); + + let synonyms = ctx.index.synonyms(ctx.txn)?; - let synonyms = index.synonyms(txn)?; let synonyms = synonyms .get(&vec![word.to_owned()]) .cloned() .unwrap_or_default() .into_iter() - .map(|words| Phrase { words: words.into_iter().map(Some).collect() }) + .map(|words| { + let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); + ctx.phrase_interner.insert(Phrase { words }) + }) .collect(); Ok(WordDerivations { - original: word.to_owned(), + original: ctx.word_interner.insert(word.to_owned()), synonyms, split_words, - zero_typo, - one_typo, - two_typos, + zero_typo: zero_typo.into_boxed_slice(), + one_typo: one_typo.into_boxed_slice(), + two_typos: two_typos.into_boxed_slice(), use_prefix_db, }) } @@ -176,33 +198,36 @@ fn split_best_frequency( Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned()))) } -#[derive(Debug, Clone)] +#[derive(Clone)] pub enum QueryTerm { // TODO: should there be SplitWord, NGram2, and NGram3 variants? // NGram2 can have 1 typo and synonyms // NGram3 cannot have typos but can have synonyms // SplitWords are a phrase // Can NGrams be prefixes? 
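// (Context: ngrams merge adjacent query words, so `sun flower` also yields the
// derivation `sunflower`; see `ngram2`/`ngram3` at the bottom of this file.
// Dedicated variants could then carry their own typo and synonym rules.)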
- Phrase { phrase: Phrase }, + Phrase { phrase: Interned }, Word { derivations: WordDerivations }, } impl QueryTerm { - pub fn original_single_word(&self) -> Option<&str> { + pub fn original_single_word<'interner>( + &self, + word_interner: &'interner Interner, + ) -> Option<&'interner str> { match self { QueryTerm::Phrase { phrase: _ } => None, QueryTerm::Word { derivations } => { if derivations.is_empty() { None } else { - Some(derivations.original.as_str()) + Some(word_interner.get(derivations.original)) } } } } } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct LocatedQueryTerm { pub value: QueryTerm, pub positions: RangeInclusive, @@ -217,18 +242,17 @@ impl LocatedQueryTerm { } } -pub fn located_query_terms_from_string<'transaction>( - index: &Index, - txn: &'transaction RoTxn, +pub fn located_query_terms_from_string<'search>( + ctx: &mut SearchContext<'search>, query: NormalizedTokenIter>, words_limit: Option, ) -> Result> { - let authorize_typos = index.authorize_typos(txn)?; - let min_len_one_typo = index.min_word_len_one_typo(txn)?; - let min_len_two_typos = index.min_word_len_two_typos(txn)?; + let authorize_typos = ctx.index.authorize_typos(ctx.txn)?; + let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; + let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; - let exact_words = index.exact_words(txn)?; - let fst = index.words_fst(txn)?; + let exact_words = ctx.index.exact_words(ctx.txn)?; + let fst = ctx.index.words_fst(ctx.txn)?; let nbr_typos = |word: &str| { if !authorize_typos @@ -243,10 +267,6 @@ pub fn located_query_terms_from_string<'transaction>( } }; - let derivations = |word: &str, is_prefix: bool| { - word_derivations(index, txn, word, nbr_typos(word), is_prefix, &fst) - }; - let mut primitive_query = Vec::new(); let mut phrase = Vec::new(); @@ -279,14 +299,17 @@ pub fn located_query_terms_from_string<'transaction>( if let TokenKind::StopWord = token.kind { phrase.push(None); } else { + let word = ctx.word_interner.insert(token.lemma().to_string()); // TODO: in a phrase, check that every word exists // otherwise return WordDerivations::Empty - phrase.push(Some(token.lemma().to_string())); + phrase.push(Some(word)); } } else if peekable.peek().is_some() { match token.kind { TokenKind::Word => { - let derivations = derivations(token.lemma(), false)?; + let word = token.lemma(); + let derivations = + word_derivations(ctx, word, nbr_typos(word), false, &fst)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, positions: position..=position, @@ -296,7 +319,8 @@ pub fn located_query_terms_from_string<'transaction>( TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} } } else { - let derivations = derivations(token.lemma(), true)?; + let word = token.lemma(); + let derivations = word_derivations(ctx, word, nbr_typos(word), true, &fst)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, positions: position..=position, @@ -323,7 +347,9 @@ pub fn located_query_terms_from_string<'transaction>( { let located_query_term = LocatedQueryTerm { value: QueryTerm::Phrase { - phrase: Phrase { words: mem::take(&mut phrase) }, + phrase: ctx + .phrase_interner + .insert(Phrase { words: mem::take(&mut phrase) }), }, positions: phrase_start..=phrase_end, }; @@ -337,7 +363,9 @@ pub fn located_query_terms_from_string<'transaction>( // If a quote is never closed, we consider all of the end of the query as a phrase. 
if !phrase.is_empty() { let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } }, + value: QueryTerm::Phrase { + phrase: ctx.phrase_interner.insert(Phrase { words: mem::take(&mut phrase) }), + }, positions: phrase_start..=phrase_end, }; primitive_query.push(located_query_term); @@ -347,35 +375,49 @@ pub fn located_query_terms_from_string<'transaction>( } // TODO: return a word derivations instead? -pub fn ngram2(x: &LocatedQueryTerm, y: &LocatedQueryTerm) -> Option<(String, RangeInclusive)> { +pub fn ngram2( + ctx: &mut SearchContext, + x: &LocatedQueryTerm, + y: &LocatedQueryTerm, +) -> Option<(Interned, RangeInclusive)> { if *x.positions.end() != y.positions.start() - 1 { return None; } - match (&x.value.original_single_word(), &y.value.original_single_word()) { + match ( + &x.value.original_single_word(&ctx.word_interner), + &y.value.original_single_word(&ctx.word_interner), + ) { (Some(w1), Some(w2)) => { - let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end()); + let term = ( + ctx.word_interner.insert(format!("{w1}{w2}")), + *x.positions.start()..=*y.positions.end(), + ); Some(term) } _ => None, } } pub fn ngram3( + ctx: &mut SearchContext, x: &LocatedQueryTerm, y: &LocatedQueryTerm, z: &LocatedQueryTerm, -) -> Option<(String, RangeInclusive)> { +) -> Option<(Interned, RangeInclusive)> { if *x.positions.end() != y.positions.start() - 1 || *y.positions.end() != z.positions.start() - 1 { return None; } match ( - &x.value.original_single_word(), - &y.value.original_single_word(), - &z.value.original_single_word(), + &x.value.original_single_word(&ctx.word_interner), + &y.value.original_single_word(&ctx.word_interner), + &z.value.original_single_word(&ctx.word_interner), ) { (Some(w1), Some(w2), Some(w3)) => { - let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end()); + let term = ( + ctx.word_interner.insert(format!("{w1}{w2}{w3}")), + *x.positions.start()..=*z.positions.end(), + ); Some(term) } _ => None, diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index a0fdd79c6..03a7f6c9d 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,18 +1,10 @@ -use heed::RoTxn; +use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::new::{QueryGraph, SearchContext}; +use crate::Result; use roaring::RoaringBitmap; -use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::db_cache::DatabaseCache; -use crate::new::QueryGraph; -use crate::{Index, Result}; - impl RankingRuleGraph { - pub fn build<'db_cache, 'transaction: 'db_cache>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, - query_graph: QueryGraph, - ) -> Result { + pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { let mut ranking_rule_graph = Self { query_graph, all_edges: vec![], node_edges: vec![], successors: vec![] }; @@ -22,12 +14,11 @@ impl RankingRuleGraph { let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap(); let new_successors = ranking_rule_graph.successors.last_mut().unwrap(); - let Some(from_node_data) = G::build_visit_from_node(index, txn, db_cache, node)? else { continue }; + let Some(from_node_data) = G::build_visit_from_node(ctx, node)? 
else { continue }; for successor_idx in ranking_rule_graph.query_graph.edges[node_idx].successors.iter() { let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx as usize]; - let mut edges = - G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)?; + let mut edges = G::build_visit_to_node(ctx, to_node, &from_node_data)?; if edges.is_empty() { continue; } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 3d48fd69c..21c186f3c 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -1,13 +1,10 @@ use std::marker::PhantomData; -use fxhash::FxHashMap; -use heed::RoTxn; -use roaring::RoaringBitmap; - use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::db_cache::DatabaseCache; -use crate::new::BitmapOrAllRef; -use crate::{Index, Result}; +use crate::new::{BitmapOrAllRef, SearchContext}; +use crate::Result; +use fxhash::FxHashMap; +use roaring::RoaringBitmap; // TODO: the cache should have a G::EdgeDetails as key // but then it means that we should have a quick way of @@ -25,11 +22,9 @@ impl Default for EdgeDocidsCache { } } impl EdgeDocidsCache { - pub fn get_edge_docids<'s, 'transaction>( + pub fn get_edge_docids<'s, 'search>( &'s mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, edge_index: u32, graph: &RankingRuleGraph, // TODO: maybe universe doesn't belong here @@ -46,7 +41,7 @@ impl EdgeDocidsCache { return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here - let docids = universe & G::compute_docids(index, txn, db_cache, details)?; + let docids = universe & G::compute_docids(ctx, details)?; let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; Ok(BitmapOrAllRef::Bitmap(docids)) diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index e65d5f70b..446c4e248 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -7,20 +7,15 @@ mod proximity; mod resolve_paths; mod typo; +use super::logger::SearchLogger; +use super::{QueryGraph, QueryNode, SearchContext}; +use crate::Result; pub use edge_docids_cache::EdgeDocidsCache; pub use empty_paths_cache::EmptyPathsCache; pub use proximity::ProximityGraph; -pub use typo::TypoGraph; - -use std::ops::ControlFlow; - -use heed::RoTxn; use roaring::RoaringBitmap; - -use super::db_cache::DatabaseCache; -use super::logger::SearchLogger; -use super::{QueryGraph, QueryNode}; -use crate::{Index, Result}; +use std::ops::ControlFlow; +pub use typo::TypoGraph; #[derive(Debug, Clone)] pub enum EdgeDetails { @@ -42,6 +37,48 @@ pub struct EdgePointer<'graph, E> { pub edge: &'graph Edge, } +// pub struct SubWordDerivations { +// words: FxHashSet>, +// synonyms: FxHashSet>, // NO! they're phrases, not strings +// split_words: bool, +// use_prefix_db: bool, +// } + +// pub struct EdgeWordDerivations { +// // TODO: not Option, instead: Any | All | Subset(SubWordDerivations) +// from_words: Option, // ??? +// to_words: Option, // + use prefix db? 
+// } + +// fn aggregate_edge_word_derivations( +// graph: (), +// edges: Vec, +// ) -> BTreeMap { +// todo!() +// } + +// fn reduce_word_term_to_sub_word_derivations( +// term: &mut WordDerivations, +// derivations: &SubWordDerivations, +// ) { +// let mut new_one_typo = vec![]; +// for w in term.one_typo { +// if derivations.words.contains(w) { +// new_one_typo.push(w); +// } +// } +// if term.use_prefix_db && !derivations.use_prefix_db { +// term.use_prefix_db = false; +// } +// // etc. +// } + +// fn word_derivations_used_by_edge( +// edge: G::EdgeDetails, +// ) -> SubWordDerivations { +// todo!() +// } + pub trait RankingRuleGraphTrait: Sized { /// The details of an edge connecting two query nodes. These details /// should be sufficient to compute the edge's cost and associated document ids @@ -55,10 +92,8 @@ pub trait RankingRuleGraphTrait: Sized { fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String; /// Compute the document ids associated with the given edge. - fn compute_docids<'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn compute_docids<'search>( + ctx: &mut SearchContext<'search>, edge_details: &Self::EdgeDetails, ) -> Result; @@ -66,19 +101,15 @@ pub trait RankingRuleGraphTrait: Sized { /// /// This call is followed by zero, one or more calls to [`build_visit_to_node`](RankingRuleGraphTrait::build_visit_to_node), /// which builds the actual edges. - fn build_visit_from_node<'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn build_visit_from_node<'search>( + ctx: &mut SearchContext<'search>, from_node: &QueryNode, ) -> Result>; /// Return the cost and details of the edges going from the previously visited node /// (with [`build_visit_from_node`](RankingRuleGraphTrait::build_visit_from_node)) to `to_node`. 
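/// For the proximity graph, for instance, this is where the (cost, word pair)
/// edges between two adjacent query terms are enumerated; see
/// `proximity/build.rs` further down in this patch.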
- fn build_visit_to_node<'from_data, 'transaction: 'from_data>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn build_visit_to_node<'from_data, 'search: 'from_data>( + ctx: &mut SearchContext<'search>, to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>>; diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 06c860d7e..4603c7ea0 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -1,30 +1,30 @@ -use std::collections::BTreeMap; - -use heed::RoTxn; -use itertools::Itertools; - use super::ProximityEdge; -use crate::new::db_cache::DatabaseCache; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::new::ranking_rule_graph::proximity::WordPair; use crate::new::ranking_rule_graph::EdgeDetails; -use crate::new::QueryNode; -use crate::{Index, Result}; +use crate::new::{QueryNode, SearchContext}; +use crate::Result; +use itertools::Itertools; +use std::collections::BTreeMap; -pub fn visit_from_node(from_node: &QueryNode) -> Result> { +pub fn visit_from_node( + ctx: &mut SearchContext, + from_node: &QueryNode, +) -> Result> { Ok(Some(match from_node { QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => match value1 { QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()), QueryTerm::Phrase { phrase: phrase1 } => { - if let Some(original) = phrase1.words.last().unwrap().as_ref() { + let phrase1 = ctx.phrase_interner.get(*phrase1); + if let Some(original) = *phrase1.words.last().unwrap() { ( WordDerivations { - original: original.clone(), - zero_typo: vec![original.to_owned()], - one_typo: vec![], - two_typos: vec![], + original, + zero_typo: Box::new([original]), + one_typo: Box::new([]), + two_typos: Box::new([]), use_prefix_db: false, - synonyms: vec![], + synonyms: Box::new([]), split_words: None, }, *pos1.end(), @@ -37,12 +37,12 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result ( WordDerivations { - original: String::new(), - zero_typo: vec![], - one_typo: vec![], - two_typos: vec![], + original: ctx.word_interner.insert(String::new()), + zero_typo: Box::new([]), + one_typo: Box::new([]), + two_typos: Box::new([]), use_prefix_db: false, - synonyms: vec![], + synonyms: Box::new([]), split_words: None, }, -100, @@ -51,10 +51,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, +pub fn visit_to_node<'search, 'from_data>( + ctx: &mut SearchContext<'search>, to_node: &QueryNode, from_node_data: &'from_data (WordDerivations, i8), ) -> Result)>> { @@ -69,15 +67,16 @@ pub fn visit_to_node<'transaction, 'from_data>( let (derivations2, pos2, ngram_len2) = match value2 { QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()), QueryTerm::Phrase { phrase: phrase2 } => { - if let Some(original) = phrase2.words.first().unwrap().as_ref() { + let phrase2 = ctx.phrase_interner.get(*phrase2); + if let Some(original) = *phrase2.words.first().unwrap() { ( WordDerivations { - original: original.clone(), - zero_typo: vec![original.to_owned()], - one_typo: vec![], - two_typos: vec![], + original, + zero_typo: Box::new([original]), + one_typo: Box::new([]), + two_typos: Box::new([]), use_prefix_db: false, - synonyms: vec![], + synonyms: Box::new([]), split_words: 
None, }, *pos2.start(), @@ -106,19 +105,16 @@ pub fn visit_to_node<'transaction, 'from_data>( let derivations1 = derivations1.all_derivations_except_prefix_db(); // TODO: eventually, we want to get rid of the uses from `orginal` - let original_word_2 = derivations2.original.clone(); let mut cost_proximity_word_pairs = BTreeMap::>>::new(); if updb2 { for word1 in derivations1.clone() { for proximity in 1..=(8 - ngram_len2) { let cost = (proximity + ngram_len2 - 1) as u8; - if db_cache + if ctx .get_word_prefix_pair_proximity_docids( - index, - txn, word1, - original_word_2.as_str(), + derivations2.original, proximity as u8, )? .is_some() @@ -129,16 +125,14 @@ pub fn visit_to_node<'transaction, 'from_data>( .entry(proximity as u8) .or_default() .push(WordPair::WordPrefix { - left: word1.to_owned(), - right_prefix: original_word_2.to_owned(), + left: word1, + right_prefix: derivations2.original, }); } - if db_cache + if ctx .get_prefix_word_pair_proximity_docids( - index, - txn, - original_word_2.as_str(), - word1.as_str(), + derivations2.original, + word1, proximity as u8 - 1, )? .is_some() @@ -149,8 +143,8 @@ pub fn visit_to_node<'transaction, 'from_data>( .entry(proximity as u8) .or_default() .push(WordPair::WordPrefixSwapped { - left_prefix: original_word_2.to_owned(), - right: word1.to_owned(), + left_prefix: derivations2.original, + right: word1, }); } } @@ -164,28 +158,23 @@ pub fn visit_to_node<'transaction, 'from_data>( for (word1, word2) in product_derivations { for proximity in 1..=(8 - ngram_len2) { let cost = (proximity + ngram_len2 - 1) as u8; - if db_cache - .get_word_pair_proximity_docids(index, txn, word1, word2, proximity as u8)? - .is_some() - { + if ctx.get_word_pair_proximity_docids(word1, word2, proximity as u8)?.is_some() { cost_proximity_word_pairs .entry(cost) .or_default() .entry(proximity as u8) .or_default() - .push(WordPair::Words { left: word1.to_owned(), right: word2.to_owned() }); + .push(WordPair::Words { left: word1, right: word2 }); } if proximity > 1 - && db_cache - .get_word_pair_proximity_docids(index, txn, word2, word1, proximity as u8 - 1)? 
- .is_some() + && ctx.get_word_pair_proximity_docids(word2, word1, proximity as u8 - 1)?.is_some() { cost_proximity_word_pairs .entry(cost) .or_default() .entry(proximity as u8 - 1) .or_default() - .push(WordPair::Words { left: word2.to_owned(), right: word1.to_owned() }); + .push(WordPair::Words { left: word2, right: word1 }); } } } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 34f7deea1..df289fb2c 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,14 +1,10 @@ -use heed::RoTxn; +use super::{ProximityEdge, WordPair}; +use crate::new::SearchContext; +use crate::{CboRoaringBitmapCodec, Result}; use roaring::{MultiOps, RoaringBitmap}; -use super::{ProximityEdge, WordPair}; -use crate::new::db_cache::DatabaseCache; -use crate::{CboRoaringBitmapCodec, Result}; - -pub fn compute_docids<'transaction>( - index: &crate::Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, +pub fn compute_docids<'search>( + ctx: &mut SearchContext<'search>, edge: &ProximityEdge, ) -> Result { let ProximityEdge { pairs, proximity } = edge; @@ -16,12 +12,14 @@ pub fn compute_docids<'transaction>( for pair in pairs.iter() { let bytes = match pair { WordPair::Words { left, right } => { - db_cache.get_word_pair_proximity_docids(index, txn, left, right, *proximity) + ctx.get_word_pair_proximity_docids(*left, *right, *proximity) + } + WordPair::WordPrefix { left, right_prefix } => { + ctx.get_word_prefix_pair_proximity_docids(*left, *right_prefix, *proximity) + } + WordPair::WordPrefixSwapped { left_prefix, right } => { + ctx.get_prefix_word_pair_proximity_docids(*left_prefix, *right, *proximity) } - WordPair::WordPrefix { left, right_prefix } => db_cache - .get_word_prefix_pair_proximity_docids(index, txn, left, right_prefix, *proximity), - WordPair::WordPrefixSwapped { left_prefix, right } => db_cache - .get_prefix_word_pair_proximity_docids(index, txn, left_prefix, right, *proximity), }?; let bitmap = bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index c0dbbefa9..ec1a7b5fa 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -1,25 +1,22 @@ pub mod build; pub mod compute_docids; -use heed::RoTxn; -use roaring::RoaringBitmap; - use super::empty_paths_cache::EmptyPathsCache; - use super::{EdgeDetails, RankingRuleGraphTrait}; -use crate::new::db_cache::DatabaseCache; +use crate::new::interner::Interned; use crate::new::logger::SearchLogger; use crate::new::query_term::WordDerivations; -use crate::new::{QueryGraph, QueryNode}; -use crate::{Index, Result}; +use crate::new::{QueryGraph, QueryNode, SearchContext}; +use crate::Result; +use roaring::RoaringBitmap; // TODO: intern the strings, refer to them by their pointer? 
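// A plausible shape for the `Interned`/`Interner` types used throughout this
// patch, inferred from their call sites (`insert`, `get`, cheap copyable ids).
// The real definition lives in the `new::interner` module, which is not part
// of this hunk, so treat this as an illustrative sketch only:
use fxhash::FxHashMap;
use std::hash::Hash;
use std::marker::PhantomData;

/// A stable, copyable index identifying one interned `T`.
pub struct Interned<T> {
    idx: u32,
    _phantom: PhantomData<T>,
}
// Implemented by hand so that `T` itself is not required to be `Copy`.
// (`PartialEq`/`Eq`/`Hash` would likewise compare and hash only `idx`.)
impl<T> Clone for Interned<T> {
    fn clone(&self) -> Self {
        *self
    }
}
impl<T> Copy for Interned<T> {}

pub struct Interner<T> {
    stable_store: Vec<T>,
    lookup: FxHashMap<T, u32>,
}
impl<T: Clone + Eq + Hash> Interner<T> {
    /// Return the id of `s`, interning it first if it was never seen.
    pub fn insert(&mut self, s: T) -> Interned<T> {
        let idx = match self.lookup.get(&s) {
            Some(&idx) => idx,
            None => {
                let idx = self.stable_store.len() as u32;
                self.stable_store.push(s.clone());
                self.lookup.insert(s, idx);
                idx
            }
        };
        Interned { idx, _phantom: PhantomData }
    }
    /// Resolve an id back to the value it stands for.
    pub fn get(&self, i: Interned<T>) -> &T {
        &self.stable_store[i.idx as usize]
    }
}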
-#[derive(Debug, Clone)] +#[derive(Clone)] pub enum WordPair { - Words { left: String, right: String }, - WordPrefix { left: String, right_prefix: String }, - WordPrefixSwapped { left_prefix: String, right: String }, + Words { left: Interned, right: Interned }, + WordPrefix { left: Interned, right_prefix: Interned }, + WordPrefixSwapped { left_prefix: Interned, right: Interned }, } #[derive(Clone)] @@ -40,32 +37,26 @@ impl RankingRuleGraphTrait for ProximityGraph { format!(", prox {proximity}, {} pairs", pairs.len()) } - fn compute_docids<'db_cache, 'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn compute_docids<'search>( + ctx: &mut SearchContext<'search>, edge: &Self::EdgeDetails, ) -> Result { - compute_docids::compute_docids(index, txn, db_cache, edge) + compute_docids::compute_docids(ctx, edge) } - fn build_visit_from_node<'transaction>( - _index: &Index, - _txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + fn build_visit_from_node<'search>( + ctx: &mut SearchContext<'search>, from_node: &QueryNode, ) -> Result> { - build::visit_from_node(from_node) + build::visit_from_node(ctx, from_node) } - fn build_visit_to_node<'from_data, 'transaction: 'from_data>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn build_visit_to_node<'from_data, 'search: 'from_data>( + ctx: &mut SearchContext<'search>, to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { - build::visit_to_node(index, txn, db_cache, to_node, from_node_data) + build::visit_to_node(ctx, to_node, from_node_data) } fn log_state( diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs index 94a51756e..b3e03d555 100644 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs @@ -1,23 +1,18 @@ #![allow(clippy::too_many_arguments)] -use heed::RoTxn; -use roaring::{MultiOps, RoaringBitmap}; - use super::edge_docids_cache::EdgeDocidsCache; use super::empty_paths_cache::EmptyPathsCache; - use super::{RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::db_cache::DatabaseCache; - -use crate::new::BitmapOrAllRef; -use crate::{Index, Result}; +use crate::new::{BitmapOrAllRef, SearchContext}; +use crate::Result; +use roaring::{MultiOps, RoaringBitmap}; impl RankingRuleGraph { - pub fn resolve_paths<'transaction>( + // TODO: reduce the universe after computing each path + // TODO: deserialize roaring bitmap within a universe + pub fn resolve_paths<'search>( &mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, edge_docids_cache: &mut EdgeDocidsCache, empty_paths_cache: &mut EmptyPathsCache, universe: &RoaringBitmap, @@ -52,8 +47,8 @@ impl RankingRuleGraph { let mut cached_edge_docids = vec![]; 'edge_loop: for edge_index in edge_indexes { visited_edges.push(edge_index); - let edge_docids = edge_docids_cache - .get_edge_docids(index, txn, db_cache, edge_index, self, universe)?; + let edge_docids = + edge_docids_cache.get_edge_docids(ctx, edge_index, self, universe)?; match edge_docids { BitmapOrAllRef::Bitmap(edge_docids) => { cached_edge_docids.push((edge_index, edge_docids.clone())); diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index c9ca7c229..d3771221f 100644 --- 
a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,19 +1,17 @@ -use heed::{BytesDecode, RoTxn}; -use roaring::RoaringBitmap; - use super::empty_paths_cache::EmptyPathsCache; - use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::db_cache::DatabaseCache; +use crate::new::interner::Interned; use crate::new::logger::SearchLogger; use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; use crate::new::resolve_query_graph::resolve_phrase; -use crate::new::{QueryGraph, QueryNode}; -use crate::{Index, Result, RoaringBitmapCodec}; +use crate::new::{QueryGraph, QueryNode, SearchContext}; +use crate::{Result, RoaringBitmapCodec}; +use heed::BytesDecode; +use roaring::RoaringBitmap; #[derive(Clone)] pub enum TypoEdge { - Phrase { phrase: Phrase }, + Phrase { phrase: Interned }, Word { derivations: WordDerivations, nbr_typos: u8 }, } @@ -30,14 +28,12 @@ impl RankingRuleGraphTrait for TypoGraph { } } - fn compute_docids<'db_cache, 'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn compute_docids<'db_cache, 'search>( + ctx: &mut SearchContext<'search>, edge: &Self::EdgeDetails, ) -> Result { match edge { - TypoEdge::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase), + TypoEdge::Phrase { phrase } => resolve_phrase(ctx, *phrase), TypoEdge::Word { derivations, nbr_typos } => { let words = match nbr_typos { 0 => &derivations.zero_typo, @@ -46,16 +42,14 @@ impl RankingRuleGraphTrait for TypoGraph { _ => panic!(), }; let mut docids = RoaringBitmap::new(); - for word in words.iter() { - let Some(bytes) = db_cache.get_word_docids(index, txn, word)? else { continue }; + for word in words.iter().copied() { + let Some(bytes) = ctx.get_word_docids(word)? else { continue }; let bitmap = RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; docids |= bitmap; } if *nbr_typos == 0 { - if let Some(bytes) = - db_cache.get_prefix_docids(index, txn, &derivations.original)? - { + if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? { let bitmap = RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; docids |= bitmap; @@ -66,26 +60,22 @@ impl RankingRuleGraphTrait for TypoGraph { } } - fn build_visit_from_node<'transaction>( - _index: &Index, - _txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + fn build_visit_from_node<'search>( + _ctx: &mut SearchContext<'search>, _from_node: &QueryNode, ) -> Result> { Ok(Some(())) } - fn build_visit_to_node<'from_data, 'transaction: 'from_data>( - _index: &Index, - _txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + fn build_visit_to_node<'from_data, 'search: 'from_data>( + _ctx: &mut SearchContext<'search>, to_node: &QueryNode, _from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { match to_node { QueryNode::Term(LocatedQueryTerm { value, .. 
}) => match value { - QueryTerm::Phrase { phrase } => { - Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase: phrase.clone() }))]) + &QueryTerm::Phrase { phrase } => { + Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase }))]) } QueryTerm::Word { derivations } => { let mut edges = vec![]; diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index cfa43c006..bfb9b5492 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -1,33 +1,28 @@ -use heed::RoTxn; -use roaring::RoaringBitmap; - -use super::db_cache::DatabaseCache; use super::logger::SearchLogger; - use super::QueryGraph; +use super::SearchContext; use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; use crate::new::ranking_rule_graph::ProximityGraph; use crate::new::ranking_rule_graph::TypoGraph; use crate::new::words::Words; +use roaring::RoaringBitmap; // use crate::search::new::sort::Sort; -use crate::{Index, Result, TermsMatchingStrategy}; +use crate::{Result, TermsMatchingStrategy}; -pub trait RankingRuleOutputIter<'transaction, Query> { +pub trait RankingRuleOutputIter<'search, Query> { fn next_bucket(&mut self) -> Result>>; } -pub struct RankingRuleOutputIterWrapper<'transaction, Query> { - iter: Box>> + 'transaction>, +pub struct RankingRuleOutputIterWrapper<'search, Query> { + iter: Box>> + 'search>, } -impl<'transaction, Query> RankingRuleOutputIterWrapper<'transaction, Query> { - pub fn new( - iter: Box>> + 'transaction>, - ) -> Self { +impl<'search, Query> RankingRuleOutputIterWrapper<'search, Query> { + pub fn new(iter: Box>> + 'search>) -> Self { Self { iter } } } -impl<'transaction, Query> RankingRuleOutputIter<'transaction, Query> - for RankingRuleOutputIterWrapper<'transaction, Query> +impl<'search, Query> RankingRuleOutputIter<'search, Query> + for RankingRuleOutputIterWrapper<'search, Query> { fn next_bucket(&mut self) -> Result>> { match self.iter.next() { @@ -44,7 +39,7 @@ pub struct PlaceholderQuery; impl RankingRuleQueryTrait for PlaceholderQuery {} impl RankingRuleQueryTrait for QueryGraph {} -pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { +pub trait RankingRule<'search, Query: RankingRuleQueryTrait> { fn id(&self) -> String; /// Prepare the ranking rule such that it can start iterating over its @@ -53,9 +48,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { /// The given universe is the universe that will be given to [`next_bucket`](RankingRule::next_bucket). fn start_iteration( &mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, logger: &mut dyn SearchLogger, universe: &RoaringBitmap, query: &Query, @@ -70,9 +63,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { /// - the universe given to [`start_iteration`](RankingRule::start_iteration) fn next_bucket( &mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>>; @@ -81,9 +72,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> { /// The next call to this ranking rule, if any, will be [`start_iteration`](RankingRule::start_iteration). 
fn end_iteration( &mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, logger: &mut dyn SearchLogger, ); } @@ -98,11 +87,9 @@ pub struct RankingRuleOutput { // TODO: can make it generic over the query type (either query graph or placeholder) fairly easily #[allow(clippy::too_many_arguments)] -pub fn apply_ranking_rules<'transaction>( - index: &Index, - txn: &'transaction heed::RoTxn, +pub fn apply_ranking_rules<'search>( + ctx: &mut SearchContext<'search>, // TODO: ranking rules parameter - db_cache: &mut DatabaseCache<'transaction>, query_graph: &QueryGraph, universe: &RoaringBitmap, from: usize, @@ -115,7 +102,7 @@ pub fn apply_ranking_rules<'transaction>( let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned()); let typo = &mut GraphBasedRankingRule::::new("typo".to_owned()); // TODO: ranking rules given as argument - let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> = + let mut ranking_rules: Vec<&mut dyn RankingRule<'search, QueryGraph>> = vec![words, typo, proximity /*sort*/]; logger.ranking_rules(&ranking_rules); @@ -126,7 +113,7 @@ pub fn apply_ranking_rules<'transaction>( let ranking_rules_len = ranking_rules.len(); logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe); - ranking_rules[0].start_iteration(index, txn, db_cache, logger, universe, query_graph)?; + ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?; let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len]; candidates[0] = universe.clone(); @@ -142,7 +129,7 @@ pub fn apply_ranking_rules<'transaction>( &candidates[cur_ranking_rule_index], ); candidates[cur_ranking_rule_index].clear(); - ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger); + ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger); if cur_ranking_rule_index == 0 { break; } else { @@ -206,7 +193,7 @@ pub fn apply_ranking_rules<'transaction>( continue; } - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else { + let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &candidates[cur_ranking_rule_index])? else { // TODO: add remaining candidates automatically here? 
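// The rule is exhausted for this universe: `back!()` ends the current rule's
// iteration and pops back to the parent ranking rule, which will then provide
// its own next bucket.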
back!(); continue; @@ -239,9 +226,7 @@ pub fn apply_ranking_rules<'transaction>( &candidates[cur_ranking_rule_index], ); ranking_rules[cur_ranking_rule_index].start_iteration( - index, - txn, - db_cache, + ctx, logger, &next_bucket.candidates, &next_bucket.query, @@ -255,9 +240,7 @@ pub fn apply_ranking_rules<'transaction>( mod tests { // use crate::allocator::ALLOC; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::new::db_cache::DatabaseCache; - use crate::new::execute_search; + use crate::new::{execute_search, SearchContext}; use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; @@ -269,55 +252,6 @@ mod tests { use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy}; - #[test] - fn execute_new_search() { - let index = TempIndex::new(); - index - .add_documents(documents!([ - { - "id": 7, - "text": "the super quick super brown fox jumps over", - }, - { - "id": 8, - "text": "the super quick brown fox jumps over", - }, - { - "id": 9, - "text": "the quick super brown fox jumps over", - }, - { - "id": 10, - "text": "the quick brown fox jumps over", - }, - { - "id": 11, - "text": "the quick brown fox jumps over the lazy dog", - }, - { - "id": 12, - "text": "the quick brown cat jumps over the lazy dog", - }, - ])) - .unwrap(); - let txn = index.read_txn().unwrap(); - let mut db_cache = DatabaseCache::default(); - - let results = execute_search( - &index, - &txn, - &mut db_cache, - "releases from poison by the government", - None, - 0, - 50, - &mut DefaultSearchLogger, - ) - .unwrap(); - - println!("{results:?}") - } - #[test] fn search_wiki_new() { let mut options = EnvOpenOptions::new(); @@ -331,24 +265,20 @@ mod tests { // loop { let start = Instant::now(); - let mut db_cache = DatabaseCache::default(); - - let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( - &index, - &txn, - &mut db_cache, + &mut SearchContext::new(&index, &txn), "releases from poison by the government", None, 0, 20, - // &mut DefaultSearchLogger, - &mut logger, + &mut DefaultSearchLogger, + // &mut logger, ) .unwrap(); - logger.write_d2_description(); + // logger.write_d2_description(); let elapsed = start.elapsed(); @@ -425,19 +355,15 @@ mod tests { let index = Index::new(options, "data_movies").unwrap(); let txn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&txn).unwrap().unwrap(); - let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + // let primary_key = index.primary_key(&txn).unwrap().unwrap(); + // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); // loop { let start = Instant::now(); - let mut db_cache = DatabaseCache::default(); - let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); - + let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( - &index, - &txn, - &mut db_cache, + &mut ctx, "releases from poison by the government", None, 0, @@ -447,24 +373,24 @@ mod tests { ) .unwrap(); - logger.write_d2_description(); + logger.write_d2_description(&mut ctx); let elapsed = start.elapsed(); - let ids = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|x| { - let obkv = &x.1; - let id = obkv.get(primary_key).unwrap(); - let id: 
serde_json::Value = serde_json::from_slice(id).unwrap(); - id.as_str().unwrap().to_owned() - }) - .collect::>(); + // let ids = index + // .documents(&txn, results.iter().copied()) + // .unwrap() + // .into_iter() + // .map(|x| { + // let obkv = &x.1; + // let id = obkv.get(primary_key).unwrap(); + // let id: serde_json::Value = serde_json::from_slice(id).unwrap(); + // id.as_str().unwrap().to_owned() + // }) + // .collect::>(); println!("{}us: {results:?}", elapsed.as_micros()); - println!("external ids: {ids:?}"); + // println!("external ids: {ids:?}"); // } } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 93ebcf989..de5cf02ab 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -1,34 +1,28 @@ -use std::collections::VecDeque; - -use fxhash::FxHashMap; -use heed::{BytesDecode, RoTxn}; -use roaring::{MultiOps, RoaringBitmap}; - -use super::db_cache::DatabaseCache; +use super::interner::Interned; use super::query_term::{Phrase, QueryTerm, WordDerivations}; -use super::{QueryGraph, QueryNode}; - -use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; +use super::{QueryGraph, QueryNode, SearchContext}; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; +use fxhash::FxHashMap; +use heed::BytesDecode; +use roaring::{MultiOps, RoaringBitmap}; +use std::collections::VecDeque; // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. #[derive(Default)] pub struct NodeDocIdsCache { pub cache: FxHashMap, } -impl NodeDocIdsCache { - fn get_docids<'cache, 'transaction>( +impl<'search> SearchContext<'search> { + fn get_node_docids<'cache>( &'cache mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, term: &QueryTerm, node_idx: u32, ) -> Result<&'cache RoaringBitmap> { - if self.cache.contains_key(&node_idx) { - return Ok(&self.cache[&node_idx]); + if self.node_docids_cache.cache.contains_key(&node_idx) { + return Ok(&self.node_docids_cache.cache[&node_idx]); }; let docids = match term { - QueryTerm::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase)?, + QueryTerm::Phrase { phrase } => resolve_phrase(self, *phrase)?, QueryTerm::Word { derivations: WordDerivations { @@ -42,15 +36,14 @@ impl NodeDocIdsCache { }, } => { let mut or_docids = vec![]; - for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) { - if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? { + for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()).copied() + { + if let Some(word_docids) = self.get_word_docids(word)? { or_docids.push(word_docids); } } if *use_prefix_db { - if let Some(prefix_docids) = - db_cache.get_prefix_docids(index, txn, original.as_str())? - { + if let Some(prefix_docids) = self.get_prefix_docids(*original)? { or_docids.push(prefix_docids); } } @@ -58,32 +51,25 @@ impl NodeDocIdsCache { .into_iter() .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()) .collect::>(); - for synonym in synonyms { + for synonym in synonyms.iter().copied() { // TODO: cache resolve_phrase? - docids.push(resolve_phrase(index, txn, db_cache, synonym)?); + docids.push(resolve_phrase(self, synonym)?); } - if let Some((left, right)) = split_words { - if let Some(split_word_docids) = - db_cache.get_word_pair_proximity_docids(index, txn, left, right, 1)? 
- { - docids.push(CboRoaringBitmapCodec::deserialize_from(split_word_docids)?); - } + if let Some(split_words) = split_words { + docids.push(resolve_phrase(self, *split_words)?); } MultiOps::union(docids) } }; - let _ = self.cache.insert(node_idx, docids); - let docids = &self.cache[&node_idx]; + let _ = self.node_docids_cache.cache.insert(node_idx, docids); + let docids = &self.node_docids_cache.cache[&node_idx]; Ok(docids) } } -pub fn resolve_query_graph<'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, - node_docids_cache: &mut NodeDocIdsCache, +pub fn resolve_query_graph<'search>( + ctx: &mut SearchContext<'search>, q: &QueryGraph, universe: &RoaringBitmap, ) -> Result { @@ -111,8 +97,7 @@ pub fn resolve_query_graph<'transaction>( let node_docids = match n { QueryNode::Term(located_term) => { let term = &located_term.value; - let derivations_docids = - node_docids_cache.get_docids(index, txn, db_cache, term, node)?; + let derivations_docids = ctx.get_node_docids(term, node)?; predecessors_docids & derivations_docids } QueryNode::Deleted => { @@ -143,13 +128,8 @@ pub fn resolve_query_graph<'transaction>( panic!() } -pub fn resolve_phrase<'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, - phrase: &Phrase, -) -> Result { - let Phrase { words } = phrase; +pub fn resolve_phrase(ctx: &mut SearchContext, phrase: Interned) -> Result { + let Phrase { words } = ctx.phrase_interner.get(phrase).clone(); let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = words.len().min(3); @@ -161,19 +141,19 @@ pub fn resolve_phrase<'transaction>( for win in words.windows(winsize) { // Get all the documents with the matching distance for each word pairs. let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win + for (offset, &s1) in win .iter() .enumerate() .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) { - for (dist, s2) in win + for (dist, &s2) in win .iter() .skip(offset + 1) .enumerate() .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) { if dist == 0 { - match db_cache.get_word_pair_proximity_docids(index, txn, s1, s2, 1)? { + match ctx.get_word_pair_proximity_docids(s1, s2, 1)? { Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?), // If there are no documents for this pair, there will be no // results for the phrase query. @@ -182,13 +162,9 @@ pub fn resolve_phrase<'transaction>( } else { let mut bitmap = RoaringBitmap::new(); for dist in 0..=dist { - if let Some(m) = db_cache.get_word_pair_proximity_docids( - index, - txn, - s1, - s2, - dist as u8 + 1, - )? { + if let Some(m) = + ctx.get_word_pair_proximity_docids(s1, s2, dist as u8 + 1)? 
+ { bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?; } } diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index 9ef01bd95..f0967843b 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -1,11 +1,7 @@ -use heed::RoTxn; -use roaring::RoaringBitmap; - -use super::db_cache::DatabaseCache; use super::logger::SearchLogger; use super::{ RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper, - RankingRuleQueryTrait, + RankingRuleQueryTrait, SearchContext, }; use crate::{ // facet::FacetType, @@ -15,18 +11,19 @@ use crate::{ Index, Result, }; +use roaring::RoaringBitmap; -pub struct Sort<'transaction, Query> { +pub struct Sort<'search, Query> { field_name: String, field_id: Option, is_ascending: bool, original_query: Option, - iter: Option>, + iter: Option>, } -impl<'transaction, Query> Sort<'transaction, Query> { - pub fn new( +impl<'search, Query> Sort<'search, Query> { + pub fn _new( index: &Index, - rtxn: &'transaction heed::RoTxn, + rtxn: &'search heed::RoTxn, field_name: String, is_ascending: bool, ) -> Result { @@ -37,18 +34,14 @@ impl<'transaction, Query> Sort<'transaction, Query> { } } -impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query> - for Sort<'transaction, Query> -{ +impl<'search, Query: RankingRuleQueryTrait> RankingRule<'search, Query> for Sort<'search, Query> { fn id(&self) -> String { let Self { field_name, is_ascending, .. } = self; format!("{field_name}:{}", if *is_ascending { "asc" } else { "desc " }) } fn start_iteration( &mut self, - index: &Index, - txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, _logger: &mut dyn SearchLogger, parent_candidates: &RoaringBitmap, parent_query_graph: &Query, @@ -59,8 +52,8 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query if self.is_ascending { ascending_facet_sort } else { descending_facet_sort }; let number_iter = make_iter( - txn, - index + ctx.txn, + ctx.index .facet_id_f64_docids .remap_key_type::>(), field_id, @@ -68,8 +61,8 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query )?; let string_iter = make_iter( - txn, - index + ctx.txn, + ctx.index .facet_id_string_docids .remap_key_type::>(), field_id, @@ -91,9 +84,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query fn next_bucket( &mut self, - _index: &Index, - _txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + _ctx: &mut SearchContext<'search>, _logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { @@ -110,9 +101,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query fn end_iteration( &mut self, - _index: &Index, - _txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + _ctx: &mut SearchContext<'search>, _logger: &mut dyn SearchLogger, ) { self.original_query = None; diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 10c0800ba..9ad8b33ba 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -1,13 +1,9 @@ -use std::collections::BTreeSet; - -use heed::RoTxn; -use roaring::RoaringBitmap; - -use super::db_cache::DatabaseCache; use super::logger::SearchLogger; -use super::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}; -use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput}; -use crate::{Index, Result, TermsMatchingStrategy}; +use 
super::resolve_query_graph::resolve_query_graph; +use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput, SearchContext}; +use crate::{Result, TermsMatchingStrategy}; +use roaring::RoaringBitmap; +use std::collections::BTreeSet; pub struct Words { exhausted: bool, @@ -15,7 +11,6 @@ pub struct Words { iterating: bool, positions_to_remove: Vec, terms_matching_strategy: TermsMatchingStrategy, - node_docids_cache: NodeDocIdsCache, } impl Words { pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self { @@ -25,20 +20,17 @@ impl Words { iterating: false, positions_to_remove: vec![], terms_matching_strategy, - node_docids_cache: <_>::default(), } } } -impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { +impl<'search> RankingRule<'search, QueryGraph> for Words { fn id(&self) -> String { "words".to_owned() } fn start_iteration( &mut self, - _index: &Index, - _txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + _ctx: &mut SearchContext<'search>, _logger: &mut dyn SearchLogger, _parent_candidates: &RoaringBitmap, parent_query_graph: &QueryGraph, @@ -71,9 +63,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { fn next_bucket( &mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { @@ -87,14 +77,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { logger.log_words_state(query_graph); - let this_bucket = resolve_query_graph( - index, - txn, - db_cache, - &mut self.node_docids_cache, - query_graph, - universe, - )?; + let this_bucket = resolve_query_graph(ctx, query_graph, universe)?; let child_query_graph = query_graph.clone(); loop { @@ -115,9 +98,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { fn end_iteration( &mut self, - _index: &Index, - _txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + _ctx: &mut SearchContext<'search>, _logger: &mut dyn SearchLogger, ) { self.iterating = false; From 9051065c22a90c4d181cc6d534b666f124627522 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 7 Mar 2023 14:42:58 +0100 Subject: [PATCH 042/234] Apply a few optimisations for graph-based ranking rules --- .../search/new/graph_based_ranking_rule.rs | 89 ++++-- milli/src/search/new/logger/detailed.rs | 24 +- milli/src/search/new/logger/mod.rs | 24 +- milli/src/search/new/mod.rs | 33 +-- milli/src/search/new/query_graph.rs | 51 ++-- .../search/new/ranking_rule_graph/build.rs | 42 ++- .../new/ranking_rule_graph/cheapest_paths.rs | 170 ++++++----- .../ranking_rule_graph/edge_docids_cache.rs | 17 +- .../ranking_rule_graph/empty_paths_cache.rs | 54 ++-- .../src/search/new/ranking_rule_graph/mod.rs | 49 +--- .../new/ranking_rule_graph/paths_map.rs | 45 ++- .../new/ranking_rule_graph/proximity/build.rs | 11 +- .../proximity/compute_docids.rs | 16 +- .../new/ranking_rule_graph/proximity/mod.rs | 14 +- .../new/ranking_rule_graph/resolve_paths.rs | 97 ------- .../search/new/ranking_rule_graph/typo/mod.rs | 18 +- milli/src/search/new/ranking_rules.rs | 82 +++--- milli/src/search/new/resolve_query_graph.rs | 13 +- milli/src/search/new/small_bitmap.rs | 271 ++++++++++++++++++ 19 files changed, 682 insertions(+), 438 deletions(-) delete mode 100644 milli/src/search/new/ranking_rule_graph/resolve_paths.rs create mode 100644 milli/src/search/new/small_bitmap.rs diff --git 
a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index d51fb6920..ac56b4f20 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -21,7 +21,7 @@ pub struct GraphBasedRankingRuleState { graph: RankingRuleGraph, edge_docids_cache: EdgeDocidsCache, empty_paths_cache: EmptyPathsCache, - all_distances: Vec>, + all_distances: Vec>, cur_distance_idx: usize, } @@ -32,14 +32,14 @@ fn remove_empty_edges<'search, G: RankingRuleGraphTrait>( universe: &RoaringBitmap, empty_paths_cache: &mut EmptyPathsCache, ) -> Result<()> { - for edge_index in 0..graph.all_edges.len() as u32 { + for edge_index in 0..graph.all_edges.len() as u16 { if graph.all_edges[edge_index as usize].is_none() { continue; } let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?; match docids { - BitmapOrAllRef::Bitmap(bitmap) => { - if bitmap.is_disjoint(universe) { + BitmapOrAllRef::Bitmap(docids) => { + if docids.is_disjoint(universe) { graph.remove_edge(edge_index); empty_paths_cache.forbid_edge(edge_index); edge_docids_cache.cache.remove(&edge_index); @@ -68,7 +68,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> // TODO: update old state instead of starting from scratch let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; let mut edge_docids_cache = EdgeDocidsCache::default(); - let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len()); + let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16); remove_empty_edges( ctx, @@ -118,31 +118,82 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx]; state.cur_distance_idx += 1; - let paths = state.graph.paths_of_cost( - state.graph.query_graph.root_node as usize, + let mut bucket = RoaringBitmap::new(); + + let GraphBasedRankingRuleState { + graph, + edge_docids_cache, + empty_paths_cache, + all_distances, + cur_distance_idx: _, + } = &mut state; + + let mut paths = vec![]; + let original_universe = universe; + let mut universe = universe.clone(); + + graph.visit_paths_of_cost( + graph.query_graph.root_node as usize, cost, - &state.all_distances, - &state.empty_paths_cache, - ); + all_distances, + empty_paths_cache, + |path, graph, empty_paths_cache| { + let mut path_docids = universe.clone(); + let mut visited_edges = vec![]; + let mut cached_edge_docids = vec![]; + for &edge_index in path { + visited_edges.push(edge_index); + let edge_docids = + edge_docids_cache.get_edge_docids(ctx, edge_index, graph, &universe)?; + let edge_docids = match edge_docids { + BitmapOrAllRef::Bitmap(b) => b, + BitmapOrAllRef::All => continue, + }; + cached_edge_docids.push((edge_index, edge_docids.clone())); + if edge_docids.is_disjoint(&universe) { + // 1. Store in the cache that this edge is empty for this universe + empty_paths_cache.forbid_edge(edge_index); + // 2. 
remove this edge from the ranking rule graph + graph.remove_edge(edge_index); + edge_docids_cache.cache.remove(&edge_index); + return Ok(()); + } + path_docids &= edge_docids; + + if path_docids.is_disjoint(&universe) { + empty_paths_cache.forbid_prefix(&visited_edges); + // if the intersection between this edge and any + // previous one is disjoint with the universe, + // then we add these two edges to the empty_path_cache + for (edge_index2, edge_docids2) in + cached_edge_docids[..cached_edge_docids.len() - 1].iter() + { + let intersection = edge_docids & edge_docids2; + if intersection.is_disjoint(&universe) { + // needs_filtering_empty_couple_edges = true; + empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index); + } + } + return Ok(()); + } + } + paths.push(path.to_vec()); + bucket |= &path_docids; + universe -= path_docids; + Ok(()) + }, + )?; G::log_state( &state.graph, &paths, &state.empty_paths_cache, - universe, + original_universe, &state.all_distances, cost, logger, ); - let bucket = state.graph.resolve_paths( - ctx, - &mut state.edge_docids_cache, - &mut state.empty_paths_cache, - universe, - paths, - )?; - let next_query_graph = state.graph.query_graph.clone(); self.state = Some(state); diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 76c3f8977..10b5e7097 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -42,19 +42,19 @@ pub enum SearchEvents { }, ProximityState { graph: RankingRuleGraph, - paths: Vec>, + paths: Vec>, empty_paths_cache: EmptyPathsCache, universe: RoaringBitmap, - distances: Vec>, - cost: u64, + distances: Vec>, + cost: u16, }, TypoState { graph: RankingRuleGraph, - paths: Vec>, + paths: Vec>, empty_paths_cache: EmptyPathsCache, universe: RoaringBitmap, - distances: Vec>, - cost: u64, + distances: Vec>, + cost: u16, }, RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant }, } @@ -165,11 +165,11 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); } - fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u64,) { + fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } - fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u64,) { + fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } @@ -352,7 +352,7 @@ results.{random} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) { + fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u16], file: 
&mut File) { match &node { QueryNode::Term(LocatedQueryTerm { value, .. }) => { match value { @@ -420,7 +420,7 @@ shape: class").unwrap(); } } } - fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { + fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -477,7 +477,7 @@ shape: class").unwrap(); // } // writeln!(file, "}}").unwrap(); } - fn edge_d2_description(ctx: &mut SearchContext,graph: &RankingRuleGraph, edge_idx: u32, file: &mut File) { + fn edge_d2_description(ctx: &mut SearchContext,graph: &RankingRuleGraph, edge_idx: u16, file: &mut File) { let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { @@ -511,7 +511,7 @@ shape: class").unwrap(); shape: class }}").unwrap(); } - fn paths_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { + fn paths_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { for (path_idx, edge_indexes) in paths.iter().enumerate() { writeln!(file, "{path_idx} {{").unwrap(); for edge_idx in edge_indexes.iter() { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 9a141c1c6..bf78e4de0 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -58,22 +58,22 @@ impl SearchLogger for DefaultSearchLogger { fn log_proximity_state( &mut self, _query_graph: &RankingRuleGraph, - _paths_map: &[Vec], + _paths_map: &[Vec], _empty_paths_cache: &EmptyPathsCache, _universe: &RoaringBitmap, - _distances: Vec>, - _cost: u64, + _distances: Vec>, + _cost: u16, ) { } fn log_typo_state( &mut self, _query_graph: &RankingRuleGraph, - _paths: &[Vec], + _paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, _universe: &RoaringBitmap, - _distances: Vec>, - _cost: u64, + _distances: Vec>, + _cost: u16, ) { } } @@ -120,20 +120,20 @@ pub trait SearchLogger { fn log_proximity_state( &mut self, query_graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - _distances: Vec>, - cost: u64, + distances: Vec>, + cost: u16, ); fn log_typo_state( &mut self, query_graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - _distances: Vec>, - cost: u64, + distances: Vec>, + cost: u16, ); } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 0feef1f60..dc73fe51c 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -7,31 +7,26 @@ mod query_term; mod ranking_rule_graph; mod ranking_rules; mod resolve_query_graph; +mod small_bitmap; mod sort; mod words; -use std::collections::BTreeSet; - -pub use ranking_rules::{ - apply_ranking_rules, RankingRule, RankingRuleOutput, RankingRuleOutputIter, - RankingRuleOutputIterWrapper, RankingRuleQueryTrait, -}; - -use crate::{ - new::query_term::located_query_terms_from_string, Filter, Index, Result, TermsMatchingStrategy, -}; +use self::interner::Interner; +use self::logger::SearchLogger; +use self::query_term::Phrase; +use 
self::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}; +use crate::new::query_term::located_query_terms_from_string; +use crate::{Filter, Index, Result, TermsMatchingStrategy}; use charabia::Tokenize; use db_cache::DatabaseCache; use heed::RoTxn; use query_graph::{QueryGraph, QueryNode}; -use roaring::RoaringBitmap; - -use self::{ - interner::Interner, - logger::SearchLogger, - query_term::Phrase, - resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}, +pub use ranking_rules::{ + apply_ranking_rules, RankingRule, RankingRuleOutput, RankingRuleOutputIter, + RankingRuleOutputIterWrapper, RankingRuleQueryTrait, }; +use roaring::RoaringBitmap; +use std::collections::BTreeSet; pub enum BitmapOrAllRef<'s> { Bitmap(&'s RoaringBitmap), @@ -109,7 +104,7 @@ pub fn execute_search<'search>( logger: &mut dyn SearchLogger, ) -> Result> { assert!(!query.is_empty()); - let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None).unwrap(); + let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None)?; let graph = QueryGraph::from_query(ctx, query_terms)?; logger.initial_query(&graph); @@ -127,7 +122,7 @@ pub fn execute_search<'search>( TermsMatchingStrategy::Last, logger, )?; - // TODO: create ranking rules here, reuse the node docids cache for the words ranking rule + // TODO: create ranking rules here logger.initial_universe(&universe); diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 449b6536c..b879b2c15 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,8 +1,7 @@ use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations}; +use super::small_bitmap::SmallBitmap; use super::SearchContext; use crate::Result; -use roaring::RoaringBitmap; -use std::fmt::Debug; #[derive(Clone)] pub enum QueryNode { @@ -12,17 +11,17 @@ pub enum QueryNode { End, } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct Edges { // TODO: use a tiny bitset instead, something like a simple Vec where most queries will see a vector of one element - pub predecessors: RoaringBitmap, - pub successors: RoaringBitmap, + pub predecessors: SmallBitmap, + pub successors: SmallBitmap, } #[derive(Clone)] pub struct QueryGraph { - pub root_node: u32, - pub end_node: u32, + pub root_node: u16, + pub end_node: u16, pub nodes: Vec, pub edges: Vec, } @@ -30,7 +29,7 @@ pub struct QueryGraph { fn _assert_sizes() { // TODO: QueryNodes are too big now, 88B is a bit too big let _: [u8; 88] = [0; std::mem::size_of::()]; - let _: [u8; 48] = [0; std::mem::size_of::()]; + let _: [u8; 32] = [0; std::mem::size_of::()]; } impl Default for QueryGraph { @@ -38,8 +37,8 @@ impl Default for QueryGraph { fn default() -> Self { let nodes = vec![QueryNode::Start, QueryNode::End]; let edges = vec![ - Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }, - Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }, + Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }, + Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }, ]; Self { root_node: 0, end_node: 1, nodes, edges } @@ -47,18 +46,18 @@ impl Default for QueryGraph { } impl QueryGraph { - fn connect_to_node(&mut self, from_nodes: &[u32], to_node: u32) { + fn connect_to_node(&mut self, from_nodes: &[u16], to_node: u16) { for &from_node in from_nodes { self.edges[from_node as usize].successors.insert(to_node); self.edges[to_node as 
usize].predecessors.insert(from_node); } } - fn add_node(&mut self, from_nodes: &[u32], node: QueryNode) -> u32 { - let new_node_idx = self.nodes.len() as u32; + fn add_node(&mut self, from_nodes: &[u16], node: QueryNode) -> u16 { + let new_node_idx = self.nodes.len() as u16; self.nodes.push(node); self.edges.push(Edges { - predecessors: from_nodes.iter().collect(), - successors: RoaringBitmap::new(), + predecessors: SmallBitmap::from_array(from_nodes, 64), + successors: SmallBitmap::new(64), }); for from_node in from_nodes { self.edges[*from_node as usize].successors.insert(new_node_idx); @@ -79,7 +78,7 @@ impl QueryGraph { let word_set = ctx.index.words_fst(ctx.txn)?; let mut graph = QueryGraph::default(); - let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = + let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![graph.root_node]); // TODO: split words / synonyms @@ -157,40 +156,40 @@ impl QueryGraph { Ok(graph) } - pub fn remove_nodes(&mut self, nodes: &[u32]) { + pub fn remove_nodes(&mut self, nodes: &[u16]) { for &node in nodes { self.nodes[node as usize] = QueryNode::Deleted; let edges = self.edges[node as usize].clone(); for pred in edges.predecessors.iter() { self.edges[pred as usize].successors.remove(node); } - for succ in edges.successors { + for succ in edges.successors.iter() { self.edges[succ as usize].predecessors.remove(node); } self.edges[node as usize] = - Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }; + Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }; } } - pub fn remove_nodes_keep_edges(&mut self, nodes: &[u32]) { + pub fn remove_nodes_keep_edges(&mut self, nodes: &[u16]) { for &node in nodes { self.nodes[node as usize] = QueryNode::Deleted; let edges = self.edges[node as usize].clone(); for pred in edges.predecessors.iter() { self.edges[pred as usize].successors.remove(node); - self.edges[pred as usize].successors |= &edges.successors; + self.edges[pred as usize].successors.union(&edges.successors); } - for succ in edges.successors { + for succ in edges.successors.iter() { self.edges[succ as usize].predecessors.remove(node); - self.edges[succ as usize].predecessors |= &edges.predecessors; + self.edges[succ as usize].predecessors.union(&edges.predecessors); } self.edges[node as usize] = - Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }; + Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }; } } pub fn remove_words_at_position(&mut self, position: i8) -> bool { let mut nodes_to_remove_keeping_edges = vec![]; for (node_idx, node) in self.nodes.iter().enumerate() { - let node_idx = node_idx as u32; + let node_idx = node_idx as u16; let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue }; if positions.start() == &position { nodes_to_remove_keeping_edges.push(node_idx); @@ -212,7 +211,7 @@ impl QueryGraph { || (!matches!(node, QueryNode::Start | QueryNode::Deleted) && self.edges[node_idx].predecessors.is_empty()) { - nodes_to_remove.push(node_idx as u32); + nodes_to_remove.push(node_idx as u16); } } if nodes_to_remove.is_empty() { diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 03a7f6c9d..261f2909b 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,40 +1,54 @@ +use std::collections::HashSet; + use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; 
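// Aside -- a minimal sketch (not part of the patch) of the two-phase build
// used below: per-node edge sets are first collected into growable HashSets,
// because the bitmap width -- the total edge count -- is only known once
// every edge has been pushed; they are then frozen into fixed-width
// SmallBitmaps, using the `SmallBitmap::from_iter` introduced in this series:
//
//     fn freeze(sets: Vec<HashSet<u16>>, n_edges: u16) -> Vec<SmallBitmap> {
//         sets.into_iter()
//             .map(|s| SmallBitmap::from_iter(s.into_iter(), n_edges))
//             .collect()
//     }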
+use crate::new::small_bitmap::SmallBitmap; use crate::new::{QueryGraph, SearchContext}; use crate::Result; -use roaring::RoaringBitmap; impl RankingRuleGraph { pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { - let mut ranking_rule_graph = - Self { query_graph, all_edges: vec![], node_edges: vec![], successors: vec![] }; + let QueryGraph { nodes: graph_nodes, edges: graph_edges, .. } = &query_graph; - for (node_idx, node) in ranking_rule_graph.query_graph.nodes.iter().enumerate() { - ranking_rule_graph.node_edges.push(RoaringBitmap::new()); - ranking_rule_graph.successors.push(RoaringBitmap::new()); - let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap(); - let new_successors = ranking_rule_graph.successors.last_mut().unwrap(); + let mut all_edges = vec![]; + let mut node_edges = vec![]; + let mut successors = vec![]; + + for (node_idx, node) in graph_nodes.iter().enumerate() { + node_edges.push(HashSet::new()); + successors.push(HashSet::new()); + let new_edges = node_edges.last_mut().unwrap(); + let new_successors = successors.last_mut().unwrap(); let Some(from_node_data) = G::build_visit_from_node(ctx, node)? else { continue }; - for successor_idx in ranking_rule_graph.query_graph.edges[node_idx].successors.iter() { - let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx as usize]; + for successor_idx in graph_edges[node_idx].successors.iter() { + let to_node = &graph_nodes[successor_idx as usize]; let mut edges = G::build_visit_to_node(ctx, to_node, &from_node_data)?; if edges.is_empty() { continue; } edges.sort_by_key(|e| e.0); for (cost, details) in edges { - ranking_rule_graph.all_edges.push(Some(Edge { - from_node: node_idx as u32, + all_edges.push(Some(Edge { + from_node: node_idx as u16, to_node: successor_idx, cost, details, })); - new_edges.insert(ranking_rule_graph.all_edges.len() as u32 - 1); + new_edges.insert(all_edges.len() as u16 - 1); new_successors.insert(successor_idx); } } } - Ok(ranking_rule_graph) + let node_edges = node_edges + .into_iter() + .map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16)) + .collect(); + let successors = successors + .into_iter() + .map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16)) + .collect(); + + Ok(RankingRuleGraph { query_graph, all_edges, node_edges, successors }) } } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 2377f1c84..14afd83d0 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -2,124 +2,146 @@ use super::empty_paths_cache::EmptyPathsCache; use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use crate::new::small_bitmap::SmallBitmap; +use crate::Result; use std::collections::VecDeque; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Path { - pub edges: Vec, + pub edges: Vec, pub cost: u64, } impl RankingRuleGraph { - pub fn paths_of_cost( - &self, + pub fn visit_paths_of_cost( + &mut self, from: usize, - cost: u64, - all_distances: &[Vec], - empty_paths_cache: &EmptyPathsCache, - ) -> Vec> { - let mut paths = vec![]; - self.paths_of_cost_rec( + cost: u16, + all_distances: &[Vec], + empty_paths_cache: &mut EmptyPathsCache, + mut visit: impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, + ) -> Result<()> { + let _ = self.visit_paths_of_cost_rec( from, - all_distances, cost, - &mut vec![], - &mut paths, - &vec![false; 
self.all_edges.len()], + all_distances, empty_paths_cache, - ); - paths + &mut visit, + &mut vec![], + &mut SmallBitmap::new(self.all_edges.len() as u16), + empty_paths_cache.empty_edges.clone(), + )?; + Ok(()) } - pub fn paths_of_cost_rec( - &self, + pub fn visit_paths_of_cost_rec( + &mut self, from: usize, - all_distances: &[Vec], - cost: u64, - prev_edges: &mut Vec, - paths: &mut Vec>, - forbidden_edges: &[bool], - empty_paths_cache: &EmptyPathsCache, - ) { - let distances = &all_distances[from]; - if !distances.contains(&cost) { - panic!(); - } - let tos = &self.query_graph.edges[from].successors; - let mut valid_edges = vec![]; - for to in tos { - self.visit_edges::<()>(from as u32, to, |edge_idx, edge| { - if cost >= edge.cost as u64 - && all_distances[to as usize].contains(&(cost - edge.cost as u64)) - && !forbidden_edges[edge_idx as usize] - { - valid_edges.push((edge_idx, edge.cost, to)); - } - std::ops::ControlFlow::Continue(()) - }); - } + cost: u16, + // TODO: replace all_distances with a Vec where the SmallBitmap contains true if the cost exists and false otherwise + all_distances: &[Vec], + empty_paths_cache: &mut EmptyPathsCache, + visit: &mut impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, + // replace prev edges by: + // (1) a small bitmap representing the path + // (2) a pointer within the EmptyPathsCache::forbidden_prefixes structure + prev_edges: &mut Vec, + cur_path: &mut SmallBitmap, + mut forbidden_edges: SmallBitmap, + ) -> Result { + let mut any_valid = false; - for (edge_idx, edge_cost, to) in valid_edges { - prev_edges.push(edge_idx); - if empty_paths_cache.empty_prefixes.contains_prefix_of_path(prev_edges) { + let edges = self.node_edges[from].clone(); + for edge_idx in edges.iter() { + let Some(edge) = self.all_edges[edge_idx as usize].as_ref() else { continue }; + if cost < edge.cost as u16 + || forbidden_edges.contains(edge_idx) + || !all_distances[edge.to_node as usize].contains(&(cost - edge.cost as u16)) + { continue; } - let mut new_forbidden_edges = forbidden_edges.to_vec(); - for edge_idx in empty_paths_cache.empty_couple_edges[edge_idx as usize].iter() { - new_forbidden_edges[*edge_idx as usize] = true; - } - for edge_idx in empty_paths_cache.empty_prefixes.final_edges_ater_prefix(prev_edges) { - new_forbidden_edges[edge_idx as usize] = true; - } + cur_path.insert(edge_idx); + prev_edges.push(edge_idx); - if to == self.query_graph.end_node { - paths.push(prev_edges.clone()); + let mut new_forbidden_edges = forbidden_edges.clone(); + new_forbidden_edges.union(&empty_paths_cache.empty_couple_edges[edge_idx as usize]); + empty_paths_cache.empty_prefixes.final_edges_after_prefix(prev_edges, &mut |x| { + new_forbidden_edges.insert(x); + }); + + let next_any_valid = if edge.to_node == self.query_graph.end_node { + any_valid = true; + visit(prev_edges, self, empty_paths_cache)?; + true } else { - self.paths_of_cost_rec( - to as usize, + self.visit_paths_of_cost_rec( + edge.to_node as usize, + cost - edge.cost as u16, all_distances, - cost - edge_cost as u64, - prev_edges, - paths, - &new_forbidden_edges, empty_paths_cache, - ) - } + visit, + prev_edges, + cur_path, + new_forbidden_edges, + )? 
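+                // Note: `cur_path` mirrors `prev_edges` as a bitmap -- the Vec
+                // gives the ordered prefix needed for the trie lookups, while
+                // the bitmap gives cheap intersection tests in `path_is_empty`.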
+ }; + any_valid |= next_any_valid; + cur_path.remove(edge_idx); prev_edges.pop(); + if next_any_valid { + if empty_paths_cache.path_is_empty(prev_edges, cur_path) { + return Ok(any_valid); + } + forbidden_edges.union(&empty_paths_cache.empty_edges); + for edge in prev_edges.iter() { + forbidden_edges.union(&empty_paths_cache.empty_couple_edges[*edge as usize]); + } + empty_paths_cache.empty_prefixes.final_edges_after_prefix(prev_edges, &mut |x| { + forbidden_edges.insert(x); + }); + } + if next_any_valid && empty_paths_cache.path_is_empty(prev_edges, cur_path) { + return Ok(any_valid); + } } + + Ok(any_valid) } - pub fn initialize_distances_cheapest(&self) -> Vec> { - let mut distances_to_end: Vec> = vec![vec![]; self.query_graph.nodes.len()]; - let mut enqueued = vec![false; self.query_graph.nodes.len()]; + pub fn initialize_distances_cheapest(&self) -> Vec> { + let mut distances_to_end: Vec> = vec![vec![]; self.query_graph.nodes.len()]; + let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len() as u16); let mut node_stack = VecDeque::new(); distances_to_end[self.query_graph.end_node as usize] = vec![0]; + for prev_node in self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter() { node_stack.push_back(prev_node as usize); - enqueued[prev_node as usize] = true; + enqueued.insert(prev_node); } while let Some(cur_node) = node_stack.pop_front() { let mut self_distances = vec![]; - for succ_node in self.query_graph.edges[cur_node].successors.iter() { + + let cur_node_edges = &self.node_edges[cur_node]; + for edge_idx in cur_node_edges.iter() { + let edge = self.all_edges[edge_idx as usize].as_ref().unwrap(); + let succ_node = edge.to_node; let succ_distances = &distances_to_end[succ_node as usize]; - let _ = self.visit_edges::<()>(cur_node as u32, succ_node, |_, edge| { - for succ_distance in succ_distances { - self_distances.push(edge.cost as u64 + succ_distance); - } - std::ops::ControlFlow::Continue(()) - }); + for succ_distance in succ_distances { + self_distances.push(edge.cost as u16 + succ_distance); + } } + self_distances.sort_unstable(); self_distances.dedup(); distances_to_end[cur_node] = self_distances; for prev_node in self.query_graph.edges[cur_node].predecessors.iter() { - if !enqueued[prev_node as usize] { + if !enqueued.contains(prev_node) { node_stack.push_back(prev_node as usize); - enqueued[prev_node as usize] = true; + enqueued.insert(prev_node); } } } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 21c186f3c..13ee03a22 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -11,9 +11,20 @@ use roaring::RoaringBitmap; // computing their hash and comparing them // which can be done... // by using a pointer (real, Rc, bumpalo, or in a vector)??? +// +// But actually.... the edge details' docids are a subset of the universe at the +// moment they were computed. +// But the universes between two iterations of a ranking rule are completely different +// Thus, there is no point in doing this. +// UNLESS... 
+// we compute the whole docids corresponding to the edge details (potentially expensive in time and memory +// in the common case) +// +// But we could still benefit within a single iteration for requests like: +// `a a a a a a a a a` where we have many of the same edge details, repeated pub struct EdgeDocidsCache { - pub cache: FxHashMap, + pub cache: FxHashMap, _phantom: PhantomData, } impl Default for EdgeDocidsCache { @@ -25,7 +36,7 @@ impl EdgeDocidsCache { pub fn get_edge_docids<'s, 'search>( &'s mut self, ctx: &mut SearchContext<'search>, - edge_index: u32, + edge_index: u16, graph: &RankingRuleGraph, // TODO: maybe universe doesn't belong here universe: &RoaringBitmap, @@ -41,7 +52,7 @@ impl EdgeDocidsCache { return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here - let docids = universe & G::compute_docids(ctx, details)?; + let docids = universe & G::compute_docids(ctx, details, universe)?; let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; Ok(BitmapOrAllRef::Bitmap(docids)) diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index bbfe2eedd..3c8fb5184 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,60 +1,48 @@ +use crate::new::small_bitmap::SmallBitmap; + use super::paths_map::PathsMap; #[derive(Clone)] pub struct EmptyPathsCache { - pub empty_edges: Vec, + pub empty_edges: SmallBitmap, pub empty_prefixes: PathsMap<()>, - pub empty_couple_edges: Vec>, + pub empty_couple_edges: Vec, } impl EmptyPathsCache { - pub fn new(all_edges_len: usize) -> Self { + pub fn new(all_edges_len: u16) -> Self { Self { - empty_edges: vec![false; all_edges_len], + empty_edges: SmallBitmap::new(all_edges_len), empty_prefixes: PathsMap::default(), - empty_couple_edges: vec![vec![]; all_edges_len], + empty_couple_edges: vec![SmallBitmap::new(all_edges_len); all_edges_len as usize], } } - pub fn forbid_edge(&mut self, edge_idx: u32) { - self.empty_edges[edge_idx as usize] = true; - self.empty_couple_edges[edge_idx as usize] = vec![]; + pub fn forbid_edge(&mut self, edge_idx: u16) { + self.empty_edges.insert(edge_idx); + self.empty_couple_edges[edge_idx as usize].clear(); self.empty_prefixes.remove_edge(&edge_idx); for edges2 in self.empty_couple_edges.iter_mut() { - if let Some(edge2_pos) = edges2.iter().position(|e| *e == edge_idx) { - edges2.swap_remove(edge2_pos); - } + edges2.remove(edge_idx); } } - pub fn forbid_prefix(&mut self, prefix: &[u32]) { + pub fn forbid_prefix(&mut self, prefix: &[u16]) { self.empty_prefixes.insert(prefix.iter().copied(), ()); } - pub fn forbid_couple_edges(&mut self, edge1: u32, edge2: u32) { - assert!(!self.empty_couple_edges[edge1 as usize].contains(&edge2)); - self.empty_couple_edges[edge1 as usize].push(edge2); + pub fn forbid_couple_edges(&mut self, edge1: u16, edge2: u16) { + self.empty_couple_edges[edge1 as usize].insert(edge2); } - pub fn path_is_empty(&self, path: &[u32]) -> bool { - for edge in path { - if self.empty_edges[*edge as usize] { + pub fn path_is_empty(&self, path: &[u16], path_bitmap: &SmallBitmap) -> bool { + if path_bitmap.intersects(&self.empty_edges) { + return true; + } + for edge in path.iter() { + let forbidden_other_edges = &self.empty_couple_edges[*edge as usize]; + if path_bitmap.intersects(forbidden_other_edges) { return true; } } if 
self.empty_prefixes.contains_prefix_of_path(path) { return true; } - for (edge1, edges2) in self.empty_couple_edges.iter().enumerate() { - if let Some(pos_edge1) = path.iter().position(|e| *e == edge1 as u32) { - if path[pos_edge1..].iter().any(|e| edges2.contains(e)) { - return true; - } - } - } - // for (edge1, edge2) in self.empty_couple_edges.iter() { - // if path.contains(edge1) && path.contains(edge2) { - // return true; - // } - // } - // if self.empty_prefixes.contains_prefix_of_path(path) { - // return true; - // } false } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 446c4e248..989986159 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -4,17 +4,16 @@ mod edge_docids_cache; mod empty_paths_cache; mod paths_map; mod proximity; -mod resolve_paths; mod typo; use super::logger::SearchLogger; +use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::Result; pub use edge_docids_cache::EdgeDocidsCache; pub use empty_paths_cache::EmptyPathsCache; pub use proximity::ProximityGraph; use roaring::RoaringBitmap; -use std::ops::ControlFlow; pub use typo::TypoGraph; #[derive(Debug, Clone)] @@ -25,15 +24,15 @@ pub enum EdgeDetails { #[derive(Debug, Clone)] pub struct Edge { - pub from_node: u32, - pub to_node: u32, + pub from_node: u16, + pub to_node: u16, pub cost: u8, pub details: EdgeDetails, } #[derive(Debug, Clone)] pub struct EdgePointer<'graph, E> { - pub index: u32, + pub index: u16, pub edge: &'graph Edge, } @@ -95,6 +94,7 @@ pub trait RankingRuleGraphTrait: Sized { fn compute_docids<'search>( ctx: &mut SearchContext<'search>, edge_details: &Self::EdgeDetails, + universe: &RoaringBitmap, ) -> Result; /// Prepare to build the edges outgoing from `from_node`. @@ -116,11 +116,11 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], - cost: u64, + distances: &[Vec], + cost: u16, logger: &mut dyn SearchLogger, ); } @@ -130,9 +130,9 @@ pub struct RankingRuleGraph { // pub edges: Vec>>>, pub all_edges: Vec>>, - pub node_edges: Vec, + pub node_edges: Vec, - pub successors: Vec, + pub successors: Vec, // TODO: to get the edges between two nodes: // 1. get node_outgoing_edges[from] // 2. get node_incoming_edges[to] @@ -149,29 +149,7 @@ impl Clone for RankingRuleGraph { } } impl RankingRuleGraph { - // Visit all edges between the two given nodes in order of increasing cost. 
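// (The generic visitor below is dropped: with `node_edges` stored as one
// SmallBitmap per node, callers now iterate the bitmap and index into
// `all_edges` directly -- roughly, as `initialize_distances_cheapest` does:
//
//     for edge_idx in self.node_edges[from as usize].iter() {
//         let Some(edge) = self.all_edges[edge_idx as usize].as_ref() else { continue };
//         if edge.to_node == to { /* visit (edge_idx, edge) */ }
//     }
// )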
- pub fn visit_edges<'graph, O>( - &'graph self, - from: u32, - to: u32, - mut visit: impl FnMut(u32, &'graph Edge) -> ControlFlow, - ) -> Option { - let from_edges = &self.node_edges[from as usize]; - for edge_idx in from_edges { - let edge = self.all_edges[edge_idx as usize].as_ref().unwrap(); - if edge.to_node == to { - let cf = visit(edge_idx, edge); - match cf { - ControlFlow::Continue(_) => continue, - ControlFlow::Break(o) => return Some(o), - } - } - } - - None - } - - pub fn remove_edge(&mut self, edge_index: u32) { + pub fn remove_edge(&mut self, edge_index: u16) { let edge_opt = &mut self.all_edges[edge_index as usize]; let Some(edge) = &edge_opt else { return }; let (from_node, _to_node) = (edge.from_node, edge.to_node); @@ -180,9 +158,10 @@ impl RankingRuleGraph { let from_node_edges = &mut self.node_edges[from_node as usize]; from_node_edges.remove(edge_index); - let mut new_successors_from_node = RoaringBitmap::new(); + let mut new_successors_from_node = SmallBitmap::new(self.all_edges.len() as u16); + let all_edges = &self.all_edges; for from_node_edge in from_node_edges.iter() { - let Edge { to_node, .. } = &self.all_edges[from_node_edge as usize].as_ref().unwrap(); + let Edge { to_node, .. } = &all_edges[from_node_edge as usize].as_ref().unwrap(); new_successors_from_node.insert(*to_node); } self.successors[from_node as usize] = new_successors_from_node; diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index b9d089efc..0cce9c93f 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -1,9 +1,4 @@ - - - - -use roaring::RoaringBitmap; - +use crate::new::small_bitmap::SmallBitmap; use super::cheapest_paths::Path; // What is PathsMap used for? 
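// In short: a prefix tree keyed by edge indices, so stored paths that share
// a prefix share trie nodes and prefix queries are cheap. A rough usage
// sketch, assuming the API kept below:
//
//     let mut map: PathsMap<()> = PathsMap::default();
//     map.insert([1u16, 2, 3].into_iter(), ());
//     map.insert([1u16, 2, 4].into_iter(), ());
//     // [1, 2, 3] is a stored path and a prefix of the query, so:
//     assert!(map.contains_prefix_of_path(&[1, 2, 3, 9]));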
@@ -13,7 +8,7 @@ use super::cheapest_paths::Path;
 
 #[derive(Debug, Clone)]
 pub struct PathsMap<V> {
-    pub nodes: Vec<(u32, PathsMap<V>)>,
+    pub nodes: Vec<(u16, PathsMap<V>)>,
     pub value: Option<V>,
 }
 impl<V> Default for PathsMap<V> {
@@ -39,7 +34,7 @@ impl<V> PathsMap<V> {
         self.nodes.is_empty() && self.value.is_none()
     }
 
-    pub fn insert(&mut self, mut edges: impl Iterator<Item = u32>, value: V) {
+    pub fn insert(&mut self, mut edges: impl Iterator<Item = u16>, value: V) {
         match edges.next() {
             None => {
                 self.value = Some(value);
@@ -57,7 +52,7 @@ impl<V> PathsMap<V> {
             }
         }
     }
-    fn remove_first_rec(&mut self, cur: &mut Vec<u32>) -> (bool, V) {
+    fn remove_first_rec(&mut self, cur: &mut Vec<u16>) -> (bool, V) {
         let Some((first_edge, rest)) = self.nodes.first_mut() else {
             // The PathsMap has to be correct by construction here, otherwise
             // the unwrap() will crash
@@ -72,7 +67,7 @@ impl<V> PathsMap<V> {
             (false, value)
         }
     }
-    pub fn remove_first(&mut self) -> Option<(Vec<u32>, V)> {
+    pub fn remove_first(&mut self) -> Option<(Vec<u16>, V)> {
         if self.is_empty() {
             return None;
         }
@@ -81,7 +76,7 @@ impl<V> PathsMap<V> {
         let (_, value) = self.remove_first_rec(&mut result);
         Some((result, value))
     }
-    pub fn iterate_rec(&self, cur: &mut Vec<u32>, visit: &mut impl FnMut(&Vec<u32>, &V)) {
+    pub fn iterate_rec(&self, cur: &mut Vec<u16>, visit: &mut impl FnMut(&Vec<u16>, &V)) {
         if let Some(value) = &self.value {
             visit(cur, value);
         }
@@ -91,7 +86,7 @@ impl<V> PathsMap<V> {
             cur.pop();
         }
     }
-    pub fn iterate(&self, mut visit: impl FnMut(&Vec<u32>, &V)) {
+    pub fn iterate(&self, mut visit: impl FnMut(&Vec<u16>, &V)) {
         self.iterate_rec(&mut vec![], &mut visit)
     }
 
@@ -100,7 +95,7 @@ impl<V> PathsMap<V> {
             self.remove_prefix(prefix);
         });
     }
-    pub fn remove_edges(&mut self, forbidden_edges: &RoaringBitmap) {
+    pub fn remove_edges(&mut self, forbidden_edges: &SmallBitmap) {
         let mut i = 0;
         while i < self.nodes.len() {
             let should_remove = if forbidden_edges.contains(self.nodes[i].0) {
@@ -118,7 +113,7 @@ impl<V> PathsMap<V> {
             }
         }
     }
-    pub fn remove_edge(&mut self, forbidden_edge: &u32) {
+    pub fn remove_edge(&mut self, forbidden_edge: &u16) {
         let mut i = 0;
         while i < self.nodes.len() {
             let should_remove = if &self.nodes[i].0 == forbidden_edge {
@@ -136,7 +131,7 @@ impl<V> PathsMap<V> {
             }
         }
     }
-    pub fn remove_prefix(&mut self, forbidden_prefix: &[u32]) {
+    pub fn remove_prefix(&mut self, forbidden_prefix: &[u16]) {
         let [first_edge, remaining_prefix @ ..] = forbidden_prefix else {
             self.nodes.clear();
             self.value = None;
@@ -160,25 +155,23 @@ impl<V> PathsMap<V> {
         }
     }
 
-    pub fn final_edges_ater_prefix(&self, prefix: &[u32]) -> Vec<u32> {
+    pub fn final_edges_after_prefix(&self, prefix: &[u16], visit: &mut impl FnMut(u16)) {
         let [first_edge, remaining_prefix @ ..] = prefix else {
-            return self.nodes.iter().filter_map(|n| {
-                if n.1.value.is_some() {
-                    Some(n.0)
-                } else {
-                    None
+            for node in self.nodes.iter() {
+                if node.1.value.is_some() {
+                    visit(node.0)
                 }
-            }).collect();
+            }
+            return
         };
         for (edge, rest) in self.nodes.iter() {
             if edge == first_edge {
-                return rest.final_edges_ater_prefix(remaining_prefix);
+                return rest.final_edges_after_prefix(remaining_prefix, visit);
             }
         }
-        vec![]
     }
 
-    pub fn edge_indices_after_prefix(&self, prefix: &[u32]) -> Vec<u32> {
+    pub fn edge_indices_after_prefix(&self, prefix: &[u16]) -> Vec<u16> {
         let [first_edge, remaining_prefix @ ..]
= prefix else { return self.nodes.iter().map(|n| n.0).collect(); }; @@ -190,7 +183,7 @@ impl PathsMap { vec![] } - pub fn contains_prefix_of_path(&self, path: &[u32]) -> bool { + pub fn contains_prefix_of_path(&self, path: &[u16]) -> bool { if self.value.is_some() { return true; } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 4603c7ea0..e0bc1f5e4 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -111,6 +111,8 @@ pub fn visit_to_node<'search, 'from_data>( for word1 in derivations1.clone() { for proximity in 1..=(8 - ngram_len2) { let cost = (proximity + ngram_len2 - 1) as u8; + // TODO: if we had access to the universe here, we could already check whether + // the bitmap corresponding to this word pair is disjoint with the universe or not if ctx .get_word_prefix_pair_proximity_docids( word1, @@ -183,8 +185,13 @@ pub fn visit_to_node<'search, 'from_data>( .flat_map(|(cost, proximity_word_pairs)| { let mut edges = vec![]; for (proximity, word_pairs) in proximity_word_pairs { - edges - .push((cost, EdgeDetails::Data(ProximityEdge { pairs: word_pairs, proximity }))) + edges.push(( + cost, + EdgeDetails::Data(ProximityEdge { + pairs: word_pairs.into_boxed_slice(), + proximity, + }), + )) } edges }) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index df289fb2c..94a46d670 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,14 +1,15 @@ use super::{ProximityEdge, WordPair}; use crate::new::SearchContext; use crate::{CboRoaringBitmapCodec, Result}; -use roaring::{MultiOps, RoaringBitmap}; +use roaring::RoaringBitmap; pub fn compute_docids<'search>( ctx: &mut SearchContext<'search>, edge: &ProximityEdge, + universe: &RoaringBitmap, ) -> Result { let ProximityEdge { pairs, proximity } = edge; - let mut pair_docids = vec![]; + let mut pair_docids = RoaringBitmap::new(); for pair in pairs.iter() { let bytes = match pair { WordPair::Words { left, right } => { @@ -21,10 +22,11 @@ pub fn compute_docids<'search>( ctx.get_prefix_word_pair_proximity_docids(*left_prefix, *right, *proximity) } }?; - let bitmap = - bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); - pair_docids.push(bitmap); + // TODO: deserialize bitmap within a universe, and (maybe) using a bump allocator? + let bitmap = universe + & bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); + pair_docids |= bitmap; } - let docids = MultiOps::union(pair_docids); - Ok(docids) + + Ok(pair_docids) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index ec1a7b5fa..6c95b0805 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -10,7 +10,7 @@ use crate::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; use roaring::RoaringBitmap; -// TODO: intern the strings, refer to them by their pointer? +// TODO: intern the proximity edges as well? #[derive(Clone)] pub enum WordPair { @@ -21,8 +21,7 @@ pub enum WordPair { #[derive(Clone)] pub struct ProximityEdge { - // TODO: use a list of pointers to the word pairs instead? 
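// (Sketch of the motivation for the change below: a boxed slice stores only
// (pointer, length), while a Vec also carries a capacity word, so on 64-bit
// targets each cached edge shrinks by one usize:
//
//     assert_eq!(std::mem::size_of::<Vec<WordPair>>(), 24);
//     assert_eq!(std::mem::size_of::<Box<[WordPair]>>(), 16);
// )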
- pairs: Vec, + pairs: Box<[WordPair]>, proximity: u8, } @@ -40,8 +39,9 @@ impl RankingRuleGraphTrait for ProximityGraph { fn compute_docids<'search>( ctx: &mut SearchContext<'search>, edge: &Self::EdgeDetails, + universe: &RoaringBitmap, ) -> Result { - compute_docids::compute_docids(ctx, edge) + compute_docids::compute_docids(ctx, edge, universe) } fn build_visit_from_node<'search>( @@ -61,11 +61,11 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &super::RankingRuleGraph, - paths: &[Vec], + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], - cost: u64, + distances: &[Vec], + cost: u16, logger: &mut dyn SearchLogger, ) { logger.log_proximity_state( diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs deleted file mode 100644 index b3e03d555..000000000 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ /dev/null @@ -1,97 +0,0 @@ -#![allow(clippy::too_many_arguments)] - -use super::edge_docids_cache::EdgeDocidsCache; -use super::empty_paths_cache::EmptyPathsCache; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::{BitmapOrAllRef, SearchContext}; -use crate::Result; -use roaring::{MultiOps, RoaringBitmap}; - -impl RankingRuleGraph { - // TODO: reduce the universe after computing each path - // TODO: deserialize roaring bitmap within a universe - pub fn resolve_paths<'search>( - &mut self, - ctx: &mut SearchContext<'search>, - edge_docids_cache: &mut EdgeDocidsCache, - empty_paths_cache: &mut EmptyPathsCache, - universe: &RoaringBitmap, - mut paths: Vec>, - ) -> Result { - paths.sort_unstable(); - // let mut needs_filtering_empty_edges = false; - // let mut needs_filtering_empty_prefix = false; - // let mut needs_filtering_empty_couple_edges = false; - let mut needs_filtering = false; - let mut path_bitmaps = vec![]; - 'path_loop: loop { - // TODO: distinguish between empty_edges, empty_prefix, and empty_couple_edges filtering - if needs_filtering { - for path in paths.iter_mut() { - if empty_paths_cache.path_is_empty(path) { - path.clear(); - } - } - needs_filtering = false; - } - let Some(edge_indexes) = paths.pop() else { - break; - }; - - if edge_indexes.is_empty() { - continue; - } - - let mut path_bitmap = universe.clone(); - let mut visited_edges = vec![]; - let mut cached_edge_docids = vec![]; - 'edge_loop: for edge_index in edge_indexes { - visited_edges.push(edge_index); - let edge_docids = - edge_docids_cache.get_edge_docids(ctx, edge_index, self, universe)?; - match edge_docids { - BitmapOrAllRef::Bitmap(edge_docids) => { - cached_edge_docids.push((edge_index, edge_docids.clone())); - let (_, edge_docids) = cached_edge_docids.last().unwrap(); - if edge_docids.is_disjoint(universe) { - // 1. Store in the cache that this edge is empty for this universe - empty_paths_cache.forbid_edge(edge_index); - // 2. remove this edge from the proximity graph - self.remove_edge(edge_index); - edge_docids_cache.cache.remove(&edge_index); - needs_filtering = true; - // needs_filtering_empty_edges = true; - // 3. 
continue executing this function again on the remaining paths - continue 'path_loop; - } else { - path_bitmap &= edge_docids; - if path_bitmap.is_disjoint(universe) { - // needs_filtering_empty_prefix = true; - needs_filtering = true; - empty_paths_cache.forbid_prefix(&visited_edges); - // if the intersection between this edge and any - // previous one is disjoint with the universe, - // then we add these two edges to the empty_path_cache - for (edge_index2, edge_docids2) in - cached_edge_docids[..cached_edge_docids.len() - 1].iter() - { - let intersection = edge_docids & edge_docids2; - if intersection.is_disjoint(universe) { - // needs_filtering_empty_couple_edges = true; - empty_paths_cache - .forbid_couple_edges(*edge_index2, edge_index); - } - } - continue 'path_loop; - } - } - } - BitmapOrAllRef::All => continue 'edge_loop, - } - } - path_bitmaps.push(path_bitmap); - } - - Ok(MultiOps::union(path_bitmaps)) - } -} diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index d3771221f..c510c4851 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -31,6 +31,7 @@ impl RankingRuleGraphTrait for TypoGraph { fn compute_docids<'db_cache, 'search>( ctx: &mut SearchContext<'search>, edge: &Self::EdgeDetails, + universe: &RoaringBitmap, ) -> Result { match edge { TypoEdge::Phrase { phrase } => resolve_phrase(ctx, *phrase), @@ -44,14 +45,17 @@ impl RankingRuleGraphTrait for TypoGraph { let mut docids = RoaringBitmap::new(); for word in words.iter().copied() { let Some(bytes) = ctx.get_word_docids(word)? else { continue }; - let bitmap = - RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; + // TODO: deserialize bitmap within a universe + let bitmap = universe + & RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; docids |= bitmap; } if *nbr_typos == 0 { if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? 
{ - let bitmap = - RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; + // TODO: deserialize bitmap within a universe + let bitmap = universe + & RoaringBitmapCodec::bytes_decode(bytes) + .ok_or(heed::Error::Decoding)?; docids |= bitmap; } } @@ -116,11 +120,11 @@ impl RankingRuleGraphTrait for TypoGraph { fn log_state( graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], - cost: u64, + distances: &[Vec], + cost: u16, logger: &mut dyn SearchLogger, ) { logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances.to_vec(), cost); diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index bfb9b5492..b65ff6d1a 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -262,46 +262,48 @@ mod tests { println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - // loop { - let start = Instant::now(); + loop { + let start = Instant::now(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let results = execute_search( + &mut ctx, + "which a the releases from poison by the government", + None, + 0, + 20, + &mut DefaultSearchLogger, + // &mut logger, + ) + .unwrap(); - let results = execute_search( - &mut SearchContext::new(&index, &txn), - "releases from poison by the government", - None, - 0, - 20, - &mut DefaultSearchLogger, - // &mut logger, - ) - .unwrap(); + // logger.write_d2_description(&mut ctx); - // logger.write_d2_description(); + let elapsed = start.elapsed(); + println!("{}us", elapsed.as_micros()); - let elapsed = start.elapsed(); + let _documents = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) + }) + .collect::>(); - let documents = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); - - println!("{}us: {:?}", elapsed.as_micros(), results); - for (id, document) in documents { - println!("{id}:"); - println!("{document}"); + println!("{}us: {:?}", elapsed.as_micros(), results); } + // for (id, _document) in documents { + // println!("{id}:"); + // // println!("{document}"); + // } } #[test] @@ -342,9 +344,9 @@ mod tests { .collect::>(); println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); - for (id, document) in documents { + for (id, _document) in documents { println!("{id}:"); - println!("{document}"); + // println!("{document}"); } } #[test] @@ -360,7 +362,7 @@ mod tests { // loop { let start = Instant::now(); - let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + // let mut logger = 
crate::new::logger::detailed::DetailedSearchLogger::new("log"); let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( &mut ctx, @@ -368,12 +370,12 @@ mod tests { None, 0, 20, - // &mut DefaultSearchLogger, - &mut logger, + &mut DefaultSearchLogger, + // &mut logger, ) .unwrap(); - logger.write_d2_description(&mut ctx); + // logger.write_d2_description(&mut ctx); let elapsed = start.elapsed(); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index de5cf02ab..4fa0912e1 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -1,5 +1,6 @@ use super::interner::Interned; use super::query_term::{Phrase, QueryTerm, WordDerivations}; +use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; use fxhash::FxHashMap; @@ -10,13 +11,13 @@ use std::collections::VecDeque; // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. #[derive(Default)] pub struct NodeDocIdsCache { - pub cache: FxHashMap, + pub cache: FxHashMap, } impl<'search> SearchContext<'search> { fn get_node_docids<'cache>( &'cache mut self, term: &QueryTerm, - node_idx: u32, + node_idx: u16, ) -> Result<&'cache RoaringBitmap> { if self.node_docids_cache.cache.contains_key(&node_idx) { return Ok(&self.node_docids_cache.cache[&node_idx]); @@ -76,7 +77,7 @@ pub fn resolve_query_graph<'search>( // TODO: there is definitely a faster way to compute this big // roaring bitmap expression - let mut nodes_resolved = RoaringBitmap::new(); + let mut nodes_resolved = SmallBitmap::new(64); let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()]; let mut next_nodes_to_visit = VecDeque::new(); @@ -89,8 +90,10 @@ pub fn resolve_query_graph<'search>( continue; } // Take union of all predecessors - let predecessors_iter = predecessors.iter().map(|p| &path_nodes_docids[p as usize]); - let predecessors_docids = MultiOps::union(predecessors_iter); + let mut predecessors_docids = RoaringBitmap::new(); + for p in predecessors.iter() { + predecessors_docids |= &path_nodes_docids[p as usize]; + } let n = &q.nodes[node as usize]; diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs new file mode 100644 index 000000000..f7adecee0 --- /dev/null +++ b/milli/src/search/new/small_bitmap.rs @@ -0,0 +1,271 @@ +// #[macro_export] +// macro_rules! 
iter_bitmap { +// ($bitmap:expr, $id:lifetime, $p:pat, $body:block) => { +// match $bitmap { +// SmallBitmap::Tiny(mut set) => { +// while set > 0 { +// let $p = set.trailing_zeros() as u16; +// $body; +// set &= set - 1; +// } +// } +// SmallBitmap::Small(sets) => { +// let mut base = 0; +// for set in sets.iter() { +// let mut set = *set; +// while set > 0 { +// let idx = set.trailing_zeros() as u16; +// let $p = idx + base; +// set &= set - 1; +// $body; +// } +// base += 64; +// } +// } +// } +// }; +// } + +#[derive(Clone)] +pub enum SmallBitmap { + Tiny(u64), + Small(Box<[u64]>), +} +impl SmallBitmap { + pub fn new(universe_length: u16) -> Self { + if universe_length <= 64 { + Self::Tiny(0) + } else { + Self::Small(vec![0; 1 + universe_length as usize / 64].into_boxed_slice()) + } + } + pub fn from_iter(xs: impl Iterator, universe_length: u16) -> Self { + let mut s = Self::new(universe_length); + for x in xs { + s.insert(x); + } + s + } + pub fn from_array(xs: &[u16], universe_length: u16) -> Self { + let mut s = Self::new(universe_length); + for x in xs { + s.insert(*x); + } + s + } + pub fn is_empty(&self) -> bool { + match self { + SmallBitmap::Tiny(set) => *set == 0, + SmallBitmap::Small(sets) => { + for set in sets.iter() { + if *set != 0 { + return false; + } + } + true + } + } + } + pub fn clear(&mut self) { + match self { + SmallBitmap::Tiny(set) => *set = 0, + SmallBitmap::Small(sets) => { + for set in sets.iter_mut() { + *set = 0; + } + } + } + } + pub fn contains(&self, mut x: u16) -> bool { + let set = match self { + SmallBitmap::Tiny(set) => *set, + SmallBitmap::Small(set) => { + let idx = x / 64; + x %= 64; + set[idx as usize] + } + }; + set & 0b1 << x != 0 + } + pub fn insert(&mut self, mut x: u16) { + let set = match self { + SmallBitmap::Tiny(set) => set, + SmallBitmap::Small(set) => { + let idx = x / 64; + x %= 64; + &mut set[idx as usize] + } + }; + *set |= 0b1 << x; + } + pub fn remove(&mut self, mut x: u16) { + let set = match self { + SmallBitmap::Tiny(set) => set, + SmallBitmap::Small(set) => { + let idx = x / 64; + x %= 64; + &mut set[idx as usize] + } + }; + *set &= !(0b1 << x); + } + // fn iter_single(mut set: u64, mut visit: impl FnMut(u16) -> Result<()>) -> Result<()> { + // while set > 0 { + // let idx = set.trailing_zeros() as u16; + // visit(idx)?; + // set &= set - 1; + // } + // Ok(()) + // } + // pub fn iter(&self, mut visit: impl FnMut(u16) -> Result<()>) -> Result<()> { + // match self { + // SmallBitmap::Tiny(set) => Self::iter_single(*set, &mut visit), + // SmallBitmap::Small(sets) => { + // let mut base = 0; + // for set in sets.iter() { + // Self::iter_single(*set, |x| visit(base + x))?; + // base += 64; + // } + // Ok(()) + // } + // } + // } + + pub fn intersection(&mut self, other: &SmallBitmap) { + self.apply_op(other, |a, b| *a &= b); + } + pub fn union(&mut self, other: &SmallBitmap) { + self.apply_op(other, |a, b| *a |= b); + } + pub fn subtract(&mut self, other: &SmallBitmap) { + self.apply_op(other, |a, b| *a &= !b); + } + + pub fn apply_op(&mut self, other: &SmallBitmap, op: impl Fn(&mut u64, u64)) { + match (self, other) { + (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(a, *b), + (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + assert!(a.len() == b.len(),); + for (a, b) in a.iter_mut().zip(b.iter()) { + op(a, *b); + } + } + _ => { + panic!(); + } + } + } + pub fn all_satisfy_op(&self, other: &SmallBitmap, op: impl Fn(u64, u64) -> bool) -> bool { + match (self, other) { + (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => 
op(*a, *b), + (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + assert!(a.len() == b.len()); + for (a, b) in a.iter().zip(b.iter()) { + if !op(*a, *b) { + return false; + } + } + true + } + _ => { + panic!(); + } + } + } + pub fn any_satisfy_op(&self, other: &SmallBitmap, op: impl Fn(u64, u64) -> bool) -> bool { + match (self, other) { + (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(*a, *b), + (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + assert!(a.len() == b.len()); + for (a, b) in a.iter().zip(b.iter()) { + if op(*a, *b) { + return true; + } + } + false + } + _ => { + panic!(); + } + } + } + pub fn is_subset(&self, other: &SmallBitmap) -> bool { + self.all_satisfy_op(other, |a, b| a & !b == 0) + } + pub fn intersects(&self, other: &SmallBitmap) -> bool { + self.any_satisfy_op(other, |a, b| a & b != 0) + } + pub fn iter(&self) -> SmallBitmapIter<'_> { + match self { + SmallBitmap::Tiny(x) => SmallBitmapIter::Tiny(*x), + SmallBitmap::Small(xs) => { + SmallBitmapIter::Small { cur: xs[0], next: &xs[1..], base: 0 } + } + } + } +} + +pub enum SmallBitmapIter<'b> { + Tiny(u64), + Small { cur: u64, next: &'b [u64], base: u16 }, +} +impl<'b> Iterator for SmallBitmapIter<'b> { + type Item = u16; + + fn next(&mut self) -> Option { + match self { + SmallBitmapIter::Tiny(set) => { + if *set > 0 { + let idx = set.trailing_zeros() as u16; + *set &= *set - 1; + Some(idx) + } else { + None + } + } + SmallBitmapIter::Small { cur, next, base } => { + if *cur > 0 { + let idx = cur.trailing_zeros() as u16; + *cur &= *cur - 1; + Some(idx + *base) + } else if next.is_empty() { + return None; + } else { + *base += 64; + *cur = next[0]; + *next = &next[1..]; + self.next() + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::SmallBitmap; + + #[test] + fn test_small_bitmap() { + let mut bitmap1 = SmallBitmap::new(32); + for x in 0..16 { + bitmap1.insert(x * 2); + } + let mut bitmap2 = SmallBitmap::new(32); + for x in 0..=10 { + bitmap2.insert(x * 3); + } + bitmap1.intersection(&bitmap2); + // println!("{}", bitmap.contains(12)); + // bitmap1 + // .iter(|x| { + // println!("{x}"); + // Ok(()) + // }) + // .unwrap(); + + // iter_bitmap!(bitmap1, 'loop1, x, { + // println!("{x}"); + // }) + } +} From 10626dddfc9d6bff22c638b94c2b0aee160b4ca2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 8 Mar 2023 09:53:05 +0100 Subject: [PATCH 043/234] Add a few more optimisations to new search algorithms --- milli/src/search/new/db_cache.rs | 2 +- .../search/new/graph_based_ranking_rule.rs | 18 +++-- milli/src/search/new/logger/detailed.rs | 21 ++--- milli/src/search/new/logger/mod.rs | 9 ++- .../new/ranking_rule_graph/cheapest_paths.rs | 51 +++++++----- .../ranking_rule_graph/edge_docids_cache.rs | 3 + .../src/search/new/ranking_rule_graph/mod.rs | 2 +- .../new/ranking_rule_graph/proximity/mod.rs | 3 +- .../search/new/ranking_rule_graph/typo/mod.rs | 3 +- milli/src/search/new/ranking_rules.rs | 78 +++++++++---------- 10 files changed, 104 insertions(+), 86 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index cfd69b04f..8ebe14047 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -6,7 +6,7 @@ use std::collections::hash_map::Entry; #[derive(Default)] pub struct DatabaseCache<'search> { - // TODO: interner for all database cache keys + // TODO: interner for all database cache keys? 
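// (Sketch of what the TODO above would buy: an `Interned<T>` is a small
// integer handle into a side table, so interned keys hash a u32 instead of
// re-hashing whole strings on every lookup. Hypothetical shape, mirroring
// the series' `Interner` but not its exact definition:
//
//     pub struct Interned<T> {
//         idx: u32,
//         _phantom: std::marker::PhantomData<T>,
//     }
// )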
pub word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, pub word_prefix_pair_proximity_docids: diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index ac56b4f20..6c2e714ad 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -2,6 +2,7 @@ use super::logger::SearchLogger; use super::ranking_rule_graph::EdgeDocidsCache; use super::ranking_rule_graph::EmptyPathsCache; use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::small_bitmap::SmallBitmap; use super::SearchContext; use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput}; use crate::Result; @@ -21,7 +22,7 @@ pub struct GraphBasedRankingRuleState { graph: RankingRuleGraph, edge_docids_cache: EdgeDocidsCache, empty_paths_cache: EmptyPathsCache, - all_distances: Vec>, + all_distances: Vec>, cur_distance_idx: usize, } @@ -65,7 +66,6 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { - // TODO: update old state instead of starting from scratch let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; let mut edge_docids_cache = EdgeDocidsCache::default(); let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16); @@ -77,7 +77,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> universe, &mut empty_paths_cache, )?; - let all_distances = graph.initialize_distances_cheapest(); + let all_distances = graph.initialize_distances_with_necessary_edges(); let state = GraphBasedRankingRuleState { graph, @@ -100,6 +100,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> ) -> Result>> { assert!(universe.len() > 1); let mut state = self.state.take().unwrap(); + remove_empty_edges( ctx, &mut state.graph, @@ -114,7 +115,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> self.state = None; return Ok(None); } - let cost = + let (cost, _) = state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx]; state.cur_distance_idx += 1; @@ -132,12 +133,15 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> let original_universe = universe; let mut universe = universe.clone(); + // TODO: remove this unnecessary clone + let original_graph = graph.clone(); graph.visit_paths_of_cost( graph.query_graph.root_node as usize, cost, all_distances, empty_paths_cache, |path, graph, empty_paths_cache| { + paths.push(path.to_vec()); let mut path_docids = universe.clone(); let mut visited_edges = vec![]; let mut cached_edge_docids = vec![]; @@ -161,7 +165,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> path_docids &= edge_docids; if path_docids.is_disjoint(&universe) { - empty_paths_cache.forbid_prefix(&visited_edges); + // empty_paths_cache.forbid_prefix(&visited_edges); // if the intersection between this edge and any // previous one is disjoint with the universe, // then we add these two edges to the empty_path_cache @@ -170,14 +174,12 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> { let intersection = edge_docids & edge_docids2; if intersection.is_disjoint(&universe) { - // needs_filtering_empty_couple_edges = true; empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index); } } return Ok(()); } } - paths.push(path.to_vec()); bucket |= &path_docids; 
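+                    // Each path's docids are subtracted from the working
+                    // universe before the next path is visited, so paths
+                    // visited later in this bucket only contribute documents
+                    // that no earlier path already matched, and their edge
+                    // lookups run against the shrunken universe.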
universe -= path_docids; Ok(()) @@ -185,7 +187,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> )?; G::log_state( - &state.graph, + &original_graph, &paths, &state.empty_paths_cache, original_universe, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 10b5e7097..47b3e2ea2 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -6,6 +6,7 @@ use std::time::Instant; use std::{io::Write, path::PathBuf}; use crate::new::ranking_rule_graph::TypoGraph; +use crate::new::small_bitmap::SmallBitmap; use crate::new::{QueryNode, QueryGraph, SearchContext}; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::new::ranking_rule_graph::EmptyPathsCache; @@ -45,7 +46,7 @@ pub enum SearchEvents { paths: Vec>, empty_paths_cache: EmptyPathsCache, universe: RoaringBitmap, - distances: Vec>, + distances: Vec>, cost: u16, }, TypoState { @@ -53,7 +54,7 @@ pub enum SearchEvents { paths: Vec>, empty_paths_cache: EmptyPathsCache, universe: RoaringBitmap, - distances: Vec>, + distances: Vec>, cost: u16, }, RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant }, @@ -165,11 +166,11 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); } - fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { + fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } - fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { + fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } @@ -352,7 +353,7 @@ results.{random} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u16], file: &mut File) { + fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, distances: &[(u16, SmallBitmap)], file: &mut File) { match &node { QueryNode::Term(LocatedQueryTerm { value, .. 
}) => { match value { @@ -390,9 +391,9 @@ shape: class").unwrap(); if *use_prefix_db { writeln!(file, "use prefix DB : true").unwrap(); } - // for (i, d) in distances.iter().enumerate() { - // writeln!(file, "\"distances\" : {d}").unwrap(); - // } + for (d, edges) in distances.iter() { + writeln!(file, "\"distance {d}\" : {:?}", edges.iter().collect::>() ).unwrap(); + } writeln!(file, "}}").unwrap(); }, @@ -420,7 +421,7 @@ shape: class").unwrap(); } } } - fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { + fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -477,7 +478,7 @@ shape: class").unwrap(); // } // writeln!(file, "}}").unwrap(); } - fn edge_d2_description(ctx: &mut SearchContext,graph: &RankingRuleGraph, edge_idx: u16, file: &mut File) { + fn edge_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, edge_idx: u16, file: &mut File) { let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index bf78e4de0..8a10fd064 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -5,6 +5,7 @@ use roaring::RoaringBitmap; use super::{ ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph}, + small_bitmap::SmallBitmap, RankingRule, RankingRuleQueryTrait, }; @@ -61,7 +62,7 @@ impl SearchLogger for DefaultSearchLogger { _paths_map: &[Vec], _empty_paths_cache: &EmptyPathsCache, _universe: &RoaringBitmap, - _distances: Vec>, + _distances: Vec>, _cost: u16, ) { } @@ -72,7 +73,7 @@ impl SearchLogger for DefaultSearchLogger { _paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, _universe: &RoaringBitmap, - _distances: Vec>, + _distances: Vec>, _cost: u16, ) { } @@ -123,7 +124,7 @@ pub trait SearchLogger { paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: Vec>, + distances: Vec>, cost: u16, ); @@ -133,7 +134,7 @@ pub trait SearchLogger { paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: Vec>, + distances: Vec>, cost: u16, ); } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 14afd83d0..1adade945 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -4,7 +4,8 @@ use super::empty_paths_cache::EmptyPathsCache; use super::{RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::small_bitmap::SmallBitmap; use crate::Result; -use std::collections::VecDeque; +use std::collections::btree_map::Entry; +use std::collections::{BTreeMap, VecDeque}; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Path { @@ -17,7 +18,7 @@ impl RankingRuleGraph { &mut self, from: usize, cost: u16, - all_distances: &[Vec], + all_distances: &[Vec<(u16, SmallBitmap)>], empty_paths_cache: &mut EmptyPathsCache, mut visit: impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, ) -> Result<()> { @@ -37,13 +38,9 @@ impl 
RankingRuleGraph { &mut self, from: usize, cost: u16, - // TODO: replace all_distances with a Vec where the SmallBitmap contains true if the cost exists and false otherwise - all_distances: &[Vec], + all_distances: &[Vec<(u16, SmallBitmap)>], empty_paths_cache: &mut EmptyPathsCache, visit: &mut impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, - // replace prev edges by: - // (1) a small bitmap representing the path - // (2) a pointer within the EmptyPathsCache::forbidden_prefixes structure prev_edges: &mut Vec, cur_path: &mut SmallBitmap, mut forbidden_edges: SmallBitmap, @@ -55,7 +52,12 @@ impl RankingRuleGraph { let Some(edge) = self.all_edges[edge_idx as usize].as_ref() else { continue }; if cost < edge.cost as u16 || forbidden_edges.contains(edge_idx) - || !all_distances[edge.to_node as usize].contains(&(cost - edge.cost as u16)) + || !all_distances[edge.to_node as usize].iter().any( + |(next_cost, necessary_edges)| { + (*next_cost == cost - edge.cost as u16) + && !forbidden_edges.intersects(necessary_edges) + }, + ) { continue; } @@ -99,21 +101,20 @@ impl RankingRuleGraph { forbidden_edges.insert(x); }); } - if next_any_valid && empty_paths_cache.path_is_empty(prev_edges, cur_path) { - return Ok(any_valid); - } } Ok(any_valid) } - pub fn initialize_distances_cheapest(&self) -> Vec> { - let mut distances_to_end: Vec> = vec![vec![]; self.query_graph.nodes.len()]; + pub fn initialize_distances_with_necessary_edges(&self) -> Vec> { + let mut distances_to_end: Vec> = + vec![vec![]; self.query_graph.nodes.len()]; let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len() as u16); let mut node_stack = VecDeque::new(); - distances_to_end[self.query_graph.end_node as usize] = vec![0]; + distances_to_end[self.query_graph.end_node as usize] = + vec![(0, SmallBitmap::new(self.all_edges.len() as u16))]; for prev_node in self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter() @@ -123,21 +124,29 @@ impl RankingRuleGraph { } while let Some(cur_node) = node_stack.pop_front() { - let mut self_distances = vec![]; + let mut self_distances = BTreeMap::::new(); let cur_node_edges = &self.node_edges[cur_node]; for edge_idx in cur_node_edges.iter() { let edge = self.all_edges[edge_idx as usize].as_ref().unwrap(); let succ_node = edge.to_node; let succ_distances = &distances_to_end[succ_node as usize]; - for succ_distance in succ_distances { - self_distances.push(edge.cost as u16 + succ_distance); + for (succ_distance, succ_necessary_edges) in succ_distances { + let potential_necessary_edges = SmallBitmap::from_iter( + std::iter::once(edge_idx).chain(succ_necessary_edges.iter()), + self.all_edges.len() as u16, + ); + match self_distances.entry(edge.cost as u16 + succ_distance) { + Entry::Occupied(mut prev_necessary_edges) => { + prev_necessary_edges.get_mut().intersection(&potential_necessary_edges); + } + Entry::Vacant(entry) => { + entry.insert(potential_necessary_edges); + } + } } } - - self_distances.sort_unstable(); - self_distances.dedup(); - distances_to_end[cur_node] = self_distances; + distances_to_end[cur_node] = self_distances.into_iter().collect(); for prev_node in self.query_graph.edges[cur_node].predecessors.iter() { if !enqueued.contains(prev_node) { node_stack.push_back(prev_node as usize); diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 13ee03a22..9823c4fcc 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ 
b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -49,6 +49,9 @@ impl EdgeDocidsCache { if self.cache.contains_key(&edge_index) { // TODO: should we update the bitmap in the cache if the new universe // reduces it? + // TODO: maybe have a generation: u32 to track every time the universe was + // reduced. Then only attempt to recompute the intersection when there is a chance + // that edge_docids & universe changed return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 989986159..6d7445eac 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -119,7 +119,7 @@ pub trait RankingRuleGraphTrait: Sized { paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], + distances: &[Vec<(u16, SmallBitmap)>], cost: u16, logger: &mut dyn SearchLogger, ); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 6c95b0805..7cc4f995f 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -6,6 +6,7 @@ use super::{EdgeDetails, RankingRuleGraphTrait}; use crate::new::interner::Interned; use crate::new::logger::SearchLogger; use crate::new::query_term::WordDerivations; +use crate::new::small_bitmap::SmallBitmap; use crate::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; use roaring::RoaringBitmap; @@ -64,7 +65,7 @@ impl RankingRuleGraphTrait for ProximityGraph { paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], + distances: &[Vec<(u16, SmallBitmap)>], cost: u16, logger: &mut dyn SearchLogger, ) { diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index c510c4851..d3aec7174 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -4,6 +4,7 @@ use crate::new::interner::Interned; use crate::new::logger::SearchLogger; use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; use crate::new::resolve_query_graph::resolve_phrase; +use crate::new::small_bitmap::SmallBitmap; use crate::new::{QueryGraph, QueryNode, SearchContext}; use crate::{Result, RoaringBitmapCodec}; use heed::BytesDecode; @@ -123,7 +124,7 @@ impl RankingRuleGraphTrait for TypoGraph { paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], + distances: &[Vec<(u16, SmallBitmap)>], cost: u16, logger: &mut dyn SearchLogger, ) { diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index b65ff6d1a..82216c9cf 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -262,44 +262,44 @@ mod tests { println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - loop { - let start = Instant::now(); + // loop { + let start = Instant::now(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); - let mut ctx = SearchContext::new(&index, &txn); - let results = execute_search( - &mut ctx, - "which a the releases from poison by the government", - None, - 0, - 20, - &mut DefaultSearchLogger, - // &mut logger, - ) - .unwrap(); + // let mut logger = 
crate::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let results = execute_search( + &mut ctx, + "which a the releases from poison by the government", + None, + 0, + 20, + &mut DefaultSearchLogger, + // &mut logger, + ) + .unwrap(); - // logger.write_d2_description(&mut ctx); + // logger.write_d2_description(&mut ctx); - let elapsed = start.elapsed(); - println!("{}us", elapsed.as_micros()); + let elapsed = start.elapsed(); + println!("{}us", elapsed.as_micros()); - let _documents = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); + let _documents = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) + }) + .collect::>(); - println!("{}us: {:?}", elapsed.as_micros(), results); - } + println!("{}us: {:?}", elapsed.as_micros(), results); + // } // for (id, _document) in documents { // println!("{id}:"); // // println!("{document}"); @@ -321,7 +321,7 @@ mod tests { let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("releases from poison by the government"); + s.query("which a the releases from poison by the government"); s.terms_matching_strategy(TermsMatchingStrategy::Last); s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); @@ -362,7 +362,7 @@ mod tests { // loop { let start = Instant::now(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( &mut ctx, @@ -370,12 +370,12 @@ mod tests { None, 0, 20, - &mut DefaultSearchLogger, - // &mut logger, + // &mut DefaultSearchLogger, + &mut logger, ) .unwrap(); - // logger.write_d2_description(&mut ctx); + logger.write_d2_description(&mut ctx); let elapsed = start.elapsed(); @@ -414,7 +414,7 @@ mod tests { let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("releases from poison by the government"); + s.query("which a the releases from poison by the government"); s.terms_matching_strategy(TermsMatchingStrategy::Last); s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); From 57fa689131db00f976afd8c984a2eaf6a914b8fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 8 Mar 2023 09:55:53 +0100 Subject: [PATCH 044/234] Cargo fmt --- milli/src/lib.rs | 2 - milli/src/search/new/db_cache.rs | 9 +- .../search/new/graph_based_ranking_rule.rs | 16 +- milli/src/search/new/interner.rs | 3 +- milli/src/search/new/logger/detailed.rs | 358 ++++++++++++------ milli/src/search/new/logger/mod.rs | 8 +- milli/src/search/new/mod.rs 
| 16 +- milli/src/search/new/query_term.rs | 5 +- .../search/new/ranking_rule_graph/build.rs | 4 +- .../new/ranking_rule_graph/cheapest_paths.rs | 7 +- .../ranking_rule_graph/edge_docids_cache.rs | 7 +- .../ranking_rule_graph/empty_paths_cache.rs | 3 +- .../src/search/new/ranking_rule_graph/mod.rs | 9 +- .../new/ranking_rule_graph/paths_map.rs | 7 +- .../new/ranking_rule_graph/proximity/build.rs | 16 +- .../proximity/compute_docids.rs | 7 +- .../new/ranking_rule_graph/proximity/mod.rs | 13 +- .../search/new/ranking_rule_graph/typo/mod.rs | 19 +- milli/src/search/new/ranking_rules.rs | 33 +- milli/src/search/new/resolve_query_graph.rs | 10 +- milli/src/search/new/sort.rs | 3 +- milli/src/search/new/words.rs | 6 +- 22 files changed, 348 insertions(+), 213 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 594405891..af4324ae4 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -54,8 +54,6 @@ pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; #[macro_use] pub mod documents; -pub use search::new; - mod asc_desc; mod criterion; mod error; diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 8ebe14047..100dae90a 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -1,8 +1,11 @@ -use super::{interner::Interned, SearchContext}; -use crate::Result; +use std::collections::hash_map::Entry; + use fxhash::FxHashMap; use heed::types::ByteSlice; -use std::collections::hash_map::Entry; + +use super::interner::Interned; +use super::SearchContext; +use crate::Result; #[derive(Default)] pub struct DatabaseCache<'search> { diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 6c2e714ad..1d17c32a8 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -1,13 +1,13 @@ -use super::logger::SearchLogger; -use super::ranking_rule_graph::EdgeDocidsCache; -use super::ranking_rule_graph::EmptyPathsCache; -use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; -use super::small_bitmap::SmallBitmap; -use super::SearchContext; -use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput}; -use crate::Result; use roaring::RoaringBitmap; +use super::logger::SearchLogger; +use super::ranking_rule_graph::{ + EdgeDocidsCache, EmptyPathsCache, RankingRuleGraph, RankingRuleGraphTrait, +}; +use super::small_bitmap::SmallBitmap; +use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; +use crate::Result; + pub struct GraphBasedRankingRule { id: String, state: Option>, diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index ae0a4e9cb..8a8fad1e1 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -1,7 +1,8 @@ -use fxhash::FxHashMap; use std::hash::Hash; use std::marker::PhantomData; +use fxhash::FxHashMap; + pub struct Interned { idx: u32, _phantom: PhantomData, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 47b3e2ea2..c6570ef54 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -1,39 +1,37 @@ +use std::fs::File; +use std::io::Write; +use std::path::PathBuf; +use std::time::Instant; use rand::random; use roaring::RoaringBitmap; -use std::fs::File; -use std::time::Instant; -use std::{io::Write, path::PathBuf}; -use crate::new::ranking_rule_graph::TypoGraph; -use 
crate::new::small_bitmap::SmallBitmap; -use crate::new::{QueryNode, QueryGraph, SearchContext}; -use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; -use crate::new::ranking_rule_graph::EmptyPathsCache; -use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; -use crate::new::ranking_rule_graph::{ - ProximityGraph, RankingRuleGraph, +use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use crate::search::new::ranking_rule_graph::{ + Edge, EdgeDetails, EmptyPathsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, + TypoGraph, }; - -use super::{RankingRule, SearchLogger}; +use crate::search::new::small_bitmap::SmallBitmap; +use crate::search::new::{QueryGraph, QueryNode, SearchContext}; +use crate::search::new::{RankingRule, SearchLogger}; pub enum SearchEvents { RankingRuleStartIteration { ranking_rule_idx: usize, query: QueryGraph, universe: RoaringBitmap, - time: Instant + time: Instant, }, RankingRuleNextBucket { ranking_rule_idx: usize, universe: RoaringBitmap, candidates: RoaringBitmap, - time: Instant + time: Instant, }, RankingRuleEndIteration { ranking_rule_idx: usize, universe: RoaringBitmap, - time: Instant + time: Instant, }, ExtendResults { new: Vec, @@ -57,7 +55,11 @@ pub enum SearchEvents { distances: Vec>, cost: u16, }, - RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant }, + RankingRuleSkipBucket { + ranking_rule_idx: usize, + candidates: RoaringBitmap, + time: Instant, + }, } pub struct DetailedSearchLogger { @@ -106,7 +108,6 @@ impl SearchLogger for DetailedSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, query: &QueryGraph, universe: &RoaringBitmap, - ) { self.events.push(SearchEvents::RankingRuleStartIteration { ranking_rule_idx, @@ -122,7 +123,6 @@ impl SearchLogger for DetailedSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, candidates: &RoaringBitmap, - ) { self.events.push(SearchEvents::RankingRuleNextBucket { ranking_rule_idx, @@ -136,12 +136,11 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, candidates: &RoaringBitmap, - ) { self.events.push(SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates: candidates.clone(), - time: Instant::now() + time: Instant::now(), }) } @@ -150,12 +149,11 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, - ) { self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe: universe.clone(), - time: Instant::now() + time: Instant::now(), }) } fn add_to_results(&mut self, docids: &[u32]) { @@ -166,18 +164,47 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); } - fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { - self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) - } - - fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { - 
self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) + fn log_proximity_state( + &mut self, + query_graph: &RankingRuleGraph, + paths_map: &[Vec], + empty_paths_cache: &EmptyPathsCache, + universe: &RoaringBitmap, + distances: Vec>, + cost: u16, + ) { + self.events.push(SearchEvents::ProximityState { + graph: query_graph.clone(), + paths: paths_map.to_vec(), + empty_paths_cache: empty_paths_cache.clone(), + universe: universe.clone(), + distances, + cost, + }) } + fn log_typo_state( + &mut self, + query_graph: &RankingRuleGraph, + paths_map: &[Vec], + empty_paths_cache: &EmptyPathsCache, + universe: &RoaringBitmap, + distances: Vec>, + cost: u16, + ) { + self.events.push(SearchEvents::TypoState { + graph: query_graph.clone(), + paths: paths_map.to_vec(), + empty_paths_cache: empty_paths_cache.clone(), + universe: universe.clone(), + distances, + cost, + }) + } } impl DetailedSearchLogger { - pub fn write_d2_description(&self,ctx: &mut SearchContext,) { + pub fn write_d2_description(&self, ctx: &mut SearchContext) { let mut prev_time = self.initial_query_time.unwrap(); let mut timestamp = vec![]; fn activated_id(timestamp: &[usize]) -> String { @@ -229,21 +256,29 @@ impl DetailedSearchLogger { ) .unwrap(); } - writeln!(&mut file, - "{ranking_rule_idx}.{self_activated_id} {{ + writeln!( + &mut file, + "{ranking_rule_idx}.{self_activated_id} {{ style {{ fill: \"#D8A7B1\" }} -}}").unwrap(); +}}" + ) + .unwrap(); } - SearchEvents::RankingRuleNextBucket { ranking_rule_idx, time, universe, candidates } => { + SearchEvents::RankingRuleNextBucket { + ranking_rule_idx, + time, + universe, + candidates, + } => { let _elapsed = time.duration_since(prev_time); prev_time = *time; let old_activated_id = activated_id(×tamp); // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); *timestamp.last_mut().unwrap() += 1; let next_activated_id = activated_id(×tamp); - writeln!(&mut file, + writeln!(&mut file, "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket {}/{}", candidates.len(), universe.len()) .unwrap(); } @@ -255,7 +290,7 @@ impl DetailedSearchLogger { *timestamp.last_mut().unwrap() += 1; let next_activated_id = activated_id(×tamp); let len = candidates.len(); - writeln!(&mut file, + writeln!(&mut file, "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : skip bucket ({len})",) .unwrap(); } @@ -280,14 +315,14 @@ impl DetailedSearchLogger { } SearchEvents::ExtendResults { new } => { if new.is_empty() { - continue + continue; } let cur_ranking_rule = timestamp.len() - 1; let cur_activated_id = activated_id(×tamp); let docids = new.iter().collect::>(); let len = new.len(); let random = random::(); - + writeln!( &mut file, "{cur_ranking_rule}.{cur_activated_id} -> results.{random} : \"add {len}\" @@ -300,7 +335,7 @@ results.{random} {{ " ) .unwrap(); - }, + } SearchEvents::WordsState { query_graph } => { let cur_ranking_rule = timestamp.len() - 1; *timestamp.last_mut().unwrap() += 1; @@ -314,9 +349,18 @@ results.{random} {{ &mut file, "{id} {{ link: \"{id}.d2.svg\" -}}").unwrap(); - }, - SearchEvents::ProximityState { graph, paths, empty_paths_cache, universe, distances, cost } => { +}}" + ) + .unwrap(); + } + SearchEvents::ProximityState { + graph, + paths, + empty_paths_cache, + universe, + distances, + cost, + } => { let cur_ranking_rule = 
timestamp.len() - 1; *timestamp.last_mut().unwrap() += 1; let cur_activated_id = activated_id(×tamp); @@ -324,15 +368,32 @@ results.{random} {{ let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description(ctx, graph, paths, empty_paths_cache, distances.clone(), &mut new_file); + Self::ranking_rule_graph_d2_description( + ctx, + graph, + paths, + empty_paths_cache, + distances.clone(), + &mut new_file, + ); writeln!( &mut file, "{id} {{ link: \"{id}.d2.svg\" tooltip: \"cost {cost}, universe len: {}\" -}}", universe.len()).unwrap(); - }, - SearchEvents::TypoState { graph, paths, empty_paths_cache, universe, distances, cost } => { +}}", + universe.len() + ) + .unwrap(); + } + SearchEvents::TypoState { + graph, + paths, + empty_paths_cache, + universe, + distances, + cost, + } => { let cur_ranking_rule = timestamp.len() - 1; *timestamp.last_mut().unwrap() += 1; let cur_activated_id = activated_id(×tamp); @@ -340,89 +401,130 @@ results.{random} {{ let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description(ctx,graph, paths, empty_paths_cache, distances.clone(), &mut new_file); + Self::ranking_rule_graph_d2_description( + ctx, + graph, + paths, + empty_paths_cache, + distances.clone(), + &mut new_file, + ); writeln!( &mut file, "{id} {{ link: \"{id}.d2.svg\" tooltip: \"cost {cost}, universe len: {}\" -}}", universe.len()).unwrap(); - }, +}}", + universe.len() + ) + .unwrap(); + } } } writeln!(&mut file, "}}").unwrap(); } - - fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, distances: &[(u16, SmallBitmap)], file: &mut File) { + + fn query_node_d2_desc( + ctx: &mut SearchContext, + node_idx: usize, + node: &QueryNode, + distances: &[(u16, SmallBitmap)], + file: &mut File, + ) { match &node { - QueryNode::Term(LocatedQueryTerm { value, .. 
}) => { - match value { - QueryTerm::Phrase { phrase } => { - let phrase = ctx.phrase_interner.get(*phrase); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file,"{node_idx} : \"{phrase_str}\"").unwrap(); - }, - QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db, synonyms, split_words } } => { - let original = ctx.word_interner.get(*original); - writeln!(file,"{node_idx} : \"{original}\" {{ -shape: class").unwrap(); - for w in zero_typo.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 0").unwrap(); - } - for w in one_typo.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 1").unwrap(); - } - for w in two_typos.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 2").unwrap(); - } - if let Some(split_words) = split_words { - let phrase = ctx.phrase_interner.get(*split_words); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); - } - for synonym in synonyms.iter().copied() { - let phrase = ctx.phrase_interner.get(synonym); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); - } - if *use_prefix_db { - writeln!(file, "use prefix DB : true").unwrap(); - } - for (d, edges) in distances.iter() { - writeln!(file, "\"distance {d}\" : {:?}", edges.iter().collect::>() ).unwrap(); - } - - writeln!(file, "}}").unwrap(); - }, + QueryNode::Term(LocatedQueryTerm { value, .. }) => match value { + QueryTerm::Phrase { phrase } => { + let phrase = ctx.phrase_interner.get(*phrase); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "{node_idx} : \"{phrase_str}\"").unwrap(); + } + QueryTerm::Word { + derivations: + WordDerivations { + original, + zero_typo, + one_typo, + two_typos, + use_prefix_db, + synonyms, + split_words, + }, + } => { + let original = ctx.word_interner.get(*original); + writeln!( + file, + "{node_idx} : \"{original}\" {{ +shape: class" + ) + .unwrap(); + for w in zero_typo.iter().copied() { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 0").unwrap(); + } + for w in one_typo.iter().copied() { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 1").unwrap(); + } + for w in two_typos.iter().copied() { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 2").unwrap(); + } + if let Some(split_words) = split_words { + let phrase = ctx.phrase_interner.get(*split_words); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); + } + for synonym in synonyms.iter().copied() { + let phrase = ctx.phrase_interner.get(synonym); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); + } + if *use_prefix_db { + writeln!(file, "use prefix DB : true").unwrap(); + } + for (d, edges) in distances.iter() { + writeln!(file, "\"distance {d}\" : {:?}", edges.iter().collect::>()) + .unwrap(); + } + + writeln!(file, "}}").unwrap(); } }, QueryNode::Deleted => panic!(), QueryNode::Start => { - writeln!(file,"{node_idx} : START").unwrap(); - }, + writeln!(file, "{node_idx} : START").unwrap(); + } QueryNode::End => { - writeln!(file,"{node_idx} : END").unwrap(); - }, + writeln!(file, "{node_idx} : END").unwrap(); + } } } - fn query_graph_d2_description(ctx: &mut SearchContext, query_graph: &QueryGraph, file: &mut File) { - 
writeln!(file,"direction: right").unwrap(); + fn query_graph_d2_description( + ctx: &mut SearchContext, + query_graph: &QueryGraph, + file: &mut File, + ) { + writeln!(file, "direction: right").unwrap(); for node in 0..query_graph.nodes.len() { if matches!(query_graph.nodes[node], QueryNode::Deleted) { continue; } Self::query_node_d2_desc(ctx, node, &query_graph.nodes[node], &[], file); - + for edge in query_graph.edges[node].successors.iter() { writeln!(file, "{node} -> {edge};\n").unwrap(); } - } + } } - fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { - writeln!(file,"direction: right").unwrap(); + fn ranking_rule_graph_d2_description( + ctx: &mut SearchContext, + graph: &RankingRuleGraph, + paths: &[Vec], + _empty_paths_cache: &EmptyPathsCache, + distances: Vec>, + file: &mut File, + ) { + writeln!(file, "direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); for (node_idx, node) in graph.query_graph.nodes.iter().enumerate() { @@ -437,17 +539,21 @@ shape: class").unwrap(); match &details { EdgeDetails::Unconditional => { - writeln!(file, + writeln!( + file, "{from_node} -> {to_node} : \"always cost {cost}\"", cost = edge.cost, - ).unwrap(); + ) + .unwrap(); } EdgeDetails::Data(details) => { - writeln!(file, + writeln!( + file, "{from_node} -> {to_node} : \"cost {cost} {edge_label}\"", cost = edge.cost, edge_label = R::graphviz_edge_details_label(details) - ).unwrap(); + ) + .unwrap(); } } } @@ -457,12 +563,11 @@ shape: class").unwrap(); // Self::paths_d2_description(graph, paths, file); // writeln!(file, "}}").unwrap(); - writeln!(file, "Shortest Paths {{").unwrap(); Self::paths_d2_description(ctx, graph, paths, file); writeln!(file, "}}").unwrap(); - // writeln!(file, "Empty Edge Couples {{").unwrap(); + // writeln!(file, "Empty Edge Couples {{").unwrap(); // for (i, (e1, e2)) in empty_paths_cache.empty_couple_edges.iter().enumerate() { // writeln!(file, "{i} : \"\" {{").unwrap(); // Self::edge_d2_description(graph, *e1, file); @@ -478,18 +583,24 @@ shape: class").unwrap(); // } // writeln!(file, "}}").unwrap(); } - fn edge_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, edge_idx: u16, file: &mut File) { - let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; + fn edge_d2_description( + ctx: &mut SearchContext, + graph: &RankingRuleGraph, + edge_idx: u16, + file: &mut File, + ) { + let Edge { from_node, to_node, cost, .. 
} = + graph.all_edges[edge_idx as usize].as_ref().unwrap(); let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { QueryNode::Term(term) => match &term.value { QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); phrase.description(&ctx.word_interner) - }, + } QueryTerm::Word { derivations } => { ctx.word_interner.get(derivations.original).to_owned() - }, + } }, QueryNode::Deleted => panic!(), QueryNode::Start => "START".to_owned(), @@ -501,18 +612,29 @@ shape: class").unwrap(); QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); phrase.description(&ctx.word_interner) - }, - QueryTerm::Word { derivations } => ctx.word_interner.get(derivations.original).to_owned(), + } + QueryTerm::Word { derivations } => { + ctx.word_interner.get(derivations.original).to_owned() + } }, QueryNode::Deleted => panic!(), QueryNode::Start => "START".to_owned(), QueryNode::End => "END".to_owned(), }; - writeln!(file, "{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ + writeln!( + file, + "{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ shape: class - }}").unwrap(); + }}" + ) + .unwrap(); } - fn paths_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { + fn paths_d2_description( + ctx: &mut SearchContext, + graph: &RankingRuleGraph, + paths: &[Vec], + file: &mut File, + ) { for (path_idx, edge_indexes) in paths.iter().enumerate() { writeln!(file, "{path_idx} {{").unwrap(); for edge_idx in edge_indexes.iter() { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 8a10fd064..11e1389d0 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -3,11 +3,9 @@ pub mod detailed; use roaring::RoaringBitmap; -use super::{ - ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph}, - small_bitmap::SmallBitmap, - RankingRule, RankingRuleQueryTrait, -}; +use super::ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph}; +use super::small_bitmap::SmallBitmap; +use super::{RankingRule, RankingRuleQueryTrait}; pub struct DefaultSearchLogger; impl SearchLogger for DefaultSearchLogger { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index dc73fe51c..0dbdd93b0 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -11,12 +11,8 @@ mod small_bitmap; mod sort; mod words; -use self::interner::Interner; -use self::logger::SearchLogger; -use self::query_term::Phrase; -use self::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}; -use crate::new::query_term::located_query_terms_from_string; -use crate::{Filter, Index, Result, TermsMatchingStrategy}; +use std::collections::BTreeSet; + use charabia::Tokenize; use db_cache::DatabaseCache; use heed::RoTxn; @@ -26,7 +22,13 @@ pub use ranking_rules::{ RankingRuleOutputIterWrapper, RankingRuleQueryTrait, }; use roaring::RoaringBitmap; -use std::collections::BTreeSet; + +use self::interner::Interner; +use self::logger::SearchLogger; +use self::query_term::Phrase; +use self::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}; +use crate::search::new::query_term::located_query_terms_from_string; +use crate::{Filter, Index, Result, TermsMatchingStrategy}; pub enum BitmapOrAllRef<'s> { Bitmap(&'s RoaringBitmap), diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index b5e29bffc..46a62b4a9 100644 --- 
a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -12,13 +12,12 @@ use heed::types::DecodeIgnore; use heed::RoTxn; use itertools::Itertools; +use super::interner::{Interned, Interner}; +use super::SearchContext; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; use crate::{CboRoaringBitmapLenCodec, Index, Result}; -use super::interner::{Interned, Interner}; -use super::SearchContext; - #[derive(Default, Clone, PartialEq, Eq, Hash)] pub struct Phrase { pub words: Vec>>, diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 261f2909b..d9732b010 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,8 +1,8 @@ use std::collections::HashSet; use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::small_bitmap::SmallBitmap; -use crate::new::{QueryGraph, SearchContext}; +use crate::search::new::small_bitmap::SmallBitmap; +use crate::search::new::{QueryGraph, SearchContext}; use crate::Result; impl RankingRuleGraph { diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 1adade945..8627860e7 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -1,11 +1,12 @@ #![allow(clippy::too_many_arguments)] +use std::collections::btree_map::Entry; +use std::collections::{BTreeMap, VecDeque}; + use super::empty_paths_cache::EmptyPathsCache; use super::{RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::small_bitmap::SmallBitmap; +use crate::search::new::small_bitmap::SmallBitmap; use crate::Result; -use std::collections::btree_map::Entry; -use std::collections::{BTreeMap, VecDeque}; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Path { diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 9823c4fcc..c0c46289c 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -1,11 +1,12 @@ use std::marker::PhantomData; -use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::{BitmapOrAllRef, SearchContext}; -use crate::Result; use fxhash::FxHashMap; use roaring::RoaringBitmap; +use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::{BitmapOrAllRef, SearchContext}; +use crate::Result; + // TODO: the cache should have a G::EdgeDetails as key // but then it means that we should have a quick way of // computing their hash and comparing them diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index 3c8fb5184..659042a01 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,6 +1,5 @@ -use crate::new::small_bitmap::SmallBitmap; - use super::paths_map::PathsMap; +use crate::search::new::small_bitmap::SmallBitmap; #[derive(Clone)] pub struct EmptyPathsCache { diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 6d7445eac..635f194f5 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ 
b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -6,16 +6,17 @@ mod paths_map; mod proximity; mod typo; -use super::logger::SearchLogger; -use super::small_bitmap::SmallBitmap; -use super::{QueryGraph, QueryNode, SearchContext}; -use crate::Result; pub use edge_docids_cache::EdgeDocidsCache; pub use empty_paths_cache::EmptyPathsCache; pub use proximity::ProximityGraph; use roaring::RoaringBitmap; pub use typo::TypoGraph; +use super::logger::SearchLogger; +use super::small_bitmap::SmallBitmap; +use super::{QueryGraph, QueryNode, SearchContext}; +use crate::Result; + #[derive(Debug, Clone)] pub enum EdgeDetails { Unconditional, diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 0cce9c93f..82f181b97 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -1,11 +1,10 @@ -use crate::new::small_bitmap::SmallBitmap; use super::cheapest_paths::Path; +use crate::search::new::small_bitmap::SmallBitmap; // What is PathsMap used for? // For the empty_prefixes field in the EmptyPathsCache only :/ // but it could be used for more, like efficient computing of a set of paths - #[derive(Debug, Clone)] pub struct PathsMap { pub nodes: Vec<(u16, PathsMap)>, @@ -53,10 +52,10 @@ impl PathsMap { } } fn remove_first_rec(&mut self, cur: &mut Vec) -> (bool, V) { - let Some((first_edge, rest)) = self.nodes.first_mut() else { + let Some((first_edge, rest)) = self.nodes.first_mut() else { // The PathsMap has to be correct by construction here, otherwise // the unwrap() will crash - return (true, self.value.take().unwrap()) + return (true, self.value.take().unwrap()) }; cur.push(*first_edge); let (rest_is_empty, value) = rest.remove_first_rec(cur); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index e0bc1f5e4..48a6dda7e 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -1,12 +1,14 @@ -use super::ProximityEdge; -use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; -use crate::new::ranking_rule_graph::proximity::WordPair; -use crate::new::ranking_rule_graph::EdgeDetails; -use crate::new::{QueryNode, SearchContext}; -use crate::Result; -use itertools::Itertools; use std::collections::BTreeMap; +use itertools::Itertools; + +use super::ProximityEdge; +use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use crate::search::new::ranking_rule_graph::proximity::WordPair; +use crate::search::new::ranking_rule_graph::EdgeDetails; +use crate::search::new::{QueryNode, SearchContext}; +use crate::Result; + pub fn visit_from_node( ctx: &mut SearchContext, from_node: &QueryNode, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 94a46d670..9aa4ce446 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,8 +1,9 @@ -use super::{ProximityEdge, WordPair}; -use crate::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Result}; use roaring::RoaringBitmap; +use super::{ProximityEdge, WordPair}; +use crate::search::new::SearchContext; +use crate::{CboRoaringBitmapCodec, Result}; + pub fn compute_docids<'search>( ctx: &mut SearchContext<'search>, edge: 
&ProximityEdge, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 7cc4f995f..09c9aa960 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -1,15 +1,16 @@ pub mod build; pub mod compute_docids; +use roaring::RoaringBitmap; + use super::empty_paths_cache::EmptyPathsCache; use super::{EdgeDetails, RankingRuleGraphTrait}; -use crate::new::interner::Interned; -use crate::new::logger::SearchLogger; -use crate::new::query_term::WordDerivations; -use crate::new::small_bitmap::SmallBitmap; -use crate::new::{QueryGraph, QueryNode, SearchContext}; +use crate::search::new::interner::Interned; +use crate::search::new::logger::SearchLogger; +use crate::search::new::query_term::WordDerivations; +use crate::search::new::small_bitmap::SmallBitmap; +use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; -use roaring::RoaringBitmap; // TODO: intern the proximity edges as well? diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index d3aec7174..ce569fbb0 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,15 +1,16 @@ -use super::empty_paths_cache::EmptyPathsCache; -use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::interner::Interned; -use crate::new::logger::SearchLogger; -use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; -use crate::new::resolve_query_graph::resolve_phrase; -use crate::new::small_bitmap::SmallBitmap; -use crate::new::{QueryGraph, QueryNode, SearchContext}; -use crate::{Result, RoaringBitmapCodec}; use heed::BytesDecode; use roaring::RoaringBitmap; +use super::empty_paths_cache::EmptyPathsCache; +use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::interner::Interned; +use crate::search::new::logger::SearchLogger; +use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; +use crate::search::new::resolve_query_graph::resolve_phrase; +use crate::search::new::small_bitmap::SmallBitmap; +use crate::search::new::{QueryGraph, QueryNode, SearchContext}; +use crate::{Result, RoaringBitmapCodec}; + #[derive(Clone)] pub enum TypoEdge { Phrase { phrase: Interned }, diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 82216c9cf..788f8a496 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -1,11 +1,10 @@ -use super::logger::SearchLogger; -use super::QueryGraph; -use super::SearchContext; -use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; -use crate::new::ranking_rule_graph::ProximityGraph; -use crate::new::ranking_rule_graph::TypoGraph; -use crate::new::words::Words; use roaring::RoaringBitmap; + +use super::logger::SearchLogger; +use super::{QueryGraph, SearchContext}; +use crate::search::new::graph_based_ranking_rule::GraphBasedRankingRule; +use crate::search::new::ranking_rule_graph::{ProximityGraph, TypoGraph}; +use crate::search::new::words::Words; // use crate::search::new::sort::Sort; use crate::{Result, TermsMatchingStrategy}; @@ -239,16 +238,18 @@ pub fn apply_ranking_rules<'search>( #[cfg(test)] mod tests { // use crate::allocator::ALLOC; - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use 
crate::new::{execute_search, SearchContext}; - use big_s::S; - use heed::EnvOpenOptions; - use maplit::hashset; use std::fs::File; use std::io::{BufRead, BufReader, Cursor, Seek}; use std::time::Instant; - // use crate::new::logger::detailed::DetailedSearchLogger; - use crate::new::logger::DefaultSearchLogger; + + use big_s::S; + use heed::EnvOpenOptions; + use maplit::hashset; + + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + // use crate::search::new::logger::detailed::DetailedSearchLogger; + use crate::search::new::logger::DefaultSearchLogger; + use crate::search::new::{execute_search, SearchContext}; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy}; @@ -265,7 +266,7 @@ mod tests { // loop { let start = Instant::now(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( &mut ctx, @@ -362,7 +363,7 @@ mod tests { // loop { let start = Instant::now(); - let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( &mut ctx, diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 4fa0912e1..0581341d1 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -1,12 +1,14 @@ +use std::collections::VecDeque; + +use fxhash::FxHashMap; +use heed::BytesDecode; +use roaring::{MultiOps, RoaringBitmap}; + use super::interner::Interned; use super::query_term::{Phrase, QueryTerm, WordDerivations}; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; -use fxhash::FxHashMap; -use heed::BytesDecode; -use roaring::{MultiOps, RoaringBitmap}; -use std::collections::VecDeque; // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. 
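 // A rough sketch, for illustration only (the helper below is hypothetical and
 // not part of this patch): resolving a term node of the query graph amounts to
 // a multi-way union of the docids bitmaps of all its word derivations, which
 // `roaring`'s `MultiOps` performs in a single pass:
 //
 //     use roaring::{MultiOps, RoaringBitmap};
 //
 //     fn union_of_derivations(derivations: Vec<RoaringBitmap>) -> RoaringBitmap {
 //         // A single k-way union is cheaper than folding the bitmaps
 //         // together one `|=` at a time.
 //         derivations.into_iter().union()
 //     }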
#[derive(Default)] diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index f0967843b..d5a6276ad 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -1,3 +1,5 @@ +use roaring::RoaringBitmap; + use super::logger::SearchLogger; use super::{ RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper, @@ -11,7 +13,6 @@ use crate::{ Index, Result, }; -use roaring::RoaringBitmap; pub struct Sort<'search, Query> { field_name: String, diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 9ad8b33ba..2858e1569 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -1,9 +1,11 @@ +use std::collections::BTreeSet; + +use roaring::RoaringBitmap; + use super::logger::SearchLogger; use super::resolve_query_graph::resolve_query_graph; use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput, SearchContext}; use crate::{Result, TermsMatchingStrategy}; -use roaring::RoaringBitmap; -use std::collections::BTreeSet; pub struct Words { exhausted: bool, From 4e266211bfdc60dcc0117c6411c31546aad508fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 8 Mar 2023 10:12:05 +0100 Subject: [PATCH 045/234] Small code reorganisation --- milli/src/lib.rs | 2 ++ milli/src/search/new/mod.rs | 5 ++-- milli/src/search/new/small_bitmap.rs | 42 ++-------------------------- 3 files changed, 8 insertions(+), 41 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index af4324ae4..ade6ee8bd 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -70,6 +70,8 @@ pub mod update; #[macro_use] pub mod snapshot_tests; +pub use search::new::{execute_search, SearchContext}; + use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 0dbdd93b0..235075580 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -11,6 +11,8 @@ mod small_bitmap; mod sort; mod words; +pub use logger::{DefaultSearchLogger, SearchLogger}; + use std::collections::BTreeSet; use charabia::Tokenize; @@ -24,7 +26,6 @@ pub use ranking_rules::{ use roaring::RoaringBitmap; use self::interner::Interner; -use self::logger::SearchLogger; use self::query_term::Phrase; use self::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}; use crate::search::new::query_term::located_query_terms_from_string; @@ -57,7 +58,7 @@ impl<'search> SearchContext<'search> { } #[allow(clippy::too_many_arguments)] -pub fn resolve_maximally_reduced_query_graph<'search>( +fn resolve_maximally_reduced_query_graph<'search>( ctx: &mut SearchContext<'search>, universe: &RoaringBitmap, query_graph: &QueryGraph, diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index f7adecee0..fea5a5684 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -1,31 +1,3 @@ -// #[macro_export] -// macro_rules! 
iter_bitmap { -// ($bitmap:expr, $id:lifetime, $p:pat, $body:block) => { -// match $bitmap { -// SmallBitmap::Tiny(mut set) => { -// while set > 0 { -// let $p = set.trailing_zeros() as u16; -// $body; -// set &= set - 1; -// } -// } -// SmallBitmap::Small(sets) => { -// let mut base = 0; -// for set in sets.iter() { -// let mut set = *set; -// while set > 0 { -// let idx = set.trailing_zeros() as u16; -// let $p = idx + base; -// set &= set - 1; -// $body; -// } -// base += 64; -// } -// } -// } -// }; -// } - #[derive(Clone)] pub enum SmallBitmap { Tiny(u64), @@ -256,16 +228,8 @@ mod tests { bitmap2.insert(x * 3); } bitmap1.intersection(&bitmap2); - // println!("{}", bitmap.contains(12)); - // bitmap1 - // .iter(|x| { - // println!("{x}"); - // Ok(()) - // }) - // .unwrap(); - - // iter_bitmap!(bitmap1, 'loop1, x, { - // println!("{x}"); - // }) + for v in bitmap1.iter() { + println!("{v}"); + } } } From c232cdabf5b594e3882232806ad8657a78a209dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 8 Mar 2023 13:26:29 +0100 Subject: [PATCH 046/234] Add documentation --- milli/src/search/new/db_cache.rs | 169 ++++----- .../search/new/graph_based_ranking_rule.rs | 100 ++++- milli/src/search/new/interner.rs | 6 +- milli/src/search/new/logger/mod.rs | 135 ++++--- milli/src/search/new/mod.rs | 2 +- milli/src/search/new/query_graph.rs | 128 +++++-- .../search/new/ranking_rule_graph/typo/mod.rs | 2 +- milli/src/search/new/ranking_rules.rs | 359 +----------------- milli/src/search/new/resolve_query_graph.rs | 2 +- milli/src/search/new/words.rs | 3 +- 10 files changed, 358 insertions(+), 548 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 100dae90a..7e68ec5e5 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -1,15 +1,21 @@ use std::collections::hash_map::Entry; +use std::hash::Hash; use fxhash::FxHashMap; use heed::types::ByteSlice; +use heed::{BytesEncode, Database, RoTxn}; use super::interner::Interned; use super::SearchContext; use crate::Result; +/// A cache storing pointers to values in the LMDB databases. +/// +/// Used for performance reasons only. By using this cache, we avoid performing a +/// database lookup and instead get a direct reference to the value using a fast +/// local HashMap lookup. #[derive(Default)] pub struct DatabaseCache<'search> { - // TODO: interner for all database cache keys? 
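+    // The cached values are `Option`s: a `None` records that a key was already
+    // looked up and is absent from the database, so repeated misses also skip
+    // the LMDB access.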
    pub word_pair_proximity_docids:
        FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
    pub word_prefix_pair_proximity_docids:
@@ -21,36 +27,50 @@ pub struct DatabaseCache<'search> {
     pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
 }
 impl<'search> SearchContext<'search> {
-    pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
-        let bitmap_ptr = match self.db_cache.word_docids.entry(word) {
+    fn get_value<'v, K1, KC>(
+        txn: &'search RoTxn,
+        cache_key: K1,
+        db_key: &'v KC::EItem,
+        cache: &mut FxHashMap<K1, Option<&'search [u8]>>,
+        db: Database<KC, ByteSlice>,
+    ) -> Result<Option<&'search [u8]>>
+    where
+        K1: Copy + Eq + Hash,
+        KC: BytesEncode<'v>,
+    {
+        let bitmap_ptr = match cache.entry(cache_key) {
             Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
             Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .word_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(self.txn, self.word_interner.get(word))?;
+                let bitmap_ptr = db.get(txn, db_key)?;
                 entry.insert(bitmap_ptr);
                 bitmap_ptr
             }
         };
         Ok(bitmap_ptr)
     }
-    pub fn get_prefix_docids(&mut self, prefix: Interned<String>) -> Result<Option<&'search [u8]>> {
-        // In the future, this will be a frozen roaring bitmap
-        let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) {
-            Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
-            Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .word_prefix_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(self.txn, self.word_interner.get(prefix))?;
-                entry.insert(bitmap_ptr);
-                bitmap_ptr
-            }
-        };
-        Ok(bitmap_ptr)
+
+    /// Retrieve or insert the given value in the `word_docids` database.
+    pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
+        Self::get_value(
+            self.txn,
+            word,
+            self.word_interner.get(word).as_str(),
+            &mut self.db_cache.word_docids,
+            self.index.word_docids.remap_data_type::<ByteSlice>(),
+        )
+    }
+    /// Retrieve or insert the given value in the `word_prefix_docids` database.
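+    ///
+    /// A usage sketch (hypothetical variables: `ctx` is a `SearchContext` and
+    /// `prefix` an interned prefix string):
+    ///
+    /// ```ignore
+    /// if let Some(bytes) = ctx.get_word_prefix_docids(prefix)? {
+    ///     // `bytes` borrows from LMDB; decode it only when needed
+    ///     let docids = RoaringBitmapCodec::bytes_decode(bytes);
+    /// }
+    /// ```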
+    pub fn get_word_prefix_docids(
+        &mut self,
+        prefix: Interned<String>,
+    ) -> Result<Option<&'search [u8]>> {
+        Self::get_value(
+            self.txn,
+            prefix,
+            self.word_interner.get(prefix).as_str(),
+            &mut self.db_cache.word_prefix_docids,
+            self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
+        )
     }
 
     pub fn get_word_pair_proximity_docids(
@@ -59,40 +79,17 @@ impl<'search> SearchContext<'search> {
         word2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<&'search [u8]>> {
-        let key = (proximity, word1, word2);
-        match self.db_cache.word_pair_proximity_docids.entry(key) {
-            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
-            Entry::Vacant(entry) => {
-                // We shouldn't greedily access this DB at all
-                // a DB (w1, w2) -> [proximities] would be much better
-                // We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity
-                // And if we worked with words encoded as integers, the set of words could be a roaring bitmap
-                // Then, to find all the proximities between two list of words, we'd do:
-
-                // inputs:
-                //    - words1 (roaring bitmap)
-                //    - words2 (roaring bitmap)
-                // output:
-                //    - [(word1, word2, [proximities])]
-                // algo:
-                //  let mut ouput = vec![];
-                //  for word1 in words1 {
-                //      let all_words_in_proximity_of_w1 = pair_words_db.get(word1);
-                //      let words_in_proximity_of_w1 = all_words_in_proximity_of_w1 & words2;
-                //      for word2 in words_in_proximity_of_w1 {
-                //          let proximties = prox_db.get(word1, word2);
-                //          output.push(word1, word2, proximities);
-                //      }
-                //  }
-                let bitmap_ptr =
-                    self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().get(
-                        self.txn,
-                        &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
-                    )?;
-                entry.insert(bitmap_ptr);
-                Ok(bitmap_ptr)
-            }
-        }
+        Self::get_value(
+            self.txn,
+            (proximity, word1, word2),
+            &(
+                proximity,
+                self.word_interner.get(word1).as_str(),
+                self.word_interner.get(word2).as_str(),
+            ),
+            &mut self.db_cache.word_pair_proximity_docids,
+            self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
+        )
     }
 
     pub fn get_word_prefix_pair_proximity_docids(
@@ -101,22 +98,17 @@ impl<'search> SearchContext<'search> {
         prefix2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<&'search [u8]>> {
-        let key = (proximity, word1, prefix2);
-        match self.db_cache.word_prefix_pair_proximity_docids.entry(key) {
-            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
-            Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .word_prefix_pair_proximity_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(
-                        self.txn,
-                        &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
-                    )?;
-                entry.insert(bitmap_ptr);
-                Ok(bitmap_ptr)
-            }
-        }
+        Self::get_value(
+            self.txn,
+            (proximity, word1, prefix2),
+            &(
+                proximity,
+                self.word_interner.get(word1).as_str(),
+                self.word_interner.get(prefix2).as_str(),
+            ),
+            &mut self.db_cache.word_prefix_pair_proximity_docids,
+            self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
+        )
     }
 
     pub fn get_prefix_word_pair_proximity_docids(
         &mut self,
         right: Interned<String>,
         proximity: u8,
     ) -> Result<Option<&'search [u8]>> {
-        let key = (proximity, left_prefix, right);
-        match self.db_cache.prefix_word_pair_proximity_docids.entry(key) {
-            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
-            Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .prefix_word_pair_proximity_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(
-                        self.txn,
-                        &(
-                            proximity,
-                            self.word_interner.get(left_prefix),
-                            self.word_interner.get(right),
-                        ),
-                    )?;
-                entry.insert(bitmap_ptr);
-                Ok(bitmap_ptr)
-            }
-        }
+        Self::get_value(
+            self.txn,
+            (proximity, left_prefix,
right), + &( + proximity, + self.word_interner.get(left_prefix).as_str(), + self.word_interner.get(right).as_str(), + ), + &mut self.db_cache.prefix_word_pair_proximity_docids, + self.index.prefix_word_pair_proximity_docids.remap_data_type::(), + ) } } diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 1d17c32a8..2cedbffa5 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -1,3 +1,41 @@ +/*! Implementation of a generic graph-based ranking rule. + +A graph-based ranking rule is a ranking rule that works by representing +its possible operations and their relevancy cost as a directed acyclic multi-graph +built on top of the query graph. It then computes its buckets by finding the +cheapest paths from the start node to the end node and computing the document ids +that satisfy those paths. + +For example, the proximity ranking rule builds a graph where the edges between two +nodes represent a condition that the term of the source node is in a certain proximity +to the term of the destination node. With the query "pretty house by" where the term +"pretty" has three possible proximities to the term "house" and "house" has two +proximities to "by", the graph will look like this: + +```txt +┌───────┐ ┌───────┐─────1────▶┌───────┐──1──▶┌─────┐ ┌───────┐ +│ START │──0─▶│pretty │─────2────▶│ house │ │ by │─0─▶│ END │ +└───────┘ └───────┘─────3────▶└───────┘──2-─▶└─────┘ └───────┘ +``` +The proximity ranking rule's first bucket will be determined by the union of all +the shortest paths from START to END, which in this case is: +```txt +START --0-> pretty --1--> house --1--> by --0--> end +``` +The path's corresponding document ids are found by taking the intersection of the +document ids of each edge. That is, we find the documents where both `pretty` is +1-close to `house` AND `house` is 1-close to `by`. + +For the second bucket, we get the union of the second-cheapest paths, which are: +```txt +START --0-> pretty --1--> house --2--> by --0--> end +START --0-> pretty --2--> house --1--> by --0--> end +``` +That is we find the documents where either: +- `pretty` is 1-close to `house` AND `house` is 2-close to `by` +- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by` +*/ + use roaring::RoaringBitmap; use super::logger::SearchLogger; @@ -8,24 +46,38 @@ use super::small_bitmap::SmallBitmap; use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::Result; +/// A generic graph-based ranking rule pub struct GraphBasedRankingRule { id: String, + // When the ranking rule is not iterating over its buckets, + // its state is `None`. state: Option>, } impl GraphBasedRankingRule { + /// Creates the ranking rule with the given identifier pub fn new(id: String) -> Self { Self { id, state: None } } } +/// The internal state of a graph-based ranking rule during iteration pub struct GraphBasedRankingRuleState { + /// The current graph graph: RankingRuleGraph, + /// Cache to retrieve the docids associated with each edge edge_docids_cache: EdgeDocidsCache, + /// Cache used to optimistically discard paths that resolve to no documents. empty_paths_cache: EmptyPathsCache, + /// A structure giving the list of possible costs from each node to the end node, + /// along with a set of unavoidable edges that must be traversed to achieve that distance. 
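+    /// For example (a sketch of the invariant, not real data), `all_distances[n]`
+    /// could contain `[(2, {e1}), (4, {e3, e4})]`, meaning that the end node can be
+    /// reached from node `n` at cost 2 through edge `e1`, or at cost 4 through a
+    /// path that traverses both `e3` and `e4`.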
all_distances: Vec>, + /// An index in the first element of `all_distances`, giving the cost of the next bucket cur_distance_idx: usize, } +/// Traverse each edge of the graph, computes its associated document ids, +/// and remove this edge from the graph if its docids are disjoint with the +/// given universe. fn remove_empty_edges<'search, G: RankingRuleGraphTrait>( ctx: &mut SearchContext<'search>, graph: &mut RankingRuleGraph, @@ -70,6 +122,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> let mut edge_docids_cache = EdgeDocidsCache::default(); let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16); + // First simplify the graph as much as possible, by computing the docids of the edges + // within the rule's universe and removing the edges that have no associated docids. remove_empty_edges( ctx, &mut graph, @@ -77,6 +131,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> universe, &mut empty_paths_cache, )?; + + // Then pre-compute the cost of all paths from each node to the end node let all_distances = graph.initialize_distances_with_necessary_edges(); let state = GraphBasedRankingRuleState { @@ -98,9 +154,14 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { + // If universe.len() <= 1, the bucket sort algorithm + // should not have called this function. assert!(universe.len() > 1); + // Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`, + // should never happen let mut state = self.state.take().unwrap(); + // TODO: does this have a real positive performance cost? remove_empty_edges( ctx, &mut state.graph, @@ -109,12 +170,16 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> &mut state.empty_paths_cache, )?; + // If the cur_distance_idx does not point to a valid cost in the `all_distances` + // structure, then we have computed all the buckets and can return. if state.cur_distance_idx >= state.all_distances[state.graph.query_graph.root_node as usize].len() { self.state = None; return Ok(None); } + + // Retrieve the cost of the paths to compute let (cost, _) = state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx]; state.cur_distance_idx += 1; @@ -129,22 +194,38 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> cur_distance_idx: _, } = &mut state; - let mut paths = vec![]; let original_universe = universe; let mut universe = universe.clone(); // TODO: remove this unnecessary clone let original_graph = graph.clone(); + // and this vector as well + let mut paths = vec![]; + + // For each path of the given cost, we will compute its associated + // document ids. + // In case the path does not resolve to any document id, we try to figure out why + // and update the `empty_paths_cache` accordingly. + // For example, it may be that the path is empty because one of its edges is disjoint + // with the universe, or because a prefix of the path is disjoint with the universe, or because + // the path contains two edges that are disjoint from each other within the universe. + // Updating the empty_paths_cache helps speed up the execution of `visit_paths_of_cost` and reduces + // the number of future candidate paths given by that same function. 
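+        //
+        // A rough sketch of the loop below (simplified, not the exact control flow):
+        //
+        //     for each candidate path of cost `cost` {
+        //         path_docids = universe & docids(edge_1) & ... & docids(edge_n);
+        //         bucket |= path_docids;
+        //         universe -= path_docids;
+        //     }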
graph.visit_paths_of_cost( graph.query_graph.root_node as usize, cost, all_distances, empty_paths_cache, |path, graph, empty_paths_cache| { + // Accumulate the path for logging purposes only paths.push(path.to_vec()); let mut path_docids = universe.clone(); + + // We store the edges and their docids in vectors in case the path turns out to be + // empty and we need to figure out why it was empty. let mut visited_edges = vec![]; let mut cached_edge_docids = vec![]; + for &edge_index in path { visited_edges.push(edge_index); let edge_docids = @@ -154,21 +235,29 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> BitmapOrAllRef::All => continue, }; cached_edge_docids.push((edge_index, edge_docids.clone())); + + // If the edge is empty, then the path will be empty as well, we update the graph + // and caches accordingly and skip to the next candidate path. if edge_docids.is_disjoint(&universe) { // 1. Store in the cache that this edge is empty for this universe empty_paths_cache.forbid_edge(edge_index); // 2. remove this edge from the ranking rule graph graph.remove_edge(edge_index); + // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore edge_docids_cache.cache.remove(&edge_index); return Ok(()); } path_docids &= edge_docids; + // If the (sub)path is empty, we try to figure out why and update the caches accordingly. if path_docids.is_disjoint(&universe) { - // empty_paths_cache.forbid_prefix(&visited_edges); - // if the intersection between this edge and any + // First, we know that this path is empty, and thus any path + // that is a superset of it will also be empty. + empty_paths_cache.forbid_prefix(&visited_edges); + // Second, if the intersection between this edge and any // previous one is disjoint with the universe, - // then we add these two edges to the empty_path_cache + // then we also know that any path containing the same couple of + // edges will also be empty. for (edge_index2, edge_docids2) in cached_edge_docids[..cached_edge_docids.len() - 1].iter() { @@ -181,6 +270,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> } } bucket |= &path_docids; + // Reduce the size of the universe so that we can more optimistically discard candidate paths universe -= path_docids; Ok(()) }, @@ -196,6 +286,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> logger, ); + // TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however, + // remove nodes and/or terms within nodes that weren't present in any of the paths. let next_query_graph = state.graph.query_graph.clone(); self.state = Some(state); diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index 8a8fad1e1..e68f3b949 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -3,6 +3,7 @@ use std::marker::PhantomData; use fxhash::FxHashMap; +/// An index within a [`Interner`] structure. pub struct Interned { idx: u32, _phantom: PhantomData, @@ -13,7 +14,10 @@ impl Interned { Self { idx, _phantom: PhantomData } } } - +/// An [`Interner`] is used to store a unique copy of a value of type `T`. This value +/// is then identified by a lightweight index of type [`Interned`], which can +/// be copied, compared, and hashed efficiently. An immutable reference to the original value +/// can be retrieved using `self.get(interned)`. 
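+///
+/// A minimal usage sketch (assuming an `Interner<String>` built with `Default`):
+///
+/// ```ignore
+/// let mut words = Interner::<String>::default();
+/// let a = words.insert("house".to_owned());
+/// let b = words.insert("house".to_owned());
+/// // both calls return an index pointing to the same stable copy of the string
+/// assert_eq!(words.get(a), words.get(b));
+/// ```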
pub struct Interner { stable_store: Vec, lookup: FxHashMap>, diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 11e1389d0..d4d64f844 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -7,7 +7,82 @@ use super::ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGrap use super::small_bitmap::SmallBitmap; use super::{RankingRule, RankingRuleQueryTrait}; +/// Trait for structure logging the execution of a search query. +pub trait SearchLogger { + /// Logs the initial query + fn initial_query(&mut self, query: &Q); + + /// Logs the query that was used to compute the set of all candidates + fn query_for_universe(&mut self, query: &Q); + + /// Logs the value of the initial set of all candidates + fn initial_universe(&mut self, universe: &RoaringBitmap); + + /// Logs the ranking rules used to perform the search query + fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]); + + /// Logs the start of a ranking rule's iteration. + fn start_iteration_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, Q>, + query: &Q, + universe: &RoaringBitmap, + ); + /// Logs the end of the computation of a ranking rule bucket + fn next_bucket_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, Q>, + universe: &RoaringBitmap, + candidates: &RoaringBitmap, + ); + /// Logs the skipping of a ranking rule bucket + fn skip_bucket_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, Q>, + candidates: &RoaringBitmap, + ); + /// Logs the end of a ranking rule's iteration. + fn end_iteration_ranking_rule<'transaction>( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule<'transaction, Q>, + universe: &RoaringBitmap, + ); + /// Logs the addition of document ids to the final results + fn add_to_results(&mut self, docids: &[u32]); + + /// Logs the internal state of the words ranking rule + fn log_words_state(&mut self, query_graph: &Q); + + /// Logs the internal state of the proximity ranking rule + fn log_proximity_state( + &mut self, + query_graph: &RankingRuleGraph, + paths: &[Vec], + empty_paths_cache: &EmptyPathsCache, + universe: &RoaringBitmap, + distances: Vec>, + cost: u16, + ); + + /// Logs the internal state of the typo ranking rule + fn log_typo_state( + &mut self, + query_graph: &RankingRuleGraph, + paths: &[Vec], + empty_paths_cache: &EmptyPathsCache, + universe: &RoaringBitmap, + distances: Vec>, + cost: u16, + ); +} + +/// A dummy [`SearchLogger`] which does nothing. 
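+///
+/// A sketch of its intended use, following the `execute_search` entry point used
+/// elsewhere in this crate:
+///
+/// ```ignore
+/// let results = execute_search(&mut ctx, "sunflower", None, 0, 20, &mut DefaultSearchLogger)?;
+/// ```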
pub struct DefaultSearchLogger; + impl SearchLogger for DefaultSearchLogger { fn initial_query(&mut self, _query: &Q) {} @@ -76,63 +151,3 @@ impl SearchLogger for DefaultSearchLogger { ) { } } - -pub trait SearchLogger { - fn initial_query(&mut self, query: &Q); - - fn query_for_universe(&mut self, query: &Q); - - fn initial_universe(&mut self, universe: &RoaringBitmap); - - fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]); - - fn start_iteration_ranking_rule<'transaction>( - &mut self, - ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule<'transaction, Q>, - query: &Q, - universe: &RoaringBitmap, - ); - fn next_bucket_ranking_rule<'transaction>( - &mut self, - ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule<'transaction, Q>, - universe: &RoaringBitmap, - candidates: &RoaringBitmap, - ); - fn skip_bucket_ranking_rule<'transaction>( - &mut self, - ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule<'transaction, Q>, - candidates: &RoaringBitmap, - ); - fn end_iteration_ranking_rule<'transaction>( - &mut self, - ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule<'transaction, Q>, - universe: &RoaringBitmap, - ); - fn add_to_results(&mut self, docids: &[u32]); - - fn log_words_state(&mut self, query_graph: &Q); - - fn log_proximity_state( - &mut self, - query_graph: &RankingRuleGraph, - paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, - universe: &RoaringBitmap, - distances: Vec>, - cost: u16, - ); - - fn log_typo_state( - &mut self, - query_graph: &RankingRuleGraph, - paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, - universe: &RoaringBitmap, - distances: Vec>, - cost: u16, - ); -} diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 235075580..f2cc7d5f4 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -88,7 +88,7 @@ fn resolve_maximally_reduced_query_graph<'search>( break; } else { let position_to_remove = positions_to_remove.pop().unwrap(); - let _ = graph.remove_words_at_position(position_to_remove); + let _ = graph.remove_words_starting_at_position(position_to_remove); } } logger.query_for_universe(&graph); diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index b879b2c15..88d1849e3 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -3,6 +3,17 @@ use super::small_bitmap::SmallBitmap; use super::SearchContext; use crate::Result; +const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64; + +/// A node of the [`QueryGraph`]. +/// +/// There are four types of nodes: +/// 1. `Start` : unique, represents the start of the query +/// 2. `End` : unique, represents the end of a query +/// 3. `Deleted` : represents a node that was deleted. +/// All deleted nodes are unreachable from the start node. +/// 4. `Term` is a regular node representing a word or combination of words +/// from the user query. #[derive(Clone)] pub enum QueryNode { Term(LocatedQueryTerm), @@ -11,34 +22,84 @@ pub enum QueryNode { End, } +/// The edges associated with a node in the query graph. #[derive(Clone)] pub struct Edges { - // TODO: use a tiny bitset instead, something like a simple Vec where most queries will see a vector of one element + /// Set of nodes which have an edge going to the current node pub predecessors: SmallBitmap, + /// Set of nodes which are reached by an edge from the current node pub successors: SmallBitmap, } +/** +A graph representing all the ways to interpret the user's search query. 
+ +## Important +At the moment, a query graph has a hardcoded limit of [`QUERY_GRAPH_NODE_LENGTH_LIMIT`] nodes. + +## Example 1 +For the search query `sunflower`, we need to register the following things: +- we need to look for the exact word `sunflower` +- but also any word which is 1 or 2 typos apart from `sunflower` +- and every word that contains the prefix `sunflower` +- and also the couple of adjacent words `sun flower` +- as well as all the user-defined synonyms of `sunflower` + +All these derivations of a word will be stored in [`WordDerivations`]. + +## Example 2: +For the search query `summer house by`. + +We also look for all word derivations of each term. And we also need to consider +the potential n-grams `summerhouse`, `summerhouseby`, and `houseby`. +Furthermore, we need to know which words these ngrams replace. This is done by creating the +following graph, where each node also contains a list of derivations: +```txt + ┌───────┐ + ┌─│houseby│─────────┐ + │ └───────┘ │ +┌───────┐ ┌───────┐ │ ┌───────┐ ┌────┐ │ ┌───────┐ +│ START │─┬─│summer │─┴─│ house │┌─│ by │─┼─│ END │ +└───────┘ │ └───────┘ └───────┘│ └────┘ │ └───────┘ + │ ┌────────────┐ │ │ + ├─│summerhouse │───────┘ │ + │ └────────────┘ │ + │ ┌─────────────┐ │ + └─────────│summerhouseby│───────┘ + └─────────────┘ +``` +Note also that each node has a range of positions associated with it, +such that `summer` is known to be a word at the positions `0..=0` and `houseby` +is registered with the positions `1..=2`. When two nodes are connected by an edge, +it means that they are potentially next to each other in the user's search query +(depending on the [`TermsMatchingStrategy`](crate::search::TermsMatchingStrategy) +and the transformations that were done on the query graph). +*/ #[derive(Clone)] pub struct QueryGraph { + /// The index of the start node within `self.nodes` pub root_node: u16, + /// The index of the end node within `self.nodes` pub end_node: u16, + /// The list of all query nodes pub nodes: Vec, + /// The list of all node edges pub edges: Vec, } -fn _assert_sizes() { - // TODO: QueryNodes are too big now, 88B is a bit too big - let _: [u8; 88] = [0; std::mem::size_of::()]; - let _: [u8; 32] = [0; std::mem::size_of::()]; -} - impl Default for QueryGraph { /// Create a new QueryGraph with two disconnected nodes: the root and end nodes. 
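+    ///
+    /// In other words, `QueryGraph::default()` yields a graph where `root_node == 0`,
+    /// `end_node == 1`, and no edge connects them yet.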
fn default() -> Self { let nodes = vec![QueryNode::Start, QueryNode::End]; let edges = vec![ - Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }, - Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }, + Edges { + predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), + successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), + }, + Edges { + predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), + successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), + }, ]; Self { root_node: 0, end_node: 1, nodes, edges } @@ -46,33 +107,31 @@ impl Default for QueryGraph { } impl QueryGraph { + /// Connect all the given predecessor nodes to the given successor node fn connect_to_node(&mut self, from_nodes: &[u16], to_node: u16) { for &from_node in from_nodes { self.edges[from_node as usize].successors.insert(to_node); self.edges[to_node as usize].predecessors.insert(from_node); } } + /// Add the given node to the graph and connect it to all the given predecessor nodes fn add_node(&mut self, from_nodes: &[u16], node: QueryNode) -> u16 { let new_node_idx = self.nodes.len() as u16; + assert!(new_node_idx <= QUERY_GRAPH_NODE_LENGTH_LIMIT); self.nodes.push(node); self.edges.push(Edges { - predecessors: SmallBitmap::from_array(from_nodes, 64), - successors: SmallBitmap::new(64), + predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), + successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), }); - for from_node in from_nodes { - self.edges[*from_node as usize].successors.insert(new_node_idx); - } + self.connect_to_node(from_nodes, new_node_idx); + new_node_idx } } impl QueryGraph { - // TODO: return the list of all matching words here as well + /// Build the query graph from the parsed user search query. pub fn from_query(ctx: &mut SearchContext, terms: Vec) -> Result { - // TODO: maybe empty nodes should not be removed here, to compute - // the score of the `words` ranking rule correctly - // it is very easy to traverse the graph and remove afterwards anyway - // Still, I'm keeping this here as a demo let mut empty_nodes = vec![]; let word_set = ctx.index.words_fst(ctx.txn)?; @@ -81,7 +140,6 @@ impl QueryGraph { let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![graph.root_node]); - // TODO: split words / synonyms for length in 1..=terms.len() { let query = &terms[..length]; @@ -156,6 +214,8 @@ impl QueryGraph { Ok(graph) } + + /// Remove the given nodes and all their edges from the query graph. pub fn remove_nodes(&mut self, nodes: &[u16]) { for &node in nodes { self.nodes[node as usize] = QueryNode::Deleted; @@ -166,10 +226,13 @@ impl QueryGraph { for succ in edges.successors.iter() { self.edges[succ as usize].predecessors.remove(node); } - self.edges[node as usize] = - Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }; + self.edges[node as usize] = Edges { + predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), + successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), + }; } } + /// Remove the given nodes, connecting all their predecessors to all their successors. 
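+    ///
+    /// For example (sketch), removing node `b` from `a -> b -> c` leaves `a -> c`.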
pub fn remove_nodes_keep_edges(&mut self, nodes: &[u16]) { for &node in nodes { self.nodes[node as usize] = QueryNode::Deleted; @@ -182,11 +245,17 @@ impl QueryGraph { self.edges[succ as usize].predecessors.remove(node); self.edges[succ as usize].predecessors.union(&edges.predecessors); } - self.edges[node as usize] = - Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }; + self.edges[node as usize] = Edges { + predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), + successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), + }; } } - pub fn remove_words_at_position(&mut self, position: i8) -> bool { + + /// Remove all the nodes that correspond to a word starting at the given position, and connect + /// the predecessors of these nodes to their successors. + /// Return `true` if any node was removed. + pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool { let mut nodes_to_remove_keeping_edges = vec![]; for (node_idx, node) in self.nodes.iter().enumerate() { let node_idx = node_idx as u16; @@ -202,14 +271,15 @@ impl QueryGraph { !nodes_to_remove_keeping_edges.is_empty() } + /// Simplify the query graph by removing all nodes that are disconnected from + /// the start or end nodes. fn simplify(&mut self) { loop { let mut nodes_to_remove = vec![]; for (node_idx, node) in self.nodes.iter().enumerate() { - if (!matches!(node, QueryNode::End | QueryNode::Deleted) - && self.edges[node_idx].successors.is_empty()) - || (!matches!(node, QueryNode::Start | QueryNode::Deleted) - && self.edges[node_idx].predecessors.is_empty()) + if !matches!(node, QueryNode::End | QueryNode::Deleted) + && (self.edges[node_idx].successors.is_empty() + || self.edges[node_idx].predecessors.is_empty()) { nodes_to_remove.push(node_idx as u16); } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index ce569fbb0..bf2c6572e 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -53,7 +53,7 @@ impl RankingRuleGraphTrait for TypoGraph { docids |= bitmap; } if *nbr_typos == 0 { - if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? { + if let Some(bytes) = ctx.get_word_prefix_docids(derivations.original)? 
{ // TODO: deserialize bitmap within a universe let bitmap = universe & RoaringBitmapCodec::bytes_decode(bytes) diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 788f8a496..3ccb54032 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -114,7 +114,7 @@ pub fn apply_ranking_rules<'search>( logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe); ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?; - let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len]; + let mut candidates: Vec = vec![RoaringBitmap::default(); ranking_rules_len]; candidates[0] = universe.clone(); let mut cur_ranking_rule_index = 0; @@ -174,7 +174,7 @@ pub fn apply_ranking_rules<'search>( } } else { let candidates = - candidates.iter().take(length - results.len()).collect::>(); + candidates.iter().take(length - results.len()).collect::>(); logger.add_to_results(&candidates); results.extend(&candidates); } @@ -234,358 +234,3 @@ pub fn apply_ranking_rules<'search>( Ok(results) } - -#[cfg(test)] -mod tests { - // use crate::allocator::ALLOC; - use std::fs::File; - use std::io::{BufRead, BufReader, Cursor, Seek}; - use std::time::Instant; - - use big_s::S; - use heed::EnvOpenOptions; - use maplit::hashset; - - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - // use crate::search::new::logger::detailed::DetailedSearchLogger; - use crate::search::new::logger::DefaultSearchLogger; - use crate::search::new::{execute_search, SearchContext}; - use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; - use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy}; - - #[test] - fn search_wiki_new() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_wiki").unwrap(); - let txn = index.read_txn().unwrap(); - - println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - - // loop { - let start = Instant::now(); - - // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); - let mut ctx = SearchContext::new(&index, &txn); - let results = execute_search( - &mut ctx, - "which a the releases from poison by the government", - None, - 0, - 20, - &mut DefaultSearchLogger, - // &mut logger, - ) - .unwrap(); - - // logger.write_d2_description(&mut ctx); - - let elapsed = start.elapsed(); - println!("{}us", elapsed.as_micros()); - - let _documents = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); - - println!("{}us: {:?}", elapsed.as_micros(), results); - // } - // for (id, _document) in documents { - // println!("{id}:"); - // // println!("{document}"); - // } - } - - #[test] - fn search_wiki_old() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_wiki").unwrap(); - - let txn = index.read_txn().unwrap(); - - let rr = index.criteria(&txn).unwrap(); - println!("{rr:?}"); - - let start = Instant::now(); - - let mut s = 
Search::new(&txn, &index); - s.query("which a the releases from poison by the government"); - s.terms_matching_strategy(TermsMatchingStrategy::Last); - s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); - let docs = s.execute().unwrap(); - - let elapsed = start.elapsed(); - - let documents = index - .documents(&txn, docs.documents_ids.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); - - println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); - for (id, _document) in documents { - println!("{id}:"); - // println!("{document}"); - } - } - #[test] - fn search_movies_new() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_movies").unwrap(); - let txn = index.read_txn().unwrap(); - - // let primary_key = index.primary_key(&txn).unwrap().unwrap(); - // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); - // loop { - let start = Instant::now(); - - let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); - let mut ctx = SearchContext::new(&index, &txn); - let results = execute_search( - &mut ctx, - "releases from poison by the government", - None, - 0, - 20, - // &mut DefaultSearchLogger, - &mut logger, - ) - .unwrap(); - - logger.write_d2_description(&mut ctx); - - let elapsed = start.elapsed(); - - // let ids = index - // .documents(&txn, results.iter().copied()) - // .unwrap() - // .into_iter() - // .map(|x| { - // let obkv = &x.1; - // let id = obkv.get(primary_key).unwrap(); - // let id: serde_json::Value = serde_json::from_slice(id).unwrap(); - // id.as_str().unwrap().to_owned() - // }) - // .collect::>(); - - println!("{}us: {results:?}", elapsed.as_micros()); - // println!("external ids: {ids:?}"); - // } - } - - #[test] - fn search_movies_old() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_movies").unwrap(); - - let txn = index.read_txn().unwrap(); - - let rr = index.criteria(&txn).unwrap(); - println!("{rr:?}"); - - let primary_key = index.primary_key(&txn).unwrap().unwrap(); - let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); - - let start = Instant::now(); - - let mut s = Search::new(&txn, &index); - s.query("which a the releases from poison by the government"); - s.terms_matching_strategy(TermsMatchingStrategy::Last); - s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); - let docs = s.execute().unwrap(); - - let elapsed = start.elapsed(); - - let ids = index - .documents(&txn, docs.documents_ids.iter().copied()) - .unwrap() - .into_iter() - .map(|x| { - let obkv = &x.1; - let id = obkv.get(primary_key).unwrap(); - let id: serde_json::Value = serde_json::from_slice(id).unwrap(); - id.as_str().unwrap().to_owned() - }) - .collect::>(); - - println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); - println!("external ids: {ids:?}"); - } - - #[test] - fn _settings_movies() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 
* 1024); // 100 GB - - let index = Index::new(options, "data_movies").unwrap(); - let mut wtxn = index.write_txn().unwrap(); - - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_min_word_len_one_typo(5); - builder.set_min_word_len_two_typos(100); - builder.set_sortable_fields(hashset! { S("release_date") }); - builder.set_criteria(vec![ - Criterion::Words, - Criterion::Typo, - Criterion::Proximity, - Criterion::Asc("release_date".to_owned()), - ]); - - builder.execute(|_| (), || false).unwrap(); - wtxn.commit().unwrap(); - } - - #[test] - fn _index_movies() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_movies").unwrap(); - let mut wtxn = index.write_txn().unwrap(); - - let primary_key = "id"; - let searchable_fields = vec!["title", "overview"]; - let filterable_fields = vec!["release_date", "genres"]; - - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(primary_key.to_owned()); - let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); - builder.set_filterable_fields(filterable_fields); - - builder.set_min_word_len_one_typo(5); - builder.set_min_word_len_two_typos(100); - builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); - builder.execute(|_| (), || false).unwrap(); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) - .unwrap(); - - let documents = documents_from( - "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json", - "json", - ); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); - } - #[test] - fn _index_wiki() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_wiki").unwrap(); - let mut wtxn = index.write_txn().unwrap(); - - // let primary_key = "id"; - let searchable_fields = vec!["body", "title", "url"]; - // let filterable_fields = vec![]; - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - // builder.set_primary_key(primary_key.to_owned()); - let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); - // builder.set_filterable_fields(filterable_fields); - - // builder.set_min_word_len_one_typo(5); - // builder.set_min_word_len_two_typos(100); - builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]); - builder.execute(|_| (), || false).unwrap(); - - let config = IndexerConfig::default(); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) - .unwrap(); - - let documents = documents_from( - "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv", - "csv", - 
); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); - } - - fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { - let reader = File::open(filename) - .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename)); - let reader = BufReader::new(reader); - let documents = match filetype { - "csv" => documents_from_csv(reader).unwrap(), - "json" => documents_from_json(reader).unwrap(), - "jsonl" => documents_from_jsonl(reader).unwrap(), - otherwise => panic!("invalid update format {:?}", otherwise), - }; - DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() - } - - fn documents_from_jsonl(reader: impl BufRead) -> crate::Result> { - let mut documents = DocumentsBatchBuilder::new(Vec::new()); - - for result in serde_json::Deserializer::from_reader(reader).into_iter::() { - let object = result.unwrap(); - documents.append_json_object(&object)?; - } - - documents.into_inner().map_err(Into::into) - } - - fn documents_from_json(reader: impl BufRead) -> crate::Result> { - let mut documents = DocumentsBatchBuilder::new(Vec::new()); - - documents.append_json_array(reader)?; - - documents.into_inner().map_err(Into::into) - } - - fn documents_from_csv(reader: impl BufRead) -> crate::Result> { - let csv = csv::Reader::from_reader(reader); - - let mut documents = DocumentsBatchBuilder::new(Vec::new()); - documents.append_csv(csv)?; - - documents.into_inner().map_err(Into::into) - } -} diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 0581341d1..ca64e4342 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -46,7 +46,7 @@ impl<'search> SearchContext<'search> { } } if *use_prefix_db { - if let Some(prefix_docids) = self.get_prefix_docids(*original)? { + if let Some(prefix_docids) = self.get_word_prefix_docids(*original)? 
{ or_docids.push(prefix_docids); } } diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 2858e1569..2015367da 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -88,7 +88,8 @@ impl<'search> RankingRule<'search, QueryGraph> for Words { break; } else { let position_to_remove = self.positions_to_remove.pop().unwrap(); - let did_delete_any_node = query_graph.remove_words_at_position(position_to_remove); + let did_delete_any_node = + query_graph.remove_words_starting_at_position(position_to_remove); if did_delete_any_node { break; } From 2099991dd174aca95b081cf29c6f2f3b30e7bed9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 8 Mar 2023 15:04:25 +0100 Subject: [PATCH 047/234] Continue documenting and cleaning up the code --- .../search/new/graph_based_ranking_rule.rs | 10 +- milli/src/search/new/logger/detailed.rs | 30 ++-- milli/src/search/new/query_term.rs | 74 ++++++--- .../search/new/ranking_rule_graph/build.rs | 42 +++-- .../new/ranking_rule_graph/cheapest_paths.rs | 22 +-- .../ranking_rule_graph/edge_docids_cache.rs | 36 ++--- .../ranking_rule_graph/empty_paths_cache.rs | 22 ++- .../src/search/new/ranking_rule_graph/mod.rs | 132 +++++++++------- .../new/ranking_rule_graph/paths_map.rs | 148 ++---------------- .../new/ranking_rule_graph/proximity/build.rs | 12 +- .../new/ranking_rule_graph/proximity/mod.rs | 16 +- .../search/new/ranking_rule_graph/typo/mod.rs | 26 +-- 12 files changed, 245 insertions(+), 325 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 2cedbffa5..d8f881b07 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -85,15 +85,15 @@ fn remove_empty_edges<'search, G: RankingRuleGraphTrait>( universe: &RoaringBitmap, empty_paths_cache: &mut EmptyPathsCache, ) -> Result<()> { - for edge_index in 0..graph.all_edges.len() as u16 { - if graph.all_edges[edge_index as usize].is_none() { + for edge_index in 0..graph.edges_store.len() as u16 { + if graph.edges_store[edge_index as usize].is_none() { continue; } let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?; match docids { BitmapOrAllRef::Bitmap(docids) => { if docids.is_disjoint(universe) { - graph.remove_edge(edge_index); + graph.remove_ranking_rule_edge(edge_index); empty_paths_cache.forbid_edge(edge_index); edge_docids_cache.cache.remove(&edge_index); continue; @@ -120,7 +120,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> ) -> Result<()> { let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; let mut edge_docids_cache = EdgeDocidsCache::default(); - let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16); + let mut empty_paths_cache = EmptyPathsCache::new(graph.edges_store.len() as u16); // First simplify the graph as much as possible, by computing the docids of the edges // within the rule's universe and removing the edges that have no associated docids. @@ -242,7 +242,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> // 1. Store in the cache that this edge is empty for this universe empty_paths_cache.forbid_edge(edge_index); // 2. remove this edge from the ranking rule graph - graph.remove_edge(edge_index); + graph.remove_ranking_rule_edge(edge_index); // 3. 
Also remove the entry from the edge_docids_cache, since we don't need it anymore edge_docids_cache.cache.remove(&edge_index); return Ok(()); diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index c6570ef54..468bc0343 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -8,7 +8,7 @@ use roaring::RoaringBitmap; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::search::new::ranking_rule_graph::{ - Edge, EdgeDetails, EmptyPathsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, + Edge, EdgeCondition, EmptyPathsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, }; use crate::search::new::small_bitmap::SmallBitmap; @@ -534,24 +534,24 @@ shape: class" let distances = &distances[node_idx]; Self::query_node_d2_desc(ctx, node_idx, node, distances.as_slice(), file); } - for edge in graph.all_edges.iter().flatten() { - let Edge { from_node, to_node, details, .. } = edge; + for edge in graph.edges_store.iter().flatten() { + let Edge { source_node, dest_node, condition: details, .. } = edge; match &details { - EdgeDetails::Unconditional => { + EdgeCondition::Unconditional => { writeln!( file, - "{from_node} -> {to_node} : \"always cost {cost}\"", + "{source_node} -> {dest_node} : \"always cost {cost}\"", cost = edge.cost, ) .unwrap(); } - EdgeDetails::Data(details) => { + EdgeCondition::Conditional(details) => { writeln!( file, - "{from_node} -> {to_node} : \"cost {cost} {edge_label}\"", + "{source_node} -> {dest_node} : \"cost {cost} {edge_label}\"", cost = edge.cost, - edge_label = R::graphviz_edge_details_label(details) + edge_label = R::label_for_edge_condition(details) ) .unwrap(); } @@ -589,10 +589,10 @@ shape: class" edge_idx: u16, file: &mut File, ) { - let Edge { from_node, to_node, cost, .. } = - graph.all_edges[edge_idx as usize].as_ref().unwrap(); - let from_node = &graph.query_graph.nodes[*from_node as usize]; - let from_node_desc = match from_node { + let Edge { source_node, dest_node, cost, .. 
} = + graph.edges_store[edge_idx as usize].as_ref().unwrap(); + let source_node = &graph.query_graph.nodes[*source_node as usize]; + let source_node_desc = match source_node { QueryNode::Term(term) => match &term.value { QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); @@ -606,8 +606,8 @@ shape: class" QueryNode::Start => "START".to_owned(), QueryNode::End => "END".to_owned(), }; - let to_node = &graph.query_graph.nodes[*to_node as usize]; - let to_node_desc = match to_node { + let dest_node = &graph.query_graph.nodes[*dest_node as usize]; + let dest_node_desc = match dest_node { QueryNode::Term(term) => match &term.value { QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); @@ -623,7 +623,7 @@ shape: class" }; writeln!( file, - "{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ + "{edge_idx}: \"{source_node_desc}->{dest_node_desc} [{cost}]\" {{ shape: class }}" ) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 46a62b4a9..446f34e68 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -1,6 +1,3 @@ -// TODO: put primitive query part in here - -use std::borrow::Cow; use std::mem; use std::ops::RangeInclusive; @@ -18,6 +15,8 @@ use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; use crate::{CboRoaringBitmapLenCodec, Index, Result}; +/// A phrase in the user's search query, consisting of several words +/// that must appear side-by-side in the search results. #[derive(Default, Clone, PartialEq, Eq, Hash)] pub struct Phrase { pub words: Vec>>, @@ -28,18 +27,38 @@ impl Phrase { } } +/// A structure storing all the different ways to match +/// a term in the user's search query. #[derive(Clone)] pub struct WordDerivations { + /// The original word pub original: Interned, - // TODO: pub prefix_of: Vec, + // TODO: original should only be used for debugging purposes? + // TODO: pub zero_typo: Option>, + // TODO: pub prefix_of: Box<[Interned]>, + /// All the synonyms of the original word pub synonyms: Box<[Interned]>, + + /// The original word split into multiple consecutive words pub split_words: Option>, + + /// The original words and words which are prefixed by it pub zero_typo: Box<[Interned]>, + + /// Words that are 1 typo away from the original word pub one_typo: Box<[Interned]>, + + /// Words that are 2 typos away from the original word pub two_typos: Box<[Interned]>, + + /// True if the prefix databases must be used to retrieve + /// the words which are prefixed by the original word. pub use_prefix_db: bool, } impl WordDerivations { + /// Return an iterator over all the single words derived from the original word. + /// + /// This excludes synonyms, split words, and words stored in the prefix databases. 
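+    ///
+    /// In practice this chains the `zero_typo`, `one_typo`, and `two_typos` lists.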
pub fn all_derivations_except_prefix_db( &'_ self, ) -> impl Iterator> + Clone + '_ { @@ -49,17 +68,20 @@ impl WordDerivations { self.zero_typo.is_empty() && self.one_typo.is_empty() && self.two_typos.is_empty() + && self.synonyms.is_empty() + && self.split_words.is_none() && !self.use_prefix_db } } +/// Compute the word derivations for the given word pub fn word_derivations( ctx: &mut SearchContext, word: &str, max_typo: u8, is_prefix: bool, - fst: &fst::Set>, ) -> Result { + let fst = ctx.index.words_fst(ctx.txn)?; let word_interned = ctx.word_interner.insert(word.to_owned()); let use_prefix_db = is_prefix @@ -171,6 +193,10 @@ pub fn word_derivations( }) } +/// Split the original word into the two words that appear the +/// most next to each other in the index. +/// +/// Return `None` if the original word cannot be split. fn split_best_frequency( index: &Index, txn: &RoTxn, @@ -199,16 +225,12 @@ fn split_best_frequency( #[derive(Clone)] pub enum QueryTerm { - // TODO: should there be SplitWord, NGram2, and NGram3 variants? - // NGram2 can have 1 typo and synonyms - // NGram3 cannot have typos but can have synonyms - // SplitWords are a phrase - // Can NGrams be prefixes? Phrase { phrase: Interned }, Word { derivations: WordDerivations }, } impl QueryTerm { + /// Return the original word from the given query term pub fn original_single_word<'interner>( &self, word_interner: &'interner Interner, @@ -226,6 +248,7 @@ impl QueryTerm { } } +/// A query term term coupled with its position in the user's search query. #[derive(Clone)] pub struct LocatedQueryTerm { pub value: QueryTerm, @@ -233,14 +256,18 @@ pub struct LocatedQueryTerm { } impl LocatedQueryTerm { + /// Return `true` iff the word derivations within the query term are empty pub fn is_empty(&self) -> bool { match &self.value { + // TODO: phrases should be greedily computed, so that they can be excluded from + // the query graph right from the start? QueryTerm::Phrase { phrase: _ } => false, QueryTerm::Word { derivations, .. } => derivations.is_empty(), } } } +/// Convert the tokenised search query into a list of located query terms. pub fn located_query_terms_from_string<'search>( ctx: &mut SearchContext<'search>, query: NormalizedTokenIter>, @@ -250,8 +277,8 @@ pub fn located_query_terms_from_string<'search>( let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; + // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms? 
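+    // (For now, a word listed in `exact_words` only has its typo tolerance
+    // disabled; see the `nbr_typos` closure below.)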
let exact_words = ctx.index.exact_words(ctx.txn)?; - let fst = ctx.index.words_fst(ctx.txn)?; let nbr_typos = |word: &str| { if !authorize_typos @@ -266,9 +293,9 @@ pub fn located_query_terms_from_string<'search>( } }; - let mut primitive_query = Vec::new(); - let mut phrase = Vec::new(); + let mut located_terms = Vec::new(); + let mut phrase = Vec::new(); let mut quoted = false; let parts_limit = words_limit.unwrap_or(usize::MAX); @@ -280,8 +307,8 @@ pub fn located_query_terms_from_string<'search>( let mut peekable = query.peekable(); while let Some(token) = peekable.next() { // early return if word limit is exceeded - if primitive_query.len() >= parts_limit { - return Ok(primitive_query); + if located_terms.len() >= parts_limit { + return Ok(located_terms); } match token.kind { @@ -307,24 +334,23 @@ pub fn located_query_terms_from_string<'search>( match token.kind { TokenKind::Word => { let word = token.lemma(); - let derivations = - word_derivations(ctx, word, nbr_typos(word), false, &fst)?; + let derivations = word_derivations(ctx, word, nbr_typos(word), false)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, positions: position..=position, }; - primitive_query.push(located_term); + located_terms.push(located_term); } TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} } } else { let word = token.lemma(); - let derivations = word_derivations(ctx, word, nbr_typos(word), true, &fst)?; + let derivations = word_derivations(ctx, word, nbr_typos(word), true)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, positions: position..=position, }; - primitive_query.push(located_term); + located_terms.push(located_term); } } TokenKind::Separator(separator_kind) => { @@ -352,7 +378,7 @@ pub fn located_query_terms_from_string<'search>( }, positions: phrase_start..=phrase_end, }; - primitive_query.push(located_query_term); + located_terms.push(located_query_term); } } _ => (), @@ -367,10 +393,10 @@ pub fn located_query_terms_from_string<'search>( }, positions: phrase_start..=phrase_end, }; - primitive_query.push(located_query_term); + located_terms.push(located_query_term); } - Ok(primitive_query) + Ok(located_terms) } // TODO: return a word derivations instead? @@ -396,6 +422,8 @@ pub fn ngram2( _ => None, } } + +// TODO: return a word derivations instead? pub fn ngram3( ctx: &mut SearchContext, x: &LocatedQueryTerm, diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index d9732b010..49c78a32f 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -6,49 +6,43 @@ use crate::search::new::{QueryGraph, SearchContext}; use crate::Result; impl RankingRuleGraph { + /// Build the ranking rule graph from the given query graph pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { let QueryGraph { nodes: graph_nodes, edges: graph_edges, .. 
} = &query_graph; - let mut all_edges = vec![]; - let mut node_edges = vec![]; - let mut successors = vec![]; + let mut edges_store = vec![]; + let mut edges_of_node = vec![]; for (node_idx, node) in graph_nodes.iter().enumerate() { - node_edges.push(HashSet::new()); - successors.push(HashSet::new()); - let new_edges = node_edges.last_mut().unwrap(); - let new_successors = successors.last_mut().unwrap(); + edges_of_node.push(HashSet::new()); + let new_edges = edges_of_node.last_mut().unwrap(); - let Some(from_node_data) = G::build_visit_from_node(ctx, node)? else { continue }; + let Some(source_node_data) = G::build_step_visit_source_node(ctx, node)? else { continue }; for successor_idx in graph_edges[node_idx].successors.iter() { - let to_node = &graph_nodes[successor_idx as usize]; - let mut edges = G::build_visit_to_node(ctx, to_node, &from_node_data)?; + let dest_node = &graph_nodes[successor_idx as usize]; + let edges = + G::build_step_visit_destination_node(ctx, dest_node, &source_node_data)?; if edges.is_empty() { continue; } - edges.sort_by_key(|e| e.0); + for (cost, details) in edges { - all_edges.push(Some(Edge { - from_node: node_idx as u16, - to_node: successor_idx, + edges_store.push(Some(Edge { + source_node: node_idx as u16, + dest_node: successor_idx, cost, - details, + condition: details, })); - new_edges.insert(all_edges.len() as u16 - 1); - new_successors.insert(successor_idx); + new_edges.insert(edges_store.len() as u16 - 1); } } } - let node_edges = node_edges + let edges_of_node = edges_of_node .into_iter() - .map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16)) - .collect(); - let successors = successors - .into_iter() - .map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16)) + .map(|edges| SmallBitmap::from_iter(edges.into_iter(), edges_store.len() as u16)) .collect(); - Ok(RankingRuleGraph { query_graph, all_edges, node_edges, successors }) + Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node }) } } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 8627860e7..529bb32c4 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -30,7 +30,7 @@ impl RankingRuleGraph { empty_paths_cache, &mut visit, &mut vec![], - &mut SmallBitmap::new(self.all_edges.len() as u16), + &mut SmallBitmap::new(self.edges_store.len() as u16), empty_paths_cache.empty_edges.clone(), )?; Ok(()) @@ -48,12 +48,12 @@ impl RankingRuleGraph { ) -> Result { let mut any_valid = false; - let edges = self.node_edges[from].clone(); + let edges = self.edges_of_node[from].clone(); for edge_idx in edges.iter() { - let Some(edge) = self.all_edges[edge_idx as usize].as_ref() else { continue }; + let Some(edge) = self.edges_store[edge_idx as usize].as_ref() else { continue }; if cost < edge.cost as u16 || forbidden_edges.contains(edge_idx) - || !all_distances[edge.to_node as usize].iter().any( + || !all_distances[edge.dest_node as usize].iter().any( |(next_cost, necessary_edges)| { (*next_cost == cost - edge.cost as u16) && !forbidden_edges.intersects(necessary_edges) @@ -71,13 +71,13 @@ impl RankingRuleGraph { new_forbidden_edges.insert(x); }); - let next_any_valid = if edge.to_node == self.query_graph.end_node { + let next_any_valid = if edge.dest_node == self.query_graph.end_node { any_valid = true; visit(prev_edges, self, empty_paths_cache)?; true } else { 
self.visit_paths_of_cost_rec( - edge.to_node as usize, + edge.dest_node as usize, cost - edge.cost as u16, all_distances, empty_paths_cache, @@ -115,7 +115,7 @@ impl RankingRuleGraph { let mut node_stack = VecDeque::new(); distances_to_end[self.query_graph.end_node as usize] = - vec![(0, SmallBitmap::new(self.all_edges.len() as u16))]; + vec![(0, SmallBitmap::new(self.edges_store.len() as u16))]; for prev_node in self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter() @@ -127,15 +127,15 @@ impl RankingRuleGraph { while let Some(cur_node) = node_stack.pop_front() { let mut self_distances = BTreeMap::::new(); - let cur_node_edges = &self.node_edges[cur_node]; + let cur_node_edges = &self.edges_of_node[cur_node]; for edge_idx in cur_node_edges.iter() { - let edge = self.all_edges[edge_idx as usize].as_ref().unwrap(); - let succ_node = edge.to_node; + let edge = self.edges_store[edge_idx as usize].as_ref().unwrap(); + let succ_node = edge.dest_node; let succ_distances = &distances_to_end[succ_node as usize]; for (succ_distance, succ_necessary_edges) in succ_distances { let potential_necessary_edges = SmallBitmap::from_iter( std::iter::once(edge_idx).chain(succ_necessary_edges.iter()), - self.all_edges.len() as u16, + self.edges_store.len() as u16, ); match self_distances.entry(edge.cost as u16 + succ_distance) { Entry::Occupied(mut prev_necessary_edges) => { diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index c0c46289c..f7bf1b002 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -3,28 +3,13 @@ use std::marker::PhantomData; use fxhash::FxHashMap; use roaring::RoaringBitmap; -use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; +use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::{BitmapOrAllRef, SearchContext}; use crate::Result; -// TODO: the cache should have a G::EdgeDetails as key -// but then it means that we should have a quick way of -// computing their hash and comparing them -// which can be done... -// by using a pointer (real, Rc, bumpalo, or in a vector)??? -// -// But actually.... the edge details' docids are a subset of the universe at the -// moment they were computed. -// But the universes between two iterations of a ranking rule are completely different -// Thus, there is no point in doing this. -// UNLESS... -// we compute the whole docids corresponding to the edge details (potentially expensive in time and memory -// in the common case) -// -// But we could still benefit within a single iteration for requests like: -// `a a a a a a a a a` where we have many of the same edge details, repeated - +/// A cache storing the document ids associated with each ranking rule edge pub struct EdgeDocidsCache { + // TODO: should be FxHashMap, RoaringBitmap> pub cache: FxHashMap, _phantom: PhantomData, } @@ -34,19 +19,24 @@ impl Default for EdgeDocidsCache { } } impl EdgeDocidsCache { + /// Retrieve the document ids for the given edge condition. + /// + /// If the cache does not yet contain these docids, they are computed + /// and inserted in the cache. 
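// The get-or-compute pattern described by the doc comment above, shown in
// isolation. A minimal sketch assuming a plain HashMap keyed by the raw
// edge index and a closure standing in for G::resolve_edge_condition;
// both are simplifications for illustration, not the actual trait machinery:

use std::collections::HashMap;
use roaring::RoaringBitmap;

struct DocidsCache {
    cache: HashMap<u16, RoaringBitmap>,
}

impl DocidsCache {
    fn get_or_compute(
        &mut self,
        edge_index: u16,
        universe: &RoaringBitmap,
        resolve: impl FnOnce() -> RoaringBitmap,
    ) -> &RoaringBitmap {
        // compute at most once per edge, restrict to the universe, then reuse
        self.cache.entry(edge_index).or_insert_with(|| universe & resolve())
    }
}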
pub fn get_edge_docids<'s, 'search>( &'s mut self, ctx: &mut SearchContext<'search>, + // TODO: should be Interned edge_index: u16, graph: &RankingRuleGraph, // TODO: maybe universe doesn't belong here universe: &RoaringBitmap, ) -> Result> { - let edge = graph.all_edges[edge_index as usize].as_ref().unwrap(); + let edge = graph.edges_store[edge_index as usize].as_ref().unwrap(); - match &edge.details { - EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All), - EdgeDetails::Data(details) => { + match &edge.condition { + EdgeCondition::Unconditional => Ok(BitmapOrAllRef::All), + EdgeCondition::Conditional(details) => { if self.cache.contains_key(&edge_index) { // TODO: should we update the bitmap in the cache if the new universe // reduces it? @@ -56,7 +46,7 @@ impl EdgeDocidsCache { return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here - let docids = universe & G::compute_docids(ctx, details, universe)?; + let docids = universe & G::resolve_edge_condition(ctx, details, universe)?; let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; Ok(BitmapOrAllRef::Bitmap(docids)) diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index 659042a01..deac05502 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,20 +1,29 @@ -use super::paths_map::PathsMap; +use super::paths_map::PathSet; use crate::search::new::small_bitmap::SmallBitmap; +/// A cache which stores sufficient conditions for a path +/// to resolve to an empty set of candidates within the current +/// universe. #[derive(Clone)] pub struct EmptyPathsCache { + /// The set of edge indexes that resolve to no documents. pub empty_edges: SmallBitmap, - pub empty_prefixes: PathsMap<()>, + /// A set of path prefixes that resolve to no documents. + pub empty_prefixes: PathSet, + /// A set of empty couple of edge indexes that resolve to no documents. pub empty_couple_edges: Vec, } impl EmptyPathsCache { + /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. pub fn new(all_edges_len: u16) -> Self { Self { empty_edges: SmallBitmap::new(all_edges_len), - empty_prefixes: PathsMap::default(), + empty_prefixes: PathSet::default(), empty_couple_edges: vec![SmallBitmap::new(all_edges_len); all_edges_len as usize], } } + + /// Store in the cache that every path containing the given edge resolves to no documents. pub fn forbid_edge(&mut self, edge_idx: u16) { self.empty_edges.insert(edge_idx); self.empty_couple_edges[edge_idx as usize].clear(); @@ -23,12 +32,17 @@ impl EmptyPathsCache { edges2.remove(edge_idx); } } + /// Store in the cache that every path containing the given prefix resolves to no documents. pub fn forbid_prefix(&mut self, prefix: &[u16]) { - self.empty_prefixes.insert(prefix.iter().copied(), ()); + self.empty_prefixes.insert(prefix.iter().copied()); } + + /// Store in the cache that every path containing the two given edges resolves to no documents. pub fn forbid_couple_edges(&mut self, edge1: u16, edge2: u16) { self.empty_couple_edges[edge1 as usize].insert(edge2); } + + /// Returns true if the cache can determine that the given path resolves to no documents. 
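// To make the check below concrete: a path is known to be empty as soon as
// it intersects one of the forbidden sets. A self-contained sketch using a
// plain u64 bitmask in place of SmallBitmap (an assumed simplification so
// the example stands alone; the real code also consults empty_prefixes):

fn path_known_empty(path: u64, empty_edges: u64, empty_couples: &[(u16, u16)]) -> bool {
    // 1. the path uses an edge that alone matches no documents
    if path & empty_edges != 0 {
        return true;
    }
    // 2. the path contains two edges that match no documents together
    empty_couples
        .iter()
        .any(|&(e1, e2)| path & (1u64 << e1) != 0 && path & (1u64 << e2) != 0)
}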
pub fn path_is_empty(&self, path: &[u16], path_bitmap: &SmallBitmap) -> bool { if path_bitmap.intersects(&self.empty_edges) { return true; diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 635f194f5..3f74a3cf5 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -1,9 +1,19 @@ +/*! Module implementing the graph used for the graph-based ranking rules +and its related algorithms. + +A ranking rule graph is built on top of the [`QueryGraph`]: the nodes stay +the same but the edges are replaced. +*/ + mod build; mod cheapest_paths; mod edge_docids_cache; mod empty_paths_cache; mod paths_map; + +/// Implementation of the `proximity` ranking rule mod proximity; +/// Implementation of the `typo` ranking rule mod typo; pub use edge_docids_cache::EdgeDocidsCache; @@ -17,30 +27,38 @@ use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::Result; +/// The condition that is associated with an edge in the ranking rule graph. +/// +/// Some edges are unconditional, which means that traversing them does not reduce +/// the set of candidates. +/// +/// Most edges, however, have a condition attached to them. For example, for the +/// proximity ranking rule, the condition could be that a word is N-close to another one. +/// When the edge is traversed, some database operations are executed to retrieve the set +/// of documents that satisfy the condition, which reduces the list of candidate document ids. #[derive(Debug, Clone)] -pub enum EdgeDetails { +pub enum EdgeCondition { Unconditional, - Data(E), + Conditional(E), } +/// An edge in the ranking rule graph. +/// +/// It contains: +/// 1. The source and destination nodes +/// 2. The cost of traversing this edge +/// 3. The condition associated with it #[derive(Debug, Clone)] pub struct Edge { - pub from_node: u16, - pub to_node: u16, + pub source_node: u16, + pub dest_node: u16, pub cost: u8, - pub details: EdgeDetails, -} - -#[derive(Debug, Clone)] -pub struct EdgePointer<'graph, E> { - pub index: u16, - pub edge: &'graph Edge, + pub condition: EdgeCondition, } // pub struct SubWordDerivations { // words: FxHashSet>, -// synonyms: FxHashSet>, // NO! they're phrases, not strings -// split_words: bool, +// phrases: FxHashSet>, // use_prefix_db: bool, // } @@ -74,46 +92,55 @@ pub struct EdgePointer<'graph, E> { // } // fn word_derivations_used_by_edge( -// edge: G::EdgeDetails, +// edge: G::EdgeCondition, // ) -> SubWordDerivations { // todo!() // } +/// A trait to be implemented by a marker type to build a graph-based ranking rule. +/// +/// It mostly describes how to: +/// 1. Retrieve the set of edges (their cost and condition) between two nodes. +/// 2. Compute the document ids satisfying a condition pub trait RankingRuleGraphTrait: Sized { - /// The details of an edge connecting two query nodes. These details + /// The condition of an edge connecting two query nodes. The condition /// should be sufficient to compute the edge's cost and associated document ids - /// in [`compute_docids`](RankingRuleGraphTrait). - type EdgeDetails: Sized + Clone; + /// in [`resolve_edge_condition`](RankingRuleGraphTrait::resolve_edge_condition). + type EdgeCondition: Sized + Clone; + /// A structure used in the construction of the graph, created when a + /// query graph source node is visited. 
It is used to determine the cost + /// and condition of a ranking rule edge when the destination node is visited. type BuildVisitedFromNode; - /// Return the label of the given edge details, to be used when visualising - /// the ranking rule graph using GraphViz. - fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String; + /// Return the label of the given edge condition, to be used when visualising + /// the ranking rule graph. + fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String; - /// Compute the document ids associated with the given edge. - fn compute_docids<'search>( + /// Compute the document ids associated with the given edge condition, + /// restricted to the given universe. + fn resolve_edge_condition<'search>( ctx: &mut SearchContext<'search>, - edge_details: &Self::EdgeDetails, + edge_condition: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result; - /// Prepare to build the edges outgoing from `from_node`. + /// Prepare to build the edges outgoing from `source_node`. /// - /// This call is followed by zero, one or more calls to [`build_visit_to_node`](RankingRuleGraphTrait::build_visit_to_node), + /// This call is followed by zero, one or more calls to [`build_step_visit_destination_node`](RankingRuleGraphTrait::build_step_visit_destination_node), /// which builds the actual edges. - fn build_visit_from_node<'search>( + fn build_step_visit_source_node<'search>( ctx: &mut SearchContext<'search>, - from_node: &QueryNode, + source_node: &QueryNode, ) -> Result>; - /// Return the cost and details of the edges going from the previously visited node - /// (with [`build_visit_from_node`](RankingRuleGraphTrait::build_visit_from_node)) to `to_node`. - fn build_visit_to_node<'from_data, 'search: 'from_data>( + /// Return the cost and condition of the edges going from the previously visited node + /// (with [`build_step_visit_source_node`](RankingRuleGraphTrait::build_step_visit_source_node)) to `dest_node`. + fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( ctx: &mut SearchContext<'search>, - to_node: &QueryNode, - from_node_data: &'from_data Self::BuildVisitedFromNode, - ) -> Result)>>; + dest_node: &QueryNode, + source_node_data: &'from_data Self::BuildVisitedFromNode, + ) -> Result)>>; fn log_state( graph: &RankingRuleGraph, @@ -126,45 +153,32 @@ pub trait RankingRuleGraphTrait: Sized { ); } +/// The graph used by graph-based ranking rules. +/// +/// It is built on top of a [`QueryGraph`], keeping the same nodes +/// but replacing the edges. pub struct RankingRuleGraph { pub query_graph: QueryGraph, - // pub edges: Vec>>>, - pub all_edges: Vec>>, - - pub node_edges: Vec, - - pub successors: Vec, - // TODO: to get the edges between two nodes: - // 1. get node_outgoing_edges[from] - // 2. get node_incoming_edges[to] - // 3. 
take intersection betweem the two + pub edges_store: Vec>>, + pub edges_of_node: Vec, } impl Clone for RankingRuleGraph { fn clone(&self) -> Self { Self { query_graph: self.query_graph.clone(), - all_edges: self.all_edges.clone(), - node_edges: self.node_edges.clone(), - successors: self.successors.clone(), + edges_store: self.edges_store.clone(), + edges_of_node: self.edges_of_node.clone(), } } } impl RankingRuleGraph { - pub fn remove_edge(&mut self, edge_index: u16) { - let edge_opt = &mut self.all_edges[edge_index as usize]; + /// Remove the given edge from the ranking rule graph + pub fn remove_ranking_rule_edge(&mut self, edge_index: u16) { + let edge_opt = &mut self.edges_store[edge_index as usize]; let Some(edge) = &edge_opt else { return }; - let (from_node, _to_node) = (edge.from_node, edge.to_node); + let (source_node, _dest_node) = (edge.source_node, edge.dest_node); *edge_opt = None; - let from_node_edges = &mut self.node_edges[from_node as usize]; - from_node_edges.remove(edge_index); - - let mut new_successors_from_node = SmallBitmap::new(self.all_edges.len() as u16); - let all_edges = &self.all_edges; - for from_node_edge in from_node_edges.iter() { - let Edge { to_node, .. } = &all_edges[from_node_edge as usize].as_ref().unwrap(); - new_successors_from_node.insert(*to_node); - } - self.successors[from_node as usize] = new_successors_from_node; + self.edges_of_node[source_node as usize].remove(edge_index); } } diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 82f181b97..b601f28d9 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -1,117 +1,32 @@ -use super::cheapest_paths::Path; -use crate::search::new::small_bitmap::SmallBitmap; - -// What is PathsMap used for? +// What is PathSet used for? 
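// Concretely, PathSet is a small prefix trie over edge indexes. A
// self-contained sketch of the same shape (simplified names, no interning):

#[derive(Default)]
struct EdgeTrie {
    nodes: Vec<(u16, EdgeTrie)>,
    is_end: bool,
}

impl EdgeTrie {
    fn insert(&mut self, path: &[u16]) {
        match path {
            [] => self.is_end = true,
            [first, rest @ ..] => {
                // descend into the existing child for this edge, if any
                for (edge, child) in &mut self.nodes {
                    if edge == first {
                        return child.insert(rest);
                    }
                }
                // otherwise create a new branch for the remaining suffix
                let mut child = EdgeTrie::default();
                child.insert(rest);
                self.nodes.push((*first, child));
            }
        }
    }
    /// True iff some previously inserted path is a prefix of `path`.
    fn contains_prefix_of_path(&self, path: &[u16]) -> bool {
        if self.is_end {
            return true;
        }
        match path {
            [] => false,
            [first, rest @ ..] => self
                .nodes
                .iter()
                .any(|(edge, child)| edge == first && child.contains_prefix_of_path(rest)),
        }
    }
}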
// For the empty_prefixes field in the EmptyPathsCache only :/ // but it could be used for more, like efficient computing of a set of paths -#[derive(Debug, Clone)] -pub struct PathsMap { - pub nodes: Vec<(u16, PathsMap)>, - pub value: Option, +/// A set of [`Path`] +#[derive(Default, Debug, Clone)] +pub struct PathSet { + nodes: Vec<(u16, PathSet)>, + is_end: bool, } -impl Default for PathsMap { - fn default() -> Self { - Self { nodes: vec![], value: None } - } -} - -impl PathsMap { - pub fn from_paths(paths: &[Path]) -> Self { - let mut result = Self::default(); - for p in paths { - result.add_path(p); - } - result - } - pub fn add_path(&mut self, path: &Path) { - self.insert(path.edges.iter().copied(), path.cost); - } -} -impl PathsMap { - pub fn is_empty(&self) -> bool { - self.nodes.is_empty() && self.value.is_none() - } - - pub fn insert(&mut self, mut edges: impl Iterator, value: V) { +impl PathSet { + pub fn insert(&mut self, mut edges: impl Iterator) { match edges.next() { None => { - self.value = Some(value); + self.is_end = true; } Some(first_edge) => { - // comment for (edge, next_node) in &mut self.nodes { if edge == &first_edge { - return next_node.insert(edges, value); + return next_node.insert(edges); } } - let mut rest = PathsMap::default(); - rest.insert(edges, value); + let mut rest = PathSet::default(); + rest.insert(edges); self.nodes.push((first_edge, rest)); } } } - fn remove_first_rec(&mut self, cur: &mut Vec) -> (bool, V) { - let Some((first_edge, rest)) = self.nodes.first_mut() else { - // The PathsMap has to be correct by construction here, otherwise - // the unwrap() will crash - return (true, self.value.take().unwrap()) - }; - cur.push(*first_edge); - let (rest_is_empty, value) = rest.remove_first_rec(cur); - if rest_is_empty { - self.nodes.remove(0); - (self.nodes.is_empty(), value) - } else { - (false, value) - } - } - pub fn remove_first(&mut self) -> Option<(Vec, V)> { - if self.is_empty() { - return None; - } - let mut result = vec![]; - let (_, value) = self.remove_first_rec(&mut result); - Some((result, value)) - } - pub fn iterate_rec(&self, cur: &mut Vec, visit: &mut impl FnMut(&Vec, &V)) { - if let Some(value) = &self.value { - visit(cur, value); - } - for (first_edge, rest) in self.nodes.iter() { - cur.push(*first_edge); - rest.iterate_rec(cur, visit); - cur.pop(); - } - } - pub fn iterate(&self, mut visit: impl FnMut(&Vec, &V)) { - self.iterate_rec(&mut vec![], &mut visit) - } - - pub fn remove_prefixes(&mut self, prefixes: &PathsMap) { - prefixes.iterate(|prefix, _v| { - self.remove_prefix(prefix); - }); - } - pub fn remove_edges(&mut self, forbidden_edges: &SmallBitmap) { - let mut i = 0; - while i < self.nodes.len() { - let should_remove = if forbidden_edges.contains(self.nodes[i].0) { - true - } else if !self.nodes[i].1.nodes.is_empty() { - self.nodes[i].1.remove_edges(forbidden_edges); - self.nodes[i].1.nodes.is_empty() - } else { - false - }; - if should_remove { - self.nodes.remove(i); - } else { - i += 1; - } - } - } pub fn remove_edge(&mut self, forbidden_edge: &u16) { let mut i = 0; while i < self.nodes.len() { @@ -130,34 +45,11 @@ impl PathsMap { } } } - pub fn remove_prefix(&mut self, forbidden_prefix: &[u16]) { - let [first_edge, remaining_prefix @ ..] 
= forbidden_prefix else { - self.nodes.clear(); - self.value = None; - return; - }; - - let mut i = 0; - while i < self.nodes.len() { - let edge = self.nodes[i].0; - let should_remove = if edge == *first_edge { - self.nodes[i].1.remove_prefix(remaining_prefix); - self.nodes[i].1.nodes.is_empty() - } else { - false - }; - if should_remove { - self.nodes.remove(i); - } else { - i += 1; - } - } - } pub fn final_edges_after_prefix(&self, prefix: &[u16], visit: &mut impl FnMut(u16)) { let [first_edge, remaining_prefix @ ..] = prefix else { for node in self.nodes.iter() { - if node.1.value.is_some() { + if node.1.is_end { visit(node.0) } } @@ -170,20 +62,8 @@ impl PathsMap { } } - pub fn edge_indices_after_prefix(&self, prefix: &[u16]) -> Vec { - let [first_edge, remaining_prefix @ ..] = prefix else { - return self.nodes.iter().map(|n| n.0).collect(); - }; - for (edge, rest) in self.nodes.iter() { - if edge == first_edge { - return rest.edge_indices_after_prefix(remaining_prefix); - } - } - vec![] - } - pub fn contains_prefix_of_path(&self, path: &[u16]) -> bool { - if self.value.is_some() { + if self.is_end { return true; } match path { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 48a6dda7e..0911f0638 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -5,7 +5,7 @@ use itertools::Itertools; use super::ProximityEdge; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::search::new::ranking_rule_graph::proximity::WordPair; -use crate::search::new::ranking_rule_graph::EdgeDetails; +use crate::search::new::ranking_rule_graph::EdgeCondition; use crate::search::new::{QueryNode, SearchContext}; use crate::Result; @@ -57,10 +57,10 @@ pub fn visit_to_node<'search, 'from_data>( ctx: &mut SearchContext<'search>, to_node: &QueryNode, from_node_data: &'from_data (WordDerivations, i8), -) -> Result)>> { +) -> Result)>> { let (derivations1, pos1) = from_node_data; let term2 = match &to_node { - QueryNode::End => return Ok(vec![(0, EdgeDetails::Unconditional)]), + QueryNode::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), QueryNode::Deleted | QueryNode::Start => return Ok(vec![]), QueryNode::Term(term) => term, }; @@ -96,7 +96,7 @@ pub fn visit_to_node<'search, 'from_data>( // We want to effectively ignore this pair of terms // Unconditionally walk through the edge without computing the docids // But also what should the cost be? 
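// For reference, the convention used by this builder is that an edge toward
// an n-gram of length `n`, at word proximity `p`, costs `p + n - 1`, and the
// unconditional fallback edge built at the end gets the maximal cost
// 8 + (n - 1). A tiny sketch of that formula (hypothetical helper, kept
// only to make the arithmetic explicit):

fn proximity_edge_cost(proximity: u8, ngram_len: u8) -> u8 {
    proximity + ngram_len - 1
}

// e.g. proximity_edge_cost(1, 1) == 1 and proximity_edge_cost(3, 2) == 4,
// while the fallback edge for a bigram costs 8 + (2 - 1) == 9.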
- return Ok(vec![(0, EdgeDetails::Unconditional)]); + return Ok(vec![(0, EdgeCondition::Unconditional)]); } let updb1 = derivations1.use_prefix_db; @@ -189,7 +189,7 @@ pub fn visit_to_node<'search, 'from_data>( for (proximity, word_pairs) in proximity_word_pairs { edges.push(( cost, - EdgeDetails::Data(ProximityEdge { + EdgeCondition::Conditional(ProximityEdge { pairs: word_pairs.into_boxed_slice(), proximity, }), @@ -198,6 +198,6 @@ pub fn visit_to_node<'search, 'from_data>( edges }) .collect::>(); - new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeDetails::Unconditional)); + new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeCondition::Unconditional)); Ok(new_edges) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 09c9aa960..bf07bf21d 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -4,7 +4,7 @@ pub mod compute_docids; use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; -use super::{EdgeDetails, RankingRuleGraphTrait}; +use super::{EdgeCondition, RankingRuleGraphTrait}; use crate::search::new::interner::Interned; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::WordDerivations; @@ -30,34 +30,34 @@ pub struct ProximityEdge { pub enum ProximityGraph {} impl RankingRuleGraphTrait for ProximityGraph { - type EdgeDetails = ProximityEdge; + type EdgeCondition = ProximityEdge; type BuildVisitedFromNode = (WordDerivations, i8); - fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String { + fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { let ProximityEdge { pairs, proximity } = edge; format!(", prox {proximity}, {} pairs", pairs.len()) } - fn compute_docids<'search>( + fn resolve_edge_condition<'search>( ctx: &mut SearchContext<'search>, - edge: &Self::EdgeDetails, + edge: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result { compute_docids::compute_docids(ctx, edge, universe) } - fn build_visit_from_node<'search>( + fn build_step_visit_source_node<'search>( ctx: &mut SearchContext<'search>, from_node: &QueryNode, ) -> Result> { build::visit_from_node(ctx, from_node) } - fn build_visit_to_node<'from_data, 'search: 'from_data>( + fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( ctx: &mut SearchContext<'search>, to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, - ) -> Result)>> { + ) -> Result)>> { build::visit_to_node(ctx, to_node, from_node_data) } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index bf2c6572e..2f6e7ad80 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -2,7 +2,7 @@ use heed::BytesDecode; use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; -use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; +use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::Interned; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; @@ -20,19 +20,19 @@ pub enum TypoEdge { pub enum TypoGraph {} impl RankingRuleGraphTrait for TypoGraph { - type EdgeDetails = TypoEdge; + type EdgeCondition = TypoEdge; type BuildVisitedFromNode = (); - fn graphviz_edge_details_label(edge: 
&Self::EdgeDetails) -> String { + fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { match edge { TypoEdge::Phrase { .. } => ", 0 typos".to_owned(), TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"), } } - fn compute_docids<'db_cache, 'search>( + fn resolve_edge_condition<'db_cache, 'search>( ctx: &mut SearchContext<'search>, - edge: &Self::EdgeDetails, + edge: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result { match edge { @@ -66,29 +66,29 @@ impl RankingRuleGraphTrait for TypoGraph { } } - fn build_visit_from_node<'search>( + fn build_step_visit_source_node<'search>( _ctx: &mut SearchContext<'search>, _from_node: &QueryNode, ) -> Result> { Ok(Some(())) } - fn build_visit_to_node<'from_data, 'search: 'from_data>( + fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( _ctx: &mut SearchContext<'search>, to_node: &QueryNode, _from_node_data: &'from_data Self::BuildVisitedFromNode, - ) -> Result)>> { + ) -> Result)>> { match to_node { QueryNode::Term(LocatedQueryTerm { value, .. }) => match value { &QueryTerm::Phrase { phrase } => { - Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase }))]) + Ok(vec![(0, EdgeCondition::Conditional(TypoEdge::Phrase { phrase }))]) } QueryTerm::Word { derivations } => { let mut edges = vec![]; if !derivations.zero_typo.is_empty() || derivations.use_prefix_db { edges.push(( 0, - EdgeDetails::Data(TypoEdge::Word { + EdgeCondition::Conditional(TypoEdge::Word { derivations: derivations.clone(), nbr_typos: 0, }), @@ -97,7 +97,7 @@ impl RankingRuleGraphTrait for TypoGraph { if !derivations.one_typo.is_empty() { edges.push(( 1, - EdgeDetails::Data(TypoEdge::Word { + EdgeCondition::Conditional(TypoEdge::Word { derivations: derivations.clone(), nbr_typos: 1, }), @@ -106,7 +106,7 @@ impl RankingRuleGraphTrait for TypoGraph { if !derivations.two_typos.is_empty() { edges.push(( 2, - EdgeDetails::Data(TypoEdge::Word { + EdgeCondition::Conditional(TypoEdge::Word { derivations: derivations.clone(), nbr_typos: 2, }), @@ -115,7 +115,7 @@ impl RankingRuleGraphTrait for TypoGraph { Ok(edges) } }, - QueryNode::End => Ok(vec![(0, EdgeDetails::Unconditional)]), + QueryNode::End => Ok(vec![(0, EdgeCondition::Unconditional)]), QueryNode::Deleted | QueryNode::Start => panic!(), } } From 0465ba4a050821823c22dc2952baed32395ac3d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 9 Mar 2023 11:12:31 +0100 Subject: [PATCH 048/234] Intern more values --- milli/src/search/new/db_cache.rs | 77 ++-- milli/src/search/new/logger/detailed.rs | 9 +- milli/src/search/new/mod.rs | 387 +++++++++++++++++- milli/src/search/new/query_graph.rs | 19 +- milli/src/search/new/query_term.rs | 74 ++-- .../new/ranking_rule_graph/proximity/build.rs | 100 +++-- .../proximity/compute_docids.rs | 36 +- .../search/new/ranking_rule_graph/typo/mod.rs | 147 ++++--- milli/src/search/new/ranking_rules.rs | 62 +-- milli/src/search/new/resolve_query_graph.rs | 239 +++++++---- milli/src/search/new/small_bitmap.rs | 28 -- milli/src/search/new/sort.rs | 29 +- 12 files changed, 879 insertions(+), 328 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 7e68ec5e5..8435eb9da 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -5,9 +5,8 @@ use fxhash::FxHashMap; use heed::types::ByteSlice; use heed::{BytesEncode, Database, RoTxn}; -use super::interner::Interned; -use super::SearchContext; -use crate::Result; +use super::interner::{Interned, Interner}; +use 
crate::{Index, Result}; /// A cache storing pointers to values in the LMDB databases. /// @@ -26,7 +25,7 @@ pub struct DatabaseCache<'search> { pub exact_word_docids: FxHashMap, Option<&'search [u8]>>, pub word_prefix_docids: FxHashMap, Option<&'search [u8]>>, } -impl<'search> SearchContext<'search> { +impl<'search> DatabaseCache<'search> { fn get_value<'v, K1, KC>( txn: &'search RoTxn, cache_key: K1, @@ -50,82 +49,92 @@ impl<'search> SearchContext<'search> { } /// Retrieve or insert the given value in the `word_docids` database. - pub fn get_word_docids(&mut self, word: Interned) -> Result> { + pub fn get_word_docids( + &mut self, + index: &Index, + txn: &'search RoTxn, + word_interner: &Interner, + word: Interned, + ) -> Result> { Self::get_value( - self.txn, + txn, word, - self.word_interner.get(word).as_str(), - &mut self.db_cache.word_docids, - self.index.word_docids.remap_data_type::(), + word_interner.get(word).as_str(), + &mut self.word_docids, + index.word_docids.remap_data_type::(), ) } /// Retrieve or insert the given value in the `word_prefix_docids` database. pub fn get_word_prefix_docids( &mut self, + index: &Index, + txn: &'search RoTxn, + word_interner: &Interner, prefix: Interned, ) -> Result> { Self::get_value( - self.txn, + txn, prefix, - self.word_interner.get(prefix).as_str(), - &mut self.db_cache.word_prefix_docids, - self.index.word_prefix_docids.remap_data_type::(), + word_interner.get(prefix).as_str(), + &mut self.word_prefix_docids, + index.word_prefix_docids.remap_data_type::(), ) } pub fn get_word_pair_proximity_docids( &mut self, + index: &Index, + txn: &'search RoTxn, + word_interner: &Interner, word1: Interned, word2: Interned, proximity: u8, ) -> Result> { Self::get_value( - self.txn, + txn, (proximity, word1, word2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(word2).as_str(), - ), - &mut self.db_cache.word_pair_proximity_docids, - self.index.word_pair_proximity_docids.remap_data_type::(), + &(proximity, word_interner.get(word1).as_str(), word_interner.get(word2).as_str()), + &mut self.word_pair_proximity_docids, + index.word_pair_proximity_docids.remap_data_type::(), ) } pub fn get_word_prefix_pair_proximity_docids( &mut self, + index: &Index, + txn: &'search RoTxn, + word_interner: &Interner, word1: Interned, prefix2: Interned, proximity: u8, ) -> Result> { Self::get_value( - self.txn, + txn, (proximity, word1, prefix2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(prefix2).as_str(), - ), - &mut self.db_cache.word_prefix_pair_proximity_docids, - self.index.word_prefix_pair_proximity_docids.remap_data_type::(), + &(proximity, word_interner.get(word1).as_str(), word_interner.get(prefix2).as_str()), + &mut self.word_prefix_pair_proximity_docids, + index.word_prefix_pair_proximity_docids.remap_data_type::(), ) } pub fn get_prefix_word_pair_proximity_docids( &mut self, + index: &Index, + txn: &'search RoTxn, + word_interner: &Interner, left_prefix: Interned, right: Interned, proximity: u8, ) -> Result> { Self::get_value( - self.txn, + txn, (proximity, left_prefix, right), &( proximity, - self.word_interner.get(left_prefix).as_str(), - self.word_interner.get(right).as_str(), + word_interner.get(left_prefix).as_str(), + word_interner.get(right).as_str(), ), - &mut self.db_cache.prefix_word_pair_proximity_docids, - self.index.prefix_word_pair_proximity_docids.remap_data_type::(), + &mut self.prefix_word_pair_proximity_docids, + 
index.prefix_word_pair_proximity_docids.remap_data_type::(), ) } } diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 468bc0343..5929f406c 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -432,7 +432,10 @@ results.{random} {{ file: &mut File, ) { match &node { - QueryNode::Term(LocatedQueryTerm { value, .. }) => match value { + QueryNode::Term(LocatedQueryTerm { value, .. }) => match ctx + .query_term_interner + .get(*value) + { QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); let phrase_str = phrase.description(&ctx.word_interner); @@ -593,7 +596,7 @@ shape: class" graph.edges_store[edge_idx as usize].as_ref().unwrap(); let source_node = &graph.query_graph.nodes[*source_node as usize]; let source_node_desc = match source_node { - QueryNode::Term(term) => match &term.value { + QueryNode::Term(term) => match ctx.query_term_interner.get(term.value) { QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); phrase.description(&ctx.word_interner) @@ -608,7 +611,7 @@ shape: class" }; let dest_node = &graph.query_graph.nodes[*dest_node as usize]; let dest_node_desc = match dest_node { - QueryNode::Term(term) => match &term.value { + QueryNode::Term(term) => match ctx.query_term_interner.get(term.value) { QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); phrase.description(&ctx.word_interner) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index f2cc7d5f4..3975dd4b6 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -5,10 +5,14 @@ mod logger; mod query_graph; mod query_term; mod ranking_rule_graph; + mod ranking_rules; mod resolve_query_graph; +// TODO: documentation + comments mod small_bitmap; +// TODO: documentation + comments mod sort; +// TODO: documentation + comments mod words; pub use logger::{DefaultSearchLogger, SearchLogger}; @@ -19,16 +23,16 @@ use charabia::Tokenize; use db_cache::DatabaseCache; use heed::RoTxn; use query_graph::{QueryGraph, QueryNode}; -pub use ranking_rules::{ - apply_ranking_rules, RankingRule, RankingRuleOutput, RankingRuleOutputIter, - RankingRuleOutputIterWrapper, RankingRuleQueryTrait, -}; +pub use ranking_rules::{bucket_sort, RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use roaring::RoaringBitmap; use self::interner::Interner; -use self::query_term::Phrase; -use self::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}; +use self::query_term::{Phrase, WordDerivations}; +use self::resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; +use crate::search::new::graph_based_ranking_rule::GraphBasedRankingRule; use crate::search::new::query_term::located_query_terms_from_string; +use crate::search::new::ranking_rule_graph::{ProximityGraph, TypoGraph}; +use crate::search::new::words::Words; use crate::{Filter, Index, Result, TermsMatchingStrategy}; pub enum BitmapOrAllRef<'s> { @@ -42,7 +46,8 @@ pub struct SearchContext<'search> { pub db_cache: DatabaseCache<'search>, pub word_interner: Interner, pub phrase_interner: Interner, - pub node_docids_cache: NodeDocIdsCache, + pub derivations_interner: Interner, + pub query_term_docids: QueryTermDocIdsCache, } impl<'search> SearchContext<'search> { pub fn new(index: &'search Index, txn: &'search RoTxn<'search>) -> Self { @@ -52,7 +57,8 @@ impl<'search> SearchContext<'search> { db_cache: <_>::default(), word_interner: <_>::default(), phrase_interner: 
<_>::default(), - node_docids_cache: <_>::default(), + derivations_interner: <_>::default(), + query_term_docids: <_>::default(), } } } @@ -129,5 +135,368 @@ pub fn execute_search<'search>( logger.initial_universe(&universe); - apply_ranking_rules(ctx, &graph, &universe, from, length, logger) + let words = &mut Words::new(TermsMatchingStrategy::Last); + // let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; + let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned()); + let typo = &mut GraphBasedRankingRule::::new("typo".to_owned()); + // TODO: ranking rules given as argument + let ranking_rules: Vec<&mut dyn RankingRule<'search, QueryGraph>> = + vec![words, typo, proximity /*sort*/]; + + bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, logger) +} + +#[cfg(test)] +mod tests { + // use crate::allocator::ALLOC; + use std::fs::File; + use std::io::{BufRead, BufReader, Cursor, Seek}; + use std::time::Instant; + + use big_s::S; + use heed::EnvOpenOptions; + use maplit::hashset; + + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + // use crate::search::new::logger::detailed::DetailedSearchLogger; + use crate::search::new::logger::DefaultSearchLogger; + use crate::search::new::{execute_search, SearchContext}; + use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; + use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy}; + + #[test] + fn search_wiki_new() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_wiki").unwrap(); + let txn = index.read_txn().unwrap(); + + println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); + + // loop { + let start = Instant::now(); + + let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let results = execute_search( + &mut ctx, + "zero config", + None, + 0, + 20, + // &mut DefaultSearchLogger, + &mut logger, + ) + .unwrap(); + + logger.write_d2_description(&mut ctx); + + let elapsed = start.elapsed(); + println!("{}us", elapsed.as_micros()); + + let _documents = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) + }) + .collect::>(); + + println!("{}us: {:?}", elapsed.as_micros(), results); + // } + // for (id, _document) in documents { + // println!("{id}:"); + // // println!("{document}"); + // } + } + + #[test] + fn search_wiki_old() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_wiki").unwrap(); + + let txn = index.read_txn().unwrap(); + + let rr = index.criteria(&txn).unwrap(); + println!("{rr:?}"); + + let start = Instant::now(); + + let mut s = Search::new(&txn, &index); + s.query("which a the releases from poison by the government"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); + let docs = s.execute().unwrap(); + + let elapsed = start.elapsed(); + + let documents = index + 
.documents(&txn, docs.documents_ids.iter().copied()) + .unwrap() + .into_iter() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) + }) + .collect::>(); + + println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); + for (id, _document) in documents { + println!("{id}:"); + // println!("{document}"); + } + } + #[test] + fn search_movies_new() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_movies").unwrap(); + let txn = index.read_txn().unwrap(); + + // let primary_key = index.primary_key(&txn).unwrap().unwrap(); + // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + // loop { + let start = Instant::now(); + + let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let results = execute_search( + &mut ctx, + "releases from poison by the government", + None, + 0, + 20, + // &mut DefaultSearchLogger, + &mut logger, + ) + .unwrap(); + + logger.write_d2_description(&mut ctx); + + let elapsed = start.elapsed(); + + // let ids = index + // .documents(&txn, results.iter().copied()) + // .unwrap() + // .into_iter() + // .map(|x| { + // let obkv = &x.1; + // let id = obkv.get(primary_key).unwrap(); + // let id: serde_json::Value = serde_json::from_slice(id).unwrap(); + // id.as_str().unwrap().to_owned() + // }) + // .collect::>(); + + println!("{}us: {results:?}", elapsed.as_micros()); + // println!("external ids: {ids:?}"); + // } + } + + #[test] + fn search_movies_old() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_movies").unwrap(); + + let txn = index.read_txn().unwrap(); + + let rr = index.criteria(&txn).unwrap(); + println!("{rr:?}"); + + let primary_key = index.primary_key(&txn).unwrap().unwrap(); + let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + + let start = Instant::now(); + + let mut s = Search::new(&txn, &index); + s.query("which a the releases from poison by the government"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); + let docs = s.execute().unwrap(); + + let elapsed = start.elapsed(); + + let ids = index + .documents(&txn, docs.documents_ids.iter().copied()) + .unwrap() + .into_iter() + .map(|x| { + let obkv = &x.1; + let id = obkv.get(primary_key).unwrap(); + let id: serde_json::Value = serde_json::from_slice(id).unwrap(); + id.as_str().unwrap().to_owned() + }) + .collect::>(); + + println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); + println!("external ids: {ids:?}"); + } + + #[test] + fn _settings_movies() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_movies").unwrap(); + let mut wtxn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_min_word_len_one_typo(5); + builder.set_min_word_len_two_typos(100); + 
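// Note: with the two settings just above, words shorter than 5 characters
// tolerate no typo, words of 5 to 99 characters tolerate one, and only
// words of 100+ characters (i.e. effectively none) tolerate two. A sketch
// of that mapping (hypothetical helper, not part of the Settings API):

fn allowed_typos(word_len: usize, one_typo_min: usize, two_typos_min: usize) -> u8 {
    if word_len >= two_typos_min {
        2
    } else if word_len >= one_typo_min {
        1
    } else {
        0
    }
}

// allowed_typos(4, 5, 100) == 0, allowed_typos(10, 5, 100) == 1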
builder.set_sortable_fields(hashset! { S("release_date") }); + builder.set_criteria(vec![ + Criterion::Words, + Criterion::Typo, + Criterion::Proximity, + Criterion::Asc("release_date".to_owned()), + ]); + + builder.execute(|_| (), || false).unwrap(); + wtxn.commit().unwrap(); + } + + #[test] + fn _index_movies() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_movies").unwrap(); + let mut wtxn = index.write_txn().unwrap(); + + let primary_key = "id"; + let searchable_fields = vec!["title", "overview"]; + let filterable_fields = vec!["release_date", "genres"]; + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key(primary_key.to_owned()); + let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(filterable_fields); + + builder.set_min_word_len_one_typo(5); + builder.set_min_word_len_two_typos(100); + builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + builder.execute(|_| (), || false).unwrap(); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) + .unwrap(); + + let documents = documents_from( + "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json", + "json", + ); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + } + #[test] + fn _index_wiki() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_wiki").unwrap(); + let mut wtxn = index.write_txn().unwrap(); + + // let primary_key = "id"; + let searchable_fields = vec!["body", "title", "url"]; + // let filterable_fields = vec![]; + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + // builder.set_primary_key(primary_key.to_owned()); + let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + // builder.set_filterable_fields(filterable_fields); + + // builder.set_min_word_len_one_typo(5); + // builder.set_min_word_len_two_typos(100); + builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]); + builder.execute(|_| (), || false).unwrap(); + + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) + .unwrap(); + + let documents = documents_from( + "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv", + "csv", + ); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + } + + fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { + let reader = File::open(filename) + 
.unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename)); + let reader = BufReader::new(reader); + let documents = match filetype { + "csv" => documents_from_csv(reader).unwrap(), + "json" => documents_from_json(reader).unwrap(), + "jsonl" => documents_from_jsonl(reader).unwrap(), + otherwise => panic!("invalid update format {:?}", otherwise), + }; + DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() + } + + fn documents_from_jsonl(reader: impl BufRead) -> crate::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { + let object = result.unwrap(); + documents.append_json_object(&object)?; + } + + documents.into_inner().map_err(Into::into) + } + + fn documents_from_json(reader: impl BufRead) -> crate::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + documents.append_json_array(reader)?; + + documents.into_inner().map_err(Into::into) + } + + fn documents_from_csv(reader: impl BufRead) -> crate::Result> { + let csv = csv::Reader::from_reader(reader); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) + } } diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 88d1849e3..90edd4f09 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -3,7 +3,7 @@ use super::small_bitmap::SmallBitmap; use super::SearchContext; use crate::Result; -const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64; +pub const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64; /// A node of the [`QueryGraph`]. /// @@ -148,7 +148,7 @@ impl QueryGraph { let mut new_nodes = vec![]; let new_node_idx = graph.add_node(&prev0, QueryNode::Term(term0.clone())); new_nodes.push(new_node_idx); - if term0.is_empty() { + if term0.is_empty(&ctx.derivations_interner) { empty_nodes.push(new_node_idx); } @@ -159,7 +159,7 @@ impl QueryGraph { if word_set.contains(ctx.word_interner.get(ngram2_str)) { let ngram2 = LocatedQueryTerm { value: QueryTerm::Word { - derivations: WordDerivations { + derivations: ctx.derivations_interner.insert(WordDerivations { original: ngram2_str, // TODO: could add a typo if it's an ngram? zero_typo: Box::new([ngram2_str]), @@ -168,7 +168,7 @@ impl QueryGraph { use_prefix_db: false, synonyms: Box::new([]), // TODO: ngram synonyms split_words: None, // TODO: maybe ngram split words? - }, + }), }, positions: ngram2_pos, }; @@ -187,7 +187,7 @@ impl QueryGraph { if word_set.contains(ctx.word_interner.get(ngram3_str)) { let ngram3 = LocatedQueryTerm { value: QueryTerm::Word { - derivations: WordDerivations { + derivations: ctx.derivations_interner.insert(WordDerivations { original: ngram3_str, // TODO: could add a typo if it's an ngram? zero_typo: Box::new([ngram3_str]), @@ -197,7 +197,7 @@ impl QueryGraph { synonyms: Box::new([]), // TODO: ngram synonyms split_words: None, // TODO: maybe ngram split words? 
// would be nice for typos like su nflower - }, + }), }, positions: ngram3_pos, }; @@ -277,9 +277,10 @@ impl QueryGraph { loop { let mut nodes_to_remove = vec![]; for (node_idx, node) in self.nodes.iter().enumerate() { - if !matches!(node, QueryNode::End | QueryNode::Deleted) - && (self.edges[node_idx].successors.is_empty() - || self.edges[node_idx].predecessors.is_empty()) + if (!matches!(node, QueryNode::End | QueryNode::Deleted) + && self.edges[node_idx].successors.is_empty()) + || (!matches!(node, QueryNode::Start | QueryNode::Deleted) + && self.edges[node_idx].predecessors.is_empty()) { nodes_to_remove.push(node_idx as u16); } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 446f34e68..1b709d0e9 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -29,7 +29,7 @@ impl Phrase { /// A structure storing all the different ways to match /// a term in the user's search query. -#[derive(Clone)] +#[derive(Clone, PartialEq, Eq, Hash)] pub struct WordDerivations { /// The original word pub original: Interned, @@ -59,12 +59,12 @@ impl WordDerivations { /// Return an iterator over all the single words derived from the original word. /// /// This excludes synonyms, split words, and words stored in the prefix databases. - pub fn all_derivations_except_prefix_db( + pub fn all_single_word_derivations_except_prefix_db( &'_ self, ) -> impl Iterator> + Clone + '_ { self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter()).copied() } - fn is_empty(&self) -> bool { + pub fn is_empty(&self) -> bool { self.zero_typo.is_empty() && self.one_typo.is_empty() && self.two_typos.is_empty() @@ -101,10 +101,10 @@ pub fn word_derivations( let prefix = Str::new(word).starts_with(); let mut stream = fst.search(prefix).into_stream(); - while let Some(word) = stream.next() { - let word = std::str::from_utf8(word)?.to_owned(); - let word_interned = ctx.word_interner.insert(word); - zero_typo.push(word_interned); + while let Some(derived_word) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?.to_owned(); + let derived_word_interned = ctx.word_interner.insert(derived_word); + zero_typo.push(derived_word_interned); } } else if fst.contains(word) { zero_typo.push(word_interned); @@ -113,17 +113,19 @@ pub fn word_derivations( let dfa = build_dfa(word, 1, is_prefix); let starts = StartsWith(Str::new(get_first(word))); let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); + // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them? + + while let Some((derived_word, state)) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?; - while let Some((word, state)) = stream.next() { - let word = std::str::from_utf8(word)?; - let word_interned = ctx.word_interner.insert(word.to_owned()); let d = dfa.distance(state.1); + let derived_word_interned = ctx.word_interner.insert(derived_word.to_owned()); match d.to_u8() { 0 => { - zero_typo.push(word_interned); + zero_typo.push(derived_word_interned); } 1 => { - one_typo.push(word_interned); + one_typo.push(derived_word_interned); } _ => panic!(), } @@ -136,27 +138,28 @@ pub fn word_derivations( let automaton = Union(first, &second); let mut stream = fst.search_with_state(automaton).into_stream(); + // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them? 
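// One plausible answer to the TODO above is to cap how many derivations are
// collected per typo bucket and stop consuming the fst stream once the caps
// are reached. A hedged sketch; the cap value and helper are assumptions,
// not current behaviour:

const MAX_DERIVATIONS_PER_BUCKET: usize = 100;

/// Push `word` into `bucket` unless the cap is reached; returns whether the
/// caller should keep consuming the stream.
fn push_capped(bucket: &mut Vec<String>, word: String) -> bool {
    if bucket.len() < MAX_DERIVATIONS_PER_BUCKET {
        bucket.push(word);
    }
    bucket.len() < MAX_DERIVATIONS_PER_BUCKET
}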
- while let Some((found_word, state)) = stream.next() { - let found_word = std::str::from_utf8(found_word)?; - let found_word_interned = ctx.word_interner.insert(found_word.to_owned()); + while let Some((derived_word, state)) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?; + let derived_word_interned = ctx.word_interner.insert(derived_word.to_owned()); // in the case the typo is on the first letter, we know the number of typo // is two - if get_first(found_word) != get_first(word) { - two_typos.push(found_word_interned); + if get_first(derived_word) != get_first(word) { + two_typos.push(derived_word_interned); } else { // Else, we know that it is the second dfa that matched and compute the // correct distance let d = second_dfa.distance((state.1).0); match d.to_u8() { 0 => { - zero_typo.push(found_word_interned); + zero_typo.push(derived_word_interned); } 1 => { - one_typo.push(found_word_interned); + one_typo.push(derived_word_interned); } 2 => { - two_typos.push(found_word_interned); + two_typos.push(derived_word_interned); } _ => panic!(), } @@ -223,10 +226,11 @@ fn split_best_frequency( Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned()))) } -#[derive(Clone)] +#[derive(Clone, PartialEq, Eq, Hash)] pub enum QueryTerm { Phrase { phrase: Interned }, - Word { derivations: WordDerivations }, + // TODO: change to `Interned`? + Word { derivations: Interned }, } impl QueryTerm { @@ -234,10 +238,12 @@ impl QueryTerm { pub fn original_single_word<'interner>( &self, word_interner: &'interner Interner, + derivations_interner: &'interner Interner, ) -> Option<&'interner str> { match self { QueryTerm::Phrase { phrase: _ } => None, QueryTerm::Word { derivations } => { + let derivations = derivations_interner.get(*derivations); if derivations.is_empty() { None } else { @@ -257,12 +263,12 @@ pub struct LocatedQueryTerm { impl LocatedQueryTerm { /// Return `true` iff the word derivations within the query term are empty - pub fn is_empty(&self) -> bool { - match &self.value { + pub fn is_empty(&self, interner: &Interner) -> bool { + match self.value { // TODO: phrases should be greedily computed, so that they can be excluded from // the query graph right from the start? QueryTerm::Phrase { phrase: _ } => false, - QueryTerm::Word { derivations, .. } => derivations.is_empty(), + QueryTerm::Word { derivations, .. 
} => interner.get(derivations).is_empty(), } } } @@ -336,7 +342,9 @@ pub fn located_query_terms_from_string<'search>( let word = token.lemma(); let derivations = word_derivations(ctx, word, nbr_typos(word), false)?; let located_term = LocatedQueryTerm { - value: QueryTerm::Word { derivations }, + value: QueryTerm::Word { + derivations: ctx.derivations_interner.insert(derivations), + }, positions: position..=position, }; located_terms.push(located_term); @@ -347,7 +355,9 @@ pub fn located_query_terms_from_string<'search>( let word = token.lemma(); let derivations = word_derivations(ctx, word, nbr_typos(word), true)?; let located_term = LocatedQueryTerm { - value: QueryTerm::Word { derivations }, + value: QueryTerm::Word { + derivations: ctx.derivations_interner.insert(derivations), + }, positions: position..=position, }; located_terms.push(located_term); @@ -409,8 +419,8 @@ pub fn ngram2( return None; } match ( - &x.value.original_single_word(&ctx.word_interner), - &y.value.original_single_word(&ctx.word_interner), + &x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), + &y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), ) { (Some(w1), Some(w2)) => { let term = ( @@ -436,9 +446,9 @@ pub fn ngram3( return None; } match ( - &x.value.original_single_word(&ctx.word_interner), - &y.value.original_single_word(&ctx.word_interner), - &z.value.original_single_word(&ctx.word_interner), + &x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), + &y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), + &z.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), ) { (Some(w1), Some(w2), Some(w3)) => { let term = ( diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 0911f0638..6caa4a769 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -14,29 +14,33 @@ pub fn visit_from_node( from_node: &QueryNode, ) -> Result> { Ok(Some(match from_node { - QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => match value1 { - QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()), - QueryTerm::Phrase { phrase: phrase1 } => { - let phrase1 = ctx.phrase_interner.get(*phrase1); - if let Some(original) = *phrase1.words.last().unwrap() { - ( - WordDerivations { - original, - zero_typo: Box::new([original]), - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: false, - synonyms: Box::new([]), - split_words: None, - }, - *pos1.end(), - ) - } else { - // No word pairs if the phrase does not have a regular word as its last term - return Ok(None); + QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => { + match value1 { + QueryTerm::Word { derivations } => { + (ctx.derivations_interner.get(*derivations).clone(), *pos1.end()) + } + QueryTerm::Phrase { phrase: phrase1 } => { + let phrase1 = ctx.phrase_interner.get(*phrase1); + if let Some(original) = *phrase1.words.last().unwrap() { + ( + WordDerivations { + original, + zero_typo: Box::new([original]), + one_typo: Box::new([]), + two_typos: Box::new([]), + use_prefix_db: false, + synonyms: Box::new([]), + split_words: None, + }, + *pos1.end(), + ) + } else { + // No word pairs if the phrase does not have a regular word as its last term + return Ok(None); + } } } - }, + } QueryNode::Start => ( WordDerivations { original: 
ctx.word_interner.insert(String::new()),
@@ -58,6 +62,10 @@ pub fn visit_to_node<'search, 'from_data>(
     to_node: &QueryNode,
     from_node_data: &'from_data (WordDerivations, i8),
 ) -> Result<Vec<(u8, EdgeCondition<ProximityEdge>)>> {
+    let SearchContext { index, txn, db_cache, word_interner, derivations_interner, .. } = ctx;
+
+    // IMPORTANT! TODO: split words support
+
     let (derivations1, pos1) = from_node_data;
     let term2 = match &to_node {
         QueryNode::End => return Ok(vec![(0, EdgeCondition::Unconditional)]),
@@ -67,7 +75,9 @@ pub fn visit_to_node<'search, 'from_data>(
     let LocatedQueryTerm { value: value2, positions: pos2 } = term2;
 
     let (derivations2, pos2, ngram_len2) = match value2 {
-        QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()),
+        QueryTerm::Word { derivations } => {
+            (derivations_interner.get(*derivations).clone(), *pos2.start(), pos2.len())
+        }
         QueryTerm::Phrase { phrase: phrase2 } => {
             let phrase2 = ctx.phrase_interner.get(*phrase2);
             if let Some(original) = *phrase2.words.first().unwrap() {
@@ -105,7 +115,8 @@ pub fn visit_to_node<'search, 'from_data>(
     // left term cannot be a prefix
     assert!(!updb1);
 
-    let derivations1 = derivations1.all_derivations_except_prefix_db();
+    // TODO: IMPORTANT! split words and synonyms support
+    let derivations1 = derivations1.all_single_word_derivations_except_prefix_db();
     // TODO: eventually, we want to get rid of the uses from `original`
     let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();
 
@@ -115,8 +126,11 @@ pub fn visit_to_node<'search, 'from_data>(
                 let cost = (proximity + ngram_len2 - 1) as u8;
                 // TODO: if we had access to the universe here, we could already check whether
                 // the bitmap corresponding to this word pair is disjoint with the universe or not
-                if ctx
+                if db_cache
                     .get_word_prefix_pair_proximity_docids(
+                        index,
+                        txn,
+                        word_interner,
                         word1,
                         derivations2.original,
                         proximity as u8,
@@ -133,8 +147,11 @@ pub fn visit_to_node<'search, 'from_data>(
                         right_prefix: derivations2.original,
                     });
                 }
-                if ctx
+                if db_cache
                     .get_prefix_word_pair_proximity_docids(
+                        index,
+                        txn,
+                        word_interner,
                         derivations2.original,
                         word1,
                         proximity as u8 - 1,
@@ -155,14 +172,30 @@ pub fn visit_to_node<'search, 'from_data>(
         }
     }
 
-    let derivations2 = derivations2.all_derivations_except_prefix_db();
-    // TODO: add safeguard in case the cartesian product is too large?
+    // TODO: important! support split words and synonyms as well
+    let derivations2 = derivations2.all_single_word_derivations_except_prefix_db();
+    // TODO: add safeguard in case the cartesian product is too large!
+    // even if we restrict the word derivations to a maximum of 100, the size of the
+    // cartesian product could reach a maximum of 10_000 derivations, which is way too much.
+    // Maybe prioritise the product of zero-typo derivations, then the product of zero-typo/one-typo
+    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
+    // reached
    let product_derivations = derivations1.cartesian_product(derivations2);

    for (word1, word2) in product_derivations {
        for proximity in 1..=(8 - ngram_len2) {
            let cost = (proximity + ngram_len2 - 1) as u8;
-            if ctx.get_word_pair_proximity_docids(word1, word2, proximity as u8)?.is_some() {
+            if db_cache
+                .get_word_pair_proximity_docids(
+                    index,
+                    txn,
+                    word_interner,
+                    word1,
+                    word2,
+                    proximity as u8,
+                )?
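// One possible shape for the safeguard sketched in the TODO above: walk the
// derivation pairs in order of total typo count and stop once an arbitrary
// budget is spent. `zero`, `one` and `two` are hypothetical slices holding the
// derivations grouped by number of typos; they are not names from this patch.
fn prioritized_pairs(zero: &[u32], one: &[u32], two: &[u32], budget: usize) -> Vec<(u32, u32)> {
    let buckets = [zero, one, two];
    let mut out = Vec::new();
    // total = 0 yields (0,0); total = 1 yields (0,1) and (1,0); and so on.
    for total in 0..=4usize {
        for t1 in 0..=total.min(2) {
            let t2 = total - t1;
            if t2 > 2 {
                continue;
            }
            for &w1 in buckets[t1] {
                for &w2 in buckets[t2] {
                    if out.len() >= budget {
                        return out;
                    }
                    out.push((w1, w2));
                }
            }
        }
    }
    out
}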
+ .is_some() + { cost_proximity_word_pairs .entry(cost) .or_default() @@ -171,7 +204,16 @@ pub fn visit_to_node<'search, 'from_data>( .push(WordPair::Words { left: word1, right: word2 }); } if proximity > 1 - && ctx.get_word_pair_proximity_docids(word2, word1, proximity as u8 - 1)?.is_some() + && db_cache + .get_word_pair_proximity_docids( + index, + txn, + word_interner, + word2, + word1, + proximity as u8 - 1, + )? + .is_some() { cost_proximity_word_pairs .entry(cost) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 9aa4ce446..777d69b64 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -9,19 +9,37 @@ pub fn compute_docids<'search>( edge: &ProximityEdge, universe: &RoaringBitmap, ) -> Result { + let SearchContext { index, txn, db_cache, word_interner, .. } = ctx; let ProximityEdge { pairs, proximity } = edge; let mut pair_docids = RoaringBitmap::new(); for pair in pairs.iter() { let bytes = match pair { - WordPair::Words { left, right } => { - ctx.get_word_pair_proximity_docids(*left, *right, *proximity) - } - WordPair::WordPrefix { left, right_prefix } => { - ctx.get_word_prefix_pair_proximity_docids(*left, *right_prefix, *proximity) - } - WordPair::WordPrefixSwapped { left_prefix, right } => { - ctx.get_prefix_word_pair_proximity_docids(*left_prefix, *right, *proximity) - } + WordPair::Words { left, right } => db_cache.get_word_pair_proximity_docids( + index, + txn, + word_interner, + *left, + *right, + *proximity, + ), + WordPair::WordPrefix { left, right_prefix } => db_cache + .get_word_prefix_pair_proximity_docids( + index, + txn, + word_interner, + *left, + *right_prefix, + *proximity, + ), + WordPair::WordPrefixSwapped { left_prefix, right } => db_cache + .get_prefix_word_pair_proximity_docids( + index, + txn, + word_interner, + *left_prefix, + *right, + *proximity, + ), }?; // TODO: deserialize bitmap within a universe, and (maybe) using a bump allocator? 
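// What "deserialize within a universe" would buy, sketched against the
// current two-step approach (a hedged illustration, not part of the patch):
// today the full bitmap is decoded and only then intersected, allocating for
// document ids that the universe immediately discards; a universe-aware
// decoder could skip whole containers instead.
fn pair_docids_in_universe(
    bytes: &[u8],
    universe: &roaring::RoaringBitmap,
) -> crate::Result<roaring::RoaringBitmap> {
    // Decode everything, then filter.
    let full = crate::CboRoaringBitmapCodec::deserialize_from(bytes)?;
    Ok(universe & full)
}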
let bitmap = universe diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 2f6e7ad80..6354909f6 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,4 +1,3 @@ -use heed::BytesDecode; use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; @@ -6,15 +5,14 @@ use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::Interned; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; -use crate::search::new::resolve_query_graph::resolve_phrase; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; -use crate::{Result, RoaringBitmapCodec}; +use crate::Result; #[derive(Clone)] pub enum TypoEdge { Phrase { phrase: Interned }, - Word { derivations: WordDerivations, nbr_typos: u8 }, + Word { derivations: Interned, nbr_typos: u8 }, } pub enum TypoGraph {} @@ -35,32 +33,37 @@ impl RankingRuleGraphTrait for TypoGraph { edge: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result { + let SearchContext { + index, + txn, + db_cache, + word_interner, + phrase_interner, + derivations_interner, + query_term_docids, + } = ctx; match edge { - TypoEdge::Phrase { phrase } => resolve_phrase(ctx, *phrase), - TypoEdge::Word { derivations, nbr_typos } => { - let words = match nbr_typos { - 0 => &derivations.zero_typo, - 1 => &derivations.one_typo, - 2 => &derivations.two_typos, - _ => panic!(), - }; - let mut docids = RoaringBitmap::new(); - for word in words.iter().copied() { - let Some(bytes) = ctx.get_word_docids(word)? else { continue }; - // TODO: deserialize bitmap within a universe - let bitmap = universe - & RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; - docids |= bitmap; - } - if *nbr_typos == 0 { - if let Some(bytes) = ctx.get_word_prefix_docids(derivations.original)? { - // TODO: deserialize bitmap within a universe - let bitmap = universe - & RoaringBitmapCodec::bytes_decode(bytes) - .ok_or(heed::Error::Decoding)?; - docids |= bitmap; - } - } + &TypoEdge::Phrase { phrase } => Ok(universe + & query_term_docids.get_phrase_docids( + index, + txn, + db_cache, + word_interner, + phrase_interner, + phrase, + )?), + TypoEdge::Word { derivations, .. } => { + let docids = universe + & query_term_docids.get_word_derivations_docids( + index, + txn, + db_cache, + word_interner, + derivations_interner, + phrase_interner, + *derivations, + )?; + Ok(docids) } } @@ -74,43 +77,71 @@ impl RankingRuleGraphTrait for TypoGraph { } fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( - _ctx: &mut SearchContext<'search>, + ctx: &mut SearchContext<'search>, to_node: &QueryNode, _from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { + let SearchContext { derivations_interner, .. } = ctx; match to_node { - QueryNode::Term(LocatedQueryTerm { value, .. }) => match value { - &QueryTerm::Phrase { phrase } => { + QueryNode::Term(LocatedQueryTerm { value, .. 
}) => match *value {
+            QueryTerm::Phrase { phrase } => {
                 Ok(vec![(0, EdgeCondition::Conditional(TypoEdge::Phrase { phrase }))])
             }
             QueryTerm::Word { derivations } => {
                 let mut edges = vec![];
-                if !derivations.zero_typo.is_empty() || derivations.use_prefix_db {
-                    edges.push((
-                        0,
-                        EdgeCondition::Conditional(TypoEdge::Word {
-                            derivations: derivations.clone(),
-                            nbr_typos: 0,
-                        }),
-                    ))
-                }
-                if !derivations.one_typo.is_empty() {
-                    edges.push((
-                        1,
-                        EdgeCondition::Conditional(TypoEdge::Word {
-                            derivations: derivations.clone(),
-                            nbr_typos: 1,
-                        }),
-                    ))
-                }
-                if !derivations.two_typos.is_empty() {
-                    edges.push((
-                        2,
-                        EdgeCondition::Conditional(TypoEdge::Word {
-                            derivations: derivations.clone(),
-                            nbr_typos: 2,
-                        }),
-                    ))
+
+                for nbr_typos in 0..=2 {
+                    let derivations = derivations_interner.get(derivations).clone();
+                    let new_derivations = match nbr_typos {
+                        0 => {
+                            // TODO: think about how split words and synonyms should be handled here
+                            // TODO: what about ngrams?
+                            // Maybe 2grams should have one typo by default and 3grams 2 typos by default
+                            WordDerivations {
+                                original: derivations.original,
+                                synonyms: derivations.synonyms,
+                                split_words: None,
+                                zero_typo: derivations.zero_typo,
+                                one_typo: Box::new([]),
+                                two_typos: Box::new([]),
+                                use_prefix_db: derivations.use_prefix_db,
+                            }
+                        }
+                        1 => {
+                            // What about split words and synonyms here?
+                            WordDerivations {
+                                original: derivations.original,
+                                synonyms: Box::new([]),
+                                split_words: derivations.split_words,
+                                zero_typo: Box::new([]),
+                                one_typo: derivations.one_typo,
+                                two_typos: Box::new([]),
+                                use_prefix_db: false, // false because all items from use_prefix_db have 0 typos
+                            }
+                        }
+                        2 => {
+                            // What about split words and synonyms here?
+                            WordDerivations {
+                                original: derivations.original,
+                                synonyms: Box::new([]),
+                                split_words: None,
+                                zero_typo: Box::new([]),
+                                one_typo: Box::new([]),
+                                two_typos: derivations.two_typos,
+                                use_prefix_db: false, // false because all items from use_prefix_db have 0 typos
+                            }
+                        }
+                        _ => panic!(),
+                    };
+                    if !new_derivations.is_empty() {
+                        edges.push((
+                            nbr_typos,
+                            EdgeCondition::Conditional(TypoEdge::Word {
+                                derivations: derivations_interner.insert(new_derivations),
+                                nbr_typos,
+                            }),
+                        ))
+                    }
                 }
                 Ok(edges)
             }
diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
index 3ccb54032..d65610f9d 100644
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@@ -2,42 +2,23 @@ use roaring::RoaringBitmap;
 
 use super::logger::SearchLogger;
 use super::{QueryGraph, SearchContext};
-use crate::search::new::graph_based_ranking_rule::GraphBasedRankingRule;
-use crate::search::new::ranking_rule_graph::{ProximityGraph, TypoGraph};
-use crate::search::new::words::Words;
 // use crate::search::new::sort::Sort;
-use crate::{Result, TermsMatchingStrategy};
-
-pub trait RankingRuleOutputIter<'search, Query> {
-    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;
-}
-
-pub struct RankingRuleOutputIterWrapper<'search, Query> {
-    iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'search>,
-}
-impl<'search, Query> RankingRuleOutputIterWrapper<'search, Query> {
-    pub fn new(iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'search>) -> Self {
-        Self { iter }
-    }
-}
-impl<'search, Query> RankingRuleOutputIter<'search, Query>
-    for RankingRuleOutputIterWrapper<'search, Query>
-{
-    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>> {
-        match self.iter.next() {
-            Some(x) => x.map(Some),
-            None => Ok(None),
-        }
-    }
-}
+use crate::Result;
 
+/// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`]
 pub trait RankingRuleQueryTrait:
Sized + Clone + 'static {} +/// A type describing a placeholder search #[derive(Clone)] pub struct PlaceholderQuery; impl RankingRuleQueryTrait for PlaceholderQuery {} impl RankingRuleQueryTrait for QueryGraph {} +/// A trait that must be implemented by all ranking rules. +/// +/// It is generic over `'search`, the lifetime of the search context +/// (i.e. the read transaction and the cache) and over `Query`, which +/// can be either [`PlaceholderQuery`] or [`QueryGraph`]. pub trait RankingRule<'search, Query: RankingRuleQueryTrait> { fn id(&self) -> String; @@ -76,6 +57,8 @@ pub trait RankingRule<'search, Query: RankingRuleQueryTrait> { ); } +/// Output of a ranking rule, consisting of the query to be used +/// by the child ranking rule and a set of document ids. #[derive(Debug)] pub struct RankingRuleOutput { /// The query corresponding to the current bucket for the child ranking rule @@ -84,25 +67,16 @@ pub struct RankingRuleOutput { pub candidates: RoaringBitmap, } -// TODO: can make it generic over the query type (either query graph or placeholder) fairly easily -#[allow(clippy::too_many_arguments)] -pub fn apply_ranking_rules<'search>( +pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'search>, - // TODO: ranking rules parameter - query_graph: &QueryGraph, + mut ranking_rules: Vec<&mut dyn RankingRule<'search, Q>>, + query_graph: &Q, universe: &RoaringBitmap, from: usize, length: usize, - logger: &mut dyn SearchLogger, + logger: &mut dyn SearchLogger, ) -> Result> { logger.initial_query(query_graph); - let words = &mut Words::new(TermsMatchingStrategy::Last); - // let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; - let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned()); - let typo = &mut GraphBasedRankingRule::::new("typo".to_owned()); - // TODO: ranking rules given as argument - let mut ranking_rules: Vec<&mut dyn RankingRule<'search, QueryGraph>> = - vec![words, typo, proximity /*sort*/]; logger.ranking_rules(&ranking_rules); @@ -119,6 +93,9 @@ pub fn apply_ranking_rules<'search>( let mut cur_ranking_rule_index = 0; + /// Finish iterating over the current ranking rule, yielding + /// control to the parent (or finishing the search if not possible). + /// Update the candidates accordingly and inform the logger. macro_rules! back { () => { assert!(candidates[cur_ranking_rule_index].is_empty()); @@ -140,8 +117,8 @@ pub fn apply_ranking_rules<'search>( let mut results = vec![]; let mut cur_offset = 0usize; - // Add the candidates to the results. Take the `from`, `limit`, and `cur_offset` - // into account and inform the logger. + /// Add the candidates to the results. Take the `from`, `limit`, and `cur_offset` + /// into account and inform the logger. macro_rules! maybe_add_to_results { ($candidates:expr) => { let candidates = $candidates; @@ -193,7 +170,6 @@ pub fn apply_ranking_rules<'search>( } let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &candidates[cur_ranking_rule_index])? else { - // TODO: add remaining candidates automatically here? 
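// A stripped-down, recursive model of the loop implemented here (hypothetical
// helper; the real code is iterative so each rule can be a trait object with
// per-level state): every rule splits the universe it receives into ordered
// buckets, and each bucket is refined by the next rule before its documents
// are emitted.
fn bucket_sort_model(
    rules: &[&dyn Fn(&RoaringBitmap) -> Vec<RoaringBitmap>],
    universe: &RoaringBitmap,
    results: &mut Vec<u32>,
    limit: usize,
) {
    if results.len() >= limit || universe.is_empty() {
        return;
    }
    let Some((rule, rest)) = rules.split_first() else {
        // No rule left: emit the documents in id order, up to the limit.
        results.extend(universe.iter().take(limit - results.len()));
        return;
    };
    for bucket in rule(universe) {
        bucket_sort_model(rest, &bucket, results, limit);
        if results.len() >= limit {
            return;
        }
    }
}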
back!(); continue; }; diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index ca64e4342..4606b9226 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -1,74 +1,140 @@ +#![allow(clippy::too_many_arguments)] + use std::collections::VecDeque; use fxhash::FxHashMap; -use heed::BytesDecode; +use heed::{BytesDecode, RoTxn}; use roaring::{MultiOps, RoaringBitmap}; -use super::interner::Interned; +use super::db_cache::DatabaseCache; +use super::interner::{Interned, Interner}; +use super::query_graph::QUERY_GRAPH_NODE_LENGTH_LIMIT; use super::query_term::{Phrase, QueryTerm, WordDerivations}; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; -use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; -// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. #[derive(Default)] -pub struct NodeDocIdsCache { - pub cache: FxHashMap, +pub struct QueryTermDocIdsCache { + pub phrases: FxHashMap, RoaringBitmap>, + pub derivations: FxHashMap, RoaringBitmap>, } -impl<'search> SearchContext<'search> { - fn get_node_docids<'cache>( - &'cache mut self, - term: &QueryTerm, - node_idx: u16, - ) -> Result<&'cache RoaringBitmap> { - if self.node_docids_cache.cache.contains_key(&node_idx) { - return Ok(&self.node_docids_cache.cache[&node_idx]); +impl QueryTermDocIdsCache { + /// Get the document ids associated with the given phrase + pub fn get_phrase_docids<'s, 'search>( + &'s mut self, + index: &Index, + txn: &'search RoTxn, + db_cache: &mut DatabaseCache<'search>, + word_interner: &Interner, + phrase_interner: &Interner, + phrase: Interned, + ) -> Result<&'s RoaringBitmap> { + if self.phrases.contains_key(&phrase) { + return Ok(&self.phrases[&phrase]); }; - let docids = match term { - QueryTerm::Phrase { phrase } => resolve_phrase(self, *phrase)?, - QueryTerm::Word { - derivations: - WordDerivations { - original, - zero_typo, - one_typo, - two_typos, - use_prefix_db, - synonyms, - split_words, - }, - } => { - let mut or_docids = vec![]; - for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()).copied() - { - if let Some(word_docids) = self.get_word_docids(word)? { - or_docids.push(word_docids); - } - } - if *use_prefix_db { - if let Some(prefix_docids) = self.get_word_prefix_docids(*original)? { - or_docids.push(prefix_docids); - } - } - let mut docids = or_docids - .into_iter() - .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()) - .collect::>(); - for synonym in synonyms.iter().copied() { - // TODO: cache resolve_phrase? 
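// What the TODO asks for is what `QueryTermDocIdsCache` provides: memoization
// keyed by the interned id, so each phrase or set of derivations hits the
// database once per search. The core pattern, with a plain `u32` standing in
// for `Interned<T>` (hypothetical helper):
fn get_or_compute<'c>(
    cache: &'c mut fxhash::FxHashMap<u32, RoaringBitmap>,
    key: u32,
    compute: impl FnOnce() -> crate::Result<RoaringBitmap>,
) -> crate::Result<&'c RoaringBitmap> {
    if !cache.contains_key(&key) {
        let docids = compute()?;
        cache.insert(key, docids);
    }
    Ok(&cache[&key])
}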
- docids.push(resolve_phrase(self, synonym)?); - } - if let Some(split_words) = split_words { - docids.push(resolve_phrase(self, *split_words)?); - } - - MultiOps::union(docids) - } - }; - let _ = self.node_docids_cache.cache.insert(node_idx, docids); - let docids = &self.node_docids_cache.cache[&node_idx]; + let docids = resolve_phrase(index, txn, db_cache, word_interner, phrase_interner, phrase)?; + let _ = self.phrases.insert(phrase, docids); + let docids = &self.phrases[&phrase]; Ok(docids) } + + /// Get the document ids associated with the given word derivations + pub fn get_word_derivations_docids<'s, 'search>( + &'s mut self, + index: &Index, + txn: &'search RoTxn, + db_cache: &mut DatabaseCache<'search>, + word_interner: &Interner, + derivations_interner: &Interner, + phrase_interner: &Interner, + derivations: Interned, + ) -> Result<&'s RoaringBitmap> { + if self.derivations.contains_key(&derivations) { + return Ok(&self.derivations[&derivations]); + }; + let WordDerivations { + original, + synonyms, + split_words, + zero_typo, + one_typo, + two_typos, + use_prefix_db, + } = derivations_interner.get(derivations); + let mut or_docids = vec![]; + for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()).copied() { + if let Some(word_docids) = db_cache.get_word_docids(index, txn, word_interner, word)? { + or_docids.push(word_docids); + } + } + if *use_prefix_db { + // TODO: this will change if we decide to change from (original, zero_typo) to: + // (debug_original, prefix_of, zero_typo) + if let Some(prefix_docids) = + db_cache.get_word_prefix_docids(index, txn, word_interner, *original)? + { + or_docids.push(prefix_docids); + } + } + let mut docids = or_docids + .into_iter() + .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()) + .collect::>(); + for synonym in synonyms.iter().copied() { + // TODO: cache resolve_phrase? + docids.push(resolve_phrase( + index, + txn, + db_cache, + word_interner, + phrase_interner, + synonym, + )?); + } + if let Some(split_words) = split_words { + docids.push(resolve_phrase( + index, + txn, + db_cache, + word_interner, + phrase_interner, + *split_words, + )?); + } + + let docids = MultiOps::union(docids); + let _ = self.derivations.insert(derivations, docids); + let docids = &self.derivations[&derivations]; + Ok(docids) + } + + /// Get the document ids associated with the given query term. 
+ fn get_query_term_docids<'s, 'search>( + &'s mut self, + index: &Index, + txn: &'search RoTxn, + db_cache: &mut DatabaseCache<'search>, + word_interner: &Interner, + derivations_interner: &Interner, + phrase_interner: &Interner, + term: &QueryTerm, + ) -> Result<&'s RoaringBitmap> { + match *term { + QueryTerm::Phrase { phrase } => { + self.get_phrase_docids(index, txn, db_cache, word_interner, phrase_interner, phrase) + } + QueryTerm::Word { derivations } => self.get_word_derivations_docids( + index, + txn, + db_cache, + word_interner, + derivations_interner, + phrase_interner, + derivations, + ), + } + } } pub fn resolve_query_graph<'search>( @@ -76,14 +142,23 @@ pub fn resolve_query_graph<'search>( q: &QueryGraph, universe: &RoaringBitmap, ) -> Result { - // TODO: there is definitely a faster way to compute this big + let SearchContext { + index, + txn, + db_cache, + word_interner, + phrase_interner, + derivations_interner, + query_term_docids, + } = ctx; + // TODO: there is a faster way to compute this big // roaring bitmap expression - let mut nodes_resolved = SmallBitmap::new(64); + let mut nodes_resolved = SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT); let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()]; let mut next_nodes_to_visit = VecDeque::new(); - next_nodes_to_visit.push_front(q.root_node); + next_nodes_to_visit.push_back(q.root_node); while let Some(node) = next_nodes_to_visit.pop_front() { let predecessors = &q.edges[node as usize].predecessors; @@ -101,8 +176,15 @@ pub fn resolve_query_graph<'search>( let node_docids = match n { QueryNode::Term(located_term) => { - let term = &located_term.value; - let derivations_docids = ctx.get_node_docids(term, node)?; + let derivations_docids = query_term_docids.get_query_term_docids( + index, + txn, + db_cache, + word_interner, + derivations_interner, + phrase_interner, + &located_term.value, + )?; predecessors_docids & derivations_docids } QueryNode::Deleted => { @@ -122,19 +204,24 @@ pub fn resolve_query_graph<'search>( } } - // This is currently slow but could easily be implemented very efficiently for prec in q.edges[node as usize].predecessors.iter() { if q.edges[prec as usize].successors.is_subset(&nodes_resolved) { path_nodes_docids[prec as usize].clear(); } } } - panic!() } -pub fn resolve_phrase(ctx: &mut SearchContext, phrase: Interned) -> Result { - let Phrase { words } = ctx.phrase_interner.get(phrase).clone(); +pub fn resolve_phrase<'search>( + index: &Index, + txn: &'search RoTxn, + db_cache: &mut DatabaseCache<'search>, + word_interner: &Interner, + phrase_interner: &Interner, + phrase: Interned, +) -> Result { + let Phrase { words } = phrase_interner.get(phrase).clone(); let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = words.len().min(3); @@ -158,7 +245,14 @@ pub fn resolve_phrase(ctx: &mut SearchContext, phrase: Interned) -> Resu .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) { if dist == 0 { - match ctx.get_word_pair_proximity_docids(s1, s2, 1)? { + match db_cache.get_word_pair_proximity_docids( + index, + txn, + word_interner, + s1, + s2, + 1, + )? { Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?), // If there are no documents for this pair, there will be no // results for the phrase query. 
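// Worked illustration of the sliding-window resolution above, assuming the
// phrase ["new", "york", "city"]: within a window, each ordered pair is
// checked at proximity 1 when the words are adjacent (dist == 0), and at the
// union of proximities 1..=dist+1 otherwise; the phrase docids are the
// intersection of all those pair bitmaps. A hypothetical helper listing the
// (left, right, dist) triples of one window:
fn window_pairs<'a>(window: &[&'a str]) -> Vec<(&'a str, &'a str, usize)> {
    let mut out = Vec::new();
    for (i, left) in window.iter().enumerate() {
        for (dist, right) in window[i + 1..].iter().enumerate() {
            out.push((*left, *right, dist));
        }
    }
    out
}
// window_pairs(&["new", "york", "city"]) yields
// [("new", "york", 0), ("new", "city", 1), ("york", "city", 0)].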
@@ -167,9 +261,14 @@ pub fn resolve_phrase(ctx: &mut SearchContext, phrase: Interned) -> Resu } else { let mut bitmap = RoaringBitmap::new(); for dist in 0..=dist { - if let Some(m) = - ctx.get_word_pair_proximity_docids(s1, s2, dist as u8 + 1)? - { + if let Some(m) = db_cache.get_word_pair_proximity_docids( + index, + txn, + word_interner, + s1, + s2, + dist as u8 + 1, + )? { bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?; } } diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index fea5a5684..48a2e02fc 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -18,13 +18,6 @@ impl SmallBitmap { } s } - pub fn from_array(xs: &[u16], universe_length: u16) -> Self { - let mut s = Self::new(universe_length); - for x in xs { - s.insert(*x); - } - s - } pub fn is_empty(&self) -> bool { match self { SmallBitmap::Tiny(set) => *set == 0, @@ -81,27 +74,6 @@ impl SmallBitmap { }; *set &= !(0b1 << x); } - // fn iter_single(mut set: u64, mut visit: impl FnMut(u16) -> Result<()>) -> Result<()> { - // while set > 0 { - // let idx = set.trailing_zeros() as u16; - // visit(idx)?; - // set &= set - 1; - // } - // Ok(()) - // } - // pub fn iter(&self, mut visit: impl FnMut(u16) -> Result<()>) -> Result<()> { - // match self { - // SmallBitmap::Tiny(set) => Self::iter_single(*set, &mut visit), - // SmallBitmap::Small(sets) => { - // let mut base = 0; - // for set in sets.iter() { - // Self::iter_single(*set, |x| visit(base + x))?; - // base += 64; - // } - // Ok(()) - // } - // } - // } pub fn intersection(&mut self, other: &SmallBitmap) { self.apply_op(other, |a, b| *a &= b); diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index d5a6276ad..e43a9e8aa 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -1,10 +1,31 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; -use super::{ - RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper, - RankingRuleQueryTrait, SearchContext, -}; +use super::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait, SearchContext}; + +pub trait RankingRuleOutputIter<'search, Query> { + fn next_bucket(&mut self) -> Result>>; +} + +pub struct RankingRuleOutputIterWrapper<'search, Query> { + iter: Box>> + 'search>, +} +impl<'search, Query> RankingRuleOutputIterWrapper<'search, Query> { + pub fn new(iter: Box>> + 'search>) -> Self { + Self { iter } + } +} +impl<'search, Query> RankingRuleOutputIter<'search, Query> + for RankingRuleOutputIterWrapper<'search, Query> +{ + fn next_bucket(&mut self) -> Result>> { + match self.iter.next() { + Some(x) => x.map(Some), + None => Ok(None), + } + } +} + use crate::{ // facet::FacetType, heed_codec::{facet::FacetGroupKeyCodec, ByteSliceRefCodec}, From 78b9304d52cf106cec5dcd9ba7a51aca56439357 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 9 Mar 2023 15:20:29 +0100 Subject: [PATCH 049/234] Implement distinct attribute --- milli/src/search/new/distinct.rs | 119 ++++++++++++++++++++++++ milli/src/search/new/logger/detailed.rs | 38 ++++---- milli/src/search/new/mod.rs | 2 +- milli/src/search/new/ranking_rules.rs | 54 +++++++---- 4 files changed, 176 insertions(+), 37 deletions(-) create mode 100644 milli/src/search/new/distinct.rs diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs new file mode 100644 index 000000000..9ee1746df --- /dev/null +++ b/milli/src/search/new/distinct.rs @@ -0,0 +1,119 @@ +use heed::{ + 
types::{ByteSlice, Str, Unit}, + Database, RoPrefix, RoTxn, +}; +use roaring::RoaringBitmap; + +const FID_SIZE: usize = 2; +const DOCID_SIZE: usize = 4; + +use crate::{ + heed_codec::{ + facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec}, + ByteSliceRefCodec, + }, + Index, Result, SearchContext, +}; + +pub struct DistinctOutput { + pub remaining: RoaringBitmap, + pub excluded: RoaringBitmap, +} + +pub fn apply_distinct_rule<'search>( + ctx: &mut SearchContext<'search>, + field_id: u16, + candidates: &RoaringBitmap, +) -> Result { + let mut excluded = RoaringBitmap::new(); + let mut remaining = RoaringBitmap::new(); + for docid in candidates { + if excluded.contains(docid) { + continue; + } + distinct_single_docid(ctx.index, ctx.txn, field_id, docid, &mut excluded)?; + remaining.push(docid); + } + Ok(DistinctOutput { remaining, excluded }) +} + +fn distinct_single_docid( + index: &Index, + txn: &RoTxn, + field_id: u16, + docid: u32, + excluded: &mut RoaringBitmap, +) -> Result<()> { + for item in facet_string_values(docid, field_id, index, txn)? { + let ((_, _, facet_value), _) = item?; + if let Some(facet_docids) = facet_value_docids( + index.facet_id_string_docids.remap_types(), + txn, + field_id, + facet_value, + )? { + *excluded |= facet_docids; + } + } + for item in facet_number_values(docid, field_id, index, txn)? { + let ((_, _, facet_value), _) = item?; + if let Some(facet_docids) = facet_value_docids( + index.facet_id_string_docids.remap_types(), + txn, + field_id, + facet_value, + )? { + *excluded |= facet_docids; + } + } + Ok(()) +} + +fn facet_value_docids( + database: Database, FacetGroupValueCodec>, + txn: &RoTxn, + field_id: u16, + facet_value: &[u8], +) -> heed::Result> { + database + .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value }) + .map(|opt| opt.map(|v| v.bitmap)) +} +fn facet_number_values<'a>( + id: u32, + distinct: u16, + index: &Index, + txn: &'a RoTxn, +) -> Result, Unit>> { + let key = facet_values_prefix_key(distinct, id); + + let iter = index + .field_id_docid_facet_f64s + .remap_key_type::() + .prefix_iter(txn, &key)? + .remap_key_type(); + + Ok(iter) +} + +fn facet_string_values<'a>( + docid: u32, + distinct: u16, + index: &Index, + txn: &'a RoTxn, +) -> Result, Str>> { + let key = facet_values_prefix_key(distinct, docid); + + let iter = index + .field_id_docid_facet_strings + .remap_key_type::() + .prefix_iter(txn, &key)? + .remap_types(); + + Ok(iter) +} + +#[allow(clippy::drop_non_drop)] +fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] { + concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) +} diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 5929f406c..ef33bdbf9 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -432,27 +432,23 @@ results.{random} {{ file: &mut File, ) { match &node { - QueryNode::Term(LocatedQueryTerm { value, .. }) => match ctx - .query_term_interner - .get(*value) - { + QueryNode::Term(LocatedQueryTerm { value, .. 
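// One detail worth flagging in `distinct_single_docid` above: both loops look
// facet values up in `index.facet_id_string_docids`, including the loop over
// `facet_number_values`. Unless number values are mirrored into the string
// database (nothing in this patch suggests they are), the second lookup
// presumably wants the f64 database instead, along these lines:
//
//     for item in facet_number_values(docid, field_id, index, txn)? {
//         let ((_, _, facet_value), _) = item?;
//         if let Some(facet_docids) = facet_value_docids(
//             index.facet_id_f64_docids.remap_types(),
//             txn,
//             field_id,
//             facet_value,
//         )? {
//             *excluded |= facet_docids;
//         }
//     }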
}) => match value { QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); let phrase_str = phrase.description(&ctx.word_interner); writeln!(file, "{node_idx} : \"{phrase_str}\"").unwrap(); } - QueryTerm::Word { - derivations: - WordDerivations { - original, - zero_typo, - one_typo, - two_typos, - use_prefix_db, - synonyms, - split_words, - }, - } => { + QueryTerm::Word { derivations } => { + let WordDerivations { + original, + zero_typo, + one_typo, + two_typos, + use_prefix_db, + synonyms, + split_words, + } = ctx.derivations_interner.get(*derivations); + let original = ctx.word_interner.get(*original); writeln!( file, @@ -596,12 +592,13 @@ shape: class" graph.edges_store[edge_idx as usize].as_ref().unwrap(); let source_node = &graph.query_graph.nodes[*source_node as usize]; let source_node_desc = match source_node { - QueryNode::Term(term) => match ctx.query_term_interner.get(term.value) { + QueryNode::Term(term) => match term.value { QueryTerm::Phrase { phrase } => { - let phrase = ctx.phrase_interner.get(*phrase); + let phrase = ctx.phrase_interner.get(phrase); phrase.description(&ctx.word_interner) } QueryTerm::Word { derivations } => { + let derivations = ctx.derivations_interner.get(derivations); ctx.word_interner.get(derivations.original).to_owned() } }, @@ -611,12 +608,13 @@ shape: class" }; let dest_node = &graph.query_graph.nodes[*dest_node as usize]; let dest_node_desc = match dest_node { - QueryNode::Term(term) => match ctx.query_term_interner.get(term.value) { + QueryNode::Term(term) => match term.value { QueryTerm::Phrase { phrase } => { - let phrase = ctx.phrase_interner.get(*phrase); + let phrase = ctx.phrase_interner.get(phrase); phrase.description(&ctx.word_interner) } QueryTerm::Word { derivations } => { + let derivations = ctx.derivations_interner.get(derivations); ctx.word_interner.get(derivations.original).to_owned() } }, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 3975dd4b6..125e2b1e0 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,11 +1,11 @@ mod db_cache; +mod distinct; mod graph_based_ranking_rule; mod interner; mod logger; mod query_graph; mod query_term; mod ranking_rule_graph; - mod ranking_rules; mod resolve_query_graph; // TODO: documentation + comments diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index d65610f9d..350eed58f 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -3,6 +3,7 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::{QueryGraph, SearchContext}; // use crate::search::new::sort::Sort; +use crate::search::new::distinct::{apply_distinct_rule, DistinctOutput}; use crate::Result; /// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`] @@ -80,6 +81,12 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( logger.ranking_rules(&ranking_rules); + let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? 
{ + ctx.index.fields_ids_map(ctx.txn)?.id(field) + } else { + None + }; + if universe.len() < from as u64 { return Ok(vec![]); } @@ -88,8 +95,9 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe); ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?; - let mut candidates: Vec = vec![RoaringBitmap::default(); ranking_rules_len]; - candidates[0] = universe.clone(); + let mut ranking_rule_universes: Vec = + vec![RoaringBitmap::default(); ranking_rules_len]; + ranking_rule_universes[0] = universe.clone(); let mut cur_ranking_rule_index = 0; @@ -98,13 +106,13 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( /// Update the candidates accordingly and inform the logger. macro_rules! back { () => { - assert!(candidates[cur_ranking_rule_index].is_empty()); + assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); logger.end_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], - &candidates[cur_ranking_rule_index], + &ranking_rule_universes[cur_ranking_rule_index], ); - candidates[cur_ranking_rule_index].clear(); + ranking_rule_universes[cur_ranking_rule_index].clear(); ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger); if cur_ranking_rule_index == 0 { break; @@ -117,22 +125,35 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( let mut results = vec![]; let mut cur_offset = 0usize; - /// Add the candidates to the results. Take the `from`, `limit`, and `cur_offset` + /// Add the candidates to the results. Take `distinct`, `from`, `limit`, and `cur_offset` /// into account and inform the logger. macro_rules! maybe_add_to_results { ($candidates:expr) => { - let candidates = $candidates; + // First apply the distinct rule on the candidates, reducing the universes if necessary + let candidates = if let Some(distinct_fid) = distinct_fid { + let DistinctOutput { remaining, excluded } = apply_distinct_rule(ctx, distinct_fid, $candidates)?; + for universe in ranking_rule_universes.iter_mut() { + *universe -= &excluded; + } + remaining + } else { + $candidates.clone() + }; let len = candidates.len(); // if the candidates are empty, there is nothing to do; if !candidates.is_empty() { + // if we still haven't reached the first document to return if cur_offset < from { + // and if no document from this bucket can be returned if cur_offset + (candidates.len() as usize) < from { + // then just skip the bucket logger.skip_bucket_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates, ); } else { + // otherwise, skip some of the documents and add some of the rest, in order of ids let all_candidates = candidates.iter().collect::>(); let (skipped_candidates, candidates) = all_candidates.split_at(from - cur_offset); @@ -150,6 +171,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( results.extend(&candidates); } } else { + // if we have passed the offset already, add some of the documents (up to the limit) let candidates = candidates.iter().take(length - results.len()).collect::>(); logger.add_to_results(&candidates); @@ -162,14 +184,14 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( while results.len() < length { // The universe for this bucket is zero or one element, so we don't need to sort // anything, just extend the results and go back to the parent ranking rule. 
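// The `maybe_add_to_results!` arithmetic above, extracted into a plain
// function for clarity (hypothetical helper, not part of the patch): given a
// bucket of `len` candidates whose first document sits at absolute offset
// `cur_offset`, keep the part overlapping the requested window
// [from, from + length).
fn kept_range(cur_offset: usize, len: usize, from: usize, length: usize) -> std::ops::Range<usize> {
    let start = from.saturating_sub(cur_offset).min(len);
    let end = (from + length).saturating_sub(cur_offset).min(len);
    start..end
}
// kept_range(0, 10, 5, 20) == 5..10: the first five candidates of the first
// bucket are skipped, the remaining five are returned.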
- if candidates[cur_ranking_rule_index].len() <= 1 { - maybe_add_to_results!(&candidates[cur_ranking_rule_index]); - candidates[cur_ranking_rule_index].clear(); + if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { + maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); + ranking_rule_universes[cur_ranking_rule_index].clear(); back!(); continue; } - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &candidates[cur_ranking_rule_index])? else { + let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? else { back!(); continue; }; @@ -177,12 +199,12 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( logger.next_bucket_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], - &candidates[cur_ranking_rule_index], + &ranking_rule_universes[cur_ranking_rule_index], &next_bucket.candidates, ); - assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates)); - candidates[cur_ranking_rule_index] -= &next_bucket.candidates; + assert!(ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)); + ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; if cur_ranking_rule_index == ranking_rules_len - 1 || next_bucket.candidates.len() <= 1 @@ -193,12 +215,12 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( } cur_ranking_rule_index += 1; - candidates[cur_ranking_rule_index] = next_bucket.candidates.clone(); + ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); logger.start_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &next_bucket.query, - &candidates[cur_ranking_rule_index], + &ranking_rule_universes[cur_ranking_rule_index], ); ranking_rules[cur_ranking_rule_index].start_iteration( ctx, From 9ec9c204d3a65afbcc8562ced7faa24b803c759f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 9 Mar 2023 15:53:59 +0100 Subject: [PATCH 050/234] Small code cleanup --- milli/src/search/new/query_term.rs | 16 +++++++--------- milli/src/search/new/resolve_query_graph.rs | 1 + 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 1b709d0e9..5fffe6653 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -80,7 +80,7 @@ pub fn word_derivations( word: &str, max_typo: u8, is_prefix: bool, -) -> Result { +) -> Result> { let fst = ctx.index.words_fst(ctx.txn)?; let word_interned = ctx.word_interner.insert(word.to_owned()); @@ -185,7 +185,7 @@ pub fn word_derivations( }) .collect(); - Ok(WordDerivations { + let interned = ctx.derivations_interner.insert(WordDerivations { original: ctx.word_interner.insert(word.to_owned()), synonyms, split_words, @@ -193,7 +193,9 @@ pub fn word_derivations( one_typo: one_typo.into_boxed_slice(), two_typos: two_typos.into_boxed_slice(), use_prefix_db, - }) + }); + + Ok(interned) } /// Split the original word into the two words that appear the @@ -342,9 +344,7 @@ pub fn located_query_terms_from_string<'search>( let word = token.lemma(); let derivations = word_derivations(ctx, word, nbr_typos(word), false)?; let located_term = LocatedQueryTerm { - value: QueryTerm::Word { - derivations: ctx.derivations_interner.insert(derivations), - }, + value: QueryTerm::Word { derivations }, positions: position..=position, }; 
located_terms.push(located_term); @@ -355,9 +355,7 @@ pub fn located_query_terms_from_string<'search>( let word = token.lemma(); let derivations = word_derivations(ctx, word, nbr_typos(word), true)?; let located_term = LocatedQueryTerm { - value: QueryTerm::Word { - derivations: ctx.derivations_interner.insert(derivations), - }, + value: QueryTerm::Word { derivations }, positions: position..=position, }; located_terms.push(located_term); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 4606b9226..b70b01c34 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -150,6 +150,7 @@ pub fn resolve_query_graph<'search>( phrase_interner, derivations_interner, query_term_docids, + .. } = ctx; // TODO: there is a faster way to compute this big // roaring bitmap expression From 5155fd2bf1fd50b2b02bac4941f6cae2f2647065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 13 Mar 2023 09:52:17 +0100 Subject: [PATCH 051/234] Reorganise initialisation of ranking rules + rename PathsMap -> PathSet --- .../search/new/graph_based_ranking_rule.rs | 15 +- milli/src/search/new/logger/detailed.rs | 2 +- milli/src/search/new/logger/mod.rs | 4 +- milli/src/search/new/mod.rs | 184 +++++++++++++++--- .../ranking_rule_graph/empty_paths_cache.rs | 4 +- .../src/search/new/ranking_rule_graph/mod.rs | 2 +- .../{paths_map.rs => path_set.rs} | 0 milli/src/search/new/ranking_rules.rs | 22 +-- 8 files changed, 186 insertions(+), 47 deletions(-) rename milli/src/search/new/ranking_rule_graph/{paths_map.rs => path_set.rs} (100%) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index d8f881b07..f35d024cc 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -40,12 +40,25 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::ranking_rule_graph::{ - EdgeDocidsCache, EmptyPathsCache, RankingRuleGraph, RankingRuleGraphTrait, + EdgeDocidsCache, EmptyPathsCache, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, ProximityGraph, }; use super::small_bitmap::SmallBitmap; use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::Result; +pub type Proximity = GraphBasedRankingRule; +impl Default for GraphBasedRankingRule { + fn default() -> Self { + Self::new("proximity".to_owned()) + } +} +pub type Typo = GraphBasedRankingRule; +impl Default for GraphBasedRankingRule { + fn default() -> Self { + Self::new("typo".to_owned()) + } +} + /// A generic graph-based ranking rule pub struct GraphBasedRankingRule { id: String, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index ef33bdbf9..ba443752d 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -98,7 +98,7 @@ impl SearchLogger for DetailedSearchLogger { fn initial_universe(&mut self, universe: &RoaringBitmap) { self.initial_universe = Some(universe.clone()); } - fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]) { + fn ranking_rules(&mut self, rr: &[Box>]) { self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect()); } diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index d4d64f844..c5f3e5351 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -19,7 +19,7 @@ pub trait 
SearchLogger { fn initial_universe(&mut self, universe: &RoaringBitmap); /// Logs the ranking rules used to perform the search query - fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]); + fn ranking_rules(&mut self, rr: &[Box>]); /// Logs the start of a ranking rule's iteration. fn start_iteration_ranking_rule<'transaction>( @@ -90,7 +90,7 @@ impl SearchLogger for DefaultSearchLogger { fn initial_universe(&mut self, _universe: &RoaringBitmap) {} - fn ranking_rules(&mut self, _rr: &[&mut dyn RankingRule]) {} + fn ranking_rules(&mut self, _rr: &[Box>]) {} fn start_iteration_ranking_rule<'transaction>( &mut self, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 125e2b1e0..18c51f4a4 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -17,7 +17,7 @@ mod words; pub use logger::{DefaultSearchLogger, SearchLogger}; -use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashSet}; use charabia::Tokenize; use db_cache::DatabaseCache; @@ -28,10 +28,10 @@ use roaring::RoaringBitmap; use self::interner::Interner; use self::query_term::{Phrase, WordDerivations}; +use self::ranking_rules::PlaceholderQuery; use self::resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; -use crate::search::new::graph_based_ranking_rule::GraphBasedRankingRule; +use crate::search::new::graph_based_ranking_rule::{Proximity, Typo}; use crate::search::new::query_term::located_query_terms_from_string; -use crate::search::new::ranking_rule_graph::{ProximityGraph, TypoGraph}; use crate::search::new::words::Words; use crate::{Filter, Index, Result, TermsMatchingStrategy}; @@ -88,7 +88,9 @@ fn resolve_maximally_reduced_query_graph<'search>( TermsMatchingStrategy::All => vec![], }; // don't remove the first term - positions_to_remove.remove(0); + if !positions_to_remove.is_empty() { + positions_to_remove.remove(0); + } loop { if positions_to_remove.is_empty() { break; @@ -102,48 +104,172 @@ fn resolve_maximally_reduced_query_graph<'search>( Ok(docids) } +fn get_ranking_rules_for_placeholder_search<'search>( + ctx: &SearchContext<'search>, +) -> Result>>> { + // let sort = false; + // let mut asc = HashSet::new(); + // let mut desc = HashSet::new(); + let /*mut*/ ranking_rules: Vec>> = vec![]; + let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; + for rr in settings_ranking_rules { + // Add Words before any of: typo, proximity, attribute, exactness + match rr { + crate::Criterion::Words + | crate::Criterion::Typo + | crate::Criterion::Attribute + | crate::Criterion::Proximity + | crate::Criterion::Exactness => continue, + crate::Criterion::Sort => todo!(), + crate::Criterion::Asc(_) => todo!(), + crate::Criterion::Desc(_) => todo!(), + } + } + Ok(ranking_rules) +} +fn get_ranking_rules_for_query_graph_search<'search>( + ctx: &SearchContext<'search>, + terms_matching_strategy: TermsMatchingStrategy, +) -> Result>>> { + // query graph search + let mut words = false; + let mut typo = false; + let mut proximity = false; + let sort = false; + let attribute = false; + let exactness = false; + let mut asc = HashSet::new(); + let mut desc = HashSet::new(); + + let mut ranking_rules: Vec>> = vec![]; + let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; + for rr in settings_ranking_rules { + // Add Words before any of: typo, proximity, attribute, exactness + match rr { + crate::Criterion::Typo + | crate::Criterion::Attribute + | crate::Criterion::Proximity + | crate::Criterion::Exactness => { + if !words { + 
ranking_rules.push(Box::new(Words::new(terms_matching_strategy))); + words = true; + } + } + _ => {} + } + match rr { + crate::Criterion::Words => { + if words { + continue; + } + ranking_rules.push(Box::new(Words::new(terms_matching_strategy))); + words = true; + } + crate::Criterion::Typo => { + if typo { + continue; + } + typo = true; + ranking_rules.push(Box::::default()); + } + crate::Criterion::Proximity => { + if proximity { + continue; + } + proximity = true; + ranking_rules.push(Box::::default()); + } + crate::Criterion::Attribute => { + if attribute { + continue; + } + todo!(); + // attribute = false; + } + crate::Criterion::Sort => { + if sort { + continue; + } + todo!(); + // sort = false; + } + crate::Criterion::Exactness => { + if exactness { + continue; + } + todo!(); + // exactness = false; + } + crate::Criterion::Asc(field) => { + if asc.contains(&field) { + continue; + } + asc.insert(field); + todo!(); + } + crate::Criterion::Desc(field) => { + if desc.contains(&field) { + continue; + } + desc.insert(field); + todo!(); + } + } + } + Ok(ranking_rules) +} #[allow(clippy::too_many_arguments)] pub fn execute_search<'search>( ctx: &mut SearchContext<'search>, query: &str, + terms_matching_strategy: TermsMatchingStrategy, filters: Option, from: usize, length: usize, - logger: &mut dyn SearchLogger, + placeholder_search_logger: &mut dyn SearchLogger, + query_graph_logger: &mut dyn SearchLogger, ) -> Result> { assert!(!query.is_empty()); let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None)?; let graph = QueryGraph::from_query(ctx, query_terms)?; - logger.initial_query(&graph); - let universe = if let Some(filters) = filters { filters.evaluate(ctx.txn, ctx.index)? } else { ctx.index.documents_ids(ctx.txn)? }; - let universe = resolve_maximally_reduced_query_graph( - ctx, - &universe, - &graph, - TermsMatchingStrategy::Last, - logger, - )?; - // TODO: create ranking rules here + // TODO: other way to tell whether it is a placeholder search + // This way of doing things is not correct because if someone searches + // for a word that does not appear in any document, the word will be removed + // from the graph and thus its number of nodes will be == 2 + // But in that case, we should return no results. + // + // The search is a placeholder search only if there are no tokens? 
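// The more direct check the comment above contemplates, as a hypothetical
// helper: decide on the tokenized terms rather than on the node count of the
// reduced graph, so that a query whose words all miss the index still returns
// no results instead of degrading into a placeholder search.
fn is_placeholder(query_terms: &[LocatedQueryTerm]) -> bool {
    query_terms.is_empty()
}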
+ if graph.nodes.len() > 2 { + let universe = resolve_maximally_reduced_query_graph( + ctx, + &universe, + &graph, + terms_matching_strategy, + query_graph_logger, + )?; - logger.initial_universe(&universe); - - let words = &mut Words::new(TermsMatchingStrategy::Last); - // let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; - let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned()); - let typo = &mut GraphBasedRankingRule::::new("typo".to_owned()); - // TODO: ranking rules given as argument - let ranking_rules: Vec<&mut dyn RankingRule<'search, QueryGraph>> = - vec![words, typo, proximity /*sort*/]; - - bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, logger) + let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, terms_matching_strategy)?; + bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger) + } else { + let ranking_rules = get_ranking_rules_for_placeholder_search(ctx)?; + bucket_sort( + ctx, + ranking_rules, + &PlaceholderQuery, + &universe, + from, + length, + placeholder_search_logger, + ) + } } #[cfg(test)] @@ -182,10 +308,11 @@ mod tests { let results = execute_search( &mut ctx, "zero config", + TermsMatchingStrategy::Last, None, 0, 20, - // &mut DefaultSearchLogger, + &mut DefaultSearchLogger, &mut logger, ) .unwrap(); @@ -279,10 +406,11 @@ mod tests { let results = execute_search( &mut ctx, "releases from poison by the government", + TermsMatchingStrategy::Last, None, 0, 20, - // &mut DefaultSearchLogger, + &mut DefaultSearchLogger, &mut logger, ) .unwrap(); diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index deac05502..3178cfe27 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,4 +1,4 @@ -use super::paths_map::PathSet; +use super::path_set::PathSet; use crate::search::new::small_bitmap::SmallBitmap; /// A cache which stores sufficient conditions for a path @@ -10,7 +10,7 @@ pub struct EmptyPathsCache { pub empty_edges: SmallBitmap, /// A set of path prefixes that resolve to no documents. pub empty_prefixes: PathSet, - /// A set of empty couple of edge indexes that resolve to no documents. + /// A set of empty couples of edge indexes that resolve to no documents. 
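// Sketch of why prefix forbidding pays off during path enumeration (the trie
// layout of `PathSet` is assumed rather than shown here): once a prefix is
// known to resolve to no documents, every path extending it can be skipped
// without computing any of its docids.
fn worth_visiting(forbidden_prefixes: &[Vec<u16>], path: &[u16]) -> bool {
    !forbidden_prefixes.iter().any(|prefix| path.starts_with(prefix))
}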
pub empty_couple_edges: Vec, } impl EmptyPathsCache { diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 3f74a3cf5..143554c72 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -9,7 +9,7 @@ mod build; mod cheapest_paths; mod edge_docids_cache; mod empty_paths_cache; -mod paths_map; +mod path_set; /// Implementation of the `proximity` ranking rule mod proximity; diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/path_set.rs similarity index 100% rename from milli/src/search/new/ranking_rule_graph/paths_map.rs rename to milli/src/search/new/ranking_rule_graph/path_set.rs diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 350eed58f..57817fd7e 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -70,16 +70,15 @@ pub struct RankingRuleOutput { pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'search>, - mut ranking_rules: Vec<&mut dyn RankingRule<'search, Q>>, - query_graph: &Q, + mut ranking_rules: Vec>>, + query: &Q, universe: &RoaringBitmap, from: usize, length: usize, logger: &mut dyn SearchLogger, ) -> Result> { - logger.initial_query(query_graph); - logger.ranking_rules(&ranking_rules); + logger.initial_universe(universe); let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? { ctx.index.fields_ids_map(ctx.txn)?.id(field) @@ -92,8 +91,8 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( } let ranking_rules_len = ranking_rules.len(); - logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe); - ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?; + logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe); + ranking_rules[0].start_iteration(ctx, logger, universe, query)?; let mut ranking_rule_universes: Vec = vec![RoaringBitmap::default(); ranking_rules_len]; @@ -109,7 +108,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); logger.end_iteration_ranking_rule( cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index], + ranking_rules[cur_ranking_rule_index].as_ref(), &ranking_rule_universes[cur_ranking_rule_index], ); ranking_rule_universes[cur_ranking_rule_index].clear(); @@ -149,7 +148,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( // then just skip the bucket logger.skip_bucket_ranking_rule( cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index], + ranking_rules[cur_ranking_rule_index].as_ref(), &candidates, ); } else { @@ -159,7 +158,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( all_candidates.split_at(from - cur_offset); logger.skip_bucket_ranking_rule( cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index], + ranking_rules[cur_ranking_rule_index].as_ref(), &skipped_candidates.into_iter().collect(), ); let candidates = candidates @@ -186,7 +185,6 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( // anything, just extend the results and go back to the parent ranking rule. 
if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); - ranking_rule_universes[cur_ranking_rule_index].clear(); back!(); continue; } @@ -198,7 +196,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( logger.next_bucket_ranking_rule( cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index], + ranking_rules[cur_ranking_rule_index].as_ref(), &ranking_rule_universes[cur_ranking_rule_index], &next_bucket.candidates, ); @@ -218,7 +216,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); logger.start_iteration_ranking_rule( cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index], + ranking_rules[cur_ranking_rule_index].as_ref(), &next_bucket.query, &ranking_rule_universes[cur_ranking_rule_index], ); From 1c58cf8426359c46f12a26a8ebef655402920ce1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 13 Mar 2023 12:46:32 +0100 Subject: [PATCH 052/234] Intern ranking rule graph edge conditions as well --- .../search/new/graph_based_ranking_rule.rs | 74 ++++++++++--------- milli/src/search/new/interner.rs | 15 ++++ milli/src/search/new/mod.rs | 5 -- .../search/new/ranking_rule_graph/build.rs | 17 +++-- .../ranking_rule_graph/edge_docids_cache.rs | 50 ++++++------- .../src/search/new/ranking_rule_graph/mod.rs | 44 +++++++---- .../new/ranking_rule_graph/proximity/build.rs | 35 +++++---- .../new/ranking_rule_graph/proximity/mod.rs | 11 ++- .../search/new/ranking_rule_graph/typo/mod.rs | 24 +++--- 9 files changed, 160 insertions(+), 115 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index f35d024cc..9af41b322 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -40,10 +40,11 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::ranking_rule_graph::{ - EdgeDocidsCache, EmptyPathsCache, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, ProximityGraph, + EdgeCondition, EdgeConditionsCache, EmptyPathsCache, ProximityGraph, RankingRuleGraph, + RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; -use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; +use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::Result; pub type Proximity = GraphBasedRankingRule; @@ -78,7 +79,7 @@ pub struct GraphBasedRankingRuleState { /// The current graph graph: RankingRuleGraph, /// Cache to retrieve the docids associated with each edge - edge_docids_cache: EdgeDocidsCache, + edge_conditions_cache: EdgeConditionsCache, /// Cache used to optimistically discard paths that resolve to no documents. 
empty_paths_cache: EmptyPathsCache,
     /// A structure giving the list of possible costs from each node to the end node,
@@ -94,25 +95,27 @@ pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
 fn remove_empty_edges<'search, G: RankingRuleGraphTrait>(
     ctx: &mut SearchContext<'search>,
     graph: &mut RankingRuleGraph<G>,
-    edge_docids_cache: &mut EdgeDocidsCache<G>,
+    edge_docids_cache: &mut EdgeConditionsCache<G>,
     universe: &RoaringBitmap,
     empty_paths_cache: &mut EmptyPathsCache,
 ) -> Result<()> {
     for edge_index in 0..graph.edges_store.len() as u16 {
-        if graph.edges_store[edge_index as usize].is_none() {
+        let Some(edge) = graph.edges_store[edge_index as usize].as_ref() else {
             continue;
-        }
-        let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?;
-        match docids {
-            BitmapOrAllRef::Bitmap(docids) => {
+        };
+        let condition = edge.condition;
+
+        match condition {
+            EdgeCondition::Unconditional => continue,
+            EdgeCondition::Conditional(condition) => {
+                let docids = edge_docids_cache.get_edge_docids(ctx, condition, graph, universe)?;
                 if docids.is_disjoint(universe) {
                     graph.remove_ranking_rule_edge(edge_index);
                     empty_paths_cache.forbid_edge(edge_index);
-                    edge_docids_cache.cache.remove(&edge_index);
+                    edge_docids_cache.cache.remove(&condition);
                     continue;
                 }
             }
-            BitmapOrAllRef::All => continue,
         }
     }
     Ok(())
@@ -132,7 +135,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
         query_graph: &QueryGraph,
     ) -> Result<()> {
         let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?;
-        let mut edge_docids_cache = EdgeDocidsCache::default();
+        let mut edge_docids_cache = EdgeConditionsCache::default();
         let mut empty_paths_cache = EmptyPathsCache::new(graph.edges_store.len() as u16);
 
         // First simplify the graph as much as possible, by computing the docids of the edges
@@ -150,7 +153,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
 
         let state = GraphBasedRankingRuleState {
             graph,
-            edge_docids_cache,
+            edge_conditions_cache: edge_docids_cache,
             empty_paths_cache,
             all_distances,
             cur_distance_idx: 0,
@@ -174,11 +177,11 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
         // should never happen
         let mut state = self.state.take().unwrap();
 
-        // TODO: does this have a real positive performance cost?
+        // TODO: does this have a real positive performance impact?
remove_empty_edges( ctx, &mut state.graph, - &mut state.edge_docids_cache, + &mut state.edge_conditions_cache, universe, &mut state.empty_paths_cache, )?; @@ -201,17 +204,17 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> let GraphBasedRankingRuleState { graph, - edge_docids_cache, + edge_conditions_cache: edge_docids_cache, empty_paths_cache, all_distances, cur_distance_idx: _, } = &mut state; - let original_universe = universe; + // let original_universe = universe; let mut universe = universe.clone(); // TODO: remove this unnecessary clone - let original_graph = graph.clone(); + // let original_graph = graph.clone(); // and this vector as well let mut paths = vec![]; @@ -241,12 +244,15 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> for &edge_index in path { visited_edges.push(edge_index); - let edge_docids = - edge_docids_cache.get_edge_docids(ctx, edge_index, graph, &universe)?; - let edge_docids = match edge_docids { - BitmapOrAllRef::Bitmap(b) => b, - BitmapOrAllRef::All => continue, + let edge = graph.edges_store[edge_index as usize].as_ref().unwrap(); + let condition = match edge.condition { + EdgeCondition::Unconditional => continue, + EdgeCondition::Conditional(condition) => condition, }; + + let edge_docids = + edge_docids_cache.get_edge_docids(ctx, condition, graph, &universe)?; + cached_edge_docids.push((edge_index, edge_docids.clone())); // If the edge is empty, then the path will be empty as well, we update the graph @@ -257,7 +263,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> // 2. remove this edge from the ranking rule graph graph.remove_ranking_rule_edge(edge_index); // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore - edge_docids_cache.cache.remove(&edge_index); + edge_docids_cache.cache.remove(&condition); return Ok(()); } path_docids &= edge_docids; @@ -279,6 +285,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index); } } + // We should maybe instead try to compute: + // 0th & nth & 1st & n-1th & 2nd & etc... return Ok(()); } } @@ -289,15 +297,15 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> }, )?; - G::log_state( - &original_graph, - &paths, - &state.empty_paths_cache, - original_universe, - &state.all_distances, - cost, - logger, - ); + // G::log_state( + // &original_graph, + // &paths, + // &state.empty_paths_cache, + // original_universe, + // &state.all_distances, + // cost, + // logger, + // ); // TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however, // remove nodes and/or terms within nodes that weren't present in any of the paths. 
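// The path enumeration above relies on a simple invariant worth spelling out:
// if any edge of a path resolves to an empty docids set, the whole path
// resolves to an empty set, so it can be discarded before computing any
// intersection. A minimal sketch of that check, assuming a path is a list of
// edge indices (the prefix- and couple-forbidding done by EmptyPathsCache is
// omitted here):
//
// fn path_is_worth_visiting(path: &[u16], forbidden_edges: &[bool]) -> bool {
//     // Skip the expensive docids intersections for any path that goes
//     // through an edge already known to match no document.
//     path.iter().all(|&edge| !forbidden_edges[edge as usize])
// }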
diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs
index e68f3b949..7edef41c8 100644
--- a/milli/src/search/new/interner.rs
+++ b/milli/src/search/new/interner.rs
@@ -14,6 +14,21 @@ impl<T> Interned<T> {
         Self { idx, _phantom: PhantomData }
     }
 }
+
+// TODO: the stable store should be replaced by a bump allocator
+// and the interned value should be a pointer wrapper
+// then we can get its value with `interned.get()` instead of `interner.get(interned)`
+// and as a bonus, its validity is tracked with Rust's lifetime system
+// one problem is that we need two lifetimes: one for the bump allocator, one for the
+// hashmap
+// but that's okay, we can use:
+// ```
+// struct Interner<'bump> {
+//     bump: &'bump Bump,
+//     lookup: FxHashMap
+// }
+// ```
+
 /// An [`Interner`] is used to store a unique copy of a value of type `T`. This value
 /// is then identified by a lightweight index of type [`Interned`], which can
 /// be copied, compared, and hashed efficiently. An immutable reference to the original value
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 18c51f4a4..110cfad64 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -35,11 +35,6 @@ use crate::search::new::query_term::located_query_terms_from_string;
 use crate::search::new::words::Words;
 use crate::{Filter, Index, Result, TermsMatchingStrategy};
 
-pub enum BitmapOrAllRef<'s> {
-    Bitmap(&'s RoaringBitmap),
-    All,
-}
-
 pub struct SearchContext<'search> {
     pub index: &'search Index,
     pub txn: &'search RoTxn<'search>,
diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs
index 49c78a32f..3393f086a 100644
--- a/milli/src/search/new/ranking_rule_graph/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/build.rs
@@ -1,6 +1,7 @@
 use std::collections::HashSet;
 
 use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
+use crate::search::new::interner::Interner;
 use crate::search::new::small_bitmap::SmallBitmap;
 use crate::search::new::{QueryGraph, SearchContext};
 use crate::Result;
@@ -10,6 +11,8 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
     pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result<Self> {
         let QueryGraph { nodes: graph_nodes, edges: graph_edges, ..
} = &query_graph;
 
+        let mut conditions_interner = Interner::default();
+
         let mut edges_store = vec![];
         let mut edges_of_node = vec![];
 
@@ -21,18 +24,22 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
 
             for successor_idx in graph_edges[node_idx].successors.iter() {
                 let dest_node = &graph_nodes[successor_idx as usize];
-                let edges =
-                    G::build_step_visit_destination_node(ctx, dest_node, &source_node_data)?;
+                let edges = G::build_step_visit_destination_node(
+                    ctx,
+                    &mut conditions_interner,
+                    dest_node,
+                    &source_node_data,
+                )?;
                 if edges.is_empty() {
                     continue;
                 }
 
-                for (cost, details) in edges {
+                for (cost, condition) in edges {
                     edges_store.push(Some(Edge {
                         source_node: node_idx as u16,
                         dest_node: successor_idx,
                         cost,
-                        condition: details,
+                        condition,
                     }));
                     new_edges.insert(edges_store.len() as u16 - 1);
                 }
@@ -43,6 +50,6 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
             .map(|edges| SmallBitmap::from_iter(edges.into_iter(), edges_store.len() as u16))
             .collect();
 
-        Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node })
+        Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node, conditions_interner })
     }
 }
diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
index f7bf1b002..416ededc0 100644
--- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
+++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
@@ -3,22 +3,23 @@ use std::marker::PhantomData;
 use fxhash::FxHashMap;
 use roaring::RoaringBitmap;
 
-use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait};
-use crate::search::new::{BitmapOrAllRef, SearchContext};
+use super::{RankingRuleGraph, RankingRuleGraphTrait};
+use crate::search::new::interner::Interned;
+use crate::search::new::SearchContext;
 use crate::Result;
 
 /// A cache storing the document ids associated with each ranking rule edge
-pub struct EdgeDocidsCache<G: RankingRuleGraphTrait> {
+pub struct EdgeConditionsCache<G: RankingRuleGraphTrait> {
     // TODO: should be FxHashMap<Interned<EdgeCondition>, RoaringBitmap>
-    pub cache: FxHashMap<u16, RoaringBitmap>,
+    pub cache: FxHashMap<Interned<G::EdgeCondition>, RoaringBitmap>,
 
     _phantom: PhantomData<G>,
 }
-impl<G: RankingRuleGraphTrait> Default for EdgeDocidsCache<G> {
+impl<G: RankingRuleGraphTrait> Default for EdgeConditionsCache<G> {
     fn default() -> Self {
         Self { cache: Default::default(), _phantom: Default::default() }
     }
 }
-impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
+impl<G: RankingRuleGraphTrait> EdgeConditionsCache<G> {
     /// Retrieve the document ids for the given edge condition.
     ///
     /// If the cache does not yet contain these docids, they are computed
@@ -27,30 +28,25 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
         &'s mut self,
         ctx: &mut SearchContext<'search>,
         // TODO: should be Interned
-        edge_index: u16,
+        interned_edge_condition: Interned<G::EdgeCondition>,
         graph: &RankingRuleGraph<G>,
         // TODO: maybe universe doesn't belong here
         universe: &RoaringBitmap,
-    ) -> Result<BitmapOrAllRef<'s>> {
-        let edge = graph.edges_store[edge_index as usize].as_ref().unwrap();
-
-        match &edge.condition {
-            EdgeCondition::Unconditional => Ok(BitmapOrAllRef::All),
-            EdgeCondition::Conditional(details) => {
-                if self.cache.contains_key(&edge_index) {
-                    // TODO: should we update the bitmap in the cache if the new universe
-                    // reduces it?
-                    // TODO: maybe have a generation: u32 to track every time the universe was
-                    // reduced.
Then only attempt to recompute the intersection when there is a chance
-                    // that edge_docids & universe changed
-                    return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
-                }
-                // TODO: maybe universe doesn't belong here
-                let docids = universe & G::resolve_edge_condition(ctx, details, universe)?;
-                let _ = self.cache.insert(edge_index, docids);
-                let docids = &self.cache[&edge_index];
-                Ok(BitmapOrAllRef::Bitmap(docids))
-            }
+    ) -> Result<&'s RoaringBitmap> {
+        if self.cache.contains_key(&interned_edge_condition) {
+            // TODO: should we update the bitmap in the cache if the new universe
+            // reduces it?
+            // TODO: maybe have a generation: u32 to track every time the universe was
+            // reduced. Then only attempt to recompute the intersection when there is a chance
+            // that edge_docids & universe changed
+            return Ok(&self.cache[&interned_edge_condition]);
         }
+        // TODO: maybe universe doesn't belong here
+        let edge_condition = graph.conditions_interner.get(interned_edge_condition);
+        // TODO: faster way to do this?
+        let docids = universe & G::resolve_edge_condition(ctx, edge_condition, universe)?;
+        let _ = self.cache.insert(interned_edge_condition, docids);
+        let docids = &self.cache[&interned_edge_condition];
+        Ok(docids)
     }
 }
diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs
index 143554c72..8aa29a8b7 100644
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@@ -16,12 +16,15 @@ mod proximity;
 /// Implementation of the `typo` ranking rule
 mod typo;
 
-pub use edge_docids_cache::EdgeDocidsCache;
+use std::hash::Hash;
+
+pub use edge_docids_cache::EdgeConditionsCache;
 pub use empty_paths_cache::EmptyPathsCache;
 pub use proximity::ProximityGraph;
 use roaring::RoaringBitmap;
 pub use typo::TypoGraph;
 
+use super::interner::{Interned, Interner};
 use super::logger::SearchLogger;
 use super::small_bitmap::SmallBitmap;
 use super::{QueryGraph, QueryNode, SearchContext};
@@ -36,10 +39,20 @@ use crate::Result;
 /// proximity ranking rule, the condition could be that a word is N-close to another one.
 /// When the edge is traversed, some database operations are executed to retrieve the set
 /// of documents that satisfy the condition, which reduces the list of candidate document ids.
-#[derive(Debug, Clone)]
 pub enum EdgeCondition<E> {
     Unconditional,
-    Conditional(E),
+    Conditional(Interned<E>),
+}
+
+impl<E> Copy for EdgeCondition<E> {}
+
+impl<E> Clone for EdgeCondition<E> {
+    fn clone(&self) -> Self {
+        match self {
+            Self::Unconditional => Self::Unconditional,
+            Self::Conditional(arg0) => Self::Conditional(*arg0),
+        }
+    }
 }
 
 /// An edge in the ranking rule graph.
@@ -48,7 +61,7 @@ pub enum EdgeCondition<E> {
 /// 1. The source and destination nodes
 /// 2. The cost of traversing this edge
 /// 3. The condition associated with it
-#[derive(Debug, Clone)]
+#[derive(Clone)]
 pub struct Edge<E> {
     pub source_node: u16,
     pub dest_node: u16,
@@ -106,7 +119,7 @@ pub trait RankingRuleGraphTrait: Sized {
     /// The condition of an edge connecting two query nodes. The condition
     /// should be sufficient to compute the edge's cost and associated document ids
     /// in [`resolve_edge_condition`](RankingRuleGraphTrait::resolve_edge_condition).
-    type EdgeCondition: Sized + Clone;
+    type EdgeCondition: Sized + Clone + PartialEq + Eq + Hash;
 
     /// A structure used in the construction of the graph, created when a
    /// query graph source node is visited. 
It is used to determine the cost
@@ -138,6 +151,7 @@ pub trait RankingRuleGraphTrait: Sized {
     /// (with [`build_step_visit_source_node`](RankingRuleGraphTrait::build_step_visit_source_node)) to `dest_node`.
     fn build_step_visit_destination_node<'from_data, 'search: 'from_data>(
         ctx: &mut SearchContext<'search>,
+        conditions_interner: &mut Interner<Self::EdgeCondition>,
         dest_node: &QueryNode,
         source_node_data: &'from_data Self::BuildVisitedFromNode,
     ) -> Result<Vec<(u8, EdgeCondition<Self::EdgeCondition>)>>;
@@ -161,16 +175,18 @@ pub struct RankingRuleGraph<G: RankingRuleGraphTrait> {
     pub query_graph: QueryGraph,
     pub edges_store: Vec<Option<Edge<G::EdgeCondition>>>,
     pub edges_of_node: Vec<SmallBitmap>,
+    pub conditions_interner: Interner<G::EdgeCondition>,
 }
-impl<G: RankingRuleGraphTrait> Clone for RankingRuleGraph<G> {
-    fn clone(&self) -> Self {
-        Self {
-            query_graph: self.query_graph.clone(),
-            edges_store: self.edges_store.clone(),
-            edges_of_node: self.edges_of_node.clone(),
-        }
-    }
-}
+// impl<G: RankingRuleGraphTrait> Clone for RankingRuleGraph<G> {
+//     fn clone(&self) -> Self {
+//         Self {
+//             query_graph: self.query_graph.clone(),
+//             edges_store: self.edges_store.clone(),
+//             edges_of_node: self.edges_of_node.clone(),
+//             conditions_interner: self.conditions_interner.clone(),
+//         }
+//     }
+// }
 impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
     /// Remove the given edge from the ranking rule graph
     pub fn remove_ranking_rule_edge(&mut self, edge_index: u16) {
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
index 6caa4a769..192b74faf 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
@@ -3,6 +3,7 @@ use std::collections::BTreeMap;
 use itertools::Itertools;
 
 use super::ProximityEdge;
+use crate::search::new::interner::Interner;
 use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
 use crate::search::new::ranking_rule_graph::proximity::WordPair;
 use crate::search::new::ranking_rule_graph::EdgeCondition;
@@ -59,6 +60,7 @@ pub fn visit_from_node(
     }))
 }
 
 pub fn visit_to_node<'search, 'from_data>(
     ctx: &mut SearchContext<'search>,
+    conditions_interner: &mut Interner<ProximityEdge>,
     to_node: &QueryNode,
     from_node_data: &'from_data (WordDerivations, i8),
 ) -> Result<Vec<(u8, EdgeCondition<ProximityEdge>)>> {
@@ -224,22 +226,23 @@ pub fn visit_to_node<'search, 'from_data>(
             }
         }
     }
-    let mut new_edges = cost_proximity_word_pairs
-        .into_iter()
-        .flat_map(|(cost, proximity_word_pairs)| {
-            let mut edges = vec![];
-            for (proximity, word_pairs) in proximity_word_pairs {
-                edges.push((
-                    cost,
-                    EdgeCondition::Conditional(ProximityEdge {
-                        pairs: word_pairs.into_boxed_slice(),
-                        proximity,
-                    }),
-                ))
-            }
-            edges
-        })
-        .collect::<Vec<_>>();
+    let mut new_edges =
+        cost_proximity_word_pairs
+            .into_iter()
+            .flat_map(|(cost, proximity_word_pairs)| {
+                let mut edges = vec![];
+                for (proximity, word_pairs) in proximity_word_pairs {
+                    edges.push((
+                        cost,
+                        EdgeCondition::Conditional(conditions_interner.insert(ProximityEdge {
+                            pairs: word_pairs.into_boxed_slice(),
+                            proximity,
+                        })),
+                    ))
+                }
+                edges
+            })
+            .collect::<Vec<_>>();
     new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeCondition::Unconditional));
     Ok(new_edges)
 }
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
index bf07bf21d..822c9531c 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@@ -5,23 +5,21 @@ use roaring::RoaringBitmap;
 
 use super::empty_paths_cache::EmptyPathsCache;
 use super::{EdgeCondition, RankingRuleGraphTrait};
-use crate::search::new::interner::Interned;
+use
crate::search::new::interner::{Interned, Interner};
 use crate::search::new::logger::SearchLogger;
 use crate::search::new::query_term::WordDerivations;
 use crate::search::new::small_bitmap::SmallBitmap;
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;
 
-// TODO: intern the proximity edges as well?
-
-#[derive(Clone)]
+#[derive(Clone, PartialEq, Eq, Hash)]
 pub enum WordPair {
     Words { left: Interned<String>, right: Interned<String> },
     WordPrefix { left: Interned<String>, right_prefix: Interned<String> },
     WordPrefixSwapped { left_prefix: Interned<String>, right: Interned<String> },
 }
 
-#[derive(Clone)]
+#[derive(Clone, PartialEq, Eq, Hash)]
 pub struct ProximityEdge {
     pairs: Box<[WordPair]>,
     proximity: u8,
@@ -55,10 +53,11 @@ impl RankingRuleGraphTrait for ProximityGraph {
 
     fn build_step_visit_destination_node<'from_data, 'search: 'from_data>(
         ctx: &mut SearchContext<'search>,
+        conditions_interner: &mut Interner<Self::EdgeCondition>,
         to_node: &QueryNode,
         from_node_data: &'from_data Self::BuildVisitedFromNode,
     ) -> Result<Vec<(u8, EdgeCondition<Self::EdgeCondition>)>> {
-        build::visit_to_node(ctx, to_node, from_node_data)
+        build::visit_to_node(ctx, conditions_interner, to_node, from_node_data)
     }
 
     fn log_state(
diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
index 6354909f6..596fbfb64 100644
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -2,14 +2,14 @@ use roaring::RoaringBitmap;
 
 use super::empty_paths_cache::EmptyPathsCache;
 use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait};
-use crate::search::new::interner::Interned;
+use crate::search::new::interner::{Interned, Interner};
 use crate::search::new::logger::SearchLogger;
 use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
 use crate::search::new::small_bitmap::SmallBitmap;
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;
 
-#[derive(Clone)]
+#[derive(Clone, PartialEq, Eq, Hash)]
 pub enum TypoEdge {
     Phrase { phrase: Interned<Phrase> },
     Word { derivations: Interned<WordDerivations>, nbr_typos: u8 },
@@ -78,15 +78,19 @@ impl RankingRuleGraphTrait for TypoGraph {
 
     fn build_step_visit_destination_node<'from_data, 'search: 'from_data>(
         ctx: &mut SearchContext<'search>,
+        conditions_interner: &mut Interner<Self::EdgeCondition>,
         to_node: &QueryNode,
         _from_node_data: &'from_data Self::BuildVisitedFromNode,
     ) -> Result<Vec<(u8, EdgeCondition<Self::EdgeCondition>)>> {
         let SearchContext { derivations_interner, .. } = ctx;
         match to_node {
            QueryNode::Term(LocatedQueryTerm { value, .. 
}) => match *value { - QueryTerm::Phrase { phrase } => { - Ok(vec![(0, EdgeCondition::Conditional(TypoEdge::Phrase { phrase }))]) - } + QueryTerm::Phrase { phrase } => Ok(vec![( + 0, + EdgeCondition::Conditional( + conditions_interner.insert(TypoEdge::Phrase { phrase }), + ), + )]), QueryTerm::Word { derivations } => { let mut edges = vec![]; @@ -136,10 +140,12 @@ impl RankingRuleGraphTrait for TypoGraph { if !new_derivations.is_empty() { edges.push(( nbr_typos, - EdgeCondition::Conditional(TypoEdge::Word { - derivations: derivations_interner.insert(new_derivations), - nbr_typos, - }), + EdgeCondition::Conditional(conditions_interner.insert( + TypoEdge::Word { + derivations: derivations_interner.insert(new_derivations), + nbr_typos, + }, + )), )) } } From 14e8d0aaa2f10f72b59f759b7fb98b9f988ab1cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 13 Mar 2023 14:03:48 +0100 Subject: [PATCH 053/234] Rename lifetime --- milli/src/search/new/db_cache.rs | 42 ++++---- milli/src/search/new/distinct.rs | 4 +- .../search/new/graph_based_ranking_rule.rs | 36 +++---- milli/src/search/new/interner.rs | 1 + milli/src/search/new/logger/detailed.rs | 5 +- milli/src/search/new/mod.rs | 100 +++++++++--------- milli/src/search/new/query_term.rs | 4 +- .../search/new/ranking_rule_graph/build.rs | 6 ++ .../ranking_rule_graph/edge_docids_cache.rs | 4 +- .../src/search/new/ranking_rule_graph/mod.rs | 73 +++---------- .../new/ranking_rule_graph/proximity/build.rs | 4 +- .../proximity/compute_docids.rs | 4 +- .../new/ranking_rule_graph/proximity/mod.rs | 12 +-- .../search/new/ranking_rule_graph/typo/mod.rs | 12 +-- milli/src/search/new/ranking_rules.rs | 17 +-- milli/src/search/new/resolve_query_graph.rs | 28 ++--- milli/src/search/new/sort.rs | 30 +++--- milli/src/search/new/words.rs | 8 +- 18 files changed, 177 insertions(+), 213 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 8435eb9da..b1f57fd0e 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -14,25 +14,25 @@ use crate::{Index, Result}; /// database lookup and instead get a direct reference to the value using a fast /// local HashMap lookup. 
#[derive(Default)] -pub struct DatabaseCache<'search> { +pub struct DatabaseCache<'ctx> { pub word_pair_proximity_docids: - FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, + FxHashMap<(u8, Interned, Interned), Option<&'ctx [u8]>>, pub word_prefix_pair_proximity_docids: - FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, + FxHashMap<(u8, Interned, Interned), Option<&'ctx [u8]>>, pub prefix_word_pair_proximity_docids: - FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, - pub word_docids: FxHashMap, Option<&'search [u8]>>, - pub exact_word_docids: FxHashMap, Option<&'search [u8]>>, - pub word_prefix_docids: FxHashMap, Option<&'search [u8]>>, + FxHashMap<(u8, Interned, Interned), Option<&'ctx [u8]>>, + pub word_docids: FxHashMap, Option<&'ctx [u8]>>, + pub exact_word_docids: FxHashMap, Option<&'ctx [u8]>>, + pub word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, } -impl<'search> DatabaseCache<'search> { +impl<'ctx> DatabaseCache<'ctx> { fn get_value<'v, K1, KC>( - txn: &'search RoTxn, + txn: &'ctx RoTxn, cache_key: K1, db_key: &'v KC::EItem, - cache: &mut FxHashMap>, + cache: &mut FxHashMap>, db: Database, - ) -> Result> + ) -> Result> where K1: Copy + Eq + Hash, KC: BytesEncode<'v>, @@ -52,10 +52,10 @@ impl<'search> DatabaseCache<'search> { pub fn get_word_docids( &mut self, index: &Index, - txn: &'search RoTxn, + txn: &'ctx RoTxn, word_interner: &Interner, word: Interned, - ) -> Result> { + ) -> Result> { Self::get_value( txn, word, @@ -68,10 +68,10 @@ impl<'search> DatabaseCache<'search> { pub fn get_word_prefix_docids( &mut self, index: &Index, - txn: &'search RoTxn, + txn: &'ctx RoTxn, word_interner: &Interner, prefix: Interned, - ) -> Result> { + ) -> Result> { Self::get_value( txn, prefix, @@ -84,12 +84,12 @@ impl<'search> DatabaseCache<'search> { pub fn get_word_pair_proximity_docids( &mut self, index: &Index, - txn: &'search RoTxn, + txn: &'ctx RoTxn, word_interner: &Interner, word1: Interned, word2: Interned, proximity: u8, - ) -> Result> { + ) -> Result> { Self::get_value( txn, (proximity, word1, word2), @@ -102,12 +102,12 @@ impl<'search> DatabaseCache<'search> { pub fn get_word_prefix_pair_proximity_docids( &mut self, index: &Index, - txn: &'search RoTxn, + txn: &'ctx RoTxn, word_interner: &Interner, word1: Interned, prefix2: Interned, proximity: u8, - ) -> Result> { + ) -> Result> { Self::get_value( txn, (proximity, word1, prefix2), @@ -119,12 +119,12 @@ impl<'search> DatabaseCache<'search> { pub fn get_prefix_word_pair_proximity_docids( &mut self, index: &Index, - txn: &'search RoTxn, + txn: &'ctx RoTxn, word_interner: &Interner, left_prefix: Interned, right: Interned, proximity: u8, - ) -> Result> { + ) -> Result> { Self::get_value( txn, (proximity, left_prefix, right), diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs index 9ee1746df..b29f0e8c3 100644 --- a/milli/src/search/new/distinct.rs +++ b/milli/src/search/new/distinct.rs @@ -20,8 +20,8 @@ pub struct DistinctOutput { pub excluded: RoaringBitmap, } -pub fn apply_distinct_rule<'search>( - ctx: &mut SearchContext<'search>, +pub fn apply_distinct_rule<'ctx>( + ctx: &mut SearchContext<'ctx>, field_id: u16, candidates: &RoaringBitmap, ) -> Result { diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 9af41b322..3281ffd2b 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -92,8 +92,8 @@ pub struct GraphBasedRankingRuleState { /// 
Traverse each edge of the graph, computes its associated document ids, /// and remove this edge from the graph if its docids are disjoint with the /// given universe. -fn remove_empty_edges<'search, G: RankingRuleGraphTrait>( - ctx: &mut SearchContext<'search>, +fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( + ctx: &mut SearchContext<'ctx>, graph: &mut RankingRuleGraph, edge_docids_cache: &mut EdgeConditionsCache, universe: &RoaringBitmap, @@ -121,15 +121,13 @@ fn remove_empty_edges<'search, G: RankingRuleGraphTrait>( Ok(()) } -impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> - for GraphBasedRankingRule -{ +impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule { fn id(&self) -> String { self.id.clone() } fn start_iteration( &mut self, - ctx: &mut SearchContext<'search>, + ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, universe: &RoaringBitmap, query_graph: &QueryGraph, @@ -166,7 +164,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> fn next_bucket( &mut self, - ctx: &mut SearchContext<'search>, + ctx: &mut SearchContext<'ctx>, logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { @@ -210,11 +208,11 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> cur_distance_idx: _, } = &mut state; - // let original_universe = universe; + let original_universe = universe; let mut universe = universe.clone(); // TODO: remove this unnecessary clone - // let original_graph = graph.clone(); + let original_graph = graph.clone(); // and this vector as well let mut paths = vec![]; @@ -297,15 +295,15 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> }, )?; - // G::log_state( - // &original_graph, - // &paths, - // &state.empty_paths_cache, - // original_universe, - // &state.all_distances, - // cost, - // logger, - // ); + G::log_state( + &original_graph, + &paths, + &state.empty_paths_cache, + original_universe, + &state.all_distances, + cost, + logger, + ); // TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however, // remove nodes and/or terms within nodes that weren't present in any of the paths. @@ -318,7 +316,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> fn end_iteration( &mut self, - _ctx: &mut SearchContext<'search>, + _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, ) { self.state = None; diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index 7edef41c8..55c343cd5 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -33,6 +33,7 @@ impl Interned { /// is then identified by a lightweight index of type [`Interned`], which can /// be copied, compared, and hashed efficiently. An immutable reference to the original value /// can be retrieved using `self.get(interned)`. 
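// A minimal, self-contained sketch of the interning pattern described in the
// doc-comment above, using std's HashMap instead of FxHashMap; `Idx` here is
// a simplified stand-in for `Interned<T>` (no PhantomData, no typed handles):
//
// use std::collections::HashMap;
// use std::hash::Hash;
//
// #[derive(Clone, Copy, PartialEq, Eq, Hash)]
// struct Idx(u16);
//
// #[derive(Default)]
// struct SimpleInterner<T> {
//     stable_store: Vec<T>,
//     lookup: HashMap<T, Idx>,
// }
//
// impl<T: Clone + Eq + Hash> SimpleInterner<T> {
//     // Intern a value, returning the existing handle if it was seen before.
//     fn insert(&mut self, value: T) -> Idx {
//         if let Some(&idx) = self.lookup.get(&value) {
//             return idx;
//         }
//         let idx = Idx(self.stable_store.len() as u16);
//         self.stable_store.push(value.clone());
//         self.lookup.insert(value, idx);
//         idx
//     }
//     // Resolve a handle back to an immutable reference to the value.
//     fn get(&self, idx: Idx) -> &T {
//         &self.stable_store[idx.0 as usize]
//     }
// }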
+#[derive(Clone)] pub struct Interner { stable_store: Vec, lookup: FxHashMap>, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index ba443752d..c049e7c17 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -545,12 +545,13 @@ shape: class" ) .unwrap(); } - EdgeCondition::Conditional(details) => { + EdgeCondition::Conditional(condition) => { + let condition = graph.conditions_interner.get(*condition); writeln!( file, "{source_node} -> {dest_node} : \"cost {cost} {edge_label}\"", cost = edge.cost, - edge_label = R::label_for_edge_condition(details) + edge_label = R::label_for_edge_condition(condition) ) .unwrap(); } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 110cfad64..03b2845cd 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -35,17 +35,17 @@ use crate::search::new::query_term::located_query_terms_from_string; use crate::search::new::words::Words; use crate::{Filter, Index, Result, TermsMatchingStrategy}; -pub struct SearchContext<'search> { - pub index: &'search Index, - pub txn: &'search RoTxn<'search>, - pub db_cache: DatabaseCache<'search>, +pub struct SearchContext<'ctx> { + pub index: &'ctx Index, + pub txn: &'ctx RoTxn<'ctx>, + pub db_cache: DatabaseCache<'ctx>, pub word_interner: Interner, pub phrase_interner: Interner, pub derivations_interner: Interner, pub query_term_docids: QueryTermDocIdsCache, } -impl<'search> SearchContext<'search> { - pub fn new(index: &'search Index, txn: &'search RoTxn<'search>) -> Self { +impl<'ctx> SearchContext<'ctx> { + pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { Self { index, txn, @@ -59,8 +59,8 @@ impl<'search> SearchContext<'search> { } #[allow(clippy::too_many_arguments)] -fn resolve_maximally_reduced_query_graph<'search>( - ctx: &mut SearchContext<'search>, +fn resolve_maximally_reduced_query_graph<'ctx>( + ctx: &mut SearchContext<'ctx>, universe: &RoaringBitmap, query_graph: &QueryGraph, matching_strategy: TermsMatchingStrategy, @@ -99,9 +99,9 @@ fn resolve_maximally_reduced_query_graph<'search>( Ok(docids) } -fn get_ranking_rules_for_placeholder_search<'search>( - ctx: &SearchContext<'search>, -) -> Result>>> { +fn get_ranking_rules_for_placeholder_search<'ctx>( + ctx: &SearchContext<'ctx>, +) -> Result>>> { // let sort = false; // let mut asc = HashSet::new(); // let mut desc = HashSet::new(); @@ -122,10 +122,10 @@ fn get_ranking_rules_for_placeholder_search<'search>( } Ok(ranking_rules) } -fn get_ranking_rules_for_query_graph_search<'search>( - ctx: &SearchContext<'search>, +fn get_ranking_rules_for_query_graph_search<'ctx>( + ctx: &SearchContext<'ctx>, terms_matching_strategy: TermsMatchingStrategy, -) -> Result>>> { +) -> Result>>> { // query graph search let mut words = false; let mut typo = false; @@ -215,8 +215,8 @@ fn get_ranking_rules_for_query_graph_search<'search>( } #[allow(clippy::too_many_arguments)] -pub fn execute_search<'search>( - ctx: &mut SearchContext<'search>, +pub fn execute_search<'ctx>( + ctx: &mut SearchContext<'ctx>, query: &str, terms_matching_strategy: TermsMatchingStrategy, filters: Option, @@ -295,45 +295,45 @@ mod tests { println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - // loop { - let start = Instant::now(); + loop { + let start = Instant::now(); - let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); - let mut ctx = SearchContext::new(&index, &txn); - let results = 
execute_search( - &mut ctx, - "zero config", - TermsMatchingStrategy::Last, - None, - 0, - 20, - &mut DefaultSearchLogger, - &mut logger, - ) - .unwrap(); + // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let results = execute_search( + &mut ctx, + "zero config", + TermsMatchingStrategy::Last, + None, + 0, + 20, + &mut DefaultSearchLogger, + &mut DefaultSearchLogger, + ) + .unwrap(); - logger.write_d2_description(&mut ctx); + // logger.write_d2_description(&mut ctx); - let elapsed = start.elapsed(); - println!("{}us", elapsed.as_micros()); + let elapsed = start.elapsed(); + println!("{}us", elapsed.as_micros()); - let _documents = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); + let _documents = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) + }) + .collect::>(); - println!("{}us: {:?}", elapsed.as_micros(), results); - // } + println!("{}us: {:?}", elapsed.as_micros(), results); + } // for (id, _document) in documents { // println!("{id}:"); // // println!("{document}"); diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 5fffe6653..c55a8d44e 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -276,8 +276,8 @@ impl LocatedQueryTerm { } /// Convert the tokenised search query into a list of located query terms. -pub fn located_query_terms_from_string<'search>( - ctx: &mut SearchContext<'search>, +pub fn located_query_terms_from_string<'ctx>( + ctx: &mut SearchContext<'ctx>, query: NormalizedTokenIter>, words_limit: Option, ) -> Result> { diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 3393f086a..7b00fc445 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -7,6 +7,12 @@ use crate::search::new::{QueryGraph, SearchContext}; use crate::Result; impl RankingRuleGraph { + // TODO: here, the docids of all the edges should already be computed! + // an edge condition would then be reduced to a (ptr to) a roaring bitmap? + // we could build fewer of them by directly comparing them with the universe + // (e.g. for each word pairs?) with `deserialize_within_universe` maybe + // + /// Build the ranking rule graph from the given query graph pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { let QueryGraph { nodes: graph_nodes, edges: graph_edges, .. 
} = &query_graph; diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 416ededc0..5da3de326 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -24,9 +24,9 @@ impl EdgeConditionsCache { /// /// If the cache does not yet contain these docids, they are computed /// and inserted in the cache. - pub fn get_edge_docids<'s, 'search>( + pub fn get_edge_docids<'s, 'ctx>( &'s mut self, - ctx: &mut SearchContext<'search>, + ctx: &mut SearchContext<'ctx>, // TODO: should be Interned interned_edge_condition: Interned, graph: &RankingRuleGraph, diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 8aa29a8b7..16c75b072 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -69,47 +69,6 @@ pub struct Edge { pub condition: EdgeCondition, } -// pub struct SubWordDerivations { -// words: FxHashSet>, -// phrases: FxHashSet>, -// use_prefix_db: bool, -// } - -// pub struct EdgeWordDerivations { -// // TODO: not Option, instead: Any | All | Subset(SubWordDerivations) -// from_words: Option, // ??? -// to_words: Option, // + use prefix db? -// } - -// fn aggregate_edge_word_derivations( -// graph: (), -// edges: Vec, -// ) -> BTreeMap { -// todo!() -// } - -// fn reduce_word_term_to_sub_word_derivations( -// term: &mut WordDerivations, -// derivations: &SubWordDerivations, -// ) { -// let mut new_one_typo = vec![]; -// for w in term.one_typo { -// if derivations.words.contains(w) { -// new_one_typo.push(w); -// } -// } -// if term.use_prefix_db && !derivations.use_prefix_db { -// term.use_prefix_db = false; -// } -// // etc. -// } - -// fn word_derivations_used_by_edge( -// edge: G::EdgeCondition, -// ) -> SubWordDerivations { -// todo!() -// } - /// A trait to be implemented by a marker type to build a graph-based ranking rule. /// /// It mostly describes how to: @@ -132,8 +91,8 @@ pub trait RankingRuleGraphTrait: Sized { /// Compute the document ids associated with the given edge condition, /// restricted to the given universe. - fn resolve_edge_condition<'search>( - ctx: &mut SearchContext<'search>, + fn resolve_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, edge_condition: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result; @@ -142,15 +101,15 @@ pub trait RankingRuleGraphTrait: Sized { /// /// This call is followed by zero, one or more calls to [`build_step_visit_destination_node`](RankingRuleGraphTrait::build_step_visit_destination_node), /// which builds the actual edges. - fn build_step_visit_source_node<'search>( - ctx: &mut SearchContext<'search>, + fn build_step_visit_source_node<'ctx>( + ctx: &mut SearchContext<'ctx>, source_node: &QueryNode, ) -> Result>; /// Return the cost and condition of the edges going from the previously visited node /// (with [`build_step_visit_source_node`](RankingRuleGraphTrait::build_step_visit_source_node)) to `dest_node`. 
- fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( - ctx: &mut SearchContext<'search>, + fn build_step_visit_destination_node<'from_data, 'ctx: 'from_data>( + ctx: &mut SearchContext<'ctx>, conditions_interner: &mut Interner, dest_node: &QueryNode, source_node_data: &'from_data Self::BuildVisitedFromNode, @@ -177,16 +136,16 @@ pub struct RankingRuleGraph { pub edges_of_node: Vec, pub conditions_interner: Interner, } -// impl Clone for RankingRuleGraph { -// fn clone(&self) -> Self { -// Self { -// query_graph: self.query_graph.clone(), -// edges_store: self.edges_store.clone(), -// edges_of_node: self.edges_of_node.clone(), -// conditions_interner: self.conditions_interner.clone(), -// } -// } -// } +impl Clone for RankingRuleGraph { + fn clone(&self) -> Self { + Self { + query_graph: self.query_graph.clone(), + edges_store: self.edges_store.clone(), + edges_of_node: self.edges_of_node.clone(), + conditions_interner: self.conditions_interner.clone(), + } + } +} impl RankingRuleGraph { /// Remove the given edge from the ranking rule graph pub fn remove_ranking_rule_edge(&mut self, edge_index: u16) { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 192b74faf..87cb75e45 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -58,8 +58,8 @@ pub fn visit_from_node( })) } -pub fn visit_to_node<'search, 'from_data>( - ctx: &mut SearchContext<'search>, +pub fn visit_to_node<'ctx, 'from_data>( + ctx: &mut SearchContext<'ctx>, conditions_interner: &mut Interner, to_node: &QueryNode, from_node_data: &'from_data (WordDerivations, i8), diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 777d69b64..0821cd5d0 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -4,8 +4,8 @@ use super::{ProximityEdge, WordPair}; use crate::search::new::SearchContext; use crate::{CboRoaringBitmapCodec, Result}; -pub fn compute_docids<'search>( - ctx: &mut SearchContext<'search>, +pub fn compute_docids<'ctx>( + ctx: &mut SearchContext<'ctx>, edge: &ProximityEdge, universe: &RoaringBitmap, ) -> Result { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 822c9531c..b099e79f6 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -36,23 +36,23 @@ impl RankingRuleGraphTrait for ProximityGraph { format!(", prox {proximity}, {} pairs", pairs.len()) } - fn resolve_edge_condition<'search>( - ctx: &mut SearchContext<'search>, + fn resolve_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, edge: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result { compute_docids::compute_docids(ctx, edge, universe) } - fn build_step_visit_source_node<'search>( - ctx: &mut SearchContext<'search>, + fn build_step_visit_source_node<'ctx>( + ctx: &mut SearchContext<'ctx>, from_node: &QueryNode, ) -> Result> { build::visit_from_node(ctx, from_node) } - fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( - ctx: &mut SearchContext<'search>, + fn build_step_visit_destination_node<'from_data, 'ctx: 'from_data>( + ctx: &mut SearchContext<'ctx>, conditions_interner: 
&mut Interner, to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 596fbfb64..ae5c850e3 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -28,8 +28,8 @@ impl RankingRuleGraphTrait for TypoGraph { } } - fn resolve_edge_condition<'db_cache, 'search>( - ctx: &mut SearchContext<'search>, + fn resolve_edge_condition<'db_cache, 'ctx>( + ctx: &mut SearchContext<'ctx>, edge: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result { @@ -69,15 +69,15 @@ impl RankingRuleGraphTrait for TypoGraph { } } - fn build_step_visit_source_node<'search>( - _ctx: &mut SearchContext<'search>, + fn build_step_visit_source_node<'ctx>( + _ctx: &mut SearchContext<'ctx>, _from_node: &QueryNode, ) -> Result> { Ok(Some(())) } - fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( - ctx: &mut SearchContext<'search>, + fn build_step_visit_destination_node<'from_data, 'ctx: 'from_data>( + ctx: &mut SearchContext<'ctx>, conditions_interner: &mut Interner, to_node: &QueryNode, _from_node_data: &'from_data Self::BuildVisitedFromNode, diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 57817fd7e..5e5da8716 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -17,10 +17,10 @@ impl RankingRuleQueryTrait for QueryGraph {} /// A trait that must be implemented by all ranking rules. /// -/// It is generic over `'search`, the lifetime of the search context +/// It is generic over `'ctx`, the lifetime of the search context /// (i.e. the read transaction and the cache) and over `Query`, which /// can be either [`PlaceholderQuery`] or [`QueryGraph`]. -pub trait RankingRule<'search, Query: RankingRuleQueryTrait> { +pub trait RankingRule<'ctx, Query: RankingRuleQueryTrait> { fn id(&self) -> String; /// Prepare the ranking rule such that it can start iterating over its @@ -29,7 +29,7 @@ pub trait RankingRule<'search, Query: RankingRuleQueryTrait> { /// The given universe is the universe that will be given to [`next_bucket`](RankingRule::next_bucket). fn start_iteration( &mut self, - ctx: &mut SearchContext<'search>, + ctx: &mut SearchContext<'ctx>, logger: &mut dyn SearchLogger, universe: &RoaringBitmap, query: &Query, @@ -44,7 +44,7 @@ pub trait RankingRule<'search, Query: RankingRuleQueryTrait> { /// - the universe given to [`start_iteration`](RankingRule::start_iteration) fn next_bucket( &mut self, - ctx: &mut SearchContext<'search>, + ctx: &mut SearchContext<'ctx>, logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>>; @@ -53,7 +53,7 @@ pub trait RankingRule<'search, Query: RankingRuleQueryTrait> { /// The next call to this ranking rule, if any, will be [`start_iteration`](RankingRule::start_iteration). 
fn end_iteration( &mut self, - ctx: &mut SearchContext<'search>, + ctx: &mut SearchContext<'ctx>, logger: &mut dyn SearchLogger, ); } @@ -68,15 +68,16 @@ pub struct RankingRuleOutput { pub candidates: RoaringBitmap, } -pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( - ctx: &mut SearchContext<'search>, - mut ranking_rules: Vec>>, +pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( + ctx: &mut SearchContext<'ctx>, + mut ranking_rules: Vec>>, query: &Q, universe: &RoaringBitmap, from: usize, length: usize, logger: &mut dyn SearchLogger, ) -> Result> { + logger.initial_query(query); logger.ranking_rules(&ranking_rules); logger.initial_universe(universe); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index b70b01c34..2553f42c9 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -21,11 +21,11 @@ pub struct QueryTermDocIdsCache { } impl QueryTermDocIdsCache { /// Get the document ids associated with the given phrase - pub fn get_phrase_docids<'s, 'search>( + pub fn get_phrase_docids<'s, 'ctx>( &'s mut self, index: &Index, - txn: &'search RoTxn, - db_cache: &mut DatabaseCache<'search>, + txn: &'ctx RoTxn, + db_cache: &mut DatabaseCache<'ctx>, word_interner: &Interner, phrase_interner: &Interner, phrase: Interned, @@ -40,11 +40,11 @@ impl QueryTermDocIdsCache { } /// Get the document ids associated with the given word derivations - pub fn get_word_derivations_docids<'s, 'search>( + pub fn get_word_derivations_docids<'s, 'ctx>( &'s mut self, index: &Index, - txn: &'search RoTxn, - db_cache: &mut DatabaseCache<'search>, + txn: &'ctx RoTxn, + db_cache: &mut DatabaseCache<'ctx>, word_interner: &Interner, derivations_interner: &Interner, phrase_interner: &Interner, @@ -110,11 +110,11 @@ impl QueryTermDocIdsCache { } /// Get the document ids associated with the given query term. 
- fn get_query_term_docids<'s, 'search>( + fn get_query_term_docids<'s, 'ctx>( &'s mut self, index: &Index, - txn: &'search RoTxn, - db_cache: &mut DatabaseCache<'search>, + txn: &'ctx RoTxn, + db_cache: &mut DatabaseCache<'ctx>, word_interner: &Interner, derivations_interner: &Interner, phrase_interner: &Interner, @@ -137,8 +137,8 @@ impl QueryTermDocIdsCache { } } -pub fn resolve_query_graph<'search>( - ctx: &mut SearchContext<'search>, +pub fn resolve_query_graph<'ctx>( + ctx: &mut SearchContext<'ctx>, q: &QueryGraph, universe: &RoaringBitmap, ) -> Result { @@ -214,10 +214,10 @@ pub fn resolve_query_graph<'search>( panic!() } -pub fn resolve_phrase<'search>( +pub fn resolve_phrase<'ctx>( index: &Index, - txn: &'search RoTxn, - db_cache: &mut DatabaseCache<'search>, + txn: &'ctx RoTxn, + db_cache: &mut DatabaseCache<'ctx>, word_interner: &Interner, phrase_interner: &Interner, phrase: Interned, diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index e43a9e8aa..70173889e 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -3,21 +3,19 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait, SearchContext}; -pub trait RankingRuleOutputIter<'search, Query> { +pub trait RankingRuleOutputIter<'ctx, Query> { fn next_bucket(&mut self) -> Result>>; } -pub struct RankingRuleOutputIterWrapper<'search, Query> { - iter: Box>> + 'search>, +pub struct RankingRuleOutputIterWrapper<'ctx, Query> { + iter: Box>> + 'ctx>, } -impl<'search, Query> RankingRuleOutputIterWrapper<'search, Query> { - pub fn new(iter: Box>> + 'search>) -> Self { +impl<'ctx, Query> RankingRuleOutputIterWrapper<'ctx, Query> { + pub fn new(iter: Box>> + 'ctx>) -> Self { Self { iter } } } -impl<'search, Query> RankingRuleOutputIter<'search, Query> - for RankingRuleOutputIterWrapper<'search, Query> -{ +impl<'ctx, Query> RankingRuleOutputIter<'ctx, Query> for RankingRuleOutputIterWrapper<'ctx, Query> { fn next_bucket(&mut self) -> Result>> { match self.iter.next() { Some(x) => x.map(Some), @@ -35,17 +33,17 @@ use crate::{ Result, }; -pub struct Sort<'search, Query> { +pub struct Sort<'ctx, Query> { field_name: String, field_id: Option, is_ascending: bool, original_query: Option, - iter: Option>, + iter: Option>, } -impl<'search, Query> Sort<'search, Query> { +impl<'ctx, Query> Sort<'ctx, Query> { pub fn _new( index: &Index, - rtxn: &'search heed::RoTxn, + rtxn: &'ctx heed::RoTxn, field_name: String, is_ascending: bool, ) -> Result { @@ -56,14 +54,14 @@ impl<'search, Query> Sort<'search, Query> { } } -impl<'search, Query: RankingRuleQueryTrait> RankingRule<'search, Query> for Sort<'search, Query> { +impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, Query> { fn id(&self) -> String { let Self { field_name, is_ascending, .. 
} = self; format!("{field_name}:{}", if *is_ascending { "asc" } else { "desc " }) } fn start_iteration( &mut self, - ctx: &mut SearchContext<'search>, + ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, parent_candidates: &RoaringBitmap, parent_query_graph: &Query, @@ -106,7 +104,7 @@ impl<'search, Query: RankingRuleQueryTrait> RankingRule<'search, Query> for Sort fn next_bucket( &mut self, - _ctx: &mut SearchContext<'search>, + _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { @@ -123,7 +121,7 @@ impl<'search, Query: RankingRuleQueryTrait> RankingRule<'search, Query> for Sort fn end_iteration( &mut self, - _ctx: &mut SearchContext<'search>, + _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, ) { self.original_query = None; diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 2015367da..5bc5ff1fe 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -26,13 +26,13 @@ impl Words { } } -impl<'search> RankingRule<'search, QueryGraph> for Words { +impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { fn id(&self) -> String { "words".to_owned() } fn start_iteration( &mut self, - _ctx: &mut SearchContext<'search>, + _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, _parent_candidates: &RoaringBitmap, parent_query_graph: &QueryGraph, @@ -65,7 +65,7 @@ impl<'search> RankingRule<'search, QueryGraph> for Words { fn next_bucket( &mut self, - ctx: &mut SearchContext<'search>, + ctx: &mut SearchContext<'ctx>, logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { @@ -101,7 +101,7 @@ impl<'search> RankingRule<'search, QueryGraph> for Words { fn end_iteration( &mut self, - _ctx: &mut SearchContext<'search>, + _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, ) { self.iterating = false; From 3004e281d7df62ea5bb8dff4561fd8b0b9122ae7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 13 Mar 2023 17:21:29 +0100 Subject: [PATCH 054/234] Support ngram typos + splitwords and splitwords+synonyms in proximity --- milli/src/search/new/logger/detailed.rs | 11 +- milli/src/search/new/mod.rs | 68 +-- milli/src/search/new/query_graph.rs | 68 +-- milli/src/search/new/query_term.rs | 342 ++++++++++---- .../new/ranking_rule_graph/proximity/build.rs | 439 +++++++++++------- .../proximity/compute_docids.rs | 85 +++- .../new/ranking_rule_graph/proximity/mod.rs | 32 +- .../search/new/ranking_rule_graph/typo/mod.rs | 47 +- milli/src/search/new/resolve_query_graph.rs | 20 +- 9 files changed, 701 insertions(+), 411 deletions(-) diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index c049e7c17..9f612f239 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -447,6 +447,8 @@ results.{random} {{ use_prefix_db, synonyms, split_words, + prefix_of, + is_prefix: _, } = ctx.derivations_interner.get(*derivations); let original = ctx.word_interner.get(*original); @@ -460,6 +462,10 @@ shape: class" let w = ctx.word_interner.get(w); writeln!(file, "\"{w}\" : 0").unwrap(); } + for w in prefix_of.iter().copied() { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 0P").unwrap(); + } for w in one_typo.iter().copied() { let w = ctx.word_interner.get(w); writeln!(file, "\"{w}\" : 1").unwrap(); @@ -478,8 +484,9 @@ shape: class" let phrase_str = phrase.description(&ctx.word_interner); writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); 
} - if *use_prefix_db { - writeln!(file, "use prefix DB : true").unwrap(); + if let Some(use_prefix_db) = use_prefix_db { + let p = ctx.word_interner.get(*use_prefix_db); + writeln!(file, "use prefix DB : {p}").unwrap(); } for (d, edges) in distances.iter() { writeln!(file, "\"distance {d}\" : {:?}", edges.iter().collect::>()) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 03b2845cd..323b8eb62 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -295,45 +295,45 @@ mod tests { println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - loop { - let start = Instant::now(); + // loop { + let start = Instant::now(); - // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); - let mut ctx = SearchContext::new(&index, &txn); - let results = execute_search( - &mut ctx, - "zero config", - TermsMatchingStrategy::Last, - None, - 0, - 20, - &mut DefaultSearchLogger, - &mut DefaultSearchLogger, - ) - .unwrap(); + let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let results = execute_search( + &mut ctx, + "sun flower s are the best", + TermsMatchingStrategy::Last, + None, + 0, + 20, + &mut DefaultSearchLogger, + &mut logger, + ) + .unwrap(); - // logger.write_d2_description(&mut ctx); + logger.write_d2_description(&mut ctx); - let elapsed = start.elapsed(); - println!("{}us", elapsed.as_micros()); + let elapsed = start.elapsed(); + println!("{}us", elapsed.as_micros()); - let _documents = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); + let _documents = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) + }) + .collect::>(); - println!("{}us: {:?}", elapsed.as_micros(), results); - } + println!("{}us: {:?}", elapsed.as_micros(), results); + // } // for (id, _document) in documents { // println!("{id}:"); // // println!("{document}"); diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 90edd4f09..f76feb80b 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,4 +1,4 @@ -use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations}; +use super::query_term::{self, number_of_typos_allowed, LocatedQueryTerm}; use super::small_bitmap::SmallBitmap; use super::SearchContext; use crate::Result; @@ -132,18 +132,18 @@ impl QueryGraph { impl QueryGraph { /// Build the query graph from the parsed user search query. 
pub fn from_query(ctx: &mut SearchContext, terms: Vec) -> Result { + let nbr_typos = number_of_typos_allowed(ctx)?; + let mut empty_nodes = vec![]; - let word_set = ctx.index.words_fst(ctx.txn)?; let mut graph = QueryGraph::default(); + // TODO: we could consider generalizing to 4,5,6,7,etc. ngrams let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![graph.root_node]); - for length in 1..=terms.len() { - let query = &terms[..length]; - - let term0 = query.last().unwrap(); + for term_idx in 0..terms.len() { + let term0 = &terms[term_idx]; let mut new_nodes = vec![]; let new_node_idx = graph.add_node(&prev0, QueryNode::Term(term0.clone())); @@ -153,57 +153,19 @@ impl QueryGraph { } if !prev1.is_empty() { - if let Some((ngram2_str, ngram2_pos)) = - query_term::ngram2(ctx, &query[length - 2], &query[length - 1]) + if let Some(ngram) = + query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)? { - if word_set.contains(ctx.word_interner.get(ngram2_str)) { - let ngram2 = LocatedQueryTerm { - value: QueryTerm::Word { - derivations: ctx.derivations_interner.insert(WordDerivations { - original: ngram2_str, - // TODO: could add a typo if it's an ngram? - zero_typo: Box::new([ngram2_str]), - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: false, - synonyms: Box::new([]), // TODO: ngram synonyms - split_words: None, // TODO: maybe ngram split words? - }), - }, - positions: ngram2_pos, - }; - let ngram2_idx = graph.add_node(&prev1, QueryNode::Term(ngram2)); - new_nodes.push(ngram2_idx); - } + let ngram_idx = graph.add_node(&prev1, QueryNode::Term(ngram)); + new_nodes.push(ngram_idx); } } if !prev2.is_empty() { - if let Some((ngram3_str, ngram3_pos)) = query_term::ngram3( - ctx, - &query[length - 3], - &query[length - 2], - &query[length - 1], - ) { - if word_set.contains(ctx.word_interner.get(ngram3_str)) { - let ngram3 = LocatedQueryTerm { - value: QueryTerm::Word { - derivations: ctx.derivations_interner.insert(WordDerivations { - original: ngram3_str, - // TODO: could add a typo if it's an ngram? - zero_typo: Box::new([ngram3_str]), - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: false, - synonyms: Box::new([]), // TODO: ngram synonyms - split_words: None, // TODO: maybe ngram split words? - // would be nice for typos like su nflower - }), - }, - positions: ngram3_pos, - }; - let ngram3_idx = graph.add_node(&prev2, QueryNode::Term(ngram3)); - new_nodes.push(ngram3_idx); - } + if let Some(ngram) = + query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)? + { + let ngram_idx = graph.add_node(&prev2, QueryNode::Term(ngram)); + new_nodes.push(ngram_idx); } } (prev0, prev1, prev2) = (new_nodes, prev0, prev1); diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index c55a8d44e..467752012 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -13,7 +13,7 @@ use super::interner::{Interned, Interner}; use super::SearchContext; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; -use crate::{CboRoaringBitmapLenCodec, Index, Result}; +use crate::{CboRoaringBitmapLenCodec, Index, Result, MAX_WORD_LENGTH}; /// A phrase in the user's search query, consisting of several words /// that must appear side-by-side in the search results. @@ -31,46 +31,70 @@ impl Phrase { /// a term in the user's search query. 
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub struct WordDerivations {
-    /// The original word
+    /// The original terms, for debugging purposes
     pub original: Interned<String>,
-    // TODO: original should only be used for debugging purposes?
-    // TODO: pub zero_typo: Option<Interned<String>>,
-    // TODO: pub prefix_of: Box<[Interned<String>]>,
+    pub is_prefix: bool,
+
+    /// A single word equivalent to the original one, with zero typos
+    pub zero_typo: Option<Interned<String>>,
+    /// All the words that contain the original word as prefix
+    pub prefix_of: Box<[Interned<String>]>,
     /// All the synonyms of the original word
     pub synonyms: Box<[Interned<Phrase>]>,

     /// The original word split into multiple consecutive words
     pub split_words: Option<Interned<Phrase>>,

-    /// The original words and words which are prefixed by it
-    pub zero_typo: Box<[Interned<String>]>,
-
     /// Words that are 1 typo away from the original word
     pub one_typo: Box<[Interned<String>]>,

     /// Words that are 2 typos away from the original word
     pub two_typos: Box<[Interned<String>]>,

-    /// True if the prefix databases must be used to retrieve
-    /// the words which are prefixed by the original word.
-    pub use_prefix_db: bool,
+    /// A prefix in the prefix databases matching the original word
+    pub use_prefix_db: Option<Interned<String>>,
 }
 impl WordDerivations {
+    pub fn empty(word_interner: &mut Interner<String>, original: &str) -> Self {
+        Self {
+            original: word_interner.insert(original.to_owned()),
+            is_prefix: false,
+            zero_typo: None,
+            prefix_of: Box::new([]),
+            synonyms: Box::new([]),
+            split_words: None,
+            one_typo: Box::new([]),
+            two_typos: Box::new([]),
+            use_prefix_db: None,
+        }
+    }
     /// Return an iterator over all the single words derived from the original word.
     ///
     /// This excludes synonyms, split words, and words stored in the prefix databases.
     pub fn all_single_word_derivations_except_prefix_db(
         &'_ self,
     ) -> impl Iterator<Item = Interned<String>> + Clone + '_ {
-        self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter()).copied()
+        self.zero_typo
+            .iter()
+            .chain(self.prefix_of.iter())
+            .chain(self.one_typo.iter())
+            .chain(self.two_typos.iter())
+            .copied()
+    }
+    /// Return an iterator over all the phrases derived from the original word:
+    /// its split-words derivation and its synonyms.
+    pub fn all_phrase_derivations(&'_ self) -> impl Iterator<Item = Interned<Phrase>> + Clone + '_ {
+        self.split_words.iter().chain(self.synonyms.iter()).copied()
     }
     pub fn is_empty(&self) -> bool {
-        self.zero_typo.is_empty()
+        self.zero_typo.is_none()
             && self.one_typo.is_empty()
             && self.two_typos.is_empty()
+            && self.prefix_of.is_empty()
             && self.synonyms.is_empty()
             && self.split_words.is_none()
-            && !self.use_prefix_db
+            && self.use_prefix_db.is_none()
     }
 }
@@ -80,7 +104,7 @@ pub fn word_derivations(
     word: &str,
     max_typo: u8,
     is_prefix: bool,
-) -> Result<Interned<WordDerivations>> {
+) -> Result<WordDerivations> {
+    if word.len() > MAX_WORD_LENGTH {
+        return Ok(WordDerivations::empty(&mut ctx.word_interner, word));
+    }
+
     let fst = ctx.index.words_fst(ctx.txn)?;
     let word_interned = ctx.word_interner.insert(word.to_owned());
@@ -91,23 +119,29 @@ pub fn word_derivations(
         .remap_data_type::<DecodeIgnore>()
         .get(ctx.txn, word)?
.is_some(); + let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None }; - let mut zero_typo = vec![]; + let mut zero_typo = None; + let mut prefix_of = vec![]; let mut one_typo = vec![]; let mut two_typos = vec![]; + if fst.contains(word) { + zero_typo = Some(word_interned); + } + if max_typo == 0 { - if is_prefix && !use_prefix_db { + if is_prefix && use_prefix_db.is_none() { let prefix = Str::new(word).starts_with(); let mut stream = fst.search(prefix).into_stream(); while let Some(derived_word) = stream.next() { let derived_word = std::str::from_utf8(derived_word)?.to_owned(); let derived_word_interned = ctx.word_interner.insert(derived_word); - zero_typo.push(derived_word_interned); + if derived_word_interned != word_interned { + prefix_of.push(derived_word_interned); + } } - } else if fst.contains(word) { - zero_typo.push(word_interned); } } else if max_typo == 1 { let dfa = build_dfa(word, 1, is_prefix); @@ -122,7 +156,9 @@ pub fn word_derivations( let derived_word_interned = ctx.word_interner.insert(derived_word.to_owned()); match d.to_u8() { 0 => { - zero_typo.push(derived_word_interned); + if derived_word_interned != word_interned { + prefix_of.push(derived_word_interned); + } } 1 => { one_typo.push(derived_word_interned); @@ -153,7 +189,9 @@ pub fn word_derivations( let d = second_dfa.distance((state.1).0); match d.to_u8() { 0 => { - zero_typo.push(derived_word_interned); + if derived_word_interned != word_interned { + prefix_of.push(derived_word_interned); + } } 1 => { one_typo.push(derived_word_interned); @@ -185,17 +223,17 @@ pub fn word_derivations( }) .collect(); - let interned = ctx.derivations_interner.insert(WordDerivations { - original: ctx.word_interner.insert(word.to_owned()), + Ok(WordDerivations { + original: word_interned, + is_prefix, + zero_typo, + prefix_of: prefix_of.into_boxed_slice(), synonyms, split_words, - zero_typo: zero_typo.into_boxed_slice(), one_typo: one_typo.into_boxed_slice(), two_typos: two_typos.into_boxed_slice(), use_prefix_db, - }); - - Ok(interned) + }) } /// Split the original word into the two words that appear the @@ -236,12 +274,17 @@ pub enum QueryTerm { } impl QueryTerm { + pub fn is_prefix(&self, derivations_interner: &Interner) -> bool { + match self { + QueryTerm::Phrase { .. } => false, + QueryTerm::Word { derivations } => derivations_interner.get(*derivations).is_prefix, + } + } /// Return the original word from the given query term - pub fn original_single_word<'interner>( + pub fn original_single_word( &self, - word_interner: &'interner Interner, - derivations_interner: &'interner Interner, - ) -> Option<&'interner str> { + derivations_interner: &Interner, + ) -> Option> { match self { QueryTerm::Phrase { phrase: _ } => None, QueryTerm::Word { derivations } => { @@ -249,7 +292,7 @@ impl QueryTerm { if derivations.is_empty() { None } else { - Some(word_interner.get(derivations.original)) + Some(derivations.original) } } } @@ -281,25 +324,7 @@ pub fn located_query_terms_from_string<'ctx>( query: NormalizedTokenIter>, words_limit: Option, ) -> Result> { - let authorize_typos = ctx.index.authorize_typos(ctx.txn)?; - let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; - let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; - - // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms? 
-    let exact_words = ctx.index.exact_words(ctx.txn)?;
-
-    let nbr_typos = |word: &str| {
-        if !authorize_typos
-            || word.len() < min_len_one_typo as usize
-            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
-        {
-            0
-        } else if word.len() < min_len_two_typos as usize {
-            1
-        } else {
-            2
-        }
-    };
+    let nbr_typos = number_of_typos_allowed(ctx)?;

     let mut located_terms = Vec::new();
@@ -344,7 +369,9 @@ pub fn located_query_terms_from_string<'ctx>(
                         let word = token.lemma();
                         let derivations = word_derivations(ctx, word, nbr_typos(word), false)?;
                         let located_term = LocatedQueryTerm {
-                            value: QueryTerm::Word { derivations },
+                            value: QueryTerm::Word {
+                                derivations: ctx.derivations_interner.insert(derivations),
+                            },
                             positions: position..=position,
                         };
                         located_terms.push(located_term);
@@ -355,7 +382,9 @@ pub fn located_query_terms_from_string<'ctx>(
                     let word = token.lemma();
                     let derivations = word_derivations(ctx, word, nbr_typos(word), true)?;
                     let located_term = LocatedQueryTerm {
-                        value: QueryTerm::Word { derivations },
+                        value: QueryTerm::Word {
+                            derivations: ctx.derivations_interner.insert(derivations),
+                        },
                         positions: position..=position,
                     };
                     located_terms.push(located_term);
@@ -407,54 +436,171 @@ pub fn located_query_terms_from_string<'ctx>(
     Ok(located_terms)
 }

-// TODO: return a word derivations instead?
-pub fn ngram2(
-    ctx: &mut SearchContext,
-    x: &LocatedQueryTerm,
-    y: &LocatedQueryTerm,
-) -> Option<(Interned<String>, RangeInclusive<i8>)> {
-    if *x.positions.end() != y.positions.start() - 1 {
-        return None;
-    }
-    match (
-        &x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
-        &y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
-    ) {
-        (Some(w1), Some(w2)) => {
-            let term = (
-                ctx.word_interner.insert(format!("{w1}{w2}")),
-                *x.positions.start()..=*y.positions.end(),
-            );
-            Some(term)
+pub fn number_of_typos_allowed<'ctx>(
+    ctx: &SearchContext<'ctx>,
+) -> Result<impl Fn(&str) -> u8 + 'ctx> {
+    let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
+    let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
+    let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
+
+    // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
+    let exact_words = ctx.index.exact_words(ctx.txn)?;
+
+    Ok(Box::new(move |word: &str| {
+        if !authorize_typos
+            || word.len() < min_len_one_typo as usize
+            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
+        {
+            0
+        } else if word.len() < min_len_two_typos as usize {
+            1
+        } else {
+            2
         }
-        _ => None,
-    }
+    }))
 }

-// TODO: return a word derivations instead?
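Note: `number_of_typos_allowed` extracted above replaces the inline `nbr_typos` closure so the same typo budget can be shared by `located_query_terms_from_string` and the new `make_ngram`. A minimal standalone sketch of the rule, not the milli API: the two length thresholds come from the index settings and are hard-coded below to 5 and 9 (the engine's defaults), and the `authorize_typos`/`exact_words` checks are omitted.

    fn number_of_typos_allowed(word: &str) -> u8 {
        const MIN_LEN_ONE_TYPO: usize = 5; // index setting `minWordSizeForTypos.oneTypo`
        const MIN_LEN_TWO_TYPOS: usize = 9; // index setting `minWordSizeForTypos.twoTypos`
        if word.len() < MIN_LEN_ONE_TYPO {
            0 // short words must match exactly
        } else if word.len() < MIN_LEN_TWO_TYPOS {
            1
        } else {
            2
        }
    }

    fn main() {
        assert_eq!(number_of_typos_allowed("sun"), 0);
        assert_eq!(number_of_typos_allowed("flower"), 1);
        assert_eq!(number_of_typos_allowed("sunflowers"), 2);
    }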
-pub fn ngram3( +pub fn make_ngram( ctx: &mut SearchContext, - x: &LocatedQueryTerm, - y: &LocatedQueryTerm, - z: &LocatedQueryTerm, -) -> Option<(Interned, RangeInclusive)> { - if *x.positions.end() != y.positions.start() - 1 - || *y.positions.end() != z.positions.start() - 1 - { - return None; - } - match ( - &x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), - &y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), - &z.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), - ) { - (Some(w1), Some(w2), Some(w3)) => { - let term = ( - ctx.word_interner.insert(format!("{w1}{w2}{w3}")), - *x.positions.start()..=*z.positions.end(), - ); - Some(term) + terms: &[LocatedQueryTerm], + number_of_typos_allowed: &impl Fn(&str) -> u8, +) -> Result> { + assert!(!terms.is_empty()); + for ts in terms.windows(2) { + let [t1, t2] = ts else { panic!() }; + if *t1.positions.end() != t2.positions.start() - 1 { + return Ok(None); } - _ => None, } + let mut words_interned = vec![]; + for term in terms { + if let Some(original_term_word) = term.value.original_single_word(&ctx.derivations_interner) + { + words_interned.push(original_term_word); + } else { + return Ok(None); + } + } + let words = + words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::>(); + + let start = *terms.first().as_ref().unwrap().positions.start(); + let end = *terms.last().as_ref().unwrap().positions.end(); + let is_prefix = terms.last().as_ref().unwrap().value.is_prefix(&ctx.derivations_interner); + let ngram_str = words.join(""); + if ngram_str.len() > MAX_WORD_LENGTH { + return Ok(None); + } + + let mut derivations = word_derivations( + ctx, + &ngram_str, + number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8), + is_prefix, + )?; + derivations.original = ctx.word_interner.insert(words.join(" ")); + // Now add the synonyms + let index_synonyms = ctx.index.synonyms(ctx.txn)?; + let mut derivations_synonyms = derivations.synonyms.to_vec(); + derivations_synonyms.extend( + index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| { + let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); + ctx.phrase_interner.insert(Phrase { words }) + }), + ); + derivations.synonyms = derivations_synonyms.into_boxed_slice(); + if let Some(split_words) = derivations.split_words { + let split_words = ctx.phrase_interner.get(split_words); + if split_words.words == words_interned.iter().map(|&i| Some(i)).collect::>() { + derivations.split_words = None; + } + } + if derivations.is_empty() { + return Ok(None); + } + let term = LocatedQueryTerm { + value: QueryTerm::Word { derivations: ctx.derivations_interner.insert(derivations) }, + positions: start..=end, + }; + + Ok(Some(term)) } + +// // TODO: return a word derivations instead? 
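Note: `make_ngram` above generalizes the removed `ngram2`/`ngram3` helpers to any window of terms. A simplified sketch of its two core rules, using plain owned types instead of `LocatedQueryTerm` and the interners (the `Term` struct and `ngram_of` below are illustrative stand-ins, not the real API): an ngram is only built from strictly adjacent terms, and its typo budget is the budget of the concatenated word minus the number of merged terms.

    // Stand-in for `LocatedQueryTerm`: a word plus its position range.
    struct Term {
        word: String,
        start: i8,
        end: i8,
    }

    /// Concatenate adjacent terms into an ngram and compute its typo budget.
    fn ngram_of(terms: &[Term], budget: impl Fn(&str) -> u8) -> Option<(String, u8)> {
        // Only strictly adjacent terms may form an ngram.
        for pair in terms.windows(2) {
            if pair[0].end + 1 != pair[1].start {
                return None;
            }
        }
        let ngram: String = terms.iter().map(|t| t.word.as_str()).collect();
        // Each merged term consumes one typo from the budget.
        let typos = budget(&ngram).saturating_sub(terms.len() as u8);
        Some((ngram, typos))
    }

    fn main() {
        let terms = [
            Term { word: "sun".into(), start: 0, end: 0 },
            Term { word: "flower".into(), start: 1, end: 1 },
        ];
        let budget = |w: &str| if w.len() < 5 { 0 } else if w.len() < 9 { 1 } else { 2 };
        // "sunflower" (9 bytes) would get 2 typos as a plain user-typed word,
        // but as a 2-gram it gets 2 - 2 = 0.
        assert_eq!(ngram_of(&terms, budget), Some(("sunflower".into(), 0)));
    }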
+// pub fn ngram2( +// ctx: &mut SearchContext, +// x: &LocatedQueryTerm, +// y: &LocatedQueryTerm, +// number_of_typos_allowed: impl Fn(&str) -> u8, +// ) -> Result> { +// if *x.positions.end() != y.positions.start() - 1 { +// return Ok(None); +// } +// match ( +// x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), +// y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), +// ) { +// (Some(w1), Some(w2)) => { +// let ngram2_str = format!("{w1}{w2}"); +// let mut derivations = word_derivations( +// ctx, +// &ngram2_str, +// number_of_typos_allowed(ngram2_str.as_str()).saturating_sub(1), +// y.value.is_prefix(&ctx.derivations_interner), +// )?; +// // Now add the synonyms +// let index_synonyms = ctx.index.synonyms(ctx.txn)?; +// let mut derivations_synonyms = derivations.synonyms.to_vec(); +// derivations_synonyms.extend( +// index_synonyms +// .get(&vec![w1.to_owned(), w2.to_owned()]) +// .cloned() +// .unwrap_or_default() +// .into_iter() +// .map(|words| { +// let words = +// words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); +// ctx.phrase_interner.insert(Phrase { words }) +// }), +// ); + +// let term = LocatedQueryTerm { +// value: QueryTerm::Word { +// derivations: ctx.derivations_interner.insert(derivations), +// }, +// positions: *x.positions.start()..=*y.positions.end(), +// }; + +// Ok(Some(term)) +// } +// _ => Ok(None), +// } +// } + +// // TODO: return a word derivations instead? +// pub fn ngram3( +// ctx: &mut SearchContext, +// x: &LocatedQueryTerm, +// y: &LocatedQueryTerm, +// z: &LocatedQueryTerm, +// ) -> Option<(Interned, RangeInclusive)> { +// if *x.positions.end() != y.positions.start() - 1 +// || *y.positions.end() != z.positions.start() - 1 +// { +// return None; +// } +// match ( +// &x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), +// &y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), +// &z.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), +// ) { +// (Some(w1), Some(w2), Some(w3)) => { +// let term = ( +// ctx.word_interner.insert(format!("{w1}{w2}{w3}")), +// *x.positions.start()..=*z.positions.end(), +// ); +// Some(term) +// } +// _ => None, +// } +// } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 87cb75e45..d3a219948 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -1,37 +1,43 @@ +#![allow(clippy::too_many_arguments)] use std::collections::BTreeMap; -use itertools::Itertools; - use super::ProximityEdge; -use crate::search::new::interner::Interner; -use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use crate::search::new::db_cache::DatabaseCache; +use crate::search::new::interner::{Interned, Interner}; +use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; use crate::search::new::ranking_rule_graph::proximity::WordPair; use crate::search::new::ranking_rule_graph::EdgeCondition; use crate::search::new::{QueryNode, SearchContext}; use crate::Result; +use heed::RoTxn; pub fn visit_from_node( ctx: &mut SearchContext, from_node: &QueryNode, -) -> Result> { - Ok(Some(match from_node { +) -> Result>, Interned)>, i8)>> { + let SearchContext { derivations_interner, .. 
} = ctx; + + let (left_phrase, left_derivations, left_end_position) = match from_node { QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => { match value1 { QueryTerm::Word { derivations } => { - (ctx.derivations_interner.get(*derivations).clone(), *pos1.end()) + (None, derivations_interner.get(*derivations).clone(), *pos1.end()) } - QueryTerm::Phrase { phrase: phrase1 } => { - let phrase1 = ctx.phrase_interner.get(*phrase1); - if let Some(original) = *phrase1.words.last().unwrap() { + QueryTerm::Phrase { phrase: phrase_interned } => { + let phrase = ctx.phrase_interner.get(*phrase_interned); + if let Some(original) = *phrase.words.last().unwrap() { ( + Some(*phrase_interned), WordDerivations { original, - zero_typo: Box::new([original]), + zero_typo: Some(original), one_typo: Box::new([]), two_typos: Box::new([]), - use_prefix_db: false, + use_prefix_db: None, synonyms: Box::new([]), split_words: None, + is_prefix: false, + prefix_of: Box::new([]), }, *pos1.end(), ) @@ -42,190 +48,175 @@ pub fn visit_from_node( } } } - QueryNode::Start => ( - WordDerivations { - original: ctx.word_interner.insert(String::new()), - zero_typo: Box::new([]), - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: false, - synonyms: Box::new([]), - split_words: None, - }, - -100, - ), + QueryNode::Start => (None, WordDerivations::empty(&mut ctx.word_interner, ""), -1), _ => return Ok(None), - })) + }; + + // left term cannot be a prefix + assert!(left_derivations.use_prefix_db.is_none() && !left_derivations.is_prefix); + + let last_word_left_phrase = if let Some(left_phrase_interned) = left_phrase { + let left_phrase = ctx.phrase_interner.get(left_phrase_interned); + left_phrase.words.last().copied().unwrap() + } else { + None + }; + let left_single_word_iter: Vec<(Option>, Interned)> = left_derivations + .all_single_word_derivations_except_prefix_db() + .chain(last_word_left_phrase.iter().copied()) + .map(|w| (left_phrase, w)) + .collect(); + let left_phrase_iter: Vec<(Option>, Interned)> = left_derivations + .all_phrase_derivations() + .map(|left_phrase_interned: Interned| { + let left_phrase = ctx.phrase_interner.get(left_phrase_interned); + let last_word_left_phrase: Interned = + left_phrase.words.last().unwrap().unwrap(); + let r: (Option>, Interned) = + (Some(left_phrase_interned), last_word_left_phrase); + r + }) + .collect(); + let mut left_word_iter = left_single_word_iter; + left_word_iter.extend(left_phrase_iter); + + Ok(Some((left_word_iter, left_end_position))) } -pub fn visit_to_node<'ctx, 'from_data>( +pub fn build_step_visit_destination_node<'ctx, 'from_data>( ctx: &mut SearchContext<'ctx>, conditions_interner: &mut Interner, + from_node_data: &'from_data (Vec<(Option>, Interned)>, i8), to_node: &QueryNode, - from_node_data: &'from_data (WordDerivations, i8), ) -> Result)>> { - let SearchContext { index, txn, db_cache, word_interner, derivations_interner, .. } = ctx; - - // IMPORTANT! 
TODO: split words support - - let (derivations1, pos1) = from_node_data; - let term2 = match &to_node { + let SearchContext { + index, + txn, + db_cache, + word_interner, + phrase_interner, + derivations_interner, + query_term_docids: _, + } = ctx; + let right_term = match &to_node { QueryNode::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), QueryNode::Deleted | QueryNode::Start => return Ok(vec![]), QueryNode::Term(term) => term, }; - let LocatedQueryTerm { value: value2, positions: pos2 } = term2; + let LocatedQueryTerm { value: right_value, positions: right_positions } = right_term; - let (derivations2, pos2, ngram_len2) = match value2 { - QueryTerm::Word { derivations } => { - (derivations_interner.get(*derivations).clone(), *pos2.start(), pos2.len()) - } - QueryTerm::Phrase { phrase: phrase2 } => { - let phrase2 = ctx.phrase_interner.get(*phrase2); - if let Some(original) = *phrase2.words.first().unwrap() { - ( - WordDerivations { - original, - zero_typo: Box::new([original]), - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: false, - synonyms: Box::new([]), - split_words: None, - }, - *pos2.start(), - 1, - ) - } else { - // No word pairs if the phrase does not have a regular word as its first term - return Ok(vec![]); + let (right_phrase, right_derivations, right_start_position, right_ngram_length) = + match right_value { + QueryTerm::Word { derivations } => ( + None, + derivations_interner.get(*derivations).clone(), + *right_positions.start(), + right_positions.len(), + ), + QueryTerm::Phrase { phrase: right_phrase_interned } => { + let right_phrase = phrase_interner.get(*right_phrase_interned); + if let Some(original) = *right_phrase.words.first().unwrap() { + ( + Some(*right_phrase_interned), + WordDerivations { + original, + zero_typo: Some(original), + one_typo: Box::new([]), + two_typos: Box::new([]), + use_prefix_db: None, + synonyms: Box::new([]), + split_words: None, + is_prefix: false, + prefix_of: Box::new([]), + }, + *right_positions.start(), + 1, + ) + } else { + // No word pairs if the phrase does not have a regular word as its first term + return Ok(vec![]); + } } - } - }; + }; - if pos1 + 1 != pos2 { - // TODO: how should this actually be handled? - // We want to effectively ignore this pair of terms + let (left_derivations, left_end_position) = from_node_data; + + if left_end_position + 1 != right_start_position { + // We want to ignore this pair of terms // Unconditionally walk through the edge without computing the docids - // But also what should the cost be? + // This can happen when, in a query like `the sun flowers are beautiful`, the term + // `flowers` is removed by the words ranking rule due to the terms matching strategy. + // The remaining query graph represents `the sun .. are beautiful` + // but `sun` and `are` have no proximity condition between them return Ok(vec![(0, EdgeCondition::Unconditional)]); } - let updb1 = derivations1.use_prefix_db; - let updb2 = derivations2.use_prefix_db; - - // left term cannot be a prefix - assert!(!updb1); - - // TODO: IMPORTANT! 
split words and synonyms support - let derivations1 = derivations1.all_single_word_derivations_except_prefix_db(); - // TODO: eventually, we want to get rid of the uses from `orginal` let mut cost_proximity_word_pairs = BTreeMap::>>::new(); - if updb2 { - for word1 in derivations1.clone() { - for proximity in 1..=(8 - ngram_len2) { - let cost = (proximity + ngram_len2 - 1) as u8; - // TODO: if we had access to the universe here, we could already check whether - // the bitmap corresponding to this word pair is disjoint with the universe or not - if db_cache - .get_word_prefix_pair_proximity_docids( - index, - txn, - word_interner, - word1, - derivations2.original, - proximity as u8, - )? - .is_some() - { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8) - .or_default() - .push(WordPair::WordPrefix { - left: word1, - right_prefix: derivations2.original, - }); - } - if db_cache - .get_prefix_word_pair_proximity_docids( - index, - txn, - word_interner, - derivations2.original, - word1, - proximity as u8 - 1, - )? - .is_some() - { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8) - .or_default() - .push(WordPair::WordPrefixSwapped { - left_prefix: derivations2.original, - right: word1, - }); - } - } + if let Some(right_prefix) = right_derivations.use_prefix_db { + for (left_phrase, left_word) in left_derivations.iter().copied() { + add_prefix_edges( + index, + txn, + db_cache, + word_interner, + right_ngram_length, + left_word, + right_prefix, + &mut cost_proximity_word_pairs, + left_phrase, + )?; } } - // TODO: important! support split words and synonyms as well - let derivations2 = derivations2.all_single_word_derivations_except_prefix_db(); // TODO: add safeguard in case the cartesian product is too large! // even if we restrict the word derivations to a maximum of 100, the size of the // caterisan product could reach a maximum of 10_000 derivations, which is way too much. // mMaybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been // reached - let product_derivations = derivations1.cartesian_product(derivations2); + let first_word_right_phrase = if let Some(right_phrase_interned) = right_phrase { + let right_phrase = phrase_interner.get(right_phrase_interned); + right_phrase.words.first().copied().unwrap() + } else { + None + }; + let right_single_word_iter: Vec<(Option>, Interned)> = + right_derivations + .all_single_word_derivations_except_prefix_db() + .chain(first_word_right_phrase.iter().copied()) + .map(|w| (right_phrase, w)) + .collect(); + let right_phrase_iter: Vec<(Option>, Interned)> = right_derivations + .all_phrase_derivations() + .map(|right_phrase_interned: Interned| { + let right_phrase = phrase_interner.get(right_phrase_interned); + let first_word_right_phrase: Interned = + right_phrase.words.first().unwrap().unwrap(); + let r: (Option>, Interned) = + (Some(right_phrase_interned), first_word_right_phrase); + r + }) + .collect(); + let mut right_word_iter = right_single_word_iter; + right_word_iter.extend(right_phrase_iter); - for (word1, word2) in product_derivations { - for proximity in 1..=(8 - ngram_len2) { - let cost = (proximity + ngram_len2 - 1) as u8; - if db_cache - .get_word_pair_proximity_docids( - index, - txn, - word_interner, - word1, - word2, - proximity as u8, - )? 
- .is_some() - { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8) - .or_default() - .push(WordPair::Words { left: word1, right: word2 }); - } - if proximity > 1 - && db_cache - .get_word_pair_proximity_docids( - index, - txn, - word_interner, - word2, - word1, - proximity as u8 - 1, - )? - .is_some() - { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8 - 1) - .or_default() - .push(WordPair::Words { left: word2, right: word1 }); - } + for (left_phrase, left_word) in left_derivations.iter().copied() { + for (right_phrase, right_word) in right_word_iter.iter().copied() { + add_non_prefix_edges( + index, + txn, + db_cache, + word_interner, + right_ngram_length, + left_word, + right_word, + &mut cost_proximity_word_pairs, + &[left_phrase, right_phrase].iter().copied().flatten().collect::>(), + )?; } } + let mut new_edges = cost_proximity_word_pairs .into_iter() @@ -243,6 +234,124 @@ pub fn visit_to_node<'ctx, 'from_data>( edges }) .collect::>(); - new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeCondition::Unconditional)); + new_edges.push((8 + (right_ngram_length - 1) as u8, EdgeCondition::Unconditional)); Ok(new_edges) } + +fn add_prefix_edges<'ctx>( + index: &mut &crate::Index, + txn: &'ctx RoTxn, + db_cache: &mut DatabaseCache<'ctx>, + word_interner: &mut Interner, + right_ngram_length: usize, + left_word: Interned, + right_prefix: Interned, + cost_proximity_word_pairs: &mut BTreeMap>>, + left_phrase: Option>, +) -> Result<()> { + for proximity in 1..=(8 - right_ngram_length) { + let cost = (proximity + right_ngram_length - 1) as u8; + // TODO: if we had access to the universe here, we could already check whether + // the bitmap corresponding to this word pair is disjoint with the universe or not + if db_cache + .get_word_prefix_pair_proximity_docids( + index, + txn, + word_interner, + left_word, + right_prefix, + proximity as u8, + )? + .is_some() + { + cost_proximity_word_pairs + .entry(cost) + .or_default() + .entry(proximity as u8) + .or_default() + .push(WordPair::WordPrefix { + phrases: left_phrase.into_iter().collect(), + left: left_word, + right_prefix, + }); + } + + // No swapping when computing the proximity between a phrase and a word + if left_phrase.is_none() + && db_cache + .get_prefix_word_pair_proximity_docids( + index, + txn, + word_interner, + right_prefix, + left_word, + proximity as u8 - 1, + )? + .is_some() + { + cost_proximity_word_pairs + .entry(cost) + .or_default() + .entry(proximity as u8) + .or_default() + .push(WordPair::WordPrefixSwapped { left_prefix: right_prefix, right: left_word }); + } + } + Ok(()) +} + +fn add_non_prefix_edges<'ctx>( + index: &mut &crate::Index, + txn: &'ctx RoTxn, + db_cache: &mut DatabaseCache<'ctx>, + word_interner: &mut Interner, + right_ngram_length: usize, + word1: Interned, + word2: Interned, + cost_proximity_word_pairs: &mut BTreeMap>>, + phrases: &[Interned], +) -> Result<()> { + for proximity in 1..=(8 - right_ngram_length) { + let cost = (proximity + right_ngram_length - 1) as u8; + if db_cache + .get_word_pair_proximity_docids( + index, + txn, + word_interner, + word1, + word2, + proximity as u8, + )? 
+ .is_some() + { + cost_proximity_word_pairs + .entry(cost) + .or_default() + .entry(proximity as u8) + .or_default() + .push(WordPair::Words { phrases: phrases.to_vec(), left: word1, right: word2 }); + } + if proximity > 1 + // no swapping when either term is a phrase + && phrases.is_empty() + && db_cache + .get_word_pair_proximity_docids( + index, + txn, + word_interner, + word2, + word1, + proximity as u8 - 1, + )? + .is_some() + { + cost_proximity_word_pairs + .entry(cost) + .or_default() + .entry(proximity as u8 - 1) + .or_default() + .push(WordPair::Words { phrases: vec![], left: word2, right: word1 }); + } + } + Ok(()) +} diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 0821cd5d0..8dfe805c7 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -13,24 +13,61 @@ pub fn compute_docids<'ctx>( let ProximityEdge { pairs, proximity } = edge; let mut pair_docids = RoaringBitmap::new(); for pair in pairs.iter() { - let bytes = match pair { - WordPair::Words { left, right } => db_cache.get_word_pair_proximity_docids( - index, - txn, - word_interner, - *left, - *right, - *proximity, - ), - WordPair::WordPrefix { left, right_prefix } => db_cache - .get_word_prefix_pair_proximity_docids( - index, - txn, - word_interner, - *left, - *right_prefix, - *proximity, - ), + let pair = match pair { + WordPair::Words { phrases, left, right } => { + let mut docids = db_cache + .get_word_pair_proximity_docids( + index, + txn, + word_interner, + *left, + *right, + *proximity, + )? + .map(CboRoaringBitmapCodec::deserialize_from) + .transpose()? + .unwrap_or_default(); + if !docids.is_empty() { + for phrase in phrases { + docids &= ctx.query_term_docids.get_phrase_docids( + index, + txn, + db_cache, + word_interner, + &ctx.phrase_interner, + *phrase, + )?; + } + } + docids + } + WordPair::WordPrefix { phrases, left, right_prefix } => { + let mut docids = db_cache + .get_word_prefix_pair_proximity_docids( + index, + txn, + word_interner, + *left, + *right_prefix, + *proximity, + )? + .map(CboRoaringBitmapCodec::deserialize_from) + .transpose()? + .unwrap_or_default(); + if !docids.is_empty() { + for phrase in phrases { + docids &= ctx.query_term_docids.get_phrase_docids( + index, + txn, + db_cache, + word_interner, + &ctx.phrase_interner, + *phrase, + )?; + } + } + docids + } WordPair::WordPrefixSwapped { left_prefix, right } => db_cache .get_prefix_word_pair_proximity_docids( index, @@ -39,11 +76,13 @@ pub fn compute_docids<'ctx>( *left_prefix, *right, *proximity, - ), - }?; - // TODO: deserialize bitmap within a universe, and (maybe) using a bump allocator? - let bitmap = universe - & bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); + )? + .map(CboRoaringBitmapCodec::deserialize_from) + .transpose()? 
+ .unwrap_or_default(), + }; + // TODO: deserialize bitmap within a universe + let bitmap = universe & pair; pair_docids |= bitmap; } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index b099e79f6..876bd3ac0 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -7,16 +7,27 @@ use super::empty_paths_cache::EmptyPathsCache; use super::{EdgeCondition, RankingRuleGraphTrait}; use crate::search::new::interner::{Interned, Interner}; use crate::search::new::logger::SearchLogger; -use crate::search::new::query_term::WordDerivations; +use crate::search::new::query_term::Phrase; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] pub enum WordPair { - Words { left: Interned, right: Interned }, - WordPrefix { left: Interned, right_prefix: Interned }, - WordPrefixSwapped { left_prefix: Interned, right: Interned }, + Words { + phrases: Vec>, + left: Interned, + right: Interned, + }, + WordPrefix { + phrases: Vec>, + left: Interned, + right_prefix: Interned, + }, + WordPrefixSwapped { + left_prefix: Interned, + right: Interned, + }, } #[derive(Clone, PartialEq, Eq, Hash)] @@ -29,7 +40,7 @@ pub enum ProximityGraph {} impl RankingRuleGraphTrait for ProximityGraph { type EdgeCondition = ProximityEdge; - type BuildVisitedFromNode = (WordDerivations, i8); + type BuildVisitedFromNode = (Vec<(Option>, Interned)>, i8); fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { let ProximityEdge { pairs, proximity } = edge; @@ -54,10 +65,15 @@ impl RankingRuleGraphTrait for ProximityGraph { fn build_step_visit_destination_node<'from_data, 'ctx: 'from_data>( ctx: &mut SearchContext<'ctx>, conditions_interner: &mut Interner, - to_node: &QueryNode, - from_node_data: &'from_data Self::BuildVisitedFromNode, + dest_node: &QueryNode, + source_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { - build::visit_to_node(ctx, conditions_interner, to_node, from_node_data) + build::build_step_visit_destination_node( + ctx, + conditions_interner, + source_node_data, + dest_node, + ) } fn log_state( diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index ae5c850e3..9b80cd314 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -84,7 +84,7 @@ impl RankingRuleGraphTrait for TypoGraph { ) -> Result)>> { let SearchContext { derivations_interner, .. } = ctx; match to_node { - QueryNode::Term(LocatedQueryTerm { value, .. }) => match *value { + QueryNode::Term(LocatedQueryTerm { value, positions }) => match *value { QueryTerm::Phrase { phrase } => Ok(vec![( 0, EdgeCondition::Conditional( @@ -93,57 +93,62 @@ impl RankingRuleGraphTrait for TypoGraph { )]), QueryTerm::Word { derivations } => { let mut edges = vec![]; + // Ngrams have a base typo cost + // 2-gram -> equivalent to 1 typo + // 3-gram -> equivalent to 2 typos + let base_cost = positions.len().max(2) as u8; for nbr_typos in 0..=2 { let derivations = derivations_interner.get(derivations).clone(); let new_derivations = match nbr_typos { - 0 => { - // TODO: think about how split words and synonyms should be handled here - // TODO: what about ngrams? 
- // Maybe 2grams should have one typo by default and 3grams 2 typos by default - WordDerivations { - original: derivations.original, - synonyms: derivations.synonyms, - split_words: None, - zero_typo: derivations.zero_typo, - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: derivations.use_prefix_db, - } - } + 0 => WordDerivations { + original: derivations.original, + is_prefix: derivations.is_prefix, + zero_typo: derivations.zero_typo, + prefix_of: derivations.prefix_of, + synonyms: derivations.synonyms, + split_words: None, + one_typo: Box::new([]), + two_typos: Box::new([]), + use_prefix_db: derivations.use_prefix_db, + }, 1 => { // What about split words and synonyms here? WordDerivations { original: derivations.original, + is_prefix: false, + zero_typo: None, + prefix_of: Box::new([]), synonyms: Box::new([]), split_words: derivations.split_words, - zero_typo: Box::new([]), one_typo: derivations.one_typo, two_typos: Box::new([]), - use_prefix_db: false, // false because all items from use_prefix_db haev 0 typos + use_prefix_db: None, // false because all items from use_prefix_db have 0 typos } } 2 => { // What about split words and synonyms here? WordDerivations { original: derivations.original, + zero_typo: None, + is_prefix: false, + prefix_of: Box::new([]), synonyms: Box::new([]), split_words: None, - zero_typo: Box::new([]), one_typo: Box::new([]), two_typos: derivations.two_typos, - use_prefix_db: false, // false because all items from use_prefix_db haev 0 typos + use_prefix_db: None, // false because all items from use_prefix_db have 0 typos } } _ => panic!(), }; if !new_derivations.is_empty() { edges.push(( - nbr_typos, + nbr_typos as u8 + base_cost, EdgeCondition::Conditional(conditions_interner.insert( TypoEdge::Word { derivations: derivations_interner.insert(new_derivations), - nbr_typos, + nbr_typos: nbr_typos as u8, }, )), )) diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 2553f42c9..0ebeaa6df 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -54,25 +54,31 @@ impl QueryTermDocIdsCache { return Ok(&self.derivations[&derivations]); }; let WordDerivations { - original, + original: _, + is_prefix: _, + zero_typo, + prefix_of, synonyms, split_words, - zero_typo, one_typo, two_typos, use_prefix_db, } = derivations_interner.get(derivations); let mut or_docids = vec![]; - for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()).copied() { + for word in zero_typo + .iter() + .chain(prefix_of.iter()) + .chain(one_typo.iter()) + .chain(two_typos.iter()) + .copied() + { if let Some(word_docids) = db_cache.get_word_docids(index, txn, word_interner, word)? { or_docids.push(word_docids); } } - if *use_prefix_db { - // TODO: this will change if we decide to change from (original, zero_typo) to: - // (debug_original, prefix_of, zero_typo) + if let Some(prefix) = use_prefix_db { if let Some(prefix_docids) = - db_cache.get_word_prefix_docids(index, txn, word_interner, *original)? + db_cache.get_word_prefix_docids(index, txn, word_interner, *prefix)? 
{ or_docids.push(prefix_docids); } From 31628c5cd4c7f4b4f5d0ddcca36796596af0e554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Mar 2023 10:54:55 +0100 Subject: [PATCH 055/234] Merge Phrase and WordDerivations into one structure --- milli/src/search/new/logger/detailed.rs | 150 +++++------ milli/src/search/new/mod.rs | 10 +- milli/src/search/new/query_graph.rs | 4 +- milli/src/search/new/query_term.rs | 244 ++++++------------ .../search/new/ranking_rule_graph/build.rs | 19 +- .../src/search/new/ranking_rule_graph/mod.rs | 18 +- .../new/ranking_rule_graph/proximity/build.rs | 185 ++++--------- .../proximity/compute_docids.rs | 4 +- .../new/ranking_rule_graph/proximity/mod.rs | 19 +- .../search/new/ranking_rule_graph/typo/mod.rs | 195 ++++++-------- milli/src/search/new/resolve_query_graph.rs | 132 +++------- 11 files changed, 335 insertions(+), 645 deletions(-) diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 9f612f239..575d5b0bf 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -6,7 +6,7 @@ use std::time::Instant; use rand::random; use roaring::RoaringBitmap; -use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::ranking_rule_graph::{ Edge, EdgeCondition, EmptyPathsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, @@ -432,70 +432,70 @@ results.{random} {{ file: &mut File, ) { match &node { - QueryNode::Term(LocatedQueryTerm { value, .. }) => match value { - QueryTerm::Phrase { phrase } => { + QueryNode::Term(LocatedQueryTerm { value, .. }) => { + let QueryTerm { + original, + zero_typo, + one_typo, + two_typos, + use_prefix_db, + synonyms, + split_words, + prefix_of, + is_prefix: _, + is_ngram: _, + phrase, + } = ctx.term_interner.get(*value); + + let original = ctx.word_interner.get(*original); + writeln!( + file, + "{node_idx} : \"{original}\" {{ +shape: class" + ) + .unwrap(); + for w in zero_typo.iter().copied() { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 0").unwrap(); + } + for w in prefix_of.iter().copied() { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 0P").unwrap(); + } + for w in one_typo.iter().copied() { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 1").unwrap(); + } + for w in two_typos.iter().copied() { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 2").unwrap(); + } + if let Some(phrase) = phrase { let phrase = ctx.phrase_interner.get(*phrase); let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "{node_idx} : \"{phrase_str}\"").unwrap(); + writeln!(file, "\"{phrase_str}\" : phrase").unwrap(); } - QueryTerm::Word { derivations } => { - let WordDerivations { - original, - zero_typo, - one_typo, - two_typos, - use_prefix_db, - synonyms, - split_words, - prefix_of, - is_prefix: _, - } = ctx.derivations_interner.get(*derivations); - - let original = ctx.word_interner.get(*original); - writeln!( - file, - "{node_idx} : \"{original}\" {{ -shape: class" - ) - .unwrap(); - for w in zero_typo.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 0").unwrap(); - } - for w in prefix_of.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 0P").unwrap(); - } - for w in one_typo.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 
1").unwrap(); - } - for w in two_typos.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 2").unwrap(); - } - if let Some(split_words) = split_words { - let phrase = ctx.phrase_interner.get(*split_words); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); - } - for synonym in synonyms.iter().copied() { - let phrase = ctx.phrase_interner.get(synonym); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); - } - if let Some(use_prefix_db) = use_prefix_db { - let p = ctx.word_interner.get(*use_prefix_db); - writeln!(file, "use prefix DB : {p}").unwrap(); - } - for (d, edges) in distances.iter() { - writeln!(file, "\"distance {d}\" : {:?}", edges.iter().collect::>()) - .unwrap(); - } - - writeln!(file, "}}").unwrap(); + if let Some(split_words) = split_words { + let phrase = ctx.phrase_interner.get(*split_words); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); } - }, + for synonym in synonyms.iter().copied() { + let phrase = ctx.phrase_interner.get(synonym); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); + } + if let Some(use_prefix_db) = use_prefix_db { + let p = ctx.word_interner.get(*use_prefix_db); + writeln!(file, "use prefix DB : {p}").unwrap(); + } + for (d, edges) in distances.iter() { + writeln!(file, "\"distance {d}\" : {:?}", edges.iter().collect::>()) + .unwrap(); + } + + writeln!(file, "}}").unwrap(); + } QueryNode::Deleted => panic!(), QueryNode::Start => { writeln!(file, "{node_idx} : START").unwrap(); @@ -600,32 +600,20 @@ shape: class" graph.edges_store[edge_idx as usize].as_ref().unwrap(); let source_node = &graph.query_graph.nodes[*source_node as usize]; let source_node_desc = match source_node { - QueryNode::Term(term) => match term.value { - QueryTerm::Phrase { phrase } => { - let phrase = ctx.phrase_interner.get(phrase); - phrase.description(&ctx.word_interner) - } - QueryTerm::Word { derivations } => { - let derivations = ctx.derivations_interner.get(derivations); - ctx.word_interner.get(derivations.original).to_owned() - } - }, + QueryNode::Term(term) => { + let term = ctx.term_interner.get(term.value); + ctx.word_interner.get(term.original).to_owned() + } QueryNode::Deleted => panic!(), QueryNode::Start => "START".to_owned(), QueryNode::End => "END".to_owned(), }; let dest_node = &graph.query_graph.nodes[*dest_node as usize]; let dest_node_desc = match dest_node { - QueryNode::Term(term) => match term.value { - QueryTerm::Phrase { phrase } => { - let phrase = ctx.phrase_interner.get(phrase); - phrase.description(&ctx.word_interner) - } - QueryTerm::Word { derivations } => { - let derivations = ctx.derivations_interner.get(derivations); - ctx.word_interner.get(derivations.original).to_owned() - } - }, + QueryNode::Term(term) => { + let term = ctx.term_interner.get(term.value); + ctx.word_interner.get(term.original).to_owned() + } QueryNode::Deleted => panic!(), QueryNode::Start => "START".to_owned(), QueryNode::End => "END".to_owned(), diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 323b8eb62..1eaa6d347 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -27,7 +27,7 @@ pub use ranking_rules::{bucket_sort, RankingRule, RankingRuleOutput, RankingRule use roaring::RoaringBitmap; use self::interner::Interner; -use 
self::query_term::{Phrase, WordDerivations}; +use self::query_term::{Phrase, QueryTerm}; use self::ranking_rules::PlaceholderQuery; use self::resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; use crate::search::new::graph_based_ranking_rule::{Proximity, Typo}; @@ -41,8 +41,8 @@ pub struct SearchContext<'ctx> { pub db_cache: DatabaseCache<'ctx>, pub word_interner: Interner, pub phrase_interner: Interner, - pub derivations_interner: Interner, - pub query_term_docids: QueryTermDocIdsCache, + pub term_interner: Interner, + pub term_docids: QueryTermDocIdsCache, } impl<'ctx> SearchContext<'ctx> { pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { @@ -52,8 +52,8 @@ impl<'ctx> SearchContext<'ctx> { db_cache: <_>::default(), word_interner: <_>::default(), phrase_interner: <_>::default(), - derivations_interner: <_>::default(), - query_term_docids: <_>::default(), + term_interner: <_>::default(), + term_docids: <_>::default(), } } } diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index f76feb80b..7bed15571 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -45,7 +45,7 @@ For the search query `sunflower`, we need to register the following things: - and also the couple of adjacent words `sun flower` - as well as all the user-defined synonyms of `sunflower` -All these derivations of a word will be stored in [`WordDerivations`]. +All these derivations of a word will be stored in [`QueryTerm`]. ## Example 2: For the search query `summer house by`. @@ -148,7 +148,7 @@ impl QueryGraph { let mut new_nodes = vec![]; let new_node_idx = graph.add_node(&prev0, QueryNode::Term(term0.clone())); new_nodes.push(new_node_idx); - if term0.is_empty(&ctx.derivations_interner) { + if term0.is_empty(&ctx.term_interner) { empty_nodes.push(new_node_idx); } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 467752012..c6cb81131 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -30,16 +30,20 @@ impl Phrase { /// A structure storing all the different ways to match /// a term in the user's search query. 
#[derive(Clone, PartialEq, Eq, Hash)] -pub struct WordDerivations { +pub struct QueryTerm { /// The original terms, for debugging purposes pub original: Interned, + /// Whether the term is an ngram + pub is_ngram: bool, + /// Whether the term can be only the prefix of a word pub is_prefix: bool, - - /// A single word equivalent to the original one, with zero typos + /// The original phrase, if any + pub phrase: Option>, + /// A single word equivalent to the original term, with zero typos pub zero_typo: Option>, /// All the words that contain the original word as prefix pub prefix_of: Box<[Interned]>, - /// All the synonyms of the original word + /// All the synonyms of the original word or phrase pub synonyms: Box<[Interned]>, /// The original word split into multiple consecutive words @@ -54,10 +58,15 @@ pub struct WordDerivations { /// A prefix in the prefix databases matching the original word pub use_prefix_db: Option>, } -impl WordDerivations { - pub fn empty(word_interner: &mut Interner, original: &str) -> Self { +impl QueryTerm { + pub fn phrase( + word_interner: &mut Interner, + phrase_interner: &mut Interner, + phrase: Phrase, + ) -> Self { Self { - original: word_interner.insert(original.to_owned()), + original: word_interner.insert(phrase.description(word_interner)), + phrase: Some(phrase_interner.insert(phrase)), is_prefix: false, zero_typo: None, prefix_of: Box::new([]), @@ -66,12 +75,28 @@ impl WordDerivations { one_typo: Box::new([]), two_typos: Box::new([]), use_prefix_db: None, + is_ngram: false, + } + } + pub fn empty(word_interner: &mut Interner, original: &str) -> Self { + Self { + original: word_interner.insert(original.to_owned()), + phrase: None, + is_prefix: false, + zero_typo: None, + prefix_of: Box::new([]), + synonyms: Box::new([]), + split_words: None, + one_typo: Box::new([]), + two_typos: Box::new([]), + use_prefix_db: None, + is_ngram: false, } } /// Return an iterator over all the single words derived from the original word. /// /// This excludes synonyms, split words, and words stored in the prefix databases. - pub fn all_single_word_derivations_except_prefix_db( + pub fn all_single_words_except_prefix_db( &'_ self, ) -> impl Iterator> + Clone + '_ { self.zero_typo @@ -84,7 +109,7 @@ impl WordDerivations { /// Return an iterator over all the single words derived from the original word. /// /// This excludes synonyms, split words, and words stored in the prefix databases. 
- pub fn all_phrase_derivations(&'_ self) -> impl Iterator> + Clone + '_ { + pub fn all_phrases(&'_ self) -> impl Iterator> + Clone + '_ { self.split_words.iter().chain(self.synonyms.iter()).copied() } pub fn is_empty(&self) -> bool { @@ -98,15 +123,15 @@ impl WordDerivations { } } -/// Compute the word derivations for the given word -pub fn word_derivations( +/// Compute the query term for the given word +pub fn query_term_from_word( ctx: &mut SearchContext, word: &str, max_typo: u8, is_prefix: bool, -) -> Result { +) -> Result { if word.len() > MAX_WORD_LENGTH { - return Ok(WordDerivations::empty(&mut ctx.word_interner, word)); + return Ok(QueryTerm::empty(&mut ctx.word_interner, word)); } let fst = ctx.index.words_fst(ctx.txn)?; @@ -223,8 +248,9 @@ pub fn word_derivations( }) .collect(); - Ok(WordDerivations { + Ok(QueryTerm { original: word_interned, + phrase: None, is_prefix, zero_typo, prefix_of: prefix_of.into_boxed_slice(), @@ -233,6 +259,7 @@ pub fn word_derivations( one_typo: one_typo.into_boxed_slice(), two_typos: two_typos.into_boxed_slice(), use_prefix_db, + is_ngram: false, }) } @@ -266,35 +293,13 @@ fn split_best_frequency( Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned()))) } -#[derive(Clone, PartialEq, Eq, Hash)] -pub enum QueryTerm { - Phrase { phrase: Interned }, - // TODO: change to `Interned`? - Word { derivations: Interned }, -} - impl QueryTerm { - pub fn is_prefix(&self, derivations_interner: &Interner) -> bool { - match self { - QueryTerm::Phrase { .. } => false, - QueryTerm::Word { derivations } => derivations_interner.get(*derivations).is_prefix, - } - } /// Return the original word from the given query term - pub fn original_single_word( - &self, - derivations_interner: &Interner, - ) -> Option> { - match self { - QueryTerm::Phrase { phrase: _ } => None, - QueryTerm::Word { derivations } => { - let derivations = derivations_interner.get(*derivations); - if derivations.is_empty() { - None - } else { - Some(derivations.original) - } - } + pub fn original_single_word(&self) -> Option> { + if self.phrase.is_some() || self.is_ngram { + None + } else { + Some(self.original) } } } @@ -302,19 +307,14 @@ impl QueryTerm { /// A query term term coupled with its position in the user's search query. #[derive(Clone)] pub struct LocatedQueryTerm { - pub value: QueryTerm, + pub value: Interned, pub positions: RangeInclusive, } impl LocatedQueryTerm { - /// Return `true` iff the word derivations within the query term are empty - pub fn is_empty(&self, interner: &Interner) -> bool { - match self.value { - // TODO: phrases should be greedily computed, so that they can be excluded from - // the query graph right from the start? - QueryTerm::Phrase { phrase: _ } => false, - QueryTerm::Word { derivations, .. 
} => interner.get(derivations).is_empty(), - } + /// Return `true` iff the term is empty + pub fn is_empty(&self, interner: &Interner) -> bool { + interner.get(self.value).is_empty() } } @@ -360,18 +360,16 @@ pub fn located_query_terms_from_string<'ctx>( } else { let word = ctx.word_interner.insert(token.lemma().to_string()); // TODO: in a phrase, check that every word exists - // otherwise return WordDerivations::Empty + // otherwise return an empty term phrase.push(Some(word)); } } else if peekable.peek().is_some() { match token.kind { TokenKind::Word => { let word = token.lemma(); - let derivations = word_derivations(ctx, word, nbr_typos(word), false)?; + let term = query_term_from_word(ctx, word, nbr_typos(word), false)?; let located_term = LocatedQueryTerm { - value: QueryTerm::Word { - derivations: ctx.derivations_interner.insert(derivations), - }, + value: ctx.term_interner.insert(term), positions: position..=position, }; located_terms.push(located_term); @@ -380,11 +378,9 @@ pub fn located_query_terms_from_string<'ctx>( } } else { let word = token.lemma(); - let derivations = word_derivations(ctx, word, nbr_typos(word), true)?; + let term = query_term_from_word(ctx, word, nbr_typos(word), true)?; let located_term = LocatedQueryTerm { - value: QueryTerm::Word { - derivations: ctx.derivations_interner.insert(derivations), - }, + value: ctx.term_interner.insert(term), positions: position..=position, }; located_terms.push(located_term); @@ -408,11 +404,11 @@ pub fn located_query_terms_from_string<'ctx>( if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) { let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase { - phrase: ctx - .phrase_interner - .insert(Phrase { words: mem::take(&mut phrase) }), - }, + value: ctx.term_interner.insert(QueryTerm::phrase( + &mut ctx.word_interner, + &mut ctx.phrase_interner, + Phrase { words: mem::take(&mut phrase) }, + )), positions: phrase_start..=phrase_end, }; located_terms.push(located_query_term); @@ -425,9 +421,11 @@ pub fn located_query_terms_from_string<'ctx>( // If a quote is never closed, we consider all of the end of the query as a phrase. 
if !phrase.is_empty() { let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase { - phrase: ctx.phrase_interner.insert(Phrase { words: mem::take(&mut phrase) }), - }, + value: ctx.term_interner.insert(QueryTerm::phrase( + &mut ctx.word_interner, + &mut ctx.phrase_interner, + Phrase { words: mem::take(&mut phrase) }, + )), positions: phrase_start..=phrase_end, }; located_terms.push(located_query_term); @@ -474,8 +472,7 @@ pub fn make_ngram( } let mut words_interned = vec![]; for term in terms { - if let Some(original_term_word) = term.value.original_single_word(&ctx.derivations_interner) - { + if let Some(original_term_word) = ctx.term_interner.get(term.value).original_single_word() { words_interned.push(original_term_word); } else { return Ok(None); @@ -486,121 +483,40 @@ pub fn make_ngram( let start = *terms.first().as_ref().unwrap().positions.start(); let end = *terms.last().as_ref().unwrap().positions.end(); - let is_prefix = terms.last().as_ref().unwrap().value.is_prefix(&ctx.derivations_interner); + let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix; let ngram_str = words.join(""); if ngram_str.len() > MAX_WORD_LENGTH { return Ok(None); } - let mut derivations = word_derivations( + let mut term = query_term_from_word( ctx, &ngram_str, number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8), is_prefix, )?; - derivations.original = ctx.word_interner.insert(words.join(" ")); + term.original = ctx.word_interner.insert(words.join(" ")); // Now add the synonyms let index_synonyms = ctx.index.synonyms(ctx.txn)?; - let mut derivations_synonyms = derivations.synonyms.to_vec(); - derivations_synonyms.extend( - index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| { + let mut term_synonyms = term.synonyms.to_vec(); + term_synonyms.extend(index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map( + |words| { let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); ctx.phrase_interner.insert(Phrase { words }) - }), - ); - derivations.synonyms = derivations_synonyms.into_boxed_slice(); - if let Some(split_words) = derivations.split_words { + }, + )); + term.synonyms = term_synonyms.into_boxed_slice(); + if let Some(split_words) = term.split_words { let split_words = ctx.phrase_interner.get(split_words); if split_words.words == words_interned.iter().map(|&i| Some(i)).collect::>() { - derivations.split_words = None; + term.split_words = None; } } - if derivations.is_empty() { + if term.is_empty() { return Ok(None); } - let term = LocatedQueryTerm { - value: QueryTerm::Word { derivations: ctx.derivations_interner.insert(derivations) }, - positions: start..=end, - }; + term.is_ngram = true; + let term = LocatedQueryTerm { value: ctx.term_interner.insert(term), positions: start..=end }; Ok(Some(term)) } - -// // TODO: return a word derivations instead? 
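// Editor's note: a compilable toy model (names are mine, not milli's) of the
// typo budget used by `make_ngram` above: the words are glued together and the
// budget of the concatenation is reduced by the number of merged terms, via
// `saturating_sub`, so it can never go negative. `typos_allowed_for` stands in
// for the closure derived from the index's typo settings; its thresholds here
// are illustrative only.
fn typos_allowed_for(word: &str) -> u8 {
    match word.chars().count() {
        0..=4 => 0,
        5..=8 => 1,
        _ => 2,
    }
}

fn ngram_typo_budget(words: &[&str]) -> u8 {
    let ngram: String = words.concat();
    typos_allowed_for(&ngram).saturating_sub(words.len() as u8)
}

fn main() {
    // "sun" + "flower" -> "sunflower": 2 typos allowed for the long word,
    // minus 2 because two terms were merged.
    assert_eq!(ngram_typo_budget(&["sun", "flower"]), 0);
    println!("ok");
}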
-// pub fn ngram2( -// ctx: &mut SearchContext, -// x: &LocatedQueryTerm, -// y: &LocatedQueryTerm, -// number_of_typos_allowed: impl Fn(&str) -> u8, -// ) -> Result> { -// if *x.positions.end() != y.positions.start() - 1 { -// return Ok(None); -// } -// match ( -// x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), -// y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), -// ) { -// (Some(w1), Some(w2)) => { -// let ngram2_str = format!("{w1}{w2}"); -// let mut derivations = word_derivations( -// ctx, -// &ngram2_str, -// number_of_typos_allowed(ngram2_str.as_str()).saturating_sub(1), -// y.value.is_prefix(&ctx.derivations_interner), -// )?; -// // Now add the synonyms -// let index_synonyms = ctx.index.synonyms(ctx.txn)?; -// let mut derivations_synonyms = derivations.synonyms.to_vec(); -// derivations_synonyms.extend( -// index_synonyms -// .get(&vec![w1.to_owned(), w2.to_owned()]) -// .cloned() -// .unwrap_or_default() -// .into_iter() -// .map(|words| { -// let words = -// words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); -// ctx.phrase_interner.insert(Phrase { words }) -// }), -// ); - -// let term = LocatedQueryTerm { -// value: QueryTerm::Word { -// derivations: ctx.derivations_interner.insert(derivations), -// }, -// positions: *x.positions.start()..=*y.positions.end(), -// }; - -// Ok(Some(term)) -// } -// _ => Ok(None), -// } -// } - -// // TODO: return a word derivations instead? -// pub fn ngram3( -// ctx: &mut SearchContext, -// x: &LocatedQueryTerm, -// y: &LocatedQueryTerm, -// z: &LocatedQueryTerm, -// ) -> Option<(Interned, RangeInclusive)> { -// if *x.positions.end() != y.positions.start() - 1 -// || *y.positions.end() != z.positions.start() - 1 -// { -// return None; -// } -// match ( -// &x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), -// &y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), -// &z.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner), -// ) { -// (Some(w1), Some(w2), Some(w3)) => { -// let term = ( -// ctx.word_interner.insert(format!("{w1}{w2}{w3}")), -// *x.positions.start()..=*z.positions.end(), -// ); -// Some(term) -// } -// _ => None, -// } -// } diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 7b00fc445..286a98ab1 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -22,28 +22,21 @@ impl RankingRuleGraph { let mut edges_store = vec![]; let mut edges_of_node = vec![]; - for (node_idx, node) in graph_nodes.iter().enumerate() { + for (source_idx, source_node) in graph_nodes.iter().enumerate() { edges_of_node.push(HashSet::new()); let new_edges = edges_of_node.last_mut().unwrap(); - let Some(source_node_data) = G::build_step_visit_source_node(ctx, node)? 
else { continue }; - - for successor_idx in graph_edges[node_idx].successors.iter() { - let dest_node = &graph_nodes[successor_idx as usize]; - let edges = G::build_step_visit_destination_node( - ctx, - &mut conditions_interner, - dest_node, - &source_node_data, - )?; + for dest_idx in graph_edges[source_idx].successors.iter() { + let dest_node = &graph_nodes[dest_idx as usize]; + let edges = G::build_edges(ctx, &mut conditions_interner, source_node, dest_node)?; if edges.is_empty() { continue; } for (cost, condition) in edges { edges_store.push(Some(Edge { - source_node: node_idx as u16, - dest_node: successor_idx, + source_node: source_idx as u16, + dest_node: dest_idx, cost, condition, })); diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 16c75b072..ee93bee13 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -80,11 +80,6 @@ pub trait RankingRuleGraphTrait: Sized { /// in [`resolve_edge_condition`](RankingRuleGraphTrait::resolve_edge_condition). type EdgeCondition: Sized + Clone + PartialEq + Eq + Hash; - /// A structure used in the construction of the graph, created when a - /// query graph source node is visited. It is used to determine the cost - /// and condition of a ranking rule edge when the destination node is visited. - type BuildVisitedFromNode; - /// Return the label of the given edge condition, to be used when visualising /// the ranking rule graph. fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String; @@ -97,22 +92,13 @@ pub trait RankingRuleGraphTrait: Sized { universe: &RoaringBitmap, ) -> Result; - /// Prepare to build the edges outgoing from `source_node`. - /// - /// This call is followed by zero, one or more calls to [`build_step_visit_destination_node`](RankingRuleGraphTrait::build_step_visit_destination_node), - /// which builds the actual edges. - fn build_step_visit_source_node<'ctx>( - ctx: &mut SearchContext<'ctx>, - source_node: &QueryNode, - ) -> Result>; - /// Return the cost and condition of the edges going from the previously visited node /// (with [`build_step_visit_source_node`](RankingRuleGraphTrait::build_step_visit_source_node)) to `dest_node`. 
- fn build_step_visit_destination_node<'from_data, 'ctx: 'from_data>( + fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, conditions_interner: &mut Interner, + source_node: &QueryNode, dest_node: &QueryNode, - source_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>>; fn log_state( diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index d3a219948..b8042c408 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -4,89 +4,40 @@ use std::collections::BTreeMap; use super::ProximityEdge; use crate::search::new::db_cache::DatabaseCache; use crate::search::new::interner::{Interned, Interner}; -use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; +use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; use crate::search::new::ranking_rule_graph::proximity::WordPair; use crate::search::new::ranking_rule_graph::EdgeCondition; use crate::search::new::{QueryNode, SearchContext}; use crate::Result; use heed::RoTxn; -pub fn visit_from_node( - ctx: &mut SearchContext, - from_node: &QueryNode, -) -> Result>, Interned)>, i8)>> { - let SearchContext { derivations_interner, .. } = ctx; - - let (left_phrase, left_derivations, left_end_position) = match from_node { - QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => { - match value1 { - QueryTerm::Word { derivations } => { - (None, derivations_interner.get(*derivations).clone(), *pos1.end()) - } - QueryTerm::Phrase { phrase: phrase_interned } => { - let phrase = ctx.phrase_interner.get(*phrase_interned); - if let Some(original) = *phrase.words.last().unwrap() { - ( - Some(*phrase_interned), - WordDerivations { - original, - zero_typo: Some(original), - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: None, - synonyms: Box::new([]), - split_words: None, - is_prefix: false, - prefix_of: Box::new([]), - }, - *pos1.end(), - ) - } else { - // No word pairs if the phrase does not have a regular word as its last term - return Ok(None); - } - } - } - } - QueryNode::Start => (None, WordDerivations::empty(&mut ctx.word_interner, ""), -1), - _ => return Ok(None), - }; - - // left term cannot be a prefix - assert!(left_derivations.use_prefix_db.is_none() && !left_derivations.is_prefix); - - let last_word_left_phrase = if let Some(left_phrase_interned) = left_phrase { - let left_phrase = ctx.phrase_interner.get(left_phrase_interned); - left_phrase.words.last().copied().unwrap() - } else { - None - }; - let left_single_word_iter: Vec<(Option>, Interned)> = left_derivations - .all_single_word_derivations_except_prefix_db() - .chain(last_word_left_phrase.iter().copied()) - .map(|w| (left_phrase, w)) - .collect(); - let left_phrase_iter: Vec<(Option>, Interned)> = left_derivations - .all_phrase_derivations() - .map(|left_phrase_interned: Interned| { - let left_phrase = ctx.phrase_interner.get(left_phrase_interned); - let last_word_left_phrase: Interned = - left_phrase.words.last().unwrap().unwrap(); - let r: (Option>, Interned) = - (Some(left_phrase_interned), last_word_left_phrase); - r - }) - .collect(); - let mut left_word_iter = left_single_word_iter; - left_word_iter.extend(left_phrase_iter); - - Ok(Some((left_word_iter, left_end_position))) +fn last_word_of_term_iter<'t>( + t: &'t QueryTerm, + phrase_interner: &'t Interner, +) -> impl Iterator>, Interned)> + 't { + 
t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map( + move |p| { + let phrase = phrase_interner.get(p); + phrase.words.last().unwrap().map(|last| (Some(p), last)) + }, + )) +} +fn first_word_of_term_iter<'t>( + t: &'t QueryTerm, + phrase_interner: &'t Interner, +) -> impl Iterator, Option>)> + 't { + t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map( + move |p| { + let phrase = phrase_interner.get(p); + phrase.words.first().unwrap().map(|first| (first, Some(p))) + }, + )) } -pub fn build_step_visit_destination_node<'ctx, 'from_data>( +pub fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, conditions_interner: &mut Interner, - from_node_data: &'from_data (Vec<(Option>, Interned)>, i8), + from_node: &QueryNode, to_node: &QueryNode, ) -> Result)>> { let SearchContext { @@ -95,9 +46,19 @@ pub fn build_step_visit_destination_node<'ctx, 'from_data>( db_cache, word_interner, phrase_interner, - derivations_interner, - query_term_docids: _, + term_interner, + term_docids: _, } = ctx; + + let (left_term, left_end_position) = match from_node { + QueryNode::Term(LocatedQueryTerm { value, positions }) => { + (term_interner.get(*value), *positions.end()) + } + QueryNode::Deleted => return Ok(vec![]), + QueryNode::Start => return Ok(vec![(0, EdgeCondition::Unconditional)]), + QueryNode::End => return Ok(vec![]), + }; + let right_term = match &to_node { QueryNode::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), QueryNode::Deleted | QueryNode::Start => return Ok(vec![]), @@ -105,47 +66,14 @@ pub fn build_step_visit_destination_node<'ctx, 'from_data>( }; let LocatedQueryTerm { value: right_value, positions: right_positions } = right_term; - let (right_phrase, right_derivations, right_start_position, right_ngram_length) = - match right_value { - QueryTerm::Word { derivations } => ( - None, - derivations_interner.get(*derivations).clone(), - *right_positions.start(), - right_positions.len(), - ), - QueryTerm::Phrase { phrase: right_phrase_interned } => { - let right_phrase = phrase_interner.get(*right_phrase_interned); - if let Some(original) = *right_phrase.words.first().unwrap() { - ( - Some(*right_phrase_interned), - WordDerivations { - original, - zero_typo: Some(original), - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: None, - synonyms: Box::new([]), - split_words: None, - is_prefix: false, - prefix_of: Box::new([]), - }, - *right_positions.start(), - 1, - ) - } else { - // No word pairs if the phrase does not have a regular word as its first term - return Ok(vec![]); - } - } - }; - - let (left_derivations, left_end_position) = from_node_data; + let (right_term, right_start_position, right_ngram_length) = + (term_interner.get(*right_value), *right_positions.start(), right_positions.len()); if left_end_position + 1 != right_start_position { // We want to ignore this pair of terms // Unconditionally walk through the edge without computing the docids // This can happen when, in a query like `the sun flowers are beautiful`, the term - // `flowers` is removed by the words ranking rule due to the terms matching strategy. + // `flowers` is removed by the `words` ranking rule. // The remaining query graph represents `the sun .. 
are beautiful` // but `sun` and `are` have no proximity condition between them return Ok(vec![(0, EdgeCondition::Unconditional)]); @@ -153,8 +81,8 @@ pub fn build_step_visit_destination_node<'ctx, 'from_data>( let mut cost_proximity_word_pairs = BTreeMap::>>::new(); - if let Some(right_prefix) = right_derivations.use_prefix_db { - for (left_phrase, left_word) in left_derivations.iter().copied() { + if let Some(right_prefix) = right_term.use_prefix_db { + for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) { add_prefix_edges( index, txn, @@ -172,37 +100,12 @@ pub fn build_step_visit_destination_node<'ctx, 'from_data>( // TODO: add safeguard in case the cartesian product is too large! // even if we restrict the word derivations to a maximum of 100, the size of the // caterisan product could reach a maximum of 10_000 derivations, which is way too much. - // mMaybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo + // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been // reached - let first_word_right_phrase = if let Some(right_phrase_interned) = right_phrase { - let right_phrase = phrase_interner.get(right_phrase_interned); - right_phrase.words.first().copied().unwrap() - } else { - None - }; - let right_single_word_iter: Vec<(Option>, Interned)> = - right_derivations - .all_single_word_derivations_except_prefix_db() - .chain(first_word_right_phrase.iter().copied()) - .map(|w| (right_phrase, w)) - .collect(); - let right_phrase_iter: Vec<(Option>, Interned)> = right_derivations - .all_phrase_derivations() - .map(|right_phrase_interned: Interned| { - let right_phrase = phrase_interner.get(right_phrase_interned); - let first_word_right_phrase: Interned = - right_phrase.words.first().unwrap().unwrap(); - let r: (Option>, Interned) = - (Some(right_phrase_interned), first_word_right_phrase); - r - }) - .collect(); - let mut right_word_iter = right_single_word_iter; - right_word_iter.extend(right_phrase_iter); - for (left_phrase, left_word) in left_derivations.iter().copied() { - for (right_phrase, right_word) in right_word_iter.iter().copied() { + for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) { + for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) { add_non_prefix_edges( index, txn, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 8dfe805c7..0acee0329 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -29,7 +29,7 @@ pub fn compute_docids<'ctx>( .unwrap_or_default(); if !docids.is_empty() { for phrase in phrases { - docids &= ctx.query_term_docids.get_phrase_docids( + docids &= ctx.term_docids.get_phrase_docids( index, txn, db_cache, @@ -56,7 +56,7 @@ pub fn compute_docids<'ctx>( .unwrap_or_default(); if !docids.is_empty() { for phrase in phrases { - docids &= ctx.query_term_docids.get_phrase_docids( + docids &= ctx.term_docids.get_phrase_docids( index, txn, db_cache, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 876bd3ac0..2cfee0b65 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ 
b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -40,7 +40,6 @@ pub enum ProximityGraph {} impl RankingRuleGraphTrait for ProximityGraph { type EdgeCondition = ProximityEdge; - type BuildVisitedFromNode = (Vec<(Option>, Interned)>, i8); fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { let ProximityEdge { pairs, proximity } = edge; @@ -55,25 +54,13 @@ impl RankingRuleGraphTrait for ProximityGraph { compute_docids::compute_docids(ctx, edge, universe) } - fn build_step_visit_source_node<'ctx>( - ctx: &mut SearchContext<'ctx>, - from_node: &QueryNode, - ) -> Result> { - build::visit_from_node(ctx, from_node) - } - - fn build_step_visit_destination_node<'from_data, 'ctx: 'from_data>( + fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, conditions_interner: &mut Interner, + source_node: &QueryNode, dest_node: &QueryNode, - source_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { - build::build_step_visit_destination_node( - ctx, - conditions_interner, - source_node_data, - dest_node, - ) + build::build_edges(ctx, conditions_interner, source_node, dest_node) } fn log_state( diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 9b80cd314..6b832f9b2 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -4,28 +4,24 @@ use super::empty_paths_cache::EmptyPathsCache; use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{Interned, Interner}; use crate::search::new::logger::SearchLogger; -use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; +use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] -pub enum TypoEdge { - Phrase { phrase: Interned }, - Word { derivations: Interned, nbr_typos: u8 }, +pub struct TypoEdge { + term: Interned, + nbr_typos: u8, } pub enum TypoGraph {} impl RankingRuleGraphTrait for TypoGraph { type EdgeCondition = TypoEdge; - type BuildVisitedFromNode = (); fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { - match edge { - TypoEdge::Phrase { .. } => ", 0 typos".to_owned(), - TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"), - } + format!(", {} typos", edge.nbr_typos) } fn resolve_edge_condition<'db_cache, 'ctx>( @@ -39,124 +35,101 @@ impl RankingRuleGraphTrait for TypoGraph { db_cache, word_interner, phrase_interner, - derivations_interner, - query_term_docids, + term_interner, + term_docids: query_term_docids, } = ctx; - match edge { - &TypoEdge::Phrase { phrase } => Ok(universe - & query_term_docids.get_phrase_docids( - index, - txn, - db_cache, - word_interner, - phrase_interner, - phrase, - )?), - TypoEdge::Word { derivations, .. 
} => { - let docids = universe - & query_term_docids.get_word_derivations_docids( - index, - txn, - db_cache, - word_interner, - derivations_interner, - phrase_interner, - *derivations, - )?; - Ok(docids) - } - } + let docids = universe + & query_term_docids.get_query_term_docids( + index, + txn, + db_cache, + word_interner, + term_interner, + phrase_interner, + edge.term, + )?; + + Ok(docids) } - fn build_step_visit_source_node<'ctx>( - _ctx: &mut SearchContext<'ctx>, - _from_node: &QueryNode, - ) -> Result> { - Ok(Some(())) - } - - fn build_step_visit_destination_node<'from_data, 'ctx: 'from_data>( + fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, conditions_interner: &mut Interner, + _from_node: &QueryNode, to_node: &QueryNode, - _from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { - let SearchContext { derivations_interner, .. } = ctx; + let SearchContext { term_interner, .. } = ctx; match to_node { - QueryNode::Term(LocatedQueryTerm { value, positions }) => match *value { - QueryTerm::Phrase { phrase } => Ok(vec![( - 0, - EdgeCondition::Conditional( - conditions_interner.insert(TypoEdge::Phrase { phrase }), - ), - )]), - QueryTerm::Word { derivations } => { - let mut edges = vec![]; - // Ngrams have a base typo cost - // 2-gram -> equivalent to 1 typo - // 3-gram -> equivalent to 2 typos - let base_cost = positions.len().max(2) as u8; + QueryNode::Term(LocatedQueryTerm { value, positions }) => { + let mut edges = vec![]; + // Ngrams have a base typo cost + // 2-gram -> equivalent to 1 typo + // 3-gram -> equivalent to 2 typos + let base_cost = positions.len().max(2) as u8; - for nbr_typos in 0..=2 { - let derivations = derivations_interner.get(derivations).clone(); - let new_derivations = match nbr_typos { - 0 => WordDerivations { - original: derivations.original, - is_prefix: derivations.is_prefix, - zero_typo: derivations.zero_typo, - prefix_of: derivations.prefix_of, - synonyms: derivations.synonyms, + for nbr_typos in 0..=2 { + let term = term_interner.get(*value).clone(); + let new_term = match nbr_typos { + 0 => QueryTerm { + original: term.original, + is_prefix: term.is_prefix, + zero_typo: term.zero_typo, + prefix_of: term.prefix_of, + synonyms: term.synonyms, + split_words: None, + one_typo: Box::new([]), + two_typos: Box::new([]), + use_prefix_db: term.use_prefix_db, + is_ngram: term.is_ngram, + phrase: term.phrase, + }, + 1 => { + // What about split words and synonyms here? + QueryTerm { + original: term.original, + is_prefix: false, + zero_typo: None, + prefix_of: Box::new([]), + synonyms: Box::new([]), + split_words: term.split_words, + one_typo: term.one_typo, + two_typos: Box::new([]), + use_prefix_db: None, // false because all items from use_prefix_db have 0 typos + is_ngram: term.is_ngram, + phrase: None, + } + } + 2 => { + // What about split words and synonyms here? + QueryTerm { + original: term.original, + zero_typo: None, + is_prefix: false, + prefix_of: Box::new([]), + synonyms: Box::new([]), split_words: None, one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: derivations.use_prefix_db, - }, - 1 => { - // What about split words and synonyms here? 
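// Editor's note: an illustrative, standalone model of the edge costs computed
// in this hunk (the helper below is mine, not milli's). Each term node yields
// up to three conditional edges, one per allowed typo count; an edge costs its
// typo count plus `positions.len().max(2)`, the `base_cost` from the hunk
// above that charges ngrams for the extra positions they span.
fn typo_edge_costs(nbr_positions: usize) -> Vec<(u8, u8)> {
    let base_cost = nbr_positions.max(2) as u8;
    (0u8..=2).map(|nbr_typos| (nbr_typos, nbr_typos + base_cost)).collect()
}

fn main() {
    // (typo count, edge cost) for a plain word and for a 3-gram:
    assert_eq!(typo_edge_costs(1), vec![(0, 2), (1, 3), (2, 4)]);
    assert_eq!(typo_edge_costs(3), vec![(0, 3), (1, 4), (2, 5)]);
    println!("ok");
}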
- WordDerivations { - original: derivations.original, - is_prefix: false, - zero_typo: None, - prefix_of: Box::new([]), - synonyms: Box::new([]), - split_words: derivations.split_words, - one_typo: derivations.one_typo, - two_typos: Box::new([]), - use_prefix_db: None, // false because all items from use_prefix_db have 0 typos - } + two_typos: term.two_typos, + use_prefix_db: None, // false because all items from use_prefix_db have 0 typos + is_ngram: term.is_ngram, + phrase: None, } - 2 => { - // What about split words and synonyms here? - WordDerivations { - original: derivations.original, - zero_typo: None, - is_prefix: false, - prefix_of: Box::new([]), - synonyms: Box::new([]), - split_words: None, - one_typo: Box::new([]), - two_typos: derivations.two_typos, - use_prefix_db: None, // false because all items from use_prefix_db have 0 typos - } - } - _ => panic!(), - }; - if !new_derivations.is_empty() { - edges.push(( - nbr_typos as u8 + base_cost, - EdgeCondition::Conditional(conditions_interner.insert( - TypoEdge::Word { - derivations: derivations_interner.insert(new_derivations), - nbr_typos: nbr_typos as u8, - }, - )), - )) } + _ => panic!(), + }; + if !new_term.is_empty() { + edges.push(( + nbr_typos as u8 + base_cost, + EdgeCondition::Conditional(conditions_interner.insert(TypoEdge { + term: term_interner.insert(new_term), + nbr_typos: nbr_typos as u8, + })), + )) } - Ok(edges) } - }, + Ok(edges) + } QueryNode::End => Ok(vec![(0, EdgeCondition::Unconditional)]), QueryNode::Deleted | QueryNode::Start => panic!(), } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 0ebeaa6df..5ce6ecec2 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -4,12 +4,12 @@ use std::collections::VecDeque; use fxhash::FxHashMap; use heed::{BytesDecode, RoTxn}; -use roaring::{MultiOps, RoaringBitmap}; +use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; use super::interner::{Interned, Interner}; use super::query_graph::QUERY_GRAPH_NODE_LENGTH_LIMIT; -use super::query_term::{Phrase, QueryTerm, WordDerivations}; +use super::query_term::{Phrase, QueryTerm}; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; @@ -17,7 +17,7 @@ use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; #[derive(Default)] pub struct QueryTermDocIdsCache { pub phrases: FxHashMap, RoaringBitmap>, - pub derivations: FxHashMap, RoaringBitmap>, + pub terms: FxHashMap, RoaringBitmap>, } impl QueryTermDocIdsCache { /// Get the document ids associated with the given phrase @@ -38,109 +38,53 @@ impl QueryTermDocIdsCache { let docids = &self.phrases[&phrase]; Ok(docids) } - - /// Get the document ids associated with the given word derivations - pub fn get_word_derivations_docids<'s, 'ctx>( + /// Get the document ids associated with the given term + pub fn get_query_term_docids<'s, 'ctx>( &'s mut self, index: &Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, word_interner: &Interner, - derivations_interner: &Interner, + term_interner: &Interner, phrase_interner: &Interner, - derivations: Interned, + term_interned: Interned, ) -> Result<&'s RoaringBitmap> { - if self.derivations.contains_key(&derivations) { - return Ok(&self.derivations[&derivations]); + if self.terms.contains_key(&term_interned) { + return Ok(&self.terms[&term_interned]); }; - let WordDerivations { - original: _, - 
is_prefix: _, - zero_typo, - prefix_of, - synonyms, - split_words, - one_typo, - two_typos, - use_prefix_db, - } = derivations_interner.get(derivations); - let mut or_docids = vec![]; - for word in zero_typo - .iter() - .chain(prefix_of.iter()) - .chain(one_typo.iter()) - .chain(two_typos.iter()) - .copied() - { + let mut docids = RoaringBitmap::new(); + + let term = term_interner.get(term_interned); + for word in term.all_single_words_except_prefix_db() { if let Some(word_docids) = db_cache.get_word_docids(index, txn, word_interner, word)? { - or_docids.push(word_docids); + docids |= + RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?; } } - if let Some(prefix) = use_prefix_db { + for phrase in term.all_phrases() { + docids |= self.get_phrase_docids( + index, + txn, + db_cache, + word_interner, + phrase_interner, + phrase, + )?; + } + + if let Some(prefix) = term.use_prefix_db { if let Some(prefix_docids) = - db_cache.get_word_prefix_docids(index, txn, word_interner, *prefix)? + db_cache.get_word_prefix_docids(index, txn, word_interner, prefix)? { - or_docids.push(prefix_docids); + docids |= + RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?; } } - let mut docids = or_docids - .into_iter() - .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()) - .collect::>(); - for synonym in synonyms.iter().copied() { - // TODO: cache resolve_phrase? - docids.push(resolve_phrase( - index, - txn, - db_cache, - word_interner, - phrase_interner, - synonym, - )?); - } - if let Some(split_words) = split_words { - docids.push(resolve_phrase( - index, - txn, - db_cache, - word_interner, - phrase_interner, - *split_words, - )?); - } - let docids = MultiOps::union(docids); - let _ = self.derivations.insert(derivations, docids); - let docids = &self.derivations[&derivations]; + let _ = self.terms.insert(term_interned, docids); + let docids = &self.terms[&term_interned]; Ok(docids) } - - /// Get the document ids associated with the given query term. - fn get_query_term_docids<'s, 'ctx>( - &'s mut self, - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - word_interner: &Interner, - derivations_interner: &Interner, - phrase_interner: &Interner, - term: &QueryTerm, - ) -> Result<&'s RoaringBitmap> { - match *term { - QueryTerm::Phrase { phrase } => { - self.get_phrase_docids(index, txn, db_cache, word_interner, phrase_interner, phrase) - } - QueryTerm::Word { derivations } => self.get_word_derivations_docids( - index, - txn, - db_cache, - word_interner, - derivations_interner, - phrase_interner, - derivations, - ), - } - } } pub fn resolve_query_graph<'ctx>( @@ -154,8 +98,8 @@ pub fn resolve_query_graph<'ctx>( db_cache, word_interner, phrase_interner, - derivations_interner, - query_term_docids, + term_interner, + term_docids: query_term_docids, .. 
} = ctx; // TODO: there is a faster way to compute this big @@ -183,16 +127,16 @@ pub fn resolve_query_graph<'ctx>( let node_docids = match n { QueryNode::Term(located_term) => { - let derivations_docids = query_term_docids.get_query_term_docids( + let term_docids = query_term_docids.get_query_term_docids( index, txn, db_cache, word_interner, - derivations_interner, + term_interner, phrase_interner, - &located_term.value, + located_term.value, )?; - predecessors_docids & derivations_docids + predecessors_docids & term_docids } QueryNode::Deleted => { panic!() From e9cf58d5844d5be21022efa0a1a219ea8a83625d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Mar 2023 16:37:47 +0100 Subject: [PATCH 056/234] Refactor of the Interner --- milli/src/search/new/db_cache.rs | 12 +- .../search/new/graph_based_ranking_rule.rs | 67 ++--- milli/src/search/new/interner.rs | 120 ++++++++- milli/src/search/new/logger/detailed.rs | 119 ++++----- milli/src/search/new/logger/mod.rs | 22 +- milli/src/search/new/mod.rs | 90 ++++--- milli/src/search/new/query_graph.rs | 246 +++++++++++------- milli/src/search/new/query_term.rs | 12 +- .../search/new/ranking_rule_graph/build.rs | 39 +-- .../new/ranking_rule_graph/cheapest_paths.rs | 201 ++++++++------ .../ranking_rule_graph/edge_docids_cache.rs | 6 +- .../ranking_rule_graph/empty_paths_cache.rs | 81 +++--- .../src/search/new/ranking_rule_graph/mod.rs | 49 ++-- .../search/new/ranking_rule_graph/path_set.rs | 4 +- .../new/ranking_rule_graph/proximity/build.rs | 31 +-- .../new/ranking_rule_graph/proximity/mod.rs | 19 +- .../search/new/ranking_rule_graph/typo/mod.rs | 21 +- milli/src/search/new/ranking_rules.rs | 1 + milli/src/search/new/resolve_query_graph.rs | 57 ++-- milli/src/search/new/small_bitmap.rs | 190 ++++++++++---- milli/src/search/new/words.rs | 11 +- 21 files changed, 857 insertions(+), 541 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index b1f57fd0e..2fa92900c 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -5,7 +5,7 @@ use fxhash::FxHashMap; use heed::types::ByteSlice; use heed::{BytesEncode, Database, RoTxn}; -use super::interner::{Interned, Interner}; +use super::interner::{DedupInterner, Interned}; use crate::{Index, Result}; /// A cache storing pointers to values in the LMDB databases. 
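// Editor's note on the "Refactor of the Interner" patch above: a compilable
// toy version of the deduplicating interner from the interner.rs hunk below,
// with std's `HashMap` standing in for `FxHashMap` and a bare `u16` in place
// of the typed `Interned<T>` index (which pairs the u16 with `PhantomData<T>`
// so word and phrase indices cannot be mixed up). It shows the invariant the
// surrounding code relies on: inserting an equal value twice returns the same
// stable index, so interned ids can be compared and hashed cheaply.
use std::collections::HashMap;

#[derive(Default)]
struct ToyDedupInterner {
    stable_store: Vec<String>,
    lookup: HashMap<String, u16>,
}

impl ToyDedupInterner {
    fn insert(&mut self, s: String) -> u16 {
        if let Some(&idx) = self.lookup.get(&s) {
            idx
        } else {
            assert!(self.stable_store.len() < u16::MAX as usize);
            self.stable_store.push(s.clone());
            let idx = self.stable_store.len() as u16 - 1;
            self.lookup.insert(s, idx);
            idx
        }
    }
    fn get(&self, idx: u16) -> &str {
        &self.stable_store[idx as usize]
    }
}

fn main() {
    let mut interner = ToyDedupInterner::default();
    let a = interner.insert("flower".to_string());
    let b = interner.insert("flower".to_string());
    assert_eq!(a, b); // deduplicated: equal values share one index
    assert_eq!(interner.get(a), "flower");
    println!("ok");
}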
@@ -53,7 +53,7 @@ impl<'ctx> DatabaseCache<'ctx> { &mut self, index: &Index, txn: &'ctx RoTxn, - word_interner: &Interner, + word_interner: &DedupInterner, word: Interned, ) -> Result> { Self::get_value( @@ -69,7 +69,7 @@ impl<'ctx> DatabaseCache<'ctx> { &mut self, index: &Index, txn: &'ctx RoTxn, - word_interner: &Interner, + word_interner: &DedupInterner, prefix: Interned, ) -> Result> { Self::get_value( @@ -85,7 +85,7 @@ impl<'ctx> DatabaseCache<'ctx> { &mut self, index: &Index, txn: &'ctx RoTxn, - word_interner: &Interner, + word_interner: &DedupInterner, word1: Interned, word2: Interned, proximity: u8, @@ -103,7 +103,7 @@ impl<'ctx> DatabaseCache<'ctx> { &mut self, index: &Index, txn: &'ctx RoTxn, - word_interner: &Interner, + word_interner: &DedupInterner, word1: Interned, prefix2: Interned, proximity: u8, @@ -120,7 +120,7 @@ impl<'ctx> DatabaseCache<'ctx> { &mut self, index: &Index, txn: &'ctx RoTxn, - word_interner: &Interner, + word_interner: &DedupInterner, left_prefix: Interned, right: Interned, proximity: u8, diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 3281ffd2b..5a28ab58a 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -38,13 +38,16 @@ That is we find the documents where either: use roaring::RoaringBitmap; +use super::interner::MappedInterner; use super::logger::SearchLogger; +use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - EdgeCondition, EdgeConditionsCache, EmptyPathsCache, ProximityGraph, RankingRuleGraph, + DeadEndPathCache, EdgeCondition, EdgeConditionDocIdsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; +use crate::search::new::interner::Interned; use crate::Result; pub type Proximity = GraphBasedRankingRule; @@ -79,12 +82,12 @@ pub struct GraphBasedRankingRuleState { /// The current graph graph: RankingRuleGraph, /// Cache to retrieve the docids associated with each edge - edge_conditions_cache: EdgeConditionsCache, + edge_conditions_cache: EdgeConditionDocIdsCache, /// Cache used to optimistically discard paths that resolve to no documents. - empty_paths_cache: EmptyPathsCache, + empty_paths_cache: DeadEndPathCache, /// A structure giving the list of possible costs from each node to the end node, /// along with a set of unavoidable edges that must be traversed to achieve that distance. 
- all_distances: Vec>, + all_distances: MappedInterner)>, QueryNode>, /// An index in the first element of `all_distances`, giving the cost of the next bucket cur_distance_idx: usize, } @@ -95,12 +98,12 @@ pub struct GraphBasedRankingRuleState { fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( ctx: &mut SearchContext<'ctx>, graph: &mut RankingRuleGraph, - edge_docids_cache: &mut EdgeConditionsCache, + edge_docids_cache: &mut EdgeConditionDocIdsCache, universe: &RoaringBitmap, - empty_paths_cache: &mut EmptyPathsCache, + empty_paths_cache: &mut DeadEndPathCache, ) -> Result<()> { - for edge_index in 0..graph.edges_store.len() as u16 { - let Some(edge) = graph.edges_store[edge_index as usize].as_ref() else { + for edge_id in graph.edges_store.indexes() { + let Some(edge) = graph.edges_store.get(edge_id).as_ref() else { continue; }; let condition = edge.condition; @@ -110,8 +113,8 @@ fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( EdgeCondition::Conditional(condition) => { let docids = edge_docids_cache.get_edge_docids(ctx, condition, graph, universe)?; if docids.is_disjoint(universe) { - graph.remove_ranking_rule_edge(edge_index); - empty_paths_cache.forbid_edge(edge_index); + graph.remove_edges_with_condition(condition); + empty_paths_cache.add_condition(condition); edge_docids_cache.cache.remove(&condition); continue; } @@ -133,8 +136,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase query_graph: &QueryGraph, ) -> Result<()> { let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; - let mut edge_docids_cache = EdgeConditionsCache::default(); - let mut empty_paths_cache = EmptyPathsCache::new(graph.edges_store.len() as u16); + let mut edge_docids_cache = EdgeConditionDocIdsCache::default(); + let mut empty_paths_cache = DeadEndPathCache::new(&graph.conditions_interner); // First simplify the graph as much as possible, by computing the docids of the edges // within the rule's universe and removing the edges that have no associated docids. @@ -187,7 +190,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // If the cur_distance_idx does not point to a valid cost in the `all_distances` // structure, then we have computed all the buckets and can return. if state.cur_distance_idx - >= state.all_distances[state.graph.query_graph.root_node as usize].len() + >= state.all_distances.get(state.graph.query_graph.root_node).len() { self.state = None; return Ok(None); @@ -195,7 +198,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // Retrieve the cost of the paths to compute let (cost, _) = - state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx]; + state.all_distances.get(state.graph.query_graph.root_node)[state.cur_distance_idx]; state.cur_distance_idx += 1; let mut bucket = RoaringBitmap::new(); @@ -226,7 +229,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // Updating the empty_paths_cache helps speed up the execution of `visit_paths_of_cost` and reduces // the number of future candidate paths given by that same function. graph.visit_paths_of_cost( - graph.query_graph.root_node as usize, + graph.query_graph.root_node, cost, all_distances, empty_paths_cache, @@ -237,29 +240,27 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // We store the edges and their docids in vectors in case the path turns out to be // empty and we need to figure out why it was empty. 
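// Editor's note: a self-contained sketch of the dead-end bookkeeping that the
// loop below performs (toy types and names, not milli's `DeadEndPathCache`;
// condition ids are bare `u16`s here). Three facts can be learned from an
// empty path: one condition matches no documents, a whole prefix of
// conditions is unsatisfiable, or two conditions exclude each other within
// the current universe. Each recorded fact lets later candidate paths be
// skipped without resolving any docids.
use std::collections::HashSet;

#[derive(Default)]
struct ToyDeadEnds {
    empty_conditions: HashSet<u16>,
    forbidden_prefixes: Vec<Vec<u16>>,
    forbidden_couples: HashSet<(u16, u16)>,
}

impl ToyDeadEnds {
    fn add_condition(&mut self, c: u16) {
        self.empty_conditions.insert(c);
    }
    fn add_prefix(&mut self, prefix: &[u16]) {
        self.forbidden_prefixes.push(prefix.to_vec());
    }
    fn add_condition_couple(&mut self, past: u16, current: u16) {
        self.forbidden_couples.insert((past, current));
    }
    fn path_is_dead(&self, path: &[u16]) -> bool {
        path.iter().any(|c| self.empty_conditions.contains(c))
            || self.forbidden_prefixes.iter().any(|p| path.starts_with(p))
            || self.forbidden_couples.iter().any(|&(a, b)| path.contains(&a) && path.contains(&b))
    }
}

fn main() {
    let mut cache = ToyDeadEnds::default();
    cache.add_condition(9);
    cache.add_prefix(&[2, 4]);
    cache.add_condition_couple(3, 7);
    assert!(cache.path_is_dead(&[1, 3, 7]));
    assert!(cache.path_is_dead(&[9]));
    assert!(cache.path_is_dead(&[2, 4, 5]));
    assert!(!cache.path_is_dead(&[1, 3]));
    println!("ok");
}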
- let mut visited_edges = vec![]; - let mut cached_edge_docids = vec![]; + let mut visited_conditions = vec![]; + let mut cached_edge_docids = + graph.conditions_interner.map(|_| RoaringBitmap::new()); - for &edge_index in path { - visited_edges.push(edge_index); - let edge = graph.edges_store[edge_index as usize].as_ref().unwrap(); - let condition = match edge.condition { - EdgeCondition::Unconditional => continue, - EdgeCondition::Conditional(condition) => condition, - }; + for &condition_interned_raw in path { + let condition = Interned::new(condition_interned_raw); + visited_conditions.push(condition_interned_raw); let edge_docids = edge_docids_cache.get_edge_docids(ctx, condition, graph, &universe)?; - cached_edge_docids.push((edge_index, edge_docids.clone())); + *cached_edge_docids.get_mut(condition) = edge_docids.clone(); // If the edge is empty, then the path will be empty as well, we update the graph // and caches accordingly and skip to the next candidate path. if edge_docids.is_disjoint(&universe) { // 1. Store in the cache that this edge is empty for this universe - empty_paths_cache.forbid_edge(edge_index); + empty_paths_cache.add_condition(condition); // 2. remove this edge from the ranking rule graph - graph.remove_ranking_rule_edge(edge_index); + // ouch, no! :( need to link a condition to one or more ranking rule edges + graph.remove_edges_with_condition(condition); // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore edge_docids_cache.cache.remove(&condition); return Ok(()); @@ -270,17 +271,18 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase if path_docids.is_disjoint(&universe) { // First, we know that this path is empty, and thus any path // that is a superset of it will also be empty. - empty_paths_cache.forbid_prefix(&visited_edges); + empty_paths_cache.add_prefix(&visited_conditions); // Second, if the intersection between this edge and any // previous one is disjoint with the universe, // then we also know that any path containing the same couple of // edges will also be empty. - for (edge_index2, edge_docids2) in - cached_edge_docids[..cached_edge_docids.len() - 1].iter() - { + for (past_condition, edge_docids2) in cached_edge_docids.iter() { + if past_condition == condition { + continue; + }; let intersection = edge_docids & edge_docids2; if intersection.is_disjoint(&universe) { - empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index); + empty_paths_cache.add_condition_couple(past_condition, condition); } } // We should maybe instead try to compute: @@ -291,6 +293,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase bucket |= &path_docids; // Reduce the size of the universe so that we can more optimistically discard candidate paths universe -= path_docids; + // TODO: if the universe is empty, stop iterating Ok(()) }, )?; diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index 55c343cd5..da8473e92 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -1,3 +1,4 @@ +use std::fmt; use std::hash::Hash; use std::marker::PhantomData; @@ -5,14 +6,16 @@ use fxhash::FxHashMap; /// An index within a [`Interner`] structure. 
pub struct Interned { - idx: u32, + idx: u16, _phantom: PhantomData, } - impl Interned { - fn new(idx: u32) -> Self { + pub fn new(idx: u16) -> Self { Self { idx, _phantom: PhantomData } } + pub fn into_inner(self) -> u16 { + self.idx + } } // TODO: the stable store should be replaced by a bump allocator @@ -34,17 +37,22 @@ impl Interned { /// be copied, compared, and hashed efficiently. An immutable reference to the original value /// can be retrieved using `self.get(interned)`. #[derive(Clone)] -pub struct Interner { +pub struct DedupInterner { stable_store: Vec, lookup: FxHashMap>, } -impl Default for Interner { +impl Default for DedupInterner { fn default() -> Self { Self { stable_store: Default::default(), lookup: Default::default() } } } +impl DedupInterner { + pub fn freeze(self) -> FixedSizeInterner { + FixedSizeInterner { stable_store: self.stable_store } + } +} -impl Interner +impl DedupInterner where T: Clone + Eq + Hash, { @@ -52,8 +60,9 @@ where if let Some(interned) = self.lookup.get(&s) { *interned } else { + assert!(self.stable_store.len() < u16::MAX as usize); self.stable_store.push(s.clone()); - let interned = Interned::new(self.stable_store.len() as u32 - 1); + let interned = Interned::new(self.stable_store.len() as u16 - 1); self.lookup.insert(s, interned); interned } @@ -62,7 +71,93 @@ where &self.stable_store[interned.idx as usize] } } +#[derive(Clone)] +pub struct Interner { + stable_store: Vec, +} +impl Default for Interner { + fn default() -> Self { + Self { stable_store: Default::default() } + } +} +impl Interner { + pub fn freeze(self) -> FixedSizeInterner { + FixedSizeInterner { stable_store: self.stable_store } + } + pub fn push(&mut self, s: T) -> Interned { + assert!(self.stable_store.len() < u16::MAX as usize); + self.stable_store.push(s); + Interned::new(self.stable_store.len() as u16 - 1) + } +} +#[derive(Clone)] +pub struct FixedSizeInterner { + stable_store: Vec, +} +impl FixedSizeInterner { + pub fn new(length: u16, value: T) -> Self { + Self { stable_store: vec![value; length as usize] } + } +} + +impl FixedSizeInterner { + pub fn from_vec(store: Vec) -> Self { + Self { stable_store: store } + } + pub fn get(&self, interned: Interned) -> &T { + &self.stable_store[interned.idx as usize] + } + pub fn get_mut(&mut self, interned: Interned) -> &mut T { + &mut self.stable_store[interned.idx as usize] + } + + pub fn len(&self) -> u16 { + self.stable_store.len() as u16 + } + + pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { + MappedInterner { + stable_store: self.stable_store.iter().map(map_f).collect(), + _phantom: PhantomData, + } + } + pub fn indexes(&self) -> impl Iterator> { + (0..self.stable_store.len()).map(|i| Interned::new(i as u16)) + } + pub fn iter(&self) -> impl Iterator, &T)> { + self.stable_store.iter().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + } + pub fn iter_mut(&mut self) -> impl Iterator, &mut T)> { + self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + } +} +#[derive(Clone)] +pub struct MappedInterner { + stable_store: Vec, + _phantom: PhantomData, +} + +impl MappedInterner { + pub fn get(&self, interned: Interned) -> &T { + &self.stable_store[interned.idx as usize] + } + pub fn get_mut(&mut self, interned: Interned) -> &mut T { + &mut self.stable_store[interned.idx as usize] + } + pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { + MappedInterner { + stable_store: self.stable_store.iter().map(map_f).collect(), + _phantom: PhantomData, + } + } + pub fn iter(&self) 
-> impl Iterator, &T)> { + self.stable_store.iter().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + } + pub fn iter_mut(&mut self) -> impl Iterator, &mut T)> { + self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + } +} // Interned boilerplate implementations impl Hash for Interned { @@ -97,3 +192,14 @@ impl Clone for Interned { } impl Copy for Interned {} + +impl fmt::Display for Interned { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.idx, f) + } +} +impl fmt::Debug for Interned { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.idx, f) + } +} diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 575d5b0bf..fb5a296bd 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -6,10 +6,12 @@ use std::time::Instant; use rand::random; use roaring::RoaringBitmap; +use crate::search::new::interner::{Interned, MappedInterner}; +use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::ranking_rule_graph::{ - Edge, EdgeCondition, EmptyPathsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, - TypoGraph, + DeadEndPathCache, Edge, EdgeCondition, ProximityEdge, ProximityGraph, RankingRuleGraph, + RankingRuleGraphTrait, TypoEdge, TypoGraph, }; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; @@ -42,17 +44,17 @@ pub enum SearchEvents { ProximityState { graph: RankingRuleGraph, paths: Vec>, - empty_paths_cache: EmptyPathsCache, + empty_paths_cache: DeadEndPathCache, universe: RoaringBitmap, - distances: Vec>, + distances: MappedInterner)>, QueryNode>, cost: u16, }, TypoState { graph: RankingRuleGraph, paths: Vec>, - empty_paths_cache: EmptyPathsCache, + empty_paths_cache: DeadEndPathCache, universe: RoaringBitmap, - distances: Vec>, + distances: MappedInterner)>, QueryNode>, cost: u16, }, RankingRuleSkipBucket { @@ -168,9 +170,9 @@ impl SearchLogger for DetailedSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: Vec>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ) { self.events.push(SearchEvents::ProximityState { @@ -178,7 +180,7 @@ impl SearchLogger for DetailedSearchLogger { paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), - distances, + distances: distances.clone(), cost, }) } @@ -187,9 +189,9 @@ impl SearchLogger for DetailedSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: Vec>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ) { self.events.push(SearchEvents::TypoState { @@ -197,7 +199,7 @@ impl SearchLogger for DetailedSearchLogger { paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), - distances, + distances: distances.clone(), cost, }) } @@ -424,15 +426,15 @@ results.{random} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc( + fn query_node_d2_desc( ctx: &mut SearchContext, - node_idx: usize, + node_idx: Interned, node: &QueryNode, - distances: &[(u16, SmallBitmap)], + distances: &[(u16, SmallBitmap)], file: &mut 
File, ) { - match &node { - QueryNode::Term(LocatedQueryTerm { value, .. }) => { + match &node.data { + QueryNodeData::Term(LocatedQueryTerm { value, .. }) => { let QueryTerm { original, zero_typo, @@ -496,11 +498,11 @@ shape: class" writeln!(file, "}}").unwrap(); } - QueryNode::Deleted => panic!(), - QueryNode::Start => { + QueryNodeData::Deleted => panic!(), + QueryNodeData::Start => { writeln!(file, "{node_idx} : START").unwrap(); } - QueryNode::End => { + QueryNodeData::End => { writeln!(file, "{node_idx} : END").unwrap(); } } @@ -511,14 +513,14 @@ shape: class" file: &mut File, ) { writeln!(file, "direction: right").unwrap(); - for node in 0..query_graph.nodes.len() { - if matches!(query_graph.nodes[node], QueryNode::Deleted) { + for (node_id, node) in query_graph.nodes.iter() { + if matches!(node.data, QueryNodeData::Deleted) { continue; } - Self::query_node_d2_desc(ctx, node, &query_graph.nodes[node], &[], file); + Self::query_node_d2_desc::(ctx, node_id, node, &[], file); - for edge in query_graph.edges[node].successors.iter() { - writeln!(file, "{node} -> {edge};\n").unwrap(); + for edge in node.successors.iter() { + writeln!(file, "{node_id} -> {edge};\n").unwrap(); } } } @@ -526,31 +528,28 @@ shape: class" ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], - _empty_paths_cache: &EmptyPathsCache, - distances: Vec>, + _empty_paths_cache: &DeadEndPathCache, + distances: MappedInterner)>, QueryNode>, file: &mut File, ) { writeln!(file, "direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); - for (node_idx, node) in graph.query_graph.nodes.iter().enumerate() { - if matches!(node, QueryNode::Deleted) { + for (node_idx, node) in graph.query_graph.nodes.iter() { + if matches!(&node.data, QueryNodeData::Deleted) { continue; } - let distances = &distances[node_idx]; - Self::query_node_d2_desc(ctx, node_idx, node, distances.as_slice(), file); + let distances = &distances.get(node_idx); + Self::query_node_d2_desc::(ctx, node_idx, node, distances, file); } - for edge in graph.edges_store.iter().flatten() { - let Edge { source_node, dest_node, condition: details, .. } = edge; + for (_edge_id, edge) in graph.edges_store.iter() { + let Some(edge) = edge else { continue }; + let Edge { source_node, dest_node, condition: details, cost } = edge; match &details { EdgeCondition::Unconditional => { - writeln!( - file, - "{source_node} -> {dest_node} : \"always cost {cost}\"", - cost = edge.cost, - ) - .unwrap(); + writeln!(file, "{source_node} -> {dest_node} : \"always cost {cost}\"",) + .unwrap(); } EdgeCondition::Conditional(condition) => { let condition = graph.conditions_interner.get(*condition); @@ -590,39 +589,19 @@ shape: class" // } // writeln!(file, "}}").unwrap(); } - fn edge_d2_description( - ctx: &mut SearchContext, + fn condition_d2_description( + _ctx: &mut SearchContext, graph: &RankingRuleGraph, - edge_idx: u16, + condition_id: Interned, file: &mut File, ) { - let Edge { source_node, dest_node, cost, .. 
} = - graph.edges_store[edge_idx as usize].as_ref().unwrap(); - let source_node = &graph.query_graph.nodes[*source_node as usize]; - let source_node_desc = match source_node { - QueryNode::Term(term) => { - let term = ctx.term_interner.get(term.value); - ctx.word_interner.get(term.original).to_owned() - } - QueryNode::Deleted => panic!(), - QueryNode::Start => "START".to_owned(), - QueryNode::End => "END".to_owned(), - }; - let dest_node = &graph.query_graph.nodes[*dest_node as usize]; - let dest_node_desc = match dest_node { - QueryNode::Term(term) => { - let term = ctx.term_interner.get(term.value); - ctx.word_interner.get(term.original).to_owned() - } - QueryNode::Deleted => panic!(), - QueryNode::Start => "START".to_owned(), - QueryNode::End => "END".to_owned(), - }; + let condition = graph.conditions_interner.get(condition_id); writeln!( file, - "{edge_idx}: \"{source_node_desc}->{dest_node_desc} [{cost}]\" {{ + "{condition_id}: \"{}\" {{ shape: class - }}" + }}", + R::label_for_edge_condition(condition) ) .unwrap(); } @@ -632,12 +611,12 @@ shape: class" paths: &[Vec], file: &mut File, ) { - for (path_idx, edge_indexes) in paths.iter().enumerate() { + for (path_idx, condition_indexes) in paths.iter().enumerate() { writeln!(file, "{path_idx} {{").unwrap(); - for edge_idx in edge_indexes.iter() { - Self::edge_d2_description(ctx, graph, *edge_idx, file); + for condition in condition_indexes.iter() { + Self::condition_d2_description(ctx, graph, Interned::new(*condition), file); } - for couple_edges in edge_indexes.windows(2) { + for couple_edges in condition_indexes.windows(2) { let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() }; writeln!(file, "{src_edge_idx} -> {dest_edge_idx}").unwrap(); } diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index c5f3e5351..ff500d4b8 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -3,7 +3,11 @@ pub mod detailed; use roaring::RoaringBitmap; -use super::ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph}; +use super::interner::MappedInterner; +use super::query_graph::QueryNode; +use super::ranking_rule_graph::{ + DeadEndPathCache, ProximityEdge, ProximityGraph, RankingRuleGraph, TypoEdge, TypoGraph, +}; use super::small_bitmap::SmallBitmap; use super::{RankingRule, RankingRuleQueryTrait}; @@ -62,9 +66,9 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: Vec>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ); @@ -73,9 +77,9 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: Vec>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ); } @@ -133,9 +137,9 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths_map: &[Vec], - _empty_paths_cache: &EmptyPathsCache, + _empty_paths_cache: &DeadEndPathCache, _universe: &RoaringBitmap, - _distances: Vec>, + _distances: &MappedInterner)>, QueryNode>, _cost: u16, ) { } @@ -144,9 +148,9 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths: &[Vec], - _empty_paths_cache: &EmptyPathsCache, + _empty_paths_cache: &DeadEndPathCache, _universe: &RoaringBitmap, - _distances: Vec>, + _distances: 
&MappedInterner)>, QueryNode>, _cost: u16, ) { } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 1eaa6d347..11420545c 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -26,7 +26,8 @@ use query_graph::{QueryGraph, QueryNode}; pub use ranking_rules::{bucket_sort, RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use roaring::RoaringBitmap; -use self::interner::Interner; +use self::interner::DedupInterner; +use self::query_graph::QueryNodeData; use self::query_term::{Phrase, QueryTerm}; use self::ranking_rules::PlaceholderQuery; use self::resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; @@ -39,9 +40,9 @@ pub struct SearchContext<'ctx> { pub index: &'ctx Index, pub txn: &'ctx RoTxn<'ctx>, pub db_cache: DatabaseCache<'ctx>, - pub word_interner: Interner, - pub phrase_interner: Interner, - pub term_interner: Interner, + pub word_interner: DedupInterner, + pub phrase_interner: DedupInterner, + pub term_interner: DedupInterner, pub term_docids: QueryTermDocIdsCache, } impl<'ctx> SearchContext<'ctx> { @@ -70,12 +71,12 @@ fn resolve_maximally_reduced_query_graph<'ctx>( let mut positions_to_remove = match matching_strategy { TermsMatchingStrategy::Last => { let mut all_positions = BTreeSet::new(); - for n in query_graph.nodes.iter() { - match n { - QueryNode::Term(term) => { + for (_, n) in query_graph.nodes.iter() { + match &n.data { + QueryNodeData::Term(term) => { all_positions.extend(term.positions.clone().into_iter()); } - QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {} } } all_positions.into_iter().collect() @@ -200,7 +201,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( continue; } asc.insert(field); - todo!(); + // TODO } crate::Criterion::Desc(field) => { if desc.contains(&field) { @@ -295,45 +296,48 @@ mod tests { println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - // loop { - let start = Instant::now(); + loop { + let start = Instant::now(); - let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); - let mut ctx = SearchContext::new(&index, &txn); - let results = execute_search( - &mut ctx, - "sun flower s are the best", - TermsMatchingStrategy::Last, - None, - 0, - 20, - &mut DefaultSearchLogger, - &mut logger, - ) - .unwrap(); + // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let results = execute_search( + &mut ctx, + // "which a the releases from poison by the government", + // "sun flower s are the best", + "zero config", + TermsMatchingStrategy::Last, + None, + 0, + 20, + &mut DefaultSearchLogger, + &mut DefaultSearchLogger, + //&mut logger, + ) + .unwrap(); - logger.write_d2_description(&mut ctx); + // logger.write_d2_description(&mut ctx); - let elapsed = start.elapsed(); - println!("{}us", elapsed.as_micros()); + let elapsed = start.elapsed(); + println!("{}us", elapsed.as_micros()); - let _documents = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); + let _documents 
= index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) + }) + .collect::>(); - println!("{}us: {:?}", elapsed.as_micros(), results); - // } + println!("{}us: {:?}", elapsed.as_micros(), results); + } // for (id, _document) in documents { // println!("{id}:"); // // println!("{document}"); diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 7bed15571..d487a644f 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,10 +1,11 @@ +use std::collections::HashSet; + +use super::interner::{FixedSizeInterner, Interned}; use super::query_term::{self, number_of_typos_allowed, LocatedQueryTerm}; use super::small_bitmap::SmallBitmap; use super::SearchContext; use crate::Result; -pub const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64; - /// A node of the [`QueryGraph`]. /// /// There are four types of nodes: @@ -15,22 +16,19 @@ pub const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64; /// 4. `Term` is a regular node representing a word or combination of words /// from the user query. #[derive(Clone)] -pub enum QueryNode { +pub struct QueryNode { + pub data: QueryNodeData, + pub predecessors: SmallBitmap, + pub successors: SmallBitmap, +} +#[derive(Clone)] +pub enum QueryNodeData { Term(LocatedQueryTerm), Deleted, Start, End, } -/// The edges associated with a node in the query graph. -#[derive(Clone)] -pub struct Edges { - /// Set of nodes which have an edge going to the current node - pub predecessors: SmallBitmap, - /// Set of nodes which are reached by an edge from the current node - pub successors: SmallBitmap, -} - /** A graph representing all the ways to interpret the user's search query. @@ -78,55 +76,45 @@ and the transformations that were done on the query graph). #[derive(Clone)] pub struct QueryGraph { /// The index of the start node within `self.nodes` - pub root_node: u16, + pub root_node: Interned, /// The index of the end node within `self.nodes` - pub end_node: u16, + pub end_node: Interned, /// The list of all query nodes - pub nodes: Vec, - /// The list of all node edges - pub edges: Vec, + pub nodes: FixedSizeInterner, } -impl Default for QueryGraph { - /// Create a new QueryGraph with two disconnected nodes: the root and end nodes. - fn default() -> Self { - let nodes = vec![QueryNode::Start, QueryNode::End]; - let edges = vec![ - Edges { - predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - }, - Edges { - predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - }, - ]; +// impl Default for QueryGraph { +// /// Create a new QueryGraph with two disconnected nodes: the root and end nodes. 
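// Editorial sketch, not part of this patch: the interner module itself is not
// shown in this excerpt, so the following is an assumed, simplified shape of
// the pattern the refactor relies on. An `Interned<T>` is just a `u16` index
// tagged with a phantom type, so graph nodes can hold cheap, copyable,
// type-checked handles instead of owned values.

use std::marker::PhantomData;

pub struct Interned<T> {
    idx: u16,
    _marker: PhantomData<T>,
}
impl<T> Clone for Interned<T> {
    fn clone(&self) -> Self {
        *self
    }
}
impl<T> Copy for Interned<T> {}

pub struct FixedSizeInterner<T> {
    stored: Vec<T>,
}
impl<T> FixedSizeInterner<T> {
    pub fn get(&self, i: Interned<T>) -> &T {
        &self.stored[i.idx as usize]
    }
    pub fn get_mut(&mut self, i: Interned<T>) -> &mut T {
        &mut self.stored[i.idx as usize]
    }
    // Build a parallel structure with one `U` per interned `T`; the same
    // `Interned<T>` handles then index into both (the `MappedInterner` idea).
    pub fn map<U>(&self, f: impl Fn(&T) -> U) -> Vec<U> {
        self.stored.iter().map(f).collect()
    }
}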
+// fn default() -> Self { +// let nodes = vec![ +// QueryNode { +// data: QueryNodeData::Start, +// predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), +// successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), +// }, +// QueryNode { +// data: QueryNodeData::End, +// predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), +// successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), +// }, +// ]; - Self { root_node: 0, end_node: 1, nodes, edges } - } -} +// Self { root_node: 0, end_node: 1, nodes } +// } +// } impl QueryGraph { /// Connect all the given predecessor nodes to the given successor node - fn connect_to_node(&mut self, from_nodes: &[u16], to_node: u16) { + fn connect_to_node( + &mut self, + from_nodes: &[Interned], + to_node: Interned, + ) { for &from_node in from_nodes { - self.edges[from_node as usize].successors.insert(to_node); - self.edges[to_node as usize].predecessors.insert(from_node); + self.nodes.get_mut(from_node).successors.insert(to_node); + self.nodes.get_mut(to_node).predecessors.insert(from_node); } } - /// Add the given node to the graph and connect it to all the given predecessor nodes - fn add_node(&mut self, from_nodes: &[u16], node: QueryNode) -> u16 { - let new_node_idx = self.nodes.len() as u16; - assert!(new_node_idx <= QUERY_GRAPH_NODE_LENGTH_LIMIT); - self.nodes.push(node); - self.edges.push(Edges { - predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - }); - self.connect_to_node(from_nodes, new_node_idx); - - new_node_idx - } } impl QueryGraph { @@ -136,17 +124,27 @@ impl QueryGraph { let mut empty_nodes = vec![]; - let mut graph = QueryGraph::default(); + let mut predecessors: Vec> = vec![HashSet::new(), HashSet::new()]; + let mut successors: Vec> = vec![HashSet::new(), HashSet::new()]; + let mut nodes_data: Vec = vec![QueryNodeData::Start, QueryNodeData::End]; + let root_node = 0; + let end_node = 1; // TODO: we could consider generalizing to 4,5,6,7,etc. ngrams let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = - (vec![], vec![], vec![graph.root_node]); + (vec![], vec![], vec![root_node]); for term_idx in 0..terms.len() { let term0 = &terms[term_idx]; let mut new_nodes = vec![]; - let new_node_idx = graph.add_node(&prev0, QueryNode::Term(term0.clone())); + let new_node_idx = add_node( + &mut nodes_data, + QueryNodeData::Term(term0.clone()), + &prev0, + &mut successors, + &mut predecessors, + ); new_nodes.push(new_node_idx); if term0.is_empty(&ctx.term_interner) { empty_nodes.push(new_node_idx); @@ -156,7 +154,13 @@ impl QueryGraph { if let Some(ngram) = query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)? { - let ngram_idx = graph.add_node(&prev1, QueryNode::Term(ngram)); + let ngram_idx = add_node( + &mut nodes_data, + QueryNodeData::Term(ngram), + &prev1, + &mut successors, + &mut predecessors, + ); new_nodes.push(ngram_idx); } } @@ -164,53 +168,96 @@ impl QueryGraph { if let Some(ngram) = query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)? 
{ - let ngram_idx = graph.add_node(&prev2, QueryNode::Term(ngram)); + let ngram_idx = add_node( + &mut nodes_data, + QueryNodeData::Term(ngram), + &prev2, + &mut successors, + &mut predecessors, + ); new_nodes.push(ngram_idx); } } (prev0, prev1, prev2) = (new_nodes, prev0, prev1); } - graph.connect_to_node(&prev0, graph.end_node); + let root_node = Interned::new(root_node); + let end_node = Interned::new(end_node); + let mut nodes = FixedSizeInterner::new( + nodes_data.len() as u16, + QueryNode { + data: QueryNodeData::Deleted, + predecessors: SmallBitmap::new(nodes_data.len() as u16), + successors: SmallBitmap::new(nodes_data.len() as u16), + }, + ); + for (node_idx, ((node_data, predecessors), successors)) in nodes_data + .into_iter() + .zip(predecessors.into_iter()) + .zip(successors.into_iter()) + .enumerate() + { + let node = nodes.get_mut(Interned::new(node_idx as u16)); + node.data = node_data; + for x in predecessors { + node.predecessors.insert(Interned::new(x)); + } + for x in successors { + node.successors.insert(Interned::new(x)); + } + } + let mut graph = QueryGraph { root_node, end_node, nodes }; + + graph.connect_to_node( + prev0.into_iter().map(Interned::new).collect::>().as_slice(), + end_node, + ); + let empty_nodes = empty_nodes.into_iter().map(Interned::new).collect::>(); graph.remove_nodes_keep_edges(&empty_nodes); Ok(graph) } /// Remove the given nodes and all their edges from the query graph. - pub fn remove_nodes(&mut self, nodes: &[u16]) { - for &node in nodes { - self.nodes[node as usize] = QueryNode::Deleted; - let edges = self.edges[node as usize].clone(); - for pred in edges.predecessors.iter() { - self.edges[pred as usize].successors.remove(node); + pub fn remove_nodes(&mut self, nodes: &[Interned]) { + for &node_id in nodes { + let node = &self.nodes.get(node_id); + let old_node_pred = node.predecessors.clone(); + let old_node_succ = node.successors.clone(); + + for pred in old_node_pred.iter() { + self.nodes.get_mut(pred).successors.remove(node_id); } - for succ in edges.successors.iter() { - self.edges[succ as usize].predecessors.remove(node); + for succ in old_node_succ.iter() { + self.nodes.get_mut(succ).predecessors.remove(node_id); } - self.edges[node as usize] = Edges { - predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - }; + + let node = self.nodes.get_mut(node_id); + node.data = QueryNodeData::Deleted; + node.predecessors.clear(); + node.successors.clear(); } } /// Remove the given nodes, connecting all their predecessors to all their successors. 
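// Editorial illustration of the doc comment above, with plain HashSets
// standing in for SmallBitmap: on a toy graph A -> B -> C, removing B while
// keeping edges must leave A -> C, hence the union of the removed node's
// successors into every predecessor (and vice versa) in the hunk below.

use std::collections::{HashMap, HashSet};

fn remove_node_keep_edges(
    succ: &mut HashMap<char, HashSet<char>>,
    pred: &mut HashMap<char, HashSet<char>>,
    node: char,
) {
    let old_pred = pred.remove(&node).unwrap_or_default();
    let old_succ = succ.remove(&node).unwrap_or_default();
    for p in &old_pred {
        let s = succ.entry(*p).or_default();
        s.remove(&node);
        s.extend(old_succ.iter().copied()); // predecessor inherits the successors
    }
    for s in &old_succ {
        let p = pred.entry(*s).or_default();
        p.remove(&node);
        p.extend(old_pred.iter().copied()); // successor inherits the predecessors
    }
}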
- pub fn remove_nodes_keep_edges(&mut self, nodes: &[u16]) { - for &node in nodes { - self.nodes[node as usize] = QueryNode::Deleted; - let edges = self.edges[node as usize].clone(); - for pred in edges.predecessors.iter() { - self.edges[pred as usize].successors.remove(node); - self.edges[pred as usize].successors.union(&edges.successors); + pub fn remove_nodes_keep_edges(&mut self, nodes: &[Interned]) { + for &node_id in nodes { + let node = self.nodes.get(node_id); + let old_node_pred = node.predecessors.clone(); + let old_node_succ = node.successors.clone(); + for pred in old_node_pred.iter() { + let pred_successors = &mut self.nodes.get_mut(pred).successors; + pred_successors.remove(node_id); + pred_successors.union(&old_node_succ); } - for succ in edges.successors.iter() { - self.edges[succ as usize].predecessors.remove(node); - self.edges[succ as usize].predecessors.union(&edges.predecessors); + for succ in old_node_succ.iter() { + let succ_predecessors = &mut self.nodes.get_mut(succ).predecessors; + succ_predecessors.remove(node_id); + succ_predecessors.union(&old_node_pred); } - self.edges[node as usize] = Edges { - predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - }; + let node = self.nodes.get_mut(node_id); + node.data = QueryNodeData::Deleted; + node.predecessors.clear(); + node.successors.clear(); } } @@ -219,9 +266,8 @@ impl QueryGraph { /// Return `true` if any node was removed. pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool { let mut nodes_to_remove_keeping_edges = vec![]; - for (node_idx, node) in self.nodes.iter().enumerate() { - let node_idx = node_idx as u16; - let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue }; + for (node_idx, node) in self.nodes.iter() { + let QueryNodeData::Term(LocatedQueryTerm { value: _, positions }) = &node.data else { continue }; if positions.start() == &position { nodes_to_remove_keeping_edges.push(node_idx); } @@ -238,13 +284,13 @@ impl QueryGraph { fn simplify(&mut self) { loop { let mut nodes_to_remove = vec![]; - for (node_idx, node) in self.nodes.iter().enumerate() { - if (!matches!(node, QueryNode::End | QueryNode::Deleted) - && self.edges[node_idx].successors.is_empty()) - || (!matches!(node, QueryNode::Start | QueryNode::Deleted) - && self.edges[node_idx].predecessors.is_empty()) + for (node_idx, node) in self.nodes.iter() { + if (!matches!(node.data, QueryNodeData::End | QueryNodeData::Deleted) + && node.successors.is_empty()) + || (!matches!(node.data, QueryNodeData::Start | QueryNodeData::Deleted) + && node.predecessors.is_empty()) { - nodes_to_remove.push(node_idx as u16); + nodes_to_remove.push(node_idx); } } if nodes_to_remove.is_empty() { @@ -255,3 +301,21 @@ impl QueryGraph { } } } + +fn add_node( + nodes_data: &mut Vec, + node_data: QueryNodeData, + from_nodes: &Vec, + successors: &mut Vec>, + predecessors: &mut Vec>, +) -> u16 { + successors.push(HashSet::new()); + predecessors.push(HashSet::new()); + let new_node_idx = nodes_data.len() as u16; + nodes_data.push(node_data); + for &from_node in from_nodes { + successors[from_node as usize].insert(new_node_idx); + predecessors[new_node_idx as usize].insert(from_node); + } + new_node_idx +} diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index c6cb81131..3272464f8 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -9,7 +9,7 @@ use heed::types::DecodeIgnore; use 
heed::RoTxn; use itertools::Itertools; -use super::interner::{Interned, Interner}; +use super::interner::{DedupInterner, Interned}; use super::SearchContext; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; @@ -22,7 +22,7 @@ pub struct Phrase { pub words: Vec>>, } impl Phrase { - pub fn description(&self, interner: &Interner) -> String { + pub fn description(&self, interner: &DedupInterner) -> String { self.words.iter().flatten().map(|w| interner.get(*w)).join(" ") } } @@ -60,8 +60,8 @@ pub struct QueryTerm { } impl QueryTerm { pub fn phrase( - word_interner: &mut Interner, - phrase_interner: &mut Interner, + word_interner: &mut DedupInterner, + phrase_interner: &mut DedupInterner, phrase: Phrase, ) -> Self { Self { @@ -78,7 +78,7 @@ impl QueryTerm { is_ngram: false, } } - pub fn empty(word_interner: &mut Interner, original: &str) -> Self { + pub fn empty(word_interner: &mut DedupInterner, original: &str) -> Self { Self { original: word_interner.insert(original.to_owned()), phrase: None, @@ -313,7 +313,7 @@ pub struct LocatedQueryTerm { impl LocatedQueryTerm { /// Return `true` iff the term is empty - pub fn is_empty(&self, interner: &Interner) -> bool { + pub fn is_empty(&self, interner: &DedupInterner) -> bool { interner.get(self.value).is_empty() } } diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 286a98ab1..7ab08aceb 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,7 +1,7 @@ use std::collections::HashSet; use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::interner::Interner; +use crate::search::new::interner::{DedupInterner, Interner}; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, SearchContext}; use crate::Result; @@ -15,40 +15,43 @@ impl RankingRuleGraph { /// Build the ranking rule graph from the given query graph pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { - let QueryGraph { nodes: graph_nodes, edges: graph_edges, .. } = &query_graph; + let QueryGraph { nodes: graph_nodes, .. 
} = &query_graph; - let mut conditions_interner = Interner::default(); + let mut conditions_interner = DedupInterner::default(); - let mut edges_store = vec![]; - let mut edges_of_node = vec![]; + let mut edges_store = Interner::default(); + let mut edges_of_node = query_graph.nodes.map(|_| HashSet::new()); - for (source_idx, source_node) in graph_nodes.iter().enumerate() { - edges_of_node.push(HashSet::new()); - let new_edges = edges_of_node.last_mut().unwrap(); + for (source_id, source_node) in graph_nodes.iter() { + let new_edges = edges_of_node.get_mut(source_id); - for dest_idx in graph_edges[source_idx].successors.iter() { - let dest_node = &graph_nodes[dest_idx as usize]; + for dest_idx in source_node.successors.iter() { + let dest_node = graph_nodes.get(dest_idx); let edges = G::build_edges(ctx, &mut conditions_interner, source_node, dest_node)?; if edges.is_empty() { continue; } for (cost, condition) in edges { - edges_store.push(Some(Edge { - source_node: source_idx as u16, + let new_edge_id = edges_store.push(Some(Edge { + source_node: source_id, dest_node: dest_idx, cost, condition, })); - new_edges.insert(edges_store.len() as u16 - 1); + new_edges.insert(new_edge_id); } } } - let edges_of_node = edges_of_node - .into_iter() - .map(|edges| SmallBitmap::from_iter(edges.into_iter(), edges_store.len() as u16)) - .collect(); + let edges_store = edges_store.freeze(); + let edges_of_node = + edges_of_node.map(|edges| SmallBitmap::from_iter(edges.iter().copied(), &edges_store)); - Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node, conditions_interner }) + Ok(RankingRuleGraph { + query_graph, + edges_store, + edges_of_node, + conditions_interner: conditions_interner.freeze(), + }) } } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 529bb32c4..597aff661 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -3,8 +3,10 @@ use std::collections::btree_map::Entry; use std::collections::{BTreeMap, VecDeque}; -use super::empty_paths_cache::EmptyPathsCache; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::empty_paths_cache::DeadEndPathCache; +use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::interner::{Interned, MappedInterner}; +use crate::search::new::query_graph::QueryNode; use crate::search::new::small_bitmap::SmallBitmap; use crate::Result; @@ -17,11 +19,11 @@ pub struct Path { impl RankingRuleGraph { pub fn visit_paths_of_cost( &mut self, - from: usize, + from: Interned, cost: u16, - all_distances: &[Vec<(u16, SmallBitmap)>], - empty_paths_cache: &mut EmptyPathsCache, - mut visit: impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, + all_distances: &MappedInterner)>, QueryNode>, + empty_paths_cache: &mut DeadEndPathCache, + mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache) -> Result<()>, ) -> Result<()> { let _ = self.visit_paths_of_cost_rec( from, @@ -30,76 +32,108 @@ impl RankingRuleGraph { empty_paths_cache, &mut visit, &mut vec![], - &mut SmallBitmap::new(self.edges_store.len() as u16), - empty_paths_cache.empty_edges.clone(), + &mut SmallBitmap::new(self.edges_store.len()), + &mut empty_paths_cache.conditions.clone(), )?; Ok(()) } pub fn visit_paths_of_cost_rec( &mut self, - from: usize, + from: Interned, cost: u16, - all_distances: &[Vec<(u16, SmallBitmap)>], - empty_paths_cache: &mut EmptyPathsCache, - visit: 
&mut impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, - prev_edges: &mut Vec, - cur_path: &mut SmallBitmap, - mut forbidden_edges: SmallBitmap, + all_distances: &MappedInterner)>, QueryNode>, + empty_paths_cache: &mut DeadEndPathCache, + visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache) -> Result<()>, + prev_conditions: &mut Vec, + cur_path: &mut SmallBitmap, + forbidden_conditions: &mut SmallBitmap, ) -> Result { let mut any_valid = false; - let edges = self.edges_of_node[from].clone(); + let edges = self.edges_of_node.get(from).clone(); for edge_idx in edges.iter() { - let Some(edge) = self.edges_store[edge_idx as usize].as_ref() else { continue }; - if cost < edge.cost as u16 - || forbidden_edges.contains(edge_idx) - || !all_distances[edge.dest_node as usize].iter().any( - |(next_cost, necessary_edges)| { - (*next_cost == cost - edge.cost as u16) - && !forbidden_edges.intersects(necessary_edges) - }, - ) - { + let Some(edge) = self.edges_store.get(edge_idx).as_ref() else { continue }; + if cost < edge.cost as u16 { continue; } - cur_path.insert(edge_idx); - prev_edges.push(edge_idx); + let next_any_valid = match edge.condition { + EdgeCondition::Unconditional => { + if edge.dest_node == self.query_graph.end_node { + any_valid = true; + visit(prev_conditions, self, empty_paths_cache)?; + true + } else { + self.visit_paths_of_cost_rec( + edge.dest_node, + cost - edge.cost as u16, + all_distances, + empty_paths_cache, + visit, + prev_conditions, + cur_path, + forbidden_conditions, + )? + } + } + EdgeCondition::Conditional(condition) => { + if forbidden_conditions.contains(condition) + || !all_distances.get(edge.dest_node).iter().any( + |(next_cost, necessary_conditions)| { + (*next_cost == cost - edge.cost as u16) + && !forbidden_conditions.intersects(necessary_conditions) + }, + ) + { + continue; + } + cur_path.insert(condition); + // TODO: typed path set + prev_conditions.push(condition.into_inner()); - let mut new_forbidden_edges = forbidden_edges.clone(); - new_forbidden_edges.union(&empty_paths_cache.empty_couple_edges[edge_idx as usize]); - empty_paths_cache.empty_prefixes.final_edges_after_prefix(prev_edges, &mut |x| { - new_forbidden_edges.insert(x); - }); - - let next_any_valid = if edge.dest_node == self.query_graph.end_node { - any_valid = true; - visit(prev_edges, self, empty_paths_cache)?; - true - } else { - self.visit_paths_of_cost_rec( - edge.dest_node as usize, - cost - edge.cost as u16, - all_distances, - empty_paths_cache, - visit, - prev_edges, - cur_path, - new_forbidden_edges, - )? + let mut new_forbidden_conditions = forbidden_conditions.clone(); + new_forbidden_conditions + .union(empty_paths_cache.condition_couples.get(condition)); + empty_paths_cache.prefixes.final_edges_after_prefix( + prev_conditions, + &mut |x| { + new_forbidden_conditions.insert(Interned::new(x)); + }, + ); + let next_any_valid = if edge.dest_node == self.query_graph.end_node { + any_valid = true; + visit(prev_conditions, self, empty_paths_cache)?; + true + } else { + self.visit_paths_of_cost_rec( + edge.dest_node, + cost - edge.cost as u16, + all_distances, + empty_paths_cache, + visit, + prev_conditions, + cur_path, + &mut new_forbidden_conditions, + )? 
+ }; + cur_path.remove(condition); + prev_conditions.pop(); + next_any_valid + } }; any_valid |= next_any_valid; - cur_path.remove(edge_idx); - prev_edges.pop(); + if next_any_valid { - if empty_paths_cache.path_is_empty(prev_edges, cur_path) { + if empty_paths_cache.path_is_dead_end(prev_conditions, cur_path) { return Ok(any_valid); } - forbidden_edges.union(&empty_paths_cache.empty_edges); - for edge in prev_edges.iter() { - forbidden_edges.union(&empty_paths_cache.empty_couple_edges[*edge as usize]); + forbidden_conditions.union(&empty_paths_cache.conditions); + for prev_condition in prev_conditions.iter() { + forbidden_conditions.union( + empty_paths_cache.condition_couples.get(Interned::new(*prev_condition)), + ); } - empty_paths_cache.empty_prefixes.final_edges_after_prefix(prev_edges, &mut |x| { - forbidden_edges.insert(x); + empty_paths_cache.prefixes.final_edges_after_prefix(prev_conditions, &mut |x| { + forbidden_conditions.insert(Interned::new(x)); }); } } @@ -107,36 +141,41 @@ impl RankingRuleGraph { Ok(any_valid) } - pub fn initialize_distances_with_necessary_edges(&self) -> Vec> { - let mut distances_to_end: Vec> = - vec![vec![]; self.query_graph.nodes.len()]; - let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len() as u16); + pub fn initialize_distances_with_necessary_edges( + &self, + ) -> MappedInterner)>, QueryNode> { + let mut distances_to_end = self.query_graph.nodes.map(|_| vec![]); + let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); let mut node_stack = VecDeque::new(); - distances_to_end[self.query_graph.end_node as usize] = - vec![(0, SmallBitmap::new(self.edges_store.len() as u16))]; + *distances_to_end.get_mut(self.query_graph.end_node) = + vec![(0, SmallBitmap::for_interned_values_in(&self.conditions_interner))]; - for prev_node in - self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter() - { - node_stack.push_back(prev_node as usize); + for prev_node in self.query_graph.nodes.get(self.query_graph.end_node).predecessors.iter() { + node_stack.push_back(prev_node); enqueued.insert(prev_node); } while let Some(cur_node) = node_stack.pop_front() { - let mut self_distances = BTreeMap::::new(); + let mut self_distances = BTreeMap::>::new(); - let cur_node_edges = &self.edges_of_node[cur_node]; + let cur_node_edges = &self.edges_of_node.get(cur_node); for edge_idx in cur_node_edges.iter() { - let edge = self.edges_store[edge_idx as usize].as_ref().unwrap(); + let edge = self.edges_store.get(edge_idx).as_ref().unwrap(); + let condition = match edge.condition { + EdgeCondition::Unconditional => None, + EdgeCondition::Conditional(condition) => Some(condition), + }; let succ_node = edge.dest_node; - let succ_distances = &distances_to_end[succ_node as usize]; - for (succ_distance, succ_necessary_edges) in succ_distances { - let potential_necessary_edges = SmallBitmap::from_iter( - std::iter::once(edge_idx).chain(succ_necessary_edges.iter()), - self.edges_store.len() as u16, - ); + let succ_distances = distances_to_end.get(succ_node); + for (succ_distance, succ_necessary_conditions) in succ_distances { + let mut potential_necessary_edges = + SmallBitmap::for_interned_values_in(&self.conditions_interner); + for condition in condition.into_iter().chain(succ_necessary_conditions.iter()) { + potential_necessary_edges.insert(condition); + } + match self_distances.entry(edge.cost as u16 + succ_distance) { Entry::Occupied(mut prev_necessary_edges) => { prev_necessary_edges.get_mut().intersection(&potential_necessary_edges); @@ 
-147,10 +186,14 @@ impl RankingRuleGraph { } } } - distances_to_end[cur_node] = self_distances.into_iter().collect(); - for prev_node in self.query_graph.edges[cur_node].predecessors.iter() { + let distances_to_end_cur_node = distances_to_end.get_mut(cur_node); + for (cost, necessary_edges) in self_distances.iter() { + distances_to_end_cur_node.push((*cost, necessary_edges.clone())); + } + *distances_to_end.get_mut(cur_node) = self_distances.into_iter().collect(); + for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { if !enqueued.contains(prev_node) { - node_stack.push_back(prev_node as usize); + node_stack.push_back(prev_node); enqueued.insert(prev_node); } } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 5da3de326..b3426619b 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -9,17 +9,17 @@ use crate::search::new::SearchContext; use crate::Result; /// A cache storing the document ids associated with each ranking rule edge -pub struct EdgeConditionsCache { +pub struct EdgeConditionDocIdsCache { // TODO: should be FxHashMap, RoaringBitmap> pub cache: FxHashMap, RoaringBitmap>, _phantom: PhantomData, } -impl Default for EdgeConditionsCache { +impl Default for EdgeConditionDocIdsCache { fn default() -> Self { Self { cache: Default::default(), _phantom: Default::default() } } } -impl EdgeConditionsCache { +impl EdgeConditionDocIdsCache { /// Retrieve the document ids for the given edge condition. /// /// If the cache does not yet contain these docids, they are computed diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index 3178cfe27..3b518bc9b 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,59 +1,82 @@ -use super::path_set::PathSet; -use crate::search::new::small_bitmap::SmallBitmap; +use super::{path_set::PathSet, RankingRuleGraphTrait}; +use crate::search::new::{ + interner::{FixedSizeInterner, Interned, MappedInterner}, + small_bitmap::SmallBitmap, +}; /// A cache which stores sufficient conditions for a path /// to resolve to an empty set of candidates within the current /// universe. -#[derive(Clone)] -pub struct EmptyPathsCache { - /// The set of edge indexes that resolve to no documents. - pub empty_edges: SmallBitmap, +pub struct DeadEndPathCache { + /// The set of edge conditions that resolve to no documents. + pub conditions: SmallBitmap, /// A set of path prefixes that resolve to no documents. - pub empty_prefixes: PathSet, - /// A set of empty couples of edge indexes that resolve to no documents. - pub empty_couple_edges: Vec, + pub prefixes: PathSet, + /// A set of empty couples of edge conditions that resolve to no documents. + pub condition_couples: MappedInterner, G::EdgeCondition>, } -impl EmptyPathsCache { - /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. 
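// Editorial sketch of the bookkeeping behind the renamed cache, with `u16`
// ids and HashSets standing in for interned conditions and SmallBitmaps:
// once a condition is known to be dead on its own, its pairwise entries are
// redundant and are cleared, mirroring `add_condition` in the hunk below.

use std::collections::HashSet;

struct DeadEnds {
    conditions: HashSet<u16>, // conditions that are empty on their own
    condition_couples: Vec<HashSet<u16>>, // per condition: partners empty together with it
}

impl DeadEnds {
    fn new(nbr_conditions: u16) -> Self {
        DeadEnds {
            conditions: HashSet::new(),
            condition_couples: vec![HashSet::new(); nbr_conditions as usize],
        }
    }
    fn add_condition(&mut self, c: u16) {
        self.conditions.insert(c);
        self.condition_couples[c as usize].clear();
        for partners in &mut self.condition_couples {
            partners.remove(&c);
        }
    }
}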
- pub fn new(all_edges_len: u16) -> Self { +impl Clone for DeadEndPathCache { + fn clone(&self) -> Self { Self { - empty_edges: SmallBitmap::new(all_edges_len), - empty_prefixes: PathSet::default(), - empty_couple_edges: vec![SmallBitmap::new(all_edges_len); all_edges_len as usize], + conditions: self.conditions.clone(), + prefixes: self.prefixes.clone(), + condition_couples: self.condition_couples.clone(), + } + } +} + +impl DeadEndPathCache { + /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. + pub fn new(all_edge_conditions: &FixedSizeInterner) -> Self { + Self { + conditions: SmallBitmap::for_interned_values_in(all_edge_conditions), + prefixes: PathSet::default(), + condition_couples: all_edge_conditions + .map(|_| SmallBitmap::for_interned_values_in(all_edge_conditions)), } } /// Store in the cache that every path containing the given edge resolves to no documents. - pub fn forbid_edge(&mut self, edge_idx: u16) { - self.empty_edges.insert(edge_idx); - self.empty_couple_edges[edge_idx as usize].clear(); - self.empty_prefixes.remove_edge(&edge_idx); - for edges2 in self.empty_couple_edges.iter_mut() { - edges2.remove(edge_idx); + pub fn add_condition(&mut self, condition: Interned) { + self.conditions.insert(condition); + self.condition_couples.get_mut(condition).clear(); + self.prefixes.remove_edge(condition.into_inner()); // TODO: typed PathSet + for (_, edges2) in self.condition_couples.iter_mut() { + edges2.remove(condition); } } /// Store in the cache that every path containing the given prefix resolves to no documents. - pub fn forbid_prefix(&mut self, prefix: &[u16]) { - self.empty_prefixes.insert(prefix.iter().copied()); + pub fn add_prefix(&mut self, prefix: &[u16]) { + // TODO: typed PathSet + self.prefixes.insert(prefix.iter().copied()); } /// Store in the cache that every path containing the two given edges resolves to no documents. - pub fn forbid_couple_edges(&mut self, edge1: u16, edge2: u16) { - self.empty_couple_edges[edge1 as usize].insert(edge2); + pub fn add_condition_couple( + &mut self, + edge1: Interned, + edge2: Interned, + ) { + self.condition_couples.get_mut(edge1).insert(edge2); } /// Returns true if the cache can determine that the given path resolves to no documents. 
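// Editorial illustration of the check documented above, in simplified form:
// a path is a known dead end if it contains a globally dead condition, a
// dead pair of conditions, or a prefix already recorded as empty.

fn is_dead_end(
    path: &[u16],
    dead_conditions: &std::collections::HashSet<u16>,
    dead_couples: &std::collections::HashSet<(u16, u16)>,
    dead_prefixes: &[Vec<u16>],
) -> bool {
    if path.iter().any(|c| dead_conditions.contains(c)) {
        return true;
    }
    for (i, a) in path.iter().enumerate() {
        for b in &path[i + 1..] {
            if dead_couples.contains(&(*a, *b)) || dead_couples.contains(&(*b, *a)) {
                return true;
            }
        }
    }
    dead_prefixes.iter().any(|prefix| path.starts_with(prefix))
}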
- pub fn path_is_empty(&self, path: &[u16], path_bitmap: &SmallBitmap) -> bool { - if path_bitmap.intersects(&self.empty_edges) { + pub fn path_is_dead_end( + &self, + path: &[u16], + path_bitmap: &SmallBitmap, + ) -> bool { + if path_bitmap.intersects(&self.conditions) { return true; } for edge in path.iter() { - let forbidden_other_edges = &self.empty_couple_edges[*edge as usize]; + // TODO: typed path + let forbidden_other_edges = self.condition_couples.get(Interned::new(*edge)); if path_bitmap.intersects(forbidden_other_edges) { return true; } } - if self.empty_prefixes.contains_prefix_of_path(path) { + if self.prefixes.contains_prefix_of_path(path) { return true; } false diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index ee93bee13..7b82dc0a1 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -18,13 +18,13 @@ mod typo; use std::hash::Hash; -pub use edge_docids_cache::EdgeConditionsCache; -pub use empty_paths_cache::EmptyPathsCache; -pub use proximity::ProximityGraph; +pub use edge_docids_cache::EdgeConditionDocIdsCache; +pub use empty_paths_cache::DeadEndPathCache; +pub use proximity::{ProximityEdge, ProximityGraph}; use roaring::RoaringBitmap; -pub use typo::TypoGraph; +pub use typo::{TypoEdge, TypoGraph}; -use super::interner::{Interned, Interner}; +use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner}; use super::logger::SearchLogger; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; @@ -63,8 +63,8 @@ impl Clone for EdgeCondition { /// 3. The condition associated with it #[derive(Clone)] pub struct Edge { - pub source_node: u16, - pub dest_node: u16, + pub source_node: Interned, + pub dest_node: Interned, pub cost: u8, pub condition: EdgeCondition, } @@ -96,7 +96,7 @@ pub trait RankingRuleGraphTrait: Sized { /// (with [`build_step_visit_source_node`](RankingRuleGraphTrait::build_step_visit_source_node)) to `dest_node`. fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut Interner, + conditions_interner: &mut DedupInterner, source_node: &QueryNode, dest_node: &QueryNode, ) -> Result)>>; @@ -104,9 +104,9 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &[Vec<(u16, SmallBitmap)>], + distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ); @@ -118,9 +118,9 @@ pub trait RankingRuleGraphTrait: Sized { /// but replacing the edges. 
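// Editorial sketch of the new removal semantics introduced further down in
// this module: rather than deleting a single edge by index, a sweep drops
// every edge carrying the dead condition and unregisters it from its source
// node. Simplified types here; the real code walks the interned edge store.

fn remove_edges_with_condition(
    edges: &mut Vec<Option<(usize, Option<u16>)>>, // (source node, optional condition)
    edges_of_node: &mut Vec<std::collections::HashSet<usize>>,
    dead_condition: u16,
) {
    for edge_id in 0..edges.len() {
        let Some((source, condition)) = edges[edge_id] else { continue };
        if condition == Some(dead_condition) {
            edges_of_node[source].remove(&edge_id);
            edges[edge_id] = None;
        }
    }
}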
pub struct RankingRuleGraph { pub query_graph: QueryGraph, - pub edges_store: Vec>>, - pub edges_of_node: Vec, - pub conditions_interner: Interner, + pub edges_store: FixedSizeInterner>>, + pub edges_of_node: MappedInterner>>, QueryNode>, + pub conditions_interner: FixedSizeInterner, } impl Clone for RankingRuleGraph { fn clone(&self) -> Self { @@ -133,13 +133,20 @@ impl Clone for RankingRuleGraph { } } impl RankingRuleGraph { - /// Remove the given edge from the ranking rule graph - pub fn remove_ranking_rule_edge(&mut self, edge_index: u16) { - let edge_opt = &mut self.edges_store[edge_index as usize]; - let Some(edge) = &edge_opt else { return }; - let (source_node, _dest_node) = (edge.source_node, edge.dest_node); - *edge_opt = None; - - self.edges_of_node[source_node as usize].remove(edge_index); + /// Remove all edges with the given condition + pub fn remove_edges_with_condition(&mut self, condition_to_remove: Interned) { + for (edge_id, edge_opt) in self.edges_store.iter_mut() { + let Some(edge) = edge_opt.as_mut() else { continue }; + match edge.condition { + EdgeCondition::Unconditional => continue, + EdgeCondition::Conditional(condition) => { + if condition == condition_to_remove { + let (source_node, _dest_node) = (edge.source_node, edge.dest_node); + *edge_opt = None; + self.edges_of_node.get_mut(source_node).remove(edge_id); + } + } + } + } } } diff --git a/milli/src/search/new/ranking_rule_graph/path_set.rs b/milli/src/search/new/ranking_rule_graph/path_set.rs index b601f28d9..d5bab6c14 100644 --- a/milli/src/search/new/ranking_rule_graph/path_set.rs +++ b/milli/src/search/new/ranking_rule_graph/path_set.rs @@ -27,10 +27,10 @@ impl PathSet { } } - pub fn remove_edge(&mut self, forbidden_edge: &u16) { + pub fn remove_edge(&mut self, forbidden_edge: u16) { let mut i = 0; while i < self.nodes.len() { - let should_remove = if &self.nodes[i].0 == forbidden_edge { + let should_remove = if self.nodes[i].0 == forbidden_edge { true } else if !self.nodes[i].1.nodes.is_empty() { self.nodes[i].1.remove_edge(forbidden_edge); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index b8042c408..556b3cb2b 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -3,7 +3,8 @@ use std::collections::BTreeMap; use super::ProximityEdge; use crate::search::new::db_cache::DatabaseCache; -use crate::search::new::interner::{Interned, Interner}; +use crate::search::new::interner::{DedupInterner, Interned}; +use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; use crate::search::new::ranking_rule_graph::proximity::WordPair; use crate::search::new::ranking_rule_graph::EdgeCondition; @@ -13,7 +14,7 @@ use heed::RoTxn; fn last_word_of_term_iter<'t>( t: &'t QueryTerm, - phrase_interner: &'t Interner, + phrase_interner: &'t DedupInterner, ) -> impl Iterator>, Interned)> + 't { t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map( move |p| { @@ -24,7 +25,7 @@ fn last_word_of_term_iter<'t>( } fn first_word_of_term_iter<'t>( t: &'t QueryTerm, - phrase_interner: &'t Interner, + phrase_interner: &'t DedupInterner, ) -> impl Iterator, Option>)> + 't { t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map( move |p| { @@ -36,7 +37,7 @@ fn first_word_of_term_iter<'t>( pub fn build_edges<'ctx>( ctx: &mut 
SearchContext<'ctx>, - conditions_interner: &mut Interner, + conditions_interner: &mut DedupInterner, from_node: &QueryNode, to_node: &QueryNode, ) -> Result)>> { @@ -50,19 +51,19 @@ pub fn build_edges<'ctx>( term_docids: _, } = ctx; - let (left_term, left_end_position) = match from_node { - QueryNode::Term(LocatedQueryTerm { value, positions }) => { + let (left_term, left_end_position) = match &from_node.data { + QueryNodeData::Term(LocatedQueryTerm { value, positions }) => { (term_interner.get(*value), *positions.end()) } - QueryNode::Deleted => return Ok(vec![]), - QueryNode::Start => return Ok(vec![(0, EdgeCondition::Unconditional)]), - QueryNode::End => return Ok(vec![]), + QueryNodeData::Deleted => return Ok(vec![]), + QueryNodeData::Start => return Ok(vec![(0, EdgeCondition::Unconditional)]), + QueryNodeData::End => return Ok(vec![]), }; - let right_term = match &to_node { - QueryNode::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), - QueryNode::Deleted | QueryNode::Start => return Ok(vec![]), - QueryNode::Term(term) => term, + let right_term = match &to_node.data { + QueryNodeData::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), + QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]), + QueryNodeData::Term(term) => term, }; let LocatedQueryTerm { value: right_value, positions: right_positions } = right_term; @@ -145,7 +146,7 @@ fn add_prefix_edges<'ctx>( index: &mut &crate::Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &mut Interner, + word_interner: &mut DedupInterner, right_ngram_length: usize, left_word: Interned, right_prefix: Interned, @@ -207,7 +208,7 @@ fn add_non_prefix_edges<'ctx>( index: &mut &crate::Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &mut Interner, + word_interner: &mut DedupInterner, right_ngram_length: usize, word1: Interned, word2: Interned, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 2cfee0b65..2d226cfc7 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -3,9 +3,9 @@ pub mod compute_docids; use roaring::RoaringBitmap; -use super::empty_paths_cache::EmptyPathsCache; +use super::empty_paths_cache::DeadEndPathCache; use super::{EdgeCondition, RankingRuleGraphTrait}; -use crate::search::new::interner::{Interned, Interner}; +use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::Phrase; use crate::search::new::small_bitmap::SmallBitmap; @@ -56,7 +56,7 @@ impl RankingRuleGraphTrait for ProximityGraph { fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut Interner, + conditions_interner: &mut DedupInterner, source_node: &QueryNode, dest_node: &QueryNode, ) -> Result)>> { @@ -66,19 +66,12 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &super::RankingRuleGraph, paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &[Vec<(u16, SmallBitmap)>], + distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ) { - logger.log_proximity_state( - graph, - paths, - empty_paths_cache, - universe, - distances.to_vec(), - cost, - ); + logger.log_proximity_state(graph, paths, empty_paths_cache, universe, distances, cost); } } diff --git 
a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 6b832f9b2..c0404d391 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,9 +1,10 @@ use roaring::RoaringBitmap; -use super::empty_paths_cache::EmptyPathsCache; +use super::empty_paths_cache::DeadEndPathCache; use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::interner::{Interned, Interner}; +use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; +use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; @@ -55,13 +56,13 @@ impl RankingRuleGraphTrait for TypoGraph { fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut Interner, + conditions_interner: &mut DedupInterner, _from_node: &QueryNode, to_node: &QueryNode, ) -> Result)>> { let SearchContext { term_interner, .. } = ctx; - match to_node { - QueryNode::Term(LocatedQueryTerm { value, positions }) => { + match &to_node.data { + QueryNodeData::Term(LocatedQueryTerm { value, positions }) => { let mut edges = vec![]; // Ngrams have a base typo cost // 2-gram -> equivalent to 1 typo @@ -130,20 +131,20 @@ impl RankingRuleGraphTrait for TypoGraph { } Ok(edges) } - QueryNode::End => Ok(vec![(0, EdgeCondition::Unconditional)]), - QueryNode::Deleted | QueryNode::Start => panic!(), + QueryNodeData::End => Ok(vec![(0, EdgeCondition::Unconditional)]), + QueryNodeData::Deleted | QueryNodeData::Start => panic!(), } } fn log_state( graph: &RankingRuleGraph, paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &[Vec<(u16, SmallBitmap)>], + distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ) { - logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances.to_vec(), cost); + logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances, cost); } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 5e5da8716..7549cfff7 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -186,6 +186,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( // anything, just extend the results and go back to the parent ranking rule. 
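// Editorial note on the one-line fix in the hunk below: after a bucket whose
// universe holds at most one document has been pushed to the results, that
// universe is now cleared. A plausible minimal model of why, with Vecs in
// place of RoaringBitmaps: without the clear, the consumed docids would still
// sit in the universe when control returns to the parent ranking rule.

fn drain_small_bucket(results: &mut Vec<u32>, universe: &mut Vec<u32>) {
    if universe.len() <= 1 {
        results.extend(universe.iter().copied());
        universe.clear(); // the fix: don't leave consumed docids behind
    }
}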
if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); + ranking_rule_universes[cur_ranking_rule_index].clear(); back!(); continue; } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 5ce6ecec2..2f941098d 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -7,11 +7,11 @@ use heed::{BytesDecode, RoTxn}; use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; -use super::interner::{Interned, Interner}; -use super::query_graph::QUERY_GRAPH_NODE_LENGTH_LIMIT; +use super::interner::{DedupInterner, Interned}; +use super::query_graph::QueryNodeData; use super::query_term::{Phrase, QueryTerm}; use super::small_bitmap::SmallBitmap; -use super::{QueryGraph, QueryNode, SearchContext}; +use super::{QueryGraph, SearchContext}; use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; #[derive(Default)] @@ -26,8 +26,8 @@ impl QueryTermDocIdsCache { index: &Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &Interner, - phrase_interner: &Interner, + word_interner: &DedupInterner, + phrase_interner: &DedupInterner, phrase: Interned, ) -> Result<&'s RoaringBitmap> { if self.phrases.contains_key(&phrase) { @@ -44,9 +44,9 @@ impl QueryTermDocIdsCache { index: &Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &Interner, - term_interner: &Interner, - phrase_interner: &Interner, + word_interner: &DedupInterner, + term_interner: &DedupInterner, + phrase_interner: &DedupInterner, term_interned: Interned, ) -> Result<&'s RoaringBitmap> { if self.terms.contains_key(&term_interned) { @@ -105,28 +105,27 @@ pub fn resolve_query_graph<'ctx>( // TODO: there is a faster way to compute this big // roaring bitmap expression - let mut nodes_resolved = SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT); - let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()]; + let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes); + let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new()); let mut next_nodes_to_visit = VecDeque::new(); next_nodes_to_visit.push_back(q.root_node); - while let Some(node) = next_nodes_to_visit.pop_front() { - let predecessors = &q.edges[node as usize].predecessors; + while let Some(node_id) = next_nodes_to_visit.pop_front() { + let node = q.nodes.get(node_id); + let predecessors = &node.predecessors; if !predecessors.is_subset(&nodes_resolved) { - next_nodes_to_visit.push_back(node); + next_nodes_to_visit.push_back(node_id); continue; } // Take union of all predecessors let mut predecessors_docids = RoaringBitmap::new(); for p in predecessors.iter() { - predecessors_docids |= &path_nodes_docids[p as usize]; + predecessors_docids |= path_nodes_docids.get(p); } - let n = &q.nodes[node as usize]; - - let node_docids = match n { - QueryNode::Term(located_term) => { + let node_docids = match &node.data { + QueryNodeData::Term(located_term) => { let term_docids = query_term_docids.get_query_term_docids( index, txn, @@ -138,26 +137,26 @@ pub fn resolve_query_graph<'ctx>( )?; predecessors_docids & term_docids } - QueryNode::Deleted => { + QueryNodeData::Deleted => { panic!() } - QueryNode::Start => universe.clone(), - QueryNode::End => { + QueryNodeData::Start => universe.clone(), + QueryNodeData::End => { return Ok(predecessors_docids); } }; - nodes_resolved.insert(node); - path_nodes_docids[node as usize] = 
node_docids; + nodes_resolved.insert(node_id); + *path_nodes_docids.get_mut(node_id) = node_docids; - for succ in q.edges[node as usize].successors.iter() { + for succ in node.successors.iter() { if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) { next_nodes_to_visit.push_back(succ); } } - for prec in q.edges[node as usize].predecessors.iter() { - if q.edges[prec as usize].successors.is_subset(&nodes_resolved) { - path_nodes_docids[prec as usize].clear(); + for prec in node.predecessors.iter() { + if q.nodes.get(prec).successors.is_subset(&nodes_resolved) { + path_nodes_docids.get_mut(prec).clear(); } } } @@ -168,8 +167,8 @@ pub fn resolve_phrase<'ctx>( index: &Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &Interner, - phrase_interner: &Interner, + word_interner: &DedupInterner, + phrase_interner: &DedupInterner, phrase: Interned, ) -> Result { let Phrase { words } = phrase_interner.get(phrase).clone(); diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index 48a2e02fc..7ab2b61ae 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -1,9 +1,85 @@ +use std::marker::PhantomData; + +use super::interner::{FixedSizeInterner, Interned}; + +pub struct SmallBitmap { + internal: SmallBitmapInternal, + _phantom: PhantomData, +} +impl Clone for SmallBitmap { + fn clone(&self) -> Self { + Self { internal: self.internal.clone(), _phantom: PhantomData } + } +} +impl SmallBitmap { + pub fn for_interned_values_in(interner: &FixedSizeInterner) -> Self { + Self::new(interner.len()) + } + pub fn new(universe_length: u16) -> Self { + if universe_length <= 64 { + Self { internal: SmallBitmapInternal::Tiny(0), _phantom: PhantomData } + } else { + Self { + internal: SmallBitmapInternal::Small( + vec![0; 1 + universe_length as usize / 64].into_boxed_slice(), + ), + _phantom: PhantomData, + } + } + } + pub fn from_iter( + xs: impl Iterator>, + for_interner: &FixedSizeInterner, + ) -> Self { + Self { + internal: SmallBitmapInternal::from_iter( + xs.map(|x| x.into_inner()), + for_interner.len(), + ), + _phantom: PhantomData, + } + } + pub fn is_empty(&self) -> bool { + self.internal.is_empty() + } + pub fn clear(&mut self) { + self.internal.clear() + } + pub fn contains(&self, x: Interned) -> bool { + self.internal.contains(x.into_inner()) + } + pub fn insert(&mut self, x: Interned) { + self.internal.insert(x.into_inner()) + } + pub fn remove(&mut self, x: Interned) { + self.internal.remove(x.into_inner()) + } + + pub fn intersection(&mut self, other: &Self) { + self.internal.intersection(&other.internal) + } + pub fn union(&mut self, other: &Self) { + self.internal.union(&other.internal) + } + pub fn subtract(&mut self, other: &Self) { + self.internal.subtract(&other.internal) + } + pub fn is_subset(&self, other: &Self) -> bool { + self.internal.is_subset(&other.internal) + } + pub fn intersects(&self, other: &Self) -> bool { + self.internal.intersects(&other.internal) + } + pub fn iter(&self) -> impl Iterator> + '_ { + self.internal.iter().map(|x| Interned::new(x)) + } +} #[derive(Clone)] -pub enum SmallBitmap { +pub enum SmallBitmapInternal { Tiny(u64), Small(Box<[u64]>), } -impl SmallBitmap { +impl SmallBitmapInternal { pub fn new(universe_length: u16) -> Self { if universe_length <= 64 { Self::Tiny(0) @@ -20,8 +96,8 @@ impl SmallBitmap { } pub fn is_empty(&self) -> bool { match self { - SmallBitmap::Tiny(set) => *set == 0, - SmallBitmap::Small(sets) => { + 
SmallBitmapInternal::Tiny(set) => *set == 0, + SmallBitmapInternal::Small(sets) => { for set in sets.iter() { if *set != 0 { return false; @@ -33,8 +109,8 @@ impl SmallBitmap { } pub fn clear(&mut self) { match self { - SmallBitmap::Tiny(set) => *set = 0, - SmallBitmap::Small(sets) => { + SmallBitmapInternal::Tiny(set) => *set = 0, + SmallBitmapInternal::Small(sets) => { for set in sets.iter_mut() { *set = 0; } @@ -43,8 +119,8 @@ impl SmallBitmap { } pub fn contains(&self, mut x: u16) -> bool { let set = match self { - SmallBitmap::Tiny(set) => *set, - SmallBitmap::Small(set) => { + SmallBitmapInternal::Tiny(set) => *set, + SmallBitmapInternal::Small(set) => { let idx = x / 64; x %= 64; set[idx as usize] @@ -54,8 +130,8 @@ impl SmallBitmap { } pub fn insert(&mut self, mut x: u16) { let set = match self { - SmallBitmap::Tiny(set) => set, - SmallBitmap::Small(set) => { + SmallBitmapInternal::Tiny(set) => set, + SmallBitmapInternal::Small(set) => { let idx = x / 64; x %= 64; &mut set[idx as usize] @@ -65,8 +141,8 @@ impl SmallBitmap { } pub fn remove(&mut self, mut x: u16) { let set = match self { - SmallBitmap::Tiny(set) => set, - SmallBitmap::Small(set) => { + SmallBitmapInternal::Tiny(set) => set, + SmallBitmapInternal::Small(set) => { let idx = x / 64; x %= 64; &mut set[idx as usize] @@ -75,20 +151,20 @@ impl SmallBitmap { *set &= !(0b1 << x); } - pub fn intersection(&mut self, other: &SmallBitmap) { + pub fn intersection(&mut self, other: &SmallBitmapInternal) { self.apply_op(other, |a, b| *a &= b); } - pub fn union(&mut self, other: &SmallBitmap) { + pub fn union(&mut self, other: &SmallBitmapInternal) { self.apply_op(other, |a, b| *a |= b); } - pub fn subtract(&mut self, other: &SmallBitmap) { + pub fn subtract(&mut self, other: &SmallBitmapInternal) { self.apply_op(other, |a, b| *a &= !b); } - pub fn apply_op(&mut self, other: &SmallBitmap, op: impl Fn(&mut u64, u64)) { + pub fn apply_op(&mut self, other: &SmallBitmapInternal, op: impl Fn(&mut u64, u64)) { match (self, other) { - (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(a, *b), - (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(a, *b), + (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => { assert!(a.len() == b.len(),); for (a, b) in a.iter_mut().zip(b.iter()) { op(a, *b); @@ -99,10 +175,14 @@ impl SmallBitmap { } } } - pub fn all_satisfy_op(&self, other: &SmallBitmap, op: impl Fn(u64, u64) -> bool) -> bool { + pub fn all_satisfy_op( + &self, + other: &SmallBitmapInternal, + op: impl Fn(u64, u64) -> bool, + ) -> bool { match (self, other) { - (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(*a, *b), - (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(*a, *b), + (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => { assert!(a.len() == b.len()); for (a, b) in a.iter().zip(b.iter()) { if !op(*a, *b) { @@ -116,10 +196,14 @@ impl SmallBitmap { } } } - pub fn any_satisfy_op(&self, other: &SmallBitmap, op: impl Fn(u64, u64) -> bool) -> bool { + pub fn any_satisfy_op( + &self, + other: &SmallBitmapInternal, + op: impl Fn(u64, u64) -> bool, + ) -> bool { match (self, other) { - (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(*a, *b), - (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(*a, *b), + (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => { assert!(a.len() == b.len()); 
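// Editorial aside on the representation being renamed here: a set over a
// universe of at most 64 ids fits in one u64 (`Tiny`); larger universes use
// a boxed slice of words (`Small`). Iteration repeatedly peels off the
// lowest set bit, as sketched below for a single word:

fn iter_bits(mut word: u64) -> impl Iterator<Item = u16> {
    std::iter::from_fn(move || {
        if word == 0 {
            None
        } else {
            let idx = word.trailing_zeros() as u16;
            word &= word - 1; // clear the lowest set bit
            Some(idx)
        }
    })
}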
for (a, b) in a.iter().zip(b.iter()) { if op(*a, *b) { @@ -133,32 +217,32 @@ impl SmallBitmap { } } } - pub fn is_subset(&self, other: &SmallBitmap) -> bool { + pub fn is_subset(&self, other: &SmallBitmapInternal) -> bool { self.all_satisfy_op(other, |a, b| a & !b == 0) } - pub fn intersects(&self, other: &SmallBitmap) -> bool { + pub fn intersects(&self, other: &SmallBitmapInternal) -> bool { self.any_satisfy_op(other, |a, b| a & b != 0) } - pub fn iter(&self) -> SmallBitmapIter<'_> { + pub fn iter(&self) -> SmallBitmapInternalIter<'_> { match self { - SmallBitmap::Tiny(x) => SmallBitmapIter::Tiny(*x), - SmallBitmap::Small(xs) => { - SmallBitmapIter::Small { cur: xs[0], next: &xs[1..], base: 0 } + SmallBitmapInternal::Tiny(x) => SmallBitmapInternalIter::Tiny(*x), + SmallBitmapInternal::Small(xs) => { + SmallBitmapInternalIter::Small { cur: xs[0], next: &xs[1..], base: 0 } } } } } -pub enum SmallBitmapIter<'b> { +pub enum SmallBitmapInternalIter<'b> { Tiny(u64), Small { cur: u64, next: &'b [u64], base: u16 }, } -impl<'b> Iterator for SmallBitmapIter<'b> { +impl<'b> Iterator for SmallBitmapInternalIter<'b> { type Item = u16; fn next(&mut self) -> Option { match self { - SmallBitmapIter::Tiny(set) => { + SmallBitmapInternalIter::Tiny(set) => { if *set > 0 { let idx = set.trailing_zeros() as u16; *set &= *set - 1; @@ -167,7 +251,7 @@ impl<'b> Iterator for SmallBitmapIter<'b> { None } } - SmallBitmapIter::Small { cur, next, base } => { + SmallBitmapInternalIter::Small { cur, next, base } => { if *cur > 0 { let idx = cur.trailing_zeros() as u16; *cur &= *cur - 1; @@ -185,23 +269,23 @@ impl<'b> Iterator for SmallBitmapIter<'b> { } } -#[cfg(test)] -mod tests { - use super::SmallBitmap; +// #[cfg(test)] +// mod tests { +// use super::SmallBitmap; - #[test] - fn test_small_bitmap() { - let mut bitmap1 = SmallBitmap::new(32); - for x in 0..16 { - bitmap1.insert(x * 2); - } - let mut bitmap2 = SmallBitmap::new(32); - for x in 0..=10 { - bitmap2.insert(x * 3); - } - bitmap1.intersection(&bitmap2); - for v in bitmap1.iter() { - println!("{v}"); - } - } -} +// #[test] +// fn test_small_bitmap() { +// let mut bitmap1 = SmallBitmap::new(32); +// for x in 0..16 { +// bitmap1.insert(x * 2); +// } +// let mut bitmap2 = SmallBitmap::new(32); +// for x in 0..=10 { +// bitmap2.insert(x * 3); +// } +// bitmap1.intersection(&bitmap2); +// for v in bitmap1.iter() { +// println!("{v}"); +// } +// } +// } diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 5bc5ff1fe..f5f8c0895 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -3,8 +3,9 @@ use std::collections::BTreeSet; use roaring::RoaringBitmap; use super::logger::SearchLogger; +use super::query_graph::QueryNodeData; use super::resolve_query_graph::resolve_query_graph; -use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput, SearchContext}; +use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::{Result, TermsMatchingStrategy}; pub struct Words { @@ -43,12 +44,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { let positions_to_remove = match self.terms_matching_strategy { TermsMatchingStrategy::Last => { let mut all_positions = BTreeSet::new(); - for n in parent_query_graph.nodes.iter() { - match n { - QueryNode::Term(term) => { + for (_, n) in parent_query_graph.nodes.iter() { + match &n.data { + QueryNodeData::Term(term) => { all_positions.extend(term.positions.clone().into_iter()); } - QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} + 
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {} } } let mut r: Vec = all_positions.into_iter().collect(); From c0cdaf9f535d4eeed89e17fb58c6ee88bd0e2281 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Mar 2023 12:52:40 +0100 Subject: [PATCH 057/234] Fix bug in the proximity ranking rule for queries with ngrams --- .../search/new/graph_based_ranking_rule.rs | 14 +++- milli/src/search/new/logger/detailed.rs | 6 +- milli/src/search/new/logger/mod.rs | 6 +- milli/src/search/new/mod.rs | 10 +-- .../new/ranking_rule_graph/cheapest_paths.rs | 17 +++- .../src/search/new/ranking_rule_graph/mod.rs | 2 +- .../new/ranking_rule_graph/proximity/build.rs | 82 ++++++++++++------- .../proximity/compute_docids.rs | 31 ++++++- .../new/ranking_rule_graph/proximity/mod.rs | 32 +++++--- 9 files changed, 132 insertions(+), 68 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 5a28ab58a..a568e6d27 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -36,6 +36,8 @@ That is we find the documents where either: - OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by` */ +use std::ops::ControlFlow; + use roaring::RoaringBitmap; use super::interner::MappedInterner; @@ -263,7 +265,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase graph.remove_edges_with_condition(condition); // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore edge_docids_cache.cache.remove(&condition); - return Ok(()); + return Ok(ControlFlow::Continue(())); } path_docids &= edge_docids; @@ -287,14 +289,18 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase } // We should maybe instead try to compute: // 0th & nth & 1st & n-1th & 2nd & etc... 
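The `Ok(())` to `Ok(ControlFlow::Continue(()))` changes below let the path visitor ask the traversal to stop early once the universe of remaining documents is exhausted, replacing the old `// TODO: if the universe is empty, stop iterating`. A self-contained sketch of the pattern, using made-up names and nothing beyond `std::ops::ControlFlow`:

```rust
use std::ops::ControlFlow;

/// Visit items until the callback breaks; returns true if we stopped early.
fn visit_until_break(items: &[u32], mut visit: impl FnMut(u32) -> ControlFlow<()>) -> bool {
    for &item in items {
        if let ControlFlow::Break(()) = visit(item) {
            return true;
        }
    }
    false
}

fn main() {
    // Stand-in for the shrinking universe of candidate documents.
    let mut universe = 5u32;
    let stopped_early = visit_until_break(&[2, 2, 2], |matched| {
        universe = universe.saturating_sub(matched);
        if universe == 0 {
            ControlFlow::Break(())
        } else {
            ControlFlow::Continue(())
        }
    });
    assert!(stopped_early);
}
```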
- return Ok(()); + return Ok(ControlFlow::Continue(())); } } bucket |= &path_docids; // Reduce the size of the universe so that we can more optimistically discard candidate paths universe -= path_docids; - // TODO: if the universe is empty, stop iterating - Ok(()) + + if universe.is_empty() { + Ok(ControlFlow::Break(())) + } else { + Ok(ControlFlow::Continue(())) + } }, )?; diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index fb5a296bd..f3ce3f7e9 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -10,7 +10,7 @@ use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::ranking_rule_graph::{ - DeadEndPathCache, Edge, EdgeCondition, ProximityEdge, ProximityGraph, RankingRuleGraph, + DeadEndPathCache, Edge, EdgeCondition, ProximityCondition, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoEdge, TypoGraph, }; use crate::search::new::small_bitmap::SmallBitmap; @@ -46,7 +46,7 @@ pub enum SearchEvents { paths: Vec>, empty_paths_cache: DeadEndPathCache, universe: RoaringBitmap, - distances: MappedInterner)>, QueryNode>, + distances: MappedInterner)>, QueryNode>, cost: u16, }, TypoState { @@ -172,7 +172,7 @@ impl SearchLogger for DetailedSearchLogger { paths_map: &[Vec], empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ) { self.events.push(SearchEvents::ProximityState { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index ff500d4b8..c2e9bca80 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use super::interner::MappedInterner; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - DeadEndPathCache, ProximityEdge, ProximityGraph, RankingRuleGraph, TypoEdge, TypoGraph, + DeadEndPathCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoEdge, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{RankingRule, RankingRuleQueryTrait}; @@ -68,7 +68,7 @@ pub trait SearchLogger { paths: &[Vec], empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ); @@ -139,7 +139,7 @@ impl SearchLogger for DefaultSearchLogger { _paths_map: &[Vec], _empty_paths_cache: &DeadEndPathCache, _universe: &RoaringBitmap, - _distances: &MappedInterner)>, QueryNode>, + _distances: &MappedInterner)>, QueryNode>, _cost: u16, ) { } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 11420545c..02cd7b1de 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -303,16 +303,16 @@ mod tests { let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( &mut ctx, - // "which a the releases from poison by the government", + "releases from poison by the government", // "sun flower s are the best", - "zero config", + // "zero config", TermsMatchingStrategy::Last, None, 0, 20, &mut DefaultSearchLogger, &mut DefaultSearchLogger, - //&mut logger, + // &mut logger, ) .unwrap(); @@ -359,9 +359,9 @@ mod tests { let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("which a the releases from poison by the government"); + s.query("releases 
from poison by the government"); s.terms_matching_strategy(TermsMatchingStrategy::Last); - s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); + // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); let elapsed = start.elapsed(); diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 597aff661..cc3bfd7b4 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -2,6 +2,7 @@ use std::collections::btree_map::Entry; use std::collections::{BTreeMap, VecDeque}; +use std::ops::ControlFlow; use super::empty_paths_cache::DeadEndPathCache; use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; @@ -23,7 +24,7 @@ impl RankingRuleGraph { cost: u16, all_distances: &MappedInterner)>, QueryNode>, empty_paths_cache: &mut DeadEndPathCache, - mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache) -> Result<()>, + mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache) -> Result>, ) -> Result<()> { let _ = self.visit_paths_of_cost_rec( from, @@ -43,7 +44,7 @@ impl RankingRuleGraph { cost: u16, all_distances: &MappedInterner)>, QueryNode>, empty_paths_cache: &mut DeadEndPathCache, - visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache) -> Result<()>, + visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache) -> Result>, prev_conditions: &mut Vec, cur_path: &mut SmallBitmap, forbidden_conditions: &mut SmallBitmap, @@ -60,7 +61,11 @@ impl RankingRuleGraph { EdgeCondition::Unconditional => { if edge.dest_node == self.query_graph.end_node { any_valid = true; - visit(prev_conditions, self, empty_paths_cache)?; + let control_flow = visit(prev_conditions, self, empty_paths_cache)?; + match control_flow { + ControlFlow::Continue(_) => {} + ControlFlow::Break(_) => return Ok(true), + } true } else { self.visit_paths_of_cost_rec( @@ -101,7 +106,11 @@ impl RankingRuleGraph { ); let next_any_valid = if edge.dest_node == self.query_graph.end_node { any_valid = true; - visit(prev_conditions, self, empty_paths_cache)?; + let control_flow = visit(prev_conditions, self, empty_paths_cache)?; + match control_flow { + ControlFlow::Continue(_) => {} + ControlFlow::Break(_) => return Ok(true), + } true } else { self.visit_paths_of_cost_rec( diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 7b82dc0a1..4e0384ae0 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -20,7 +20,7 @@ use std::hash::Hash; pub use edge_docids_cache::EdgeConditionDocIdsCache; pub use empty_paths_cache::DeadEndPathCache; -pub use proximity::{ProximityEdge, ProximityGraph}; +pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoEdge, TypoGraph}; diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 556b3cb2b..c7eaa5d0c 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -1,7 +1,7 @@ #![allow(clippy::too_many_arguments)] use std::collections::BTreeMap; -use super::ProximityEdge; +use super::ProximityCondition; use crate::search::new::db_cache::DatabaseCache; use 
crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_graph::QueryNodeData; @@ -37,10 +37,10 @@ fn first_word_of_term_iter<'t>( pub fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut DedupInterner, + conditions_interner: &mut DedupInterner, from_node: &QueryNode, to_node: &QueryNode, -) -> Result)>> { +) -> Result)>> { let SearchContext { index, txn, @@ -51,24 +51,33 @@ pub fn build_edges<'ctx>( term_docids: _, } = ctx; - let (left_term, left_end_position) = match &from_node.data { - QueryNodeData::Term(LocatedQueryTerm { value, positions }) => { - (term_interner.get(*value), *positions.end()) - } - QueryNodeData::Deleted => return Ok(vec![]), - QueryNodeData::Start => return Ok(vec![(0, EdgeCondition::Unconditional)]), - QueryNodeData::End => return Ok(vec![]), - }; - let right_term = match &to_node.data { QueryNodeData::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]), QueryNodeData::Term(term) => term, }; - let LocatedQueryTerm { value: right_value, positions: right_positions } = right_term; + + let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term; let (right_term, right_start_position, right_ngram_length) = - (term_interner.get(*right_value), *right_positions.start(), right_positions.len()); + (term_interner.get(*right_term_interned), *right_positions.start(), right_positions.len()); + + let (left_term, left_end_position) = match &from_node.data { + QueryNodeData::Term(LocatedQueryTerm { value, positions }) => { + (term_interner.get(*value), *positions.end()) + } + QueryNodeData::Deleted => return Ok(vec![]), + QueryNodeData::Start => { + return Ok(vec![( + (right_ngram_length - 1) as u8, + EdgeCondition::Conditional( + conditions_interner + .insert(ProximityCondition::Term { term: *right_term_interned }), + ), + )]) + } + QueryNodeData::End => return Ok(vec![]), + }; if left_end_position + 1 != right_start_position { // We want to ignore this pair of terms @@ -77,7 +86,12 @@ pub fn build_edges<'ctx>( // `flowers` is removed by the `words` ranking rule. // The remaining query graph represents `the sun .. 
are beautiful` // but `sun` and `are` have no proximity condition between them - return Ok(vec![(0, EdgeCondition::Unconditional)]); + return Ok(vec![( + (right_ngram_length - 1) as u8, + EdgeCondition::Conditional( + conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }), + ), + )]); } let mut cost_proximity_word_pairs = BTreeMap::>>::new(); @@ -121,24 +135,30 @@ pub fn build_edges<'ctx>( } } - let mut new_edges = - cost_proximity_word_pairs - .into_iter() - .flat_map(|(cost, proximity_word_pairs)| { - let mut edges = vec![]; - for (proximity, word_pairs) in proximity_word_pairs { - edges.push(( - cost, - EdgeCondition::Conditional(conditions_interner.insert(ProximityEdge { + let mut new_edges = cost_proximity_word_pairs + .into_iter() + .flat_map(|(cost, proximity_word_pairs)| { + let mut edges = vec![]; + for (proximity, word_pairs) in proximity_word_pairs { + edges.push(( + cost, + EdgeCondition::Conditional(conditions_interner.insert( + ProximityCondition::Pairs { pairs: word_pairs.into_boxed_slice(), proximity, - })), - )) - } - edges - }) - .collect::>(); - new_edges.push((8 + (right_ngram_length - 1) as u8, EdgeCondition::Unconditional)); + }, + )), + )) + } + edges + }) + .collect::>(); + new_edges.push(( + 8 + (right_ngram_length - 1) as u8, + EdgeCondition::Conditional( + conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }), + ), + )); Ok(new_edges) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 0acee0329..1123692f3 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,16 +1,39 @@ use roaring::RoaringBitmap; -use super::{ProximityEdge, WordPair}; +use super::{ProximityCondition, WordPair}; use crate::search::new::SearchContext; use crate::{CboRoaringBitmapCodec, Result}; pub fn compute_docids<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &ProximityEdge, + edge: &ProximityCondition, universe: &RoaringBitmap, ) -> Result { - let SearchContext { index, txn, db_cache, word_interner, .. 
} = ctx; - let ProximityEdge { pairs, proximity } = edge; + let SearchContext { + index, + txn, + db_cache, + word_interner, + term_docids, + phrase_interner, + term_interner, + } = ctx; + let (pairs, proximity) = match edge { + ProximityCondition::Term { term } => { + return term_docids + .get_query_term_docids( + index, + txn, + db_cache, + word_interner, + term_interner, + phrase_interner, + *term, + ) + .cloned() + } + ProximityCondition::Pairs { pairs, proximity } => (pairs, proximity), + }; let mut pair_docids = RoaringBitmap::new(); for pair in pairs.iter() { let pair = match pair { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 2d226cfc7..427a1e904 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -4,15 +4,15 @@ pub mod compute_docids; use roaring::RoaringBitmap; use super::empty_paths_cache::DeadEndPathCache; -use super::{EdgeCondition, RankingRuleGraphTrait}; +use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; -use crate::search::new::query_term::Phrase; +use crate::search::new::query_term::{Phrase, QueryTerm}; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; -#[derive(Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum WordPair { Words { phrases: Vec>, @@ -31,27 +31,33 @@ pub enum WordPair { } #[derive(Clone, PartialEq, Eq, Hash)] -pub struct ProximityEdge { - pairs: Box<[WordPair]>, - proximity: u8, +pub enum ProximityCondition { + Term { term: Interned }, + Pairs { pairs: Box<[WordPair]>, proximity: u8 }, } pub enum ProximityGraph {} impl RankingRuleGraphTrait for ProximityGraph { - type EdgeCondition = ProximityEdge; + type EdgeCondition = ProximityCondition; fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { - let ProximityEdge { pairs, proximity } = edge; - format!(", prox {proximity}, {} pairs", pairs.len()) + match edge { + ProximityCondition::Term { term } => { + format!("term {term}") + } + ProximityCondition::Pairs { pairs, proximity } => { + format!("prox {proximity}, {} pairs", pairs.len()) + } + } } fn resolve_edge_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result { - compute_docids::compute_docids(ctx, edge, universe) + compute_docids::compute_docids(ctx, condition, universe) } fn build_edges<'ctx>( @@ -64,11 +70,11 @@ impl RankingRuleGraphTrait for ProximityGraph { } fn log_state( - graph: &super::RankingRuleGraph, + graph: &RankingRuleGraph, paths: &[Vec], empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ) { From 05fe856e6e75ce4e9d39c7f3123da85611edb4e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Mar 2023 13:02:55 +0100 Subject: [PATCH 058/234] Merge forward and backward proximity conditions in proximity graph --- milli/src/search/new/mod.rs | 4 +- .../new/ranking_rule_graph/proximity/build.rs | 85 ++++++++----------- .../proximity/compute_docids.rs | 10 +-- .../new/ranking_rule_graph/proximity/mod.rs | 9 +- 4 files changed, 50 insertions(+), 58 
deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 02cd7b1de..d893691b8 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -303,7 +303,7 @@ mod tests { let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( &mut ctx, - "releases from poison by the government", + "which a the releases from poison by the government", // "sun flower s are the best", // "zero config", TermsMatchingStrategy::Last, @@ -359,7 +359,7 @@ mod tests { let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("releases from poison by the government"); + s.query("which a the releases from poison by the government"); s.terms_matching_strategy(TermsMatchingStrategy::Last); // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index c7eaa5d0c..8ae634fbf 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -94,7 +94,7 @@ pub fn build_edges<'ctx>( )]); } - let mut cost_proximity_word_pairs = BTreeMap::>>::new(); + let mut cost_word_pairs = BTreeMap::>::new(); if let Some(right_prefix) = right_term.use_prefix_db { for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) { @@ -106,7 +106,7 @@ pub fn build_edges<'ctx>( right_ngram_length, left_word, right_prefix, - &mut cost_proximity_word_pairs, + &mut cost_word_pairs, left_phrase, )?; } @@ -129,28 +129,22 @@ pub fn build_edges<'ctx>( right_ngram_length, left_word, right_word, - &mut cost_proximity_word_pairs, + &mut cost_word_pairs, &[left_phrase, right_phrase].iter().copied().flatten().collect::>(), )?; } } - let mut new_edges = cost_proximity_word_pairs + let mut new_edges = cost_word_pairs .into_iter() - .flat_map(|(cost, proximity_word_pairs)| { - let mut edges = vec![]; - for (proximity, word_pairs) in proximity_word_pairs { - edges.push(( - cost, - EdgeCondition::Conditional(conditions_interner.insert( - ProximityCondition::Pairs { - pairs: word_pairs.into_boxed_slice(), - proximity, - }, - )), - )) - } - edges + .map(|(cost, word_pairs)| { + ( + cost, + EdgeCondition::Conditional( + conditions_interner + .insert(ProximityCondition::Pairs { pairs: word_pairs.into_boxed_slice() }), + ), + ) }) .collect::>(); new_edges.push(( @@ -170,7 +164,7 @@ fn add_prefix_edges<'ctx>( right_ngram_length: usize, left_word: Interned, right_prefix: Interned, - cost_proximity_word_pairs: &mut BTreeMap>>, + cost_proximity_word_pairs: &mut BTreeMap>, left_phrase: Option>, ) -> Result<()> { for proximity in 1..=(8 - right_ngram_length) { @@ -188,16 +182,12 @@ fn add_prefix_edges<'ctx>( )? .is_some() { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8) - .or_default() - .push(WordPair::WordPrefix { - phrases: left_phrase.into_iter().collect(), - left: left_word, - right_prefix, - }); + cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefix { + phrases: left_phrase.into_iter().collect(), + left: left_word, + right_prefix, + proximity: proximity as u8, + }); } // No swapping when computing the proximity between a phrase and a word @@ -213,12 +203,11 @@ fn add_prefix_edges<'ctx>( )? 
.is_some() { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8) - .or_default() - .push(WordPair::WordPrefixSwapped { left_prefix: right_prefix, right: left_word }); + cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefixSwapped { + left_prefix: right_prefix, + right: left_word, + proximity: proximity as u8 - 1, + }); } } Ok(()) @@ -232,7 +221,7 @@ fn add_non_prefix_edges<'ctx>( right_ngram_length: usize, word1: Interned, word2: Interned, - cost_proximity_word_pairs: &mut BTreeMap>>, + cost_proximity_word_pairs: &mut BTreeMap>, phrases: &[Interned], ) -> Result<()> { for proximity in 1..=(8 - right_ngram_length) { @@ -248,12 +237,12 @@ fn add_non_prefix_edges<'ctx>( )? .is_some() { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8) - .or_default() - .push(WordPair::Words { phrases: phrases.to_vec(), left: word1, right: word2 }); + cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words { + phrases: phrases.to_vec(), + left: word1, + right: word2, + proximity: proximity as u8, + }); } if proximity > 1 // no swapping when either term is a phrase @@ -269,12 +258,12 @@ fn add_non_prefix_edges<'ctx>( )? .is_some() { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8 - 1) - .or_default() - .push(WordPair::Words { phrases: vec![], left: word2, right: word1 }); + cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words { + phrases: vec![], + left: word2, + right: word1, + proximity: proximity as u8 - 1, + }); } } Ok(()) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 1123692f3..cdf167cb0 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -18,7 +18,7 @@ pub fn compute_docids<'ctx>( phrase_interner, term_interner, } = ctx; - let (pairs, proximity) = match edge { + let pairs = match edge { ProximityCondition::Term { term } => { return term_docids .get_query_term_docids( @@ -32,12 +32,12 @@ pub fn compute_docids<'ctx>( ) .cloned() } - ProximityCondition::Pairs { pairs, proximity } => (pairs, proximity), + ProximityCondition::Pairs { pairs } => pairs, }; let mut pair_docids = RoaringBitmap::new(); for pair in pairs.iter() { let pair = match pair { - WordPair::Words { phrases, left, right } => { + WordPair::Words { phrases, left, right, proximity } => { let mut docids = db_cache .get_word_pair_proximity_docids( index, @@ -64,7 +64,7 @@ pub fn compute_docids<'ctx>( } docids } - WordPair::WordPrefix { phrases, left, right_prefix } => { + WordPair::WordPrefix { phrases, left, right_prefix, proximity } => { let mut docids = db_cache .get_word_prefix_pair_proximity_docids( index, @@ -91,7 +91,7 @@ pub fn compute_docids<'ctx>( } docids } - WordPair::WordPrefixSwapped { left_prefix, right } => db_cache + WordPair::WordPrefixSwapped { left_prefix, right, proximity } => db_cache .get_prefix_word_pair_proximity_docids( index, txn, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 427a1e904..65c282108 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -18,22 +18,25 @@ pub enum WordPair { phrases: Vec>, left: Interned, right: Interned, + proximity: u8, }, WordPrefix { phrases: Vec>, 
left: Interned, right_prefix: Interned, + proximity: u8, }, WordPrefixSwapped { left_prefix: Interned, right: Interned, + proximity: u8, }, } #[derive(Clone, PartialEq, Eq, Hash)] pub enum ProximityCondition { Term { term: Interned }, - Pairs { pairs: Box<[WordPair]>, proximity: u8 }, + Pairs { pairs: Box<[WordPair]> }, } pub enum ProximityGraph {} @@ -46,8 +49,8 @@ impl RankingRuleGraphTrait for ProximityGraph { ProximityCondition::Term { term } => { format!("term {term}") } - ProximityCondition::Pairs { pairs, proximity } => { - format!("prox {proximity}, {} pairs", pairs.len()) + ProximityCondition::Pairs { pairs } => { + format!("pairs {}", pairs.len()) } } } From a49ddec9df8749632cb1cb3627179a3ce915160a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Mar 2023 16:08:43 +0100 Subject: [PATCH 059/234] Prune the query graph after executing a ranking rule --- .../search/new/graph_based_ranking_rule.rs | 63 +++++++-- milli/src/search/new/logger/detailed.rs | 51 ++++---- milli/src/search/new/mod.rs | 13 +- milli/src/search/new/query_graph.rs | 2 +- milli/src/search/new/query_term.rs | 106 +++++++++++++++ .../new/ranking_rule_graph/cheapest_paths.rs | 2 +- .../src/search/new/ranking_rule_graph/mod.rs | 16 ++- .../new/ranking_rule_graph/proximity/mod.rs | 123 ++++++++++++++++-- .../search/new/ranking_rule_graph/typo/mod.rs | 83 +++++++++++- 9 files changed, 401 insertions(+), 58 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index a568e6d27..5f270de6a 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -36,6 +36,7 @@ That is we find the documents where either: - OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by` */ +use std::collections::HashSet; use std::ops::ControlFlow; use roaring::RoaringBitmap; @@ -50,6 +51,7 @@ use super::ranking_rule_graph::{ use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::search::new::interner::Interned; +use crate::search::new::query_graph::QueryNodeData; use crate::Result; pub type Proximity = GraphBasedRankingRule; @@ -216,9 +218,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let original_universe = universe; let mut universe = universe.clone(); - // TODO: remove this unnecessary clone let original_graph = graph.clone(); - // and this vector as well + let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner); let mut paths = vec![]; // For each path of the given cost, we will compute its associated @@ -243,8 +244,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // We store the edges and their docids in vectors in case the path turns out to be // empty and we need to figure out why it was empty. 
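The per-path loop that follows keeps each visited condition's docids in a plain `Vec` so that, when the running intersection comes out empty, it can look back and record which couple of conditions is jointly unsatisfiable. A rough sketch of that bookkeeping, with `HashSet<u32>` standing in for `RoaringBitmap` and bare `u16`s for interned condition ids (a simplification: the real code checks disjointness against the current universe rather than plain emptiness):

```rust
use std::collections::HashSet;

/// Intersect per-condition docids along a path; on an empty result, record
/// which earlier condition is jointly empty with the current one.
fn path_docids(
    path: &[(u16, HashSet<u32>)],
    universe: &HashSet<u32>,
    dead_end_couples: &mut Vec<(u16, u16)>,
) -> HashSet<u32> {
    let mut acc: HashSet<u32> = universe.clone();
    for (i, (condition, docids)) in path.iter().enumerate() {
        acc = acc.intersection(docids).copied().collect();
        if acc.is_empty() {
            for (past_condition, past_docids) in &path[..i] {
                if past_docids.intersection(docids).next().is_none() {
                    dead_end_couples.push((*past_condition, *condition));
                }
            }
            break;
        }
    }
    acc
}

fn main() {
    let universe: HashSet<u32> = [1, 2, 3].into_iter().collect();
    let path = vec![
        (0u16, [1, 2].into_iter().collect::<HashSet<u32>>()),
        (1u16, [3].into_iter().collect::<HashSet<u32>>()),
    ];
    let mut couples = Vec::new();
    let docids = path_docids(&path, &universe, &mut couples);
    assert!(docids.is_empty());
    assert_eq!(couples, vec![(0, 1)]); // conditions 0 and 1 never co-occur
}
```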
let mut visited_conditions = vec![]; - let mut cached_edge_docids = - graph.conditions_interner.map(|_| RoaringBitmap::new()); + let mut cached_edge_docids = vec![]; + // graph.conditions_interner.map(|_| RoaringBitmap::new()); for &condition_interned_raw in path { let condition = Interned::new(condition_interned_raw); @@ -253,7 +254,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let edge_docids = edge_docids_cache.get_edge_docids(ctx, condition, graph, &universe)?; - *cached_edge_docids.get_mut(condition) = edge_docids.clone(); + cached_edge_docids.push((condition, edge_docids.clone())); // .get_mut(condition) = edge_docids.clone(); // If the edge is empty, then the path will be empty as well, we update the graph // and caches accordingly and skip to the next candidate path. @@ -279,12 +280,12 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // then we also know that any path containing the same couple of // edges will also be empty. for (past_condition, edge_docids2) in cached_edge_docids.iter() { - if past_condition == condition { + if *past_condition == condition { continue; }; let intersection = edge_docids & edge_docids2; if intersection.is_disjoint(&universe) { - empty_paths_cache.add_condition_couple(past_condition, condition); + empty_paths_cache.add_condition_couple(*past_condition, condition); } } // We should maybe instead try to compute: @@ -292,6 +293,10 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase return Ok(ControlFlow::Continue(())); } } + assert!(!path_docids.is_empty()); + for condition in path { + used_conditions.insert(Interned::new(*condition)); + } bucket |= &path_docids; // Reduce the size of the universe so that we can more optimistically discard candidate paths universe -= path_docids; @@ -307,16 +312,50 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase G::log_state( &original_graph, &paths, - &state.empty_paths_cache, + empty_paths_cache, original_universe, - &state.all_distances, + all_distances, cost, logger, ); - // TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however, - // remove nodes and/or terms within nodes that weren't present in any of the paths. - let next_query_graph = state.graph.query_graph.clone(); + // We modify the next query graph so that it only contains the subgraph + // that was used to compute this bucket + // But we only do it in case the bucket length is >1, because otherwise + // we know the child ranking rule won't be called anyway + let mut next_query_graph = original_graph.query_graph; + next_query_graph.simplify(); + if bucket.len() > 1 { + // 1. Gather all the words and phrases used in the computation of this bucket + let mut used_words = HashSet::new(); + let mut used_phrases = HashSet::new(); + for condition in used_conditions.iter() { + let condition = graph.conditions_interner.get(condition); + used_words.extend(G::words_used_by_edge_condition(ctx, condition)?); + used_phrases.extend(G::phrases_used_by_edge_condition(ctx, condition)?); + } + // 2. 
Remove the unused words and phrases from all the nodes in the graph + let mut nodes_to_remove = vec![]; + for (node_id, node) in next_query_graph.nodes.iter_mut() { + let term = match &mut node.data { + QueryNodeData::Term(term) => term, + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, + }; + if let Some(new_term) = ctx + .term_interner + .get(term.value) + .removing_forbidden_terms(&used_words, &used_phrases) + { + if new_term.is_empty() { + nodes_to_remove.push(node_id); + } else { + term.value = ctx.term_interner.insert(new_term); + } + } + } + // 3. Remove the empty nodes from the graph + next_query_graph.remove_nodes(&nodes_to_remove); + } self.state = Some(state); diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index f3ce3f7e9..6b62c63b5 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -528,7 +528,7 @@ shape: class" ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], - _empty_paths_cache: &DeadEndPathCache, + dead_end_paths_cache: &DeadEndPathCache, distances: MappedInterner)>, QueryNode>, file: &mut File, ) { @@ -552,12 +552,11 @@ shape: class" .unwrap(); } EdgeCondition::Conditional(condition) => { - let condition = graph.conditions_interner.get(*condition); + // let condition = graph.conditions_interner.get(*condition); writeln!( file, - "{source_node} -> {dest_node} : \"cost {cost} {edge_label}\"", + "{source_node} -> {dest_node} : \"{condition} cost {cost}\"", cost = edge.cost, - edge_label = R::label_for_edge_condition(condition) ) .unwrap(); } @@ -569,28 +568,33 @@ shape: class" // Self::paths_d2_description(graph, paths, file); // writeln!(file, "}}").unwrap(); - writeln!(file, "Shortest Paths {{").unwrap(); + writeln!(file, "Paths {{").unwrap(); Self::paths_d2_description(ctx, graph, paths, file); writeln!(file, "}}").unwrap(); - // writeln!(file, "Empty Edge Couples {{").unwrap(); - // for (i, (e1, e2)) in empty_paths_cache.empty_couple_edges.iter().enumerate() { - // writeln!(file, "{i} : \"\" {{").unwrap(); - // Self::edge_d2_description(graph, *e1, file); - // Self::edge_d2_description(graph, *e2, file); - // writeln!(file, "{e1} -- {e2}").unwrap(); - // writeln!(file, "}}").unwrap(); - // } - // writeln!(file, "}}").unwrap(); + writeln!(file, "Dead-end couples of conditions {{").unwrap(); + for (i, (e1, e2)) in dead_end_paths_cache.condition_couples.iter().enumerate() { + writeln!(file, "{i} : \"\" {{").unwrap(); + Self::condition_d2_description(ctx, graph, e1, file); + for e2 in e2.iter() { + Self::condition_d2_description(ctx, graph, e2, file); + writeln!(file, "{e1} -- {e2}").unwrap(); + } + writeln!(file, "}}").unwrap(); + } + writeln!(file, "}}").unwrap(); - // writeln!(file, "Removed Edges {{").unwrap(); - // for edge_idx in empty_paths_cache.empty_edges.iter() { - // writeln!(file, "{edge_idx}").unwrap(); - // } + writeln!(file, "Dead-end edges {{").unwrap(); + for condition in dead_end_paths_cache.conditions.iter() { + writeln!(file, "{condition}").unwrap(); + } + writeln!(file, "}}").unwrap(); + + // writeln!(file, "Dead-end prefixes {{").unwrap(); // writeln!(file, "}}").unwrap(); } fn condition_d2_description( - _ctx: &mut SearchContext, + ctx: &mut SearchContext, graph: &RankingRuleGraph, condition_id: Interned, file: &mut File, @@ -598,10 +602,11 @@ shape: class" let condition = graph.conditions_interner.get(condition_id); writeln!( file, - "{condition_id}: \"{}\" {{ - shape: class - }}", - 
R::label_for_edge_condition(condition) + "{condition_id} {{ +shape: class +{} +}}", + R::label_for_edge_condition(ctx, condition).unwrap() ) .unwrap(); } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index d893691b8..92e00df11 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -303,7 +303,8 @@ mod tests { let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( &mut ctx, - "which a the releases from poison by the government", + "released from prison by the government", + // "which a the releases from poison by the government", // "sun flower s are the best", // "zero config", TermsMatchingStrategy::Last, @@ -338,7 +339,7 @@ mod tests { println!("{}us: {:?}", elapsed.as_micros(), results); } - // for (id, _document) in documents { + // for (id, document) in documents { // println!("{id}:"); // // println!("{document}"); // } @@ -359,9 +360,13 @@ mod tests { let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("which a the releases from poison by the government"); + s.query( + // "which a the releases from poison by the government", + // "sun flower s are the best", + "zero config", + ); s.terms_matching_strategy(TermsMatchingStrategy::Last); - // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); + // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlyIterative); let docs = s.execute().unwrap(); let elapsed = start.elapsed(); diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index d487a644f..1012030be 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -281,7 +281,7 @@ impl QueryGraph { /// Simplify the query graph by removing all nodes that are disconnected from /// the start or end nodes. 
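A sketch of what `simplify` (whose visibility changes just below) is documented to do, under the assumption that it iteratively drops nodes left without predecessors or successors; node ids are bare `u16`s and both maps are assumed to share the same key set, none of which mirrors the crate's actual types:

```rust
use std::collections::{HashMap, HashSet};

/// Repeatedly drop nodes that have lost all predecessors or all successors,
/// until the graph is stable. The root and end nodes are never removed.
fn simplify(
    predecessors: &mut HashMap<u16, HashSet<u16>>,
    successors: &mut HashMap<u16, HashSet<u16>>,
    root: u16,
    end: u16,
) {
    loop {
        let to_remove: Vec<u16> = successors
            .keys()
            .copied()
            .filter(|&n| {
                n != root
                    && n != end
                    && (predecessors[&n].is_empty() || successors[&n].is_empty())
            })
            .collect();
        if to_remove.is_empty() {
            break;
        }
        for n in to_remove {
            predecessors.remove(&n);
            successors.remove(&n);
            for edges in predecessors.values_mut().chain(successors.values_mut()) {
                edges.remove(&n);
            }
        }
    }
}

fn main() {
    // root(0) -> 1 -> end(3); node 2 dangles with no successors.
    let mut preds: HashMap<u16, HashSet<u16>> = HashMap::new();
    let mut succs: HashMap<u16, HashSet<u16>> = HashMap::new();
    for n in 0..4u16 {
        preds.insert(n, HashSet::new());
        succs.insert(n, HashSet::new());
    }
    for (a, b) in [(0u16, 1u16), (1, 3), (0, 2)] {
        succs.get_mut(&a).unwrap().insert(b);
        preds.get_mut(&b).unwrap().insert(a);
    }
    simplify(&mut preds, &mut succs, 0, 3);
    assert!(!succs.contains_key(&2)); // the dangling node is gone
}
```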
- fn simplify(&mut self) { + pub fn simplify(&mut self) { loop { let mut nodes_to_remove = vec![]; for (node_idx, node) in self.nodes.iter() { diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 3272464f8..0ce000537 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::mem; use std::ops::RangeInclusive; @@ -59,6 +60,111 @@ pub struct QueryTerm { pub use_prefix_db: Option>, } impl QueryTerm { + pub fn removing_forbidden_terms( + &self, + allowed_words: &HashSet>, + allowed_phrases: &HashSet>, + ) -> Option { + let QueryTerm { + original, + is_ngram, + is_prefix, + phrase, + zero_typo, + prefix_of, + synonyms, + split_words, + one_typo, + two_typos, + use_prefix_db, + } = self; + + let mut changed = false; + + let mut new_zero_typo = None; + if let Some(w) = zero_typo { + if allowed_words.contains(w) { + new_zero_typo = Some(*w); + } else { + changed = true; + } + } + // TODO: this is incorrect, prefix DB stuff should be treated separately + let mut new_use_prefix_db = None; + if let Some(w) = use_prefix_db { + if allowed_words.contains(w) { + new_use_prefix_db = Some(*w); + } else { + changed = true; + } + } + let mut new_prefix_of = vec![]; + for w in prefix_of.iter() { + if allowed_words.contains(w) { + new_prefix_of.push(*w); + } else { + changed = true; + } + } + let mut new_one_typo = vec![]; + for w in one_typo.iter() { + if allowed_words.contains(w) { + new_one_typo.push(*w); + } else { + changed = true; + } + } + let mut new_two_typos = vec![]; + for w in two_typos.iter() { + if allowed_words.contains(w) { + new_two_typos.push(*w); + } else { + changed = true; + } + } + // TODO: this is incorrect, prefix DB stuff should be treated separately + let mut new_phrase = None; + if let Some(w) = phrase { + if !allowed_phrases.contains(w) { + new_phrase = Some(*w); + } else { + changed = true; + } + } + let mut new_split_words = None; + if let Some(w) = split_words { + if allowed_phrases.contains(w) { + new_split_words = Some(*w); + } else { + changed = true; + } + } + let mut new_synonyms = vec![]; + for w in synonyms.iter() { + if allowed_phrases.contains(w) { + new_synonyms.push(*w); + } else { + changed = true; + } + } + if changed { + Some(QueryTerm { + original: *original, + is_ngram: *is_ngram, + is_prefix: *is_prefix, + phrase: new_phrase, + zero_typo: new_zero_typo, + prefix_of: new_prefix_of.into_boxed_slice(), + synonyms: new_synonyms.into_boxed_slice(), + split_words: new_split_words, + one_typo: new_one_typo.into_boxed_slice(), + two_typos: new_two_typos.into_boxed_slice(), + use_prefix_db: new_use_prefix_db, + }) + } else { + None + } + } pub fn phrase( word_interner: &mut DedupInterner, phrase_interner: &mut DedupInterner, diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index cc3bfd7b4..c0697091e 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -33,7 +33,7 @@ impl RankingRuleGraph { empty_paths_cache, &mut visit, &mut vec![], - &mut SmallBitmap::new(self.edges_store.len()), + &mut SmallBitmap::for_interned_values_in(&self.conditions_interner), &mut empty_paths_cache.conditions.clone(), )?; Ok(()) diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 4e0384ae0..851aeae54 100644 --- 
a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -16,6 +16,7 @@ mod proximity; /// Implementation of the `typo` ranking rule mod typo; +use std::collections::HashSet; use std::hash::Hash; pub use edge_docids_cache::EdgeConditionDocIdsCache; @@ -26,6 +27,7 @@ pub use typo::{TypoEdge, TypoGraph}; use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner}; use super::logger::SearchLogger; +use super::query_term::Phrase; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::Result; @@ -82,7 +84,19 @@ pub trait RankingRuleGraphTrait: Sized { /// Return the label of the given edge condition, to be used when visualising /// the ranking rule graph. - fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String; + fn label_for_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, + edge: &Self::EdgeCondition, + ) -> Result; + + fn words_used_by_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, + edge: &Self::EdgeCondition, + ) -> Result>>; + fn phrases_used_by_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, + edge: &Self::EdgeCondition, + ) -> Result>>; /// Compute the document ids associated with the given edge condition, /// restricted to the given universe. diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 65c282108..8fd8190f8 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -1,6 +1,9 @@ pub mod build; pub mod compute_docids; +use std::collections::HashSet; +use std::iter::FromIterator; + use roaring::RoaringBitmap; use super::empty_paths_cache::DeadEndPathCache; @@ -44,17 +47,6 @@ pub enum ProximityGraph {} impl RankingRuleGraphTrait for ProximityGraph { type EdgeCondition = ProximityCondition; - fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { - match edge { - ProximityCondition::Term { term } => { - format!("term {term}") - } - ProximityCondition::Pairs { pairs } => { - format!("pairs {}", pairs.len()) - } - } - } - fn resolve_edge_condition<'ctx>( ctx: &mut SearchContext<'ctx>, condition: &Self::EdgeCondition, @@ -83,4 +75,113 @@ impl RankingRuleGraphTrait for ProximityGraph { ) { logger.log_proximity_state(graph, paths, empty_paths_cache, universe, distances, cost); } + + fn label_for_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, + edge: &Self::EdgeCondition, + ) -> Result { + match edge { + ProximityCondition::Term { term } => { + let term = ctx.term_interner.get(*term); + Ok(format!("{} : exists", ctx.word_interner.get(term.original))) + } + ProximityCondition::Pairs { pairs } => { + let mut s = String::new(); + for pair in pairs.iter() { + match pair { + WordPair::Words { phrases, left, right, proximity } => { + let left = ctx.word_interner.get(*left); + let right = ctx.word_interner.get(*right); + if !phrases.is_empty() { + s.push_str(&format!("{} phrases + ", phrases.len())); + } + s.push_str(&format!("\"{left} {right}\": {proximity}\n")); + } + WordPair::WordPrefix { phrases, left, right_prefix, proximity } => { + let left = ctx.word_interner.get(*left); + let right = ctx.word_interner.get(*right_prefix); + if !phrases.is_empty() { + s.push_str(&format!("{} phrases + ", phrases.len())); + } + s.push_str(&format!("\"{left} {right}...\" : {proximity}\n")); + } + WordPair::WordPrefixSwapped { left_prefix, right, proximity } => { + let left = 
ctx.word_interner.get(*left_prefix); + let right = ctx.word_interner.get(*right); + s.push_str(&format!("\"{left}... {right}\" : {proximity}\n")); + } + } + } + Ok(s) + } + } + } + + fn words_used_by_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, + edge: &Self::EdgeCondition, + ) -> Result>> { + match edge { + ProximityCondition::Term { term } => { + let term = ctx.term_interner.get(*term); + Ok(HashSet::from_iter(term.all_single_words_except_prefix_db())) + } + ProximityCondition::Pairs { pairs } => { + let mut set = HashSet::new(); + for pair in pairs.iter() { + match pair { + WordPair::Words { phrases: _, left, right, proximity: _ } => { + set.insert(*left); + set.insert(*right); + } + WordPair::WordPrefix { phrases: _, left, right_prefix, proximity: _ } => { + set.insert(*left); + // TODO: this is not correct, there should be another trait method for collecting the prefixes + // to be used with the prefix DBs + set.insert(*right_prefix); + } + WordPair::WordPrefixSwapped { left_prefix, right, proximity: _ } => { + // TODO: this is not correct, there should be another trait method for collecting the prefixes + // to be used with the prefix DBs + set.insert(*left_prefix); + set.insert(*right); + } + } + } + Ok(set) + } + } + } + + fn phrases_used_by_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, + edge: &Self::EdgeCondition, + ) -> Result>> { + match edge { + ProximityCondition::Term { term } => { + let term = ctx.term_interner.get(*term); + Ok(HashSet::from_iter(term.all_phrases())) + } + ProximityCondition::Pairs { pairs } => { + let mut set = HashSet::new(); + for pair in pairs.iter() { + match pair { + WordPair::Words { phrases, left: _, right: _, proximity: _ } => { + set.extend(phrases.iter().copied()); + } + WordPair::WordPrefix { + phrases, + left: _, + right_prefix: _, + proximity: _, + } => { + set.extend(phrases.iter().copied()); + } + WordPair::WordPrefixSwapped { left_prefix: _, right: _, proximity: _ } => {} + } + } + Ok(set) + } + } + } } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index c0404d391..abfea6499 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -5,10 +5,13 @@ use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_graph::QueryNodeData; -use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; +use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; +use std::collections::HashSet; +use std::fmt::Write; +use std::iter::FromIterator; #[derive(Clone, PartialEq, Eq, Hash)] pub struct TypoEdge { @@ -21,10 +24,6 @@ pub enum TypoGraph {} impl RankingRuleGraphTrait for TypoGraph { type EdgeCondition = TypoEdge; - fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { - format!(", {} typos", edge.nbr_typos) - } - fn resolve_edge_condition<'db_cache, 'ctx>( ctx: &mut SearchContext<'ctx>, edge: &Self::EdgeCondition, @@ -147,4 +146,78 @@ impl RankingRuleGraphTrait for TypoGraph { ) { logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances, cost); } + + fn label_for_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, + edge: &Self::EdgeCondition, + ) -> 
Result { + let TypoEdge { term, nbr_typos: _ } = edge; + let term = ctx.term_interner.get(*term); + let QueryTerm { + original: _, + is_ngram: _, + is_prefix: _, + phrase, + zero_typo, + prefix_of, + synonyms, + split_words, + one_typo, + two_typos, + use_prefix_db, + } = term; + let mut s = String::new(); + if let Some(phrase) = phrase { + let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner); + writeln!(&mut s, "\"{phrase}\" : phrase").unwrap(); + } + if let Some(w) = zero_typo { + let w = ctx.word_interner.get(*w); + writeln!(&mut s, "\"{w}\" : 0 typo").unwrap(); + } + for w in prefix_of.iter() { + let w = ctx.word_interner.get(*w); + writeln!(&mut s, "\"{w}\" : prefix").unwrap(); + } + for w in one_typo.iter() { + let w = ctx.word_interner.get(*w); + writeln!(&mut s, "\"{w}\" : 1 typo").unwrap(); + } + for w in two_typos.iter() { + let w = ctx.word_interner.get(*w); + writeln!(&mut s, "\"{w}\" : 2 typos").unwrap(); + } + if let Some(phrase) = split_words { + let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner); + writeln!(&mut s, "\"{phrase}\" : split words").unwrap(); + } + for phrase in synonyms.iter() { + let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner); + writeln!(&mut s, "\"{phrase}\" : synonym").unwrap(); + } + if let Some(w) = use_prefix_db { + let w = ctx.word_interner.get(*w); + writeln!(&mut s, "\"{w}\" : use prefix db").unwrap(); + } + + Ok(s) + } + + fn words_used_by_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, + edge: &Self::EdgeCondition, + ) -> Result>> { + let TypoEdge { term, .. } = edge; + let term = ctx.term_interner.get(*term); + Ok(HashSet::from_iter(term.all_single_words_except_prefix_db())) + } + + fn phrases_used_by_edge_condition<'ctx>( + ctx: &mut SearchContext<'ctx>, + edge: &Self::EdgeCondition, + ) -> Result>> { + let TypoEdge { term, .. 
} = edge; + let term = ctx.term_interner.get(*term); + Ok(HashSet::from_iter(term.all_phrases())) + } } From 7b1d8f4c6d78284cb3b68fa04d3369616f219c3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Mar 2023 09:58:59 +0100 Subject: [PATCH 060/234] Make PathSet strongly typed --- .../search/new/graph_based_ranking_rule.rs | 8 ++-- milli/src/search/new/interner.rs | 42 +++++++------------ milli/src/search/new/logger/detailed.rs | 14 +++---- milli/src/search/new/logger/mod.rs | 10 ++--- milli/src/search/new/query_graph.rs | 14 +++---- .../new/ranking_rule_graph/cheapest_paths.rs | 26 +++++++----- .../ranking_rule_graph/empty_paths_cache.rs | 12 +++--- .../src/search/new/ranking_rule_graph/mod.rs | 2 +- .../search/new/ranking_rule_graph/path_set.rs | 40 ++++++++++++++---- .../new/ranking_rule_graph/proximity/mod.rs | 2 +- .../search/new/ranking_rule_graph/typo/mod.rs | 2 +- milli/src/search/new/small_bitmap.rs | 17 ++++---- 12 files changed, 102 insertions(+), 87 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 5f270de6a..cfa0f059c 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -50,7 +50,6 @@ use super::ranking_rule_graph::{ }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; -use crate::search::new::interner::Interned; use crate::search::new::query_graph::QueryNodeData; use crate::Result; @@ -247,9 +246,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let mut cached_edge_docids = vec![]; // graph.conditions_interner.map(|_| RoaringBitmap::new()); - for &condition_interned_raw in path { - let condition = Interned::new(condition_interned_raw); - visited_conditions.push(condition_interned_raw); + for &condition in path { + visited_conditions.push(condition); let edge_docids = edge_docids_cache.get_edge_docids(ctx, condition, graph, &universe)?; @@ -295,7 +293,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase } assert!(!path_docids.is_empty()); for condition in path { - used_conditions.insert(Interned::new(*condition)); + used_conditions.insert(*condition); } bucket |= &path_docids; // Reduce the size of the universe so that we can more optimistically discard candidate paths diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index da8473e92..f4dc4a870 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -4,38 +4,27 @@ use std::marker::PhantomData; use fxhash::FxHashMap; -/// An index within a [`Interner`] structure. +/// An index within an interner ([`FixedSizeInterner`], [`DedupInterner`], or [`MappedInterner`]). pub struct Interned { idx: u16, _phantom: PhantomData, } impl Interned { - pub fn new(idx: u16) -> Self { + /// Create an interned value manually from its raw index within the interner. 
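The deduplication scheme shown in this hunk, reduced to a self-contained sketch for `String`s: one canonical copy per value, identified by a copiable `u16` index so equality and hashing work on indexes rather than values. `StrInterner` is a simplification without the generics or the `Interned<T>` wrapper:

```rust
use std::collections::HashMap;

/// Store one canonical copy of each string; hand out a cheap u16 index.
#[derive(Default)]
struct StrInterner {
    stable_store: Vec<String>,
    lookup: HashMap<String, u16>,
}

impl StrInterner {
    fn insert(&mut self, s: &str) -> u16 {
        if let Some(&idx) = self.lookup.get(s) {
            idx
        } else {
            assert!(self.stable_store.len() < u16::MAX as usize);
            self.stable_store.push(s.to_owned());
            let idx = (self.stable_store.len() - 1) as u16;
            self.lookup.insert(s.to_owned(), idx);
            idx
        }
    }
    fn get(&self, idx: u16) -> &str {
        &self.stable_store[idx as usize]
    }
}

fn main() {
    let mut interner = StrInterner::default();
    let a = interner.insert("house");
    let b = interner.insert("house");
    assert_eq!(a, b); // same value, same index
    assert_eq!(interner.get(a), "house");
}
```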
+ pub fn from_raw(idx: u16) -> Self { Self { idx, _phantom: PhantomData } } - pub fn into_inner(self) -> u16 { + /// Get the raw index from the interned value + pub fn into_raw(self) -> u16 { self.idx } } -// TODO: the stable store should be replaced by a bump allocator -// and the interned value should be a pointer wrapper -// then we can get its value with `interned.get()` instead of `interner.get(interned)` -// and as a bonus, its validity is tracked with Rust's lifetime system -// one problem is that we need two lifetimes: one for the bump allocator, one for the -// hashmap -// but that's okay, we can use: -// ``` -// struct Interner<'bump> { -// bump: &'bump Bump, -// lookup: FxHashMap -// } -// ``` - -/// An [`Interner`] is used to store a unique copy of a value of type `T`. This value +/// A [`DedupInterner`] is used to store a unique copy of a value of type `T`. This value /// is then identified by a lightweight index of type [`Interned`], which can /// be copied, compared, and hashed efficiently. An immutable reference to the original value -/// can be retrieved using `self.get(interned)`. +/// can be retrieved using `self.get(interned)`. A set of values within the interner can be +/// efficiently managed using [`SmallBitmap`](super::small_bitmap::SmallBitmap). #[derive(Clone)] pub struct DedupInterner { stable_store: Vec, @@ -47,6 +36,7 @@ impl Default for DedupInterner { } } impl DedupInterner { + /// pub fn freeze(self) -> FixedSizeInterner { FixedSizeInterner { stable_store: self.stable_store } } @@ -62,7 +52,7 @@ where } else { assert!(self.stable_store.len() < u16::MAX as usize); self.stable_store.push(s.clone()); - let interned = Interned::new(self.stable_store.len() as u16 - 1); + let interned = Interned::from_raw(self.stable_store.len() as u16 - 1); self.lookup.insert(s, interned); interned } @@ -87,7 +77,7 @@ impl Interner { pub fn push(&mut self, s: T) -> Interned { assert!(self.stable_store.len() < u16::MAX as usize); self.stable_store.push(s); - Interned::new(self.stable_store.len() as u16 - 1) + Interned::from_raw(self.stable_store.len() as u16 - 1) } } @@ -123,13 +113,13 @@ impl FixedSizeInterner { } } pub fn indexes(&self) -> impl Iterator> { - (0..self.stable_store.len()).map(|i| Interned::new(i as u16)) + (0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16)) } pub fn iter(&self) -> impl Iterator, &T)> { - self.stable_store.iter().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x)) } pub fn iter_mut(&mut self) -> impl Iterator, &mut T)> { - self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x)) } } #[derive(Clone)] @@ -152,10 +142,10 @@ impl MappedInterner { } } pub fn iter(&self) -> impl Iterator, &T)> { - self.stable_store.iter().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x)) } pub fn iter_mut(&mut self) -> impl Iterator, &mut T)> { - self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x)) } } // Interned boilerplate implementations diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 6b62c63b5..752e5ce35 100644 --- a/milli/src/search/new/logger/detailed.rs +++ 
b/milli/src/search/new/logger/detailed.rs @@ -43,7 +43,7 @@ pub enum SearchEvents { }, ProximityState { graph: RankingRuleGraph, - paths: Vec>, + paths: Vec>>, empty_paths_cache: DeadEndPathCache, universe: RoaringBitmap, distances: MappedInterner)>, QueryNode>, @@ -51,7 +51,7 @@ pub enum SearchEvents { }, TypoState { graph: RankingRuleGraph, - paths: Vec>, + paths: Vec>>, empty_paths_cache: DeadEndPathCache, universe: RoaringBitmap, distances: MappedInterner)>, QueryNode>, @@ -169,7 +169,7 @@ impl SearchLogger for DetailedSearchLogger { fn log_proximity_state( &mut self, query_graph: &RankingRuleGraph, - paths_map: &[Vec], + paths_map: &[Vec>], empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, @@ -188,7 +188,7 @@ impl SearchLogger for DetailedSearchLogger { fn log_typo_state( &mut self, query_graph: &RankingRuleGraph, - paths_map: &[Vec], + paths_map: &[Vec>], empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, @@ -527,7 +527,7 @@ shape: class" fn ranking_rule_graph_d2_description( ctx: &mut SearchContext, graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec>], dead_end_paths_cache: &DeadEndPathCache, distances: MappedInterner)>, QueryNode>, file: &mut File, @@ -613,13 +613,13 @@ shape: class fn paths_d2_description( ctx: &mut SearchContext, graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec>], file: &mut File, ) { for (path_idx, condition_indexes) in paths.iter().enumerate() { writeln!(file, "{path_idx} {{").unwrap(); for condition in condition_indexes.iter() { - Self::condition_d2_description(ctx, graph, Interned::new(*condition), file); + Self::condition_d2_description(ctx, graph, *condition, file); } for couple_edges in condition_indexes.windows(2) { let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() }; diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index c2e9bca80..e6c3931ed 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -3,7 +3,7 @@ pub mod detailed; use roaring::RoaringBitmap; -use super::interner::MappedInterner; +use super::interner::{Interned, MappedInterner}; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ DeadEndPathCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoEdge, TypoGraph, @@ -65,7 +65,7 @@ pub trait SearchLogger { fn log_proximity_state( &mut self, query_graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec>], empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, @@ -76,7 +76,7 @@ pub trait SearchLogger { fn log_typo_state( &mut self, query_graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec>], empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, @@ -136,7 +136,7 @@ impl SearchLogger for DefaultSearchLogger { fn log_proximity_state( &mut self, _query_graph: &RankingRuleGraph, - _paths_map: &[Vec], + _paths_map: &[Vec>], _empty_paths_cache: &DeadEndPathCache, _universe: &RoaringBitmap, _distances: &MappedInterner)>, QueryNode>, @@ -147,7 +147,7 @@ impl SearchLogger for DefaultSearchLogger { fn log_typo_state( &mut self, _query_graph: &RankingRuleGraph, - _paths: &[Vec], + _paths: &[Vec>], _empty_paths_cache: &DeadEndPathCache, _universe: &RoaringBitmap, _distances: &MappedInterner)>, QueryNode>, diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 
1012030be..863ec0045 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -181,8 +181,8 @@ impl QueryGraph { (prev0, prev1, prev2) = (new_nodes, prev0, prev1); } - let root_node = Interned::new(root_node); - let end_node = Interned::new(end_node); + let root_node = Interned::from_raw(root_node); + let end_node = Interned::from_raw(end_node); let mut nodes = FixedSizeInterner::new( nodes_data.len() as u16, QueryNode { @@ -197,22 +197,22 @@ impl QueryGraph { .zip(successors.into_iter()) .enumerate() { - let node = nodes.get_mut(Interned::new(node_idx as u16)); + let node = nodes.get_mut(Interned::from_raw(node_idx as u16)); node.data = node_data; for x in predecessors { - node.predecessors.insert(Interned::new(x)); + node.predecessors.insert(Interned::from_raw(x)); } for x in successors { - node.successors.insert(Interned::new(x)); + node.successors.insert(Interned::from_raw(x)); } } let mut graph = QueryGraph { root_node, end_node, nodes }; graph.connect_to_node( - prev0.into_iter().map(Interned::new).collect::>().as_slice(), + prev0.into_iter().map(Interned::from_raw).collect::>().as_slice(), end_node, ); - let empty_nodes = empty_nodes.into_iter().map(Interned::new).collect::>(); + let empty_nodes = empty_nodes.into_iter().map(Interned::from_raw).collect::>(); graph.remove_nodes_keep_edges(&empty_nodes); Ok(graph) diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index c0697091e..52f199ca9 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -24,7 +24,11 @@ impl RankingRuleGraph { cost: u16, all_distances: &MappedInterner)>, QueryNode>, empty_paths_cache: &mut DeadEndPathCache, - mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache) -> Result>, + mut visit: impl FnMut( + &[Interned], + &mut Self, + &mut DeadEndPathCache, + ) -> Result>, ) -> Result<()> { let _ = self.visit_paths_of_cost_rec( from, @@ -44,8 +48,12 @@ impl RankingRuleGraph { cost: u16, all_distances: &MappedInterner)>, QueryNode>, empty_paths_cache: &mut DeadEndPathCache, - visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache) -> Result>, - prev_conditions: &mut Vec, + visit: &mut impl FnMut( + &[Interned], + &mut Self, + &mut DeadEndPathCache, + ) -> Result>, + prev_conditions: &mut Vec>, cur_path: &mut SmallBitmap, forbidden_conditions: &mut SmallBitmap, ) -> Result { @@ -92,8 +100,7 @@ impl RankingRuleGraph { continue; } cur_path.insert(condition); - // TODO: typed path set - prev_conditions.push(condition.into_inner()); + prev_conditions.push(condition); let mut new_forbidden_conditions = forbidden_conditions.clone(); new_forbidden_conditions @@ -101,7 +108,7 @@ impl RankingRuleGraph { empty_paths_cache.prefixes.final_edges_after_prefix( prev_conditions, &mut |x| { - new_forbidden_conditions.insert(Interned::new(x)); + new_forbidden_conditions.insert(x); }, ); let next_any_valid = if edge.dest_node == self.query_graph.end_node { @@ -137,12 +144,11 @@ impl RankingRuleGraph { } forbidden_conditions.union(&empty_paths_cache.conditions); for prev_condition in prev_conditions.iter() { - forbidden_conditions.union( - empty_paths_cache.condition_couples.get(Interned::new(*prev_condition)), - ); + forbidden_conditions + .union(empty_paths_cache.condition_couples.get(*prev_condition)); } empty_paths_cache.prefixes.final_edges_after_prefix(prev_conditions, &mut |x| { - 
forbidden_conditions.insert(Interned::new(x)); + forbidden_conditions.insert(x); }); } } diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index 3b518bc9b..15346e929 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -11,7 +11,7 @@ pub struct DeadEndPathCache { /// The set of edge conditions that resolve to no documents. pub conditions: SmallBitmap, /// A set of path prefixes that resolve to no documents. - pub prefixes: PathSet, + pub prefixes: PathSet, /// A set of empty couples of edge conditions that resolve to no documents. pub condition_couples: MappedInterner, G::EdgeCondition>, } @@ -40,13 +40,13 @@ impl DeadEndPathCache { pub fn add_condition(&mut self, condition: Interned) { self.conditions.insert(condition); self.condition_couples.get_mut(condition).clear(); - self.prefixes.remove_edge(condition.into_inner()); // TODO: typed PathSet + self.prefixes.remove_edge(condition); for (_, edges2) in self.condition_couples.iter_mut() { edges2.remove(condition); } } /// Store in the cache that every path containing the given prefix resolves to no documents. - pub fn add_prefix(&mut self, prefix: &[u16]) { + pub fn add_prefix(&mut self, prefix: &[Interned]) { // TODO: typed PathSet self.prefixes.insert(prefix.iter().copied()); } @@ -63,15 +63,15 @@ impl DeadEndPathCache { /// Returns true if the cache can determine that the given path resolves to no documents. pub fn path_is_dead_end( &self, - path: &[u16], + path: &[Interned], path_bitmap: &SmallBitmap, ) -> bool { if path_bitmap.intersects(&self.conditions) { return true; } - for edge in path.iter() { + for condition in path.iter() { // TODO: typed path - let forbidden_other_edges = self.condition_couples.get(Interned::new(*edge)); + let forbidden_other_edges = self.condition_couples.get(*condition); if path_bitmap.intersects(forbidden_other_edges) { return true; } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 851aeae54..03e7c14bc 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -117,7 +117,7 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec>], empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, diff --git a/milli/src/search/new/ranking_rule_graph/path_set.rs b/milli/src/search/new/ranking_rule_graph/path_set.rs index d5bab6c14..04e6c9116 100644 --- a/milli/src/search/new/ranking_rule_graph/path_set.rs +++ b/milli/src/search/new/ranking_rule_graph/path_set.rs @@ -2,14 +2,34 @@ // For the empty_prefixes field in the EmptyPathsCache only :/ // but it could be used for more, like efficient computing of a set of paths +use crate::search::new::interner::Interned; + /// A set of [`Path`] -#[derive(Default, Debug, Clone)] -pub struct PathSet { - nodes: Vec<(u16, PathSet)>, +pub struct PathSet { + nodes: Vec<(Interned, Self)>, is_end: bool, } -impl PathSet { - pub fn insert(&mut self, mut edges: impl Iterator) { + +impl Clone for PathSet { + fn clone(&self) -> Self { + Self { nodes: self.nodes.clone(), is_end: self.is_end } + } +} + +impl std::fmt::Debug for PathSet { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PathSet").field("nodes", &self.nodes).field("is_end", 
&self.is_end).finish() + } +} + +impl<T> Default for PathSet<T> { + fn default() -> Self { + Self { nodes: Default::default(), is_end: Default::default() } + } +} + +impl<T> PathSet<T> { + pub fn insert(&mut self, mut edges: impl Iterator<Item = Interned<T>>) { match edges.next() { None => { self.is_end = true; @@ -27,7 +47,7 @@ impl PathSet { } } - pub fn remove_edge(&mut self, forbidden_edge: u16) { + pub fn remove_edge(&mut self, forbidden_edge: Interned<T>) { let mut i = 0; while i < self.nodes.len() { let should_remove = if self.nodes[i].0 == forbidden_edge { @@ -46,7 +66,7 @@ impl PathSet { } } - pub fn final_edges_after_prefix(&self, prefix: &[u16], visit: &mut impl FnMut(u16)) { + pub fn final_edges_after_prefix( + &self, + prefix: &[Interned<T>], + visit: &mut impl FnMut(Interned<T>), + ) { let [first_edge, remaining_prefix @ ..] = prefix else { for node in self.nodes.iter() { if node.1.is_end { @@ -62,7 +86,7 @@ impl PathSet { } } - pub fn contains_prefix_of_path(&self, path: &[u16]) -> bool { + pub fn contains_prefix_of_path(&self, path: &[Interned<T>]) -> bool { if self.is_end { return true; } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 8fd8190f8..5a5c5cff8 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -66,7 +66,7 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &RankingRuleGraph<ProximityGraph>, - paths: &[Vec<u16>], + paths: &[Vec<Interned<ProximityCondition>>], empty_paths_cache: &DeadEndPathCache<ProximityGraph>, universe: &RoaringBitmap, distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>, cost: u16, diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index abfea6499..d0b18ddc8 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -137,7 +137,7 @@ impl RankingRuleGraphTrait for TypoGraph { fn log_state( graph: &RankingRuleGraph<TypoGraph>, - paths: &[Vec<u16>], + paths: &[Vec<Interned<TypoEdge>>], empty_paths_cache: &DeadEndPathCache<TypoGraph>, universe: &RoaringBitmap, distances: &MappedInterner<Vec<(u16, SmallBitmap<TypoEdge>)>, QueryNode>, cost: u16, diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index 7ab2b61ae..cb1c64ec2 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -32,10 +32,7 @@ impl<T> SmallBitmap<T> { for_interner: &FixedSizeInterner<T>, ) -> Self { Self { - internal: SmallBitmapInternal::from_iter( - xs.map(|x| x.into_inner()), - for_interner.len(), - ), + internal: SmallBitmapInternal::from_iter(xs.map(|x| x.into_raw()), for_interner.len()), _phantom: PhantomData, } } @@ -46,13 +43,13 @@ impl<T> SmallBitmap<T> { self.internal.clear() } pub fn contains(&self, x: Interned<T>) -> bool { - self.internal.contains(x.into_inner()) + self.internal.contains(x.into_raw()) } pub fn insert(&mut self, x: Interned<T>) { - self.internal.insert(x.into_inner()) + self.internal.insert(x.into_raw()) } pub fn remove(&mut self, x: Interned<T>) { - self.internal.remove(x.into_inner()) + self.internal.remove(x.into_raw()) } pub fn intersection(&mut self, other: &Self) { @@ -71,7 +68,7 @@ impl<T> SmallBitmap<T> { self.internal.intersects(&other.internal) } pub fn iter(&self) -> impl Iterator<Item = Interned<T>> + '_ { - self.internal.iter().map(|x| Interned::new(x)) + self.internal.iter().map(|x| Interned::from_raw(x)) } } #[derive(Clone)] @@ -80,14 +77,14 @@ pub enum SmallBitmapInternal { Small(Box<[u64]>), } impl SmallBitmapInternal { - pub fn new(universe_length: u16) -> Self
{ if universe_length <= 64 { Self::Tiny(0) } else { Self::Small(vec![0; 1 + universe_length as usize / 64].into_boxed_slice()) } } - pub fn from_iter(xs: impl Iterator, universe_length: u16) -> Self { + fn from_iter(xs: impl Iterator, universe_length: u16) -> Self { let mut s = Self::new(universe_length); for x in xs { s.insert(x); From aa59c3bc2c7ae4c810ded2c019e2df51dadaf348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Mar 2023 11:52:51 +0100 Subject: [PATCH 061/234] Replace EdgeCondition with an Option<..> + other code cleanup --- milli/src/search/new/distinct.rs | 22 +++-- .../search/new/graph_based_ranking_rule.rs | 87 +++++++++---------- milli/src/search/new/interner.rs | 38 ++++---- milli/src/search/new/logger/detailed.rs | 26 +++--- milli/src/search/new/logger/mod.rs | 8 +- milli/src/search/new/mod.rs | 6 ++ .../search/new/ranking_rule_graph/build.rs | 6 +- .../new/ranking_rule_graph/cheapest_paths.rs | 44 +++++----- ...ids_cache.rs => condition_docids_cache.rs} | 27 +++--- ..._paths_cache.rs => dead_end_path_cache.rs} | 26 +++--- .../src/search/new/ranking_rule_graph/mod.rs | 74 +++++++--------- .../search/new/ranking_rule_graph/path_set.rs | 2 +- .../new/ranking_rule_graph/proximity/build.rs | 15 ++-- .../new/ranking_rule_graph/proximity/mod.rs | 10 +-- .../search/new/ranking_rule_graph/typo/mod.rs | 14 +-- milli/src/search/new/small_bitmap.rs | 1 + 16 files changed, 202 insertions(+), 204 deletions(-) rename milli/src/search/new/ranking_rule_graph/{edge_docids_cache.rs => condition_docids_cache.rs} (60%) rename milli/src/search/new/ranking_rule_graph/{empty_paths_cache.rs => dead_end_path_cache.rs} (78%) diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs index b29f0e8c3..10657f210 100644 --- a/milli/src/search/new/distinct.rs +++ b/milli/src/search/new/distinct.rs @@ -20,10 +20,17 @@ pub struct DistinctOutput { pub excluded: RoaringBitmap, } +/// Return a [`DistinctOutput`] containing: +/// - `remaining`: a set of docids built such that exactly one element from `candidates` +/// is kept for each distinct value inside the given field. If the field does not exist, it +/// is considered unique. +/// - `excluded`: the set of document ids that contain a value for the given field that occurs +/// in the given candidates. pub fn apply_distinct_rule<'ctx>( ctx: &mut SearchContext<'ctx>, field_id: u16, candidates: &RoaringBitmap, + // TODO: add a universe here, such that the `excluded` are a subset of the universe? ) -> Result { let mut excluded = RoaringBitmap::new(); let mut remaining = RoaringBitmap::new(); @@ -37,6 +44,7 @@ pub fn apply_distinct_rule<'ctx>( Ok(DistinctOutput { remaining, excluded }) } +/// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id. fn distinct_single_docid( index: &Index, txn: &RoTxn, @@ -69,6 +77,7 @@ fn distinct_single_docid( Ok(()) } +/// Return all the docids containing the given value in the given field fn facet_value_docids( database: Database, FacetGroupValueCodec>, txn: &RoTxn, @@ -79,13 +88,15 @@ fn facet_value_docids( .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value }) .map(|opt| opt.map(|v| v.bitmap)) } + +/// Return an iterator over each number value in the given field of the given document. 
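A minimal sketch of the prefix key that scopes this scan to a single (field, document) pair. The 6-byte big-endian layout is an assumption inferred from the `u16` field id and `u32` docid used below; the real `facet_values_prefix_key` helper may encode differently.

fn facet_values_prefix_key(field_id: u16, docid: u32) -> [u8; 6] {
    // Assumed layout: big-endian field id, then big-endian document id, so
    // that an LMDB prefix iteration visits every facet value of the document.
    let mut key = [0u8; 6];
    key[..2].copy_from_slice(&field_id.to_be_bytes());
    key[2..].copy_from_slice(&docid.to_be_bytes());
    key
}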
fn facet_number_values<'a>( - id: u32, - distinct: u16, + docid: u32, + field_id: u16, index: &Index, txn: &'a RoTxn, ) -> Result, Unit>> { - let key = facet_values_prefix_key(distinct, id); + let key = facet_values_prefix_key(field_id, docid); let iter = index .field_id_docid_facet_f64s @@ -96,13 +107,14 @@ fn facet_number_values<'a>( Ok(iter) } +/// Return an iterator over each string value in the given field of the given document. fn facet_string_values<'a>( docid: u32, - distinct: u16, + field_id: u16, index: &Index, txn: &'a RoTxn, ) -> Result, Str>> { - let key = facet_values_prefix_key(distinct, docid); + let key = facet_values_prefix_key(field_id, docid); let iter = index .field_id_docid_facet_strings diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index cfa0f059c..72c839f2a 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -45,7 +45,7 @@ use super::interner::MappedInterner; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - DeadEndPathCache, EdgeCondition, EdgeConditionDocIdsCache, ProximityGraph, RankingRuleGraph, + DeadEndPathCache, EdgeConditionDocIdsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; @@ -87,7 +87,7 @@ pub struct GraphBasedRankingRuleState { /// Cache to retrieve the docids associated with each edge edge_conditions_cache: EdgeConditionDocIdsCache, /// Cache used to optimistically discard paths that resolve to no documents. - empty_paths_cache: DeadEndPathCache, + dead_end_path_cache: DeadEndPathCache, /// A structure giving the list of possible costs from each node to the end node, /// along with a set of unavoidable edges that must be traversed to achieve that distance. 
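The generic parameters of this field are easier to read spelled out as a standalone alias; a hedged sketch, with the associated-type name (`Condition`) assumed, since that name changes during this series:

// For every query node: each reachable (cost, unavoidable-conditions) pair
// on some path from that node to the end node.
type DistancesToEnd<G: RankingRuleGraphTrait> =
    MappedInterner<Vec<(u16, SmallBitmap<G::Condition>)>, QueryNode>;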
all_distances: MappedInterner)>, QueryNode>, @@ -101,27 +101,23 @@ pub struct GraphBasedRankingRuleState { fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( ctx: &mut SearchContext<'ctx>, graph: &mut RankingRuleGraph, - edge_docids_cache: &mut EdgeConditionDocIdsCache, + condition_docids_cache: &mut EdgeConditionDocIdsCache, universe: &RoaringBitmap, - empty_paths_cache: &mut DeadEndPathCache, + dead_end_path_cache: &mut DeadEndPathCache, ) -> Result<()> { for edge_id in graph.edges_store.indexes() { let Some(edge) = graph.edges_store.get(edge_id).as_ref() else { continue; }; - let condition = edge.condition; + let Some(condition) = edge.condition else { continue }; - match condition { - EdgeCondition::Unconditional => continue, - EdgeCondition::Conditional(condition) => { - let docids = edge_docids_cache.get_edge_docids(ctx, condition, graph, universe)?; - if docids.is_disjoint(universe) { - graph.remove_edges_with_condition(condition); - empty_paths_cache.add_condition(condition); - edge_docids_cache.cache.remove(&condition); - continue; - } - } + let docids = + condition_docids_cache.get_condition_docids(ctx, condition, graph, universe)?; + if docids.is_disjoint(universe) { + graph.remove_edges_with_condition(condition); + dead_end_path_cache.add_condition(condition); + condition_docids_cache.cache.remove(&condition); + continue; } } Ok(()) @@ -139,17 +135,17 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase query_graph: &QueryGraph, ) -> Result<()> { let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; - let mut edge_docids_cache = EdgeConditionDocIdsCache::default(); - let mut empty_paths_cache = DeadEndPathCache::new(&graph.conditions_interner); + let mut condition_docids_cache = EdgeConditionDocIdsCache::default(); + let mut dead_end_path_cache = DeadEndPathCache::new(&graph.conditions_interner); - // First simplify the graph as much as possible, by computing the docids of the edges + // First simplify the graph as much as possible, by computing the docids of all the conditions // within the rule's universe and removing the edges that have no associated docids. remove_empty_edges( ctx, &mut graph, - &mut edge_docids_cache, + &mut condition_docids_cache, universe, - &mut empty_paths_cache, + &mut dead_end_path_cache, )?; // Then pre-compute the cost of all paths from each node to the end node @@ -157,8 +153,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let state = GraphBasedRankingRuleState { graph, - edge_conditions_cache: edge_docids_cache, - empty_paths_cache, + edge_conditions_cache: condition_docids_cache, + dead_end_path_cache, all_distances, cur_distance_idx: 0, }; @@ -187,7 +183,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase &mut state.graph, &mut state.edge_conditions_cache, universe, - &mut state.empty_paths_cache, + &mut state.dead_end_path_cache, )?; // If the cur_distance_idx does not point to a valid cost in the `all_distances` @@ -208,8 +204,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let GraphBasedRankingRuleState { graph, - edge_conditions_cache: edge_docids_cache, - empty_paths_cache, + edge_conditions_cache: condition_docids_cache, + dead_end_path_cache, all_distances, cur_distance_idx: _, } = &mut state; @@ -224,18 +220,18 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // For each path of the given cost, we will compute its associated // document ids. 
// In case the path does not resolve to any document id, we try to figure out why - // and update the `empty_paths_cache` accordingly. + // and update the `dead_end_path_cache` accordingly. // For example, it may be that the path is empty because one of its edges is disjoint // with the universe, or because a prefix of the path is disjoint with the universe, or because // the path contains two edges that are disjoint from each other within the universe. - // Updating the empty_paths_cache helps speed up the execution of `visit_paths_of_cost` and reduces + // Updating the dead_end_path_cache helps speed up the execution of `visit_paths_of_cost` and reduces // the number of future candidate paths given by that same function. graph.visit_paths_of_cost( graph.query_graph.root_node, cost, all_distances, - empty_paths_cache, - |path, graph, empty_paths_cache| { + dead_end_path_cache, + |path, graph, dead_end_path_cache| { // Accumulate the path for logging purposes only paths.push(path.to_vec()); let mut path_docids = universe.clone(); @@ -243,47 +239,48 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // We store the edges and their docids in vectors in case the path turns out to be // empty and we need to figure out why it was empty. let mut visited_conditions = vec![]; - let mut cached_edge_docids = vec![]; + let mut cached_condition_docids = vec![]; // graph.conditions_interner.map(|_| RoaringBitmap::new()); for &condition in path { visited_conditions.push(condition); - let edge_docids = - edge_docids_cache.get_edge_docids(ctx, condition, graph, &universe)?; + let condition_docids = condition_docids_cache + .get_condition_docids(ctx, condition, graph, &universe)?; - cached_edge_docids.push((condition, edge_docids.clone())); // .get_mut(condition) = edge_docids.clone(); + cached_condition_docids.push((condition, condition_docids.clone())); // .get_mut(condition) = condition_docids.clone(); // If the edge is empty, then the path will be empty as well, we update the graph // and caches accordingly and skip to the next candidate path. - if edge_docids.is_disjoint(&universe) { + if condition_docids.is_disjoint(&universe) { // 1. Store in the cache that this edge is empty for this universe - empty_paths_cache.add_condition(condition); + dead_end_path_cache.add_condition(condition); // 2. remove this edge from the ranking rule graph // ouch, no! :( need to link a condition to one or more ranking rule edges graph.remove_edges_with_condition(condition); - // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore - edge_docids_cache.cache.remove(&condition); + // 3. Also remove the entry from the condition_docids_cache, since we don't need it anymore + condition_docids_cache.cache.remove(&condition); return Ok(ControlFlow::Continue(())); } - path_docids &= edge_docids; + path_docids &= condition_docids; // If the (sub)path is empty, we try to figure out why and update the caches accordingly. if path_docids.is_disjoint(&universe) { // First, we know that this path is empty, and thus any path // that is a superset of it will also be empty. - empty_paths_cache.add_prefix(&visited_conditions); + dead_end_path_cache.add_prefix(&visited_conditions); // Second, if the intersection between this edge and any // previous one is disjoint with the universe, // then we also know that any path containing the same couple of // edges will also be empty. 
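A tiny standalone illustration of that pruning rule, using `RoaringBitmap` directly with made-up docid sets:

use roaring::RoaringBitmap;

fn main() {
    // Docids of two conditions within some universe (hypothetical values).
    let c1: RoaringBitmap = [1u32, 2].into_iter().collect();
    let c2: RoaringBitmap = [3u32, 4].into_iter().collect();
    // Their intersection is empty, so every future path traversing both
    // conditions can be discarded without re-running the database queries.
    assert!((&c1 & &c2).is_empty());
}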
- for (past_condition, edge_docids2) in cached_edge_docids.iter() { + for (past_condition, condition_docids2) in cached_condition_docids.iter() { if *past_condition == condition { continue; }; - let intersection = edge_docids & edge_docids2; + let intersection = condition_docids & condition_docids2; if intersection.is_disjoint(&universe) { - empty_paths_cache.add_condition_couple(*past_condition, condition); + dead_end_path_cache + .add_condition_couple(*past_condition, condition); } } // We should maybe instead try to compute: @@ -310,7 +307,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase G::log_state( &original_graph, &paths, - empty_paths_cache, + dead_end_path_cache, original_universe, all_distances, cost, @@ -322,8 +319,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // But we only do it in case the bucket length is >1, because otherwise // we know the child ranking rule won't be called anyway let mut next_query_graph = original_graph.query_graph; - next_query_graph.simplify(); if bucket.len() > 1 { + next_query_graph.simplify(); // 1. Gather all the words and phrases used in the computation of this bucket let mut used_words = HashSet::new(); let mut used_phrases = HashSet::new(); diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index f4dc4a870..3b5544b68 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -25,6 +25,8 @@ impl Interned { /// be copied, compared, and hashed efficiently. An immutable reference to the original value /// can be retrieved using `self.get(interned)`. A set of values within the interner can be /// efficiently managed using [`SmallBitmap`](super::small_bitmap::SmallBitmap). +/// +/// A dedup-interner can contain a maximum of `u16::MAX` values. #[derive(Clone)] pub struct DedupInterner { stable_store: Vec, @@ -36,7 +38,8 @@ impl Default for DedupInterner { } } impl DedupInterner { - /// + /// Convert the dedup-interner into a fixed-size interner, such that new + /// elements cannot be added to it anymore. pub fn freeze(self) -> FixedSizeInterner { FixedSizeInterner { stable_store: self.stable_store } } @@ -46,6 +49,8 @@ impl DedupInterner where T: Clone + Eq + Hash, { + /// Insert the given value into the dedup-interner, and return + /// its index. pub fn insert(&mut self, s: T) -> Interned { if let Some(interned) = self.lookup.get(&s) { *interned @@ -57,35 +62,21 @@ where interned } } + /// Get a reference to the interned value. pub fn get(&self, interned: Interned) -> &T { &self.stable_store[interned.idx as usize] } } -#[derive(Clone)] -pub struct Interner { - stable_store: Vec, -} -impl Default for Interner { - fn default() -> Self { - Self { stable_store: Default::default() } - } -} -impl Interner { - pub fn freeze(self) -> FixedSizeInterner { - FixedSizeInterner { stable_store: self.stable_store } - } - pub fn push(&mut self, s: T) -> Interned { - assert!(self.stable_store.len() < u16::MAX as usize); - self.stable_store.push(s); - Interned::from_raw(self.stable_store.len() as u16 - 1) - } -} +/// A fixed-length store for values of type `T`, where each value is identified +/// by an index of type [`Interned`]. #[derive(Clone)] pub struct FixedSizeInterner { stable_store: Vec, } impl FixedSizeInterner { + /// Create a fixed-size interner of the given length containing + /// clones of the given value. 
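A short usage sketch of how these three interner types fit together. It assumes the `Interned` boilerplate (`Copy`, `Eq`, `Debug`) mentioned in this diff and a `get` accessor on `MappedInterner`, not all of which are shown here:

// Assumes: use crate::search::new::interner::{DedupInterner, FixedSizeInterner, MappedInterner};
fn interner_sketch() {
    let mut words: DedupInterner<String> = DedupInterner::default();
    let a = words.insert("sun".to_string());
    let b = words.insert("sun".to_string());
    assert_eq!(a, b); // deduplicated: equal values share one Interned<String>

    let frozen: FixedSizeInterner<String> = words.freeze();
    let lengths: MappedInterner<usize, String> = frozen.map(|w| w.len());
    assert_eq!(*lengths.get(a), 3);
}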
pub fn new(length: u16, value: T) -> Self { Self { stable_store: vec![value; length as usize] } } @@ -105,7 +96,6 @@ impl FixedSizeInterner { pub fn len(&self) -> u16 { self.stable_store.len() as u16 } - pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { MappedInterner { stable_store: self.stable_store.iter().map(map_f).collect(), @@ -122,6 +112,12 @@ impl FixedSizeInterner { self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x)) } } + +/// A store of values of type `T`, each linked to a value of type `From` +/// stored in another interner. To create a mapped interner, use the +/// `map` method on [`FixedSizeInterner`] or [`MappedInterner`]. +/// +/// Values in this interner are indexed with [`Interned`]. #[derive(Clone)] pub struct MappedInterner { stable_store: Vec, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 752e5ce35..96f4092b1 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -10,7 +10,7 @@ use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::ranking_rule_graph::{ - DeadEndPathCache, Edge, EdgeCondition, ProximityCondition, ProximityGraph, RankingRuleGraph, + DeadEndPathCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoEdge, TypoGraph, }; use crate::search::new::small_bitmap::SmallBitmap; @@ -44,7 +44,7 @@ pub enum SearchEvents { ProximityState { graph: RankingRuleGraph, paths: Vec>>, - empty_paths_cache: DeadEndPathCache, + dead_end_path_cache: DeadEndPathCache, universe: RoaringBitmap, distances: MappedInterner)>, QueryNode>, cost: u16, @@ -52,7 +52,7 @@ pub enum SearchEvents { TypoState { graph: RankingRuleGraph, paths: Vec>>, - empty_paths_cache: DeadEndPathCache, + dead_end_path_cache: DeadEndPathCache, universe: RoaringBitmap, distances: MappedInterner)>, QueryNode>, cost: u16, @@ -170,7 +170,7 @@ impl SearchLogger for DetailedSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec>], - empty_paths_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -178,7 +178,7 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), - empty_paths_cache: empty_paths_cache.clone(), + dead_end_path_cache: dead_end_path_cache.clone(), universe: universe.clone(), distances: distances.clone(), cost, @@ -189,7 +189,7 @@ impl SearchLogger for DetailedSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec>], - empty_paths_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -197,7 +197,7 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), - empty_paths_cache: empty_paths_cache.clone(), + dead_end_path_cache: dead_end_path_cache.clone(), universe: universe.clone(), distances: distances.clone(), cost, @@ -358,7 +358,7 @@ results.{random} {{ SearchEvents::ProximityState { graph, paths, - empty_paths_cache, + dead_end_path_cache, universe, distances, cost, @@ -374,7 +374,7 @@ results.{random} {{ ctx, graph, paths, - empty_paths_cache, + 
dead_end_path_cache, distances.clone(), &mut new_file, ); @@ -391,7 +391,7 @@ results.{random} {{ SearchEvents::TypoState { graph, paths, - empty_paths_cache, + dead_end_path_cache, universe, distances, cost, @@ -407,7 +407,7 @@ results.{random} {{ ctx, graph, paths, - empty_paths_cache, + dead_end_path_cache, distances.clone(), &mut new_file, ); @@ -547,11 +547,11 @@ shape: class" let Edge { source_node, dest_node, condition: details, cost } = edge; match &details { - EdgeCondition::Unconditional => { + None => { writeln!(file, "{source_node} -> {dest_node} : \"always cost {cost}\"",) .unwrap(); } - EdgeCondition::Conditional(condition) => { + Some(condition) => { // let condition = graph.conditions_interner.get(*condition); writeln!( file, diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index e6c3931ed..696f65443 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -66,7 +66,7 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec>], - empty_paths_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -77,7 +77,7 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec>], - empty_paths_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -137,7 +137,7 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths_map: &[Vec>], - _empty_paths_cache: &DeadEndPathCache, + _dead_end_path_cache: &DeadEndPathCache, _universe: &RoaringBitmap, _distances: &MappedInterner)>, QueryNode>, _cost: u16, @@ -148,7 +148,7 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths: &[Vec>], - _empty_paths_cache: &DeadEndPathCache, + _dead_end_path_cache: &DeadEndPathCache, _universe: &RoaringBitmap, _distances: &MappedInterner)>, QueryNode>, _cost: u16, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 92e00df11..8ff832de4 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -36,6 +36,7 @@ use crate::search::new::query_term::located_query_terms_from_string; use crate::search::new::words::Words; use crate::{Filter, Index, Result, TermsMatchingStrategy}; +/// A structure used throughout the execution of a search query. pub struct SearchContext<'ctx> { pub index: &'ctx Index, pub txn: &'ctx RoTxn<'ctx>, @@ -59,6 +60,7 @@ impl<'ctx> SearchContext<'ctx> { } } +/// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it. #[allow(clippy::too_many_arguments)] fn resolve_maximally_reduced_query_graph<'ctx>( ctx: &mut SearchContext<'ctx>, @@ -100,6 +102,8 @@ fn resolve_maximally_reduced_query_graph<'ctx>( Ok(docids) } + +/// Return the list of initialised ranking rules to be used for a placeholder search. fn get_ranking_rules_for_placeholder_search<'ctx>( ctx: &SearchContext<'ctx>, ) -> Result>>> { @@ -123,6 +127,8 @@ fn get_ranking_rules_for_placeholder_search<'ctx>( } Ok(ranking_rules) } + +/// Return the list of initialised ranking rules to be used for a query graph search. 
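A hedged sketch of the bucket order such a list encodes, using plain strings instead of the real rule constructors (the concrete list also depends on the index's criteria settings, which are not shown here):

fn main() {
    // Hypothetical simplified order; the series introduces Words, a typo
    // graph rule, a proximity graph rule, and Sort elsewhere.
    for rule in ["words", "typo", "proximity", "sort"] {
        println!("then bucket-sort by: {rule}");
    }
}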
fn get_ranking_rules_for_query_graph_search<'ctx>( ctx: &SearchContext<'ctx>, terms_matching_strategy: TermsMatchingStrategy, diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 7ab08aceb..cd622a7ba 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,7 +1,7 @@ use std::collections::HashSet; use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::interner::{DedupInterner, Interner}; +use crate::search::new::interner::DedupInterner; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, SearchContext}; use crate::Result; @@ -19,7 +19,7 @@ impl RankingRuleGraph { let mut conditions_interner = DedupInterner::default(); - let mut edges_store = Interner::default(); + let mut edges_store = DedupInterner::default(); let mut edges_of_node = query_graph.nodes.map(|_| HashSet::new()); for (source_id, source_node) in graph_nodes.iter() { @@ -33,7 +33,7 @@ impl RankingRuleGraph { } for (cost, condition) in edges { - let new_edge_id = edges_store.push(Some(Edge { + let new_edge_id = edges_store.insert(Some(Edge { source_node: source_id, dest_node: dest_idx, cost, diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 52f199ca9..a86f1fcb0 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -4,8 +4,8 @@ use std::collections::btree_map::Entry; use std::collections::{BTreeMap, VecDeque}; use std::ops::ControlFlow; -use super::empty_paths_cache::DeadEndPathCache; -use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; +use super::dead_end_path_cache::DeadEndPathCache; +use super::{RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNode; use crate::search::new::small_bitmap::SmallBitmap; @@ -23,7 +23,7 @@ impl RankingRuleGraph { from: Interned, cost: u16, all_distances: &MappedInterner)>, QueryNode>, - empty_paths_cache: &mut DeadEndPathCache, + dead_end_path_cache: &mut DeadEndPathCache, mut visit: impl FnMut( &[Interned], &mut Self, @@ -34,11 +34,11 @@ impl RankingRuleGraph { from, cost, all_distances, - empty_paths_cache, + dead_end_path_cache, &mut visit, &mut vec![], &mut SmallBitmap::for_interned_values_in(&self.conditions_interner), - &mut empty_paths_cache.conditions.clone(), + &mut dead_end_path_cache.conditions.clone(), )?; Ok(()) } @@ -47,7 +47,7 @@ impl RankingRuleGraph { from: Interned, cost: u16, all_distances: &MappedInterner)>, QueryNode>, - empty_paths_cache: &mut DeadEndPathCache, + dead_end_path_cache: &mut DeadEndPathCache, visit: &mut impl FnMut( &[Interned], &mut Self, @@ -66,10 +66,10 @@ impl RankingRuleGraph { continue; } let next_any_valid = match edge.condition { - EdgeCondition::Unconditional => { + None => { if edge.dest_node == self.query_graph.end_node { any_valid = true; - let control_flow = visit(prev_conditions, self, empty_paths_cache)?; + let control_flow = visit(prev_conditions, self, dead_end_path_cache)?; match control_flow { ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(true), @@ -80,7 +80,7 @@ impl RankingRuleGraph { edge.dest_node, cost - edge.cost as u16, all_distances, - empty_paths_cache, + dead_end_path_cache, visit, prev_conditions, cur_path, @@ -88,7 +88,7 @@ impl 
RankingRuleGraph { )? } } - EdgeCondition::Conditional(condition) => { + Some(condition) => { if forbidden_conditions.contains(condition) || !all_distances.get(edge.dest_node).iter().any( |(next_cost, necessary_conditions)| { @@ -104,8 +104,8 @@ impl RankingRuleGraph { let mut new_forbidden_conditions = forbidden_conditions.clone(); new_forbidden_conditions - .union(empty_paths_cache.condition_couples.get(condition)); - empty_paths_cache.prefixes.final_edges_after_prefix( + .union(dead_end_path_cache.condition_couples.get(condition)); + dead_end_path_cache.prefixes.final_edges_after_prefix( prev_conditions, &mut |x| { new_forbidden_conditions.insert(x); @@ -113,7 +113,7 @@ impl RankingRuleGraph { ); let next_any_valid = if edge.dest_node == self.query_graph.end_node { any_valid = true; - let control_flow = visit(prev_conditions, self, empty_paths_cache)?; + let control_flow = visit(prev_conditions, self, dead_end_path_cache)?; match control_flow { ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(true), @@ -124,7 +124,7 @@ impl RankingRuleGraph { edge.dest_node, cost - edge.cost as u16, all_distances, - empty_paths_cache, + dead_end_path_cache, visit, prev_conditions, cur_path, @@ -139,15 +139,15 @@ impl RankingRuleGraph { any_valid |= next_any_valid; if next_any_valid { - if empty_paths_cache.path_is_dead_end(prev_conditions, cur_path) { + if dead_end_path_cache.path_is_dead_end(prev_conditions, cur_path) { return Ok(any_valid); } - forbidden_conditions.union(&empty_paths_cache.conditions); + forbidden_conditions.union(&dead_end_path_cache.conditions); for prev_condition in prev_conditions.iter() { forbidden_conditions - .union(empty_paths_cache.condition_couples.get(*prev_condition)); + .union(dead_end_path_cache.condition_couples.get(*prev_condition)); } - empty_paths_cache.prefixes.final_edges_after_prefix(prev_conditions, &mut |x| { + dead_end_path_cache.prefixes.final_edges_after_prefix(prev_conditions, &mut |x| { forbidden_conditions.insert(x); }); } @@ -178,16 +178,14 @@ impl RankingRuleGraph { let cur_node_edges = &self.edges_of_node.get(cur_node); for edge_idx in cur_node_edges.iter() { let edge = self.edges_store.get(edge_idx).as_ref().unwrap(); - let condition = match edge.condition { - EdgeCondition::Unconditional => None, - EdgeCondition::Conditional(condition) => Some(condition), - }; let succ_node = edge.dest_node; let succ_distances = distances_to_end.get(succ_node); for (succ_distance, succ_necessary_conditions) in succ_distances { let mut potential_necessary_edges = SmallBitmap::for_interned_values_in(&self.conditions_interner); - for condition in condition.into_iter().chain(succ_necessary_conditions.iter()) { + for condition in + edge.condition.into_iter().chain(succ_necessary_conditions.iter()) + { potential_necessary_edges.insert(condition); } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs similarity index 60% rename from milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs rename to milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs index b3426619b..9766cfaa3 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs @@ -9,44 +9,43 @@ use crate::search::new::SearchContext; use crate::Result; /// A cache storing the document ids associated with each ranking rule edge -pub struct EdgeConditionDocIdsCache { +pub struct ConditionDocIdsCache { // TODO: 
should be FxHashMap, RoaringBitmap> - pub cache: FxHashMap, RoaringBitmap>, + pub cache: FxHashMap, RoaringBitmap>, _phantom: PhantomData, } -impl Default for EdgeConditionDocIdsCache { +impl Default for ConditionDocIdsCache { fn default() -> Self { Self { cache: Default::default(), _phantom: Default::default() } } } -impl EdgeConditionDocIdsCache { +impl ConditionDocIdsCache { /// Retrieve the document ids for the given edge condition. /// /// If the cache does not yet contain these docids, they are computed /// and inserted in the cache. - pub fn get_edge_docids<'s, 'ctx>( + pub fn get_condition_docids<'s, 'ctx>( &'s mut self, ctx: &mut SearchContext<'ctx>, - // TODO: should be Interned - interned_edge_condition: Interned, + interned_condition: Interned, graph: &RankingRuleGraph, // TODO: maybe universe doesn't belong here universe: &RoaringBitmap, ) -> Result<&'s RoaringBitmap> { - if self.cache.contains_key(&interned_edge_condition) { + if self.cache.contains_key(&interned_condition) { // TODO: should we update the bitmap in the cache if the new universe // reduces it? // TODO: maybe have a generation: u32 to track every time the universe was // reduced. Then only attempt to recompute the intersection when there is a chance - // that edge_docids & universe changed - return Ok(&self.cache[&interned_edge_condition]); + // that condition_docids & universe changed + return Ok(&self.cache[&interned_condition]); } // TODO: maybe universe doesn't belong here - let edge_condition = graph.conditions_interner.get(interned_edge_condition); + let condition = graph.conditions_interner.get(interned_condition); // TODO: faster way to do this? - let docids = universe & G::resolve_edge_condition(ctx, edge_condition, universe)?; - let _ = self.cache.insert(interned_edge_condition, docids); - let docids = &self.cache[&interned_edge_condition]; + let docids = universe & G::resolve_condition(ctx, condition, universe)?; + let _ = self.cache.insert(interned_condition, docids); + let docids = &self.cache[&interned_condition]; Ok(docids) } } diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs similarity index 78% rename from milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs rename to milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs index 15346e929..b4af625d6 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs @@ -9,11 +9,11 @@ use crate::search::new::{ /// universe. pub struct DeadEndPathCache { /// The set of edge conditions that resolve to no documents. - pub conditions: SmallBitmap, + pub conditions: SmallBitmap, /// A set of path prefixes that resolve to no documents. - pub prefixes: PathSet, + pub prefixes: PathSet, /// A set of empty couples of edge conditions that resolve to no documents. - pub condition_couples: MappedInterner, G::EdgeCondition>, + pub condition_couples: MappedInterner, G::Condition>, } impl Clone for DeadEndPathCache { fn clone(&self) -> Self { @@ -27,17 +27,17 @@ impl Clone for DeadEndPathCache { impl DeadEndPathCache { /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. 
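A standalone model of the checks this cache performs, with plain `u16` identifiers standing in for interned conditions (illustrative only; the real type also keeps a prefix trie of dead-end path prefixes):

fn path_is_dead_end(
    path: &[u16],
    dead_conditions: &[u16],
    dead_couples: &[(u16, u16)],
) -> bool {
    // Dead if the path uses a condition that matches no documents, or any
    // couple of conditions whose docid sets never intersect.
    path.iter().any(|c| dead_conditions.contains(c))
        || dead_couples.iter().any(|(a, b)| path.contains(a) && path.contains(b))
}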
- pub fn new(all_edge_conditions: &FixedSizeInterner) -> Self { + pub fn new(all_conditions: &FixedSizeInterner) -> Self { Self { - conditions: SmallBitmap::for_interned_values_in(all_edge_conditions), + conditions: SmallBitmap::for_interned_values_in(all_conditions), prefixes: PathSet::default(), - condition_couples: all_edge_conditions - .map(|_| SmallBitmap::for_interned_values_in(all_edge_conditions)), + condition_couples: all_conditions + .map(|_| SmallBitmap::for_interned_values_in(all_conditions)), } } /// Store in the cache that every path containing the given edge resolves to no documents. - pub fn add_condition(&mut self, condition: Interned) { + pub fn add_condition(&mut self, condition: Interned) { self.conditions.insert(condition); self.condition_couples.get_mut(condition).clear(); self.prefixes.remove_edge(condition); @@ -46,7 +46,7 @@ impl DeadEndPathCache { } } /// Store in the cache that every path containing the given prefix resolves to no documents. - pub fn add_prefix(&mut self, prefix: &[Interned]) { + pub fn add_prefix(&mut self, prefix: &[Interned]) { // TODO: typed PathSet self.prefixes.insert(prefix.iter().copied()); } @@ -54,8 +54,8 @@ impl DeadEndPathCache { /// Store in the cache that every path containing the two given edges resolves to no documents. pub fn add_condition_couple( &mut self, - edge1: Interned, - edge2: Interned, + edge1: Interned, + edge2: Interned, ) { self.condition_couples.get_mut(edge1).insert(edge2); } @@ -63,8 +63,8 @@ impl DeadEndPathCache { /// Returns true if the cache can determine that the given path resolves to no documents. pub fn path_is_dead_end( &self, - path: &[Interned], - path_bitmap: &SmallBitmap, + path: &[Interned], + path_bitmap: &SmallBitmap, ) -> bool { if path_bitmap.intersects(&self.conditions) { return true; diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 03e7c14bc..fcb9ade3f 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -7,8 +7,8 @@ the same but the edges are replaced. mod build; mod cheapest_paths; -mod edge_docids_cache; -mod empty_paths_cache; +mod condition_docids_cache; +mod dead_end_path_cache; mod path_set; /// Implementation of the `proximity` ranking rule @@ -19,8 +19,8 @@ mod typo; use std::collections::HashSet; use std::hash::Hash; -pub use edge_docids_cache::EdgeConditionDocIdsCache; -pub use empty_paths_cache::DeadEndPathCache; +pub use condition_docids_cache::EdgeConditionDocIdsCache; +pub use dead_end_path_cache::DeadEndPathCache; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoEdge, TypoGraph}; @@ -32,31 +32,6 @@ use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::Result; -/// The condition that is associated with an edge in the ranking rule graph. -/// -/// Some edges are unconditional, which means that traversing them does not reduce -/// the set of candidates. -/// -/// Most edges, however, have a condition attached to them. For example, for the -/// proximity ranking rule, the condition could be that a word is N-close to another one. -/// When the edge is traversed, some database operations are executed to retrieve the set -/// of documents that satisfy the condition, which reduces the list of candidate document ids. 
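The enum removed below is what this commit replaces: `Option` already models the unconditional/conditional split and inherits `Copy`, `Clone`, `Hash` and `Eq` from `Interned` for free. A minimal sketch of the correspondence (assuming the `Interned<T>` type from this series):

// before: EdgeCondition::Unconditional  => edge always traversable
// after:  None
// before: EdgeCondition::Conditional(c) => docids of `c` must be resolved
// after:  Some(c)
fn edge_is_unconditional<T>(condition: Option<Interned<T>>) -> bool {
    condition.is_none()
}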
-pub enum EdgeCondition { - Unconditional, - Conditional(Interned), -} - -impl Copy for EdgeCondition {} - -impl Clone for EdgeCondition { - fn clone(&self) -> Self { - match self { - Self::Unconditional => Self::Unconditional, - Self::Conditional(arg0) => Self::Conditional(*arg0), - } - } -} - /// An edge in the ranking rule graph. /// /// It contains: @@ -68,7 +43,27 @@ pub struct Edge { pub source_node: Interned, pub dest_node: Interned, pub cost: u8, - pub condition: EdgeCondition, + pub condition: Option>, +} + +impl Hash for Edge { + fn hash(&self, state: &mut H) { + self.source_node.hash(state); + self.dest_node.hash(state); + self.cost.hash(state); + self.condition.hash(state); + } +} + +impl Eq for Edge {} + +impl PartialEq for Edge { + fn eq(&self, other: &Self) -> bool { + self.source_node == other.source_node + && self.dest_node == other.dest_node + && self.cost == other.cost + && self.condition == other.condition + } } /// A trait to be implemented by a marker type to build a graph-based ranking rule. @@ -113,12 +108,12 @@ pub trait RankingRuleGraphTrait: Sized { conditions_interner: &mut DedupInterner, source_node: &QueryNode, dest_node: &QueryNode, - ) -> Result)>>; + ) -> Result>)>>; fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - empty_paths_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -151,15 +146,12 @@ impl RankingRuleGraph { pub fn remove_edges_with_condition(&mut self, condition_to_remove: Interned) { for (edge_id, edge_opt) in self.edges_store.iter_mut() { let Some(edge) = edge_opt.as_mut() else { continue }; - match edge.condition { - EdgeCondition::Unconditional => continue, - EdgeCondition::Conditional(condition) => { - if condition == condition_to_remove { - let (source_node, _dest_node) = (edge.source_node, edge.dest_node); - *edge_opt = None; - self.edges_of_node.get_mut(source_node).remove(edge_id); - } - } + let Some(condition) = edge.condition else { continue }; + + if condition == condition_to_remove { + let (source_node, _dest_node) = (edge.source_node, edge.dest_node); + *edge_opt = None; + self.edges_of_node.get_mut(source_node).remove(edge_id); } } } diff --git a/milli/src/search/new/ranking_rule_graph/path_set.rs b/milli/src/search/new/ranking_rule_graph/path_set.rs index 04e6c9116..19c7d012e 100644 --- a/milli/src/search/new/ranking_rule_graph/path_set.rs +++ b/milli/src/search/new/ranking_rule_graph/path_set.rs @@ -4,7 +4,7 @@ use crate::search::new::interner::Interned; -/// A set of [`Path`] +/// A set of `Vec>`. 
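A self-contained model of the prefix trie behind `PathSet`, with bare `u16` edges standing in for interned conditions; it mirrors the `insert` logic of the struct that follows:

#[derive(Default)]
struct MiniPathSet {
    nodes: Vec<(u16, MiniPathSet)>, // one subtree per distinct first edge
    is_end: bool,                   // a stored path terminates at this node
}

impl MiniPathSet {
    fn insert(&mut self, mut edges: impl Iterator<Item = u16>) {
        match edges.next() {
            None => self.is_end = true,
            Some(first) => {
                if let Some((_, child)) = self.nodes.iter_mut().find(|(e, _)| *e == first) {
                    child.insert(edges);
                } else {
                    let mut child = MiniPathSet::default();
                    child.insert(edges);
                    self.nodes.push((first, child));
                }
            }
        }
    }
}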
pub struct PathSet { nodes: Vec<(Interned, Self)>, is_end: bool, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 8ae634fbf..5e38a1879 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -7,7 +7,6 @@ use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; use crate::search::new::ranking_rule_graph::proximity::WordPair; -use crate::search::new::ranking_rule_graph::EdgeCondition; use crate::search::new::{QueryNode, SearchContext}; use crate::Result; use heed::RoTxn; @@ -40,7 +39,7 @@ pub fn build_edges<'ctx>( conditions_interner: &mut DedupInterner, from_node: &QueryNode, to_node: &QueryNode, -) -> Result)>> { +) -> Result>)>> { let SearchContext { index, txn, @@ -52,7 +51,7 @@ pub fn build_edges<'ctx>( } = ctx; let right_term = match &to_node.data { - QueryNodeData::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), + QueryNodeData::End => return Ok(vec![(0, None)]), QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]), QueryNodeData::Term(term) => term, }; @@ -70,7 +69,7 @@ pub fn build_edges<'ctx>( QueryNodeData::Start => { return Ok(vec![( (right_ngram_length - 1) as u8, - EdgeCondition::Conditional( + Some( conditions_interner .insert(ProximityCondition::Term { term: *right_term_interned }), ), @@ -88,7 +87,7 @@ pub fn build_edges<'ctx>( // but `sun` and `are` have no proximity condition between them return Ok(vec![( (right_ngram_length - 1) as u8, - EdgeCondition::Conditional( + Some( conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }), ), )]); @@ -140,7 +139,7 @@ pub fn build_edges<'ctx>( .map(|(cost, word_pairs)| { ( cost, - EdgeCondition::Conditional( + Some( conditions_interner .insert(ProximityCondition::Pairs { pairs: word_pairs.into_boxed_slice() }), ), @@ -149,9 +148,7 @@ pub fn build_edges<'ctx>( .collect::>(); new_edges.push(( 8 + (right_ngram_length - 1) as u8, - EdgeCondition::Conditional( - conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }), - ), + Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })), )); Ok(new_edges) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 5a5c5cff8..d5655e949 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -6,8 +6,8 @@ use std::iter::FromIterator; use roaring::RoaringBitmap; -use super::empty_paths_cache::DeadEndPathCache; -use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; +use super::dead_end_path_cache::DeadEndPathCache; +use super::{RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::{Phrase, QueryTerm}; @@ -60,20 +60,20 @@ impl RankingRuleGraphTrait for ProximityGraph { conditions_interner: &mut DedupInterner, source_node: &QueryNode, dest_node: &QueryNode, - ) -> Result)>> { + ) -> Result>)>> { build::build_edges(ctx, conditions_interner, source_node, dest_node) } fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - empty_paths_cache: &DeadEndPathCache, + 
dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ) { - logger.log_proximity_state(graph, paths, empty_paths_cache, universe, distances, cost); + logger.log_proximity_state(graph, paths, dead_end_path_cache, universe, distances, cost); } fn label_for_edge_condition<'ctx>( diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index d0b18ddc8..b41787ac2 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,7 +1,7 @@ use roaring::RoaringBitmap; -use super::empty_paths_cache::DeadEndPathCache; -use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; +use super::dead_end_path_cache::DeadEndPathCache; +use super::{RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_graph::QueryNodeData; @@ -58,7 +58,7 @@ impl RankingRuleGraphTrait for TypoGraph { conditions_interner: &mut DedupInterner, _from_node: &QueryNode, to_node: &QueryNode, - ) -> Result)>> { + ) -> Result>)>> { let SearchContext { term_interner, .. } = ctx; match &to_node.data { QueryNodeData::Term(LocatedQueryTerm { value, positions }) => { @@ -121,7 +121,7 @@ impl RankingRuleGraphTrait for TypoGraph { if !new_term.is_empty() { edges.push(( nbr_typos as u8 + base_cost, - EdgeCondition::Conditional(conditions_interner.insert(TypoEdge { + Some(conditions_interner.insert(TypoEdge { term: term_interner.insert(new_term), nbr_typos: nbr_typos as u8, })), @@ -130,7 +130,7 @@ impl RankingRuleGraphTrait for TypoGraph { } Ok(edges) } - QueryNodeData::End => Ok(vec![(0, EdgeCondition::Unconditional)]), + QueryNodeData::End => Ok(vec![(0, None)]), QueryNodeData::Deleted | QueryNodeData::Start => panic!(), } } @@ -138,13 +138,13 @@ impl RankingRuleGraphTrait for TypoGraph { fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - empty_paths_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ) { - logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances, cost); + logger.log_typo_state(graph, paths, dead_end_path_cache, universe, distances, cost); } fn label_for_edge_condition<'ctx>( diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index cb1c64ec2..975c5343f 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -2,6 +2,7 @@ use std::marker::PhantomData; use super::interner::{FixedSizeInterner, Interned}; +/// A compact set of [`Interned`] pub struct SmallBitmap { internal: SmallBitmapInternal, _phantom: PhantomData, From 2853009987fad3f20fde077fa4ae0abc971ed0ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Mar 2023 11:49:23 +0100 Subject: [PATCH 062/234] Renaming Edge -> Condition --- .../search/new/graph_based_ranking_rule.rs | 20 ++++----- milli/src/search/new/logger/detailed.rs | 22 ++++----- milli/src/search/new/logger/mod.rs | 11 ++--- .../new/ranking_rule_graph/cheapest_paths.rs | 18 ++++---- .../src/search/new/ranking_rule_graph/mod.rs | 44 +++++++++--------- .../search/new/ranking_rule_graph/path_set.rs | 2 +- .../proximity/compute_docids.rs | 4 +- .../new/ranking_rule_graph/proximity/mod.rs | 28 
++++++------ .../search/new/ranking_rule_graph/typo/mod.rs | 45 +++++++++---------- 9 files changed, 97 insertions(+), 97 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 72c839f2a..bbe4099bc 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -45,7 +45,7 @@ use super::interner::MappedInterner; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - DeadEndPathCache, EdgeConditionDocIdsCache, ProximityGraph, RankingRuleGraph, + ConditionDocIdsCache, DeadEndPathCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; @@ -85,12 +85,12 @@ pub struct GraphBasedRankingRuleState { /// The current graph graph: RankingRuleGraph, /// Cache to retrieve the docids associated with each edge - edge_conditions_cache: EdgeConditionDocIdsCache, + conditions_cache: ConditionDocIdsCache, /// Cache used to optimistically discard paths that resolve to no documents. dead_end_path_cache: DeadEndPathCache, /// A structure giving the list of possible costs from each node to the end node, /// along with a set of unavoidable edges that must be traversed to achieve that distance. - all_distances: MappedInterner)>, QueryNode>, + all_distances: MappedInterner)>, QueryNode>, /// An index in the first element of `all_distances`, giving the cost of the next bucket cur_distance_idx: usize, } @@ -101,7 +101,7 @@ pub struct GraphBasedRankingRuleState { fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( ctx: &mut SearchContext<'ctx>, graph: &mut RankingRuleGraph, - condition_docids_cache: &mut EdgeConditionDocIdsCache, + condition_docids_cache: &mut ConditionDocIdsCache, universe: &RoaringBitmap, dead_end_path_cache: &mut DeadEndPathCache, ) -> Result<()> { @@ -135,7 +135,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase query_graph: &QueryGraph, ) -> Result<()> { let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; - let mut condition_docids_cache = EdgeConditionDocIdsCache::default(); + let mut condition_docids_cache = ConditionDocIdsCache::default(); let mut dead_end_path_cache = DeadEndPathCache::new(&graph.conditions_interner); // First simplify the graph as much as possible, by computing the docids of all the conditions @@ -153,7 +153,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let state = GraphBasedRankingRuleState { graph, - edge_conditions_cache: condition_docids_cache, + conditions_cache: condition_docids_cache, dead_end_path_cache, all_distances, cur_distance_idx: 0, @@ -181,7 +181,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase remove_empty_edges( ctx, &mut state.graph, - &mut state.edge_conditions_cache, + &mut state.conditions_cache, universe, &mut state.dead_end_path_cache, )?; @@ -204,7 +204,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let GraphBasedRankingRuleState { graph, - edge_conditions_cache: condition_docids_cache, + conditions_cache: condition_docids_cache, dead_end_path_cache, all_distances, cur_distance_idx: _, @@ -326,8 +326,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let mut used_phrases = HashSet::new(); for condition in used_conditions.iter() { let condition = graph.conditions_interner.get(condition); - 
used_words.extend(G::words_used_by_edge_condition(ctx, condition)?); - used_phrases.extend(G::phrases_used_by_edge_condition(ctx, condition)?); + used_words.extend(G::words_used_by_condition(ctx, condition)?); + used_phrases.extend(G::phrases_used_by_condition(ctx, condition)?); } // 2. Remove the unused words and phrases from all the nodes in the graph let mut nodes_to_remove = vec![]; diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 96f4092b1..1ed992c56 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -11,7 +11,7 @@ use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::ranking_rule_graph::{ DeadEndPathCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, - RankingRuleGraphTrait, TypoEdge, TypoGraph, + RankingRuleGraphTrait, TypoCondition, TypoGraph, }; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; @@ -51,10 +51,10 @@ pub enum SearchEvents { }, TypoState { graph: RankingRuleGraph, - paths: Vec>>, + paths: Vec>>, dead_end_path_cache: DeadEndPathCache, universe: RoaringBitmap, - distances: MappedInterner)>, QueryNode>, + distances: MappedInterner)>, QueryNode>, cost: u16, }, RankingRuleSkipBucket { @@ -188,10 +188,10 @@ impl SearchLogger for DetailedSearchLogger { fn log_typo_state( &mut self, query_graph: &RankingRuleGraph, - paths_map: &[Vec>], + paths_map: &[Vec>], dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ) { self.events.push(SearchEvents::TypoState { @@ -430,7 +430,7 @@ results.{random} {{ ctx: &mut SearchContext, node_idx: Interned, node: &QueryNode, - distances: &[(u16, SmallBitmap)], + distances: &[(u16, SmallBitmap)], file: &mut File, ) { match &node.data { @@ -527,9 +527,9 @@ shape: class" fn ranking_rule_graph_d2_description( ctx: &mut SearchContext, graph: &RankingRuleGraph, - paths: &[Vec>], + paths: &[Vec>], dead_end_paths_cache: &DeadEndPathCache, - distances: MappedInterner)>, QueryNode>, + distances: MappedInterner)>, QueryNode>, file: &mut File, ) { writeln!(file, "direction: right").unwrap(); @@ -596,7 +596,7 @@ shape: class" fn condition_d2_description( ctx: &mut SearchContext, graph: &RankingRuleGraph, - condition_id: Interned, + condition_id: Interned, file: &mut File, ) { let condition = graph.conditions_interner.get(condition_id); @@ -606,14 +606,14 @@ shape: class" shape: class {} }}", - R::label_for_edge_condition(ctx, condition).unwrap() + R::label_for_condition(ctx, condition).unwrap() ) .unwrap(); } fn paths_d2_description( ctx: &mut SearchContext, graph: &RankingRuleGraph, - paths: &[Vec>], + paths: &[Vec>], file: &mut File, ) { for (path_idx, condition_indexes) in paths.iter().enumerate() { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 696f65443..ba58d049f 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -6,7 +6,8 @@ use roaring::RoaringBitmap; use super::interner::{Interned, MappedInterner}; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - DeadEndPathCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoEdge, TypoGraph, + DeadEndPathCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoCondition, + TypoGraph, }; use 
super::small_bitmap::SmallBitmap; use super::{RankingRule, RankingRuleQueryTrait}; @@ -76,10 +77,10 @@ pub trait SearchLogger { fn log_typo_state( &mut self, query_graph: &RankingRuleGraph, - paths: &[Vec>], + paths: &[Vec>], dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ); } @@ -147,10 +148,10 @@ impl SearchLogger for DefaultSearchLogger { fn log_typo_state( &mut self, _query_graph: &RankingRuleGraph, - _paths: &[Vec>], + _paths: &[Vec>], _dead_end_path_cache: &DeadEndPathCache, _universe: &RoaringBitmap, - _distances: &MappedInterner)>, QueryNode>, + _distances: &MappedInterner)>, QueryNode>, _cost: u16, ) { } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index a86f1fcb0..bc0e88326 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -22,10 +22,10 @@ impl RankingRuleGraph { &mut self, from: Interned, cost: u16, - all_distances: &MappedInterner)>, QueryNode>, + all_distances: &MappedInterner)>, QueryNode>, dead_end_path_cache: &mut DeadEndPathCache, mut visit: impl FnMut( - &[Interned], + &[Interned], &mut Self, &mut DeadEndPathCache, ) -> Result>, @@ -46,16 +46,16 @@ impl RankingRuleGraph { &mut self, from: Interned, cost: u16, - all_distances: &MappedInterner)>, QueryNode>, + all_distances: &MappedInterner)>, QueryNode>, dead_end_path_cache: &mut DeadEndPathCache, visit: &mut impl FnMut( - &[Interned], + &[Interned], &mut Self, &mut DeadEndPathCache, ) -> Result>, - prev_conditions: &mut Vec>, - cur_path: &mut SmallBitmap, - forbidden_conditions: &mut SmallBitmap, + prev_conditions: &mut Vec>, + cur_path: &mut SmallBitmap, + forbidden_conditions: &mut SmallBitmap, ) -> Result { let mut any_valid = false; @@ -158,7 +158,7 @@ impl RankingRuleGraph { pub fn initialize_distances_with_necessary_edges( &self, - ) -> MappedInterner)>, QueryNode> { + ) -> MappedInterner)>, QueryNode> { let mut distances_to_end = self.query_graph.nodes.map(|_| vec![]); let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); @@ -173,7 +173,7 @@ impl RankingRuleGraph { } while let Some(cur_node) = node_stack.pop_front() { - let mut self_distances = BTreeMap::>::new(); + let mut self_distances = BTreeMap::>::new(); let cur_node_edges = &self.edges_of_node.get(cur_node); for edge_idx in cur_node_edges.iter() { diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index fcb9ade3f..02a68b811 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -19,11 +19,11 @@ mod typo; use std::collections::HashSet; use std::hash::Hash; -pub use condition_docids_cache::EdgeConditionDocIdsCache; +pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_end_path_cache::DeadEndPathCache; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; -pub use typo::{TypoEdge, TypoGraph}; +pub use typo::{TypoCondition, TypoGraph}; use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner}; use super::logger::SearchLogger; @@ -74,48 +74,48 @@ impl PartialEq for Edge { pub trait RankingRuleGraphTrait: Sized { /// The condition of an edge connecting two query nodes. 
The condition /// should be sufficient to compute the edge's cost and associated document ids - /// in [`resolve_edge_condition`](RankingRuleGraphTrait::resolve_edge_condition). - type EdgeCondition: Sized + Clone + PartialEq + Eq + Hash; + /// in [`resolve_condition`](RankingRuleGraphTrait::resolve_condition). + type Condition: Sized + Clone + PartialEq + Eq + Hash; /// Return the label of the given edge condition, to be used when visualising /// the ranking rule graph. - fn label_for_edge_condition<'ctx>( + fn label_for_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::Condition, ) -> Result; - fn words_used_by_edge_condition<'ctx>( + fn words_used_by_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::Condition, ) -> Result>>; - fn phrases_used_by_edge_condition<'ctx>( + + fn phrases_used_by_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::Condition, ) -> Result>>; /// Compute the document ids associated with the given edge condition, /// restricted to the given universe. - fn resolve_edge_condition<'ctx>( + fn resolve_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge_condition: &Self::EdgeCondition, + condition: &Self::Condition, universe: &RoaringBitmap, ) -> Result; - /// Return the cost and condition of the edges going from the previously visited node - /// (with [`build_step_visit_source_node`](RankingRuleGraphTrait::build_step_visit_source_node)) to `dest_node`. + /// Return the costs and conditions of the edges going from the source node to the destination node fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut DedupInterner, + conditions_interner: &mut DedupInterner, source_node: &QueryNode, dest_node: &QueryNode, - ) -> Result>)>>; + ) -> Result>)>>; fn log_state( graph: &RankingRuleGraph, - paths: &[Vec>], + paths: &[Vec>], dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ); @@ -127,9 +127,9 @@ pub trait RankingRuleGraphTrait: Sized { /// but replacing the edges. pub struct RankingRuleGraph { pub query_graph: QueryGraph, - pub edges_store: FixedSizeInterner>>, - pub edges_of_node: MappedInterner>>, QueryNode>, - pub conditions_interner: FixedSizeInterner, + pub edges_store: FixedSizeInterner>>, + pub edges_of_node: MappedInterner>>, QueryNode>, + pub conditions_interner: FixedSizeInterner, } impl Clone for RankingRuleGraph { fn clone(&self) -> Self { @@ -143,7 +143,7 @@ impl Clone for RankingRuleGraph { } impl RankingRuleGraph { /// Remove all edges with the given condition - pub fn remove_edges_with_condition(&mut self, condition_to_remove: Interned) { + pub fn remove_edges_with_condition(&mut self, condition_to_remove: Interned) { for (edge_id, edge_opt) in self.edges_store.iter_mut() { let Some(edge) = edge_opt.as_mut() else { continue }; let Some(condition) = edge.condition else { continue }; diff --git a/milli/src/search/new/ranking_rule_graph/path_set.rs b/milli/src/search/new/ranking_rule_graph/path_set.rs index 19c7d012e..1a87c9604 100644 --- a/milli/src/search/new/ranking_rule_graph/path_set.rs +++ b/milli/src/search/new/ranking_rule_graph/path_set.rs @@ -4,7 +4,7 @@ use crate::search::new::interner::Interned; -/// A set of `Vec>`. +/// A set of `Vec>` implemented as a prefix tree. 
pub struct PathSet { nodes: Vec<(Interned, Self)>, is_end: bool, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index cdf167cb0..bf5278f8d 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -6,7 +6,7 @@ use crate::{CboRoaringBitmapCodec, Result}; pub fn compute_docids<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &ProximityCondition, + condition: &ProximityCondition, universe: &RoaringBitmap, ) -> Result { let SearchContext { @@ -18,7 +18,7 @@ pub fn compute_docids<'ctx>( phrase_interner, term_interner, } = ctx; - let pairs = match edge { + let pairs = match condition { ProximityCondition::Term { term } => { return term_docids .get_query_term_docids( diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index d5655e949..9a6080301 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -45,11 +45,11 @@ pub enum ProximityCondition { pub enum ProximityGraph {} impl RankingRuleGraphTrait for ProximityGraph { - type EdgeCondition = ProximityCondition; + type Condition = ProximityCondition; - fn resolve_edge_condition<'ctx>( + fn resolve_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - condition: &Self::EdgeCondition, + condition: &Self::Condition, universe: &RoaringBitmap, ) -> Result { compute_docids::compute_docids(ctx, condition, universe) @@ -57,10 +57,10 @@ impl RankingRuleGraphTrait for ProximityGraph { fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut DedupInterner, + conditions_interner: &mut DedupInterner, source_node: &QueryNode, dest_node: &QueryNode, - ) -> Result>)>> { + ) -> Result>)>> { build::build_edges(ctx, conditions_interner, source_node, dest_node) } @@ -76,11 +76,11 @@ impl RankingRuleGraphTrait for ProximityGraph { logger.log_proximity_state(graph, paths, dead_end_path_cache, universe, distances, cost); } - fn label_for_edge_condition<'ctx>( + fn label_for_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::Condition, ) -> Result { - match edge { + match condition { ProximityCondition::Term { term } => { let term = ctx.term_interner.get(*term); Ok(format!("{} : exists", ctx.word_interner.get(term.original))) @@ -117,11 +117,11 @@ impl RankingRuleGraphTrait for ProximityGraph { } } - fn words_used_by_edge_condition<'ctx>( + fn words_used_by_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::Condition, ) -> Result>> { - match edge { + match condition { ProximityCondition::Term { term } => { let term = ctx.term_interner.get(*term); Ok(HashSet::from_iter(term.all_single_words_except_prefix_db())) @@ -153,11 +153,11 @@ impl RankingRuleGraphTrait for ProximityGraph { } } - fn phrases_used_by_edge_condition<'ctx>( + fn phrases_used_by_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::Condition, ) -> Result>> { - match edge { + match condition { ProximityCondition::Term { term } => { let term = ctx.term_interner.get(*term); Ok(HashSet::from_iter(term.all_phrases())) diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index b41787ac2..0b81ec0ec 100644 --- 
a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -14,19 +14,18 @@ use std::fmt::Write; use std::iter::FromIterator; #[derive(Clone, PartialEq, Eq, Hash)] -pub struct TypoEdge { +pub struct TypoCondition { term: Interned, - nbr_typos: u8, } pub enum TypoGraph {} impl RankingRuleGraphTrait for TypoGraph { - type EdgeCondition = TypoEdge; + type Condition = TypoCondition; - fn resolve_edge_condition<'db_cache, 'ctx>( + fn resolve_condition<'db_cache, 'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::Condition, universe: &RoaringBitmap, ) -> Result { let SearchContext { @@ -47,7 +46,7 @@ impl RankingRuleGraphTrait for TypoGraph { word_interner, term_interner, phrase_interner, - edge.term, + condition.term, )?; Ok(docids) @@ -55,10 +54,10 @@ impl RankingRuleGraphTrait for TypoGraph { fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut DedupInterner, + conditions_interner: &mut DedupInterner, _from_node: &QueryNode, to_node: &QueryNode, - ) -> Result>)>> { + ) -> Result>)>> { let SearchContext { term_interner, .. } = ctx; match &to_node.data { QueryNodeData::Term(LocatedQueryTerm { value, positions }) => { @@ -121,10 +120,10 @@ impl RankingRuleGraphTrait for TypoGraph { if !new_term.is_empty() { edges.push(( nbr_typos as u8 + base_cost, - Some(conditions_interner.insert(TypoEdge { - term: term_interner.insert(new_term), - nbr_typos: nbr_typos as u8, - })), + Some( + conditions_interner + .insert(TypoCondition { term: term_interner.insert(new_term) }), + ), )) } } @@ -137,21 +136,21 @@ impl RankingRuleGraphTrait for TypoGraph { fn log_state( graph: &RankingRuleGraph, - paths: &[Vec>], + paths: &[Vec>], dead_end_path_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ) { logger.log_typo_state(graph, paths, dead_end_path_cache, universe, distances, cost); } - fn label_for_edge_condition<'ctx>( + fn label_for_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::Condition, ) -> Result { - let TypoEdge { term, nbr_typos: _ } = edge; + let TypoCondition { term } = condition; let term = ctx.term_interner.get(*term); let QueryTerm { original: _, @@ -203,20 +202,20 @@ impl RankingRuleGraphTrait for TypoGraph { Ok(s) } - fn words_used_by_edge_condition<'ctx>( + fn words_used_by_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::Condition, ) -> Result>> { - let TypoEdge { term, .. } = edge; + let TypoCondition { term, .. } = condition; let term = ctx.term_interner.get(*term); Ok(HashSet::from_iter(term.all_single_words_except_prefix_db())) } - fn phrases_used_by_edge_condition<'ctx>( + fn phrases_used_by_condition<'ctx>( ctx: &mut SearchContext<'ctx>, - edge: &Self::EdgeCondition, + condition: &Self::Condition, ) -> Result>> { - let TypoEdge { term, .. } = edge; + let TypoCondition { term, .. 
} = condition; let term = ctx.term_interner.get(*term); Ok(HashSet::from_iter(term.all_phrases())) } From 8b4e07e1a3c7379bfa4356d3432b5f39f699ca14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sat, 18 Mar 2023 15:04:34 +0100 Subject: [PATCH 063/234] WIP --- milli/src/lib.rs | 4 +- .../search/new/graph_based_ranking_rule.rs | 83 ++++--- milli/src/search/new/interner.rs | 2 +- milli/src/search/new/logger/detailed.rs | 29 ++- milli/src/search/new/logger/mod.rs | 13 +- milli/src/search/new/mod.rs | 51 ++-- .../new/ranking_rule_graph/cheapest_paths.rs | 54 ++--- .../condition_docids_cache.rs | 14 +- .../ranking_rule_graph/dead_end_path_cache.rs | 157 ++++++------ .../src/search/new/ranking_rule_graph/mod.rs | 5 +- .../search/new/ranking_rule_graph/path_set.rs | 223 +++++++++++------- .../new/ranking_rule_graph/proximity/mod.rs | 5 +- .../search/new/ranking_rule_graph/typo/mod.rs | 5 +- milli/src/search/new/small_bitmap.rs | 6 + 14 files changed, 367 insertions(+), 284 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ade6ee8bd..199221c7c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -70,7 +70,9 @@ pub mod update; #[macro_use] pub mod snapshot_tests; -pub use search::new::{execute_search, SearchContext}; +pub use search::new::DetailedSearchLogger; + +pub use search::new::{execute_search, DefaultSearchLogger, SearchContext}; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index bbe4099bc..edac0b15c 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -45,8 +45,8 @@ use super::interner::MappedInterner; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - ConditionDocIdsCache, DeadEndPathCache, ProximityGraph, RankingRuleGraph, - RankingRuleGraphTrait, TypoGraph, + ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, + TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; @@ -87,7 +87,7 @@ pub struct GraphBasedRankingRuleState { /// Cache to retrieve the docids associated with each edge conditions_cache: ConditionDocIdsCache, /// Cache used to optimistically discard paths that resolve to no documents. - dead_end_path_cache: DeadEndPathCache, + dead_end_path_cache: DeadEndsCache, /// A structure giving the list of possible costs from each node to the end node, /// along with a set of unavoidable edges that must be traversed to achieve that distance. 
all_distances: MappedInterner)>, QueryNode>, @@ -103,7 +103,7 @@ fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( graph: &mut RankingRuleGraph, condition_docids_cache: &mut ConditionDocIdsCache, universe: &RoaringBitmap, - dead_end_path_cache: &mut DeadEndPathCache, + dead_end_path_cache: &mut DeadEndsCache, ) -> Result<()> { for edge_id in graph.edges_store.indexes() { let Some(edge) = graph.edges_store.get(edge_id).as_ref() else { @@ -113,9 +113,9 @@ fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( let docids = condition_docids_cache.get_condition_docids(ctx, condition, graph, universe)?; - if docids.is_disjoint(universe) { + if docids.is_empty() { graph.remove_edges_with_condition(condition); - dead_end_path_cache.add_condition(condition); + dead_end_path_cache.forbid_condition(condition); // add_condition(condition); condition_docids_cache.cache.remove(&condition); continue; } @@ -135,8 +135,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase query_graph: &QueryGraph, ) -> Result<()> { let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; - let mut condition_docids_cache = ConditionDocIdsCache::default(); - let mut dead_end_path_cache = DeadEndPathCache::new(&graph.conditions_interner); + let mut condition_docids_cache = ConditionDocIdsCache::new(universe); + let mut dead_end_path_cache = DeadEndsCache::new(&graph.conditions_interner); // First simplify the graph as much as possible, by computing the docids of all the conditions // within the rule's universe and removing the edges that have no associated docids. @@ -230,62 +230,79 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase graph.query_graph.root_node, cost, all_distances, + dead_end_path_cache.forbidden.clone(), + |condition, forbidden_conditions| {}, dead_end_path_cache, |path, graph, dead_end_path_cache| { // Accumulate the path for logging purposes only paths.push(path.to_vec()); + let mut path_docids = universe.clone(); // We store the edges and their docids in vectors in case the path turns out to be // empty and we need to figure out why it was empty. let mut visited_conditions = vec![]; let mut cached_condition_docids = vec![]; - // graph.conditions_interner.map(|_| RoaringBitmap::new()); - for &condition in path { - visited_conditions.push(condition); + for &latest_condition in path { + visited_conditions.push(latest_condition); - let condition_docids = condition_docids_cache - .get_condition_docids(ctx, condition, graph, &universe)?; + let condition_docids = condition_docids_cache.get_condition_docids( + ctx, + latest_condition, + graph, + &universe, + )?; - cached_condition_docids.push((condition, condition_docids.clone())); // .get_mut(condition) = condition_docids.clone(); + cached_condition_docids.push((latest_condition, condition_docids.clone())); // If the edge is empty, then the path will be empty as well, we update the graph // and caches accordingly and skip to the next candidate path. if condition_docids.is_disjoint(&universe) { // 1. Store in the cache that this edge is empty for this universe - dead_end_path_cache.add_condition(condition); - // 2. remove this edge from the ranking rule graph - // ouch, no! :( need to link a condition to one or more ranking rule edges - graph.remove_edges_with_condition(condition); + dead_end_path_cache.forbid_condition(latest_condition); + // 2. remove all the edges with this condition from the ranking rule graph + graph.remove_edges_with_condition(latest_condition); // 3. 
Also remove the entry from the condition_docids_cache, since we don't need it anymore - condition_docids_cache.cache.remove(&condition); + condition_docids_cache.cache.remove(&latest_condition); return Ok(ControlFlow::Continue(())); } - path_docids &= condition_docids; - // If the (sub)path is empty, we try to figure out why and update the caches accordingly. - if path_docids.is_disjoint(&universe) { + if path_docids.is_disjoint(condition_docids) { // First, we know that this path is empty, and thus any path // that is a superset of it will also be empty. - dead_end_path_cache.add_prefix(&visited_conditions); + dead_end_path_cache.forbid_condition_after_prefix( + visited_conditions[..visited_conditions.len() - 1].iter().copied(), + latest_condition, + ); + + let mut dead_end_cache_cursor = dead_end_path_cache; + // Second, if the intersection between this edge and any - // previous one is disjoint with the universe, - // then we also know that any path containing the same couple of - // edges will also be empty. - for (past_condition, condition_docids2) in cached_condition_docids.iter() { - if *past_condition == condition { + // previous prefix is disjoint with the universe, then... TODO + for (past_condition, past_condition_docids) in + cached_condition_docids.iter() + { + // TODO: should ensure that it is simply not possible to have twice + // the same condition in the cached_condition_docids. Maybe it is + // already the case? + dead_end_cache_cursor = + dead_end_cache_cursor.advance(*past_condition).unwrap(); + // TODO: check how that interacts with the dead end cache? + if *past_condition == latest_condition { + // TODO: should we break instead? + // Is it even possible? continue; }; - let intersection = condition_docids & condition_docids2; - if intersection.is_disjoint(&universe) { - dead_end_path_cache - .add_condition_couple(*past_condition, condition); + if condition_docids.is_disjoint(past_condition_docids) { + dead_end_cache_cursor.forbid_condition(latest_condition); } } // We should maybe instead try to compute: // 0th & nth & 1st & n-1th & 2nd & etc... 
return Ok(ControlFlow::Continue(())); + } else { + path_docids &= condition_docids; } } assert!(!path_docids.is_empty()); @@ -303,7 +320,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase } }, )?; - + // println!(" {} paths of cost {} in {}", paths.len(), cost, self.id); G::log_state( &original_graph, &paths, diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index 3b5544b68..ea8b987fd 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -152,7 +152,7 @@ impl Hash for Interned { } } -impl Ord for Interned { +impl Ord for Interned { fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.idx.cmp(&other.idx) } diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 1ed992c56..2b5d31781 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -3,7 +3,7 @@ use std::io::Write; use std::path::PathBuf; use std::time::Instant; -use rand::random; +// use rand::random; use roaring::RoaringBitmap; use crate::search::new::interner::{Interned, MappedInterner}; @@ -323,12 +323,11 @@ impl DetailedSearchLogger { let cur_activated_id = activated_id(×tamp); let docids = new.iter().collect::>(); let len = new.len(); - let random = random::(); writeln!( &mut file, - "{cur_ranking_rule}.{cur_activated_id} -> results.{random} : \"add {len}\" -results.{random} {{ + "{cur_ranking_rule}.{cur_activated_id} -> results.{cur_ranking_rule}{cur_activated_id} : \"add {len}\" +results.{cur_ranking_rule}{cur_activated_id} {{ tooltip: \"{docids:?}\" style {{ fill: \"#B6E2D3\" @@ -572,17 +571,17 @@ shape: class" Self::paths_d2_description(ctx, graph, paths, file); writeln!(file, "}}").unwrap(); - writeln!(file, "Dead-end couples of conditions {{").unwrap(); - for (i, (e1, e2)) in dead_end_paths_cache.condition_couples.iter().enumerate() { - writeln!(file, "{i} : \"\" {{").unwrap(); - Self::condition_d2_description(ctx, graph, e1, file); - for e2 in e2.iter() { - Self::condition_d2_description(ctx, graph, e2, file); - writeln!(file, "{e1} -- {e2}").unwrap(); - } - writeln!(file, "}}").unwrap(); - } - writeln!(file, "}}").unwrap(); + // writeln!(file, "Dead-end couples of conditions {{").unwrap(); + // for (i, (e1, e2)) in dead_end_paths_cache.condition_couples.iter().enumerate() { + // writeln!(file, "{i} : \"\" {{").unwrap(); + // Self::condition_d2_description(ctx, graph, e1, file); + // for e2 in e2.iter() { + // Self::condition_d2_description(ctx, graph, e2, file); + // writeln!(file, "{e1} -- {e2}").unwrap(); + // } + // writeln!(file, "}}").unwrap(); + // } + // writeln!(file, "}}").unwrap(); writeln!(file, "Dead-end edges {{").unwrap(); for condition in dead_end_paths_cache.conditions.iter() { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index ba58d049f..f8ab89cbf 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -1,4 +1,4 @@ -#[cfg(test)] +// #[cfg(test)] pub mod detailed; use roaring::RoaringBitmap; @@ -6,8 +6,7 @@ use roaring::RoaringBitmap; use super::interner::{Interned, MappedInterner}; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - DeadEndPathCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoCondition, - TypoGraph, + DeadEndsCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoCondition, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{RankingRule, RankingRuleQueryTrait}; @@ -67,7 
+66,7 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -78,7 +77,7 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -138,7 +137,7 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths_map: &[Vec>], - _dead_end_path_cache: &DeadEndPathCache, + _dead_end_path_cache: &DeadEndsCache, _universe: &RoaringBitmap, _distances: &MappedInterner)>, QueryNode>, _cost: u16, @@ -149,7 +148,7 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths: &[Vec>], - _dead_end_path_cache: &DeadEndPathCache, + _dead_end_path_cache: &DeadEndsCache, _universe: &RoaringBitmap, _distances: &MappedInterner)>, QueryNode>, _cost: u16, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 8ff832de4..29a6020aa 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -15,26 +15,26 @@ mod sort; // TODO: documentation + comments mod words; +// #[cfg(test)] +pub use logger::detailed::DetailedSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use std::collections::{BTreeSet, HashSet}; +use crate::{Filter, Index, MatchingWords, Result, Search, SearchResult, TermsMatchingStrategy}; use charabia::Tokenize; use db_cache::DatabaseCache; +use graph_based_ranking_rule::{Proximity, Typo}; use heed::RoTxn; -use query_graph::{QueryGraph, QueryNode}; -pub use ranking_rules::{bucket_sort, RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; +use interner::DedupInterner; +use query_graph::{QueryGraph, QueryNode, QueryNodeData}; +use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; +use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; +use resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; use roaring::RoaringBitmap; +use words::Words; -use self::interner::DedupInterner; -use self::query_graph::QueryNodeData; -use self::query_term::{Phrase, QueryTerm}; -use self::ranking_rules::PlaceholderQuery; -use self::resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; -use crate::search::new::graph_based_ranking_rule::{Proximity, Typo}; -use crate::search::new::query_term::located_query_terms_from_string; -use crate::search::new::words::Words; -use crate::{Filter, Index, Result, TermsMatchingStrategy}; +use self::ranking_rules::RankingRule; /// A structure used throughout the execution of a search query. pub struct SearchContext<'ctx> { @@ -231,12 +231,12 @@ pub fn execute_search<'ctx>( length: usize, placeholder_search_logger: &mut dyn SearchLogger, query_graph_logger: &mut dyn SearchLogger, -) -> Result> { +) -> Result { assert!(!query.is_empty()); let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None)?; let graph = QueryGraph::from_query(ctx, query_terms)?; - let universe = if let Some(filters) = filters { + let mut universe = if let Some(filters) = filters { filters.evaluate(ctx.txn, ctx.index)? } else { ctx.index.documents_ids(ctx.txn)? @@ -249,8 +249,8 @@ pub fn execute_search<'ctx>( // But in that case, we should return no results. 
// // The search is a placeholder search only if there are no tokens? - if graph.nodes.len() > 2 { - let universe = resolve_maximally_reduced_query_graph( + let documents_ids = if graph.nodes.len() > 2 { + universe = resolve_maximally_reduced_query_graph( ctx, &universe, &graph, @@ -259,7 +259,7 @@ pub fn execute_search<'ctx>( )?; let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, terms_matching_strategy)?; - bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger) + bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? } else { let ranking_rules = get_ranking_rules_for_placeholder_search(ctx)?; bucket_sort( @@ -270,7 +270,22 @@ pub fn execute_search<'ctx>( from, length, placeholder_search_logger, - ) + )? + }; + + Ok(SearchResult { + // TODO: correct matching words + matching_words: MatchingWords::default(), + // TODO: candidates with distinct + candidates: universe, + documents_ids, + }) +} + +impl<'a> Search<'a> { + // TODO + pub fn execute_new(&self) -> Result { + todo!() } } @@ -329,7 +344,7 @@ mod tests { println!("{}us", elapsed.as_micros()); let _documents = index - .documents(&txn, results.iter().copied()) + .documents(&txn, results.documents_ids.iter().copied()) .unwrap() .into_iter() .map(|(id, obkv)| { diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index bc0e88326..a36c6943f 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -4,8 +4,7 @@ use std::collections::btree_map::Entry; use std::collections::{BTreeMap, VecDeque}; use std::ops::ControlFlow; -use super::dead_end_path_cache::DeadEndPathCache; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNode; use crate::search::new::small_bitmap::SmallBitmap; @@ -23,11 +22,11 @@ impl RankingRuleGraph { from: Interned, cost: u16, all_distances: &MappedInterner)>, QueryNode>, - dead_end_path_cache: &mut DeadEndPathCache, + dead_end_path_cache: &mut DeadEndsCache, mut visit: impl FnMut( &[Interned], &mut Self, - &mut DeadEndPathCache, + &mut DeadEndsCache, ) -> Result>, ) -> Result<()> { let _ = self.visit_paths_of_cost_rec( @@ -38,7 +37,7 @@ impl RankingRuleGraph { &mut visit, &mut vec![], &mut SmallBitmap::for_interned_values_in(&self.conditions_interner), - &mut dead_end_path_cache.conditions.clone(), + &mut dead_end_path_cache.forbidden.clone(), )?; Ok(()) } @@ -47,11 +46,11 @@ impl RankingRuleGraph { from: Interned, cost: u16, all_distances: &MappedInterner)>, QueryNode>, - dead_end_path_cache: &mut DeadEndPathCache, + dead_end_path_cache: &mut DeadEndsCache, visit: &mut impl FnMut( &[Interned], &mut Self, - &mut DeadEndPathCache, + &mut DeadEndsCache, ) -> Result>, prev_conditions: &mut Vec>, cur_path: &mut SmallBitmap, @@ -74,7 +73,6 @@ impl RankingRuleGraph { ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(true), } - true } else { self.visit_paths_of_cost_rec( edge.dest_node, @@ -85,7 +83,7 @@ impl RankingRuleGraph { prev_conditions, cur_path, forbidden_conditions, - )? 
+ )?; } } Some(condition) => { @@ -101,24 +99,20 @@ impl RankingRuleGraph { } cur_path.insert(condition); prev_conditions.push(condition); - let mut new_forbidden_conditions = forbidden_conditions.clone(); - new_forbidden_conditions - .union(dead_end_path_cache.condition_couples.get(condition)); - dead_end_path_cache.prefixes.final_edges_after_prefix( - prev_conditions, - &mut |x| { - new_forbidden_conditions.insert(x); - }, - ); - let next_any_valid = if edge.dest_node == self.query_graph.end_node { + if let Some(next_forbidden) = + dead_end_path_cache.forbidden_conditions_after_prefix(&prev_conditions) + { + new_forbidden_conditions.union(&next_forbidden); + } + + if edge.dest_node == self.query_graph.end_node { any_valid = true; let control_flow = visit(prev_conditions, self, dead_end_path_cache)?; match control_flow { ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(true), } - true } else { self.visit_paths_of_cost_rec( edge.dest_node, @@ -129,28 +123,12 @@ impl RankingRuleGraph { prev_conditions, cur_path, &mut new_forbidden_conditions, - )? - }; + )?; + } cur_path.remove(condition); prev_conditions.pop(); - next_any_valid } }; - any_valid |= next_any_valid; - - if next_any_valid { - if dead_end_path_cache.path_is_dead_end(prev_conditions, cur_path) { - return Ok(any_valid); - } - forbidden_conditions.union(&dead_end_path_cache.conditions); - for prev_condition in prev_conditions.iter() { - forbidden_conditions - .union(dead_end_path_cache.condition_couples.get(*prev_condition)); - } - dead_end_path_cache.prefixes.final_edges_after_prefix(prev_conditions, &mut |x| { - forbidden_conditions.insert(x); - }); - } } Ok(any_valid) diff --git a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs index 9766cfaa3..367f36e6a 100644 --- a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs @@ -12,11 +12,16 @@ use crate::Result; pub struct ConditionDocIdsCache { // TODO: should be FxHashMap, RoaringBitmap> pub cache: FxHashMap, RoaringBitmap>, + pub universe_length: u64, _phantom: PhantomData, } -impl Default for ConditionDocIdsCache { - fn default() -> Self { - Self { cache: Default::default(), _phantom: Default::default() } +impl ConditionDocIdsCache { + pub fn new(universe: &RoaringBitmap) -> Self { + Self { + cache: Default::default(), + _phantom: Default::default(), + universe_length: universe.len(), + } } } impl ConditionDocIdsCache { @@ -33,6 +38,9 @@ impl ConditionDocIdsCache { universe: &RoaringBitmap, ) -> Result<&'s RoaringBitmap> { if self.cache.contains_key(&interned_condition) { + // TODO compare length of universe compared to the one in self + // if it is smaller, then update the value + // TODO: should we update the bitmap in the cache if the new universe // reduces it? 
// TODO: maybe have a generation: u32 to track every time the universe was diff --git a/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs b/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs index b4af625d6..701421ea7 100644 --- a/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs @@ -1,84 +1,83 @@ -use super::{path_set::PathSet, RankingRuleGraphTrait}; -use crate::search::new::{ - interner::{FixedSizeInterner, Interned, MappedInterner}, - small_bitmap::SmallBitmap, -}; +// use super::{path_set::PathSet, RankingRuleGraphTrait}; +// use crate::search::new::{ +// interner::{FixedSizeInterner, Interned, MappedInterner}, +// small_bitmap::SmallBitmap, +// }; -/// A cache which stores sufficient conditions for a path -/// to resolve to an empty set of candidates within the current -/// universe. -pub struct DeadEndPathCache { - /// The set of edge conditions that resolve to no documents. - pub conditions: SmallBitmap, - /// A set of path prefixes that resolve to no documents. - pub prefixes: PathSet, - /// A set of empty couples of edge conditions that resolve to no documents. - pub condition_couples: MappedInterner, G::Condition>, -} -impl Clone for DeadEndPathCache { - fn clone(&self) -> Self { - Self { - conditions: self.conditions.clone(), - prefixes: self.prefixes.clone(), - condition_couples: self.condition_couples.clone(), - } - } -} +// /// A cache which stores sufficient conditions for a path +// /// to resolve to an empty set of candidates within the current +// /// universe. +// pub struct DeadEndPathCache { +// /// The set of edge conditions that resolve to no documents. +// pub conditions: SmallBitmap, +// /// A set of path prefixes that resolve to no documents. +// pub prefixes: PathSet, +// /// A set of empty couples of edge conditions that resolve to no documents. +// pub condition_couples: MappedInterner, G::Condition>, +// } +// impl Clone for DeadEndPathCache { +// fn clone(&self) -> Self { +// Self { +// conditions: self.conditions.clone(), +// prefixes: self.prefixes.clone(), +// condition_couples: self.condition_couples.clone(), +// } +// } +// } -impl DeadEndPathCache { - /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. - pub fn new(all_conditions: &FixedSizeInterner) -> Self { - Self { - conditions: SmallBitmap::for_interned_values_in(all_conditions), - prefixes: PathSet::default(), - condition_couples: all_conditions - .map(|_| SmallBitmap::for_interned_values_in(all_conditions)), - } - } +// impl DeadEndPathCache { +// /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. +// pub fn new(all_conditions: &FixedSizeInterner) -> Self { +// Self { +// conditions: SmallBitmap::for_interned_values_in(all_conditions), +// prefixes: PathSet::default(), +// condition_couples: all_conditions +// .map(|_| SmallBitmap::for_interned_values_in(all_conditions)), +// } +// } - /// Store in the cache that every path containing the given edge resolves to no documents. - pub fn add_condition(&mut self, condition: Interned) { - self.conditions.insert(condition); - self.condition_couples.get_mut(condition).clear(); - self.prefixes.remove_edge(condition); - for (_, edges2) in self.condition_couples.iter_mut() { - edges2.remove(condition); - } - } - /// Store in the cache that every path containing the given prefix resolves to no documents. 
- pub fn add_prefix(&mut self, prefix: &[Interned]) { - // TODO: typed PathSet - self.prefixes.insert(prefix.iter().copied()); - } +// /// Store in the cache that every path containing the given edge resolves to no documents. +// pub fn add_condition(&mut self, condition: Interned) { +// self.conditions.insert(condition); +// self.condition_couples.get_mut(condition).clear(); +// self.prefixes.remove_edge(condition); +// for (_, edges2) in self.condition_couples.iter_mut() { +// edges2.remove(condition); +// } +// } +// /// Store in the cache that every path containing the given prefix resolves to no documents. +// pub fn add_prefix(&mut self, prefix: &[Interned]) { +// // TODO: typed PathSet +// self.prefixes.insert(prefix.iter().copied()); +// } - /// Store in the cache that every path containing the two given edges resolves to no documents. - pub fn add_condition_couple( - &mut self, - edge1: Interned, - edge2: Interned, - ) { - self.condition_couples.get_mut(edge1).insert(edge2); - } +// /// Store in the cache that every path containing the two given edges resolves to no documents. +// pub fn add_condition_couple( +// &mut self, +// edge1: Interned, +// edge2: Interned, +// ) { +// self.condition_couples.get_mut(edge1).insert(edge2); +// } - /// Returns true if the cache can determine that the given path resolves to no documents. - pub fn path_is_dead_end( - &self, - path: &[Interned], - path_bitmap: &SmallBitmap, - ) -> bool { - if path_bitmap.intersects(&self.conditions) { - return true; - } - for condition in path.iter() { - // TODO: typed path - let forbidden_other_edges = self.condition_couples.get(*condition); - if path_bitmap.intersects(forbidden_other_edges) { - return true; - } - } - if self.prefixes.contains_prefix_of_path(path) { - return true; - } - false - } -} +// /// Returns true if the cache can determine that the given path resolves to no documents. 
+// pub fn path_is_dead_end( +// &self, +// path: &[Interned], +// path_bitmap: &SmallBitmap, +// ) -> bool { +// if path_bitmap.intersects(&self.conditions) { +// return true; +// } +// for condition in path.iter() { +// let forbidden_other_edges = self.condition_couples.get(*condition); +// if path_bitmap.intersects(forbidden_other_edges) { +// return true; +// } +// } +// if self.prefixes.contains_prefix_of_path(path) { +// return true; +// } +// false +// } +// } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 02a68b811..977d6c96b 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -20,7 +20,8 @@ use std::collections::HashSet; use std::hash::Hash; pub use condition_docids_cache::ConditionDocIdsCache; -pub use dead_end_path_cache::DeadEndPathCache; +// pub use dead_end_path_cache::DeadEndPathCache; +pub use path_set::DeadEndsCache; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; @@ -113,7 +114,7 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, diff --git a/milli/src/search/new/ranking_rule_graph/path_set.rs b/milli/src/search/new/ranking_rule_graph/path_set.rs index 1a87c9604..0aa4472dc 100644 --- a/milli/src/search/new/ranking_rule_graph/path_set.rs +++ b/milli/src/search/new/ranking_rule_graph/path_set.rs @@ -2,104 +2,165 @@ // For the empty_prefixes field in the EmptyPathsCache only :/ // but it could be used for more, like efficient computing of a set of paths -use crate::search::new::interner::Interned; +use crate::search::new::{ + interner::{FixedSizeInterner, Interned}, + small_bitmap::SmallBitmap, +}; -/// A set of `Vec>` implemented as a prefix tree. 
-pub struct PathSet { +pub struct DeadEndsCache { nodes: Vec<(Interned, Self)>, - is_end: bool, + pub forbidden: SmallBitmap, } - -impl Clone for PathSet { - fn clone(&self) -> Self { - Self { nodes: self.nodes.clone(), is_end: self.is_end } +impl DeadEndsCache { + pub fn new(for_interner: &FixedSizeInterner) -> Self { + Self { nodes: vec![], forbidden: SmallBitmap::for_interned_values_in(for_interner) } } -} - -impl std::fmt::Debug for PathSet { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("PathSet").field("nodes", &self.nodes).field("is_end", &self.is_end).finish() + pub fn forbid_condition(&mut self, condition: Interned) { + self.forbidden.insert(condition); } -} - -impl Default for PathSet { - fn default() -> Self { - Self { nodes: Default::default(), is_end: Default::default() } - } -} - -impl PathSet { - pub fn insert(&mut self, mut edges: impl Iterator>) { - match edges.next() { - None => { - self.is_end = true; - } - Some(first_edge) => { - for (edge, next_node) in &mut self.nodes { - if edge == &first_edge { - return next_node.insert(edges); - } - } - let mut rest = PathSet::default(); - rest.insert(edges); - self.nodes.push((first_edge, rest)); + fn advance(&mut self, condition: Interned) -> Option<&mut Self> { + for (e, next_node) in &mut self.nodes { + if condition == *e { + return Some(next_node); } } + None } - - pub fn remove_edge(&mut self, forbidden_edge: Interned) { - let mut i = 0; - while i < self.nodes.len() { - let should_remove = if self.nodes[i].0 == forbidden_edge { - true - } else if !self.nodes[i].1.nodes.is_empty() { - self.nodes[i].1.remove_edge(forbidden_edge); - self.nodes[i].1.nodes.is_empty() + pub fn forbidden_conditions_after_prefix( + &mut self, + mut prefix: &[Interned], + ) -> Option> { + let mut cursor = self; + for c in prefix.iter() { + if let Some(next) = cursor.advance(*c) { + cursor = next; } else { - false - }; - if should_remove { - self.nodes.remove(i); - } else { - i += 1; + return None; } } + Some(cursor.forbidden.clone()) } - - pub fn final_edges_after_prefix( - &self, - prefix: &[Interned], - visit: &mut impl FnMut(Interned), + pub fn forbid_condition_after_prefix( + &mut self, + mut prefix: impl Iterator>, + forbidden: Interned, ) { - let [first_edge, remaining_prefix @ ..] = prefix else { - for node in self.nodes.iter() { - if node.1.is_end { - visit(node.0) - } + match prefix.next() { + None => { + self.forbidden.insert(forbidden); } - return - }; - for (edge, rest) in self.nodes.iter() { - if edge == first_edge { - return rest.final_edges_after_prefix(remaining_prefix, visit); - } - } - } - - pub fn contains_prefix_of_path(&self, path: &[Interned]) -> bool { - if self.is_end { - return true; - } - match path { - [] => false, - [first_edge, remaining_path @ ..] => { - for (edge, rest) in self.nodes.iter() { - if edge == first_edge { - return rest.contains_prefix_of_path(remaining_path); + Some(first_condition) => { + for (condition, next_node) in &mut self.nodes { + if condition == &first_condition { + return next_node.forbid_condition_after_prefix(prefix, forbidden); } } - false + let mut rest = DeadEndsCache { + nodes: vec![], + forbidden: SmallBitmap::new(self.forbidden.universe_length()), + }; + rest.forbid_condition_after_prefix(prefix, forbidden); + self.nodes.push((first_condition, rest)); } } } } +// /// A set of `Vec>` implemented as a prefix tree. 
+// pub struct PathSet { +// nodes: Vec<(Interned, Self)>, +// is_end: bool, +// } + +// impl Clone for PathSet { +// fn clone(&self) -> Self { +// Self { nodes: self.nodes.clone(), is_end: self.is_end } +// } +// } + +// impl std::fmt::Debug for PathSet { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// f.debug_struct("PathSet").field("nodes", &self.nodes).field("is_end", &self.is_end).finish() +// } +// } + +// impl Default for PathSet { +// fn default() -> Self { +// Self { nodes: Default::default(), is_end: Default::default() } +// } +// } + +// impl PathSet { +// pub fn insert(&mut self, mut conditions: impl Iterator>) { +// match conditions.next() { +// None => { +// self.is_end = true; +// } +// Some(first_condition) => { +// for (condition, next_node) in &mut self.nodes { +// if condition == &first_condition { +// return next_node.insert(conditions); +// } +// } +// let mut rest = PathSet::default(); +// rest.insert(conditions); +// self.nodes.push((first_condition, rest)); +// } +// } +// } + +// pub fn remove_condition(&mut self, forbidden_condition: Interned) { +// let mut i = 0; +// while i < self.nodes.len() { +// let should_remove = if self.nodes[i].0 == forbidden_condition { +// true +// } else if !self.nodes[i].1.nodes.is_empty() { +// self.nodes[i].1.remove_condition(forbidden_condition); +// self.nodes[i].1.nodes.is_empty() +// } else { +// false +// }; +// if should_remove { +// self.nodes.remove(i); +// } else { +// i += 1; +// } +// } +// } + +// pub fn final_conditions_after_prefix( +// &self, +// prefix: &[Interned], +// visit: &mut impl FnMut(Interned), +// ) { +// let [first_condition, remaining_prefix @ ..] = prefix else { +// for node in self.nodes.iter() { +// if node.1.is_end { +// visit(node.0) +// } +// } +// return +// }; +// for (condition, rest) in self.nodes.iter() { +// if condition == first_condition { +// return rest.final_conditions_after_prefix(remaining_prefix, visit); +// } +// } +// } + +// pub fn contains_prefix_of_path(&self, path: &[Interned]) -> bool { +// if self.is_end { +// return true; +// } +// match path { +// [] => false, +// [first_condition, remaining_path @ ..] 
=> { +// for (condition, rest) in self.nodes.iter() { +// if condition == first_condition { +// return rest.contains_prefix_of_path(remaining_path); +// } +// } +// false +// } +// } +// } +// } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 9a6080301..690200773 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -6,8 +6,7 @@ use std::iter::FromIterator; use roaring::RoaringBitmap; -use super::dead_end_path_cache::DeadEndPathCache; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::{RankingRuleGraph, RankingRuleGraphTrait, DeadEndsCache}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::{Phrase, QueryTerm}; @@ -67,7 +66,7 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 0b81ec0ec..d6553a49d 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,7 +1,6 @@ use roaring::RoaringBitmap; -use super::dead_end_path_cache::DeadEndPathCache; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::{RankingRuleGraph, RankingRuleGraphTrait, DeadEndsCache}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_graph::QueryNodeData; @@ -137,7 +136,7 @@ impl RankingRuleGraphTrait for TypoGraph { fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index 975c5343f..faf9c077c 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -28,6 +28,12 @@ impl SmallBitmap { } } } + pub fn universe_length(&self) -> u16 { + match &self.internal { + SmallBitmapInternal::Tiny(_) => 64, + SmallBitmapInternal::Small(xs) => 64 * xs.len() as u16, + } + } pub fn from_iter( xs: impl Iterator>, for_interner: &FixedSizeInterner, From 1e6e62407882b26c6158e465bcf9b3d542d5e3c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sun, 19 Mar 2023 14:27:58 +0100 Subject: [PATCH 064/234] Fix bug in SmallBitmap --- milli/src/search/new/small_bitmap.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index faf9c077c..1fdd31346 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -22,7 +22,7 @@ impl SmallBitmap { } else { Self { internal: SmallBitmapInternal::Small( - vec![0; 1 + universe_length as usize / 64].into_boxed_slice(), + vec![0; 1 + (universe_length - 1) as usize / 64].into_boxed_slice(), ), _phantom: PhantomData, } From 49240c367af8750151dc73d1675ab7e99bedac3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sun, 19 Mar 2023 14:28:15 +0100 Subject: [PATCH 
065/234] Fix bug in cost of typo conditions --- milli/src/search/new/ranking_rule_graph/typo/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index d6553a49d..e1e01d6b1 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,6 +1,6 @@ use roaring::RoaringBitmap; -use super::{RankingRuleGraph, RankingRuleGraphTrait, DeadEndsCache}; +use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_graph::QueryNodeData; @@ -64,7 +64,7 @@ impl RankingRuleGraphTrait for TypoGraph { // Ngrams have a base typo cost // 2-gram -> equivalent to 1 typo // 3-gram -> equivalent to 2 typos - let base_cost = positions.len().max(2) as u8; + let base_cost = positions.len().min(2) as u8; for nbr_typos in 0..=2 { let term = term_interner.get(*value).clone(); From c6ff97a2202c7a9f6a83577ecbc8c055c9adb090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sun, 19 Mar 2023 14:30:19 +0100 Subject: [PATCH 066/234] Rewrite the dead-ends cache to detect more dead-ends --- .../search/new/graph_based_ranking_rule.rs | 102 +++++++---- milli/src/search/new/logger/detailed.rs | 22 +-- .../new/ranking_rule_graph/cheapest_paths.rs | 32 ++-- .../condition_docids_cache.rs | 38 ++-- .../ranking_rule_graph/dead_end_path_cache.rs | 83 --------- .../src/search/new/ranking_rule_graph/mod.rs | 6 +- .../search/new/ranking_rule_graph/path_set.rs | 166 ------------------ 7 files changed, 116 insertions(+), 333 deletions(-) delete mode 100644 milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs delete mode 100644 milli/src/search/new/ranking_rule_graph/path_set.rs diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index edac0b15c..31d6e504c 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -135,7 +135,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase query_graph: &QueryGraph, ) -> Result<()> { let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; - let mut condition_docids_cache = ConditionDocIdsCache::new(universe); + let mut condition_docids_cache = ConditionDocIdsCache::default(); let mut dead_end_path_cache = DeadEndsCache::new(&graph.conditions_interner); // First simplify the graph as much as possible, by computing the docids of all the conditions @@ -215,36 +215,36 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let original_graph = graph.clone(); let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner); - let mut paths = vec![]; + let mut considered_paths = vec![]; + let mut good_paths = vec![]; // For each path of the given cost, we will compute its associated // document ids. // In case the path does not resolve to any document id, we try to figure out why // and update the `dead_end_path_cache` accordingly. - // For example, it may be that the path is empty because one of its edges is disjoint - // with the universe, or because a prefix of the path is disjoint with the universe, or because - // the path contains two edges that are disjoint from each other within the universe. 
// Updating the dead_end_path_cache helps speed up the execution of `visit_paths_of_cost` and reduces // the number of future candidate paths given by that same function. graph.visit_paths_of_cost( graph.query_graph.root_node, cost, all_distances, - dead_end_path_cache.forbidden.clone(), - |condition, forbidden_conditions| {}, dead_end_path_cache, |path, graph, dead_end_path_cache| { + if universe.is_empty() { + return Ok(ControlFlow::Break(())); + } // Accumulate the path for logging purposes only - paths.push(path.to_vec()); + considered_paths.push(path.to_vec()); let mut path_docids = universe.clone(); // We store the edges and their docids in vectors in case the path turns out to be // empty and we need to figure out why it was empty. let mut visited_conditions = vec![]; - let mut cached_condition_docids = vec![]; + // let mut cached_condition_docids = vec![]; + let mut subpath_docids = vec![]; - for &latest_condition in path { + for (latest_condition_path_idx, &latest_condition) in path.iter().enumerate() { visited_conditions.push(latest_condition); let condition_docids = condition_docids_cache.get_condition_docids( @@ -254,11 +254,9 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase &universe, )?; - cached_condition_docids.push((latest_condition, condition_docids.clone())); - // If the edge is empty, then the path will be empty as well, we update the graph // and caches accordingly and skip to the next candidate path. - if condition_docids.is_disjoint(&universe) { + if condition_docids.is_empty() { // 1. Store in the cache that this edge is empty for this universe dead_end_path_cache.forbid_condition(latest_condition); // 2. remove all the edges with this condition from the ranking rule graph @@ -267,45 +265,71 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase condition_docids_cache.cache.remove(&latest_condition); return Ok(ControlFlow::Continue(())); } + path_docids &= condition_docids; + subpath_docids.push(path_docids.clone()); + // If the (sub)path is empty, we try to figure out why and update the caches accordingly. - if path_docids.is_disjoint(condition_docids) { + if path_docids.is_empty() { + let len_prefix = subpath_docids.len() - 1; // First, we know that this path is empty, and thus any path // that is a superset of it will also be empty. dead_end_path_cache.forbid_condition_after_prefix( - visited_conditions[..visited_conditions.len() - 1].iter().copied(), + visited_conditions[..len_prefix].iter().copied(), latest_condition, ); - let mut dead_end_cache_cursor = dead_end_path_cache; + if visited_conditions.len() > 1 { + let mut subprefix = vec![]; + // Deadend if the intersection between this edge and any + // previous prefix is disjoint with the universe + for (past_condition, subpath_docids) in visited_conditions[..len_prefix] + .iter() + .zip(subpath_docids[..len_prefix].iter()) + { + if *past_condition == latest_condition { + todo!(); + }; + subprefix.push(*past_condition); + if condition_docids.is_disjoint(subpath_docids) { + dead_end_path_cache.forbid_condition_after_prefix( + subprefix.iter().copied(), + latest_condition, + ); + } + } - // Second, if the intersection between this edge and any - // previous prefix is disjoint with the universe, then... TODO - for (past_condition, past_condition_docids) in - cached_condition_docids.iter() - { - // TODO: should ensure that it is simply not possible to have twice - // the same condition in the cached_condition_docids. Maybe it is - // already the case? 
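To make the dead-end bookkeeping above concrete, here is a small standalone trace with hypothetical docids for a path of three conditions c1, c2, c3 (the sets are invented for illustration; only the roaring API is real):

use roaring::RoaringBitmap;

fn main() {
    let c1: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
    let c2: RoaringBitmap = [2u32, 3].into_iter().collect();
    let c3: RoaringBitmap = [1u32].into_iter().collect();

    let after_c1 = c1.clone();      // subpath_docids[0] = {1, 2, 3}
    let after_c2 = &after_c1 & &c2; // subpath_docids[1] = {2, 3}

    // The full path is a dead end: its docids are empty.
    assert!((&after_c2 & &c3).is_empty());

    // The sub-prefix loop: c3 still matches something after [c1] alone, ...
    assert!(!(&after_c1 & &c3).is_empty());
    // ... so the cache only records that c3 is forbidden after [c1, c2].
}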
- dead_end_cache_cursor = - dead_end_cache_cursor.advance(*past_condition).unwrap(); - // TODO: check how that interacts with the dead end cache? - if *past_condition == latest_condition { - // TODO: should we break instead? - // Is it even possible? - continue; - }; - if condition_docids.is_disjoint(past_condition_docids) { - dead_end_cache_cursor.forbid_condition(latest_condition); + // keep the same prefix and check the intersection with + // all the remaining conditions + let mut forbidden = dead_end_path_cache.forbidden.clone(); + let mut cursor = dead_end_path_cache; + for &c in visited_conditions[..len_prefix].iter() { + cursor = cursor.advance(c).unwrap(); + forbidden.union(&cursor.forbidden); + } + + let past_path_docids = &subpath_docids[subpath_docids.len() - 2]; + + let remaining_conditions = + path[latest_condition_path_idx..].iter().skip(1); + for next_condition in remaining_conditions { + if forbidden.contains(*next_condition) { + continue; + } + let next_condition_docids = condition_docids_cache + .get_condition_docids(ctx, *next_condition, graph, &universe)?; + + if past_path_docids.is_disjoint(next_condition_docids) { + cursor.forbid_condition(*next_condition); + } } } - // We should maybe instead try to compute: - // 0th & nth & 1st & n-1th & 2nd & etc... + return Ok(ControlFlow::Continue(())); - } else { - path_docids &= condition_docids; } } assert!(!path_docids.is_empty()); + // Accumulate the path for logging purposes only + good_paths.push(path.to_vec()); for condition in path { used_conditions.insert(*condition); } @@ -323,7 +347,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // println!(" {} paths of cost {} in {}", paths.len(), cost, self.id); G::log_state( &original_graph, - &paths, + &good_paths, dead_end_path_cache, original_universe, all_distances, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 2b5d31781..a46e63005 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -10,7 +10,7 @@ use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::ranking_rule_graph::{ - DeadEndPathCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, + DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph, }; use crate::search::new::small_bitmap::SmallBitmap; @@ -44,7 +44,7 @@ pub enum SearchEvents { ProximityState { graph: RankingRuleGraph, paths: Vec>>, - dead_end_path_cache: DeadEndPathCache, + dead_end_path_cache: DeadEndsCache, universe: RoaringBitmap, distances: MappedInterner)>, QueryNode>, cost: u16, @@ -52,7 +52,7 @@ pub enum SearchEvents { TypoState { graph: RankingRuleGraph, paths: Vec>>, - dead_end_path_cache: DeadEndPathCache, + dead_end_path_cache: DeadEndsCache, universe: RoaringBitmap, distances: MappedInterner)>, QueryNode>, cost: u16, @@ -170,7 +170,7 @@ impl SearchLogger for DetailedSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -189,7 +189,7 @@ impl SearchLogger for DetailedSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + 
dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -527,7 +527,7 @@ shape: class" ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_paths_cache: &DeadEndPathCache, + dead_end_paths_cache: &DeadEndsCache, distances: MappedInterner)>, QueryNode>, file: &mut File, ) { @@ -583,11 +583,11 @@ shape: class" // } // writeln!(file, "}}").unwrap(); - writeln!(file, "Dead-end edges {{").unwrap(); - for condition in dead_end_paths_cache.conditions.iter() { - writeln!(file, "{condition}").unwrap(); - } - writeln!(file, "}}").unwrap(); + // writeln!(file, "Dead-end edges {{").unwrap(); + // for condition in dead_end_paths_cache.conditions.iter() { + // writeln!(file, "{condition}").unwrap(); + // } + // writeln!(file, "}}").unwrap(); // writeln!(file, "Dead-end prefixes {{").unwrap(); // writeln!(file, "}}").unwrap(); diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index a36c6943f..4fb4d0844 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -37,7 +37,7 @@ impl RankingRuleGraph { &mut visit, &mut vec![], &mut SmallBitmap::for_interned_values_in(&self.conditions_interner), - &mut dead_end_path_cache.forbidden.clone(), + dead_end_path_cache.forbidden.clone(), )?; Ok(()) } @@ -54,12 +54,12 @@ impl RankingRuleGraph { ) -> Result>, prev_conditions: &mut Vec>, cur_path: &mut SmallBitmap, - forbidden_conditions: &mut SmallBitmap, + mut forbidden_conditions: SmallBitmap, ) -> Result { let mut any_valid = false; let edges = self.edges_of_node.get(from).clone(); - for edge_idx in edges.iter() { + 'edges_loop: for edge_idx in edges.iter() { let Some(edge) = self.edges_store.get(edge_idx).as_ref() else { continue }; if cost < edge.cost as u16 { continue; @@ -73,6 +73,7 @@ impl RankingRuleGraph { ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(true), } + true } else { self.visit_paths_of_cost_rec( edge.dest_node, @@ -82,8 +83,8 @@ impl RankingRuleGraph { visit, prev_conditions, cur_path, - forbidden_conditions, - )?; + forbidden_conditions.clone(), + )? } } Some(condition) => { @@ -101,18 +102,19 @@ impl RankingRuleGraph { prev_conditions.push(condition); let mut new_forbidden_conditions = forbidden_conditions.clone(); if let Some(next_forbidden) = - dead_end_path_cache.forbidden_conditions_after_prefix(&prev_conditions) + dead_end_path_cache.forbidden_conditions_after_prefix(prev_conditions) { new_forbidden_conditions.union(&next_forbidden); } - if edge.dest_node == self.query_graph.end_node { + let next_any_valid = if edge.dest_node == self.query_graph.end_node { any_valid = true; let control_flow = visit(prev_conditions, self, dead_end_path_cache)?; match control_flow { ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(true), } + true } else { self.visit_paths_of_cost_rec( edge.dest_node, @@ -122,13 +124,23 @@ impl RankingRuleGraph { visit, prev_conditions, cur_path, - &mut new_forbidden_conditions, - )?; - } + new_forbidden_conditions, + )? 
+ }; cur_path.remove(condition); prev_conditions.pop(); + next_any_valid } }; + any_valid |= next_any_valid; + + if next_any_valid { + forbidden_conditions = dead_end_path_cache + .forbidden_conditions_for_all_prefixes_up_to(prev_conditions); + if cur_path.intersects(&forbidden_conditions) { + break 'edges_loop; + } + } } Ok(any_valid) diff --git a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs index 367f36e6a..a3d2ae419 100644 --- a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs @@ -8,20 +8,17 @@ use crate::search::new::interner::Interned; use crate::search::new::SearchContext; use crate::Result; +// TODO: give a generation to each universe, then be able to get the exact +// delta of docids between two universes of different generations! + /// A cache storing the document ids associated with each ranking rule edge pub struct ConditionDocIdsCache { - // TODO: should be FxHashMap, RoaringBitmap> - pub cache: FxHashMap, RoaringBitmap>, - pub universe_length: u64, + pub cache: FxHashMap, (u64, RoaringBitmap)>, _phantom: PhantomData, } -impl ConditionDocIdsCache { - pub fn new(universe: &RoaringBitmap) -> Self { - Self { - cache: Default::default(), - _phantom: Default::default(), - universe_length: universe.len(), - } +impl Default for ConditionDocIdsCache { + fn default() -> Self { + Self { cache: Default::default(), _phantom: Default::default() } } } impl ConditionDocIdsCache { @@ -40,20 +37,21 @@ impl ConditionDocIdsCache { if self.cache.contains_key(&interned_condition) { // TODO compare length of universe compared to the one in self // if it is smaller, then update the value - - // TODO: should we update the bitmap in the cache if the new universe - // reduces it? - // TODO: maybe have a generation: u32 to track every time the universe was - // reduced. Then only attempt to recompute the intersection when there is a chance - // that condition_docids & universe changed - return Ok(&self.cache[&interned_condition]); + let (universe_len, docids) = self.cache.entry(interned_condition).or_default(); + if *universe_len == universe.len() { + return Ok(docids); + } else { + *docids &= universe; + *universe_len = universe.len(); + return Ok(docids); + } } // TODO: maybe universe doesn't belong here let condition = graph.conditions_interner.get(interned_condition); // TODO: faster way to do this? - let docids = universe & G::resolve_condition(ctx, condition, universe)?; - let _ = self.cache.insert(interned_condition, docids); - let docids = &self.cache[&interned_condition]; + let docids = G::resolve_condition(ctx, condition, universe)?; + let _ = self.cache.insert(interned_condition, (universe.len(), docids)); + let (_, docids) = &self.cache[&interned_condition]; Ok(docids) } } diff --git a/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs b/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs deleted file mode 100644 index 701421ea7..000000000 --- a/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs +++ /dev/null @@ -1,83 +0,0 @@ -// use super::{path_set::PathSet, RankingRuleGraphTrait}; -// use crate::search::new::{ -// interner::{FixedSizeInterner, Interned, MappedInterner}, -// small_bitmap::SmallBitmap, -// }; - -// /// A cache which stores sufficient conditions for a path -// /// to resolve to an empty set of candidates within the current -// /// universe. 
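The `ConditionDocIdsCache` rewrite above tags every cached entry with the length of the universe it was computed against. Within one ranking rule the universe only ever shrinks, so an equal length means the universe is unchanged and the cached docids are still exact; a different (smaller) length is repaired with one in-place intersection instead of resolving the condition against the index again. A minimal sketch of the scheme under that shrinking-universe assumption (`MiniCache` and `resolve` are illustrative stand-ins, not the patch's types):

use std::collections::HashMap;

use roaring::RoaringBitmap;

struct MiniCache {
    // condition id -> (universe length at computation time, docids)
    cache: HashMap<u32, (u64, RoaringBitmap)>,
}

impl MiniCache {
    fn get(
        &mut self,
        condition: u32,
        universe: &RoaringBitmap,
        resolve: impl FnOnce(&RoaringBitmap) -> RoaringBitmap,
    ) -> &RoaringBitmap {
        let entry = self
            .cache
            .entry(condition)
            .or_insert_with(|| (universe.len(), resolve(universe)));
        if entry.0 != universe.len() {
            // The universe shrank since this entry was computed: intersect
            // in place rather than resolving from the index again.
            entry.1 &= universe;
            entry.0 = universe.len();
        }
        &entry.1
    }
}

fn main() {
    let mut cache = MiniCache { cache: HashMap::new() };
    let universe: RoaringBitmap = (0..10).collect();
    let even = cache.get(7, &universe, |u| u.iter().filter(|d| d % 2 == 0).collect());
    assert_eq!(even.len(), 5);
}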
-// pub struct DeadEndPathCache { -// /// The set of edge conditions that resolve to no documents. -// pub conditions: SmallBitmap, -// /// A set of path prefixes that resolve to no documents. -// pub prefixes: PathSet, -// /// A set of empty couples of edge conditions that resolve to no documents. -// pub condition_couples: MappedInterner, G::Condition>, -// } -// impl Clone for DeadEndPathCache { -// fn clone(&self) -> Self { -// Self { -// conditions: self.conditions.clone(), -// prefixes: self.prefixes.clone(), -// condition_couples: self.condition_couples.clone(), -// } -// } -// } - -// impl DeadEndPathCache { -// /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. -// pub fn new(all_conditions: &FixedSizeInterner) -> Self { -// Self { -// conditions: SmallBitmap::for_interned_values_in(all_conditions), -// prefixes: PathSet::default(), -// condition_couples: all_conditions -// .map(|_| SmallBitmap::for_interned_values_in(all_conditions)), -// } -// } - -// /// Store in the cache that every path containing the given edge resolves to no documents. -// pub fn add_condition(&mut self, condition: Interned) { -// self.conditions.insert(condition); -// self.condition_couples.get_mut(condition).clear(); -// self.prefixes.remove_edge(condition); -// for (_, edges2) in self.condition_couples.iter_mut() { -// edges2.remove(condition); -// } -// } -// /// Store in the cache that every path containing the given prefix resolves to no documents. -// pub fn add_prefix(&mut self, prefix: &[Interned]) { -// // TODO: typed PathSet -// self.prefixes.insert(prefix.iter().copied()); -// } - -// /// Store in the cache that every path containing the two given edges resolves to no documents. -// pub fn add_condition_couple( -// &mut self, -// edge1: Interned, -// edge2: Interned, -// ) { -// self.condition_couples.get_mut(edge1).insert(edge2); -// } - -// /// Returns true if the cache can determine that the given path resolves to no documents. -// pub fn path_is_dead_end( -// &self, -// path: &[Interned], -// path_bitmap: &SmallBitmap, -// ) -> bool { -// if path_bitmap.intersects(&self.conditions) { -// return true; -// } -// for condition in path.iter() { -// let forbidden_other_edges = self.condition_couples.get(*condition); -// if path_bitmap.intersects(forbidden_other_edges) { -// return true; -// } -// } -// if self.prefixes.contains_prefix_of_path(path) { -// return true; -// } -// false -// } -// } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 977d6c96b..b01c82969 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -8,8 +8,7 @@ the same but the edges are replaced. 
mod build; mod cheapest_paths; mod condition_docids_cache; -mod dead_end_path_cache; -mod path_set; +mod dead_ends_cache; /// Implementation of the `proximity` ranking rule mod proximity; @@ -20,8 +19,7 @@ use std::collections::HashSet; use std::hash::Hash; pub use condition_docids_cache::ConditionDocIdsCache; -// pub use dead_end_path_cache::DeadEndPathCache; -pub use path_set::DeadEndsCache; +pub use dead_ends_cache::DeadEndsCache; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; diff --git a/milli/src/search/new/ranking_rule_graph/path_set.rs b/milli/src/search/new/ranking_rule_graph/path_set.rs deleted file mode 100644 index 0aa4472dc..000000000 --- a/milli/src/search/new/ranking_rule_graph/path_set.rs +++ /dev/null @@ -1,166 +0,0 @@ -// What is PathSet used for? -// For the empty_prefixes field in the EmptyPathsCache only :/ -// but it could be used for more, like efficient computing of a set of paths - -use crate::search::new::{ - interner::{FixedSizeInterner, Interned}, - small_bitmap::SmallBitmap, -}; - -pub struct DeadEndsCache { - nodes: Vec<(Interned, Self)>, - pub forbidden: SmallBitmap, -} -impl DeadEndsCache { - pub fn new(for_interner: &FixedSizeInterner) -> Self { - Self { nodes: vec![], forbidden: SmallBitmap::for_interned_values_in(for_interner) } - } - pub fn forbid_condition(&mut self, condition: Interned) { - self.forbidden.insert(condition); - } - fn advance(&mut self, condition: Interned) -> Option<&mut Self> { - for (e, next_node) in &mut self.nodes { - if condition == *e { - return Some(next_node); - } - } - None - } - pub fn forbidden_conditions_after_prefix( - &mut self, - mut prefix: &[Interned], - ) -> Option> { - let mut cursor = self; - for c in prefix.iter() { - if let Some(next) = cursor.advance(*c) { - cursor = next; - } else { - return None; - } - } - Some(cursor.forbidden.clone()) - } - pub fn forbid_condition_after_prefix( - &mut self, - mut prefix: impl Iterator>, - forbidden: Interned, - ) { - match prefix.next() { - None => { - self.forbidden.insert(forbidden); - } - Some(first_condition) => { - for (condition, next_node) in &mut self.nodes { - if condition == &first_condition { - return next_node.forbid_condition_after_prefix(prefix, forbidden); - } - } - let mut rest = DeadEndsCache { - nodes: vec![], - forbidden: SmallBitmap::new(self.forbidden.universe_length()), - }; - rest.forbid_condition_after_prefix(prefix, forbidden); - self.nodes.push((first_condition, rest)); - } - } - } -} -// /// A set of `Vec>` implemented as a prefix tree. 
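The `DeadEndsCache` deleted from this file (and reworked as its own module in patch 069 below) is a prefix trie: each node stores the conditions that are forbidden immediately after the exact sequence of conditions leading to that node. A self-contained miniature of the same shape, with plain `u16` ids and a `HashSet` standing in for the interned conditions and `SmallBitmap` of the real structure:

use std::collections::HashSet;

#[derive(Default)]
struct MiniDeadEnds {
    children: Vec<(u16, MiniDeadEnds)>,
    forbidden: HashSet<u16>,
}

impl MiniDeadEnds {
    fn forbid_after_prefix(&mut self, mut prefix: impl Iterator<Item = u16>, forbidden: u16) {
        match prefix.next() {
            None => {
                self.forbidden.insert(forbidden);
            }
            Some(first) => {
                let idx = match self.children.iter().position(|(c, _)| *c == first) {
                    Some(idx) => idx,
                    None => {
                        self.children.push((first, MiniDeadEnds::default()));
                        self.children.len() - 1
                    }
                };
                self.children[idx].1.forbid_after_prefix(prefix, forbidden);
            }
        }
    }

    fn forbidden_after_prefix(&self, prefix: &[u16]) -> Option<&HashSet<u16>> {
        match prefix {
            [] => Some(&self.forbidden),
            [first, rest @ ..] => self
                .children
                .iter()
                .find(|(c, _)| c == first)
                .and_then(|(_, child)| child.forbidden_after_prefix(rest)),
        }
    }
}

fn main() {
    let mut trie = MiniDeadEnds::default();
    trie.forbid_after_prefix([1u16, 2].into_iter(), 3); // paths starting [1, 2, 3] are dead
    assert_eq!(trie.forbidden_after_prefix(&[1, 2]).map(|s| s.contains(&3)), Some(true));
    assert!(trie.forbidden_after_prefix(&[2, 1]).is_none()); // prefixes are order-sensitive
}

As in the real cache, only the exact prefix exposes the forbidden set, which is why the search also unions the `forbidden` sets of every node it traverses along the current path.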
-// pub struct PathSet { -// nodes: Vec<(Interned, Self)>, -// is_end: bool, -// } - -// impl Clone for PathSet { -// fn clone(&self) -> Self { -// Self { nodes: self.nodes.clone(), is_end: self.is_end } -// } -// } - -// impl std::fmt::Debug for PathSet { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// f.debug_struct("PathSet").field("nodes", &self.nodes).field("is_end", &self.is_end).finish() -// } -// } - -// impl Default for PathSet { -// fn default() -> Self { -// Self { nodes: Default::default(), is_end: Default::default() } -// } -// } - -// impl PathSet { -// pub fn insert(&mut self, mut conditions: impl Iterator>) { -// match conditions.next() { -// None => { -// self.is_end = true; -// } -// Some(first_condition) => { -// for (condition, next_node) in &mut self.nodes { -// if condition == &first_condition { -// return next_node.insert(conditions); -// } -// } -// let mut rest = PathSet::default(); -// rest.insert(conditions); -// self.nodes.push((first_condition, rest)); -// } -// } -// } - -// pub fn remove_condition(&mut self, forbidden_condition: Interned) { -// let mut i = 0; -// while i < self.nodes.len() { -// let should_remove = if self.nodes[i].0 == forbidden_condition { -// true -// } else if !self.nodes[i].1.nodes.is_empty() { -// self.nodes[i].1.remove_condition(forbidden_condition); -// self.nodes[i].1.nodes.is_empty() -// } else { -// false -// }; -// if should_remove { -// self.nodes.remove(i); -// } else { -// i += 1; -// } -// } -// } - -// pub fn final_conditions_after_prefix( -// &self, -// prefix: &[Interned], -// visit: &mut impl FnMut(Interned), -// ) { -// let [first_condition, remaining_prefix @ ..] = prefix else { -// for node in self.nodes.iter() { -// if node.1.is_end { -// visit(node.0) -// } -// } -// return -// }; -// for (condition, rest) in self.nodes.iter() { -// if condition == first_condition { -// return rest.final_conditions_after_prefix(remaining_prefix, visit); -// } -// } -// } - -// pub fn contains_prefix_of_path(&self, path: &[Interned]) -> bool { -// if self.is_end { -// return true; -// } -// match path { -// [] => false, -// [first_condition, remaining_path @ ..] => { -// for (condition, rest) in self.nodes.iter() { -// if condition == first_condition { -// return rest.contains_prefix_of_path(remaining_path); -// } -// } -// false -// } -// } -// } -// } From dd491320e53380f6a922a5a2cf0ca8eb83cd62a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sun, 19 Mar 2023 14:43:14 +0100 Subject: [PATCH 067/234] Simplify graph-based ranking rule impl --- .../search/new/graph_based_ranking_rule.rs | 61 +++---------------- .../new/ranking_rule_graph/cheapest_paths.rs | 6 -- 2 files changed, 7 insertions(+), 60 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 31d6e504c..211fce736 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -87,7 +87,7 @@ pub struct GraphBasedRankingRuleState { /// Cache to retrieve the docids associated with each edge conditions_cache: ConditionDocIdsCache, /// Cache used to optimistically discard paths that resolve to no documents. - dead_end_path_cache: DeadEndsCache, + dead_ends_cache: DeadEndsCache, /// A structure giving the list of possible costs from each node to the end node, /// along with a set of unavoidable edges that must be traversed to achieve that distance. 
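The two fields that follow (`all_distances`, `cur_distance_idx`) drive the bucket iteration: each call to `next_bucket` takes the next reachable cost and returns the union of the docids of all paths whose edge costs sum to it. A toy rendering of that idea with invented per-path costs and docids:

use roaring::RoaringBitmap;

fn main() {
    // (cost of the path, docids the path resolves to): hypothetical data
    let paths: Vec<(u16, RoaringBitmap)> = vec![
        (0, [1u32, 2].into_iter().collect()),
        (1, [3u32].into_iter().collect()),
        (1, [4u32].into_iter().collect()),
    ];

    let mut costs: Vec<u16> = paths.iter().map(|(c, _)| *c).collect();
    costs.sort_unstable();
    costs.dedup();

    for cost in costs {
        let bucket = paths
            .iter()
            .filter(|(c, _)| *c == cost)
            .fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids);
        println!("bucket for cost {cost}: {} documents", bucket.len());
    }
}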
all_distances: MappedInterner)>, QueryNode>, @@ -95,34 +95,6 @@ pub struct GraphBasedRankingRuleState { cur_distance_idx: usize, } -/// Traverse each edge of the graph, computes its associated document ids, -/// and remove this edge from the graph if its docids are disjoint with the -/// given universe. -fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( - ctx: &mut SearchContext<'ctx>, - graph: &mut RankingRuleGraph, - condition_docids_cache: &mut ConditionDocIdsCache, - universe: &RoaringBitmap, - dead_end_path_cache: &mut DeadEndsCache, -) -> Result<()> { - for edge_id in graph.edges_store.indexes() { - let Some(edge) = graph.edges_store.get(edge_id).as_ref() else { - continue; - }; - let Some(condition) = edge.condition else { continue }; - - let docids = - condition_docids_cache.get_condition_docids(ctx, condition, graph, universe)?; - if docids.is_empty() { - graph.remove_edges_with_condition(condition); - dead_end_path_cache.forbid_condition(condition); // add_condition(condition); - condition_docids_cache.cache.remove(&condition); - continue; - } - } - Ok(()) -} - impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule { fn id(&self) -> String { self.id.clone() @@ -131,22 +103,12 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase &mut self, ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, - universe: &RoaringBitmap, + _universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { - let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; - let mut condition_docids_cache = ConditionDocIdsCache::default(); - let mut dead_end_path_cache = DeadEndsCache::new(&graph.conditions_interner); - - // First simplify the graph as much as possible, by computing the docids of all the conditions - // within the rule's universe and removing the edges that have no associated docids. - remove_empty_edges( - ctx, - &mut graph, - &mut condition_docids_cache, - universe, - &mut dead_end_path_cache, - )?; + let graph = RankingRuleGraph::build(ctx, query_graph.clone())?; + let condition_docids_cache = ConditionDocIdsCache::default(); + let dead_end_path_cache = DeadEndsCache::new(&graph.conditions_interner); // Then pre-compute the cost of all paths from each node to the end node let all_distances = graph.initialize_distances_with_necessary_edges(); @@ -154,7 +116,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let state = GraphBasedRankingRuleState { graph, conditions_cache: condition_docids_cache, - dead_end_path_cache, + dead_ends_cache: dead_end_path_cache, all_distances, cur_distance_idx: 0, }; @@ -177,15 +139,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // should never happen let mut state = self.state.take().unwrap(); - // TODO: does this have a real positive performance impact? - remove_empty_edges( - ctx, - &mut state.graph, - &mut state.conditions_cache, - universe, - &mut state.dead_end_path_cache, - )?; - // If the cur_distance_idx does not point to a valid cost in the `all_distances` // structure, then we have computed all the buckets and can return. 
if state.cur_distance_idx @@ -205,7 +158,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let GraphBasedRankingRuleState { graph, conditions_cache: condition_docids_cache, - dead_end_path_cache, + dead_ends_cache: dead_end_path_cache, all_distances, cur_distance_idx: _, } = &mut state; diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 4fb4d0844..017663443 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -10,12 +10,6 @@ use crate::search::new::query_graph::QueryNode; use crate::search::new::small_bitmap::SmallBitmap; use crate::Result; -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct Path { - pub edges: Vec, - pub cost: u64, -} - impl RankingRuleGraph { pub fn visit_paths_of_cost( &mut self, From 825f74200041ebf1a81c53dd850a504a3cc9b9e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sun, 19 Mar 2023 15:03:57 +0100 Subject: [PATCH 068/234] Simplify graph-based ranking rule impl --- .../search/new/graph_based_ranking_rule.rs | 33 +++++---- milli/src/search/new/logger/detailed.rs | 39 +++++----- milli/src/search/new/logger/mod.rs | 17 +++-- .../new/ranking_rule_graph/cheapest_paths.rs | 71 +++++++------------ .../src/search/new/ranking_rule_graph/mod.rs | 4 +- .../new/ranking_rule_graph/proximity/mod.rs | 9 ++- .../search/new/ranking_rule_graph/typo/mod.rs | 7 +- 7 files changed, 77 insertions(+), 103 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 211fce736..4a96855ce 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -88,9 +88,8 @@ pub struct GraphBasedRankingRuleState { conditions_cache: ConditionDocIdsCache, /// Cache used to optimistically discard paths that resolve to no documents. dead_ends_cache: DeadEndsCache, - /// A structure giving the list of possible costs from each node to the end node, - /// along with a set of unavoidable edges that must be traversed to achieve that distance. 
- all_distances: MappedInterner)>, QueryNode>, + /// A structure giving the list of possible costs from each node to the end node + all_distances: MappedInterner, QueryNode>, /// An index in the first element of `all_distances`, giving the cost of the next bucket cur_distance_idx: usize, } @@ -108,7 +107,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase ) -> Result<()> { let graph = RankingRuleGraph::build(ctx, query_graph.clone())?; let condition_docids_cache = ConditionDocIdsCache::default(); - let dead_end_path_cache = DeadEndsCache::new(&graph.conditions_interner); + let dead_ends_cache = DeadEndsCache::new(&graph.conditions_interner); // Then pre-compute the cost of all paths from each node to the end node let all_distances = graph.initialize_distances_with_necessary_edges(); @@ -116,7 +115,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let state = GraphBasedRankingRuleState { graph, conditions_cache: condition_docids_cache, - dead_ends_cache: dead_end_path_cache, + dead_ends_cache, all_distances, cur_distance_idx: 0, }; @@ -149,7 +148,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase } // Retrieve the cost of the paths to compute - let (cost, _) = + let cost = state.all_distances.get(state.graph.query_graph.root_node)[state.cur_distance_idx]; state.cur_distance_idx += 1; @@ -158,7 +157,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let GraphBasedRankingRuleState { graph, conditions_cache: condition_docids_cache, - dead_ends_cache: dead_end_path_cache, + dead_ends_cache, all_distances, cur_distance_idx: _, } = &mut state; @@ -174,15 +173,15 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // For each path of the given cost, we will compute its associated // document ids. // In case the path does not resolve to any document id, we try to figure out why - // and update the `dead_end_path_cache` accordingly. - // Updating the dead_end_path_cache helps speed up the execution of `visit_paths_of_cost` and reduces + // and update the `dead_ends_cache` accordingly. + // Updating the dead_ends_cache helps speed up the execution of `visit_paths_of_cost` and reduces // the number of future candidate paths given by that same function. graph.visit_paths_of_cost( graph.query_graph.root_node, cost, all_distances, - dead_end_path_cache, - |path, graph, dead_end_path_cache| { + dead_ends_cache, + |path, graph, dead_ends_cache| { if universe.is_empty() { return Ok(ControlFlow::Break(())); } @@ -211,7 +210,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // and caches accordingly and skip to the next candidate path. if condition_docids.is_empty() { // 1. Store in the cache that this edge is empty for this universe - dead_end_path_cache.forbid_condition(latest_condition); + dead_ends_cache.forbid_condition(latest_condition); // 2. remove all the edges with this condition from the ranking rule graph graph.remove_edges_with_condition(latest_condition); // 3. Also remove the entry from the condition_docids_cache, since we don't need it anymore @@ -226,7 +225,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let len_prefix = subpath_docids.len() - 1; // First, we know that this path is empty, and thus any path // that is a superset of it will also be empty. 
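                        // ("Superset" is meant in the prefix sense: the cache records that
                        // `latest_condition` is forbidden after this exact prefix of conditions,
                        // so only future candidate paths sharing that prefix get pruned.)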
- dead_end_path_cache.forbid_condition_after_prefix( + dead_ends_cache.forbid_condition_after_prefix( visited_conditions[..len_prefix].iter().copied(), latest_condition, ); @@ -244,7 +243,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase }; subprefix.push(*past_condition); if condition_docids.is_disjoint(subpath_docids) { - dead_end_path_cache.forbid_condition_after_prefix( + dead_ends_cache.forbid_condition_after_prefix( subprefix.iter().copied(), latest_condition, ); @@ -253,8 +252,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // keep the same prefix and check the intersection with // all the remaining conditions - let mut forbidden = dead_end_path_cache.forbidden.clone(); - let mut cursor = dead_end_path_cache; + let mut forbidden = dead_ends_cache.forbidden.clone(); + let mut cursor = dead_ends_cache; for &c in visited_conditions[..len_prefix].iter() { cursor = cursor.advance(c).unwrap(); forbidden.union(&cursor.forbidden); @@ -301,7 +300,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase G::log_state( &original_graph, &good_paths, - dead_end_path_cache, + dead_ends_cache, original_universe, all_distances, cost, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index a46e63005..23134c113 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -44,17 +44,17 @@ pub enum SearchEvents { ProximityState { graph: RankingRuleGraph, paths: Vec>>, - dead_end_path_cache: DeadEndsCache, + dead_ends_cache: DeadEndsCache, universe: RoaringBitmap, - distances: MappedInterner)>, QueryNode>, + distances: MappedInterner, QueryNode>, cost: u16, }, TypoState { graph: RankingRuleGraph, paths: Vec>>, - dead_end_path_cache: DeadEndsCache, + dead_ends_cache: DeadEndsCache, universe: RoaringBitmap, - distances: MappedInterner)>, QueryNode>, + distances: MappedInterner, QueryNode>, cost: u16, }, RankingRuleSkipBucket { @@ -170,15 +170,15 @@ impl SearchLogger for DetailedSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec>], - dead_end_path_cache: &DeadEndsCache, + dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner, QueryNode>, cost: u16, ) { self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), - dead_end_path_cache: dead_end_path_cache.clone(), + dead_ends_cache: dead_ends_cache.clone(), universe: universe.clone(), distances: distances.clone(), cost, @@ -189,15 +189,15 @@ impl SearchLogger for DetailedSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec>], - dead_end_path_cache: &DeadEndsCache, + dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner, QueryNode>, cost: u16, ) { self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), - dead_end_path_cache: dead_end_path_cache.clone(), + dead_ends_cache: dead_ends_cache.clone(), universe: universe.clone(), distances: distances.clone(), cost, @@ -357,7 +357,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ SearchEvents::ProximityState { graph, paths, - dead_end_path_cache, + dead_ends_cache, universe, distances, cost, @@ -373,7 +373,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ ctx, graph, paths, - dead_end_path_cache, + dead_ends_cache, distances.clone(), &mut 
new_file, ); @@ -390,7 +390,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ SearchEvents::TypoState { graph, paths, - dead_end_path_cache, + dead_ends_cache, universe, distances, cost, @@ -406,7 +406,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ ctx, graph, paths, - dead_end_path_cache, + dead_ends_cache, distances.clone(), &mut new_file, ); @@ -429,7 +429,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ ctx: &mut SearchContext, node_idx: Interned, node: &QueryNode, - distances: &[(u16, SmallBitmap)], + distances: &[u16], file: &mut File, ) { match &node.data { @@ -490,9 +490,8 @@ shape: class" let p = ctx.word_interner.get(*use_prefix_db); writeln!(file, "use prefix DB : {p}").unwrap(); } - for (d, edges) in distances.iter() { - writeln!(file, "\"distance {d}\" : {:?}", edges.iter().collect::>()) - .unwrap(); + for d in distances.iter() { + writeln!(file, "\"d_{d}\" : distance").unwrap(); } writeln!(file, "}}").unwrap(); @@ -527,8 +526,8 @@ shape: class" ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_paths_cache: &DeadEndsCache, - distances: MappedInterner)>, QueryNode>, + _dead_ends_cache: &DeadEndsCache, + distances: MappedInterner, QueryNode>, file: &mut File, ) { writeln!(file, "direction: right").unwrap(); diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index f8ab89cbf..203ac7b56 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -8,7 +8,6 @@ use super::query_graph::QueryNode; use super::ranking_rule_graph::{ DeadEndsCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoCondition, TypoGraph, }; -use super::small_bitmap::SmallBitmap; use super::{RankingRule, RankingRuleQueryTrait}; /// Trait for structure logging the execution of a search query. 
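The same patch reworks the distance pre-computation in `cheapest_paths.rs` (below) so that each node keeps a plain sorted set of possible costs to the end node, rather than pairing each cost with a set of unavoidable edges. A toy version of that backward pass over a small DAG, assuming nodes are given in reverse topological order (the indices and edges are invented):

use std::collections::BTreeSet;

// For every node, the set of total costs of paths from it to `end`.
fn distances_to_end(n: usize, end: usize, edges: &[(usize, usize, u16)]) -> Vec<BTreeSet<u16>> {
    let mut dist = vec![BTreeSet::new(); n];
    dist[end].insert(0u16);
    for from in (0..n).rev() {
        for &(f, to, cost) in edges {
            if f == from {
                let succ: Vec<u16> = dist[to].iter().copied().collect();
                for d in succ {
                    dist[from].insert(cost + d);
                }
            }
        }
    }
    dist
}

fn main() {
    // 0 --1--> 1 --1--> 2(end), plus a direct edge 0 --2--> 2
    let dist = distances_to_end(3, 2, &[(0, 1, 1), (1, 2, 1), (0, 2, 2)]);
    assert_eq!(dist[0], BTreeSet::from([2u16])); // both routes cost 2: one bucket, not two
    assert_eq!(dist[1], BTreeSet::from([1u16]));
}

Incidentally, the hunk as committed both pushes each cost into `distances_to_end_cur_node` and then overwrites the whole vector with `self_distances.into_iter().collect()`, so the push loop appears to be redundant.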
@@ -66,9 +65,9 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndsCache, + dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner, QueryNode>, cost: u16, ); @@ -77,9 +76,9 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndsCache, + dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner, QueryNode>, cost: u16, ); } @@ -137,9 +136,9 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths_map: &[Vec>], - _dead_end_path_cache: &DeadEndsCache, + _dead_ends_cache: &DeadEndsCache, _universe: &RoaringBitmap, - _distances: &MappedInterner)>, QueryNode>, + _distances: &MappedInterner, QueryNode>, _cost: u16, ) { } @@ -148,9 +147,9 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths: &[Vec>], - _dead_end_path_cache: &DeadEndsCache, + _dead_ends_cache: &DeadEndsCache, _universe: &RoaringBitmap, - _distances: &MappedInterner)>, QueryNode>, + _distances: &MappedInterner, QueryNode>, _cost: u16, ) { } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 017663443..c340ef8c7 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -1,7 +1,6 @@ #![allow(clippy::too_many_arguments)] -use std::collections::btree_map::Entry; -use std::collections::{BTreeMap, VecDeque}; +use std::collections::{BTreeSet, VecDeque}; use std::ops::ControlFlow; use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; @@ -15,8 +14,8 @@ impl RankingRuleGraph { &mut self, from: Interned, cost: u16, - all_distances: &MappedInterner)>, QueryNode>, - dead_end_path_cache: &mut DeadEndsCache, + all_distances: &MappedInterner, QueryNode>, + dead_ends_cache: &mut DeadEndsCache, mut visit: impl FnMut( &[Interned], &mut Self, @@ -27,11 +26,11 @@ impl RankingRuleGraph { from, cost, all_distances, - dead_end_path_cache, + dead_ends_cache, &mut visit, &mut vec![], &mut SmallBitmap::for_interned_values_in(&self.conditions_interner), - dead_end_path_cache.forbidden.clone(), + dead_ends_cache.forbidden.clone(), )?; Ok(()) } @@ -39,8 +38,8 @@ impl RankingRuleGraph { &mut self, from: Interned, cost: u16, - all_distances: &MappedInterner)>, QueryNode>, - dead_end_path_cache: &mut DeadEndsCache, + all_distances: &MappedInterner, QueryNode>, + dead_ends_cache: &mut DeadEndsCache, visit: &mut impl FnMut( &[Interned], &mut Self, @@ -62,7 +61,7 @@ impl RankingRuleGraph { None => { if edge.dest_node == self.query_graph.end_node { any_valid = true; - let control_flow = visit(prev_conditions, self, dead_end_path_cache)?; + let control_flow = visit(prev_conditions, self, dead_ends_cache)?; match control_flow { ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(true), @@ -73,7 +72,7 @@ impl RankingRuleGraph { edge.dest_node, cost - edge.cost as u16, all_distances, - dead_end_path_cache, + dead_ends_cache, visit, prev_conditions, cur_path, @@ -83,12 +82,10 @@ impl RankingRuleGraph { } Some(condition) => { if forbidden_conditions.contains(condition) - || !all_distances.get(edge.dest_node).iter().any( - |(next_cost, necessary_conditions)| { - (*next_cost == cost - edge.cost as u16) - && 
!forbidden_conditions.intersects(necessary_conditions) - }, - ) + || all_distances + .get(edge.dest_node) + .iter() + .all(|next_cost| *next_cost != cost - edge.cost as u16) { continue; } @@ -96,14 +93,14 @@ impl RankingRuleGraph { prev_conditions.push(condition); let mut new_forbidden_conditions = forbidden_conditions.clone(); if let Some(next_forbidden) = - dead_end_path_cache.forbidden_conditions_after_prefix(prev_conditions) + dead_ends_cache.forbidden_conditions_after_prefix(prev_conditions) { new_forbidden_conditions.union(&next_forbidden); } let next_any_valid = if edge.dest_node == self.query_graph.end_node { any_valid = true; - let control_flow = visit(prev_conditions, self, dead_end_path_cache)?; + let control_flow = visit(prev_conditions, self, dead_ends_cache)?; match control_flow { ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(true), @@ -114,7 +111,7 @@ impl RankingRuleGraph { edge.dest_node, cost - edge.cost as u16, all_distances, - dead_end_path_cache, + dead_ends_cache, visit, prev_conditions, cur_path, @@ -129,8 +126,8 @@ impl RankingRuleGraph { any_valid |= next_any_valid; if next_any_valid { - forbidden_conditions = dead_end_path_cache - .forbidden_conditions_for_all_prefixes_up_to(prev_conditions); + forbidden_conditions = + dead_ends_cache.forbidden_conditions_for_all_prefixes_up_to(prev_conditions); if cur_path.intersects(&forbidden_conditions) { break 'edges_loop; } @@ -140,16 +137,13 @@ impl RankingRuleGraph { Ok(any_valid) } - pub fn initialize_distances_with_necessary_edges( - &self, - ) -> MappedInterner)>, QueryNode> { + pub fn initialize_distances_with_necessary_edges(&self) -> MappedInterner, QueryNode> { let mut distances_to_end = self.query_graph.nodes.map(|_| vec![]); let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); let mut node_stack = VecDeque::new(); - *distances_to_end.get_mut(self.query_graph.end_node) = - vec![(0, SmallBitmap::for_interned_values_in(&self.conditions_interner))]; + *distances_to_end.get_mut(self.query_graph.end_node) = vec![0]; for prev_node in self.query_graph.nodes.get(self.query_graph.end_node).predecessors.iter() { node_stack.push_back(prev_node); @@ -157,35 +151,20 @@ impl RankingRuleGraph { } while let Some(cur_node) = node_stack.pop_front() { - let mut self_distances = BTreeMap::>::new(); + let mut self_distances = BTreeSet::::new(); let cur_node_edges = &self.edges_of_node.get(cur_node); for edge_idx in cur_node_edges.iter() { let edge = self.edges_store.get(edge_idx).as_ref().unwrap(); let succ_node = edge.dest_node; let succ_distances = distances_to_end.get(succ_node); - for (succ_distance, succ_necessary_conditions) in succ_distances { - let mut potential_necessary_edges = - SmallBitmap::for_interned_values_in(&self.conditions_interner); - for condition in - edge.condition.into_iter().chain(succ_necessary_conditions.iter()) - { - potential_necessary_edges.insert(condition); - } - - match self_distances.entry(edge.cost as u16 + succ_distance) { - Entry::Occupied(mut prev_necessary_edges) => { - prev_necessary_edges.get_mut().intersection(&potential_necessary_edges); - } - Entry::Vacant(entry) => { - entry.insert(potential_necessary_edges); - } - } + for succ_distance in succ_distances { + self_distances.insert(edge.cost as u16 + succ_distance); } } let distances_to_end_cur_node = distances_to_end.get_mut(cur_node); - for (cost, necessary_edges) in self_distances.iter() { - distances_to_end_cur_node.push((*cost, necessary_edges.clone())); + for cost in self_distances.iter() { + 
distances_to_end_cur_node.push(*cost); } *distances_to_end.get_mut(cur_node) = self_distances.into_iter().collect(); for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index b01c82969..129590088 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -112,9 +112,9 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndsCache, + dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 690200773..4c058ac8e 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -6,11 +6,10 @@ use std::iter::FromIterator; use roaring::RoaringBitmap; -use super::{RankingRuleGraph, RankingRuleGraphTrait, DeadEndsCache}; +use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::{Phrase, QueryTerm}; -use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; @@ -66,13 +65,13 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndsCache, + dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ) { - logger.log_proximity_state(graph, paths, dead_end_path_cache, universe, distances, cost); + logger.log_proximity_state(graph, paths, dead_ends_cache, universe, distances, cost); } fn label_for_condition<'ctx>( diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index e1e01d6b1..854bd589b 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -5,7 +5,6 @@ use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; -use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; use std::collections::HashSet; @@ -136,13 +135,13 @@ impl RankingRuleGraphTrait for TypoGraph { fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndsCache, + dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner)>, QueryNode>, + distances: &MappedInterner, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ) { - logger.log_typo_state(graph, paths, dead_end_path_cache, universe, distances, cost); + logger.log_typo_state(graph, paths, dead_ends_cache, universe, distances, cost); } fn label_for_condition<'ctx>( From a59ca28e2cf92b1e4b300f3f5c58e58008f1a0b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sun, 
19 Mar 2023 15:14:40 +0100 Subject: [PATCH 069/234] Add forgotten file --- .../new/ranking_rule_graph/dead_ends_cache.rs | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs diff --git a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs new file mode 100644 index 000000000..ecb6ae8da --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs @@ -0,0 +1,93 @@ +use crate::search::new::{ + interner::{FixedSizeInterner, Interned}, + small_bitmap::SmallBitmap, +}; + +pub struct DeadEndsCache { + conditions: Vec>, + next: Vec, + pub forbidden: SmallBitmap, +} +impl Clone for DeadEndsCache { + fn clone(&self) -> Self { + Self { + conditions: self.conditions.clone(), + next: self.next.clone(), + forbidden: self.forbidden.clone(), + } + } +} +impl DeadEndsCache { + pub fn new(for_interner: &FixedSizeInterner) -> Self { + Self { + conditions: vec![], + next: vec![], + forbidden: SmallBitmap::for_interned_values_in(for_interner), + } + } + pub fn forbid_condition(&mut self, condition: Interned) { + self.forbidden.insert(condition); + } + + pub fn advance(&mut self, condition: Interned) -> Option<&mut Self> { + if let Some(idx) = self.conditions.iter().position(|c| *c == condition) { + Some(&mut self.next[idx]) + } else { + None + } + } + pub fn forbidden_conditions_for_all_prefixes_up_to( + &mut self, + prefix: &[Interned], + ) -> SmallBitmap { + let mut forbidden = self.forbidden.clone(); + let mut cursor = self; + for c in prefix.iter() { + if let Some(next) = cursor.advance(*c) { + cursor = next; + forbidden.union(&cursor.forbidden); + } else { + break; + } + } + forbidden + } + pub fn forbidden_conditions_after_prefix( + &mut self, + prefix: &[Interned], + ) -> Option> { + let mut cursor = self; + for c in prefix.iter() { + if let Some(next) = cursor.advance(*c) { + cursor = next; + } else { + return None; + } + } + Some(cursor.forbidden.clone()) + } + pub fn forbid_condition_after_prefix( + &mut self, + mut prefix: impl Iterator>, + forbidden: Interned, + ) { + match prefix.next() { + None => { + self.forbidden.insert(forbidden); + } + Some(first_condition) => { + if let Some(idx) = self.conditions.iter().position(|c| *c == first_condition) { + return self.next[idx].forbid_condition_after_prefix(prefix, forbidden); + } + let mut rest = DeadEndsCache { + conditions: vec![], + next: vec![], + forbidden: SmallBitmap::new(self.forbidden.universe_length()), + }; + rest.forbid_condition_after_prefix(prefix, forbidden); + self.conditions.push(first_condition); + self.next.push(rest); + } + } + } +} From fbb1ba3de07b06bedf568e7d8d5a34d8fb9048ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sun, 19 Mar 2023 15:15:58 +0100 Subject: [PATCH 070/234] Cargo fmt --- milli/src/lib.rs | 5 +---- milli/src/search/new/distinct.rs | 16 ++++++---------- milli/src/search/new/logger/detailed.rs | 4 +--- milli/src/search/new/mod.rs | 7 +++---- .../new/ranking_rule_graph/dead_ends_cache.rs | 6 ++---- .../new/ranking_rule_graph/proximity/build.rs | 3 ++- .../search/new/ranking_rule_graph/typo/mod.rs | 7 ++++--- 7 files changed, 19 insertions(+), 29 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 199221c7c..b256192bd 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -70,10 +70,6 @@ pub mod update; #[macro_use] pub mod snapshot_tests; -pub use search::new::DetailedSearchLogger; - -pub use 
search::new::{execute_search, DefaultSearchLogger, SearchContext}; - use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; @@ -81,6 +77,7 @@ use std::hash::BuildHasherDefault; pub use filter_parser::{Condition, FilterCondition, Span, Token}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; +pub use search::new::{execute_search, DefaultSearchLogger, DetailedSearchLogger, SearchContext}; use serde_json::Value; pub use {charabia as tokenizer, heed}; diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs index 10657f210..f2e79603a 100644 --- a/milli/src/search/new/distinct.rs +++ b/milli/src/search/new/distinct.rs @@ -1,19 +1,15 @@ -use heed::{ - types::{ByteSlice, Str, Unit}, - Database, RoPrefix, RoTxn, -}; +use heed::types::{ByteSlice, Str, Unit}; +use heed::{Database, RoPrefix, RoTxn}; use roaring::RoaringBitmap; const FID_SIZE: usize = 2; const DOCID_SIZE: usize = 4; -use crate::{ - heed_codec::{ - facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec}, - ByteSliceRefCodec, - }, - Index, Result, SearchContext, +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec, }; +use crate::heed_codec::ByteSliceRefCodec; +use crate::{Index, Result, SearchContext}; pub struct DistinctOutput { pub remaining: RoaringBitmap, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 23134c113..b519ca659 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -13,9 +13,7 @@ use crate::search::new::ranking_rule_graph::{ DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph, }; -use crate::search::new::small_bitmap::SmallBitmap; -use crate::search::new::{QueryGraph, QueryNode, SearchContext}; -use crate::search::new::{RankingRule, SearchLogger}; +use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger}; pub enum SearchEvents { RankingRuleStartIteration { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 29a6020aa..c4e494242 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -16,17 +16,15 @@ mod sort; mod words; // #[cfg(test)] -pub use logger::detailed::DetailedSearchLogger; -pub use logger::{DefaultSearchLogger, SearchLogger}; - use std::collections::{BTreeSet, HashSet}; -use crate::{Filter, Index, MatchingWords, Result, Search, SearchResult, TermsMatchingStrategy}; use charabia::Tokenize; use db_cache::DatabaseCache; use graph_based_ranking_rule::{Proximity, Typo}; use heed::RoTxn; use interner::DedupInterner; +pub use logger::detailed::DetailedSearchLogger; +pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode, QueryNodeData}; use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; @@ -35,6 +33,7 @@ use roaring::RoaringBitmap; use words::Words; use self::ranking_rules::RankingRule; +use crate::{Filter, Index, MatchingWords, Result, Search, SearchResult, TermsMatchingStrategy}; /// A structure used throughout the execution of a search query. 
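Patch 071 below settles the ascending/descending split in the sort rule by wrapping the two facet-sort iterators in `itertools::Either`, so that both branches of the `if` produce one concrete iterator type. The pattern in isolation (a sketch that assumes the itertools crate, nothing milli-specific):

use itertools::Either;

fn numbers(ascending: bool) -> impl Iterator<Item = u32> {
    if ascending {
        Either::Left(0..5) // Either<L, R> iterates whenever both sides do
    } else {
        Either::Right((0..5).rev())
    }
}

fn main() {
    assert_eq!(numbers(true).collect::<Vec<_>>(), vec![0, 1, 2, 3, 4]);
    assert_eq!(numbers(false).collect::<Vec<_>>(), vec![4, 3, 2, 1, 0]);
}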
pub struct SearchContext<'ctx> { diff --git a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs index ecb6ae8da..d25c69c23 100644 --- a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs @@ -1,7 +1,5 @@ -use crate::search::new::{ - interner::{FixedSizeInterner, Interned}, - small_bitmap::SmallBitmap, -}; +use crate::search::new::interner::{FixedSizeInterner, Interned}; +use crate::search::new::small_bitmap::SmallBitmap; pub struct DeadEndsCache { conditions: Vec>, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 5e38a1879..d0977d732 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -1,6 +1,8 @@ #![allow(clippy::too_many_arguments)] use std::collections::BTreeMap; +use heed::RoTxn; + use super::ProximityCondition; use crate::search::new::db_cache::DatabaseCache; use crate::search::new::interner::{DedupInterner, Interned}; @@ -9,7 +11,6 @@ use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; use crate::search::new::ranking_rule_graph::proximity::WordPair; use crate::search::new::{QueryNode, SearchContext}; use crate::Result; -use heed::RoTxn; fn last_word_of_term_iter<'t>( t: &'t QueryTerm, diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 854bd589b..6b8fc1154 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,3 +1,7 @@ +use std::collections::HashSet; +use std::fmt::Write; +use std::iter::FromIterator; + use roaring::RoaringBitmap; use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; @@ -7,9 +11,6 @@ use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; -use std::collections::HashSet; -use std::fmt::Write; -use std::iter::FromIterator; #[derive(Clone, PartialEq, Eq, Hash)] pub struct TypoCondition { From 65474c8de50a03ec7d713cea3df7f8d98e6c73a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 20 Mar 2023 09:26:11 +0100 Subject: [PATCH 071/234] Update new sort ranking rule after rebasing --- milli/src/search/new/sort.rs | 62 ++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index 70173889e..a1bced92c 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -68,30 +68,52 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, ) -> Result<()> { let iter: RankingRuleOutputIterWrapper = match self.field_id { Some(field_id) => { - let make_iter = - if self.is_ascending { ascending_facet_sort } else { descending_facet_sort }; + let number_db = ctx + .index + .facet_id_f64_docids + .remap_key_type::>(); + let string_db = ctx + .index + .facet_id_string_docids + .remap_key_type::>(); - let number_iter = make_iter( - ctx.txn, - ctx.index - .facet_id_f64_docids - .remap_key_type::>(), - field_id, - parent_candidates.clone(), - )?; + let (number_iter, string_iter) = if self.is_ascending { + let number_iter = ascending_facet_sort( + ctx.txn, + number_db, + 
field_id, + parent_candidates.clone(), + )?; + let string_iter = ascending_facet_sort( + ctx.txn, + string_db, + field_id, + parent_candidates.clone(), + )?; + + (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter)) + } else { + let number_iter = descending_facet_sort( + ctx.txn, + number_db, + field_id, + parent_candidates.clone(), + )?; + let string_iter = descending_facet_sort( + ctx.txn, + string_db, + field_id, + parent_candidates.clone(), + )?; + + (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) + }; - let string_iter = make_iter( - ctx.txn, - ctx.index - .facet_id_string_docids - .remap_key_type::>(), - field_id, - parent_candidates.clone(), - )?; let query_graph = parent_query_graph.clone(); RankingRuleOutputIterWrapper::new(Box::new(number_iter.chain(string_iter).map( - move |docids| { - Ok(RankingRuleOutput { query: query_graph.clone(), candidates: docids? }) + move |r| { + let (docids, _) = r?; + Ok(RankingRuleOutput { query: query_graph.clone(), candidates: docids }) }, ))) } From 5b50e49522269356fc047c935e7eab635bdcc007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 20 Mar 2023 09:30:10 +0100 Subject: [PATCH 072/234] cargo fmt --- milli/src/search/new/sort.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index a1bced92c..6277149bd 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -2,6 +2,10 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait, SearchContext}; +use crate::heed_codec::facet::FacetGroupKeyCodec; +use crate::heed_codec::ByteSliceRefCodec; +use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; +use crate::{FieldId, Index, Result}; pub trait RankingRuleOutputIter<'ctx, Query> { fn next_bucket(&mut self) -> Result>>; @@ -24,15 +28,6 @@ impl<'ctx, Query> RankingRuleOutputIter<'ctx, Query> for RankingRuleOutputIterWr } } -use crate::{ - // facet::FacetType, - heed_codec::{facet::FacetGroupKeyCodec, ByteSliceRefCodec}, - search::facet::{ascending_facet_sort, descending_facet_sort}, - FieldId, - Index, - Result, -}; - pub struct Sort<'ctx, Query> { field_name: String, field_id: Option, @@ -64,7 +59,7 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, parent_candidates: &RoaringBitmap, - parent_query_graph: &Query, + parent_query: &Query, ) -> Result<()> { let iter: RankingRuleOutputIterWrapper = match self.field_id { Some(field_id) => { @@ -109,7 +104,7 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) }; - let query_graph = parent_query_graph.clone(); + let query_graph = parent_query.clone(); RankingRuleOutputIterWrapper::new(Box::new(number_iter.chain(string_iter).map( move |r| { let (docids, _) = r?; @@ -119,7 +114,7 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, } None => RankingRuleOutputIterWrapper::new(Box::new(std::iter::empty())), }; - self.original_query = Some(parent_query_graph.clone()); + self.original_query = Some(parent_query.clone()); self.iter = Some(iter); Ok(()) } From 9259cdb12ea95aa3a7aacda6699709d2bd3f40d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 20 Mar 2023 09:30:30 
+0100 Subject: [PATCH 073/234] Update Cargo.lock (was mistakenly changed during rebase) --- Cargo.lock | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0bdad9131..ec2c7af1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -410,7 +410,7 @@ checksum = "b645a089122eccb6111b4f81cbc1a49f5900ac4666bb93ac027feaecf15607bf" [[package]] name = "benchmarks" -version = "1.1.0" +version = "1.0.0" dependencies = [ "anyhow", "bytes", @@ -1150,7 +1150,7 @@ dependencies = [ [[package]] name = "dump" -version = "1.1.0" +version = "1.0.0" dependencies = [ "anyhow", "big_s", @@ -1371,7 +1371,7 @@ dependencies = [ [[package]] name = "file-store" -version = "1.1.0" +version = "1.0.0" dependencies = [ "faux", "tempfile", @@ -1393,7 +1393,7 @@ dependencies = [ [[package]] name = "filter-parser" -version = "1.1.0" +version = "1.0.0" dependencies = [ "insta", "nom", @@ -1413,7 +1413,7 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "1.1.0" +version = "1.0.0" dependencies = [ "criterion", "serde_json", @@ -1890,7 +1890,7 @@ dependencies = [ [[package]] name = "index-scheduler" -version = "1.1.0" +version = "1.0.0" dependencies = [ "anyhow", "big_s", @@ -2049,7 +2049,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.1.0" +version = "1.0.0" dependencies = [ "criterion", "serde_json", @@ -2445,7 +2445,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.1.0" +version = "1.0.0" dependencies = [ "insta", "md5", @@ -2454,7 +2454,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.1.0" +version = "1.0.0" dependencies = [ "actix-cors", "actix-http", @@ -2542,7 +2542,7 @@ dependencies = [ [[package]] name = "meilisearch-auth" -version = "1.1.0" +version = "1.0.0" dependencies = [ "base64 0.13.1", "enum-iterator", @@ -2561,7 +2561,7 @@ dependencies = [ [[package]] name = "meilisearch-types" -version = "1.1.0" +version = "1.0.0" dependencies = [ "actix-web", "anyhow", @@ -2615,7 +2615,7 @@ dependencies = [ [[package]] name = "milli" -version = "1.1.0" +version = "1.0.0" dependencies = [ "big_s", "bimap", @@ -2970,7 +2970,7 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "permissive-json-pointer" -version = "1.1.0" +version = "1.0.0" dependencies = [ "big_s", "serde_json", From c63c7377e6ed793e68fc11ae81b9815d600ce543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 20 Mar 2023 09:37:11 +0100 Subject: [PATCH 074/234] Switch order of MappedInterner generic params --- milli/src/search/new/graph_based_ranking_rule.rs | 2 +- milli/src/search/new/interner.rs | 8 ++++---- milli/src/search/new/logger/detailed.rs | 10 +++++----- milli/src/search/new/logger/mod.rs | 8 ++++---- .../search/new/ranking_rule_graph/cheapest_paths.rs | 6 +++--- milli/src/search/new/ranking_rule_graph/mod.rs | 4 ++-- .../src/search/new/ranking_rule_graph/proximity/mod.rs | 2 +- milli/src/search/new/ranking_rule_graph/typo/mod.rs | 2 +- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 4a96855ce..efc5a6dcc 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -89,7 +89,7 @@ pub struct GraphBasedRankingRuleState { /// Cache used to optimistically discard paths that resolve to no documents. 
dead_ends_cache: DeadEndsCache, /// A structure giving the list of possible costs from each node to the end node - all_distances: MappedInterner, QueryNode>, + all_distances: MappedInterner>, /// An index in the first element of `all_distances`, giving the cost of the next bucket cur_distance_idx: usize, } diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index ea8b987fd..c26e18524 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -96,7 +96,7 @@ impl FixedSizeInterner { pub fn len(&self) -> u16 { self.stable_store.len() as u16 } - pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { + pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { MappedInterner { stable_store: self.stable_store.iter().map(map_f).collect(), _phantom: PhantomData, @@ -119,19 +119,19 @@ impl FixedSizeInterner { /// /// Values in this interner are indexed with [`Interned`]. #[derive(Clone)] -pub struct MappedInterner { +pub struct MappedInterner { stable_store: Vec, _phantom: PhantomData, } -impl MappedInterner { +impl MappedInterner { pub fn get(&self, interned: Interned) -> &T { &self.stable_store[interned.idx as usize] } pub fn get_mut(&mut self, interned: Interned) -> &mut T { &mut self.stable_store[interned.idx as usize] } - pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { + pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { MappedInterner { stable_store: self.stable_store.iter().map(map_f).collect(), _phantom: PhantomData, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index b519ca659..45a74a585 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -44,7 +44,7 @@ pub enum SearchEvents { paths: Vec>>, dead_ends_cache: DeadEndsCache, universe: RoaringBitmap, - distances: MappedInterner, QueryNode>, + distances: MappedInterner>, cost: u16, }, TypoState { @@ -52,7 +52,7 @@ pub enum SearchEvents { paths: Vec>>, dead_ends_cache: DeadEndsCache, universe: RoaringBitmap, - distances: MappedInterner, QueryNode>, + distances: MappedInterner>, cost: u16, }, RankingRuleSkipBucket { @@ -170,7 +170,7 @@ impl SearchLogger for DetailedSearchLogger { paths_map: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner, QueryNode>, + distances: &MappedInterner>, cost: u16, ) { self.events.push(SearchEvents::ProximityState { @@ -189,7 +189,7 @@ impl SearchLogger for DetailedSearchLogger { paths_map: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner, QueryNode>, + distances: &MappedInterner>, cost: u16, ) { self.events.push(SearchEvents::TypoState { @@ -525,7 +525,7 @@ shape: class" graph: &RankingRuleGraph, paths: &[Vec>], _dead_ends_cache: &DeadEndsCache, - distances: MappedInterner, QueryNode>, + distances: MappedInterner>, file: &mut File, ) { writeln!(file, "direction: right").unwrap(); diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 203ac7b56..3b8642cab 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -67,7 +67,7 @@ pub trait SearchLogger { paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner, QueryNode>, + distances: &MappedInterner>, cost: u16, ); @@ -78,7 +78,7 @@ pub trait SearchLogger { paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner, 
QueryNode>, + distances: &MappedInterner>, cost: u16, ); } @@ -138,7 +138,7 @@ impl SearchLogger for DefaultSearchLogger { _paths_map: &[Vec>], _dead_ends_cache: &DeadEndsCache, _universe: &RoaringBitmap, - _distances: &MappedInterner, QueryNode>, + _distances: &MappedInterner>, _cost: u16, ) { } @@ -149,7 +149,7 @@ impl SearchLogger for DefaultSearchLogger { _paths: &[Vec>], _dead_ends_cache: &DeadEndsCache, _universe: &RoaringBitmap, - _distances: &MappedInterner, QueryNode>, + _distances: &MappedInterner>, _cost: u16, ) { } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index c340ef8c7..c09f6e5e0 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -14,7 +14,7 @@ impl RankingRuleGraph { &mut self, from: Interned, cost: u16, - all_distances: &MappedInterner, QueryNode>, + all_distances: &MappedInterner>, dead_ends_cache: &mut DeadEndsCache, mut visit: impl FnMut( &[Interned], @@ -38,7 +38,7 @@ impl RankingRuleGraph { &mut self, from: Interned, cost: u16, - all_distances: &MappedInterner, QueryNode>, + all_distances: &MappedInterner>, dead_ends_cache: &mut DeadEndsCache, visit: &mut impl FnMut( &[Interned], @@ -137,7 +137,7 @@ impl RankingRuleGraph { Ok(any_valid) } - pub fn initialize_distances_with_necessary_edges(&self) -> MappedInterner, QueryNode> { + pub fn initialize_distances_with_necessary_edges(&self) -> MappedInterner> { let mut distances_to_end = self.query_graph.nodes.map(|_| vec![]); let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 129590088..34363febe 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -114,7 +114,7 @@ pub trait RankingRuleGraphTrait: Sized { paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner, QueryNode>, + distances: &MappedInterner>, cost: u16, logger: &mut dyn SearchLogger, ); @@ -127,7 +127,7 @@ pub trait RankingRuleGraphTrait: Sized { pub struct RankingRuleGraph { pub query_graph: QueryGraph, pub edges_store: FixedSizeInterner>>, - pub edges_of_node: MappedInterner>>, QueryNode>, + pub edges_of_node: MappedInterner>>>, pub conditions_interner: FixedSizeInterner, } impl Clone for RankingRuleGraph { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 4c058ac8e..568a2c2b0 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -67,7 +67,7 @@ impl RankingRuleGraphTrait for ProximityGraph { paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner, QueryNode>, + distances: &MappedInterner>, cost: u16, logger: &mut dyn SearchLogger, ) { diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 6b8fc1154..a32274954 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -138,7 +138,7 @@ impl RankingRuleGraphTrait for TypoGraph { paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner, QueryNode>, + distances: &MappedInterner>, cost: u16, logger: &mut dyn 
SearchLogger, ) { From 272cd7ebbdbf274b61eacb3b6a5d0d0d8b15f23a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 20 Mar 2023 13:39:19 +0100 Subject: [PATCH 075/234] Small cleanup --- milli/src/search/new/logger/detailed.rs | 6 +++--- .../src/search/new/ranking_rule_graph/mod.rs | 3 --- .../search/new/ranking_rule_graph/typo/mod.rs | 1 + milli/src/search/new/small_bitmap.rs | 21 ------------------- 4 files changed, 4 insertions(+), 27 deletions(-) diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 45a74a585..19a2679a8 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -488,9 +488,9 @@ shape: class" let p = ctx.word_interner.get(*use_prefix_db); writeln!(file, "use prefix DB : {p}").unwrap(); } - for d in distances.iter() { - writeln!(file, "\"d_{d}\" : distance").unwrap(); - } + // for d in distances.iter() { + // writeln!(file, "\"d_{d}\" : distance").unwrap(); + // } writeln!(file, "}}").unwrap(); } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 34363febe..528ff3107 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -71,9 +71,6 @@ impl PartialEq for Edge { /// 1. Retrieve the set of edges (their cost and condition) between two nodes. /// 2. Compute the document ids satisfying a condition pub trait RankingRuleGraphTrait: Sized { - /// The condition of an edge connecting two query nodes. The condition - /// should be sufficient to compute the edge's cost and associated document ids - /// in [`resolve_condition`](RankingRuleGraphTrait::resolve_condition). type Condition: Sized + Clone + PartialEq + Eq + Hash; /// Return the label of the given edge condition, to be used when visualising diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index a32274954..32b905244 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -74,6 +74,7 @@ impl RankingRuleGraphTrait for TypoGraph { is_prefix: term.is_prefix, zero_typo: term.zero_typo, prefix_of: term.prefix_of, + // TODO: debatable synonyms: term.synonyms, split_words: None, one_typo: Box::new([]), diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index 1fdd31346..503bd72f5 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -272,24 +272,3 @@ impl<'b> Iterator for SmallBitmapInternalIter<'b> { } } } - -// #[cfg(test)] -// mod tests { -// use super::SmallBitmap; - -// #[test] -// fn test_small_bitmap() { -// let mut bitmap1 = SmallBitmap::new(32); -// for x in 0..16 { -// bitmap1.insert(x * 2); -// } -// let mut bitmap2 = SmallBitmap::new(32); -// for x in 0..=10 { -// bitmap2.insert(x * 3); -// } -// bitmap1.intersection(&bitmap2); -// for v in bitmap1.iter() { -// println!("{v}"); -// } -// } -// } From 83e5b4ed0d667bd3dbf6c60a2892445e0e94f092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Mar 2023 10:44:40 +0100 Subject: [PATCH 076/234] Compute edges of proximity graph lazily --- milli/src/search/mod.rs | 1 + .../search/new/graph_based_ranking_rule.rs | 7 +- milli/src/search/new/interner.rs | 2 +- milli/src/search/new/mod.rs | 365 ------------------ .../condition_docids_cache.rs | 40 +- .../src/search/new/ranking_rule_graph/mod.rs | 14 +-
.../new/ranking_rule_graph/proximity/build.rs | 235 ++--------- .../proximity/compute_docids.rs | 331 +++++++++++----- .../new/ranking_rule_graph/proximity/mod.rs | 140 +------ .../search/new/ranking_rule_graph/typo/mod.rs | 44 ++- milli/src/search/new/ranking_rules.rs | 3 +- milli/src/search/new/words.rs | 4 +- 12 files changed, 345 insertions(+), 841 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 5e741c7f3..dc236dd0d 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -367,6 +367,7 @@ pub fn word_derivations<'c>( match cache.entry((word.to_string(), is_prefix, max_typo)) { Entry::Occupied(entry) => Ok(entry.into_mut()), Entry::Vacant(entry) => { + // println!("word derivations {word} {is_prefix} {max_typo}"); let mut derived_words = Vec::new(); if max_typo == 0 { if is_prefix { diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index efc5a6dcc..f83f01074 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -318,9 +318,10 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let mut used_words = HashSet::new(); let mut used_phrases = HashSet::new(); for condition in used_conditions.iter() { - let condition = graph.conditions_interner.get(condition); - used_words.extend(G::words_used_by_condition(ctx, condition)?); - used_phrases.extend(G::phrases_used_by_condition(ctx, condition)?); + let (ws, ps) = + condition_docids_cache.get_condition_used_words_and_phrases(condition); + used_words.extend(ws); + used_phrases.extend(ps); } // 2. Remove the unused words and phrases from all the nodes in the graph let mut nodes_to_remove = vec![]; diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index c26e18524..b8f54d087 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -30,7 +30,7 @@ impl Interned { #[derive(Clone)] pub struct DedupInterner { stable_store: Vec, - lookup: FxHashMap>, + lookup: FxHashMap>, // TODO: Arc } impl Default for DedupInterner { fn default() -> Self { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index c4e494242..44e26a9ea 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -287,368 +287,3 @@ impl<'a> Search<'a> { todo!() } } - -#[cfg(test)] -mod tests { - // use crate::allocator::ALLOC; - use std::fs::File; - use std::io::{BufRead, BufReader, Cursor, Seek}; - use std::time::Instant; - - use big_s::S; - use heed::EnvOpenOptions; - use maplit::hashset; - - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - // use crate::search::new::logger::detailed::DetailedSearchLogger; - use crate::search::new::logger::DefaultSearchLogger; - use crate::search::new::{execute_search, SearchContext}; - use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; - use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy}; - - #[test] - fn search_wiki_new() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_wiki").unwrap(); - let txn = index.read_txn().unwrap(); - - println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - - loop { - let start = Instant::now(); - - // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); - let mut ctx = SearchContext::new(&index, &txn); - let 
results = execute_search( - &mut ctx, - "released from prison by the government", - // "which a the releases from poison by the government", - // "sun flower s are the best", - // "zero config", - TermsMatchingStrategy::Last, - None, - 0, - 20, - &mut DefaultSearchLogger, - &mut DefaultSearchLogger, - // &mut logger, - ) - .unwrap(); - - // logger.write_d2_description(&mut ctx); - - let elapsed = start.elapsed(); - println!("{}us", elapsed.as_micros()); - - let _documents = index - .documents(&txn, results.documents_ids.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); - - println!("{}us: {:?}", elapsed.as_micros(), results); - } - // for (id, document) in documents { - // println!("{id}:"); - // // println!("{document}"); - // } - } - - #[test] - fn search_wiki_old() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_wiki").unwrap(); - - let txn = index.read_txn().unwrap(); - - let rr = index.criteria(&txn).unwrap(); - println!("{rr:?}"); - - let start = Instant::now(); - - let mut s = Search::new(&txn, &index); - s.query( - // "which a the releases from poison by the government", - // "sun flower s are the best", - "zero config", - ); - s.terms_matching_strategy(TermsMatchingStrategy::Last); - // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlyIterative); - let docs = s.execute().unwrap(); - - let elapsed = start.elapsed(); - - let documents = index - .documents(&txn, docs.documents_ids.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); - - println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); - for (id, _document) in documents { - println!("{id}:"); - // println!("{document}"); - } - } - #[test] - fn search_movies_new() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_movies").unwrap(); - let txn = index.read_txn().unwrap(); - - // let primary_key = index.primary_key(&txn).unwrap().unwrap(); - // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); - // loop { - let start = Instant::now(); - - let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); - let mut ctx = SearchContext::new(&index, &txn); - let results = execute_search( - &mut ctx, - "releases from poison by the government", - TermsMatchingStrategy::Last, - None, - 0, - 20, - &mut DefaultSearchLogger, - &mut logger, - ) - .unwrap(); - - logger.write_d2_description(&mut ctx); - - let elapsed = start.elapsed(); - - // let ids = index - // .documents(&txn, results.iter().copied()) - // .unwrap() - // .into_iter() - // .map(|x| { - // let obkv = &x.1; - // let id = obkv.get(primary_key).unwrap(); - // let id: 
serde_json::Value = serde_json::from_slice(id).unwrap(); - // id.as_str().unwrap().to_owned() - // }) - // .collect::>(); - - println!("{}us: {results:?}", elapsed.as_micros()); - // println!("external ids: {ids:?}"); - // } - } - - #[test] - fn search_movies_old() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_movies").unwrap(); - - let txn = index.read_txn().unwrap(); - - let rr = index.criteria(&txn).unwrap(); - println!("{rr:?}"); - - let primary_key = index.primary_key(&txn).unwrap().unwrap(); - let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); - - let start = Instant::now(); - - let mut s = Search::new(&txn, &index); - s.query("which a the releases from poison by the government"); - s.terms_matching_strategy(TermsMatchingStrategy::Last); - s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); - let docs = s.execute().unwrap(); - - let elapsed = start.elapsed(); - - let ids = index - .documents(&txn, docs.documents_ids.iter().copied()) - .unwrap() - .into_iter() - .map(|x| { - let obkv = &x.1; - let id = obkv.get(primary_key).unwrap(); - let id: serde_json::Value = serde_json::from_slice(id).unwrap(); - id.as_str().unwrap().to_owned() - }) - .collect::>(); - - println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); - println!("external ids: {ids:?}"); - } - - #[test] - fn _settings_movies() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_movies").unwrap(); - let mut wtxn = index.write_txn().unwrap(); - - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_min_word_len_one_typo(5); - builder.set_min_word_len_two_typos(100); - builder.set_sortable_fields(hashset! 
{ S("release_date") }); - builder.set_criteria(vec![ - Criterion::Words, - Criterion::Typo, - Criterion::Proximity, - Criterion::Asc("release_date".to_owned()), - ]); - - builder.execute(|_| (), || false).unwrap(); - wtxn.commit().unwrap(); - } - - #[test] - fn _index_movies() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_movies").unwrap(); - let mut wtxn = index.write_txn().unwrap(); - - let primary_key = "id"; - let searchable_fields = vec!["title", "overview"]; - let filterable_fields = vec!["release_date", "genres"]; - - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(primary_key.to_owned()); - let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); - builder.set_filterable_fields(filterable_fields); - - builder.set_min_word_len_one_typo(5); - builder.set_min_word_len_two_typos(100); - builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); - builder.execute(|_| (), || false).unwrap(); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) - .unwrap(); - - let documents = documents_from( - "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json", - "json", - ); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); - } - #[test] - fn _index_wiki() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_wiki").unwrap(); - let mut wtxn = index.write_txn().unwrap(); - - // let primary_key = "id"; - let searchable_fields = vec!["body", "title", "url"]; - // let filterable_fields = vec![]; - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - // builder.set_primary_key(primary_key.to_owned()); - let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); - // builder.set_filterable_fields(filterable_fields); - - // builder.set_min_word_len_one_typo(5); - // builder.set_min_word_len_two_typos(100); - builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]); - builder.execute(|_| (), || false).unwrap(); - - let config = IndexerConfig::default(); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) - .unwrap(); - - let documents = documents_from( - "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv", - "csv", - ); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); - } - - fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { - let reader = File::open(filename) - .unwrap_or_else(|_| panic!("could not 
find the dataset in: {}", filename)); - let reader = BufReader::new(reader); - let documents = match filetype { - "csv" => documents_from_csv(reader).unwrap(), - "json" => documents_from_json(reader).unwrap(), - "jsonl" => documents_from_jsonl(reader).unwrap(), - otherwise => panic!("invalid update format {:?}", otherwise), - }; - DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() - } - - fn documents_from_jsonl(reader: impl BufRead) -> crate::Result> { - let mut documents = DocumentsBatchBuilder::new(Vec::new()); - - for result in serde_json::Deserializer::from_reader(reader).into_iter::() { - let object = result.unwrap(); - documents.append_json_object(&object)?; - } - - documents.into_inner().map_err(Into::into) - } - - fn documents_from_json(reader: impl BufRead) -> crate::Result> { - let mut documents = DocumentsBatchBuilder::new(Vec::new()); - - documents.append_json_array(reader)?; - - documents.into_inner().map_err(Into::into) - } - - fn documents_from_csv(reader: impl BufRead) -> crate::Result> { - let csv = csv::Reader::from_reader(reader); - - let mut documents = DocumentsBatchBuilder::new(Vec::new()); - documents.append_csv(csv)?; - - documents.into_inner().map_err(Into::into) - } -} diff --git a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs index a3d2ae419..15d82a2be 100644 --- a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs @@ -1,19 +1,28 @@ use std::marker::PhantomData; -use fxhash::FxHashMap; +use fxhash::{FxHashMap, FxHashSet}; use roaring::RoaringBitmap; use super::{RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::Interned; +use crate::search::new::query_term::Phrase; use crate::search::new::SearchContext; use crate::Result; // TODO: give a generation to each universe, then be able to get the exact // delta of docids between two universes of different generations! +#[derive(Default)] +pub struct ComputedCondition { + docids: RoaringBitmap, + universe_len: u64, + used_words: FxHashSet>, + used_phrases: FxHashSet>, +} + /// A cache storing the document ids associated with each ranking rule edge pub struct ConditionDocIdsCache { - pub cache: FxHashMap, (u64, RoaringBitmap)>, + pub cache: FxHashMap, ComputedCondition>, _phantom: PhantomData, } impl Default for ConditionDocIdsCache { @@ -22,6 +31,14 @@ impl Default for ConditionDocIdsCache { } } impl ConditionDocIdsCache { + pub fn get_condition_used_words_and_phrases( + &mut self, + interned_condition: Interned, + ) -> (&FxHashSet>, &FxHashSet>) { + let ComputedCondition { used_words, used_phrases, .. } = &self.cache[&interned_condition]; + (used_words, used_phrases) + } + /// Retrieve the document ids for the given edge condition. /// /// If the cache does not yet contain these docids, they are computed @@ -30,14 +47,14 @@ impl ConditionDocIdsCache { &'s mut self, ctx: &mut SearchContext<'ctx>, interned_condition: Interned, - graph: &RankingRuleGraph, - // TODO: maybe universe doesn't belong here + graph: &mut RankingRuleGraph, universe: &RoaringBitmap, ) -> Result<&'s RoaringBitmap> { if self.cache.contains_key(&interned_condition) { // TODO compare length of universe compared to the one in self // if it is smaller, then update the value - let (universe_len, docids) = self.cache.entry(interned_condition).or_default(); + let ComputedCondition { docids, universe_len, .. 
} = + self.cache.entry(interned_condition).or_default(); if *universe_len == universe.len() { return Ok(docids); } else { @@ -46,12 +63,13 @@ impl ConditionDocIdsCache { return Ok(docids); } } - // TODO: maybe universe doesn't belong here - let condition = graph.conditions_interner.get(interned_condition); - // TODO: faster way to do this? - let docids = G::resolve_condition(ctx, condition, universe)?; - let _ = self.cache.insert(interned_condition, (universe.len(), docids)); - let (_, docids) = &self.cache[&interned_condition]; + let condition = graph.conditions_interner.get_mut(interned_condition); + let (docids, used_words, used_phrases) = G::resolve_condition(ctx, condition, universe)?; + let _ = self.cache.insert( + interned_condition, + ComputedCondition { docids, universe_len: universe.len(), used_words, used_phrases }, + ); + let ComputedCondition { docids, .. } = &self.cache[&interned_condition]; Ok(docids) } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 528ff3107..baeb8bb71 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -15,11 +15,11 @@ mod proximity; /// Implementation of the `typo` ranking rule mod typo; -use std::collections::HashSet; use std::hash::Hash; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; +use fxhash::FxHashSet; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; @@ -80,23 +80,13 @@ pub trait RankingRuleGraphTrait: Sized { condition: &Self::Condition, ) -> Result; - fn words_used_by_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, - condition: &Self::Condition, - ) -> Result>>; - - fn phrases_used_by_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, - condition: &Self::Condition, - ) -> Result>>; - /// Compute the document ids associated with the given edge condition, /// restricted to the given universe. 
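///
/// As of this patch, the result also carries the sets of words and phrases
/// that were used to compute the docids, matching the fields of the
/// `ComputedCondition` cache entry above. Schematically (the interned types
/// are an assumption, written out here for clarity):
///
///     (docids: RoaringBitmap,
///      used_words: FxHashSet<Interned<String>>,
///      used_phrases: FxHashSet<Interned<Phrase>>)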
fn resolve_condition<'ctx>( ctx: &mut SearchContext<'ctx>, condition: &Self::Condition, universe: &RoaringBitmap, - ) -> Result; + ) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)>; /// Return the costs and conditions of the edges going from the source node to the destination node fn build_edges<'ctx>( diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index d0977d732..097120b49 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -1,56 +1,18 @@ #![allow(clippy::too_many_arguments)] -use std::collections::BTreeMap; - -use heed::RoTxn; use super::ProximityCondition; -use crate::search::new::db_cache::DatabaseCache; use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_graph::QueryNodeData; -use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; -use crate::search::new::ranking_rule_graph::proximity::WordPair; +use crate::search::new::query_term::LocatedQueryTerm; use crate::search::new::{QueryNode, SearchContext}; use crate::Result; -fn last_word_of_term_iter<'t>( - t: &'t QueryTerm, - phrase_interner: &'t DedupInterner, -) -> impl Iterator>, Interned)> + 't { - t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map( - move |p| { - let phrase = phrase_interner.get(p); - phrase.words.last().unwrap().map(|last| (Some(p), last)) - }, - )) -} -fn first_word_of_term_iter<'t>( - t: &'t QueryTerm, - phrase_interner: &'t DedupInterner, -) -> impl Iterator, Option>)> + 't { - t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map( - move |p| { - let phrase = phrase_interner.get(p); - phrase.words.first().unwrap().map(|first| (first, Some(p))) - }, - )) -} - pub fn build_edges<'ctx>( - ctx: &mut SearchContext<'ctx>, + _ctx: &mut SearchContext<'ctx>, conditions_interner: &mut DedupInterner, from_node: &QueryNode, to_node: &QueryNode, ) -> Result>)>> { - let SearchContext { - index, - txn, - db_cache, - word_interner, - phrase_interner, - term_interner, - term_docids: _, - } = ctx; - let right_term = match &to_node.data { QueryNodeData::End => return Ok(vec![(0, None)]), QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]), @@ -59,13 +21,11 @@ pub fn build_edges<'ctx>( let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term; - let (right_term, right_start_position, right_ngram_length) = - (term_interner.get(*right_term_interned), *right_positions.start(), right_positions.len()); + let (right_start_position, right_ngram_length) = + (*right_positions.start(), right_positions.len()); - let (left_term, left_end_position) = match &from_node.data { - QueryNodeData::Term(LocatedQueryTerm { value, positions }) => { - (term_interner.get(*value), *positions.end()) - } + let (left_term_interned, left_end_position) = match &from_node.data { + QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()), QueryNodeData::Deleted => return Ok(vec![]), QueryNodeData::Start => { return Ok(vec![( @@ -94,175 +54,24 @@ pub fn build_edges<'ctx>( )]); } - let mut cost_word_pairs = BTreeMap::>::new(); - - if let Some(right_prefix) = right_term.use_prefix_db { - for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) { - add_prefix_edges( - index, - txn, - db_cache, - word_interner, - right_ngram_length, - left_word, - right_prefix, 
- &mut cost_word_pairs, - left_phrase, - )?; - } - } - - // TODO: add safeguard in case the cartesian product is too large! - // even if we restrict the word derivations to a maximum of 100, the size of the - // caterisan product could reach a maximum of 10_000 derivations, which is way too much. - // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo - // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been - // reached - - for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) { - for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) { - add_non_prefix_edges( - index, - txn, - db_cache, - word_interner, - right_ngram_length, - left_word, - right_word, - &mut cost_word_pairs, - &[left_phrase, right_phrase].iter().copied().flatten().collect::>(), - )?; - } - } - - let mut new_edges = cost_word_pairs - .into_iter() - .map(|(cost, word_pairs)| { - ( + let mut conditions = vec![]; + for cost in right_ngram_length..(7 + right_ngram_length) { + let cost = cost as u8; + conditions.push(( + cost, + Some(conditions_interner.insert(ProximityCondition::Uninit { + left_term: left_term_interned, + right_term: *right_term_interned, + right_term_ngram_len: right_ngram_length as u8, cost, - Some( - conditions_interner - .insert(ProximityCondition::Pairs { pairs: word_pairs.into_boxed_slice() }), - ), - ) - }) - .collect::>(); - new_edges.push(( - 8 + (right_ngram_length - 1) as u8, + })), + )) + } + + conditions.push(( + (7 + right_ngram_length) as u8, Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })), )); - Ok(new_edges) -} -fn add_prefix_edges<'ctx>( - index: &mut &crate::Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - word_interner: &mut DedupInterner, - right_ngram_length: usize, - left_word: Interned, - right_prefix: Interned, - cost_proximity_word_pairs: &mut BTreeMap>, - left_phrase: Option>, -) -> Result<()> { - for proximity in 1..=(8 - right_ngram_length) { - let cost = (proximity + right_ngram_length - 1) as u8; - // TODO: if we had access to the universe here, we could already check whether - // the bitmap corresponding to this word pair is disjoint with the universe or not - if db_cache - .get_word_prefix_pair_proximity_docids( - index, - txn, - word_interner, - left_word, - right_prefix, - proximity as u8, - )? - .is_some() - { - cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefix { - phrases: left_phrase.into_iter().collect(), - left: left_word, - right_prefix, - proximity: proximity as u8, - }); - } - - // No swapping when computing the proximity between a phrase and a word - if left_phrase.is_none() - && db_cache - .get_prefix_word_pair_proximity_docids( - index, - txn, - word_interner, - right_prefix, - left_word, - proximity as u8 - 1, - )? 
- .is_some() - { - cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefixSwapped { - left_prefix: right_prefix, - right: left_word, - proximity: proximity as u8 - 1, - }); - } - } - Ok(()) -} - -fn add_non_prefix_edges<'ctx>( - index: &mut &crate::Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - word_interner: &mut DedupInterner, - right_ngram_length: usize, - word1: Interned, - word2: Interned, - cost_proximity_word_pairs: &mut BTreeMap>, - phrases: &[Interned], -) -> Result<()> { - for proximity in 1..=(8 - right_ngram_length) { - let cost = (proximity + right_ngram_length - 1) as u8; - if db_cache - .get_word_pair_proximity_docids( - index, - txn, - word_interner, - word1, - word2, - proximity as u8, - )? - .is_some() - { - cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words { - phrases: phrases.to_vec(), - left: word1, - right: word2, - proximity: proximity as u8, - }); - } - if proximity > 1 - // no swapping when either term is a phrase - && phrases.is_empty() - && db_cache - .get_word_pair_proximity_docids( - index, - txn, - word_interner, - word2, - word1, - proximity as u8 - 1, - )? - .is_some() - { - cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words { - phrases: vec![], - left: word2, - right: word1, - proximity: proximity as u8 - 1, - }); - } - } - Ok(()) + Ok(conditions) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index bf5278f8d..eabdb2cb1 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,6 +1,15 @@ +#![allow(clippy::too_many_arguments)] + +use std::iter::FromIterator; + +use fxhash::FxHashSet; +use heed::RoTxn; use roaring::RoaringBitmap; -use super::{ProximityCondition, WordPair}; +use super::ProximityCondition; +use crate::search::new::db_cache::DatabaseCache; +use crate::search::new::interner::{DedupInterner, Interned}; +use crate::search::new::query_term::{Phrase, QueryTerm}; use crate::search::new::SearchContext; use crate::{CboRoaringBitmapCodec, Result}; @@ -8,7 +17,7 @@ pub fn compute_docids<'ctx>( ctx: &mut SearchContext<'ctx>, condition: &ProximityCondition, universe: &RoaringBitmap, -) -> Result { +) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)> { let SearchContext { index, txn, @@ -18,96 +27,238 @@ pub fn compute_docids<'ctx>( phrase_interner, term_interner, } = ctx; - let pairs = match condition { - ProximityCondition::Term { term } => { - return term_docids - .get_query_term_docids( - index, - txn, - db_cache, - word_interner, - term_interner, - phrase_interner, - *term, - ) - .cloned() + + let (left_term, right_term, right_term_ngram_len, cost) = match condition { + ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => { + (*left_term, *right_term, *right_term_ngram_len, *cost) + } + ProximityCondition::Term { term } => { + let term_v = term_interner.get(*term); + return Ok(( + term_docids + .get_query_term_docids( + index, + txn, + db_cache, + word_interner, + term_interner, + phrase_interner, + *term, + )? 
+ .clone(), + FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()), + FxHashSet::from_iter(term_v.all_phrases()), + )); } - ProximityCondition::Pairs { pairs } => pairs, }; - let mut pair_docids = RoaringBitmap::new(); - for pair in pairs.iter() { - let pair = match pair { - WordPair::Words { phrases, left, right, proximity } => { - let mut docids = db_cache - .get_word_pair_proximity_docids( - index, - txn, - word_interner, - *left, - *right, - *proximity, - )? - .map(CboRoaringBitmapCodec::deserialize_from) - .transpose()? - .unwrap_or_default(); - if !docids.is_empty() { - for phrase in phrases { - docids &= ctx.term_docids.get_phrase_docids( - index, - txn, - db_cache, - word_interner, - &ctx.phrase_interner, - *phrase, - )?; - } - } - docids - } - WordPair::WordPrefix { phrases, left, right_prefix, proximity } => { - let mut docids = db_cache - .get_word_prefix_pair_proximity_docids( - index, - txn, - word_interner, - *left, - *right_prefix, - *proximity, - )? - .map(CboRoaringBitmapCodec::deserialize_from) - .transpose()? - .unwrap_or_default(); - if !docids.is_empty() { - for phrase in phrases { - docids &= ctx.term_docids.get_phrase_docids( - index, - txn, - db_cache, - word_interner, - &ctx.phrase_interner, - *phrase, - )?; - } - } - docids - } - WordPair::WordPrefixSwapped { left_prefix, right, proximity } => db_cache - .get_prefix_word_pair_proximity_docids( - index, - txn, - word_interner, - *left_prefix, - *right, - *proximity, - )? - .map(CboRoaringBitmapCodec::deserialize_from) - .transpose()? - .unwrap_or_default(), - }; - // TODO: deserialize bitmap within a universe - let bitmap = universe & pair; - pair_docids |= bitmap; + + let left_term = term_interner.get(left_term); + let right_term = term_interner.get(right_term); + + // e.g. for the simple words `sun .. flower` + // the cost is 5 + // the forward proximity is 5 + // the backward proximity is 4 + // + // for the 2gram `the sunflower` + // the cost is 5 + // the forward proximity is 4 + // the backward proximity is 3 + let forward_proximity = 1 + cost - right_term_ngram_len; + let backward_proximity = cost - right_term_ngram_len; + + let mut used_words = FxHashSet::default(); + let mut used_phrases = FxHashSet::default(); + + let mut docids = RoaringBitmap::new(); + + if let Some(right_prefix) = right_term.use_prefix_db { + for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) { + compute_prefix_edges( + index, + txn, + db_cache, + word_interner, + left_word, + right_prefix, + left_phrase, + forward_proximity, + backward_proximity, + &mut docids, + universe, + &mut used_words, + &mut used_phrases, + )?; + } } - Ok(pair_docids) + + // TODO: add safeguard in case the cartesian product is too large! + // even if we restrict the word derivations to a maximum of 100, the size of the + // cartesian product could reach a maximum of 10_000 derivations, which is way too much. + // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo + // + one-typo/zero-typo, then one-typo/one-typo, then ...
until an arbitrary limit has been + // reached + + for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) { + for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) { + compute_non_prefix_edges( + index, + txn, + db_cache, + word_interner, + left_word, + right_word, + &[left_phrase, right_phrase].iter().copied().flatten().collect::>(), + forward_proximity, + backward_proximity, + &mut docids, + universe, + &mut used_words, + &mut used_phrases, + )?; + } + } + + Ok((docids, used_words, used_phrases)) +} + +fn compute_prefix_edges<'ctx>( + index: &mut &crate::Index, + txn: &'ctx RoTxn, + db_cache: &mut DatabaseCache<'ctx>, + word_interner: &mut DedupInterner, + left_word: Interned, + right_prefix: Interned, + left_phrase: Option>, + forward_proximity: u8, + backward_proximity: u8, + docids: &mut RoaringBitmap, + universe: &RoaringBitmap, + used_words: &mut FxHashSet>, + used_phrases: &mut FxHashSet>, +) -> Result<()> { + if let Some(phrase) = left_phrase { + // TODO: compute the phrase, take the intersection between + // the phrase and the docids + used_phrases.insert(phrase); // This is not fully correct + } + + if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids( + index, + txn, + word_interner, + left_word, + right_prefix, + forward_proximity, + )? { + let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + if !new_docids.is_empty() { + used_words.insert(left_word); + used_words.insert(right_prefix); + *docids |= new_docids; + } + } + + // No swapping when computing the proximity between a phrase and a word + if left_phrase.is_none() { + if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids( + index, + txn, + word_interner, + right_prefix, + left_word, + backward_proximity, + )? { + let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + if !new_docids.is_empty() { + used_words.insert(left_word); + used_words.insert(right_prefix); + *docids |= new_docids; + } + } + } + + Ok(()) +} + +fn compute_non_prefix_edges<'ctx>( + index: &mut &crate::Index, + txn: &'ctx RoTxn, + db_cache: &mut DatabaseCache<'ctx>, + word_interner: &mut DedupInterner, + word1: Interned, + word2: Interned, + phrases: &[Interned], + forward_proximity: u8, + backward_proximity: u8, + docids: &mut RoaringBitmap, + universe: &RoaringBitmap, + used_words: &mut FxHashSet>, + used_phrases: &mut FxHashSet>, +) -> Result<()> { + if !phrases.is_empty() { + // TODO: compute the docids associated with these phrases + // take their intersection with the new docids + used_phrases.extend(phrases); // This is not fully correct + } + if let Some(new_docids) = db_cache.get_word_pair_proximity_docids( + index, + txn, + word_interner, + word1, + word2, + forward_proximity, + )? { + let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + if !new_docids.is_empty() { + used_words.insert(word1); + used_words.insert(word2); + *docids |= new_docids; + } + } + if backward_proximity >= 1 + // no swapping when either term is a phrase + && phrases.is_empty() + { + if let Some(new_docids) = db_cache.get_word_pair_proximity_docids( + index, + txn, + word_interner, + word2, + word1, + backward_proximity, + )? 
{ + let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + if !new_docids.is_empty() { + used_words.insert(word1); + used_words.insert(word2); + *docids |= new_docids; + } + } + } + + Ok(()) +} + +fn last_word_of_term_iter<'t>( + t: &'t QueryTerm, + phrase_interner: &'t DedupInterner, +) -> impl Iterator>, Interned)> + 't { + t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map( + move |p| { + let phrase = phrase_interner.get(p); + phrase.words.last().unwrap().map(|last| (Some(p), last)) + }, + )) +} +fn first_word_of_term_iter<'t>( + t: &'t QueryTerm, + phrase_interner: &'t DedupInterner, +) -> impl Iterator, Option>)> + 't { + t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map( + move |p| { + let phrase = phrase_interner.get(p); + phrase.words.first().unwrap().map(|first| (first, Some(p))) + }, + )) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 568a2c2b0..7b8a066ab 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -1,9 +1,7 @@ pub mod build; pub mod compute_docids; -use std::collections::HashSet; -use std::iter::FromIterator; - +use fxhash::FxHashSet; use roaring::RoaringBitmap; use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; @@ -13,31 +11,17 @@ use crate::search::new::query_term::{Phrase, QueryTerm}; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum WordPair { - Words { - phrases: Vec>, - left: Interned, - right: Interned, - proximity: u8, - }, - WordPrefix { - phrases: Vec>, - left: Interned, - right_prefix: Interned, - proximity: u8, - }, - WordPrefixSwapped { - left_prefix: Interned, - right: Interned, - proximity: u8, - }, -} - #[derive(Clone, PartialEq, Eq, Hash)] pub enum ProximityCondition { - Term { term: Interned }, - Pairs { pairs: Box<[WordPair]> }, + Uninit { + left_term: Interned, + right_term: Interned, + right_term_ngram_len: u8, + cost: u8, + }, + Term { + term: Interned, + }, } pub enum ProximityGraph {} @@ -49,7 +33,8 @@ impl RankingRuleGraphTrait for ProximityGraph { ctx: &mut SearchContext<'ctx>, condition: &Self::Condition, universe: &RoaringBitmap, - ) -> Result { + ) -> Result<(roaring::RoaringBitmap, FxHashSet>, FxHashSet>)> + { compute_docids::compute_docids(ctx, condition, universe) } @@ -79,107 +64,14 @@ impl RankingRuleGraphTrait for ProximityGraph { condition: &Self::Condition, ) -> Result { match condition { + ProximityCondition::Uninit { cost, .. 
} => { + // TODO + Ok(format!("{cost}: cost")) + } ProximityCondition::Term { term } => { let term = ctx.term_interner.get(*term); Ok(format!("{} : exists", ctx.word_interner.get(term.original))) } - ProximityCondition::Pairs { pairs } => { - let mut s = String::new(); - for pair in pairs.iter() { - match pair { - WordPair::Words { phrases, left, right, proximity } => { - let left = ctx.word_interner.get(*left); - let right = ctx.word_interner.get(*right); - if !phrases.is_empty() { - s.push_str(&format!("{} phrases + ", phrases.len())); - } - s.push_str(&format!("\"{left} {right}\": {proximity}\n")); - } - WordPair::WordPrefix { phrases, left, right_prefix, proximity } => { - let left = ctx.word_interner.get(*left); - let right = ctx.word_interner.get(*right_prefix); - if !phrases.is_empty() { - s.push_str(&format!("{} phrases + ", phrases.len())); - } - s.push_str(&format!("\"{left} {right}...\" : {proximity}\n")); - } - WordPair::WordPrefixSwapped { left_prefix, right, proximity } => { - let left = ctx.word_interner.get(*left_prefix); - let right = ctx.word_interner.get(*right); - s.push_str(&format!("\"{left}... {right}\" : {proximity}\n")); - } - } - } - Ok(s) - } - } - } - - fn words_used_by_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, - condition: &Self::Condition, - ) -> Result>> { - match condition { - ProximityCondition::Term { term } => { - let term = ctx.term_interner.get(*term); - Ok(HashSet::from_iter(term.all_single_words_except_prefix_db())) - } - ProximityCondition::Pairs { pairs } => { - let mut set = HashSet::new(); - for pair in pairs.iter() { - match pair { - WordPair::Words { phrases: _, left, right, proximity: _ } => { - set.insert(*left); - set.insert(*right); - } - WordPair::WordPrefix { phrases: _, left, right_prefix, proximity: _ } => { - set.insert(*left); - // TODO: this is not correct, there should be another trait method for collecting the prefixes - // to be used with the prefix DBs - set.insert(*right_prefix); - } - WordPair::WordPrefixSwapped { left_prefix, right, proximity: _ } => { - // TODO: this is not correct, there should be another trait method for collecting the prefixes - // to be used with the prefix DBs - set.insert(*left_prefix); - set.insert(*right); - } - } - } - Ok(set) - } - } - } - - fn phrases_used_by_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, - condition: &Self::Condition, - ) -> Result>> { - match condition { - ProximityCondition::Term { term } => { - let term = ctx.term_interner.get(*term); - Ok(HashSet::from_iter(term.all_phrases())) - } - ProximityCondition::Pairs { pairs } => { - let mut set = HashSet::new(); - for pair in pairs.iter() { - match pair { - WordPair::Words { phrases, left: _, right: _, proximity: _ } => { - set.extend(phrases.iter().copied()); - } - WordPair::WordPrefix { - phrases, - left: _, - right_prefix: _, - proximity: _, - } => { - set.extend(phrases.iter().copied()); - } - WordPair::WordPrefixSwapped { left_prefix: _, right: _, proximity: _ } => {} - } - } - Ok(set) - } } } } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 32b905244..4ef0d15d1 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,7 +1,8 @@ -use std::collections::HashSet; +// use std::collections::HashSet; use std::fmt::Write; use std::iter::FromIterator; +use fxhash::FxHashSet; use roaring::RoaringBitmap; use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; @@ -26,7 
+27,7 @@ impl RankingRuleGraphTrait for TypoGraph { ctx: &mut SearchContext<'ctx>, condition: &Self::Condition, universe: &RoaringBitmap, - ) -> Result { + ) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)> { let SearchContext { index, txn, @@ -48,7 +49,12 @@ impl RankingRuleGraphTrait for TypoGraph { condition.term, )?; - Ok(docids) + let term = term_interner.get(condition.term); + Ok(( + docids, + FxHashSet::from_iter(term.all_single_words_except_prefix_db()), + FxHashSet::from_iter(term.all_phrases()), + )) } fn build_edges<'ctx>( @@ -202,21 +208,21 @@ impl RankingRuleGraphTrait for TypoGraph { Ok(s) } - fn words_used_by_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, - condition: &Self::Condition, - ) -> Result>> { - let TypoCondition { term, .. } = condition; - let term = ctx.term_interner.get(*term); - Ok(HashSet::from_iter(term.all_single_words_except_prefix_db())) - } + // fn words_used_by_condition<'ctx>( + // ctx: &mut SearchContext<'ctx>, + // condition: &Self::Condition, + // ) -> Result>> { + // let TypoCondition { term, .. } = condition; + // let term = ctx.term_interner.get(*term); + // Ok(HashSet::from_iter(term.all_single_words_except_prefix_db())) + // } - fn phrases_used_by_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, - condition: &Self::Condition, - ) -> Result>> { - let TypoCondition { term, .. } = condition; - let term = ctx.term_interner.get(*term); - Ok(HashSet::from_iter(term.all_phrases())) - } + // fn phrases_used_by_condition<'ctx>( + // ctx: &mut SearchContext<'ctx>, + // condition: &Self::Condition, + // ) -> Result>> { + // let TypoCondition { term, .. } = condition; + // let term = ctx.term_interner.get(*term); + // Ok(HashSet::from_iter(term.all_phrases())) + // } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 7549cfff7..32434248c 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -125,7 +125,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( let mut results = vec![]; let mut cur_offset = 0usize; - /// Add the candidates to the results. Take `distinct`, `from`, `limit`, and `cur_offset` + /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset` /// into account and inform the logger. macro_rules! maybe_add_to_results { ($candidates:expr) => { @@ -181,6 +181,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( cur_offset += len as usize; }; } + while results.len() < length { // The universe for this bucket is zero or one element, so we don't need to sort // anything, just extend the results and go back to the parent ranking rule. 
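For context, the `maybe_add_to_results` macro whose doc comment is corrected above implements plain offset/limit pagination over ranking-rule buckets. A minimal free-function sketch of that logic follows, with hypothetical names, and without the distinct and logging handling of the real macro:

fn maybe_add_to_results(
    results: &mut Vec<u32>,
    cur_offset: &mut usize,
    from: usize,          // number of documents to skip
    length: usize,        // number of documents to return (assumed >= results.len())
    candidates: Vec<u32>, // one fully ranked bucket
) {
    let len = candidates.len();
    // Buckets that end before `from` are skipped entirely.
    if *cur_offset + len > from {
        // In the bucket straddling `from`, drop only the first documents.
        let skip = from.saturating_sub(*cur_offset);
        let remaining = length - results.len();
        results.extend(candidates.into_iter().skip(skip).take(remaining));
    }
    *cur_offset += len;
}

Because the macro is invoked with buckets in ranking order, the `while results.len() < length` loop above can stop as soon as enough documents have been collected.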
diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index f5f8c0895..ff8a9bf2f 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -9,9 +9,9 @@ use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::{Result, TermsMatchingStrategy}; pub struct Words { - exhausted: bool, + exhausted: bool, // TODO: remove query_graph: Option, - iterating: bool, + iterating: bool, // TODO: remove positions_to_remove: Vec, terms_matching_strategy: TermsMatchingStrategy, } From 384fdc2df44e795f5d202a669571115dfc035501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Mar 2023 11:43:25 +0100 Subject: [PATCH 077/234] Fix two bugs in proximity ranking rule --- .../search/new/graph_based_ranking_rule.rs | 2 - .../proximity/compute_docids.rs | 127 +++++++++++------- 2 files changed, 76 insertions(+), 53 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index f83f01074..5127082f7 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -213,8 +213,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase dead_ends_cache.forbid_condition(latest_condition); // 2. remove all the edges with this condition from the ranking rule graph graph.remove_edges_with_condition(latest_condition); - // 3. Also remove the entry from the condition_docids_cache, since we don't need it anymore - condition_docids_cache.cache.remove(&latest_condition); return Ok(ControlFlow::Continue(())); } path_docids &= condition_docids; diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index eabdb2cb1..12b5654c4 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -2,47 +2,37 @@ use std::iter::FromIterator; -use fxhash::FxHashSet; -use heed::RoTxn; -use roaring::RoaringBitmap; - use super::ProximityCondition; use crate::search::new::db_cache::DatabaseCache; use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::{Phrase, QueryTerm}; +use crate::search::new::resolve_query_graph::QueryTermDocIdsCache; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Result}; +use crate::{CboRoaringBitmapCodec, Index, Result}; +use fxhash::FxHashSet; +use heed::RoTxn; +use roaring::RoaringBitmap; pub fn compute_docids<'ctx>( ctx: &mut SearchContext<'ctx>, condition: &ProximityCondition, universe: &RoaringBitmap, ) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)> { - let SearchContext { - index, - txn, - db_cache, - word_interner, - term_docids, - phrase_interner, - term_interner, - } = ctx; - let (left_term, right_term, right_term_ngram_len, cost) = match condition { ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => { (*left_term, *right_term, *right_term_ngram_len, *cost) } ProximityCondition::Term { term } => { - let term_v = term_interner.get(*term); + let term_v = ctx.term_interner.get(*term); return Ok(( - term_docids + ctx.term_docids .get_query_term_docids( - index, - txn, - db_cache, - word_interner, - term_interner, - phrase_interner, + ctx.index, + ctx.txn, + &mut ctx.db_cache, + &ctx.word_interner, + &ctx.term_interner, + &ctx.phrase_interner, *term, )? 
.clone(), @@ -52,8 +42,8 @@ pub fn compute_docids<'ctx>( } }; - let left_term = term_interner.get(left_term); - let right_term = term_interner.get(right_term); + let left_term = ctx.term_interner.get(left_term); + let right_term = ctx.term_interner.get(right_term); // e.g. for the simple words `sun .. flower` // the cost is 5 @@ -73,12 +63,14 @@ pub fn compute_docids<'ctx>( let mut docids = RoaringBitmap::new(); if let Some(right_prefix) = right_term.use_prefix_db { - for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) { + for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) { compute_prefix_edges( - index, - txn, - db_cache, - word_interner, + ctx.index, + ctx.txn, + &mut ctx.db_cache, + &mut ctx.term_docids, + &ctx.word_interner, + &ctx.phrase_interner, left_word, right_prefix, left_phrase, @@ -99,13 +91,16 @@ pub fn compute_docids<'ctx>( // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been // reached - for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) { - for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) { + for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) { + for (right_word, right_phrase) in first_word_of_term_iter(right_term, &ctx.phrase_interner) + { compute_non_prefix_edges( - index, - txn, - db_cache, - word_interner, + ctx.index, + ctx.txn, + &mut ctx.db_cache, + &mut ctx.term_docids, + &ctx.word_interner, + &ctx.phrase_interner, left_word, right_word, &[left_phrase, right_phrase].iter().copied().flatten().collect::>(), @@ -123,10 +118,12 @@ pub fn compute_docids<'ctx>( } fn compute_prefix_edges<'ctx>( - index: &mut &crate::Index, + index: &Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &mut DedupInterner, + term_docids: &mut QueryTermDocIdsCache, + word_interner: &DedupInterner, + phrase_interner: &DedupInterner, left_word: Interned, right_prefix: Interned, left_phrase: Option>, @@ -137,10 +134,23 @@ fn compute_prefix_edges<'ctx>( used_words: &mut FxHashSet>, used_phrases: &mut FxHashSet>, ) -> Result<()> { + let mut universe = universe.clone(); if let Some(phrase) = left_phrase { - // TODO: compute the phrase, take the intersection between - // the phrase and the docids - used_phrases.insert(phrase); // This is not fully correct + let phrase_docids = term_docids.get_phrase_docids( + index, + txn, + db_cache, + word_interner, + phrase_interner, + phrase, + )?; + if !phrase_docids.is_empty() { + used_phrases.insert(phrase); + } + universe &= phrase_docids; + if universe.is_empty() { + return Ok(()); + } } if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids( @@ -151,7 +161,7 @@ fn compute_prefix_edges<'ctx>( right_prefix, forward_proximity, )? { - let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { used_words.insert(left_word); used_words.insert(right_prefix); @@ -169,7 +179,7 @@ fn compute_prefix_edges<'ctx>( left_word, backward_proximity, )? 
{ - let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { used_words.insert(left_word); used_words.insert(right_prefix); @@ -182,10 +192,12 @@ fn compute_prefix_edges<'ctx>( } fn compute_non_prefix_edges<'ctx>( - index: &mut &crate::Index, + index: &Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &mut DedupInterner, + term_docids: &mut QueryTermDocIdsCache, + word_interner: &DedupInterner, + phrase_interner: &DedupInterner, word1: Interned, word2: Interned, phrases: &[Interned], @@ -196,10 +208,23 @@ fn compute_non_prefix_edges<'ctx>( used_words: &mut FxHashSet>, used_phrases: &mut FxHashSet>, ) -> Result<()> { - if !phrases.is_empty() { - // TODO: compute the docids associated with these phrases - // take their intersection with the new docids - used_phrases.extend(phrases); // This is not fully correct + let mut universe = universe.clone(); + for phrase in phrases { + let phrase_docids = term_docids.get_phrase_docids( + index, + txn, + db_cache, + word_interner, + phrase_interner, + *phrase, + )?; + if !phrase_docids.is_empty() { + used_phrases.insert(*phrase); + } + universe &= phrase_docids; + if universe.is_empty() { + return Ok(()); + } } if let Some(new_docids) = db_cache.get_word_pair_proximity_docids( index, @@ -209,7 +234,7 @@ fn compute_non_prefix_edges<'ctx>( word2, forward_proximity, )? { - let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { used_words.insert(word1); used_words.insert(word2); @@ -228,7 +253,7 @@ fn compute_non_prefix_edges<'ctx>( word1, backward_proximity, )? { - let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { used_words.insert(word1); used_words.insert(word2); From a86aeba41109cfa9aa42fc53258407db95855b4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 22 Mar 2023 14:43:08 +0100 Subject: [PATCH 078/234] WIP --- milli/src/search/new/mod.rs | 1 + milli/src/search/new/query_term.rs | 1 + milli/src/search/new/ranking_rule_graph/mod.rs | 2 ++ 3 files changed, 4 insertions(+) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 44e26a9ea..e699d408f 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -150,6 +150,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( crate::Criterion::Typo | crate::Criterion::Attribute | crate::Criterion::Proximity + // TODO: no exactness | crate::Criterion::Exactness => { if !words { ranking_rules.push(Box::new(Words::new(terms_matching_strategy))); diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 0ce000537..2b32fcd84 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -216,6 +216,7 @@ impl QueryTerm { /// /// This excludes synonyms, split words, and words stored in the prefix databases. 
pub fn all_phrases(&'_ self) -> impl Iterator> + Clone + '_ { + todo!("self.phrase"); self.split_words.iter().chain(self.synonyms.iter()).copied() } pub fn is_empty(&self) -> bool { diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index baeb8bb71..9f825ee3d 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -15,6 +15,8 @@ mod proximity; /// Implementation of the `typo` ranking rule mod typo; +mod attribute_rank; + use std::hash::Hash; pub use condition_docids_cache::ConditionDocIdsCache; From 01c7d2de8fbc9e26a25713b841311942fcc3177f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 22 Mar 2023 14:50:41 +0100 Subject: [PATCH 079/234] Add example targets to the milli crate --- milli/examples/index.rs | 119 +++++++++++++++++++++++++++++++++++ milli/examples/search.rs | 124 +++++++++++++++++++++++++++++++++++++ milli/examples/settings.rs | 31 ++++++++++ 3 files changed, 274 insertions(+) create mode 100644 milli/examples/index.rs create mode 100644 milli/examples/search.rs create mode 100644 milli/examples/settings.rs diff --git a/milli/examples/index.rs b/milli/examples/index.rs new file mode 100644 index 000000000..17a62b31f --- /dev/null +++ b/milli/examples/index.rs @@ -0,0 +1,119 @@ +use std::{ + error::Error, + fs::File, + io::{BufRead, BufReader, Cursor, Seek}, + time::Duration, +}; + +use heed::EnvOpenOptions; +use milli::{ + documents::{DocumentsBatchBuilder, DocumentsBatchReader}, + update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}, + Criterion, Index, Object, +}; + +fn main() -> Result<(), Box> { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_organizations").unwrap(); + let mut wtxn = index.write_txn().unwrap(); + + let primary_key = "uuid"; + // let searchable_fields = vec!["body", "title", "url"]; + // let searchable_fields = vec!["title", "overview"]; + let searchable_fields = + vec!["name", "primary_role", "city", "region", "country_code", "short_description"]; + // let filterable_fields = vec!["release_date", "genres"]; + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key(primary_key.to_owned()); + let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + // builder.set_filterable_fields(filterable_fields); + + // builder.set_min_word_len_one_typo(5); + // builder.set_min_word_len_two_typos(100); + builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]); + builder.execute(|_| (), || false).unwrap(); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); + + let documents = documents_from( + // "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json", + "/Users/meilisearch/Documents/datasets/organizations.csv", + // "json" + "csv", + ); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + // let rtxn = index.read_txn().unwrap(); + + // let mut wtxn = index.write_txn().unwrap(); + 
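    // (The commented-out block below is a leftover sketch of a second indexing
    // pass: it would open a new write transaction and add one more batch of
    // documents to the same index through a fresh `IndexDocuments` builder.)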
// let config = IndexerConfig::default(); + // let indexing_config = IndexDocumentsConfig::default(); + // let builder = + // IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); + + // let documents = documents_from("test_doc.json", "json"); + // let (builder, user_error) = builder.add_documents(documents).unwrap(); + // user_error.unwrap(); + // builder.execute().unwrap(); + // wtxn.commit().unwrap(); + + // let _ = index.all_documents(&rtxn)?; + + // println!("done!"); + // std::thread::sleep(Duration::from_secs(100)); + + index.prepare_for_closing().wait(); + Ok(()) +} +fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { + let reader = File::open(filename) + .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename)); + let reader = BufReader::new(reader); + let documents = match filetype { + "csv" => documents_from_csv(reader).unwrap(), + "json" => documents_from_json(reader).unwrap(), + "jsonl" => documents_from_jsonl(reader).unwrap(), + otherwise => panic!("invalid update format {:?}", otherwise), + }; + DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() +} + +fn documents_from_jsonl(reader: impl BufRead) -> milli::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { + let object = result.unwrap(); + documents.append_json_object(&object)?; + } + + documents.into_inner().map_err(Into::into) +} + +fn documents_from_json(reader: impl BufRead) -> milli::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + documents.append_json_array(reader)?; + + documents.into_inner().map_err(Into::into) +} + +fn documents_from_csv(reader: impl BufRead) -> milli::Result> { + let csv = csv::Reader::from_reader(reader); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) +} diff --git a/milli/examples/search.rs b/milli/examples/search.rs new file mode 100644 index 000000000..558f92bac --- /dev/null +++ b/milli/examples/search.rs @@ -0,0 +1,124 @@ +// use crate::allocator::ALLOC; +use std::error::Error; +use std::io::stdin; +use std::time::Instant; + +use heed::EnvOpenOptions; +use milli::{ + execute_search, DefaultSearchLogger, Index, Search, SearchContext, TermsMatchingStrategy, +}; + +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +fn main() -> Result<(), Box> { + // TODO: command line + let mut args = std::env::args(); + let _ = args.next().unwrap(); + let dataset = args.next().unwrap(); + + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + // Query: + // disp: 20 + // + // dasp: 70 words + // dosp: 80 + // dasc: 80 + // + // + // daspouyerf + // daspojewkfb + + let index = Index::new(options, dataset)?; + let txn = index.read_txn()?; + let mut query = String::new(); + while stdin().read_line(&mut query)? 
> 0 { + for _ in 0..10 { + let start = Instant::now(); + // let mut logger = milli::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let docs = execute_search( + &mut ctx, + query.trim(), + // what a the from which when there is + TermsMatchingStrategy::Last, + None, + 0, + 20, + &mut DefaultSearchLogger, + &mut DefaultSearchLogger, + // &mut logger, + )?; + // logger.write_d2_description(&mut ctx); + let elapsed = start.elapsed(); + println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids); + + // let documents = index + // .documents(&txn, docs.documents_ids.iter().copied()) + // .unwrap() + // .into_iter() + // .map(|(id, obkv)| { + // let mut object = serde_json::Map::default(); + // for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + // let value = obkv.get(fid).unwrap(); + // let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + // object.insert(fid_name.to_owned(), value); + // } + // (id, serde_json::to_string_pretty(&object).unwrap()) + // }) + // .collect::>(); + + // println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); + // for (id, document) in documents { + // println!("{id}:"); + // println!("{document}"); + // } + + let start = Instant::now(); + let mut s = Search::new(&txn, &index); + s.query( + // "which a the releases from poison by the government", + // "sun flower s are the best", + query.trim(), + ); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + // s.limit(1); + // s.criterion_implementation_strategy( + // milli::CriterionImplementationStrategy::OnlySetBased, + // ); + + let docs = s.execute().unwrap(); + let elapsed = start.elapsed(); + println!("old: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids); + + // let documents = index + // .documents(&txn, docs.documents_ids.iter().copied()) + // .unwrap() + // .into_iter() + // .map(|(id, obkv)| { + // let mut object = serde_json::Map::default(); + // for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + // let value = obkv.get(fid).unwrap(); + // let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + // object.insert(fid_name.to_owned(), value); + // } + // (id, serde_json::to_string_pretty(&object).unwrap()) + // }) + // .collect::>(); + // println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); + // for (id, document) in documents { + // println!("{id}:"); + // println!("{document}"); + // } + } + query.clear(); + } + // for (id, document) in documents { + // println!("{id}:"); + // // println!("{document}"); + // } + + Ok(()) +} diff --git a/milli/examples/settings.rs b/milli/examples/settings.rs new file mode 100644 index 000000000..fb9cf2789 --- /dev/null +++ b/milli/examples/settings.rs @@ -0,0 +1,31 @@ +// use big_s::S; +use heed::EnvOpenOptions; +// use maplit::hashset; +use milli::{ + update::{IndexerConfig, Settings}, + Criterion, Index, +}; + +fn main() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_wiki").unwrap(); + let mut wtxn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + // builder.set_min_word_len_one_typo(5); + // builder.set_min_word_len_two_typos(7); + // builder.set_sortable_fields(hashset! 
{ S("release_date") }); + builder.set_criteria(vec![ + Criterion::Words, + Criterion::Typo, + Criterion::Proximity, + // Criterion::Asc("release_date".to_owned()), + ]); + + builder.execute(|_| (), || false).unwrap(); + wtxn.commit().unwrap(); +} From 9b1f439a917da71d5790baf77a390a210bea668d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 23 Mar 2023 09:12:35 +0100 Subject: [PATCH 080/234] WIP --- milli/examples/search.rs | 37 +++++++++++++++--------------- milli/examples/settings.rs | 4 +++- milli/src/search/new/mod.rs | 8 +++---- milli/src/search/new/query_term.rs | 1 - 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 558f92bac..57aac5a02 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -35,7 +35,25 @@ fn main() -> Result<(), Box> { let txn = index.read_txn()?; let mut query = String::new(); while stdin().read_line(&mut query)? > 0 { - for _ in 0..10 { + for _ in 0..2 { + let start = Instant::now(); + let mut s = Search::new(&txn, &index); + s.query( + // "which a the releases from poison by the government", + // "sun flower s are the best", + query.trim(), + ); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.offset(0); + // s.limit(1); + // s.criterion_implementation_strategy( + // milli::CriterionImplementationStrategy::OnlySetBased, + // ); + + let docs = s.execute().unwrap(); + let elapsed = start.elapsed(); + println!("old: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids); + let start = Instant::now(); // let mut logger = milli::DetailedSearchLogger::new("log"); let mut ctx = SearchContext::new(&index, &txn); @@ -76,23 +94,6 @@ fn main() -> Result<(), Box> { // println!("{document}"); // } - let start = Instant::now(); - let mut s = Search::new(&txn, &index); - s.query( - // "which a the releases from poison by the government", - // "sun flower s are the best", - query.trim(), - ); - s.terms_matching_strategy(TermsMatchingStrategy::Last); - // s.limit(1); - // s.criterion_implementation_strategy( - // milli::CriterionImplementationStrategy::OnlySetBased, - // ); - - let docs = s.execute().unwrap(); - let elapsed = start.elapsed(); - println!("old: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids); - // let documents = index // .documents(&txn, docs.documents_ids.iter().copied()) // .unwrap() diff --git a/milli/examples/settings.rs b/milli/examples/settings.rs index fb9cf2789..a4ac3879f 100644 --- a/milli/examples/settings.rs +++ b/milli/examples/settings.rs @@ -10,7 +10,7 @@ fn main() { let mut options = EnvOpenOptions::new(); options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - let index = Index::new(options, "data_wiki").unwrap(); + let index = Index::new(options, "data_movies").unwrap(); let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); @@ -23,6 +23,8 @@ fn main() { Criterion::Words, Criterion::Typo, Criterion::Proximity, + Criterion::Attribute, + Criterion::Exactness, // Criterion::Asc("release_date".to_owned()), ]); diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index e699d408f..9627a662e 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -185,21 +185,21 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( if attribute { continue; } - todo!(); + // todo!(); // attribute = false; } crate::Criterion::Sort => { if sort { continue; } - todo!(); + // todo!(); // sort = false; } crate::Criterion::Exactness => { if exactness { continue; } 
- todo!(); + // todo!(); // exactness = false; } crate::Criterion::Asc(field) => { @@ -214,7 +214,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( continue; } desc.insert(field); - todo!(); + // todo!(); } } } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 2b32fcd84..0ce000537 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -216,7 +216,6 @@ impl QueryTerm { /// /// This excludes synonyms, split words, and words stored in the prefix databases. pub fn all_phrases(&'_ self) -> impl Iterator> + Clone + '_ { - todo!("self.phrase"); self.split_words.iter().chain(self.synonyms.iter()).copied() } pub fn is_empty(&self) -> bool { From 56b7209f26f74f5094fb34b7e314c4c0a9bd6c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 23 Mar 2023 09:15:57 +0100 Subject: [PATCH 081/234] Make clippy happy --- milli/src/search/new/distinct.rs | 4 +-- milli/src/search/new/logger/detailed.rs | 18 +++++------ milli/src/search/new/logger/mod.rs | 32 +++++++++---------- milli/src/search/new/mod.rs | 10 +++--- milli/src/search/new/query_term.rs | 4 +-- .../condition_docids_cache.rs | 4 +-- .../src/search/new/ranking_rule_graph/mod.rs | 13 +++----- .../new/ranking_rule_graph/proximity/build.rs | 4 +-- .../proximity/compute_docids.rs | 4 +-- .../new/ranking_rule_graph/proximity/mod.rs | 13 +++----- .../search/new/ranking_rule_graph/typo/mod.rs | 13 +++----- milli/src/search/new/resolve_query_graph.rs | 4 +-- milli/src/search/new/words.rs | 2 +- milli/src/search/query_tree.rs | 9 ++---- 14 files changed, 61 insertions(+), 73 deletions(-) diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs index f2e79603a..ad4b46659 100644 --- a/milli/src/search/new/distinct.rs +++ b/milli/src/search/new/distinct.rs @@ -22,8 +22,8 @@ pub struct DistinctOutput { /// is considered unique. /// - `excluded`: the set of document ids that contain a value for the given field that occurs /// in the given candidates. -pub fn apply_distinct_rule<'ctx>( - ctx: &mut SearchContext<'ctx>, +pub fn apply_distinct_rule( + ctx: &mut SearchContext, field_id: u16, candidates: &RoaringBitmap, // TODO: add a universe here, such that the `excluded` are a subset of the universe? 
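A rough in-memory model of what `apply_distinct_rule` above computes, using hypothetical types; the real function walks the facet databases and returns roaring bitmaps rather than vectors:

use std::collections::HashSet;

/// Keep the first document seen for each distinct field value; every later
/// document carrying an already-seen value goes to the excluded set.
fn apply_distinct_rule_sketch(candidates: &[(u32, &str)]) -> (Vec<u32>, Vec<u32>) {
    let mut seen_values = HashSet::new();
    let mut remaining = Vec::new();
    let mut excluded = Vec::new();
    for &(docid, value) in candidates {
        if seen_values.insert(value) {
            remaining.push(docid);
        } else {
            excluded.push(docid);
        }
    }
    (remaining, excluded)
}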
diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 19a2679a8..d6037aab2 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -102,10 +102,10 @@ impl SearchLogger for DetailedSearchLogger { self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect()); } - fn start_iteration_ranking_rule<'transaction>( + fn start_iteration_ranking_rule( &mut self, ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, + _ranking_rule: &dyn RankingRule, query: &QueryGraph, universe: &RoaringBitmap, ) { @@ -117,10 +117,10 @@ impl SearchLogger for DetailedSearchLogger { }) } - fn next_bucket_ranking_rule<'transaction>( + fn next_bucket_ranking_rule( &mut self, ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, + _ranking_rule: &dyn RankingRule, universe: &RoaringBitmap, candidates: &RoaringBitmap, ) { @@ -131,10 +131,10 @@ impl SearchLogger for DetailedSearchLogger { time: Instant::now(), }) } - fn skip_bucket_ranking_rule<'transaction>( + fn skip_bucket_ranking_rule( &mut self, ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, + _ranking_rule: &dyn RankingRule, candidates: &RoaringBitmap, ) { self.events.push(SearchEvents::RankingRuleSkipBucket { @@ -144,10 +144,10 @@ impl SearchLogger for DetailedSearchLogger { }) } - fn end_iteration_ranking_rule<'transaction>( + fn end_iteration_ranking_rule( &mut self, ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, + _ranking_rule: &dyn RankingRule, universe: &RoaringBitmap, ) { self.events.push(SearchEvents::RankingRuleEndIteration { @@ -427,7 +427,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ ctx: &mut SearchContext, node_idx: Interned, node: &QueryNode, - distances: &[u16], + _distances: &[u16], file: &mut File, ) { match &node.data { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 3b8642cab..9ebb4344a 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -25,33 +25,33 @@ pub trait SearchLogger { fn ranking_rules(&mut self, rr: &[Box>]); /// Logs the start of a ranking rule's iteration. - fn start_iteration_ranking_rule<'transaction>( + fn start_iteration_ranking_rule( &mut self, ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule<'transaction, Q>, + ranking_rule: &dyn RankingRule, query: &Q, universe: &RoaringBitmap, ); /// Logs the end of the computation of a ranking rule bucket - fn next_bucket_ranking_rule<'transaction>( + fn next_bucket_ranking_rule( &mut self, ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule<'transaction, Q>, + ranking_rule: &dyn RankingRule, universe: &RoaringBitmap, candidates: &RoaringBitmap, ); /// Logs the skipping of a ranking rule bucket - fn skip_bucket_ranking_rule<'transaction>( + fn skip_bucket_ranking_rule( &mut self, ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule<'transaction, Q>, + ranking_rule: &dyn RankingRule, candidates: &RoaringBitmap, ); /// Logs the end of a ranking rule's iteration. 
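    /// (That is, the rule is being popped: it will not be asked for more
    /// buckets until `start_iteration` is called again, so any per-iteration
    /// state can be dropped.)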
- fn end_iteration_ranking_rule<'transaction>( + fn end_iteration_ranking_rule( &mut self, ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule<'transaction, Q>, + ranking_rule: &dyn RankingRule, universe: &RoaringBitmap, ); /// Logs the addition of document ids to the final results @@ -95,35 +95,35 @@ impl SearchLogger for DefaultSearchLogger { fn ranking_rules(&mut self, _rr: &[Box>]) {} - fn start_iteration_ranking_rule<'transaction>( + fn start_iteration_ranking_rule( &mut self, _ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule<'transaction, Q>, + _ranking_rule: &dyn RankingRule, _query: &Q, _universe: &RoaringBitmap, ) { } - fn next_bucket_ranking_rule<'transaction>( + fn next_bucket_ranking_rule( &mut self, _ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule<'transaction, Q>, + _ranking_rule: &dyn RankingRule, _universe: &RoaringBitmap, _candidates: &RoaringBitmap, ) { } - fn skip_bucket_ranking_rule<'transaction>( + fn skip_bucket_ranking_rule( &mut self, _ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule<'transaction, Q>, + _ranking_rule: &dyn RankingRule, _candidates: &RoaringBitmap, ) { } - fn end_iteration_ranking_rule<'transaction>( + fn end_iteration_ranking_rule( &mut self, _ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule<'transaction, Q>, + _ranking_rule: &dyn RankingRule, _universe: &RoaringBitmap, ) { } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 9627a662e..fff180879 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -61,8 +61,8 @@ impl<'ctx> SearchContext<'ctx> { /// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it. #[allow(clippy::too_many_arguments)] -fn resolve_maximally_reduced_query_graph<'ctx>( - ctx: &mut SearchContext<'ctx>, +fn resolve_maximally_reduced_query_graph( + ctx: &mut SearchContext, universe: &RoaringBitmap, query_graph: &QueryGraph, matching_strategy: TermsMatchingStrategy, @@ -75,7 +75,7 @@ fn resolve_maximally_reduced_query_graph<'ctx>( for (_, n) in query_graph.nodes.iter() { match &n.data { QueryNodeData::Term(term) => { - all_positions.extend(term.positions.clone().into_iter()); + all_positions.extend(term.positions.clone()); } QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {} } @@ -222,8 +222,8 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( } #[allow(clippy::too_many_arguments)] -pub fn execute_search<'ctx>( - ctx: &mut SearchContext<'ctx>, +pub fn execute_search( + ctx: &mut SearchContext, query: &str, terms_matching_strategy: TermsMatchingStrategy, filters: Option, diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 0ce000537..8591670b8 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -425,8 +425,8 @@ impl LocatedQueryTerm { } /// Convert the tokenised search query into a list of located query terms. 
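/// (Each returned `LocatedQueryTerm` pairs a query term with the positions it
/// covers in the original query; these positions are what the `Words` ranking
/// rule later removes one by one.)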
-pub fn located_query_terms_from_string<'ctx>( - ctx: &mut SearchContext<'ctx>, +pub fn located_query_terms_from_string( + ctx: &mut SearchContext, query: NormalizedTokenIter>, words_limit: Option, ) -> Result> { diff --git a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs index 15d82a2be..67e9be6a4 100644 --- a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs @@ -43,9 +43,9 @@ impl ConditionDocIdsCache { /// /// If the cache does not yet contain these docids, they are computed /// and inserted in the cache. - pub fn get_condition_docids<'s, 'ctx>( + pub fn get_condition_docids<'s>( &'s mut self, - ctx: &mut SearchContext<'ctx>, + ctx: &mut SearchContext, interned_condition: Interned, graph: &mut RankingRuleGraph, universe: &RoaringBitmap, diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 9f825ee3d..5ceee3f4e 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -77,22 +77,19 @@ pub trait RankingRuleGraphTrait: Sized { /// Return the label of the given edge condition, to be used when visualising /// the ranking rule graph. - fn label_for_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, - condition: &Self::Condition, - ) -> Result; + fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result; /// Compute the document ids associated with the given edge condition, /// restricted to the given universe. - fn resolve_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, + fn resolve_condition( + ctx: &mut SearchContext, condition: &Self::Condition, universe: &RoaringBitmap, ) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)>; /// Return the costs and conditions of the edges going from the source node to the destination node - fn build_edges<'ctx>( - ctx: &mut SearchContext<'ctx>, + fn build_edges( + ctx: &mut SearchContext, conditions_interner: &mut DedupInterner, source_node: &QueryNode, dest_node: &QueryNode, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 097120b49..4d42463e8 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -7,8 +7,8 @@ use crate::search::new::query_term::LocatedQueryTerm; use crate::search::new::{QueryNode, SearchContext}; use crate::Result; -pub fn build_edges<'ctx>( - _ctx: &mut SearchContext<'ctx>, +pub fn build_edges( + _ctx: &mut SearchContext, conditions_interner: &mut DedupInterner, from_node: &QueryNode, to_node: &QueryNode, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 12b5654c4..6f56e6221 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -13,8 +13,8 @@ use fxhash::FxHashSet; use heed::RoTxn; use roaring::RoaringBitmap; -pub fn compute_docids<'ctx>( - ctx: &mut SearchContext<'ctx>, +pub fn compute_docids( + ctx: &mut SearchContext, condition: &ProximityCondition, universe: &RoaringBitmap, ) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)> { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs 
b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 7b8a066ab..3b98ed5b5 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -29,8 +29,8 @@ pub enum ProximityGraph {} impl RankingRuleGraphTrait for ProximityGraph { type Condition = ProximityCondition; - fn resolve_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, + fn resolve_condition( + ctx: &mut SearchContext, condition: &Self::Condition, universe: &RoaringBitmap, ) -> Result<(roaring::RoaringBitmap, FxHashSet>, FxHashSet>)> @@ -38,8 +38,8 @@ impl RankingRuleGraphTrait for ProximityGraph { compute_docids::compute_docids(ctx, condition, universe) } - fn build_edges<'ctx>( - ctx: &mut SearchContext<'ctx>, + fn build_edges( + ctx: &mut SearchContext, conditions_interner: &mut DedupInterner, source_node: &QueryNode, dest_node: &QueryNode, @@ -59,10 +59,7 @@ impl RankingRuleGraphTrait for ProximityGraph { logger.log_proximity_state(graph, paths, dead_ends_cache, universe, distances, cost); } - fn label_for_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, - condition: &Self::Condition, - ) -> Result { + fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { match condition { ProximityCondition::Uninit { cost, .. } => { // TODO diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 4ef0d15d1..d20523cc9 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -23,8 +23,8 @@ pub enum TypoGraph {} impl RankingRuleGraphTrait for TypoGraph { type Condition = TypoCondition; - fn resolve_condition<'db_cache, 'ctx>( - ctx: &mut SearchContext<'ctx>, + fn resolve_condition<'db_cache>( + ctx: &mut SearchContext, condition: &Self::Condition, universe: &RoaringBitmap, ) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)> { @@ -57,8 +57,8 @@ impl RankingRuleGraphTrait for TypoGraph { )) } - fn build_edges<'ctx>( - ctx: &mut SearchContext<'ctx>, + fn build_edges( + ctx: &mut SearchContext, conditions_interner: &mut DedupInterner, _from_node: &QueryNode, to_node: &QueryNode, @@ -152,10 +152,7 @@ impl RankingRuleGraphTrait for TypoGraph { logger.log_typo_state(graph, paths, dead_ends_cache, universe, distances, cost); } - fn label_for_condition<'ctx>( - ctx: &mut SearchContext<'ctx>, - condition: &Self::Condition, - ) -> Result { + fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { let TypoCondition { term } = condition; let term = ctx.term_interner.get(*term); let QueryTerm { diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 2f941098d..f4db260ed 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -87,8 +87,8 @@ impl QueryTermDocIdsCache { } } -pub fn resolve_query_graph<'ctx>( - ctx: &mut SearchContext<'ctx>, +pub fn resolve_query_graph( + ctx: &mut SearchContext, q: &QueryGraph, universe: &RoaringBitmap, ) -> Result { diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index ff8a9bf2f..fb2c62f11 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -47,7 +47,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { for (_, n) in parent_query_graph.nodes.iter() { match &n.data { QueryNodeData::Term(term) => { - all_positions.extend(term.positions.clone().into_iter()); + 
all_positions.extend(term.positions.clone()); } QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {} } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 24e33bdd8..1b1a42c1c 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -591,8 +591,7 @@ fn create_matching_words( } // create a CONSECUTIVE matchings words wrapping all word in the phrase PrimitiveQueryPart::Phrase(words) => { - let ids: Vec<_> = - (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); + let ids: Vec<_> = (0..words.len()).map(|i| id + i as PrimitiveWordId).collect(); // Require that all words of the phrase have a corresponding MatchingWord // before adding any of them to the matching_words result if let Some(phrase_matching_words) = words @@ -649,10 +648,8 @@ fn create_matching_words( } }) .collect(); - let ids: Vec<_> = (0..words.len()) - .into_iter() - .map(|i| id + i as PrimitiveWordId) - .collect(); + let ids: Vec<_> = + (0..words.len()).map(|i| id + i as PrimitiveWordId).collect(); if let Some(synonyms) = ctx.synonyms(&words)? { for synonym in synonyms { From 9b2653427ded198a8d744e112dba68a93470dd51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 23 Mar 2023 09:22:01 +0100 Subject: [PATCH 082/234] Split position DB into fid and relative position DB --- milli/src/heed_codec/mod.rs | 2 +- milli/src/heed_codec/str_beu32_codec.rs | 34 ++++ milli/src/index.rs | 21 ++- milli/src/lib.rs | 17 ++ milli/src/search/criteria/attribute.rs | 17 +- milli/src/search/criteria/mod.rs | 12 +- milli/src/update/clear_documents.rs | 4 + milli/src/update/delete_documents.rs | 162 ++++++------------ .../extract/extract_word_position_docids.rs | 15 +- .../src/update/index_documents/extract/mod.rs | 4 +- .../update/words_prefix_position_docids.rs | 9 +- 11 files changed, 162 insertions(+), 135 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index a4df63e22..b7a8c3c88 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -21,5 +21,5 @@ pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; pub use self::script_language_codec::ScriptLanguageCodec; -pub use self::str_beu32_codec::StrBEU32Codec; +pub use self::str_beu32_codec::{StrBEU32Codec, StrBEU16Codec}; pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; diff --git a/milli/src/heed_codec/str_beu32_codec.rs b/milli/src/heed_codec/str_beu32_codec.rs index d1f379bdc..17f3c996f 100644 --- a/milli/src/heed_codec/str_beu32_codec.rs +++ b/milli/src/heed_codec/str_beu32_codec.rs @@ -36,3 +36,37 @@ impl<'a> heed::BytesEncode<'a> for StrBEU32Codec { Some(Cow::Owned(bytes)) } } + +pub struct StrBEU16Codec; + +impl<'a> heed::BytesDecode<'a> for StrBEU16Codec { + type DItem = (&'a str, u16); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let footer_len = size_of::(); + + if bytes.len() < footer_len { + return None; + } + + let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let word = str::from_utf8(word).ok()?; + let pos = bytes.try_into().map(u16::from_be_bytes).ok()?; + + Some((word, pos)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrBEU16Codec { + type EItem = (&'a str, u16); + + fn bytes_encode((word, pos): &Self::EItem) -> Option> { + let pos = pos.to_be_bytes(); + + let mut bytes = Vec::with_capacity(word.len() + pos.len()); + bytes.extend_from_slice(word.as_bytes()); + bytes.extend_from_slice(&pos[..]); + + 
Some(Cow::Owned(bytes))
+    }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
index a4048dfb0..7848ddf5a 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -19,12 +19,12 @@ use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
     FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec};
+use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
 use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
     FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
-    Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32,
+    Search, U8StrStrCodec, BEU16, BEU32,
 };
 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@@ -76,7 +76,9 @@ pub mod db_name {
     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
     pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
     pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
+    pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
+    pub const WORD_PREFIX_FIELD_ID_DOCIDS: &str = "word-prefix-field-id-docids";
     pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
     pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
     pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
@@ -118,11 +120,16 @@ pub struct Index {
     pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
     /// Maps the word and the position with the docids that corresponds to it.
-    pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
+    pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    /// Maps the word and the field id with the docids that corresponds to it.
+    pub word_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    /// Maps the field id and the word count with the docids that corresponds to it.
     pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
     /// Maps the position of a word prefix with all the docids where this prefix appears.
-    pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
+    pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    /// Maps the word and the field id with the docids that corresponds to it.
+    pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
     /// Maps the script and language with all the docids that corresponds to it.
pub script_language_docids: Database, @@ -153,7 +160,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(19); + options.max_dbs(21); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -170,8 +177,10 @@ impl Index { let prefix_word_pair_proximity_docids = env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; + let word_fid_docids = env.create_database(Some(WORD_FIELD_ID_DOCIDS))?; let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; + let word_prefix_fid_docids = env.create_database(Some(WORD_PREFIX_FIELD_ID_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; @@ -196,7 +205,9 @@ impl Index { word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids, word_position_docids, + word_fid_docids, word_prefix_position_docids, + word_prefix_fid_docids, field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index b256192bd..a62c344f9 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -152,6 +152,23 @@ pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, Relative pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { (field_id as u32) << 16 | (relative as u32) } +// TODO: this is wrong, but will do for now +/// Compute the "bucketed" absolute position from the field id and relative position in the field. +/// +/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger. +pub fn bucketed_position(relative: u16) -> u16 { + // The first few relative positions are kept intact. + if relative < 16 { + relative + } else if relative < 24 { + // Relative positions between 16 and 24 all become equal to 24 + 24 + } else { + // Then, groups of positions that have the same base-2 logarithm are reduced to + // the same relative position: the smallest power of 2 that is greater than them + (relative as f64).log2().ceil().exp2() as u16 + } +} /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 5b33fdf54..322f6e051 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -199,7 +199,7 @@ impl<'t> Criterion for Attribute<'t> { struct QueryPositionIterator<'t> { #[allow(clippy::type_complexity)] inner: - Vec> + 't>>>, + Vec> + 't>>>, } impl<'t> QueryPositionIterator<'t> { @@ -241,7 +241,7 @@ impl<'t> QueryPositionIterator<'t> { } impl<'t> Iterator for QueryPositionIterator<'t> { - type Item = heed::Result<(u32, RoaringBitmap)>; + type Item = heed::Result<(u16, RoaringBitmap)>; fn next(&mut self) -> Option { // sort inner words from the closest next position to the farthest next position. @@ -281,9 +281,9 @@ impl<'t> Iterator for QueryPositionIterator<'t> { /// A Branch is represent a possible alternative of the original query and is build with the Query Tree, /// This branch allows us to iterate over meta-interval of positions. 
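/// (In other words: a `Branch` represents one possible alternative form of the
/// original query, built from the query tree, and lets us iterate over
/// meta-intervals of positions for that alternative.)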
struct Branch<'t> { - query_level_iterator: Vec<(u32, RoaringBitmap, Peekable>)>, - last_result: (u32, RoaringBitmap), - branch_size: u32, + query_level_iterator: Vec<(u16, RoaringBitmap, Peekable>)>, + last_result: (u16, RoaringBitmap), + branch_size: u16, } impl<'t> Branch<'t> { @@ -303,7 +303,7 @@ impl<'t> Branch<'t> { let mut branch = Self { query_level_iterator, last_result: (0, RoaringBitmap::new()), - branch_size: flatten_branch.len() as u32, + branch_size: flatten_branch.len() as u16, }; branch.update_last_result(); @@ -342,7 +342,7 @@ impl<'t> Branch<'t> { Some(result) => { result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0) } - None => u32::MAX, + None => u16::MAX, } } }) @@ -378,7 +378,8 @@ impl<'t> Branch<'t> { fn compute_rank(&self) -> u32 { // we compute a rank from the position. let (pos, _) = self.last_result; - pos.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size + pos.saturating_sub((0..self.branch_size).sum()) as u32 * LCM_10_FIRST_NUMBERS + / self.branch_size as u32 } fn cmp(&self, other: &Self) -> Ordering { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 0c1c8add1..5e491672f 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -171,7 +171,7 @@ pub trait Context<'c> { &self, word: &str, in_prefix_cache: bool, - ) -> heed::Result> + 'c>>; + ) -> heed::Result> + 'c>>; fn synonyms(&self, word: &str) -> heed::Result>>>; fn searchable_fields_ids(&self) -> Result>; fn field_id_word_count_docids( @@ -322,11 +322,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { &self, word: &str, in_prefix_cache: bool, - ) -> heed::Result> + 'c>> + ) -> heed::Result> + 'c>> { let range = { - let left = u32::min_value(); - let right = u32::max_value(); + let left = u16::min_value(); // TODO: this is wrong + let right = u16::max_value(); // TODO: this is wrong let left = (word, left); let right = (word, right); left..=right @@ -360,7 +360,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { } fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result> { - let key = (word, pos); + let key = (word, pos as u16); // TODO: this is wrong self.index.word_position_docids.get(self.rtxn, &key) } } @@ -899,7 +899,7 @@ pub mod test { _word: &str, _in_prefix_cache: bool, ) -> heed::Result< - Box> + 'c>, + Box> + 'c>, > { todo!() } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 0296bc192..c9de4d9ab 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -28,8 +28,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids, word_position_docids, + word_fid_docids, field_id_word_count_docids, word_prefix_position_docids, + word_prefix_fid_docids, script_language_docids, facet_id_f64_docids, facet_id_string_docids, @@ -81,8 +83,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_pair_proximity_docids.clear(self.wtxn)?; prefix_word_pair_proximity_docids.clear(self.wtxn)?; word_position_docids.clear(self.wtxn)?; + word_fid_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; word_prefix_position_docids.clear(self.wtxn)?; + word_prefix_fid_docids.clear(self.wtxn)?; script_language_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; facet_id_exists_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index eeb67b829..47a7bde4c 100644 
--- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,8 +2,8 @@ use std::collections::btree_map::Entry; use std::collections::{HashMap, HashSet}; use fst::IntoStreamer; -use heed::types::{ByteSlice, DecodeIgnore, Str}; -use heed::Database; +use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice}; +use heed::{BytesDecode, BytesEncode, Database, RwIter}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; @@ -239,6 +239,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, + word_fid_docids, + word_prefix_fid_docids, facet_id_f64_docids: _, facet_id_string_docids: _, field_id_docid_facet_f64s: _, @@ -361,97 +363,34 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { // We delete the documents ids from the word prefix pair proximity database docids // and remove the empty pairs too. - let db = db.remap_key_type::(); - let mut iter = db.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (key, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } + Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?; } - - // We delete the documents ids that are under the pairs of words, - // it is faster and use no memory to iterate over all the words pairs than - // to compute the cartesian product of every words of the deleted documents. - let mut iter = - word_pair_proximity_docids.remap_key_type::().iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? }; - } - } - - drop(iter); - - // We delete the documents ids that are under the word level position docids. - let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? }; - } - } - - drop(iter); - - // We delete the documents ids that are under the word prefix level position docids. 
- let mut iter = - word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? }; - } - } - - drop(iter); + Self::delete_from_db( + word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; + Self::delete_from_db( + word_position_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; + Self::delete_from_db( + word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; + Self::delete_from_db( + word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; + Self::delete_from_db( + word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; // Remove the documents ids from the field id word count database. - let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?; - while let Some((key, mut docids)) = iter.next().transpose()? { - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } - - drop(iter); + Self::delete_from_db( + field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; @@ -501,21 +440,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // Remove the documents ids from the script language database. - let mut iter = script_language_docids.iter_mut(self.wtxn)?; - while let Some((key, mut docids)) = iter.next().transpose()? { - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } - - drop(iter); + Self::delete_from_db( + script_language_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_id_exists_docids( self.wtxn, @@ -531,6 +459,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { soft_deletion_used: false, }) } + + fn delete_from_db( + mut iter: RwIter, C>, + to_delete_docids: &RoaringBitmap, + ) -> Result<()> + where + C: for<'a> BytesDecode<'a, DItem = RoaringBitmap> + + for<'a> BytesEncode<'a, EItem = RoaringBitmap>, + { + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids -= to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. 
+ unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &docids)? }; + } + } + Ok(()) + } } fn remove_from_word_prefix_docids( diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index d95db4157..cd3ec691b 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -7,14 +7,17 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{DocumentId, Result}; +use crate::{ + absolute_from_relative_position, bucketed_position, relative_from_absolute_position, + DocumentId, Result, +}; /// Extracts the word positions and the documents ids where this word appear. /// /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_word_position_docids( +pub fn extract_word_fid_and_position_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { @@ -39,11 +42,15 @@ pub fn extract_word_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); + let (fid, position) = relative_from_absolute_position(position); + let position = bucketed_position(position); + let position = absolute_from_relative_position(fid, position); key_buffer.extend_from_slice(&position.to_be_bytes()); - word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } } - sorter_into_reader(word_position_docids_sorter, indexer) + let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?; + + Ok(word_position_docids_reader) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c0f07cf79..844efed36 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -23,7 +23,7 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; -use self::extract_word_position_docids::extract_word_position_docids; +use self::extract_word_position_docids::extract_word_fid_and_position_docids; use super::helpers::{ as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, @@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents( docid_word_positions_chunks, indexer, lmdb_writer_sx.clone(), - extract_word_position_docids, + extract_word_fid_and_position_docids, merge_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 6f12dde38..0822d0d26 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -8,13 +8,13 @@ use heed::{BytesDecode, BytesEncode}; use log::debug; use crate::error::SerializationError; -use crate::heed_codec::StrBEU32Codec; +use 
crate::heed_codec::{StrBEU16Codec, StrBEU32Codec}; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, CursorClonableMmap, MergeFn, }; -use crate::{Index, Result}; +use crate::{bucketed_position, relative_from_absolute_position, Index, Result}; pub struct WordPrefixPositionDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -82,6 +82,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { let mut prefixes_cache = HashMap::new(); while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + let (_fid, pos) = relative_from_absolute_position(pos); current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), @@ -127,12 +128,12 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { let iter = db .remap_key_type::() .prefix_iter(self.wtxn, prefix_bytes.as_bytes())? - .remap_key_type::(); + .remap_key_type::(); for result in iter { let ((word, pos), data) = result?; if word.starts_with(prefix) { let key = (prefix, pos); - let bytes = StrBEU32Codec::bytes_encode(&key).unwrap(); + let bytes = StrBEU16Codec::bytes_encode(&key).unwrap(); prefix_position_docids_sorter.insert(bytes, data)?; } } From f5f5f03ec0fd67973e310c89d2ea369632cec148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 23 Mar 2023 09:35:53 +0100 Subject: [PATCH 083/234] Remove old criteria code --- milli/src/search/criteria/asc_desc.rs | 569 ---------- milli/src/search/criteria/attribute.rs | 710 ------------ milli/src/search/criteria/exactness.rs | 766 ------------- milli/src/search/criteria/final.rs | 77 -- milli/src/search/criteria/geo.rs | 154 --- milli/src/search/criteria/initial.rs | 82 -- milli/src/search/criteria/mod.rs | 1049 ------------------ milli/src/search/criteria/proximity.rs | 712 ------------ milli/src/search/criteria/typo.rs | 493 -------- milli/src/search/criteria/words.rs | 106 -- milli/src/search/distinct/facet_distinct.rs | 218 ---- milli/src/search/distinct/mod.rs | 155 --- milli/src/search/distinct/noop_distinct.rs | 55 - milli/src/search/facet/facet_distribution.rs | 4 +- milli/src/search/facet/mod.rs | 36 +- milli/src/search/mod.rs | 205 +--- milli/src/search/new/mod.rs | 40 +- milli/src/search/new/query_term.rs | 2 +- 18 files changed, 88 insertions(+), 5345 deletions(-) delete mode 100644 milli/src/search/criteria/asc_desc.rs delete mode 100644 milli/src/search/criteria/attribute.rs delete mode 100644 milli/src/search/criteria/exactness.rs delete mode 100644 milli/src/search/criteria/final.rs delete mode 100644 milli/src/search/criteria/geo.rs delete mode 100644 milli/src/search/criteria/initial.rs delete mode 100644 milli/src/search/criteria/mod.rs delete mode 100644 milli/src/search/criteria/proximity.rs delete mode 100644 milli/src/search/criteria/typo.rs delete mode 100644 milli/src/search/criteria/words.rs delete mode 100644 milli/src/search/distinct/facet_distinct.rs delete mode 100644 milli/src/search/distinct/mod.rs delete mode 100644 milli/src/search/distinct/noop_distinct.rs diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs deleted file mode 100644 index 378e1c8da..000000000 --- a/milli/src/search/criteria/asc_desc.rs +++ /dev/null @@ -1,569 +0,0 @@ -use std::mem::take; - -use heed::BytesDecode; -use itertools::Itertools; -use 
log::debug; -use ordered_float::OrderedFloat; -use roaring::RoaringBitmap; - -use super::{Criterion, CriterionParameters, CriterionResult}; -use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; -use crate::heed_codec::ByteSliceRefCodec; -use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates}; -use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; -use crate::search::query_tree::Operation; -use crate::search::CriterionImplementationStrategy; -use crate::{FieldId, Index, Result}; - -/// Threshold on the number of candidates that will make -/// the system to choose between one algorithm or another. -const CANDIDATES_THRESHOLD: u64 = 1000; - -pub struct AscDesc<'t> { - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, - field_name: String, - field_id: Option, - is_ascending: bool, - query_tree: Option, - candidates: Box> + 't>, - allowed_candidates: RoaringBitmap, - initial_candidates: InitialCandidates, - faceted_candidates: RoaringBitmap, - implementation_strategy: CriterionImplementationStrategy, - parent: Box, -} - -impl<'t> AscDesc<'t> { - pub fn asc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - parent: Box, - field_name: String, - implementation_strategy: CriterionImplementationStrategy, - ) -> Result { - Self::new(index, rtxn, parent, field_name, true, implementation_strategy) - } - - pub fn desc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - parent: Box, - field_name: String, - implementation_strategy: CriterionImplementationStrategy, - ) -> Result { - Self::new(index, rtxn, parent, field_name, false, implementation_strategy) - } - - fn new( - index: &'t Index, - rtxn: &'t heed::RoTxn, - parent: Box, - field_name: String, - is_ascending: bool, - implementation_strategy: CriterionImplementationStrategy, - ) -> Result { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let field_id = fields_ids_map.id(&field_name); - let faceted_candidates = match field_id { - Some(field_id) => { - let number_faceted = - index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?; - let string_faceted = - index.faceted_documents_ids(rtxn, field_id, FacetType::String)?; - number_faceted | string_faceted - } - None => RoaringBitmap::default(), - }; - - Ok(AscDesc { - index, - rtxn, - field_name, - field_id, - is_ascending, - query_tree: None, - candidates: Box::new(std::iter::empty()), - allowed_candidates: RoaringBitmap::new(), - faceted_candidates, - initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), - implementation_strategy, - parent, - }) - } -} - -impl<'t> Criterion for AscDesc<'t> { - #[logging_timer::time("AscDesc::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - // remove excluded candidates when next is called, instead of doing it in the loop. - self.allowed_candidates -= params.excluded_candidates; - - loop { - debug!( - "Facet {}({}) iteration", - if self.is_ascending { "Asc" } else { "Desc" }, - self.field_name - ); - - match self.candidates.next().transpose()? { - None if !self.allowed_candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), - candidates: Some(take(&mut self.allowed_candidates)), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - None => match self.parent.next(params)? 
{ - Some(CriterionResult { - query_tree, - candidates, - filtered_candidates, - initial_candidates, - }) => { - self.query_tree = query_tree; - let mut candidates = match (&self.query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => { - let context = CriteriaBuilder::new(self.rtxn, self.index)?; - resolve_query_tree(&context, qt, params.wdcache)? - } - (None, None) => self.index.documents_ids(self.rtxn)?, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; - } - - match initial_candidates { - Some(initial_candidates) => { - self.initial_candidates |= initial_candidates - } - None => self.initial_candidates.map_inplace(|c| c | &candidates), - } - - if candidates.is_empty() { - continue; - } - - self.allowed_candidates = &candidates - params.excluded_candidates; - self.candidates = match self.field_id { - Some(field_id) => facet_ordered( - self.index, - self.rtxn, - field_id, - self.is_ascending, - candidates & &self.faceted_candidates, - self.implementation_strategy, - )?, - None => Box::new(std::iter::empty()), - }; - } - None => return Ok(None), - }, - Some(mut candidates) => { - candidates -= params.excluded_candidates; - self.allowed_candidates -= &candidates; - return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), - candidates: Some(candidates), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - } - } - } -} - -fn facet_ordered_iterative<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, -) -> Result> + 't>> { - let number_iter = iterative_facet_number_ordered_iter( - index, - rtxn, - field_id, - is_ascending, - candidates.clone(), - )?; - let string_iter = - iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; - Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) -} - -fn facet_extreme_value<'t>( - mut extreme_it: impl Iterator> + 't, -) -> Result> { - let extreme_value = - if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) }; - let (_, extreme_value) = extreme_value?; - - Ok(OrderedF64Codec::bytes_decode(extreme_value)) -} - -pub fn facet_min_value<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - candidates: RoaringBitmap, -) -> Result> { - let db = index.facet_id_f64_docids.remap_key_type::>(); - let it = ascending_facet_sort(rtxn, db, field_id, candidates)?; - facet_extreme_value(it) -} - -pub fn facet_max_value<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - candidates: RoaringBitmap, -) -> Result> { - let db = index.facet_id_f64_docids.remap_key_type::>(); - let it = descending_facet_sort(rtxn, db, field_id, candidates)?; - facet_extreme_value(it) -} - -fn facet_ordered_set_based<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, -) -> Result> + 't>> { - let number_db = - index.facet_id_f64_docids.remap_key_type::>(); - let string_db = - index.facet_id_string_docids.remap_key_type::>(); - - let (number_iter, string_iter) = if is_ascending { - let number_iter = ascending_facet_sort(rtxn, number_db, field_id, candidates.clone())?; - let string_iter = ascending_facet_sort(rtxn, string_db, field_id, candidates)?; - - (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter)) - } else { - let number_iter = descending_facet_sort(rtxn, number_db, 
field_id, candidates.clone())?; - let string_iter = descending_facet_sort(rtxn, string_db, field_id, candidates)?; - - (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) - }; - - Ok(Box::new(number_iter.chain(string_iter).map(|res| res.map(|(doc_ids, _)| doc_ids)))) -} - -/// Returns an iterator over groups of the given candidates in ascending or descending order. -/// -/// It will either use an iterative or a recursive method on the whole facet database depending -/// on the number of candidates to rank. -fn facet_ordered<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, - implementation_strategy: CriterionImplementationStrategy, -) -> Result> + 't>> { - match implementation_strategy { - CriterionImplementationStrategy::OnlyIterative => { - facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates) - } - CriterionImplementationStrategy::OnlySetBased => { - facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates) - } - CriterionImplementationStrategy::Dynamic => { - if candidates.len() <= CANDIDATES_THRESHOLD { - facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates) - } else { - facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates) - } - } - } -} - -/// Fetch the whole list of candidates facet number values one by one and order them by it. -/// -/// This function is fast when the amount of candidates to rank is small. -fn iterative_facet_number_ordered_iter<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, -) -> Result + 't> { - let mut docids_values = Vec::with_capacity(candidates.len() as usize); - for docid in candidates.iter() { - let left = (field_id, docid, f64::MIN); - let right = (field_id, docid, f64::MAX); - let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?; - let entry = if is_ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), ())) = entry.transpose()? { - docids_values.push((docid, OrderedFloat(value))); - } - } - docids_values.sort_unstable_by_key(|(_, v)| *v); - let iter = docids_values.into_iter(); - let iter = if is_ascending { - Box::new(iter) as Box> - } else { - Box::new(iter.rev()) - }; - - // The itertools GroupBy iterator doesn't provide an owned version, we are therefore - // required to collect the result into an owned collection (a Vec). - // https://github.com/rust-itertools/itertools/issues/499 - #[allow(clippy::needless_collect)] - let vec: Vec<_> = iter - .group_by(|(_, v)| *v) - .into_iter() - .map(|(_, ids)| ids.map(|(id, _)| id).collect()) - .collect(); - - Ok(vec.into_iter()) -} - -/// Fetch the whole list of candidates facet string values one by one and order them by it. -/// -/// This function is fast when the amount of candidates to rank is small. -fn iterative_facet_string_ordered_iter<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, -) -> Result + 't> { - let mut docids_values = Vec::with_capacity(candidates.len() as usize); - for docid in candidates.iter() { - let left = (field_id, docid, ""); - let right = (field_id, docid.saturating_add(1), ""); - // FIXME Doing this means that it will never be possible to retrieve - // the document with id 2^32, not sure this is a real problem. 
- let mut iter = index.field_id_docid_facet_strings.range(rtxn, &(left..right))?; - let entry = if is_ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), _)) = entry.transpose()? { - docids_values.push((docid, value)); - } - } - docids_values.sort_unstable_by_key(|(_, v)| *v); - let iter = docids_values.into_iter(); - let iter = if is_ascending { - Box::new(iter) as Box> - } else { - Box::new(iter.rev()) - }; - - // The itertools GroupBy iterator doesn't provide an owned version, we are therefore - // required to collect the result into an owned collection (a Vec). - // https://github.com/rust-itertools/itertools/issues/499 - #[allow(clippy::needless_collect)] - let vec: Vec<_> = iter - .group_by(|(_, v)| *v) - .into_iter() - .map(|(_, ids)| ids.map(|(id, _)| id).collect()) - .collect(); - - Ok(vec.into_iter()) -} - -#[cfg(test)] -mod tests { - use std::str::FromStr; - - use big_s::S; - use maplit::hashset; - - use crate::index::tests::TempIndex; - use crate::{AscDesc, Criterion, Filter, Search, SearchResult}; - - // Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THESHOLD - // constant to 0 to ensure that the other sort algorithms are also correct. - #[test] - fn sort_criterion_placeholder() { - let index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings - .set_sortable_fields(maplit::hashset! { S("id"), S("mod_10"), S("mod_20") }); - settings.set_criteria(vec![Criterion::Sort]); - }) - .unwrap(); - - let mut docs = vec![]; - for i in 0..100 { - docs.push( - serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }), - ); - } - - index.add_documents(documents!(docs)).unwrap(); - - let all_ids = (0..100).collect::>(); - - let rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&rtxn, &index); - search.sort_criteria(vec![AscDesc::from_str("mod_10:desc").unwrap()]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 19, 29, 39, 49, 59, 69, 79, 89, 99, 8, 18, 28, 38, 48, 58, 68, 78, 88, 98, 7, 17, 27, 37, 47, 57, 67, 77, 87, 97, 6, 16, 26, 36, 46, 56, 66, 76, 86, 96, 5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 4, 14, 24, 34, 44, 54, 64, 74, 84, 94, 3, 13, 23, 33, 43, 53, 63, 73, 83, 93, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90]"); - documents_ids.sort(); - assert_eq!(all_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:desc").unwrap(), - AscDesc::from_str("id:desc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. 
} = search.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 89, 79, 69, 59, 49, 39, 29, 19, 9, 98, 88, 78, 68, 58, 48, 38, 28, 18, 8, 97, 87, 77, 67, 57, 47, 37, 27, 17, 7, 96, 86, 76, 66, 56, 46, 36, 26, 16, 6, 95, 85, 75, 65, 55, 45, 35, 25, 15, 5, 94, 84, 74, 64, 54, 44, 34, 24, 14, 4, 93, 83, 73, 63, 53, 43, 33, 23, 13, 3, 92, 82, 72, 62, 52, 42, 32, 22, 12, 2, 91, 81, 71, 61, 51, 41, 31, 21, 11, 1, 90, 80, 70, 60, 50, 40, 30, 20, 10, 0]"); - documents_ids.sort(); - assert_eq!(all_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:desc").unwrap(), - AscDesc::from_str("mod_20:asc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 29, 49, 69, 89, 19, 39, 59, 79, 99, 8, 28, 48, 68, 88, 18, 38, 58, 78, 98, 7, 27, 47, 67, 87, 17, 37, 57, 77, 97, 6, 26, 46, 66, 86, 16, 36, 56, 76, 96, 5, 25, 45, 65, 85, 15, 35, 55, 75, 95, 4, 24, 44, 64, 84, 14, 34, 54, 74, 94, 3, 23, 43, 63, 83, 13, 33, 53, 73, 93, 2, 22, 42, 62, 82, 12, 32, 52, 72, 92, 1, 21, 41, 61, 81, 11, 31, 51, 71, 91, 0, 20, 40, 60, 80, 10, 30, 50, 70, 90]"); - documents_ids.sort(); - assert_eq!(all_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:desc").unwrap(), - AscDesc::from_str("mod_20:desc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 39, 59, 79, 99, 9, 29, 49, 69, 89, 18, 38, 58, 78, 98, 8, 28, 48, 68, 88, 17, 37, 57, 77, 97, 7, 27, 47, 67, 87, 16, 36, 56, 76, 96, 6, 26, 46, 66, 86, 15, 35, 55, 75, 95, 5, 25, 45, 65, 85, 14, 34, 54, 74, 94, 4, 24, 44, 64, 84, 13, 33, 53, 73, 93, 3, 23, 43, 63, 83, 12, 32, 52, 72, 92, 2, 22, 42, 62, 82, 11, 31, 51, 71, 91, 1, 21, 41, 61, 81, 10, 30, 50, 70, 90, 0, 20, 40, 60, 80]"); - documents_ids.sort(); - assert_eq!(all_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:desc").unwrap(), - AscDesc::from_str("mod_20:desc").unwrap(), - AscDesc::from_str("id:desc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 79, 59, 39, 19, 89, 69, 49, 29, 9, 98, 78, 58, 38, 18, 88, 68, 48, 28, 8, 97, 77, 57, 37, 17, 87, 67, 47, 27, 7, 96, 76, 56, 36, 16, 86, 66, 46, 26, 6, 95, 75, 55, 35, 15, 85, 65, 45, 25, 5, 94, 74, 54, 34, 14, 84, 64, 44, 24, 4, 93, 73, 53, 33, 13, 83, 63, 43, 23, 3, 92, 72, 52, 32, 12, 82, 62, 42, 22, 2, 91, 71, 51, 31, 11, 81, 61, 41, 21, 1, 90, 70, 50, 30, 10, 80, 60, 40, 20, 0]"); - documents_ids.sort(); - assert_eq!(all_ids, documents_ids); - } - - // Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THESHOLD - // constant to 0 to ensure that the other sort algorithms are also correct. - #[test] - fn sort_criterion_non_placeholder() { - let index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") }); - settings.set_sortable_fields(hashset! 
{ S("id"), S("mod_10"), S("mod_20") }); - settings.set_criteria(vec![Criterion::Sort]); - }) - .unwrap(); - - let mut docs = vec![]; - for i in 0..100 { - docs.push( - serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }), - ); - } - - index.add_documents(documents!(docs)).unwrap(); - - let rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&rtxn, &index); - search.filter( - Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]") - .unwrap() - .unwrap(), - ); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:desc").unwrap(), - AscDesc::from_str("mod_20:asc").unwrap(), - AscDesc::from_str("id:desc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - // The order should be in increasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 93, 73, 53, 33, 13, 82, 62, 42, 22, 2, 92, 72, 52, 32, 12, 81, 61, 41, 21, 1, 91, 71, 51, 31, 11, 80, 60, 40, 20, 0, 90, 70, 50, 30, 10]"); - let expected_ids = (0..100) - .filter(|id| { - [1, 0, 2].contains(&(id % 10)) - || [10, 13].contains(&(id % 20)) - || [5, 6].contains(id) - }) - .collect::>(); - documents_ids.sort(); - assert_eq!(expected_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.filter( - Filter::from_str("mod_10 IN [7, 8, 0] OR mod_20 IN [1, 15, 16] OR id IN [0, 4]") - .unwrap() - .unwrap(), - ); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:asc").unwrap(), - AscDesc::from_str("mod_20:asc").unwrap(), - AscDesc::from_str("id:desc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - // The order should be in increasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[80, 60, 40, 20, 0, 90, 70, 50, 30, 10, 81, 61, 41, 21, 1, 4, 95, 75, 55, 35, 15, 96, 76, 56, 36, 16, 87, 67, 47, 27, 7, 97, 77, 57, 37, 17, 88, 68, 48, 28, 8, 98, 78, 58, 38, 18]"); - let expected_ids = (0..100) - .filter(|id| { - [7, 8, 0].contains(&(id % 10)) - || [1, 15, 16].contains(&(id % 20)) - || [0, 4].contains(id) - }) - .collect::>(); - documents_ids.sort(); - assert_eq!(expected_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.filter( - Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]") - .unwrap() - .unwrap(), - ); - search.sort_criteria(vec![AscDesc::from_str("id:desc").unwrap()]); - search.limit(100); - - let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); - // The order should be in decreasing value of the id - let mut expected_ids = (0..100) - .filter(|id| { - [1, 0, 2].contains(&(id % 10)) - || [10, 13].contains(&(id % 20)) - || [5, 6].contains(id) - }) - .collect::>(); - expected_ids.sort(); - expected_ids.reverse(); - assert_eq!(expected_ids, documents_ids); - } -} diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs deleted file mode 100644 index 322f6e051..000000000 --- a/milli/src/search/criteria/attribute.rs +++ /dev/null @@ -1,710 +0,0 @@ -use std::cmp::{self, Ordering}; -use std::collections::binary_heap::PeekMut; -use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap}; -use std::iter::Peekable; -use std::mem::take; - -use roaring::RoaringBitmap; - -use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::{InitialCandidates, Query}; -use crate::search::query_tree::{Operation, QueryKind}; -use crate::search::{ - build_dfa, word_derivations, CriterionImplementationStrategy, WordDerivationsCache, -}; -use crate::Result; - -/// To be able to divide integers by the number of words in the query -/// we want to find a multiplier that allow us to divide by any number between 1 and 10. -/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). -const LCM_10_FIRST_NUMBERS: u32 = 2520; - -/// Threshold on the number of candidates that will make -/// the system to choose between one algorithm or another. -const CANDIDATES_THRESHOLD: u64 = 500; - -type FlattenedQueryTree = Vec>>; - -pub struct Attribute<'t> { - ctx: &'t dyn Context<'t>, - state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, - initial_candidates: InitialCandidates, - parent: Box, - linear_buckets: Option>, - set_buckets: Option>>, - implementation_strategy: CriterionImplementationStrategy, -} - -impl<'t> Attribute<'t> { - pub fn new( - ctx: &'t dyn Context<'t>, - parent: Box, - implementation_strategy: CriterionImplementationStrategy, - ) -> Self { - Attribute { - ctx, - state: None, - initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), - parent, - linear_buckets: None, - set_buckets: None, - implementation_strategy, - } - } -} - -impl<'t> Criterion for Attribute<'t> { - #[logging_timer::time("Attribute::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - // remove excluded candidates when next is called, instead of doing it in the loop. 
- if let Some((_, _, allowed_candidates)) = self.state.as_mut() { - *allowed_candidates -= params.excluded_candidates; - } - - loop { - match self.state.take() { - Some((query_tree, _, allowed_candidates)) if allowed_candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: Some(query_tree), - candidates: Some(RoaringBitmap::new()), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { - let found_candidates = if matches!( - self.implementation_strategy, - CriterionImplementationStrategy::OnlyIterative - ) || (matches!( - self.implementation_strategy, - CriterionImplementationStrategy::Dynamic - ) && allowed_candidates.len() - < CANDIDATES_THRESHOLD) - { - let linear_buckets = match self.linear_buckets.as_mut() { - Some(linear_buckets) => linear_buckets, - None => { - let new_buckets = initialize_linear_buckets( - self.ctx, - &flattened_query_tree, - &allowed_candidates, - )?; - self.linear_buckets.get_or_insert(new_buckets.into_iter()) - } - }; - - match linear_buckets.next() { - Some((_score, candidates)) => candidates, - None => { - return Ok(Some(CriterionResult { - query_tree: Some(query_tree), - candidates: Some(RoaringBitmap::new()), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - } - } else { - let set_buckets = match self.set_buckets.as_mut() { - Some(set_buckets) => set_buckets, - None => { - let new_buckets = initialize_set_buckets( - self.ctx, - &flattened_query_tree, - &allowed_candidates, - params.wdcache, - )?; - self.set_buckets.get_or_insert(new_buckets) - } - }; - - match set_compute_candidates(set_buckets, &allowed_candidates)? { - Some((_score, candidates)) => candidates, - None => { - return Ok(Some(CriterionResult { - query_tree: Some(query_tree), - candidates: Some(allowed_candidates), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - } - }; - - allowed_candidates -= &found_candidates; - - self.state = - Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); - - return Ok(Some(CriterionResult { - query_tree: Some(query_tree), - candidates: Some(found_candidates), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - None => match self.parent.next(params)? { - Some(CriterionResult { - query_tree: Some(query_tree), - candidates, - filtered_candidates, - initial_candidates, - }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => { - resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
- - params.excluded_candidates - } - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; - } - - let flattened_query_tree = flatten_query_tree(&query_tree); - - match initial_candidates { - Some(initial_candidates) => { - self.initial_candidates |= initial_candidates - } - None => self.initial_candidates.map_inplace(|c| c | &candidates), - } - - self.state = Some((query_tree, flattened_query_tree, candidates)); - self.linear_buckets = None; - } - Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - })); - } - None => return Ok(None), - }, - } - } - } -} - -/// QueryPositionIterator is an Iterator over positions of a Query, -/// It contains iterators over words positions. -struct QueryPositionIterator<'t> { - #[allow(clippy::type_complexity)] - inner: - Vec> + 't>>>, -} - -impl<'t> QueryPositionIterator<'t> { - fn new( - ctx: &'t dyn Context<'t>, - queries: &[Query], - wdcache: &mut WordDerivationsCache, - ) -> Result { - let mut inner = Vec::with_capacity(queries.len()); - for query in queries { - let in_prefix_cache = query.prefix && ctx.in_prefix_cache(query.kind.word()); - match &query.kind { - QueryKind::Exact { word, .. } => { - if !query.prefix || in_prefix_cache { - let word = query.kind.word(); - let iter = ctx.word_position_iterator(word, in_prefix_cache)?; - inner.push(iter.peekable()); - } else { - for (word, _) in word_derivations(word, true, 0, ctx.words_fst(), wdcache)? - { - let iter = ctx.word_position_iterator(word, in_prefix_cache)?; - inner.push(iter.peekable()); - } - } - } - QueryKind::Tolerant { typo, word } => { - for (word, _) in - word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)? - { - let iter = ctx.word_position_iterator(word, in_prefix_cache)?; - inner.push(iter.peekable()); - } - } - }; - } - - Ok(Self { inner }) - } -} - -impl<'t> Iterator for QueryPositionIterator<'t> { - type Item = heed::Result<(u16, RoaringBitmap)>; - - fn next(&mut self) -> Option { - // sort inner words from the closest next position to the farthest next position. - let expected_pos = self - .inner - .iter_mut() - .filter_map(|wli| match wli.peek() { - Some(Ok(((_, pos), _))) => Some(*pos), - _ => None, - }) - .min()?; - - let mut candidates = None; - for wli in self.inner.iter_mut() { - if let Some(Ok(((_, pos), _))) = wli.peek() { - if *pos > expected_pos { - continue; - } - } - - match wli.next() { - Some(Ok((_, docids))) => { - candidates = match candidates.take() { - Some(candidates) => Some(candidates | docids), - None => Some(docids), - } - } - Some(Err(e)) => return Some(Err(e)), - None => continue, - } - } - - candidates.map(|candidates| Ok((expected_pos, candidates))) - } -} - -/// A Branch is represent a possible alternative of the original query and is build with the Query Tree, -/// This branch allows us to iterate over meta-interval of positions. 
-struct Branch<'t> { - query_level_iterator: Vec<(u16, RoaringBitmap, Peekable>)>, - last_result: (u16, RoaringBitmap), - branch_size: u16, -} - -impl<'t> Branch<'t> { - fn new( - ctx: &'t dyn Context<'t>, - flatten_branch: &[Vec], - wdcache: &mut WordDerivationsCache, - allowed_candidates: &RoaringBitmap, - ) -> Result { - let mut query_level_iterator = Vec::new(); - for queries in flatten_branch { - let mut qli = QueryPositionIterator::new(ctx, queries, wdcache)?.peekable(); - let (pos, docids) = qli.next().transpose()?.unwrap_or((0, RoaringBitmap::new())); - query_level_iterator.push((pos, docids & allowed_candidates, qli)); - } - - let mut branch = Self { - query_level_iterator, - last_result: (0, RoaringBitmap::new()), - branch_size: flatten_branch.len() as u16, - }; - - branch.update_last_result(); - - Ok(branch) - } - - /// return the next meta-interval of the branch, - /// and update inner interval in order to be ranked by the BinaryHeap. - fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result { - // update the first query. - let index = self.lowest_iterator_index(); - match self.query_level_iterator.get_mut(index) { - Some((cur_pos, cur_docids, qli)) => match qli.next().transpose()? { - Some((next_pos, next_docids)) => { - *cur_pos = next_pos; - *cur_docids |= next_docids & allowed_candidates; - self.update_last_result(); - Ok(true) - } - None => Ok(false), - }, - None => Ok(false), - } - } - - fn lowest_iterator_index(&mut self) -> usize { - let (index, _) = self - .query_level_iterator - .iter_mut() - .map(|(pos, docids, qli)| { - if docids.is_empty() { - 0 - } else { - match qli.peek() { - Some(result) => { - result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0) - } - None => u16::MAX, - } - } - }) - .enumerate() - .min_by_key(|(_, diff)| *diff) - .unwrap_or((0, 0)); - - index - } - - fn update_last_result(&mut self) { - let mut result_pos = 0; - let mut result_docids = None; - - for (pos, docids, _qli) in self.query_level_iterator.iter() { - result_pos += pos; - result_docids = result_docids - .take() - .map_or_else(|| Some(docids.clone()), |candidates| Some(candidates & docids)); - } - - // remove last result docids from inner iterators - if let Some(docids) = result_docids.as_ref() { - for (_, query_docids, _) in self.query_level_iterator.iter_mut() { - *query_docids -= docids; - } - } - - self.last_result = (result_pos, result_docids.unwrap_or_default()); - } - - /// return the score of the current inner interval. - fn compute_rank(&self) -> u32 { - // we compute a rank from the position. - let (pos, _) = self.last_result; - pos.saturating_sub((0..self.branch_size).sum()) as u32 * LCM_10_FIRST_NUMBERS - / self.branch_size as u32 - } - - fn cmp(&self, other: &Self) -> Ordering { - let self_rank = self.compute_rank(); - let other_rank = other.compute_rank(); - - // lower rank is better, and because BinaryHeap give the higher ranked branch, we reverse it. 
- self_rank.cmp(&other_rank).reverse() - } -} - -impl<'t> Ord for Branch<'t> { - fn cmp(&self, other: &Self) -> Ordering { - self.cmp(other) - } -} - -impl<'t> PartialOrd for Branch<'t> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl<'t> PartialEq for Branch<'t> { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl<'t> Eq for Branch<'t> {} - -fn initialize_set_buckets<'t>( - ctx: &'t dyn Context<'t>, - branches: &FlattenedQueryTree, - allowed_candidates: &RoaringBitmap, - wdcache: &mut WordDerivationsCache, -) -> Result>> { - let mut heap = BinaryHeap::new(); - for flatten_branch in branches { - let branch = Branch::new(ctx, flatten_branch, wdcache, allowed_candidates)?; - heap.push(branch); - } - - Ok(heap) -} - -fn set_compute_candidates( - branches_heap: &mut BinaryHeap, - allowed_candidates: &RoaringBitmap, -) -> Result> { - let mut final_candidates: Option<(u32, RoaringBitmap)> = None; - let mut allowed_candidates = allowed_candidates.clone(); - - while let Some(mut branch) = branches_heap.peek_mut() { - // if current is worst than best we break to return - // candidates that correspond to the best rank - let branch_rank = branch.compute_rank(); - if let Some((best_rank, _)) = final_candidates { - if branch_rank > best_rank { - break; - } - } - - let candidates = take(&mut branch.last_result.1); - if candidates.is_empty() { - // we don't have candidates, get next interval. - if !branch.next(&allowed_candidates)? { - PeekMut::pop(branch); - } - } else { - allowed_candidates -= &candidates; - final_candidates = match final_candidates.take() { - // we add current candidates to best candidates - Some((best_rank, mut best_candidates)) => { - best_candidates |= candidates; - branch.next(&allowed_candidates)?; - Some((best_rank, best_candidates)) - } - // we take current candidates as best candidates - None => { - branch.next(&allowed_candidates)?; - Some((branch_rank, candidates)) - } - }; - } - } - - Ok(final_candidates) -} - -fn initialize_linear_buckets( - ctx: &dyn Context, - branches: &FlattenedQueryTree, - allowed_candidates: &RoaringBitmap, -) -> Result> { - fn compute_candidate_rank( - branches: &FlattenedQueryTree, - words_positions: HashMap, - ) -> u64 { - let mut min_rank = u64::max_value(); - for branch in branches { - let branch_len = branch.len(); - let mut branch_rank = Vec::with_capacity(branch_len); - for derivates in branch { - let mut position = None; - for Query { prefix, kind } in derivates { - // find the best position of the current word in the document. - let current_position = match kind { - QueryKind::Exact { word, .. } => { - if *prefix { - word_derivations(word, true, 0, &words_positions) - .flat_map(|positions| positions.iter().next()) - .min() - } else { - words_positions - .get(word) - .and_then(|positions| positions.iter().next()) - } - } - QueryKind::Tolerant { typo, word } => { - word_derivations(word, *prefix, *typo, &words_positions) - .flat_map(|positions| positions.iter().next()) - .min() - } - }; - - match (position, current_position) { - (Some(p), Some(cp)) => position = Some(cmp::min(p, cp)), - (None, Some(cp)) => position = Some(cp), - _ => (), - } - } - - // if a position is found, we add it to the branch score, - // otherwise the branch is considered as unfindable in this document and we break. 
- if let Some(position) = position { - branch_rank.push(position as u64); - } else { - branch_rank.clear(); - break; - } - } - - if !branch_rank.is_empty() { - branch_rank.sort_unstable(); - // because several words in same query can't match all a the position 0, - // we substract the word index to the position. - let branch_rank: u64 = - branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); - // here we do the means of the words of the branch - min_rank = - min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); - } - } - - min_rank - } - - fn word_derivations<'a>( - word: &str, - is_prefix: bool, - max_typo: u8, - words_positions: &'a HashMap, - ) -> impl Iterator { - let dfa = build_dfa(word, max_typo, is_prefix); - words_positions.iter().filter_map(move |(document_word, positions)| { - use levenshtein_automata::Distance; - match dfa.eval(document_word) { - Distance::Exact(_) => Some(positions), - Distance::AtLeast(_) => None, - } - }) - } - - let mut candidates = BTreeMap::new(); - for docid in allowed_candidates { - let words_positions = ctx.docid_words_positions(docid)?; - let rank = compute_candidate_rank(branches, words_positions); - candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid); - } - - Ok(candidates) -} - -// TODO can we keep refs of Query -fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { - use crate::search::criteria::Operation::{And, Or, Phrase}; - - fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree { - match tail.split_first() { - Some((thead, tail)) => { - let tail = and_recurse(thead, tail); - let mut out = Vec::new(); - for array in recurse(head) { - for tail_array in &tail { - let mut array = array.clone(); - array.extend(tail_array.iter().cloned()); - out.push(array); - } - } - out - } - None => recurse(head), - } - } - - fn recurse(op: &Operation) -> FlattenedQueryTree { - match op { - And(ops) => ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)), - Or(_, ops) => { - if ops.iter().all(|op| op.query().is_some()) { - vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] - } else { - ops.iter().flat_map(recurse).collect() - } - } - Phrase(words) => { - let queries = words - .iter() - .filter_map(|w| w.as_ref()) - .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) - .collect(); - vec![queries] - } - Operation::Query(query) => vec![vec![vec![query.clone()]]], - } - } - - recurse(query_tree) -} - -#[cfg(test)] -mod tests { - use big_s::S; - - use super::*; - use crate::search::criteria::QueryKind; - - #[test] - fn simple_flatten_query_tree() { - let query_tree = Operation::Or( - false, - vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), - ]), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact(S("thefish")), - }), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact(S("the")), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact(S("fish")), - }), - ]), - ], - ), - ]), - ], - ); - let result = flatten_query_tree(&query_tree); - - insta::assert_debug_snapshot!(result, @r###" 
- [ - [ - [ - Exact { - word: "manythefish", - }, - ], - ], - [ - [ - Exact { - word: "manythe", - }, - ], - [ - Exact { - word: "fish", - }, - ], - ], - [ - [ - Exact { - word: "many", - }, - ], - [ - Exact { - word: "thefish", - }, - ], - ], - [ - [ - Exact { - word: "many", - }, - ], - [ - Exact { - word: "the", - }, - ], - [ - Exact { - word: "fish", - }, - ], - ], - ] - "###); - } -} diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs deleted file mode 100644 index 078a9cd6c..000000000 --- a/milli/src/search/criteria/exactness.rs +++ /dev/null @@ -1,766 +0,0 @@ -use std::collections::btree_map::Entry; -use std::collections::BTreeMap; -use std::convert::TryFrom; -use std::mem::take; - -use log::debug; -use roaring::{MultiOps, RoaringBitmap}; - -use crate::search::criteria::{ - resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, - InitialCandidates, -}; -use crate::search::query_tree::{Operation, PrimitiveQueryPart}; -use crate::{absolute_from_relative_position, FieldId, Result}; - -pub struct Exactness<'t> { - ctx: &'t dyn Context<'t>, - query_tree: Option, - state: Option, - initial_candidates: InitialCandidates, - parent: Box, - query: Vec, - cache: Option, -} - -impl<'t> Exactness<'t> { - pub fn new( - ctx: &'t dyn Context<'t>, - parent: Box, - primitive_query: &[PrimitiveQueryPart], - ) -> heed::Result { - let mut query: Vec<_> = Vec::with_capacity(primitive_query.len()); - for part in primitive_query { - query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?); - } - - Ok(Exactness { - ctx, - query_tree: None, - state: None, - initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), - parent, - query, - cache: None, - }) - } -} - -impl<'t> Criterion for Exactness<'t> { - #[logging_timer::time("Exactness::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - // remove excluded candidates when next is called, instead of doing it in the loop. - if let Some(state) = self.state.as_mut() { - state.difference_with(params.excluded_candidates); - } - loop { - debug!("Exactness at state {:?}", self.state); - - match self.state.as_mut() { - Some(state) if state.is_empty() => { - // reset state - self.state = None; - self.query_tree = None; - // we don't need to reset the combinations cache since it only depends on - // the primitive query, which does not change - } - Some(state) => { - let (candidates, state) = - resolve_state(self.ctx, take(state), &self.query, &mut self.cache)?; - self.state = state; - - return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), - candidates: Some(candidates), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - None => match self.parent.next(params)? { - Some(CriterionResult { - query_tree: Some(query_tree), - candidates, - filtered_candidates, - initial_candidates, - }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => { - resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
- - params.excluded_candidates - } - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; - } - - match initial_candidates { - Some(initial_candidates) => { - self.initial_candidates |= initial_candidates - } - None => self.initial_candidates.map_inplace(|c| c | &candidates), - } - - self.state = Some(State::new(candidates)); - self.query_tree = Some(query_tree); - } - Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - })); - } - None => return Ok(None), - }, - } - } - } -} - -#[derive(Debug)] -enum State { - /// Extract the documents that have an attribute that contains exactly the query. - ExactAttribute(RoaringBitmap), - /// Extract the documents that have an attribute that starts with exactly the query. - AttributeStartsWith(RoaringBitmap), - /// Rank the remaining documents by the number of exact words contained. - ExactWords(RoaringBitmap), - Remainings(Vec), -} - -impl State { - fn new(candidates: RoaringBitmap) -> Self { - Self::ExactAttribute(candidates) - } - - fn difference_with(&mut self, lhs: &RoaringBitmap) { - match self { - Self::ExactAttribute(candidates) - | Self::AttributeStartsWith(candidates) - | Self::ExactWords(candidates) => *candidates -= lhs, - Self::Remainings(candidates_array) => { - candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs); - candidates_array.retain(|candidates| !candidates.is_empty()); - } - } - } - - fn is_empty(&self) -> bool { - match self { - Self::ExactAttribute(candidates) - | Self::AttributeStartsWith(candidates) - | Self::ExactWords(candidates) => candidates.is_empty(), - Self::Remainings(candidates_array) => { - candidates_array.iter().all(RoaringBitmap::is_empty) - } - } - } -} - -impl Default for State { - fn default() -> Self { - Self::Remainings(vec![]) - } -} -#[logging_timer::time("Exactness::{}")] -fn resolve_state( - ctx: &dyn Context, - state: State, - query: &[ExactQueryPart], - cache: &mut Option, -) -> Result<(RoaringBitmap, Option)> { - use State::*; - match state { - ExactAttribute(mut allowed_candidates) => { - let mut candidates = RoaringBitmap::new(); - if let Ok(query_len) = u8::try_from(query.len()) { - let attributes_ids = ctx.searchable_fields_ids()?; - for id in attributes_ids { - if let Some(attribute_allowed_docids) = - ctx.field_id_word_count_docids(id, query_len)? 
- { - let mut attribute_candidates_array = - attribute_start_with_docids(ctx, id, query)?; - attribute_candidates_array.push(attribute_allowed_docids); - - candidates |= MultiOps::intersection(attribute_candidates_array); - } - } - - // only keep allowed candidates - candidates &= &allowed_candidates; - // remove current candidates from allowed candidates - allowed_candidates -= &candidates; - } - - Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) - } - AttributeStartsWith(mut allowed_candidates) => { - let mut candidates = RoaringBitmap::new(); - let attributes_ids = ctx.searchable_fields_ids()?; - for id in attributes_ids { - let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; - candidates |= MultiOps::intersection(attribute_candidates_array); - } - - // only keep allowed candidates - candidates &= &allowed_candidates; - // remove current candidates from allowed candidates - allowed_candidates -= &candidates; - Ok((candidates, Some(ExactWords(allowed_candidates)))) - } - ExactWords(allowed_candidates) => { - // Retrieve the cache if it already exist, otherwise create it. - let owned_cache = if let Some(cache) = cache.take() { - cache - } else { - compute_combinations(ctx, query)? - }; - // The cache contains the sets of documents which contain exactly 1,2,3,.. exact words - // from the query. It cannot be empty. All the candidates in it are disjoint. - - let mut candidates_array = owned_cache.combinations.clone(); - for candidates in candidates_array.iter_mut() { - *candidates &= &allowed_candidates; - } - *cache = Some(owned_cache); - - let best_candidates = candidates_array.pop().unwrap(); - - candidates_array.insert(0, allowed_candidates); - Ok((best_candidates, Some(Remainings(candidates_array)))) - } - // pop remainings candidates until the emptiness - Remainings(mut candidates_array) => { - let candidates = candidates_array.pop().unwrap_or_default(); - if !candidates_array.is_empty() { - Ok((candidates, Some(Remainings(candidates_array)))) - } else { - Ok((candidates, None)) - } - } - } -} - -fn attribute_start_with_docids( - ctx: &dyn Context, - attribute_id: FieldId, - query: &[ExactQueryPart], -) -> heed::Result> { - let mut attribute_candidates_array = Vec::new(); - // start from attribute first position - let mut pos = absolute_from_relative_position(attribute_id, 0); - for part in query { - use ExactQueryPart::*; - match part { - Synonyms(synonyms) => { - let mut synonyms_candidates = RoaringBitmap::new(); - for word in synonyms { - let wc = ctx.word_position_docids(word, pos)?; - if let Some(word_candidates) = wc { - synonyms_candidates |= word_candidates; - } - } - attribute_candidates_array.push(synonyms_candidates); - pos += 1; - } - Phrase(phrase) => { - for word in phrase { - if let Some(word) = word { - let wc = ctx.word_position_docids(word, pos)?; - if let Some(word_candidates) = wc { - attribute_candidates_array.push(word_candidates); - } - } - pos += 1; - } - } - } - } - - Ok(attribute_candidates_array) -} - -#[derive(Debug, Clone)] -pub enum ExactQueryPart { - Phrase(Vec>), - Synonyms(Vec), -} - -impl ExactQueryPart { - fn from_primitive_query_part( - ctx: &dyn Context, - part: &PrimitiveQueryPart, - ) -> heed::Result { - let part = match part { - PrimitiveQueryPart::Word(word, _) => { - match ctx.synonyms(word)? { - Some(synonyms) => { - let mut synonyms: Vec<_> = synonyms - .into_iter() - .filter_map(|mut array| { - // keep 1 word synonyms only. 
-                                match array.pop() {
-                                    Some(word) if array.is_empty() => Some(word),
-                                    _ => None,
-                                }
-                            })
-                            .collect();
-                        synonyms.push(word.clone());
-                        ExactQueryPart::Synonyms(synonyms)
-                    }
-                    None => ExactQueryPart::Synonyms(vec![word.clone()]),
-                }
-            }
-            PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()),
-        };
-
-        Ok(part)
-    }
-}
-
-struct ExactWordsCombinationCache {
-    // index 0 is only 1 word
-    combinations: Vec<RoaringBitmap>,
-}
-
-fn compute_combinations(
-    ctx: &dyn Context,
-    query: &[ExactQueryPart],
-) -> Result<ExactWordsCombinationCache> {
-    let number_of_part = query.len();
-    let mut parts_candidates_array = Vec::with_capacity(number_of_part);
-    for part in query {
-        let mut candidates = RoaringBitmap::new();
-        use ExactQueryPart::*;
-        match part {
-            Synonyms(synonyms) => {
-                for synonym in synonyms {
-                    if let Some(synonym_candidates) = ctx.word_docids(synonym)? {
-                        candidates |= synonym_candidates;
-                    }
-                }
-            }
-            // compute the intersection on pairs of words with a proximity of 0.
-            Phrase(phrase) => {
-                candidates |= resolve_phrase(ctx, phrase)?;
-            }
-        }
-        parts_candidates_array.push(candidates);
-    }
-    let combinations = create_disjoint_combinations(parts_candidates_array);
-
-    Ok(ExactWordsCombinationCache { combinations })
-}
-
-/// Given a list of bitmaps `b0,b1,...,bn`, compute the list of bitmaps `X0,X1,...,Xn`
-/// such that `Xi` contains all the elements that are contained in **at least** `i+1` bitmaps among `b0,b1,...,bn`.
-///
-/// The returned vector is guaranteed to have the same length as the input list. It is equal to `vec![X0, X1, ..., Xn]`.
-///
-/// ## Implementation
-///
-/// We do so by iteratively building a map containing the union of all the different ways to intersect `J` bitmaps among `b0,b1,...,bn`.
-/// - The key of the map is the index `i` of the last bitmap in the intersections
-/// - The value is the union of all the possible intersections of `J` bitmaps such that the last bitmap in the intersection is `bi`
-///
-/// For example, with the bitmaps `b0,b1,b2,b3`, this map should look like this:
-/// ```text
-/// Map 0: (first iteration, contains all the combinations of 1 bitmap)
-/// // What follows are unions of intersections of bitmaps associated with the index of their last component
-/// 0: [b0]
-/// 1: [b1]
-/// 2: [b2]
-/// 3: [b3]
-/// Map 1: (second iteration, combinations of 2 bitmaps)
-/// 1: [b0&b1]
-/// 2: [b0&b2 | b1&b2]
-/// 3: [b0&b3 | b1&b3 | b2&b3]
-/// Map 2: (third iteration, combinations of 3 bitmaps)
-/// 2: [b0&b1&b2]
-/// 3: [b0&b2&b3 | b1&b2&b3]
-/// Map 3: (fourth iteration, combinations of 4 bitmaps)
-/// 3: [b0&b1&b2&b3]
-/// ```
-///
-/// These maps are built one by one from the content of the preceding map.
-/// For example, to create Map 2, we look at each line of Map 1, for example:
-/// ```text
-/// 2: [b0&b2 | b1&b2]
-/// ```
-/// And then for each i > 2, we compute `(b0&b2 | b1&b2) & bi = b0&b2&bi | b1&b2&bi`
-/// and then add it to the new map (Map 2) under the key `i` (if it is not empty):
-/// ```text
-/// 3: [b0&b2&b3 | b1&b2&b3]
-/// 4: [b0&b2&b4 | b1&b2&b4]
-/// 5: [b0&b2&b5 | b1&b2&b5]
-/// etc.
-/// ```
-/// We only keep two maps in memory at any one point. As soon as Map J is built, we flatten Map J-1 into
-/// a single bitmap by taking the union of all of its values. This union gives us Xj-1.
-///
-/// ## Memory Usage
-/// This function is expected to be called on a maximum of 10 bitmaps. The worst case thus happens when
-/// 10 identical large bitmaps are given.
-///
-/// In the context of Meilisearch, let's imagine that we are given 10 bitmaps containing all
-/// the document ids. If the dataset contains 16 million documents, then each bitmap will take
-/// around 2MB of memory.
-///
-/// When creating Map 3, we will have, in memory:
-/// 1. The 10 original bitmaps (20MB)
-/// 2. X0: 2MB
-/// 3. Map 1, containing 9 bitmaps: 18MB
-/// 4. Map 2, containing 8 bitmaps: 16MB
-/// 5. X1: 2MB
-/// for a total of around 60MB of memory. This roughly represents the maximum memory usage of this function.
-///
-/// ## Time complexity
-/// Let N be the size of the given list of bitmaps and M the length of each individual bitmap.
-///
-/// We need to create N new bitmaps. The most expensive one to create is the second one, where we need to
-/// iterate over the N keys of Map 1, and for each of those keys `k_i`, we perform `N-k_i` bitmap unions.
-/// Unioning two bitmaps is O(M), and we need to do it O(N^2) times.
-///
-/// Therefore the time complexity is O(N^3 * M).
-fn create_non_disjoint_combinations(bitmaps: Vec<RoaringBitmap>) -> Vec<RoaringBitmap> {
-    let nbr_parts = bitmaps.len();
-    if nbr_parts == 1 {
-        return bitmaps;
-    }
-    let mut flattened_levels = vec![];
-    let mut last_level: BTreeMap<usize, RoaringBitmap> =
-        bitmaps.clone().into_iter().enumerate().collect();
-
-    for _ in 2..=nbr_parts {
-        let mut new_level = BTreeMap::new();
-        for (last_part_index, base_combination) in last_level.iter() {
-            #[allow(clippy::needless_range_loop)]
-            for new_last_part_index in last_part_index + 1..nbr_parts {
-                let new_combination = base_combination & &bitmaps[new_last_part_index];
-                if !new_combination.is_empty() {
-                    match new_level.entry(new_last_part_index) {
-                        Entry::Occupied(mut b) => {
-                            *b.get_mut() |= new_combination;
-                        }
-                        Entry::Vacant(entry) => {
-                            entry.insert(new_combination);
-                        }
-                    }
-                }
-            }
-        }
-        // Now flatten the last level to save memory
-        let flattened_last_level = MultiOps::union(last_level.into_values());
-        flattened_levels.push(flattened_last_level);
-        last_level = new_level;
-    }
-    // Flatten the last level
-    let flattened_last_level = MultiOps::union(last_level.into_values());
-    flattened_levels.push(flattened_last_level);
-    flattened_levels
-}
-
-/// Given a list of bitmaps `b0,b1,...,bn`, compute the list of bitmaps `X0,X1,...,Xn`
-/// such that `Xi` contains all the elements that are contained in **exactly** `i+1` bitmaps among `b0,b1,...,bn`.
-///
-/// The returned vector is guaranteed to have the same length as the input list. It is equal to `vec![X0, X1, ..., Xn]`.
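For reference, the contract of `create_disjoint_combinations`, defined just below, can
be expressed as a brute-force count over the union of the inputs. The sketch that
follows is an illustration only (the name `disjoint_combinations_spec` is hypothetical,
and this is not the optimized maps construction used here); it assumes only the
`roaring` crate:

use roaring::RoaringBitmap;

/// Brute-force equivalent of `create_disjoint_combinations`:
/// `out[i]` contains the ids present in exactly `i + 1` of the input bitmaps.
fn disjoint_combinations_spec(bitmaps: &[RoaringBitmap]) -> Vec<RoaringBitmap> {
    let mut out = vec![RoaringBitmap::new(); bitmaps.len()];
    let universe = bitmaps.iter().fold(RoaringBitmap::new(), |acc, b| acc | b);
    for docid in universe {
        // Count how many of the input bitmaps contain this document id.
        let count = bitmaps.iter().filter(|b| b.contains(docid)).count();
        out[count - 1].insert(docid);
    }
    out
}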
-fn create_disjoint_combinations(parts_candidates_array: Vec) -> Vec { - let non_disjoint_combinations = create_non_disjoint_combinations(parts_candidates_array); - let mut disjoint_combinations = vec![]; - let mut combinations = non_disjoint_combinations.into_iter().peekable(); - while let Some(mut combination) = combinations.next() { - if let Some(forbidden) = combinations.peek() { - combination -= forbidden; - } - disjoint_combinations.push(combination) - } - - disjoint_combinations -} - -#[cfg(test)] -mod tests { - use big_s::S; - use roaring::RoaringBitmap; - - use crate::index::tests::TempIndex; - use crate::search::criteria::exactness::{ - create_disjoint_combinations, create_non_disjoint_combinations, - }; - use crate::snapshot_tests::display_bitmap; - use crate::{Criterion, SearchResult}; - - #[test] - fn test_exact_words_subcriterion() { - let index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key(S("id")); - settings.set_criteria(vec![Criterion::Exactness]); - }) - .unwrap(); - - index - .add_documents(documents!([ - // not relevant - { "id": "0", "text": "cat good dog bad" }, - // 1 exact word - { "id": "1", "text": "they said: cats arebetter thandogs" }, - // 3 exact words - { "id": "2", "text": "they said: cats arebetter than dogs" }, - // 5 exact words - { "id": "3", "text": "they said: cats are better than dogs" }, - // attribute starts with the exact words - { "id": "4", "text": "cats are better than dogs except on Saturday" }, - // attribute equal to the exact words - { "id": "5", "text": "cats are better than dogs" }, - ])) - .unwrap(); - - let rtxn = index.read_txn().unwrap(); - - let SearchResult { matching_words: _, candidates: _, documents_ids } = - index.search(&rtxn).query("cats are better than dogs").execute().unwrap(); - - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4, 3, 2, 1]"); - } - - fn print_combinations(rbs: &[RoaringBitmap]) -> String { - let mut s = String::new(); - for rb in rbs { - s.push_str(&format!("{}\n", &display_bitmap(rb))); - } - s - } - - // In these unit tests, the test bitmaps always contain all the multiple of a certain number. - // This makes it easy to check the validity of the results of `create_disjoint_combinations` by - // counting the number of dividers of elements in the returned bitmaps. - fn assert_correct_combinations(combinations: &[RoaringBitmap], dividers: &[u32]) { - for (i, set) in combinations.iter().enumerate() { - let expected_nbr_dividers = i + 1; - for el in set { - let nbr_dividers = dividers.iter().map(|d| usize::from(el % d == 0)).sum::(); - assert_eq!( - nbr_dividers, expected_nbr_dividers, - "{el} is divisible by {nbr_dividers} elements, not {expected_nbr_dividers}." 
- ); - } - } - } - - #[test] - fn compute_combinations_1() { - let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); - - let parts_candidates = vec![b0]; - - let combinations = create_disjoint_combinations(parts_candidates); - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, ] - "###); - - assert_correct_combinations(&combinations, &[2]); - } - - #[test] - fn compute_combinations_2() { - let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); - let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); - - let parts_candidates = vec![b0, b1]; - - let combinations = create_disjoint_combinations(parts_candidates); - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [2, 3, 4, 8, 9, 10, 14, 15, 16, 20, 21, 22, 26, 27, 28, 32, 33, 34, 38, 39, 40, 44, 45, 46, 50, 51, 52, 56, 57, 58, 62, 63, 64, 68, 69, 70, 74, 75, 76, 80, 81, 82, 86, 87, 88, 92, 93, 94, 98, 99, 100, 104, 105, 106, 110, 111, 112, 116, 117, 118, 122, 123, 124, 128, 129, 130, 134, 135, 136, 140, 141, 142, 146, 147, 148, ] - [0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108, 114, 120, 126, 132, 138, 144, ] - "###); - } - - #[test] - fn compute_combinations_4() { - let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); - let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); - let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect(); - let b3: RoaringBitmap = (0..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect(); - - let parts_candidates = vec![b0, b1, b2, b3]; - - let combinations = create_disjoint_combinations(parts_candidates); - - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ] - [6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ] - [30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ] - [0, ] - "###); - - // But we also check it programmatically - assert_correct_combinations(&combinations, &[2, 3, 5, 7]); - } - #[test] - fn compute_combinations_4_with_empty_results_at_end() { - let b0: RoaringBitmap = (1..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); - let b1: RoaringBitmap = (1..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); - let b2: RoaringBitmap = (1..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect(); - let b3: RoaringBitmap = (1..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect(); - - let parts_candidates = vec![b0, b1, b2, b3]; - - let combinations = create_disjoint_combinations(parts_candidates); - - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 
55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ] - [6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ] - [30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ] - [] - "###); - - // But we also check it programmatically - assert_correct_combinations(&combinations, &[2, 3, 5, 7]); - } - - #[test] - fn compute_combinations_4_with_some_equal_bitmaps() { - let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); - let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); - let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect(); - // b3 == b1 - let b3: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); - - let parts_candidates = vec![b0, b1, b2, b3]; - - let combinations = create_disjoint_combinations(parts_candidates); - - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [2, 4, 5, 8, 14, 16, 22, 25, 26, 28, 32, 34, 35, 38, 44, 46, 52, 55, 56, 58, 62, 64, 65, 68, 74, 76, 82, 85, 86, 88, 92, 94, 95, 98, 104, 106, 112, 115, 116, 118, 122, 124, 125, 128, 134, 136, 142, 145, 146, 148, ] - [3, 9, 10, 20, 21, 27, 33, 39, 40, 50, 51, 57, 63, 69, 70, 80, 81, 87, 93, 99, 100, 110, 111, 117, 123, 129, 130, 140, 141, 147, ] - [6, 12, 15, 18, 24, 36, 42, 45, 48, 54, 66, 72, 75, 78, 84, 96, 102, 105, 108, 114, 126, 132, 135, 138, 144, ] - [0, 30, 60, 90, 120, ] - "###); - - // But we also check it programmatically - assert_correct_combinations(&combinations, &[2, 3, 5, 3]); - } - - #[test] - fn compute_combinations_10() { - let dividers = [2, 3, 5, 7, 11, 6, 15, 35, 18, 14]; - let parts_candidates: Vec = dividers - .iter() - .map(|÷r| { - (0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 210).collect() - }) - .collect(); - - let combinations = create_disjoint_combinations(parts_candidates); - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [2, 3, 4, 5, 7, 8, 9, 11, 16, 25, 26, 27, 32, 34, 38, 39, 46, 49, 51, 52, 57, 58, 62, 64, 65, 68, 69, 74, 76, 81, 82, 85, 86, 87, 91, 92, 93, 94, 95, 104, 106, 111, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 143, 145, 146, 148, 152, 153, 155, 158, 159, 161, 164, 166, 171, 172, 177, 178, 183, 184, 185, 187, 188, 194, 201, 202, 203, 205, 206, 207, 208, 209, ] - [10, 20, 21, 22, 33, 40, 44, 50, 55, 63, 77, 80, 88, 99, 100, 130, 147, 160, 170, 176, 189, 190, 200, ] - [6, 12, 14, 15, 24, 28, 35, 45, 48, 56, 75, 78, 96, 98, 102, 110, 112, 114, 135, 138, 156, 174, 175, 182, 186, 192, 195, 196, 204, ] - [18, 36, 54, 66, 72, 108, 132, 144, 154, 162, 165, ] - [30, 42, 60, 70, 84, 105, 120, 140, 150, 168, 198, ] - [90, 126, 180, ] - [] - [210, ] - [] - [0, ] - "###); - - assert_correct_combinations(&combinations, ÷rs); - } - - #[test] - fn compute_combinations_30() { - let dividers: [u32; 30] = [ - 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, - 5, - ]; - let parts_candidates: Vec = dividers - .iter() - .map(|divider| { - (0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 100).collect() - }) - .collect(); - - let combinations = create_non_disjoint_combinations(parts_candidates.clone()); - insta::assert_snapshot!(print_combinations(&combinations), 
@r###" - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] - [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] - [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] - [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 
87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] - [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] - [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] - [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] - [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] - [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] - [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] - [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] - [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] - [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] - [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] - [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] - [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] - [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] - [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] - [0, 60, ] - [0, 60, ] - [0, 60, ] - [0, 60, ] - [0, 60, ] - [0, 60, ] - "###); - - let combinations = create_disjoint_combinations(parts_candidates); - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [] - [] - [] - [] - [] - [1, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 49, 53, 59, 61, 67, 71, 73, 77, 79, 83, 89, 91, 97, ] - [] - [] - [] - [] - [] - [2, 3, 5, 9, 14, 21, 22, 25, 26, 27, 33, 34, 35, 38, 39, 46, 51, 55, 57, 58, 62, 63, 65, 69, 74, 81, 82, 85, 86, 87, 93, 94, 95, 98, 99, ] - [] - [] - [] - [] - [] - [4, 6, 8, 10, 15, 16, 18, 28, 32, 42, 44, 45, 50, 52, 54, 56, 64, 66, 68, 70, 75, 76, 78, 88, 92, ] - [] - [] - [] - [] - [] - [12, 20, 24, 30, 36, 40, 48, 72, 80, 84, 90, 96, 100, ] - [] - [] - [] - [] - [] - [0, 60, ] - "###); - - assert_correct_combinations(&combinations, ÷rs); - } -} diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs deleted file mode 100644 index 9f7a147b8..000000000 --- a/milli/src/search/criteria/final.rs +++ /dev/null @@ -1,77 +0,0 @@ -use log::debug; -use roaring::RoaringBitmap; - -use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::InitialCandidates; -use crate::search::query_tree::Operation; -use crate::search::WordDerivationsCache; -use crate::Result; - -/// The result of a call to the fetcher. -#[derive(Debug, Clone, PartialEq)] -pub struct FinalResult { - /// The query tree corresponding to the current bucket of the last criterion. 
- pub query_tree: Option, - /// The candidates of the current bucket of the last criterion. - pub candidates: RoaringBitmap, - /// Candidates that comes from the current bucket of the initial criterion. - pub initial_candidates: InitialCandidates, -} - -pub struct Final<'t> { - ctx: &'t dyn Context<'t>, - parent: Box, - wdcache: WordDerivationsCache, - returned_candidates: RoaringBitmap, -} - -impl<'t> Final<'t> { - pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Final<'t> { - Final { - ctx, - parent, - wdcache: WordDerivationsCache::new(), - returned_candidates: RoaringBitmap::new(), - } - } - - #[logging_timer::time("Final::{}")] - pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> Result> { - debug!("Final iteration"); - let excluded_candidates = &self.returned_candidates | excluded_candidates; - let mut criterion_parameters = CriterionParameters { - wdcache: &mut self.wdcache, - // returned_candidates is merged with excluded_candidates to avoid duplicas - excluded_candidates: &excluded_candidates, - }; - - match self.parent.next(&mut criterion_parameters)? { - Some(CriterionResult { - query_tree, - candidates, - filtered_candidates, - initial_candidates, - }) => { - let mut candidates = match (candidates, query_tree.as_ref()) { - (Some(candidates), _) => candidates, - (None, Some(qt)) => { - resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates - } - (None, None) => self.ctx.documents_ids()? - excluded_candidates, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; - } - - let initial_candidates = initial_candidates - .unwrap_or_else(|| InitialCandidates::Estimated(candidates.clone())); - - self.returned_candidates |= &candidates; - - Ok(Some(FinalResult { query_tree, candidates, initial_candidates })) - } - None => Ok(None), - } - } -} diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs deleted file mode 100644 index 0b33e6b2f..000000000 --- a/milli/src/search/criteria/geo.rs +++ /dev/null @@ -1,154 +0,0 @@ -use std::iter; - -use roaring::RoaringBitmap; -use rstar::RTree; - -use super::{Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates}; -use crate::{lat_lng_to_xyz, GeoPoint, Index, Result}; - -pub struct Geo<'t> { - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, - ascending: bool, - parent: Box, - candidates: Box>, - allowed_candidates: RoaringBitmap, - initial_candidates: InitialCandidates, - rtree: Option>, - point: [f64; 2], -} - -impl<'t> Geo<'t> { - pub fn asc( - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, - parent: Box, - point: [f64; 2], - ) -> Result { - Self::new(index, rtxn, parent, point, true) - } - - pub fn desc( - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, - parent: Box, - point: [f64; 2], - ) -> Result { - Self::new(index, rtxn, parent, point, false) - } - - fn new( - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, - parent: Box, - point: [f64; 2], - ascending: bool, - ) -> Result { - let candidates = Box::new(iter::empty()); - let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?; - let initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new()); - let rtree = index.geo_rtree(rtxn)?; - - Ok(Self { - index, - rtxn, - ascending, - parent, - candidates, - allowed_candidates, - initial_candidates, - rtree, - point, - }) - } -} - -impl Criterion for Geo<'_> { - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - let rtree = 
self.rtree.as_ref();
-
-        loop {
-            match self.candidates.next() {
-                Some(mut candidates) => {
-                    candidates -= params.excluded_candidates;
-                    self.allowed_candidates -= &candidates;
-                    return Ok(Some(CriterionResult {
-                        query_tree: None,
-                        candidates: Some(candidates),
-                        filtered_candidates: None,
-                        initial_candidates: Some(self.initial_candidates.clone()),
-                    }));
-                }
-                None => match self.parent.next(params)? {
-                    Some(CriterionResult {
-                        query_tree,
-                        candidates,
-                        filtered_candidates,
-                        initial_candidates,
-                    }) => {
-                        let mut candidates = match (&query_tree, candidates) {
-                            (_, Some(candidates)) => candidates,
-                            (Some(qt), None) => {
-                                let context = CriteriaBuilder::new(self.rtxn, self.index)?;
-                                resolve_query_tree(&context, qt, params.wdcache)?
-                            }
-                            (None, None) => self.index.documents_ids(self.rtxn)?,
-                        };
-
-                        if let Some(filtered_candidates) = filtered_candidates {
-                            candidates &= filtered_candidates;
-                        }
-
-                        match initial_candidates {
-                            Some(initial_candidates) => {
-                                self.initial_candidates |= initial_candidates
-                            }
-                            None => self.initial_candidates.map_inplace(|c| c | &candidates),
-                        }
-
-                        if candidates.is_empty() {
-                            continue;
-                        }
-                        self.allowed_candidates = &candidates - params.excluded_candidates;
-                        self.candidates = match rtree {
-                            Some(rtree) => geo_point(
-                                rtree,
-                                self.allowed_candidates.clone(),
-                                self.point,
-                                self.ascending,
-                            ),
-                            None => Box::new(std::iter::empty()),
-                        };
-                    }
-                    None => return Ok(None),
-                },
-            }
-        }
-    }
-}
-
-fn geo_point(
-    rtree: &RTree<GeoPoint>,
-    mut candidates: RoaringBitmap,
-    point: [f64; 2],
-    ascending: bool,
-) -> Box<dyn Iterator<Item = RoaringBitmap>> {
-    let point = lat_lng_to_xyz(&point);
-
-    let mut results = Vec::new();
-    for point in rtree.nearest_neighbor_iter(&point) {
-        if candidates.remove(point.data.0) {
-            results.push(std::iter::once(point.data.0).collect());
-            if candidates.is_empty() {
-                break;
-            }
-        }
-    }
-
-    if ascending {
-        Box::new(results.into_iter())
-    } else {
-        Box::new(results.into_iter().rev())
-    }
-}
diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs
deleted file mode 100644
index 0826a9f68..000000000
--- a/milli/src/search/criteria/initial.rs
+++ /dev/null
@@ -1,82 +0,0 @@
-use roaring::RoaringBitmap;
-
-use super::{Criterion, CriterionParameters, CriterionResult};
-use crate::search::criteria::{resolve_query_tree, Context, InitialCandidates};
-use crate::search::query_tree::Operation;
-use crate::search::Distinct;
-use crate::Result;
-
-/// Initial is a mandatory criterion: it is always the first
-/// and is meant to initialize the CriterionResult used by the other criteria.
-/// It behaves like a [Once iterator](https://doc.rust-lang.org/std/iter/struct.Once.html) and will return Some(CriterionResult) only once.
-pub struct Initial<'t, D> { - ctx: &'t dyn Context<'t>, - answer: Option, - exhaustive_number_hits: bool, - distinct: Option, -} - -impl<'t, D> Initial<'t, D> { - pub fn new( - ctx: &'t dyn Context<'t>, - query_tree: Option, - filtered_candidates: Option, - exhaustive_number_hits: bool, - distinct: Option, - ) -> Initial { - let answer = CriterionResult { - query_tree, - candidates: None, - filtered_candidates, - initial_candidates: None, - }; - Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct } - } -} - -impl Criterion for Initial<'_, D> { - #[logging_timer::time("Initial::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - self.answer - .take() - .map(|mut answer| { - if self.exhaustive_number_hits { - // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. - let candidates = answer - .query_tree - .as_ref() - .map(|query_tree| resolve_query_tree(self.ctx, query_tree, params.wdcache)) - .transpose()?; - - // then intersect the candidates with the potential filtered candidates. - let mut candidates = match (candidates, answer.filtered_candidates.take()) { - (Some(candidates), Some(filtered)) => candidates & filtered, - (Some(candidates), None) => candidates, - (None, Some(filtered)) => filtered, - (None, None) => self.ctx.documents_ids()?, - }; - - // then remove the potential soft deleted documents. - candidates -= params.excluded_candidates; - - // because the initial_candidates should be an exhaustive count of the matching documents, - // we precompute the distinct attributes. - let initial_candidates = match &mut self.distinct { - Some(distinct) => { - let mut initial_candidates = RoaringBitmap::new(); - for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { - initial_candidates.insert(c?); - } - initial_candidates - } - None => candidates.clone(), - }; - - answer.candidates = Some(candidates); - answer.initial_candidates = - Some(InitialCandidates::Exhaustive(initial_candidates)); - } - Ok(answer) - }) - .transpose() - } -} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs deleted file mode 100644 index 5e491672f..000000000 --- a/milli/src/search/criteria/mod.rs +++ /dev/null @@ -1,1049 +0,0 @@ -use std::borrow::Cow; -use std::collections::HashMap; -use std::mem::take; -use std::ops::{BitOr, BitOrAssign}; - -use roaring::RoaringBitmap; - -use self::asc_desc::AscDesc; -use self::attribute::Attribute; -use self::exactness::Exactness; -use self::initial::Initial; -use self::proximity::Proximity; -use self::r#final::Final; -use self::typo::Typo; -use self::words::Words; -use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; -use super::CriterionImplementationStrategy; -use crate::search::criteria::geo::Geo; -use crate::search::{word_derivations, Distinct, WordDerivationsCache}; -use crate::update::{MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB}; -use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; - -mod asc_desc; -pub use asc_desc::{facet_max_value, facet_min_value}; -mod attribute; -mod exactness; -pub mod r#final; -mod geo; -mod initial; -mod proximity; -mod typo; -mod words; - -pub trait Criterion { - fn next(&mut self, params: &mut CriterionParameters) -> Result>; -} - -/// The result of a call to the parent criterion. -#[derive(Debug, Clone, PartialEq)] -pub struct CriterionResult { - /// The query tree that must be used by the children criterion to fetch candidates. 
-    query_tree: Option<Operation>,
-    /// The candidates that this criterion is allowed to return subsets of,
-    /// if None, it is up to the child to compute the candidates itself.
-    candidates: Option<RoaringBitmap>,
-    /// The candidates, coming from facet filters, that this criterion is allowed to return subsets of.
-    filtered_candidates: Option<RoaringBitmap>,
-    /// Candidates that come from the current bucket of the initial criterion.
-    initial_candidates: Option<InitialCandidates>,
-}
-
-#[derive(Debug, PartialEq)]
-pub struct CriterionParameters<'a> {
-    wdcache: &'a mut WordDerivationsCache,
-    excluded_candidates: &'a RoaringBitmap,
-}
-
-/// Either a set of candidates that defines the candidates
-/// that are allowed to be returned,
-/// or the candidates that must never be returned.
-#[derive(Debug)]
-enum Candidates {
-    Allowed(RoaringBitmap),
-    Forbidden(RoaringBitmap),
-}
-
-impl Default for Candidates {
-    fn default() -> Self {
-        Self::Forbidden(RoaringBitmap::new())
-    }
-}
-
-/// Either a set of candidates that defines the estimated set of candidates
-/// that could be returned,
-/// or the Exhaustive set of candidates that will be returned if all possible results are fetched.
-#[derive(Debug, Clone, PartialEq)]
-pub enum InitialCandidates {
-    Estimated(RoaringBitmap),
-    Exhaustive(RoaringBitmap),
-}
-
-impl InitialCandidates {
-    fn take(&mut self) -> Self {
-        match self {
-            Self::Estimated(c) => Self::Estimated(take(c)),
-            Self::Exhaustive(c) => Self::Exhaustive(take(c)),
-        }
-    }
-
-    /// Modify the contained roaring bitmap in place if the set isn't already Exhaustive.
-    pub fn map_inplace<F>(&mut self, f: F)
-    where
-        F: FnOnce(RoaringBitmap) -> RoaringBitmap,
-    {
-        if let Self::Estimated(c) = self {
-            *c = f(take(c))
-        }
-    }
-
-    pub fn into_inner(self) -> RoaringBitmap {
-        match self {
-            Self::Estimated(c) => c,
-            Self::Exhaustive(c) => c,
-        }
-    }
-}
-
-impl BitOrAssign for InitialCandidates {
-    /// Take the union of the contained roaring bitmaps if the set isn't already Exhaustive.
-    /// If rhs is Exhaustive and self is not, then rhs replaces self.
-    fn bitor_assign(&mut self, rhs: Self) {
-        if let Self::Estimated(c) = self {
-            *self = match rhs {
-                Self::Estimated(rhs) => Self::Estimated(rhs | &*c),
-                Self::Exhaustive(rhs) => Self::Exhaustive(rhs),
-            }
-        }
-    }
-}
-
-impl BitOr for InitialCandidates {
-    type Output = Self;
-
-    /// Take the union of the contained roaring bitmaps if the set isn't already Exhaustive.
-    /// If rhs is Exhaustive and self is not, then rhs replaces self.
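// Summary of the union semantics implemented below (illustration only):
//   Estimated(a)  | Estimated(b)  == Estimated(a | b)
//   Estimated(_)  | Exhaustive(b) == Exhaustive(b)   (rhs replaces self)
//   Exhaustive(a) | _             == Exhaustive(a)   (an exhaustive set is kept as-is)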
- fn bitor(self, rhs: Self) -> Self::Output { - if let Self::Estimated(c) = self { - match rhs { - Self::Estimated(rhs) => Self::Estimated(rhs | c), - Self::Exhaustive(rhs) => Self::Exhaustive(rhs), - } - } else { - self.clone() - } - } -} - -pub trait Context<'c> { - fn documents_ids(&self) -> heed::Result; - fn word_docids(&self, word: &str) -> heed::Result>; - fn exact_word_docids(&self, word: &str) -> heed::Result>; - fn word_prefix_docids(&self, word: &str) -> heed::Result>; - fn exact_word_prefix_docids(&self, word: &str) -> heed::Result>; - - fn word_pair_proximity_docids( - &self, - left: &str, - right: &str, - proximity: u8, - ) -> heed::Result>; - fn word_prefix_pair_proximity_docids( - &self, - left: &str, - right: &str, - proximity: u8, - ) -> heed::Result>; - fn prefix_word_pair_proximity_docids( - &self, - prefix: &str, - right: &str, - proximity: u8, - ) -> heed::Result>; - fn words_fst<'t>(&self) -> &'t fst::Set>; - fn in_prefix_cache(&self, word: &str) -> bool; - fn docid_words_positions( - &self, - docid: DocumentId, - ) -> heed::Result>; - #[allow(clippy::type_complexity)] - fn word_position_iterator( - &self, - word: &str, - in_prefix_cache: bool, - ) -> heed::Result> + 'c>>; - fn synonyms(&self, word: &str) -> heed::Result>>>; - fn searchable_fields_ids(&self) -> Result>; - fn field_id_word_count_docids( - &self, - field_id: FieldId, - word_count: u8, - ) -> heed::Result>; - fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result>; -} - -pub struct CriteriaBuilder<'t> { - rtxn: &'t heed::RoTxn<'t>, - index: &'t Index, - words_fst: fst::Set>, - words_prefixes_fst: fst::Set>, -} - -/// Return the docids for the following word pairs and proximities using [`Context::word_pair_proximity_docids`]. -/// * `left, right, prox` (leftward proximity) -/// * `right, left, prox-1` (rightward proximity) -/// -/// ## Example -/// For a document with the text `the good fox eats the apple`, we have: -/// * `rightward_proximity(the, eats) = 3` -/// * `leftward_proximity(eats, the) = 1` -/// -/// So both the expressions `word_pair_overall_proximity_docids(ctx, the, eats, 3)` -/// and `word_pair_overall_proximity_docids(ctx, the, eats, 2)` would return a bitmap containing -/// the id of this document. -fn word_pair_overall_proximity_docids( - ctx: &dyn Context, - left: &str, - right: &str, - prox: u8, -) -> heed::Result> { - let rightward = ctx.word_pair_proximity_docids(left, right, prox)?; - let leftward = - if prox > 1 { ctx.word_pair_proximity_docids(right, left, prox - 1)? } else { None }; - if let Some(mut all) = rightward { - if let Some(leftward) = leftward { - all |= leftward; - } - Ok(Some(all)) - } else { - Ok(leftward) - } -} - -/// This function works identically to [`word_pair_overall_proximity_docids`] except that the -/// right word is replaced by a prefix string. -/// -/// It will return None if no documents were found or if the prefix does not exist in the -/// `word_prefix_pair_proximity_docids` database. -fn word_prefix_pair_overall_proximity_docids( - ctx: &dyn Context, - left: &str, - prefix: &str, - proximity: u8, -) -> heed::Result> { - // We retrieve the docids for the original and swapped word pairs: - // A: word1 prefix2 proximity - // B: prefix2 word1 proximity-1 - let rightward = ctx.word_prefix_pair_proximity_docids(left, prefix, proximity)?; - - let leftward = if proximity > 1 { - ctx.prefix_word_pair_proximity_docids(prefix, left, proximity - 1)? 
- } else { - None - }; - if let Some(mut all) = rightward { - if let Some(leftward) = leftward { - all |= leftward; - } - Ok(Some(all)) - } else { - Ok(leftward) - } -} - -impl<'c> Context<'c> for CriteriaBuilder<'c> { - fn documents_ids(&self) -> heed::Result { - self.index.documents_ids(self.rtxn) - } - - fn word_docids(&self, word: &str) -> heed::Result> { - self.index.word_docids.get(self.rtxn, word) - } - - fn exact_word_docids(&self, word: &str) -> heed::Result> { - self.index.exact_word_docids.get(self.rtxn, word) - } - - fn word_prefix_docids(&self, word: &str) -> heed::Result> { - self.index.word_prefix_docids.get(self.rtxn, word) - } - - fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { - self.index.exact_word_prefix_docids.get(self.rtxn, word) - } - - fn word_pair_proximity_docids( - &self, - left: &str, - right: &str, - proximity: u8, - ) -> heed::Result> { - self.index.word_pair_proximity_docids.get(self.rtxn, &(proximity, left, right)) - } - - fn word_prefix_pair_proximity_docids( - &self, - left: &str, - prefix: &str, - proximity: u8, - ) -> heed::Result> { - self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &(proximity, left, prefix)) - } - fn prefix_word_pair_proximity_docids( - &self, - prefix: &str, - right: &str, - proximity: u8, - ) -> heed::Result> { - self.index.prefix_word_pair_proximity_docids.get(self.rtxn, &(proximity, prefix, right)) - } - - fn words_fst<'t>(&self) -> &'t fst::Set> { - &self.words_fst - } - - fn in_prefix_cache(&self, word: &str) -> bool { - self.words_prefixes_fst.contains(word) - } - - fn docid_words_positions( - &self, - docid: DocumentId, - ) -> heed::Result> { - let mut words_positions = HashMap::new(); - for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { - let ((_, word), positions) = result?; - words_positions.insert(word.to_string(), positions); - } - Ok(words_positions) - } - - fn word_position_iterator( - &self, - word: &str, - in_prefix_cache: bool, - ) -> heed::Result> + 'c>> - { - let range = { - let left = u16::min_value(); // TODO: this is wrong - let right = u16::max_value(); // TODO: this is wrong - let left = (word, left); - let right = (word, right); - left..=right - }; - let db = match in_prefix_cache { - true => self.index.word_prefix_position_docids, - false => self.index.word_position_docids, - }; - - Ok(Box::new(db.range(self.rtxn, &range)?)) - } - - fn synonyms(&self, word: &str) -> heed::Result>>> { - self.index.words_synonyms(self.rtxn, &[word]) - } - - fn searchable_fields_ids(&self) -> Result> { - match self.index.searchable_fields_ids(self.rtxn)? 
{ - Some(searchable_fields_ids) => Ok(searchable_fields_ids), - None => Ok(self.index.fields_ids_map(self.rtxn)?.ids().collect()), - } - } - - fn field_id_word_count_docids( - &self, - field_id: FieldId, - word_count: u8, - ) -> heed::Result> { - let key = (field_id, word_count); - self.index.field_id_word_count_docids.get(self.rtxn, &key) - } - - fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result> { - let key = (word, pos as u16); // TODO: this is wrong - self.index.word_position_docids.get(self.rtxn, &key) - } -} - -impl<'t> CriteriaBuilder<'t> { - pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> Result { - let words_fst = index.words_fst(rtxn)?; - let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; - Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) - } - - #[allow(clippy::too_many_arguments)] - pub fn build( - &'t self, - query_tree: Option, - primitive_query: Option>, - filtered_candidates: Option, - sort_criteria: Option>, - exhaustive_number_hits: bool, - distinct: Option, - implementation_strategy: CriterionImplementationStrategy, - ) -> Result> { - use crate::criterion::Criterion as Name; - - let primitive_query = primitive_query.unwrap_or_default(); - - let mut criterion = Box::new(Initial::new( - self, - query_tree, - filtered_candidates, - exhaustive_number_hits, - distinct, - )) as Box; - for name in self.index.criteria(self.rtxn)? { - criterion = match name { - Name::Words => Box::new(Words::new(self, criterion)), - Name::Typo => Box::new(Typo::new(self, criterion)), - Name::Sort => match sort_criteria { - Some(ref sort_criteria) => { - for asc_desc in sort_criteria { - criterion = match asc_desc { - AscDescName::Asc(Member::Field(field)) => Box::new(AscDesc::asc( - self.index, - self.rtxn, - criterion, - field.to_string(), - implementation_strategy, - )?), - AscDescName::Desc(Member::Field(field)) => Box::new(AscDesc::desc( - self.index, - self.rtxn, - criterion, - field.to_string(), - implementation_strategy, - )?), - AscDescName::Asc(Member::Geo(point)) => { - Box::new(Geo::asc(self.index, self.rtxn, criterion, *point)?) - } - AscDescName::Desc(Member::Geo(point)) => { - Box::new(Geo::desc(self.index, self.rtxn, criterion, *point)?) 
- } - }; - } - criterion - } - None => criterion, - }, - Name::Proximity => { - Box::new(Proximity::new(self, criterion, implementation_strategy)) - } - Name::Attribute => { - Box::new(Attribute::new(self, criterion, implementation_strategy)) - } - Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), - Name::Asc(field) => Box::new(AscDesc::asc( - self.index, - self.rtxn, - criterion, - field, - implementation_strategy, - )?), - Name::Desc(field) => Box::new(AscDesc::desc( - self.index, - self.rtxn, - criterion, - field, - implementation_strategy, - )?), - }; - } - - Ok(Final::new(self, criterion)) - } -} - -pub fn resolve_query_tree( - ctx: &dyn Context, - query_tree: &Operation, - wdcache: &mut WordDerivationsCache, -) -> Result { - fn resolve_operation( - ctx: &dyn Context, - query_tree: &Operation, - wdcache: &mut WordDerivationsCache, - ) -> Result { - use Operation::{And, Or, Phrase, Query}; - - match query_tree { - And(ops) => { - let mut ops = ops - .iter() - .map(|op| resolve_operation(ctx, op, wdcache)) - .collect::>>()?; - - ops.sort_unstable_by_key(|cds| cds.len()); - - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for docids in ops { - if first_loop { - candidates = docids; - first_loop = false; - } else { - candidates &= &docids; - } - } - Ok(candidates) - } - Phrase(words) => resolve_phrase(ctx, words), - Or(_, ops) => { - let mut candidates = RoaringBitmap::new(); - for op in ops { - let docids = resolve_operation(ctx, op, wdcache)?; - candidates |= docids; - } - Ok(candidates) - } - Query(q) => Ok(query_docids(ctx, q, wdcache)?), - } - } - - resolve_operation(ctx, query_tree, wdcache) -} - -pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result { - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; - let winsize = phrase.len().min(3); - - if phrase.is_empty() { - return Ok(candidates); - } - - for win in phrase.windows(winsize) { - // Get all the documents with the matching distance for each word pairs. - let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win - .iter() - .enumerate() - .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) - { - for (dist, s2) in win - .iter() - .skip(offset + 1) - .enumerate() - .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) - { - if dist == 0 { - match ctx.word_pair_proximity_docids(s1, s2, 1)? { - Some(m) => bitmaps.push(m), - // If there are no document for this pair, there will be no - // results for the phrase query. - None => return Ok(RoaringBitmap::new()), - } - } else { - let mut bitmap = RoaringBitmap::new(); - for dist in 0..=dist { - if let Some(m) = ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - bitmap |= m - } - } - if bitmap.is_empty() { - return Ok(bitmap); - } else { - bitmaps.push(bitmap); - } - } - } - } - - // We sort the bitmaps so that we perform the small intersections first, which is faster. 
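// Worked example (illustration, not part of the deleted code): for the phrase
// ["stop", None, "world"], where None is a removed stop word, the single
// window yields s1 = "stop", s2 = "world" and dist = 1, so the loop above
// unions the docids at proximity 1 or 2, i.e. documents where "world" follows
// "stop" with at most one word in between.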
- bitmaps.sort_unstable_by_key(|a| a.len()); - - for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } - // There will be no match, return early - if candidates.is_empty() { - break; - } - } - } - Ok(candidates) -} - -fn all_word_pair_overall_proximity_docids, U: AsRef>( - ctx: &dyn Context, - left_words: &[(T, u8)], - right_words: &[(U, u8)], - proximity: u8, -) -> Result { - let mut docids = RoaringBitmap::new(); - for (left, _l_typo) in left_words { - for (right, _r_typo) in right_words { - let current_docids = - word_pair_overall_proximity_docids(ctx, left.as_ref(), right.as_ref(), proximity)? - .unwrap_or_default(); - docids |= current_docids; - } - } - Ok(docids) -} - -fn query_docids( - ctx: &dyn Context, - query: &Query, - wdcache: &mut WordDerivationsCache, -) -> Result { - match &query.kind { - QueryKind::Exact { word, original_typo } => { - if query.prefix && ctx.in_prefix_cache(word) { - let mut docids = ctx.word_prefix_docids(word)?.unwrap_or_default(); - // only add the exact docids if the word hasn't been derived - if *original_typo == 0 { - docids |= ctx.exact_word_prefix_docids(word)?.unwrap_or_default(); - } - Ok(docids) - } else if query.prefix { - let words = word_derivations(word, true, 0, ctx.words_fst(), wdcache)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - docids |= ctx.word_docids(word)?.unwrap_or_default(); - // only add the exact docids if the word hasn't been derived - if *original_typo == 0 { - docids |= ctx.exact_word_docids(word)?.unwrap_or_default(); - } - } - Ok(docids) - } else { - let mut docids = ctx.word_docids(word)?.unwrap_or_default(); - // only add the exact docids if the word hasn't been derived - if *original_typo == 0 { - docids |= ctx.exact_word_docids(word)?.unwrap_or_default(); - } - Ok(docids) - } - } - QueryKind::Tolerant { typo, word } => { - let words = word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)?; - let mut docids = RoaringBitmap::new(); - for (word, typo) in words { - let mut current_docids = ctx.word_docids(word)?.unwrap_or_default(); - if *typo == 0 { - current_docids |= ctx.exact_word_docids(word)?.unwrap_or_default() - } - docids |= current_docids; - } - Ok(docids) - } - } -} - -fn query_pair_proximity_docids( - ctx: &dyn Context, - left: &Query, - right: &Query, - proximity: u8, - wdcache: &mut WordDerivationsCache, -) -> Result { - if proximity >= 8 { - let mut candidates = query_docids(ctx, left, wdcache)?; - let right_candidates = query_docids(ctx, right, wdcache)?; - candidates &= right_candidates; - return Ok(candidates); - } - - let prefix = right.prefix; - match (&left.kind, &right.kind) { - (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { - if prefix { - // There are three distinct cases which we need to distinguish regarding the prefix `right`: - // - // 1. `right` is not in any prefix cache because it is not the prefix of many words - // (and thus, it doesn't have many word derivations) - // 2. `right` is in the prefix cache but cannot be found in the "word prefix pair proximity" databases either - // because it is too long or because the given proximity is too high. - // 3. `right` is in the prefix cache and can be found in the "word prefix pair proximity" databases - // - // The three cases are handled as follows: - // 1. We manually retrieve all the word derivations of `right` and check the `word_pair_proximity` - // database for each of them. - // 2. 
It would be too expensive to apply the same strategy as (1), therefore, we "disable" the - // proximity ranking rule for the prefixes of the right word. This is done as follows: - // 1. Only find the documents where left is in proximity to the exact (ie non-prefix) right word - // 2. Otherwise, assume that their proximity in all the documents in which they coexist is >= 8 - // - // 3. Query the prefix proximity databases. - match ( - ctx.in_prefix_cache(right), - right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB - && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, - ) { - // Case 1: not in prefix cache - (false, _) => { - let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids( - ctx, - &[(left, 0)], - r_words, - proximity, - ) - } - // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to - // query the prefix proximity databases. - (true, false) => { - // To "save" the relevancy a little bit, we still find the documents where the - // exact (i.e. non-prefix) right word is in the given proximity to the left word. - Ok(word_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? - .unwrap_or_default()) - } - // Case 3: in prefix cache, short enough, and proximity is low enough - (true, true) => Ok(word_prefix_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? - .unwrap_or_default()), - } - } else { - Ok(word_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? - .unwrap_or_default()) - } - } - (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { - let l_words = - word_derivations(left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); - if prefix { - // The logic here is almost identical to the one in the previous match branch. - // The difference is that we fetch the docids for each derivation of the left word. - match ( - ctx.in_prefix_cache(right), - right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB - && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, - ) { - // Case 1: not in prefix cache - (false, _) => { - let mut docids = RoaringBitmap::new(); - let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; - for (left, _) in l_words { - docids |= all_word_pair_overall_proximity_docids( - ctx, - &[(left, 0)], - r_words, - proximity, - )?; - } - Ok(docids) - } - // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to - // query the prefix proximity databases. - (true, false) => { - // To "save" the relevancy a little bit, we still find the documents where the - // exact (i.e. non-prefix) right word is in proximity to any derivation of the left word. - let mut candidates = RoaringBitmap::new(); - for (left, _) in l_words { - candidates |= ctx - .word_pair_proximity_docids(&left, right, proximity)? - .unwrap_or_default(); - } - Ok(candidates) - } - // Case 3: in prefix cache, short enough, and proximity is low enough - (true, true) => { - let mut docids = RoaringBitmap::new(); - for (left, _) in l_words { - docids |= word_prefix_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? - .unwrap_or_default(); - } - Ok(docids) - } - } - } else { - all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) - } - } - (QueryKind::Exact { word: left, .. 
}, QueryKind::Tolerant { typo, word: right }) => { - let r_words = word_derivations(right, prefix, *typo, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], r_words, proximity) - } - ( - QueryKind::Tolerant { typo: l_typo, word: left }, - QueryKind::Tolerant { typo: r_typo, word: right }, - ) => { - let l_words = - word_derivations(left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); - let r_words = word_derivations(right, prefix, *r_typo, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids(ctx, &l_words, r_words, proximity) - } - } -} - -#[cfg(test)] -pub mod test { - use std::collections::HashMap; - use std::iter; - - use maplit::hashmap; - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - - use super::*; - - fn s(s: &str) -> String { - s.to_string() - } - pub struct TestContext<'t> { - words_fst: fst::Set>, - word_docids: HashMap, - exact_word_docids: HashMap, - word_prefix_docids: HashMap, - exact_word_prefix_docids: HashMap, - word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, - word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, - prefix_word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, - docid_words: HashMap>, - } - - impl<'c> Context<'c> for TestContext<'c> { - fn documents_ids(&self) -> heed::Result { - Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) - } - - fn word_docids(&self, word: &str) -> heed::Result> { - Ok(self.word_docids.get(&word.to_string()).cloned()) - } - - fn exact_word_docids(&self, word: &str) -> heed::Result> { - Ok(self.exact_word_docids.get(&word.to_string()).cloned()) - } - - fn word_prefix_docids(&self, word: &str) -> heed::Result> { - Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) - } - - fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { - Ok(self.exact_word_prefix_docids.get(&word.to_string()).cloned()) - } - - fn word_pair_proximity_docids( - &self, - left: &str, - right: &str, - proximity: u8, - ) -> heed::Result> { - let key = (left.to_string(), right.to_string(), proximity.into()); - Ok(self.word_pair_proximity_docids.get(&key).cloned()) - } - - fn word_prefix_pair_proximity_docids( - &self, - word: &str, - prefix: &str, - proximity: u8, - ) -> heed::Result> { - let key = (word.to_string(), prefix.to_string(), proximity.into()); - Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) - } - fn prefix_word_pair_proximity_docids( - &self, - prefix: &str, - word: &str, - proximity: u8, - ) -> heed::Result> { - let key = (prefix.to_string(), word.to_string(), proximity.into()); - Ok(self.prefix_word_pair_proximity_docids.get(&key).cloned()) - } - - fn words_fst<'t>(&self) -> &'t fst::Set> { - &self.words_fst - } - - fn in_prefix_cache(&self, word: &str) -> bool { - self.word_prefix_docids.contains_key(&word.to_string()) - } - - fn docid_words_positions( - &self, - docid: DocumentId, - ) -> heed::Result> { - if let Some(docid_words) = self.docid_words.get(&docid) { - Ok(docid_words - .iter() - .enumerate() - .map(|(i, w)| { - let bitmap = RoaringBitmap::from_sorted_iter(iter::once(i as u32)).unwrap(); - (w.clone(), bitmap) - }) - .collect()) - } else { - Ok(HashMap::new()) - } - } - - fn word_position_iterator( - &self, - _word: &str, - _in_prefix_cache: bool, - ) -> heed::Result< - Box> + 'c>, - > { - todo!() - } - - fn synonyms(&self, _word: &str) -> heed::Result>>> { - todo!() - } - - fn searchable_fields_ids(&self) -> Result> { - 
todo!() - } - - fn word_position_docids( - &self, - _word: &str, - _pos: u32, - ) -> heed::Result> { - todo!() - } - - fn field_id_word_count_docids( - &self, - _field_id: FieldId, - _word_count: u8, - ) -> heed::Result> { - todo!() - } - } - - impl<'a> Default for TestContext<'a> { - fn default() -> TestContext<'a> { - let mut rng = StdRng::seed_from_u64(102); - let rng = &mut rng; - - fn random_postings(rng: &mut R, len: usize) -> RoaringBitmap { - let mut values = Vec::::with_capacity(len); - while values.len() != len { - values.push(rng.gen()); - } - values.sort_unstable(); - - RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() - } - - let word_docids = hashmap! { - s("hello") => random_postings(rng, 1500), - s("hi") => random_postings(rng, 4000), - s("word") => random_postings(rng, 2500), - s("split") => random_postings(rng, 400), - s("ngrams") => random_postings(rng, 1400), - s("world") => random_postings(rng, 15_000), - s("earth") => random_postings(rng, 8000), - s("2021") => random_postings(rng, 100), - s("2020") => random_postings(rng, 500), - s("is") => random_postings(rng, 50_000), - s("this") => random_postings(rng, 50_000), - s("good") => random_postings(rng, 1250), - s("morning") => random_postings(rng, 125), - }; - - let exact_word_docids = HashMap::new(); - - let mut docid_words = HashMap::new(); - for (word, docids) in word_docids.iter() { - for docid in docids { - let words: &mut Vec<_> = docid_words.entry(docid).or_default(); - words.push(word.clone()); - } - } - - let word_prefix_docids = hashmap! { - s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], - s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], - s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], - }; - - let exact_word_prefix_docids = HashMap::new(); - - let mut word_pair_proximity_docids = HashMap::new(); - let mut word_prefix_pair_proximity_docids = HashMap::new(); - let mut prefix_word_pair_proximity_docids = HashMap::new(); - - for (lword, lcandidates) in &word_docids { - for (rword, rcandidates) in &word_docids { - if lword == rword { - continue; - } - let candidates = lcandidates & rcandidates; - for candidate in candidates { - if let Some(docid_words) = docid_words.get(&candidate) { - let lposition = docid_words.iter().position(|w| w == lword).unwrap(); - let rposition = docid_words.iter().position(|w| w == rword).unwrap(); - let key = if lposition < rposition { - (s(lword), s(rword), (rposition - lposition) as i32) - } else { - (s(lword), s(rword), (lposition - rposition + 1) as i32) - }; - let docids: &mut RoaringBitmap = - word_pair_proximity_docids.entry(key).or_default(); - docids.push(candidate); - } - } - } - for (pword, pcandidates) in &word_prefix_docids { - if lword.starts_with(pword) { - continue; - } - let candidates = lcandidates & pcandidates; - for candidate in candidates { - if let Some(docid_words) = docid_words.get(&candidate) { - let lposition = docid_words.iter().position(|w| w == lword).unwrap(); - let rposition = - docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); - if lposition < rposition { - let key = (s(lword), s(pword), (rposition - lposition) as i32); - let docids: &mut RoaringBitmap = - word_prefix_pair_proximity_docids.entry(key).or_default(); - docids.push(candidate); - } else { - let key = (s(lword), s(pword), (lposition - rposition) as i32); - let docids: &mut RoaringBitmap = - prefix_word_pair_proximity_docids.entry(key).or_default(); - docids.push(candidate); - }; - } - } - } - } - - let mut keys = 
word_docids.keys().collect::>(); - keys.sort_unstable(); - let words_fst = fst::Set::from_iter(keys).unwrap().map_data(Cow::Owned).unwrap(); - - TestContext { - words_fst, - word_docids, - exact_word_docids, - word_prefix_docids, - exact_word_prefix_docids, - word_pair_proximity_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, - docid_words, - } - } - } -} diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs deleted file mode 100644 index 182f9fbea..000000000 --- a/milli/src/search/criteria/proximity.rs +++ /dev/null @@ -1,712 +0,0 @@ -use std::collections::btree_map::{self, BTreeMap}; -use std::collections::hash_map::HashMap; - -use log::debug; -use roaring::RoaringBitmap; -use slice_group_by::GroupBy; - -use super::{ - query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, - Criterion, CriterionParameters, CriterionResult, -}; -use crate::search::criteria::InitialCandidates; -use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; -use crate::search::{build_dfa, CriterionImplementationStrategy, WordDerivationsCache}; -use crate::{Position, Result}; - -type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; - -/// Threshold on the number of candidates that will make -/// the system choose between one algorithm or another. -const CANDIDATES_THRESHOLD: u64 = 1000; - -/// Threshold on the number of proximity that will make -/// the system choose between one algorithm or another. -const PROXIMITY_THRESHOLD: u8 = 0; - -pub struct Proximity<'t> { - ctx: &'t dyn Context<'t>, - /// (max_proximity, query_tree, allowed_candidates) - state: Option<(u8, Operation, RoaringBitmap)>, - proximity: u8, - initial_candidates: InitialCandidates, - parent: Box, - candidates_cache: Cache, - plane_sweep_cache: Option>, - implementation_strategy: CriterionImplementationStrategy, -} - -impl<'t> Proximity<'t> { - pub fn new( - ctx: &'t dyn Context<'t>, - parent: Box, - implementation_strategy: CriterionImplementationStrategy, - ) -> Self { - Proximity { - ctx, - state: None, - proximity: 0, - initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), - parent, - candidates_cache: Cache::new(), - plane_sweep_cache: None, - implementation_strategy, - } - } -} - -impl<'t> Criterion for Proximity<'t> { - #[logging_timer::time("Proximity::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - // remove excluded candidates when next is called, instead of doing it in the loop. 
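The `Proximity` criterion whose `next` method begins here follows the same bucket protocol as the other criteria in the chain: before producing the next bucket, the documents already returned or discarded by a previous call are subtracted from the allowed candidates. A minimal, self-contained sketch of that bookkeeping using the `roaring` crate (the `allowed`/`excluded` names are illustrative stand-ins, not the milli API):

```rust
use roaring::RoaringBitmap;

fn main() {
    // Candidates still allowed for this criterion, and documents that were
    // already handled by an earlier bucket.
    let mut allowed: RoaringBitmap = (0..10).collect();
    let excluded: RoaringBitmap = (0..3).collect();

    // `-=` is an in-place set difference on RoaringBitmap, which is what
    // `*allowed_candidates -= params.excluded_candidates` does below.
    allowed -= &excluded;

    assert_eq!(allowed.len(), 7);
    assert!(!allowed.contains(2) && allowed.contains(3));
}
```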
- if let Some((_, _, allowed_candidates)) = self.state.as_mut() { - *allowed_candidates -= params.excluded_candidates; - } - - loop { - debug!( - "Proximity at iteration {} (max prox {:?}) ({:?})", - self.proximity, - self.state.as_ref().map(|(mp, _, _)| mp), - self.state.as_ref().map(|(_, _, cd)| cd), - ); - - match &mut self.state { - Some((max_prox, _, allowed_candidates)) - if allowed_candidates.is_empty() || self.proximity > *max_prox => - { - self.state = None; // reset state - } - Some((_, query_tree, allowed_candidates)) => { - let mut new_candidates = if matches!( - self.implementation_strategy, - CriterionImplementationStrategy::OnlyIterative - ) || (matches!( - self.implementation_strategy, - CriterionImplementationStrategy::Dynamic - ) && allowed_candidates.len() - <= CANDIDATES_THRESHOLD - && self.proximity > PROXIMITY_THRESHOLD) - { - if let Some(cache) = self.plane_sweep_cache.as_mut() { - match cache.next() { - Some((p, candidates)) => { - self.proximity = p; - candidates - } - None => { - self.state = None; // reset state - continue; - } - } - } else { - let cache = resolve_plane_sweep_candidates( - self.ctx, - query_tree, - allowed_candidates, - )?; - self.plane_sweep_cache = Some(cache.into_iter()); - - continue; - } - } else { - // use set theory based algorithm - resolve_candidates( - self.ctx, - query_tree, - self.proximity, - &mut self.candidates_cache, - params.wdcache, - )? - }; - - new_candidates &= &*allowed_candidates; - *allowed_candidates -= &new_candidates; - self.proximity += 1; - - return Ok(Some(CriterionResult { - query_tree: Some(query_tree.clone()), - candidates: Some(new_candidates), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - None => match self.parent.next(params)? { - Some(CriterionResult { - query_tree: Some(query_tree), - candidates, - filtered_candidates, - initial_candidates, - }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => { - resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
- - params.excluded_candidates - } - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; - } - - match initial_candidates { - Some(initial_candidates) => { - self.initial_candidates |= initial_candidates - } - None => self.initial_candidates.map_inplace(|c| c | &candidates), - } - - let maximum_proximity = maximum_proximity(&query_tree); - self.state = Some((maximum_proximity as u8, query_tree, candidates)); - self.proximity = 0; - self.plane_sweep_cache = None; - } - Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - })); - } - None => return Ok(None), - }, - } - } - } -} - -fn resolve_candidates( - ctx: &dyn Context, - query_tree: &Operation, - proximity: u8, - cache: &mut Cache, - wdcache: &mut WordDerivationsCache, -) -> Result { - fn resolve_operation( - ctx: &dyn Context, - query_tree: &Operation, - proximity: u8, - cache: &mut Cache, - wdcache: &mut WordDerivationsCache, - ) -> Result> { - use Operation::{And, Or, Phrase}; - - let result = match query_tree { - And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, - Phrase(words) => { - if proximity == 0 { - let most_left = words - .iter() - .filter_map(|o| o.as_ref()) - .next() - .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let most_right = words - .iter() - .rev() - .filter_map(|o| o.as_ref()) - .next() - .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - - match (most_left, most_right) { - (Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, words)?)], - _otherwise => Default::default(), - } - } else { - Default::default() - } - } - Or(_, ops) => { - let mut output = Vec::new(); - for op in ops { - let result = resolve_operation(ctx, op, proximity, cache, wdcache)?; - output.extend(result); - } - output - } - Operation::Query(q) => { - if proximity == 0 { - let candidates = query_docids(ctx, q, wdcache)?; - vec![(q.clone(), q.clone(), candidates)] - } else { - Default::default() - } - } - }; - - Ok(result) - } - - fn mdfs_pair( - ctx: &dyn Context, - left: &Operation, - right: &Operation, - proximity: u8, - cache: &mut Cache, - wdcache: &mut WordDerivationsCache, - ) -> Result> { - fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator { - (0..=mana.min(left_max)).map(move |m| (m, mana - m)) - } - - let pair_max_proximity = 7; - - let mut output = Vec::new(); - - for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) { - for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) { - let left_key = (left.clone(), left_p); - if !cache.contains_key(&left_key) { - let candidates = resolve_operation(ctx, left, left_p, cache, wdcache)?; - cache.insert(left_key.clone(), candidates); - } - - let right_key = (right.clone(), right_p); - if !cache.contains_key(&right_key) { - let candidates = resolve_operation(ctx, right, right_p, cache, wdcache)?; - cache.insert(right_key.clone(), candidates); - } - - let lefts = cache.get(&left_key).unwrap(); - let rights = cache.get(&right_key).unwrap(); - - for (ll, lr, lcandidates) in lefts { - for (rl, rr, rcandidates) in rights { - let mut candidates = - query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; - if lcandidates.len() < rcandidates.len() { - candidates &= lcandidates; - candidates &= rcandidates; - } else { - candidates &= rcandidates; - candidates &= lcandidates; - 
} - if !candidates.is_empty() { - output.push((ll.clone(), rr.clone(), candidates)); - } - } - } - } - } - - Ok(output) - } - - fn mdfs( - ctx: &dyn Context, - branches: &[Operation], - proximity: u8, - cache: &mut Cache, - wdcache: &mut WordDerivationsCache, - ) -> Result> { - // Extract the first two elements but gives the tail - // that is just after the first element. - let next = - branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t)))); - - match next { - Some((head1, Some((head2, [_])))) => { - mdfs_pair(ctx, head1, head2, proximity, cache, wdcache) - } - Some((head1, Some((head2, tail)))) => { - let mut output = Vec::new(); - for p in 0..=proximity { - for (lhead, _, head_candidates) in - mdfs_pair(ctx, head1, head2, p, cache, wdcache)? - { - if !head_candidates.is_empty() { - for (_, rtail, mut candidates) in - mdfs(ctx, tail, proximity - p, cache, wdcache)? - { - candidates &= &head_candidates; - if !candidates.is_empty() { - output.push((lhead.clone(), rtail, candidates)); - } - } - } - } - } - Ok(output) - } - Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache), - None => Ok(Default::default()), - } - } - - let mut candidates = RoaringBitmap::new(); - for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? { - candidates |= cds; - } - Ok(candidates) -} - -fn resolve_plane_sweep_candidates( - ctx: &dyn Context, - query_tree: &Operation, - allowed_candidates: &RoaringBitmap, -) -> Result> { - /// FIXME may be buggy with query like "new new york" - fn plane_sweep( - groups_positions: Vec>, - consecutive: bool, - ) -> Result> { - fn compute_groups_proximity( - groups: &[(usize, (Position, u8, Position))], - consecutive: bool, - ) -> Option<(Position, u8, Position)> { - // take the inner proximity of the first group as initial - let (_, (_, mut proximity, _)) = groups.first()?; - let (_, (left_most_pos, _, _)) = groups.first()?; - let (_, (_, _, right_most_pos)) = - groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?; - - for pair in groups.windows(2) { - if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair { - // if two positions are equal, meaning that they share at least a word, we return None - if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 { - return None; - } - - let pair_proximity = { - // if intervals are disjoint [..].(..) - if lpos2 > rpos1 { - lpos2 - rpos1 - } - // if the second interval is a subset of the first [.(..).] - else if rpos2 < rpos1 { - (lpos2 - lpos1).min(rpos1 - rpos2) - } - // if intervals overlaps [.(..].) - else { - (lpos2 - lpos1).min(rpos2 - rpos1) - } - }; - - // if groups are in the good order (query order) we remove 1 to the proximity - // the proximity is clamped to 7 - let pair_proximity = - if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) }; - - proximity += pair_proximity as u8 + prox2; - } - } - - // if groups should be consecutives, we will only accept groups with a proximity of 0 - if !consecutive || proximity == 0 { - Some((*left_most_pos, proximity, *right_most_pos)) - } else { - None - } - } - - let groups_len = groups_positions.len(); - - let mut groups_positions: Vec<_> = - groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); - - // Pop top elements of each list. 
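As an aside before the sweep loop resumes below: the interval arithmetic inside `compute_groups_proximity` above is easy to get wrong, so here is the same disjoint/subset/overlap rule extracted into a standalone, runnable sketch (the `(left_pos, right_pos)` tuple representation is illustrative; milli stores `(Position, u8, Position)` triples):

```rust
// Proximity between two position intervals, mirroring the three cases in
// `compute_groups_proximity`; assumes the first interval starts no later
// than the second one, as guaranteed by the sort in the sweep loop.
fn pair_proximity((lpos1, rpos1): (u32, u32), (lpos2, rpos2): (u32, u32)) -> u32 {
    if lpos2 > rpos1 {
        // disjoint intervals: [..].(..)
        lpos2 - rpos1
    } else if rpos2 < rpos1 {
        // the second interval is a subset of the first: [.(..).]
        (lpos2 - lpos1).min(rpos1 - rpos2)
    } else {
        // the intervals overlap: [.(..].)
        (lpos2 - lpos1).min(rpos2 - rpos1)
    }
}

fn main() {
    assert_eq!(pair_proximity((0, 1), (3, 4)), 2); // disjoint
    assert_eq!(pair_proximity((0, 9), (2, 3)), 2); // subset
    assert_eq!(pair_proximity((0, 5), (3, 7)), 2); // overlap
}
```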
-        let mut current = Vec::with_capacity(groups_len);
-        for (i, positions) in groups_positions.iter_mut().enumerate() {
-            match positions.next() {
-                Some(p) => current.push((i, p)),
-                // if a group returns None, it means that the document does not contain all the words,
-                // we return an empty result.
-                None => return Ok(Vec::new()),
-            }
-        }
-
-        // Sort k elements by their positions.
-        current.sort_unstable_by_key(|(_, p)| *p);
-
-        // Find leftmost and rightmost group and their positions.
-        let mut leftmost = *current.first().unwrap();
-        let mut rightmost = *current.last().unwrap();
-
-        let mut output = Vec::new();
-        loop {
-            // Find the position p of the next element of the list of the leftmost group.
-            // If the list is empty, break the loop.
-            let p = groups_positions[leftmost.0].next().map(|p| (leftmost.0, p));
-
-            // Let q be the position of the second group of the interval.
-            let q = current[1];
-
-            // If p > r, then the interval [l, r] is minimal and
-            // we insert it into the heap according to its size.
-            if p.map_or(true, |p| p.1 > rightmost.1) {
-                if let Some(group) = compute_groups_proximity(&current, consecutive) {
-                    output.push(group);
-                }
-            }
-
-            let p = match p {
-                Some(p) => p,
-                None => break,
-            };
-
-            // Replace the leftmost group P in the interval.
-            current[0] = p;
-
-            if p.1 > rightmost.1 {
-                // if [l, r] is minimal, let r = p and l = q.
-                rightmost = p;
-                leftmost = q;
-            } else {
-                // Otherwise, let l = min{p, q}.
-                leftmost = if p.1 < q.1 { p } else { q };
-            }
-
-            // Then update the interval and order of groups_positions in the interval.
-            current.sort_unstable_by_key(|(_, p)| *p);
-        }
-
-        // Sort the list according to the size and the positions.
-        output.sort_unstable();
-
-        Ok(output)
-    }
-
-    fn resolve_operation<'a>(
-        query_tree: &'a Operation,
-        rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
-        words_positions: &HashMap<String, RoaringBitmap>,
-    ) -> Result<Vec<(Position, u8, Position)>> {
-        use Operation::{And, Or, Phrase};
-
-        if let Some(result) = rocache.get(query_tree) {
-            return Ok(result.clone());
-        }
-
-        let result = match query_tree {
-            And(ops) => {
-                let mut groups_positions = Vec::with_capacity(ops.len());
-                for operation in ops {
-                    let positions = resolve_operation(operation, rocache, words_positions)?;
-                    groups_positions.push(positions);
-                }
-                plane_sweep(groups_positions, false)?
-            }
-            Phrase(words) => {
-                let mut groups_positions = Vec::with_capacity(words.len());
-
-                // group stop_words together.
-                for words in words.linear_group_by_key(Option::is_none) {
-                    // skip if it's a group of stop words.
-                    if matches!(words.first(), None | Some(None)) {
-                        continue;
-                    }
-                    // make a consecutive plane-sweep on the subgroup of words.
-                    let mut subgroup = Vec::with_capacity(words.len());
-                    for word in words.iter().map(|w| w.as_deref().unwrap()) {
-                        match words_positions.get(word) {
-                            Some(positions) => {
-                                subgroup.push(positions.iter().map(|p| (p, 0, p)).collect())
-                            }
-                            None => return Ok(vec![]),
-                        }
-                    }
-                    match subgroup.len() {
-                        0 => {}
-                        1 => groups_positions.push(subgroup.pop().unwrap()),
-                        _ => groups_positions.push(plane_sweep(subgroup, true)?),
-                    }
-                }
-                match groups_positions.len() {
-                    0 => vec![],
-                    1 => groups_positions.pop().unwrap(),
-                    _ => plane_sweep(groups_positions, false)?,
-                }
-            }
-            Or(_, ops) => {
-                let mut result = Vec::new();
-                for op in ops {
-                    result.extend(resolve_operation(op, rocache, words_positions)?)
-                }
-
-                result.sort_unstable();
-                result
-            }
-            Operation::Query(Query { prefix, kind }) => {
-                let mut result = Vec::new();
-                match kind {
-                    QueryKind::Exact { word, ..
} => { - if *prefix { - let iter = word_derivations(word, true, 0, words_positions) - .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); - result.extend(iter); - } else if let Some(positions) = words_positions.get(word) { - result.extend(positions.iter().map(|p| (p, 0, p))); - } - } - QueryKind::Tolerant { typo, word } => { - let iter = word_derivations(word, *prefix, *typo, words_positions) - .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); - result.extend(iter); - } - } - - result.sort_unstable(); - result - } - }; - - rocache.insert(query_tree, result.clone()); - Ok(result) - } - - fn word_derivations<'a>( - word: &str, - is_prefix: bool, - max_typo: u8, - words_positions: &'a HashMap, - ) -> impl Iterator { - let dfa = build_dfa(word, max_typo, is_prefix); - words_positions.iter().filter_map(move |(document_word, positions)| { - use levenshtein_automata::Distance; - match dfa.eval(document_word) { - Distance::Exact(_) => Some(positions), - Distance::AtLeast(_) => None, - } - }) - } - - let mut resolve_operation_cache = HashMap::new(); - let mut candidates = BTreeMap::new(); - for docid in allowed_candidates { - let words_positions = ctx.docid_words_positions(docid)?; - resolve_operation_cache.clear(); - let positions = - resolve_operation(query_tree, &mut resolve_operation_cache, &words_positions)?; - let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); - let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7); - candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid); - } - - Ok(candidates) -} - -#[cfg(test)] -mod tests { - use std::io::Cursor; - - use big_s::S; - - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::{Criterion, CriterionImplementationStrategy, SearchResult}; - - fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { - let mut documents = Vec::new(); - for prefix in prefixes { - for i in 0..500 { - documents.push( - serde_json::json!({ - "text": format!("{prefix}{i:x}"), - }) - .as_object() - .unwrap() - .clone(), - ) - } - } - documents - } - - #[test] - fn test_proximity_criterion_prefix_handling() { - let mut index = TempIndex::new(); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_primary_key(S("id")); - settings.set_criteria(vec![ - Criterion::Words, - Criterion::Typo, - Criterion::Proximity, - ]); - }) - .unwrap(); - - let mut documents = DocumentsBatchBuilder::new(Vec::new()); - - for doc in [ - // 0 - serde_json::json!({ "text": "zero is exactly the amount of configuration I want" }), - // 1 - serde_json::json!({ "text": "zero bad configuration" }), - // 2 - serde_json::json!({ "text": "zero configuration" }), - // 3 - serde_json::json!({ "text": "zero config" }), - // 4 - serde_json::json!({ "text": "zero conf" }), - // 5 - serde_json::json!({ "text": "zero bad conf" }), - ] { - documents.append_json_object(doc.as_object().unwrap()).unwrap(); - } - for doc in documents_with_enough_different_words_for_prefixes(&["conf"]) { - documents.append_json_object(&doc).unwrap(); - } - let documents = - DocumentsBatchReader::from_reader(Cursor::new(documents.into_inner().unwrap())) - .unwrap(); - - index.add_documents(documents).unwrap(); - - let rtxn = index.read_txn().unwrap(); - - let SearchResult { matching_words: _, candidates: _, documents_ids } = index - .search(&rtxn) - .query("zero c") - 
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
-            .execute()
-            .unwrap();
-        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]");
-
-        let SearchResult { matching_words: _, candidates: _, documents_ids } = index
-            .search(&rtxn)
-            .query("zero co")
-            .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
-            .execute()
-            .unwrap();
-        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]");
-
-        let SearchResult { matching_words: _, candidates: _, documents_ids } = index
-            .search(&rtxn)
-            .query("zero con")
-            .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
-            .execute()
-            .unwrap();
-        // Here search results are degraded because `con` is in the prefix cache but it is too
-        // long to be stored in the prefix proximity databases, and we don't want to iterate over
-        // all of its word derivations
-        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]");
-
-        let SearchResult { matching_words: _, candidates: _, documents_ids } = index
-            .search(&rtxn)
-            .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
-            .query("zero conf")
-            .execute()
-            .unwrap();
-        // Here search results are degraded as well, but we can still rank correctly documents
-        // that contain `conf` exactly, and not as a prefix.
-        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 2, 3]");
-
-        let SearchResult { matching_words: _, candidates: _, documents_ids } = index
-            .search(&rtxn)
-            .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
-            .query("zero config")
-            .execute()
-            .unwrap();
-        // `config` is not a common prefix, so the normal methods are used
-        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 4, 5]");
-    }
-}
diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs
deleted file mode 100644
index 69a210e7b..000000000
--- a/milli/src/search/criteria/typo.rs
+++ /dev/null
@@ -1,493 +0,0 @@
-use std::borrow::Cow;
-use std::collections::HashMap;
-use std::mem::take;
-
-use log::debug;
-use roaring::RoaringBitmap;
-
-use super::{
-    query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters,
-    CriterionResult,
-};
-use crate::search::criteria::{resolve_phrase, InitialCandidates};
-use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
-use crate::search::{word_derivations, WordDerivationsCache};
-use crate::Result;
-
-/// Maximum number of typos for a word of any length.
-const MAX_TYPOS_PER_WORD: u8 = 2;
-
-pub struct Typo<'t> {
-    ctx: &'t dyn Context<'t>,
-    /// (max_typos, query_tree, candidates)
-    state: Option<(u8, Operation, Candidates)>,
-    typos: u8,
-    initial_candidates: Option<InitialCandidates>,
-    parent: Box<dyn Criterion + 't>,
-    candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
-}
-
-impl<'t> Typo<'t> {
-    pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
-        Typo {
-            ctx,
-            state: None,
-            typos: 0,
-            initial_candidates: None,
-            parent,
-            candidates_cache: HashMap::new(),
-        }
-    }
-}
-
-impl<'t> Criterion for Typo<'t> {
-    #[logging_timer::time("Typo::{}")]
-    fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
-        use Candidates::{Allowed, Forbidden};
-        // remove excluded candidates when next is called, instead of doing it in the loop.
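`Typo::next`, which continues below, works on a `Candidates` value that is either an explicit allow-list or an exclusion list over the whole index; excluding documents shrinks the former but grows the latter. A sketch of that dual representation (the enum mirrors the `Candidates` type imported above, but its definition is not shown in this patch, so the code here is illustrative only):

```rust
use roaring::RoaringBitmap;

enum Candidates {
    Allowed(RoaringBitmap),
    Forbidden(RoaringBitmap),
}

impl Candidates {
    // The same update as the first lines of `Typo::next`: removing excluded
    // documents subtracts from an allow-list but unions into a forbid-list.
    fn exclude(&mut self, excluded: &RoaringBitmap) {
        match self {
            Candidates::Allowed(candidates) => *candidates -= excluded,
            Candidates::Forbidden(candidates) => *candidates |= excluded,
        }
    }
}

fn main() {
    let mut candidates = Candidates::Allowed((0..10).collect());
    candidates.exclude(&(0..5).collect());
    if let Candidates::Allowed(c) = candidates {
        assert_eq!(c.len(), 5);
    }
}
```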
- match self.state.as_mut() { - Some((_, _, Allowed(candidates))) => *candidates -= params.excluded_candidates, - Some((_, _, Forbidden(candidates))) => *candidates |= params.excluded_candidates, - None => (), - } - - loop { - debug!( - "Typo at iteration {} (max typos {:?}) ({:?})", - self.typos, - self.state.as_ref().map(|(mt, _, _)| mt), - self.state.as_ref().map(|(_, _, cd)| cd), - ); - - match self.state.as_mut() { - Some((max_typos, _, _)) if self.typos > *max_typos => { - self.state = None; // reset state - } - Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { - self.state = None; // reset state - } - Some((_, query_tree, candidates_authorization)) => { - let fst = self.ctx.words_fst(); - let new_query_tree = match self.typos { - typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree( - fst, - query_tree.clone(), - self.typos, - params.wdcache, - )?, - MAX_TYPOS_PER_WORD => { - // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, - // we keep the altered query tree - *query_tree = alterate_query_tree( - fst, - query_tree.clone(), - self.typos, - params.wdcache, - )?; - // we compute the allowed candidates - let query_tree_allowed_candidates = - resolve_query_tree(self.ctx, query_tree, params.wdcache)?; - // we assign the allowed candidates to the candidates authorization. - *candidates_authorization = match take(candidates_authorization) { - Allowed(allowed_candidates) => { - Allowed(query_tree_allowed_candidates & allowed_candidates) - } - Forbidden(forbidden_candidates) => { - Allowed(query_tree_allowed_candidates - forbidden_candidates) - } - }; - query_tree.clone() - } - _otherwise => query_tree.clone(), - }; - - let mut candidates = resolve_candidates( - self.ctx, - &new_query_tree, - self.typos, - &mut self.candidates_cache, - params.wdcache, - )?; - - match candidates_authorization { - Allowed(allowed_candidates) => { - candidates &= &*allowed_candidates; - *allowed_candidates -= &candidates; - } - Forbidden(forbidden_candidates) => { - candidates -= &*forbidden_candidates; - *forbidden_candidates |= &candidates; - } - } - - let initial_candidates = match self.initial_candidates.as_mut() { - Some(initial_candidates) => initial_candidates.take(), - None => InitialCandidates::Estimated(candidates.clone()), - }; - - self.typos += 1; - - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: Some(candidates), - filtered_candidates: None, - initial_candidates: Some(initial_candidates), - })); - } - None => match self.parent.next(params)? 
{ - Some(CriterionResult { - query_tree: Some(query_tree), - candidates, - filtered_candidates, - initial_candidates, - }) => { - self.initial_candidates = - match (self.initial_candidates.take(), initial_candidates) { - (Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic), - (self_ic, parent_ic) => self_ic.or(parent_ic), - }; - - let candidates = match candidates.or(filtered_candidates) { - Some(candidates) => { - Candidates::Allowed(candidates - params.excluded_candidates) - } - None => Candidates::Forbidden(params.excluded_candidates.clone()), - }; - - let maximum_typos = maximum_typo(&query_tree) as u8; - self.state = Some((maximum_typos, query_tree, candidates)); - self.typos = 0; - } - Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - })); - } - None => return Ok(None), - }, - } - } - } -} - -/// Modify the query tree by replacing every tolerant query by an Or operation -/// containing all of the corresponding exact words in the words FST. Each tolerant -/// query will only be replaced by exact query with up to `number_typos` maximum typos. -fn alterate_query_tree( - words_fst: &fst::Set>, - mut query_tree: Operation, - number_typos: u8, - wdcache: &mut WordDerivationsCache, -) -> Result { - fn recurse( - words_fst: &fst::Set>, - operation: &mut Operation, - number_typos: u8, - wdcache: &mut WordDerivationsCache, - ) -> Result<()> { - use Operation::{And, Or, Phrase}; - - match operation { - And(ops) | Or(_, ops) => { - ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) - } - // Because Phrases don't allow typos, no alteration can be done. - Phrase(_words) => Ok(()), - Operation::Query(q) => { - if let QueryKind::Tolerant { typo, word } = &q.kind { - // if no typo is allowed we don't call word_derivations function, - // and directly create an Exact query - if number_typos == 0 { - *operation = Operation::Query(Query { - prefix: q.prefix, - kind: QueryKind::Exact { original_typo: 0, word: word.clone() }, - }); - } else { - let typo = *typo.min(&number_typos); - let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; - let queries = words - .iter() - .map(|(word, typo)| { - Operation::Query(Query { - prefix: false, - kind: QueryKind::Exact { - original_typo: *typo, - word: word.to_string(), - }, - }) - }) - .collect(); - - *operation = Operation::or(false, queries); - } - } - - Ok(()) - } - } - } - - recurse(words_fst, &mut query_tree, number_typos, wdcache)?; - Ok(query_tree) -} - -fn resolve_candidates( - ctx: &dyn Context, - query_tree: &Operation, - number_typos: u8, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, - wdcache: &mut WordDerivationsCache, -) -> Result { - fn resolve_operation( - ctx: &dyn Context, - query_tree: &Operation, - number_typos: u8, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, - wdcache: &mut WordDerivationsCache, - ) -> Result { - use Operation::{And, Or, Phrase, Query}; - - match query_tree { - And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache), - Phrase(words) => resolve_phrase(ctx, words), - Or(_, ops) => { - let mut candidates = RoaringBitmap::new(); - for op in ops { - let docids = resolve_operation(ctx, op, number_typos, cache, wdcache)?; - candidates |= docids; - } - Ok(candidates) - } - Query(q) => { - if q.kind.typo() == number_typos { - Ok(query_docids(ctx, q, wdcache)?) 
- } else { - Ok(RoaringBitmap::new()) - } - } - } - } - - fn mdfs( - ctx: &dyn Context, - branches: &[Operation], - mana: u8, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, - wdcache: &mut WordDerivationsCache, - ) -> Result { - match branches.split_first() { - Some((head, [])) => { - let cache_key = (head.clone(), mana); - if let Some(candidates) = cache.get(&cache_key) { - Ok(candidates.clone()) - } else { - let candidates = resolve_operation(ctx, head, mana, cache, wdcache)?; - cache.insert(cache_key, candidates.clone()); - Ok(candidates) - } - } - Some((head, tail)) => { - let mut candidates = RoaringBitmap::new(); - - for m in 0..=mana { - let mut head_candidates = { - let cache_key = (head.clone(), m); - if let Some(candidates) = cache.get(&cache_key) { - candidates.clone() - } else { - let candidates = resolve_operation(ctx, head, m, cache, wdcache)?; - cache.insert(cache_key, candidates.clone()); - candidates - } - }; - if !head_candidates.is_empty() { - let tail_candidates = mdfs(ctx, tail, mana - m, cache, wdcache)?; - head_candidates &= tail_candidates; - candidates |= head_candidates; - } - } - - Ok(candidates) - } - None => Ok(RoaringBitmap::new()), - } - } - - resolve_operation(ctx, query_tree, number_typos, cache, wdcache) -} - -#[cfg(test)] -mod test { - use super::super::initial::Initial; - use super::super::test::TestContext; - use super::*; - use crate::search::NoopDistinct; - - fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String { - let mut result = String::new(); - while let Some(criterion) = criteria.next(&mut parameters).unwrap() { - result.push_str(&format!("{criterion:?}\n\n")); - } - result - } - - #[test] - fn initial_placeholder_no_facets() { - let context = TestContext::default(); - let query_tree = None; - let facet_candidates = None; - - let criterion_parameters = CriterionParameters { - wdcache: &mut WordDerivationsCache::new(), - excluded_candidates: &RoaringBitmap::new(), - }; - - let parent = - Initial::::new(&context, query_tree, facet_candidates, false, None); - let criteria = Typo::new(&context, Box::new(parent)); - - let result = display_criteria(criteria, criterion_parameters); - insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, initial_candidates: None } - - "###); - } - - #[test] - fn initial_query_tree_no_facets() { - let context = TestContext::default(); - let query_tree = Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ); - - let facet_candidates = None; - - let criterion_parameters = CriterionParameters { - wdcache: &mut WordDerivationsCache::new(), - excluded_candidates: &RoaringBitmap::new(), - }; - let parent = - Initial::::new(&context, Some(query_tree), facet_candidates, false, None); - let criteria = Typo::new(&context, Box::new(parent)); - - let result = display_criteria(criteria, criterion_parameters); - insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: Some(OR - AND - Exact { word: "split" } - Exact { word: "this" } - Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } - - CriterionResult { query_tree: 
Some(OR - AND - Exact { word: "split" } - Exact { word: "this" } - OR - Exact { word: "word" } - Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } - - "###); - } - - #[test] - fn initial_placeholder_with_facets() { - let context = TestContext::default(); - let query_tree = None; - let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - - let criterion_parameters = CriterionParameters { - wdcache: &mut WordDerivationsCache::new(), - excluded_candidates: &RoaringBitmap::new(), - }; - let parent = - Initial::::new(&context, query_tree, Some(facet_candidates), false, None); - let criteria = Typo::new(&context, Box::new(parent)); - - let result = display_criteria(criteria, criterion_parameters); - insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), initial_candidates: None } - - "###); - } - - #[test] - fn initial_query_tree_with_facets() { - let context = TestContext::default(); - let query_tree = Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ); - - let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - - let criterion_parameters = CriterionParameters { - wdcache: &mut WordDerivationsCache::new(), - excluded_candidates: &RoaringBitmap::new(), - }; - let parent = Initial::::new( - &context, - Some(query_tree), - Some(facet_candidates), - false, - None, - ); - let criteria = Typo::new(&context, Box::new(parent)); - - let result = display_criteria(criteria, criterion_parameters); - insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: Some(OR - AND - Exact { word: "split" } - Exact { word: "this" } - Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } - - CriterionResult { query_tree: Some(OR - AND - Exact { word: "split" } - Exact { word: "this" } - OR - Exact { word: "word" } - Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } - - "###); - } -} diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs deleted file mode 100644 index 4c5f8b45b..000000000 --- a/milli/src/search/criteria/words.rs +++ /dev/null @@ -1,106 +0,0 @@ -use log::debug; -use roaring::RoaringBitmap; - -use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::InitialCandidates; -use crate::search::query_tree::Operation; -use crate::Result; - -pub struct Words<'t> { - ctx: &'t dyn Context<'t>, - query_trees: Vec, - candidates: Option, - initial_candidates: Option, - filtered_candidates: Option, - parent: Box, -} - -impl<'t> Words<'t> { - pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { - Words { - ctx, - query_trees: Vec::default(), - candidates: None, - initial_candidates: None, - parent, - filtered_candidates: None, - } - } -} - -impl<'t> Criterion for Words<'t> { - #[logging_timer::time("Words::{}")] - fn next(&mut self, params: &mut 
CriterionParameters) -> Result<Option<CriterionResult>> {
-        // remove excluded candidates when next is called, instead of doing it in the loop.
-        if let Some(candidates) = self.candidates.as_mut() {
-            *candidates -= params.excluded_candidates;
-        }
-
-        loop {
-            debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates);
-
-            match self.query_trees.pop() {
-                Some(query_tree) => {
-                    let candidates = match self.candidates.as_mut() {
-                        Some(allowed_candidates) => {
-                            let mut candidates =
-                                resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
-                            candidates &= &*allowed_candidates;
-                            *allowed_candidates -= &candidates;
-                            Some(candidates)
-                        }
-                        None => None,
-                    };
-
-                    let initial_candidates = self.initial_candidates.clone();
-
-                    return Ok(Some(CriterionResult {
-                        query_tree: Some(query_tree),
-                        candidates,
-                        filtered_candidates: self.filtered_candidates.clone(),
-                        initial_candidates,
-                    }));
-                }
-                None => match self.parent.next(params)? {
-                    Some(CriterionResult {
-                        query_tree: Some(query_tree),
-                        candidates,
-                        filtered_candidates,
-                        initial_candidates,
-                    }) => {
-                        self.query_trees = explode_query_tree(query_tree);
-                        self.candidates = candidates;
-                        self.filtered_candidates = filtered_candidates;
-
-                        self.initial_candidates =
-                            match (self.initial_candidates.take(), initial_candidates) {
-                                (Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic),
-                                (self_ic, parent_ic) => self_ic.or(parent_ic),
-                            };
-                    }
-                    Some(CriterionResult {
-                        query_tree: None,
-                        candidates,
-                        filtered_candidates,
-                        initial_candidates,
-                    }) => {
-                        return Ok(Some(CriterionResult {
-                            query_tree: None,
-                            candidates,
-                            filtered_candidates,
-                            initial_candidates,
-                        }));
-                    }
-                    None => return Ok(None),
-                },
-            }
-        }
-    }
-}
-
-fn explode_query_tree(query_tree: Operation) -> Vec<Operation> {
-    match query_tree {
-        Operation::Or(true, ops) => ops,
-        otherwise => vec![otherwise],
-    }
-}
diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs
deleted file mode 100644
index 3ed683823..000000000
--- a/milli/src/search/distinct/facet_distinct.rs
+++ /dev/null
@@ -1,218 +0,0 @@
-use std::mem::size_of;
-
-use concat_arrays::concat_arrays;
-use heed::types::{ByteSlice, Str, Unit};
-use roaring::RoaringBitmap;
-
-use super::{Distinct, DocIter};
-use crate::error::InternalError;
-use crate::heed_codec::facet::{FacetGroupKey, *};
-use crate::index::db_name;
-use crate::{DocumentId, FieldId, Index, Result};
-
-const FID_SIZE: usize = size_of::<FieldId>();
-const DOCID_SIZE: usize = size_of::<DocumentId>();
-
-/// A distinct implementer that is backed by facets.
-///
-/// On each iteration, the facet values for the
-/// distinct attribute of the first document are retrieved. The document ids for these facet values
-/// are then retrieved and taken out of the candidates and added to the excluded set. We take
-/// care to keep the document we are currently on, and remove it from the excluded list. The next
-/// iterations will never contain any occurrence of a document with the same distinct value as a
-/// document from previous iterations.
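Before the `FacetDistinct` types are defined below, here is a toy model of the exclusion scheme described in this comment, with plain hash maps standing in for the facet databases and bitmaps (names and types are illustrative, not milli's):

```rust
use std::collections::{HashMap, HashSet};

// Keep the first document seen for each distinct value and exclude every
// other document sharing one of its facet values, as described above.
fn distinct(docs: &[(u32, &str)]) -> Vec<u32> {
    // facet value -> documents having that value (the "facet docids" lookup)
    let mut by_value: HashMap<&str, Vec<u32>> = HashMap::new();
    for &(id, value) in docs {
        by_value.entry(value).or_default().push(id);
    }

    let mut excluded = HashSet::new();
    let mut kept = Vec::new();
    for &(id, value) in docs {
        if excluded.contains(&id) {
            continue;
        }
        // Exclude all the documents with the same distinct value...
        excluded.extend(by_value[value].iter().copied());
        // ...but keep the document we are currently on.
        excluded.remove(&id);
        kept.push(id);
    }
    kept
}

fn main() {
    let docs = [(0, "red"), (1, "red"), (2, "blue"), (3, "red")];
    assert_eq!(distinct(&docs), vec![0, 2]);
}
```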
-#[derive(Clone)] -pub struct FacetDistinct<'a> { - distinct: FieldId, - index: &'a Index, - txn: &'a heed::RoTxn<'a>, -} - -impl<'a> FacetDistinct<'a> { - pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { - Self { distinct, index, txn } - } -} - -pub struct FacetDistinctIter<'a> { - candidates: RoaringBitmap, - distinct: FieldId, - excluded: RoaringBitmap, - index: &'a Index, - iter_offset: usize, - txn: &'a heed::RoTxn<'a>, -} - -impl<'a> FacetDistinctIter<'a> { - fn facet_string_docids(&self, key: &str) -> heed::Result> { - self.index - .facet_id_string_docids - .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key }) - .map(|opt| opt.map(|v| v.bitmap)) - } - - fn facet_number_docids(&self, key: f64) -> heed::Result> { - // get facet docids on level 0 - self.index - .facet_id_f64_docids - .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key }) - .map(|opt| opt.map(|v| v.bitmap)) - } - - fn distinct_string(&mut self, id: DocumentId) -> Result<()> { - let iter = facet_string_values(id, self.distinct, self.index, self.txn)?; - - for item in iter { - let ((_, _, value), _) = item?; - let facet_docids = - self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry { - db_name: db_name::FACET_ID_STRING_DOCIDS, - key: None, - })?; - self.excluded |= facet_docids; - } - - self.excluded.remove(id); - - Ok(()) - } - - fn distinct_number(&mut self, id: DocumentId) -> Result<()> { - let iter = facet_number_values(id, self.distinct, self.index, self.txn)?; - - for item in iter { - let ((_, _, value), _) = item?; - let facet_docids = - self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry { - db_name: db_name::FACET_ID_F64_DOCIDS, - key: None, - })?; - self.excluded |= facet_docids; - } - - self.excluded.remove(id); - - Ok(()) - } - - /// Performs the next iteration of the facet distinct. This is a convenience method that is - /// called by the Iterator::next implementation that transposes the result. It makes error - /// handling easier. - fn next_inner(&mut self) -> Result> { - // The first step is to remove all the excluded documents from our candidates - self.candidates -= &self.excluded; - - let mut candidates_iter = self.candidates.iter().skip(self.iter_offset); - match candidates_iter.next() { - Some(id) => { - // We distinct the document id on its facet strings and facet numbers. - self.distinct_string(id)?; - self.distinct_number(id)?; - - // The first document of each iteration is kept, since the next call to - // `difference_with` will filter out all the documents for that facet value. By - // increasing the offset we make sure to get the first valid value for the next - // distinct document to keep. - self.iter_offset += 1; - - Ok(Some(id)) - } - // no more candidate at this offset, return. - None => Ok(None), - } - } -} - -#[allow(clippy::drop_non_drop)] -fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] { - concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) -} - -fn facet_number_values<'a>( - id: DocumentId, - distinct: FieldId, - index: &Index, - txn: &'a heed::RoTxn, -) -> Result> { - let key = facet_values_prefix_key(distinct, id); - - let iter = index - .field_id_docid_facet_f64s - .remap_key_type::() - .prefix_iter(txn, &key)? 
-        .remap_key_type::<FieldDocIdFacetF64Codec>();
-
-    Ok(iter)
-}
-
-fn facet_string_values<'a>(
-    id: DocumentId,
-    distinct: FieldId,
-    index: &Index,
-    txn: &'a heed::RoTxn,
-) -> Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, Str>> {
-    let key = facet_values_prefix_key(distinct, id);
-
-    let iter = index
-        .field_id_docid_facet_strings
-        .remap_key_type::<ByteSlice>()
-        .prefix_iter(txn, &key)?
-        .remap_types::<FieldDocIdFacetStringCodec, Str>();
-
-    Ok(iter)
-}
-
-impl Iterator for FacetDistinctIter<'_> {
-    type Item = Result<DocumentId>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.next_inner().transpose()
-    }
-}
-
-impl DocIter for FacetDistinctIter<'_> {
-    fn into_excluded(self) -> RoaringBitmap {
-        self.excluded
-    }
-}
-
-impl<'a> Distinct for FacetDistinct<'a> {
-    type Iter = FacetDistinctIter<'a>;
-
-    fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
-        FacetDistinctIter {
-            candidates,
-            distinct: self.distinct,
-            excluded,
-            index: self.index,
-            iter_offset: 0,
-            txn: self.txn,
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::super::test::{generate_index, validate_distinct_candidates};
-    use super::*;
-
-    macro_rules! test_facet_distinct {
-        ($name:ident, $distinct:literal) => {
-            #[test]
-            fn $name() {
-                let (index, fid, candidates) = generate_index($distinct);
-                let txn = index.read_txn().unwrap();
-                let mut map_distinct = FacetDistinct::new(fid, &index, &txn);
-                let excluded = RoaringBitmap::new();
-                let mut iter = map_distinct.distinct(candidates.clone(), excluded);
-                let count = validate_distinct_candidates(iter.by_ref(), fid, &index);
-                let excluded = iter.into_excluded();
-                assert_eq!(count as u64 + excluded.len(), candidates.len());
-            }
-        };
-    }
-
-    test_facet_distinct!(test_string, "txt");
-    test_facet_distinct!(test_strings, "txts");
-    test_facet_distinct!(test_number, "cat-int");
-}
diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs
deleted file mode 100644
index 12374c1f5..000000000
--- a/milli/src/search/distinct/mod.rs
+++ /dev/null
@@ -1,155 +0,0 @@
-mod facet_distinct;
-mod noop_distinct;
-
-pub use facet_distinct::FacetDistinct;
-pub use noop_distinct::NoopDistinct;
-use roaring::RoaringBitmap;
-
-use crate::{DocumentId, Result};
-
-/// A trait implemented by document iterators that are returned by calls to `Distinct::distinct`.
-/// It provides a way to get back ownership of the excluded set.
-pub trait DocIter: Iterator<Item = Result<DocumentId>> {
-    /// Returns ownership of the internal excluded set.
-    fn into_excluded(self) -> RoaringBitmap;
-}
-
-/// A trait that is implemented by structs that perform a distinct on `candidates`. Calling distinct
-/// must return an iterator containing only distinct documents, and add the discarded documents to
-/// the excluded set. The excluded set can later be retrieved by calling `DocIter::excluded` on the
-/// returned iterator.
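A self-contained sketch of the contract the two traits defined just below establish: the iterator yields the kept documents, and once it is exhausted it gives back the set of discarded ids. A single concrete type stands in for the real trait objects, and error handling is omitted:

```rust
use roaring::RoaringBitmap;

struct DedupIter {
    kept: std::vec::IntoIter<u32>,
    excluded: RoaringBitmap,
}

impl Iterator for DedupIter {
    type Item = u32;
    fn next(&mut self) -> Option<u32> {
        self.kept.next()
    }
}

impl DedupIter {
    // The equivalent of `DocIter::into_excluded`: hand ownership of the
    // excluded set back to the caller once iteration is done.
    fn into_excluded(self) -> RoaringBitmap {
        self.excluded
    }
}

fn main() {
    let mut iter = DedupIter { kept: vec![1, 4, 9].into_iter(), excluded: (5..8).collect() };
    let kept: Vec<u32> = iter.by_ref().collect();
    assert_eq!(kept, vec![1, 4, 9]);
    assert_eq!(iter.into_excluded().len(), 3);
}
```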
-pub trait Distinct { - type Iter: DocIter; - - fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; -} - -#[cfg(test)] -mod test { - use std::collections::HashSet; - use std::io::Cursor; - - use once_cell::sync::Lazy; - use rand::seq::SliceRandom; - use rand::Rng; - use roaring::RoaringBitmap; - use serde_json::{json, Value}; - - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::index::Index; - use crate::update::{ - IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, - }; - use crate::{DocumentId, FieldId, BEU32}; - - static JSON: Lazy> = Lazy::new(|| { - let mut rng = rand::thread_rng(); - let num_docs = rng.gen_range(10..30); - - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - let txts = ["Toto", "Titi", "Tata"]; - let cats = (1..10).map(|i| i.to_string()).collect::>(); - let cat_ints = (1..10).collect::>(); - - for i in 0..num_docs { - let txt = txts.choose(&mut rng).unwrap(); - let mut sample_txts = cats.clone(); - sample_txts.shuffle(&mut rng); - - let mut sample_ints = cat_ints.clone(); - sample_ints.shuffle(&mut rng); - - let json = json!({ - "id": i, - "txt": txt, - "cat-int": rng.gen_range(0..3), - "txts": sample_txts[..(rng.gen_range(0..3))], - "cat-ints": sample_ints[..(rng.gen_range(0..3))], - }); - - let object = match json { - Value::Object(object) => object, - _ => panic!(), - }; - - builder.append_json_object(&object).unwrap(); - } - - builder.into_inner().unwrap() - }); - - /// Returns a temporary index populated with random test documents, the FieldId for the - /// distinct attribute, and the RoaringBitmap with the document ids. - pub(crate) fn generate_index(distinct: &str) -> (TempIndex, FieldId, RoaringBitmap) { - let index = TempIndex::new(); - let mut txn = index.write_txn().unwrap(); - - // set distinct and faceted attributes for the index. - let config = IndexerConfig::default(); - let mut update = Settings::new(&mut txn, &index, &config); - update.set_distinct_field(distinct.to_string()); - update.execute(|_| (), || false).unwrap(); - - // add documents to the index - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }; - let addition = - IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| (), || false) - .unwrap(); - - let reader = - crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) - .unwrap(); - - let (addition, user_error) = addition.add_documents(reader).unwrap(); - user_error.unwrap(); - addition.execute().unwrap(); - - let fields_map = index.fields_ids_map(&txn).unwrap(); - let fid = fields_map.id(distinct).unwrap(); - - let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap(); - let map = (0..documents.documents_count()).collect(); - - txn.commit().unwrap(); - - (index, fid, map) - } - - /// Checks that all the candidates are distinct, and returns the candidates number. 
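The test helper documented here boils down to a "seen set" check. A trimmed-down, runnable version of the same idea, with plain strings in place of the JSON values read back from the index (the function name is made up for the example):

```rust
use std::collections::HashSet;

fn assert_all_distinct(values: &[&str]) -> usize {
    let mut seen = HashSet::new();
    let mut count = 0;
    for value in values {
        count += 1;
        // `insert` returns false when the value was already present.
        assert!(seen.insert(*value), "duplicate distinct value: {value}");
    }
    count
}

fn main() {
    assert_eq!(assert_all_distinct(&["Toto", "Titi", "Tata"]), 3);
}
```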
- pub(crate) fn validate_distinct_candidates( - candidates: impl Iterator>, - distinct: FieldId, - index: &Index, - ) -> usize { - fn test(seen: &mut HashSet, value: &Value) { - match value { - Value::Null | Value::Object(_) | Value::Bool(_) => (), - Value::Number(_) | Value::String(_) => { - let s = value.to_string(); - assert!(seen.insert(s)); - } - Value::Array(values) => values.iter().for_each(|value| test(seen, value)), - } - } - - let mut seen = HashSet::::new(); - - let txn = index.read_txn().unwrap(); - let mut count = 0; - for candidate in candidates { - count += 1; - let candidate = candidate.unwrap(); - let id = BEU32::new(candidate); - let document = index.documents.get(&txn, &id).unwrap().unwrap(); - let value = document.get(distinct).unwrap(); - let value = serde_json::from_slice(value).unwrap(); - test(&mut seen, &value); - } - count - } -} diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs deleted file mode 100644 index 96a1f7d5d..000000000 --- a/milli/src/search/distinct/noop_distinct.rs +++ /dev/null @@ -1,55 +0,0 @@ -use roaring::bitmap::IntoIter; -use roaring::RoaringBitmap; - -use super::{Distinct, DocIter}; -use crate::{DocumentId, Result}; - -/// A distinct implementer that does not perform any distinct, -/// and simply returns an iterator to the candidates. -pub struct NoopDistinct; - -pub struct NoopDistinctIter { - candidates: IntoIter, - excluded: RoaringBitmap, -} - -impl Iterator for NoopDistinctIter { - type Item = Result; - - fn next(&mut self) -> Option { - self.candidates.next().map(Ok) - } -} - -impl DocIter for NoopDistinctIter { - fn into_excluded(self) -> RoaringBitmap { - self.excluded - } -} - -impl Distinct for NoopDistinct { - type Iter = NoopDistinctIter; - - fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { - NoopDistinctIter { candidates: candidates.into_iter(), excluded } - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_noop() { - let candidates = (1..10).collect(); - let excluded = RoaringBitmap::new(); - let mut iter = NoopDistinct.distinct(candidates, excluded); - assert_eq!( - iter.by_ref().map(Result::unwrap).collect::>(), - (1..10).collect::>() - ); - - let excluded = iter.into_excluded(); - assert!(excluded.is_empty()); - } -} diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 2aae78bb2..e9435f180 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -309,7 +309,7 @@ impl<'a> FacetDistribution<'a> { let mut distribution = BTreeMap::new(); for (fid, name) in fields_ids_map.iter() { if crate::is_faceted(name, &fields) { - let min_value = if let Some(min_value) = crate::search::criteria::facet_min_value( + let min_value = if let Some(min_value) = crate::search::facet::facet_min_value( self.index, self.rtxn, fid, @@ -319,7 +319,7 @@ impl<'a> FacetDistribution<'a> { } else { continue; }; - let max_value = if let Some(max_value) = crate::search::criteria::facet_max_value( + let max_value = if let Some(max_value) = crate::search::facet::facet_max_value( self.index, self.rtxn, fid, diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index c88d4e9e7..51f1bf005 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -2,11 +2,13 @@ pub use facet_sort_ascending::ascending_facet_sort; pub use facet_sort_descending::descending_facet_sort; use heed::types::{ByteSlice, 
DecodeIgnore};
 use heed::{BytesDecode, RoTxn};
+use roaring::RoaringBitmap;
 
 pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
 pub use self::filter::{BadGeoError, Filter};
-use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
+use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec};
 use crate::heed_codec::ByteSliceRefCodec;
+use crate::{Index, Result};
 
 mod facet_distribution;
 mod facet_distribution_iter;
 mod facet_range_search;
@@ -14,6 +16,38 @@ mod facet_sort_ascending;
 mod facet_sort_descending;
 mod filter;
 
+fn facet_extreme_value<'t>(
+    mut extreme_it: impl Iterator<Item = heed::Result<(RoaringBitmap, &'t [u8])>> + 't,
+) -> Result<Option<f64>> {
+    let extreme_value =
+        if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) };
+    let (_, extreme_value) = extreme_value?;
+
+    Ok(OrderedF64Codec::bytes_decode(extreme_value))
+}
+
+pub fn facet_min_value<'t>(
+    index: &'t Index,
+    rtxn: &'t heed::RoTxn,
+    field_id: u16,
+    candidates: RoaringBitmap,
+) -> Result<Option<f64>> {
+    let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>();
+    let it = ascending_facet_sort(rtxn, db, field_id, candidates)?;
+    facet_extreme_value(it)
+}
+
+pub fn facet_max_value<'t>(
+    index: &'t Index,
+    rtxn: &'t heed::RoTxn,
+    field_id: u16,
+    candidates: RoaringBitmap,
+) -> Result<Option<f64>> {
+    let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>();
+    let it = descending_facet_sort(rtxn, db, field_id, candidates)?;
+    facet_extreme_value(it)
+}
+
 /// Get the first facet value in the facet database
 pub(crate) fn get_first_facet_value<'t, BoundCodec>(
     txn: &'t RoTxn,
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index dc236dd0d..1015b01cb 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -1,38 +1,27 @@
-use std::borrow::Cow;
-use std::collections::hash_map::{Entry, HashMap};
-use std::fmt;
-use std::mem::take;
-use std::result::Result as StdResult;
-use std::str::Utf8Error;
-use std::time::Instant;
-
-use charabia::TokenizerBuilder;
-use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
-use fst::automaton::Str;
-use fst::{Automaton, IntoStreamer, Streamer};
-use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
-use log::debug;
-use once_cell::sync::Lazy;
-use roaring::bitmap::RoaringBitmap;
-
 pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
 use self::fst_utils::{Complement, Intersection, StartsWith, Union};
 pub use self::matches::{
     FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
 };
-use self::query_tree::QueryTreeBuilder;
-use crate::error::UserError;
-use crate::search::criteria::r#final::{Final, FinalResult};
-use crate::search::criteria::InitialCandidates;
-use crate::{AscDesc, Criterion, DocumentId, Index, Member, Result};
+use crate::{
+    execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
+};
+use fst::automaton::Str;
+use fst::{Automaton, IntoStreamer, Streamer};
+use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
+use once_cell::sync::Lazy;
+use roaring::bitmap::RoaringBitmap;
+use std::borrow::Cow;
+use std::collections::hash_map::{Entry, HashMap};
+use std::fmt;
+use std::result::Result as StdResult;
+use std::str::Utf8Error;
 
 // Building these factories is not free.
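The comment above refers to the `LEVDIST*` statics kept in this hunk: the expensive `LevenshteinAutomatonBuilder` is built once behind a `Lazy`, and only cheap per-word DFAs are derived from it afterwards. A minimal sketch of the same pattern (the word and the assertions are arbitrary examples):

```rust
use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder as LevBuilder};
use once_cell::sync::Lazy;

// One typo allowed, with transpositions counted as a single typo.
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));

fn main() {
    // Build a prefix-tolerant DFA for "world": any word starting with
    // something at distance <= 1 from "world" is accepted.
    let dfa = LEVDIST1.build_prefix_dfa("world");
    assert!(matches!(dfa.eval("worlds"), Distance::Exact(_)));
    assert!(matches!(dfa.eval("xyz"), Distance::AtLeast(_)));
}
```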
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index dc236dd0d..1015b01cb 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -1,38 +1,27 @@
-use std::borrow::Cow;
-use std::collections::hash_map::{Entry, HashMap};
-use std::fmt;
-use std::mem::take;
-use std::result::Result as StdResult;
-use std::str::Utf8Error;
-use std::time::Instant;
-
-use charabia::TokenizerBuilder;
-use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
-use fst::automaton::Str;
-use fst::{Automaton, IntoStreamer, Streamer};
-use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
-use log::debug;
-use once_cell::sync::Lazy;
-use roaring::bitmap::RoaringBitmap;
-
 pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
 use self::fst_utils::{Complement, Intersection, StartsWith, Union};
 pub use self::matches::{
     FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
 };
-use self::query_tree::QueryTreeBuilder;
-use crate::error::UserError;
-use crate::search::criteria::r#final::{Final, FinalResult};
-use crate::search::criteria::InitialCandidates;
-use crate::{AscDesc, Criterion, DocumentId, Index, Member, Result};
+use crate::{
+    execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
+};
+use fst::automaton::Str;
+use fst::{Automaton, IntoStreamer, Streamer};
+use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
+use once_cell::sync::Lazy;
+use roaring::bitmap::RoaringBitmap;
+use std::borrow::Cow;
+use std::collections::hash_map::{Entry, HashMap};
+use std::fmt;
+use std::result::Result as StdResult;
+use std::str::Utf8Error;
 
 // Building these factories is not free.
 static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
 static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
 static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
 
-mod criteria;
-mod distinct;
 pub mod facet;
 mod fst_utils;
 mod matches;
@@ -135,162 +124,18 @@ impl<'a> Search<'a> {
     }
 
     pub fn execute(&self) -> Result<SearchResult> {
-        // We create the query tree by spliting the query into tokens.
-        let before = Instant::now();
-        let (query_tree, primitive_query, matching_words) = match self.query.as_ref() {
-            Some(query) => {
-                let mut builder = QueryTreeBuilder::new(self.rtxn, self.index)?;
-                builder.terms_matching_strategy(self.terms_matching_strategy);
-
-                builder.authorize_typos(self.is_typo_authorized()?);
-
-                builder.words_limit(self.words_limit);
-                // We make sure that the analyzer is aware of the stop words
-                // this ensures that the query builder is able to properly remove them.
-                let mut tokbuilder = TokenizerBuilder::new();
-                let stop_words = self.index.stop_words(self.rtxn)?;
-                if let Some(ref stop_words) = stop_words {
-                    tokbuilder.stop_words(stop_words);
-                }
-
-                let script_lang_map = self.index.script_language(self.rtxn)?;
-                if !script_lang_map.is_empty() {
-                    tokbuilder.allow_list(&script_lang_map);
-                }
-
-                let tokenizer = tokbuilder.build();
-                let tokens = tokenizer.tokenize(query);
-                builder
-                    .build(tokens)?
-                    .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw)))
-            }
-            None => (None, None, None),
-        };
-
-        debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed());
-
-        // We create the original candidates with the facet conditions results.
-        let before = Instant::now();
-        let filtered_candidates = match &self.filter {
-            Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?),
-            None => None,
-        };
-
-        debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed());
-
-        // We check that we are allowed to use the sort criteria, we check
-        // that they are declared in the sortable fields.
-        if let Some(sort_criteria) = &self.sort_criteria {
-            let sortable_fields = self.index.sortable_fields(self.rtxn)?;
-            for asc_desc in sort_criteria {
-                match asc_desc.member() {
-                    Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => {
-                        return Err(UserError::InvalidSortableAttribute {
-                            field: field.to_string(),
-                            valid_fields: sortable_fields.into_iter().collect(),
-                        })?
-                    }
-                    Member::Geo(_) if !sortable_fields.contains("_geo") => {
-                        return Err(UserError::InvalidSortableAttribute {
-                            field: "_geo".to_string(),
-                            valid_fields: sortable_fields.into_iter().collect(),
-                        })?
-                    }
-                    _ => (),
-                }
-            }
-        }
-
-        // We check that the sort ranking rule exists and throw an
-        // error if we try to use it and that it doesn't.
-        let sort_ranking_rule_missing = !self.index.criteria(self.rtxn)?.contains(&Criterion::Sort);
-        let empty_sort_criteria = self.sort_criteria.as_ref().map_or(true, |s| s.is_empty());
-        if sort_ranking_rule_missing && !empty_sort_criteria {
-            return Err(UserError::SortRankingRuleMissing.into());
-        }
-
-        let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
-
-        match self.index.distinct_field(self.rtxn)? {
-            None => {
-                let criteria = criteria_builder.build::<NoopDistinct>(
-                    query_tree,
-                    primitive_query,
-                    filtered_candidates,
-                    self.sort_criteria.clone(),
-                    self.exhaustive_number_hits,
-                    None,
-                    self.criterion_implementation_strategy,
-                )?;
-                self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria)
-            }
-            Some(name) => {
-                let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
-                match field_ids_map.id(name) {
-                    Some(fid) => {
-                        let distinct = FacetDistinct::new(fid, self.index, self.rtxn);
-
-                        let criteria = criteria_builder.build(
-                            query_tree,
-                            primitive_query,
-                            filtered_candidates,
-                            self.sort_criteria.clone(),
-                            self.exhaustive_number_hits,
-                            Some(distinct.clone()),
-                            self.criterion_implementation_strategy,
-                        )?;
-                        self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria)
-                    }
-                    None => Ok(SearchResult::default()),
-                }
-            }
-        }
-    }
-
-    fn perform_sort<D: Distinct>(
-        &self,
-        mut distinct: D,
-        matching_words: MatchingWords,
-        mut criteria: Final,
-    ) -> Result<SearchResult> {
-        let mut offset = self.offset;
-        let mut initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new());
-        let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?;
-        let mut documents_ids = Vec::new();
-
-        while let Some(FinalResult { candidates, initial_candidates: ic, .. }) =
-            criteria.next(&excluded_candidates)?
-        {
-            debug!("Number of candidates found {}", candidates.len());
-
-            let excluded = take(&mut excluded_candidates);
-            let mut candidates = distinct.distinct(candidates, excluded);
-
-            initial_candidates |= ic;
-
-            if offset != 0 {
-                let discarded = candidates.by_ref().take(offset).count();
-                offset = offset.saturating_sub(discarded);
-            }
-
-            for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) {
-                documents_ids.push(candidate?);
-            }
-
-            excluded_candidates |= candidates.into_excluded();
-
-            if documents_ids.len() == self.limit {
-                break;
-            }
-        }
-
-        initial_candidates.map_inplace(|c| c - excluded_candidates);
-
-        Ok(SearchResult {
-            matching_words,
-            candidates: initial_candidates.into_inner(),
-            documents_ids,
-        })
+        let mut ctx = SearchContext::new(self.index, self.rtxn);
+        execute_search(
+            &mut ctx,
+            &self.query,
+            self.terms_matching_strategy,
+            &self.filter,
+            self.offset,
+            self.limit,
+            Some(self.words_limit),
+            &mut DefaultSearchLogger,
+            &mut DefaultSearchLogger,
+        )
     }
 }
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index fff180879..15c895583 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -18,7 +18,7 @@ mod words;
 // #[cfg(test)]
 use std::collections::{BTreeSet, HashSet};
 
-use charabia::Tokenize;
+use charabia::{Tokenize, TokenizerBuilder};
 use db_cache::DatabaseCache;
 use graph_based_ranking_rule::{Proximity, Typo};
 use heed::RoTxn;
@@ -224,32 +224,41 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
 #[allow(clippy::too_many_arguments)]
 pub fn execute_search(
     ctx: &mut SearchContext,
-    query: &str,
+    query: &Option<String>,
     terms_matching_strategy: TermsMatchingStrategy,
-    filters: Option<Filter>,
+    filters: &Option<Filter>,
     from: usize,
     length: usize,
+    words_limit: Option<usize>,
     placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
     query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
 ) -> Result<SearchResult> {
-    assert!(!query.is_empty());
-    let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None)?;
-    let graph = QueryGraph::from_query(ctx, query_terms)?;
-
     let mut universe = if let Some(filters) = filters {
         filters.evaluate(ctx.txn, ctx.index)?
     } else {
         ctx.index.documents_ids(ctx.txn)?
     };
 
-    // TODO: other way to tell whether it is a placeholder search
-    // This way of doing things is not correct because if someone searches
-    // for a word that does not appear in any document, the word will be removed
-    // from the graph and thus its number of nodes will be == 2
-    // But in that case, we should return no results.
-    //
-    // The search is a placeholder search only if there are no tokens?
-    let documents_ids = if graph.nodes.len() > 2 {
+    let documents_ids = if let Some(query) = query {
+        // We make sure that the analyzer is aware of the stop words
+        // this ensures that the query builder is able to properly remove them.
+        let mut tokbuilder = TokenizerBuilder::new();
+        let stop_words = ctx.index.stop_words(ctx.txn)?;
+        if let Some(ref stop_words) = stop_words {
+            tokbuilder.stop_words(stop_words);
+        }
+
+        let script_lang_map = ctx.index.script_language(ctx.txn)?;
+        if !script_lang_map.is_empty() {
+            tokbuilder.allow_list(&script_lang_map);
+        }
+
+        let tokenizer = tokbuilder.build();
+        let tokens = tokenizer.tokenize(&query);
+
+        let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
+        let graph = QueryGraph::from_query(ctx, query_terms)?;
+
         universe = resolve_maximally_reduced_query_graph(
             ctx,
             &universe,
@@ -259,6 +268,7 @@ pub fn execute_search(
         )?;
 
         let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, terms_matching_strategy)?;
+
         bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
     } else {
         let ranking_rules = get_ranking_rules_for_placeholder_search(ctx)?;
diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index 8591670b8..e239d4669 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -427,7 +427,7 @@ impl LocatedQueryTerm {
 /// Convert the tokenised search query into a list of located query terms.
 pub fn located_query_terms_from_string(
     ctx: &mut SearchContext,
-    query: NormalizedTokenIter<Vec<u8>>,
+    query: NormalizedTokenIter<&[u8]>,
     words_limit: Option<usize>,
 ) -> Result<Vec<LocatedQueryTerm>> {
     let nbr_typos = number_of_typos_allowed(ctx)?;
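With this change the old criteria pipeline is gone from the public path: `Search::execute` is now a thin shim over `execute_search` in the new module. A hedged caller-side sketch, using only builder methods that already exist on `Search`; the concrete query, strategy, and limit are made up for illustration:

    use milli::{Index, Result, Search, SearchResult, TermsMatchingStrategy};

    fn run_query(index: &Index) -> Result<Vec<u32>> {
        let rtxn = index.read_txn()?;
        let mut search = Search::new(&rtxn, index);
        search.query("hello world");
        search.terms_matching_strategy(TermsMatchingStrategy::Last);
        search.limit(20);
        // Internally this now builds a SearchContext and delegates to
        // execute_search(), passing DefaultSearchLogger for both loggers.
        let SearchResult { documents_ids, .. } = search.execute()?;
        Ok(documents_ids)
    }
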
From 7169d85115050c58e2e7b373265a3b2f0f3f435e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Thu, 23 Mar 2023 09:39:16 +0100
Subject: [PATCH 084/234] Remove old query_tree code and make clippy happy

---
 milli/src/search/facet/facet_distribution.rs  |   42 +-
 .../search/facet/facet_distribution_iter.rs   |    4 +-
 milli/src/search/facet/facet_range_search.rs  |    8 +-
 .../src/search/facet/facet_sort_ascending.rs  |    4 +-
 .../src/search/facet/facet_sort_descending.rs |    4 +-
 milli/src/search/mod.rs                       |  170 +-
 milli/src/search/new/mod.rs                   |    5 +-
 milli/src/search/query_tree.rs                | 1435 -----------------
 8 files changed, 82 insertions(+), 1590 deletions(-)
 delete mode 100755 milli/src/search/query_tree.rs

diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs
index e9435f180..f5f32fecf 100644
--- a/milli/src/search/facet/facet_distribution.rs
+++ b/milli/src/search/facet/facet_distribution.rs
@@ -73,7 +73,7 @@ impl<'a> FacetDistribution<'a> {
         let distribution_prelength = distribution.len();
         let db = self.index.field_id_docid_facet_f64s;
 
-        for docid in candidates.into_iter() {
+        for docid in candidates {
             key_buffer.truncate(mem::size_of::<FieldId>());
             key_buffer.extend_from_slice(&docid.to_be_bytes());
 
             let iter = db
@@ -97,7 +97,7 @@ impl<'a> FacetDistribution<'a> {
         let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();
 
         let db = self.index.field_id_docid_facet_strings;
-        'outer: for docid in candidates.into_iter() {
+        'outer: for docid in candidates {
             key_buffer.truncate(mem::size_of::<FieldId>());
             key_buffer.extend_from_slice(&docid.to_be_bytes());
 
             let iter = db
@@ -505,7 +505,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..10_000).into_iter().collect())
+            .candidates((0..10_000).collect())
             .execute()
             .unwrap();
@@ -513,7 +513,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..5_000).into_iter().collect())
+            .candidates((0..5_000).collect())
             .execute()
             .unwrap();
@@ -521,7 +521,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..5_000).into_iter().collect())
+            .candidates((0..5_000).collect())
             .execute()
             .unwrap();
@@ -529,7 +529,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
            .facets(std::iter::once("colour"))
-            .candidates((0..5_000).into_iter().collect())
+            .candidates((0..5_000).collect())
             .max_values_per_facet(1)
             .execute()
             .unwrap();
@@ -546,7 +546,7 @@ mod tests {
             .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
             .unwrap();
 
-        let facet_values = (0..1000).into_iter().map(|x| format!("{x:x}")).collect::<Vec<_>>();
+        let facet_values = (0..1000).map(|x| format!("{x:x}")).collect::<Vec<_>>();
 
         let mut documents = vec![];
         for i in 0..10_000 {
@@ -582,7 +582,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..10_000).into_iter().collect())
+            .candidates((0..10_000).collect())
             .execute()
             .unwrap();
@@ -590,7 +590,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..5_000).into_iter().collect())
+            .candidates((0..5_000).collect())
             .execute()
             .unwrap();
@@ -606,7 +606,7 @@ mod tests {
             .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
             .unwrap();
 
-        let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+        let facet_values = (0..1000).collect::<Vec<_>>();
 
         let mut documents = vec![];
         for i in 0..1000 {
@@ -634,7 +634,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..1000).into_iter().collect())
+            .candidates((0..1000).collect())
             .compute_stats()
             .unwrap();
@@ -642,7 +642,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((217..777).into_iter().collect())
+            .candidates((217..777).collect())
             .compute_stats()
             .unwrap();
@@ -658,7 +658,7 @@ mod tests {
             .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
             .unwrap();
 
-        let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+        let facet_values = (0..1000).collect::<Vec<_>>();
 
         let mut documents = vec![];
         for i in 0..1000 {
@@ -686,7 +686,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..1000).into_iter().collect())
+            .candidates((0..1000).collect())
             .compute_stats()
             .unwrap();
@@ -694,7 +694,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((217..777).into_iter().collect())
+            .candidates((217..777).collect())
             .compute_stats()
             .unwrap();
@@ -710,7 +710,7 @@ mod tests {
             .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
             .unwrap();
 
-        let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+        let facet_values = (0..1000).collect::<Vec<_>>();
 
         let mut documents = vec![];
         for i in 0..1000 {
@@ -738,7 +738,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..1000).into_iter().collect())
+            .candidates((0..1000).collect())
             .compute_stats()
             .unwrap();
@@ -746,7 +746,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((217..777).into_iter().collect())
+            .candidates((217..777).collect())
             .compute_stats()
             .unwrap();
@@ -762,7 +762,7 @@ mod tests {
             .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
             .unwrap();
 
-        let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+        let facet_values = (0..1000).collect::<Vec<_>>();
 
         let mut documents = vec![];
         for i in 0..1000 {
@@ -794,7 +794,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..1000).into_iter().collect())
+            .candidates((0..1000).collect())
             .compute_stats()
             .unwrap();
@@ -802,7 +802,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((217..777).into_iter().collect())
+            .candidates((217..777).collect())
             .compute_stats()
             .unwrap();
diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs
index bb3c75343..d355b981a 100644
--- a/milli/src/search/facet/facet_distribution_iter.rs
+++ b/milli/src/search/facet/facet_distribution_iter.rs
@@ -142,7 +142,7 @@ mod tests {
         let indexes = [get_simple_index(), get_random_looking_index()];
         for (i, index) in indexes.iter().enumerate() {
             let txn = index.env.read_txn().unwrap();
-            let candidates = (0..=255).into_iter().collect::<RoaringBitmap>();
+            let candidates = (0..=255).collect::<RoaringBitmap>();
             let mut results = String::new();
             iterate_over_facet_distribution(
                 &txn,
@@ -166,7 +166,7 @@ mod tests {
         let indexes = [get_simple_index(), get_random_looking_index()];
         for (i, index) in indexes.iter().enumerate() {
             let txn = index.env.read_txn().unwrap();
-            let candidates = (0..=255).into_iter().collect::<RoaringBitmap>();
+            let candidates = (0..=255).collect::<RoaringBitmap>();
             let mut results = String::new();
             let mut nbr_facets = 0;
             iterate_over_facet_distribution(
diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs
index b1ab6f71f..26854bc1a 100644
--- a/milli/src/search/facet/facet_range_search.rs
+++ b/milli/src/search/facet/facet_range_search.rs
@@ -410,7 +410,7 @@ mod tests {
 
         let mut results = String::new();
 
-        for i in (0..=255).into_iter().rev() {
+        for i in (0..=255).rev() {
             let i = i as f64;
             let start = Bound::Included(i);
             let end = Bound::Included(255.);
@@ -431,7 +431,7 @@ mod tests {
 
         let mut results = String::new();
 
-        for i in (0..=255).into_iter().rev() {
+        for i in (0..=255).rev() {
             let i = i as f64;
             let start = Bound::Excluded(i);
             let end = Bound::Excluded(255.);
@@ -466,7 +466,7 @@ mod tests {
 
         let mut results = String::new();
 
-        for i in (0..=128).into_iter().rev() {
+        for i in (0..=128).rev() {
             let i = i as f64;
             let start = Bound::Included(i);
             let end = Bound::Included(255. - i);
@@ -491,7 +491,7 @@ mod tests {
 
         let mut results = String::new();
 
-        for i in (0..=128).into_iter().rev() {
+        for i in (0..=128).rev() {
             let i = i as f64;
             let start = Bound::Excluded(i);
             let end = Bound::Excluded(255. - i);
diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs
index f59b884de..892401c08 100644
--- a/milli/src/search/facet/facet_sort_ascending.rs
+++ b/milli/src/search/facet/facet_sort_ascending.rs
@@ -132,7 +132,7 @@ mod tests {
         let indexes = [get_simple_index(), get_random_looking_index()];
         for (i, index) in indexes.iter().enumerate() {
             let txn = index.env.read_txn().unwrap();
-            let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+            let candidates = (200..=300).collect::<RoaringBitmap>();
             let mut results = String::new();
             let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap();
             for el in iter {
@@ -154,7 +154,7 @@ mod tests {
         ];
         for (i, index) in indexes.iter().enumerate() {
             let txn = index.env.read_txn().unwrap();
-            let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+            let candidates = (200..=300).collect::<RoaringBitmap>();
             let mut results = String::new();
             let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
             for el in iter {
diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs
index 454b12859..549f50f0a 100644
--- a/milli/src/search/facet/facet_sort_descending.rs
+++ b/milli/src/search/facet/facet_sort_descending.rs
@@ -142,7 +142,7 @@ mod tests {
         ];
         for (i, index) in indexes.iter().enumerate() {
             let txn = index.env.read_txn().unwrap();
-            let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+            let candidates = (200..=300).collect::<RoaringBitmap>();
             let mut results = String::new();
             let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
             let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap();
@@ -165,7 +165,7 @@ mod tests {
         ];
         for (i, index) in indexes.iter().enumerate() {
             let txn = index.env.read_txn().unwrap();
-            let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+            let candidates = (200..=300).collect::<RoaringBitmap>();
             let mut results = String::new();
             let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
             let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap();
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 1015b01cb..46829b986 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -1,21 +1,14 @@
 pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
-use self::fst_utils::{Complement, Intersection, StartsWith, Union};
 pub use self::matches::{
     FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
 };
 use crate::{
     execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
 };
-use fst::automaton::Str;
-use fst::{Automaton, IntoStreamer, Streamer};
 use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
 use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;
-use std::borrow::Cow;
-use std::collections::hash_map::{Entry, HashMap};
 use std::fmt;
-use std::result::Result as StdResult;
-use std::str::Utf8Error;
 
 // Building these factories is not free.
static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -26,7 +19,6 @@ pub mod facet; mod fst_utils; mod matches; pub mod new; -mod query_tree; pub struct Search<'a> { query: Option, @@ -200,70 +192,6 @@ impl Default for TermsMatchingStrategy { } } -pub type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>; - -pub fn word_derivations<'c>( - word: &str, - is_prefix: bool, - max_typo: u8, - fst: &fst::Set>, - cache: &'c mut WordDerivationsCache, -) -> StdResult<&'c [(String, u8)], Utf8Error> { - match cache.entry((word.to_string(), is_prefix, max_typo)) { - Entry::Occupied(entry) => Ok(entry.into_mut()), - Entry::Vacant(entry) => { - // println!("word derivations {word} {is_prefix} {max_typo}"); - let mut derived_words = Vec::new(); - if max_typo == 0 { - if is_prefix { - let prefix = Str::new(word).starts_with(); - let mut stream = fst.search(prefix).into_stream(); - - while let Some(word) = stream.next() { - let word = std::str::from_utf8(word)?; - derived_words.push((word.to_string(), 0)); - } - } else if fst.contains(word) { - derived_words.push((word.to_string(), 0)); - } - } else if max_typo == 1 { - let dfa = build_dfa(word, 1, is_prefix); - let starts = StartsWith(Str::new(get_first(word))); - let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); - - while let Some((word, state)) = stream.next() { - let word = std::str::from_utf8(word)?; - let d = dfa.distance(state.1); - derived_words.push((word.to_string(), d.to_u8())); - } - } else { - let starts = StartsWith(Str::new(get_first(word))); - let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); - let second_dfa = build_dfa(word, 2, is_prefix); - let second = Intersection(&second_dfa, &starts); - let automaton = Union(first, &second); - - let mut stream = fst.search_with_state(automaton).into_stream(); - - while let Some((found_word, state)) = stream.next() { - let found_word = std::str::from_utf8(found_word)?; - // in the case the typo is on the first letter, we know the number of typo - // is two - if get_first(found_word) != get_first(word) { - derived_words.push((found_word.to_string(), 2)); - } else { - // Else, we know that it is the second dfa that matched and compute the - // correct distance - let d = second_dfa.distance((state.1).0); - derived_words.push((found_word.to_string(), d.to_u8())); - } - } - } - Ok(entry.insert(derived_words)) - } - } -} - fn get_first(s: &str) -> &str { match s.chars().next() { Some(c) => &s[..c.len_utf8()], @@ -337,66 +265,66 @@ mod test { assert!(!search.is_typo_authorized().unwrap()); } - #[test] - fn test_one_typos_tolerance() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap(); + // #[test] + // fn test_one_typos_tolerance() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap(); - assert_eq!(found, &[("zealand".to_string(), 1)]); - } + // assert_eq!(found, &[("zealand".to_string(), 1)]); + // } - #[test] - fn test_one_typos_first_letter() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap(); + // #[test] + // fn test_one_typos_first_letter() { + // 
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap(); - assert_eq!(found, &[]); - } + // assert_eq!(found, &[]); + // } - #[test] - fn test_two_typos_tolerance() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap(); + // #[test] + // fn test_two_typos_tolerance() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap(); - assert_eq!(found, &[("zealand".to_string(), 2)]); - } + // assert_eq!(found, &[("zealand".to_string(), 2)]); + // } - #[test] - fn test_two_typos_first_letter() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap(); + // #[test] + // fn test_two_typos_first_letter() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap(); - assert_eq!(found, &[("zealand".to_string(), 2)]); - } + // assert_eq!(found, &[("zealand".to_string(), 2)]); + // } - #[test] - fn test_prefix() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap(); + // #[test] + // fn test_prefix() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap(); - assert_eq!(found, &[("zealand".to_string(), 0)]); - } + // assert_eq!(found, &[("zealand".to_string(), 0)]); + // } - #[test] - fn test_bad_prefix() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap(); + // #[test] + // fn test_bad_prefix() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap(); - assert_eq!(found, &[]); - } + // assert_eq!(found, &[]); + // } - #[test] - fn test_prefix_with_typo() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap(); + // #[test] + // fn test_prefix_with_typo() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap(); - assert_eq!(found, &[("zealand".to_string(), 1)]); - } + // assert_eq!(found, &[("zealand".to_string(), 1)]); + // } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 15c895583..45cad378a 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -15,10 +15,9 @@ mod sort; // TODO: documentation + comments mod 
words; -// #[cfg(test)] use std::collections::{BTreeSet, HashSet}; -use charabia::{Tokenize, TokenizerBuilder}; +use charabia::TokenizerBuilder; use db_cache::DatabaseCache; use graph_based_ranking_rule::{Proximity, Typo}; use heed::RoTxn; @@ -254,7 +253,7 @@ pub fn execute_search( } let tokenizer = tokbuilder.build(); - let tokens = tokenizer.tokenize(&query); + let tokens = tokenizer.tokenize(query); let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; let graph = QueryGraph::from_query(ctx, query_terms)?; diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs deleted file mode 100755 index 1b1a42c1c..000000000 --- a/milli/src/search/query_tree.rs +++ /dev/null @@ -1,1435 +0,0 @@ -use std::borrow::Cow; -use std::cmp::max; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::hash::Hash; -use std::rc::Rc; -use std::{fmt, mem}; - -use charabia::normalizer::NormalizedTokenIter; -use charabia::{SeparatorKind, TokenKind}; -use roaring::RoaringBitmap; -use slice_group_by::GroupBy; - -use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId}; -use crate::search::TermsMatchingStrategy; -use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result}; - -type IsOptionalWord = bool; -type IsPrefix = bool; - -#[derive(Clone, PartialEq, Eq, Hash)] -pub enum Operation { - And(Vec), - // series of consecutive non prefix and exact words - // `None` means a stop word. - Phrase(Vec>), - Or(IsOptionalWord, Vec), - Query(Query), -} - -impl fmt::Debug for Operation { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result { - match op { - Operation::And(children) => { - writeln!(f, "{:1$}AND", "", depth * 2)?; - children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - } - Operation::Phrase(children) => { - writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2) - } - Operation::Or(true, children) => { - writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?; - children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - } - Operation::Or(false, children) => { - writeln!(f, "{:1$}OR", "", depth * 2)?; - children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - } - Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2), - } - } - - pprint_tree(f, self, 0) - } -} - -impl Operation { - fn and(mut ops: Vec) -> Self { - if ops.len() == 1 { - ops.pop().unwrap() - } else { - Self::And(ops) - } - } - - pub fn or(word_branch: IsOptionalWord, mut ops: Vec) -> Self { - if ops.len() == 1 { - ops.pop().unwrap() - } else { - let ops = ops - .into_iter() - .flat_map(|o| match o { - Operation::Or(wb, children) if wb == word_branch => children, - op => vec![op], - }) - .collect(); - Self::Or(word_branch, ops) - } - } - - fn phrase(mut words: Vec>) -> Self { - if words.len() == 1 { - if let Some(word) = words.pop().unwrap() { - Self::Query(Query { prefix: false, kind: QueryKind::exact(word) }) - } else { - Self::Phrase(words) - } - } else { - Self::Phrase(words) - } - } - - pub fn query(&self) -> Option<&Query> { - match self { - Operation::Query(query) => Some(query), - _ => None, - } - } -} - -#[derive(Clone, Eq, PartialEq, Hash)] -pub struct Query { - pub prefix: IsPrefix, - pub kind: QueryKind, -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum QueryKind { - Tolerant { typo: u8, word: String }, - Exact { original_typo: u8, word: String }, -} - -impl QueryKind { - pub fn exact(word: String) -> 
Self { - QueryKind::Exact { original_typo: 0, word } - } - - pub fn tolerant(typo: u8, word: String) -> Self { - QueryKind::Tolerant { typo, word } - } - - pub fn typo(&self) -> u8 { - match self { - QueryKind::Tolerant { typo, .. } => *typo, - QueryKind::Exact { original_typo, .. } => *original_typo, - } - } - - pub fn word(&self) -> &str { - match self { - QueryKind::Tolerant { word, .. } => word, - QueryKind::Exact { word, .. } => word, - } - } -} - -impl fmt::Debug for Query { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Query { prefix, kind } = self; - let prefix = if *prefix { String::from("Prefix") } else { String::default() }; - match kind { - QueryKind::Exact { word, .. } => { - f.debug_struct(&(prefix + "Exact")).field("word", &word).finish() - } - QueryKind::Tolerant { typo, word } => f - .debug_struct(&(prefix + "Tolerant")) - .field("word", &word) - .field("max typo", &typo) - .finish(), - } - } -} - -trait Context { - fn word_docids(&self, word: &str) -> heed::Result>; - fn synonyms>(&self, words: &[S]) -> heed::Result>>>; - fn word_documents_count(&self, word: &str) -> heed::Result> { - match self.word_docids(word)? { - Some(rb) => Ok(Some(rb.len())), - None => Ok(None), - } - } - /// Returns the minimum word len for 1 and 2 typos. - fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; - fn exact_words(&self) -> Option<&fst::Set>>; - fn word_pair_frequency( - &self, - left_word: &str, - right_word: &str, - proximity: u8, - ) -> heed::Result>; -} - -/// The query tree builder is the interface to build a query tree. -pub struct QueryTreeBuilder<'a> { - rtxn: &'a heed::RoTxn<'a>, - index: &'a Index, - terms_matching_strategy: TermsMatchingStrategy, - authorize_typos: bool, - words_limit: Option, - exact_words: Option>>, -} - -impl<'a> Context for QueryTreeBuilder<'a> { - fn word_docids(&self, word: &str) -> heed::Result> { - self.index.word_docids.get(self.rtxn, word) - } - - fn synonyms>(&self, words: &[S]) -> heed::Result>>> { - self.index.words_synonyms(self.rtxn, words) - } - - fn word_documents_count(&self, word: &str) -> heed::Result> { - self.index.word_documents_count(self.rtxn, word) - } - - fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { - let one = self.index.min_word_len_one_typo(self.rtxn)?; - let two = self.index.min_word_len_two_typos(self.rtxn)?; - Ok((one, two)) - } - - fn exact_words(&self) -> Option<&fst::Set>> { - self.exact_words.as_ref() - } - - fn word_pair_frequency( - &self, - left_word: &str, - right_word: &str, - proximity: u8, - ) -> heed::Result> { - let key = (proximity, left_word, right_word); - self.index - .word_pair_proximity_docids - .remap_data_type::() - .get(self.rtxn, &key) - } -} - -impl<'a> QueryTreeBuilder<'a> { - /// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn` - /// and an Index `index`. - pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Result { - Ok(Self { - rtxn, - index, - terms_matching_strategy: TermsMatchingStrategy::default(), - authorize_typos: true, - words_limit: None, - exact_words: index.exact_words(rtxn)?, - }) - } - - /// if `terms_matching_strategy` is set to `All` the query tree will be - /// generated forcing all query words to be present in each matching documents - /// (the criterion `words` will be ignored). 
- /// default value if not called: `Last` - pub fn terms_matching_strategy( - &mut self, - terms_matching_strategy: TermsMatchingStrategy, - ) -> &mut Self { - self.terms_matching_strategy = terms_matching_strategy; - self - } - - /// if `authorize_typos` is set to `false` the query tree will be generated - /// forcing all query words to match documents without any typo - /// (the criterion `typo` will be ignored). - /// default value if not called: `true` - pub fn authorize_typos(&mut self, authorize_typos: bool) -> &mut Self { - self.authorize_typos = authorize_typos; - self - } - - /// Limit words and phrases that will be taken for query building. - /// Any beyond `words_limit` will be ignored. - pub fn words_limit(&mut self, words_limit: usize) -> &mut Self { - self.words_limit = Some(words_limit); - self - } - - /// Build the query tree: - /// - if `terms_matching_strategy` is set to `All` the query tree will be - /// generated forcing all query words to be present in each matching documents - /// (the criterion `words` will be ignored) - /// - if `authorize_typos` is set to `false` the query tree will be generated - /// forcing all query words to match documents without any typo - /// (the criterion `typo` will be ignored) - pub fn build>( - &self, - query: NormalizedTokenIter, - ) -> Result> { - let primitive_query = create_primitive_query(query, self.words_limit); - if !primitive_query.is_empty() { - let qt = create_query_tree( - self, - self.terms_matching_strategy, - self.authorize_typos, - &primitive_query, - )?; - let matching_words = - create_matching_words(self, self.authorize_typos, &primitive_query)?; - Ok(Some((qt, primitive_query, matching_words))) - } else { - Ok(None) - } - } -} - -/// Split the word depending on the frequency of pairs near together in the database documents. -fn split_best_frequency<'a>( - ctx: &impl Context, - word: &'a str, -) -> heed::Result> { - let chars = word.char_indices().skip(1); - let mut best = None; - - for (i, _) in chars { - let (left, right) = word.split_at(i); - - let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0); - - if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) { - best = Some((pair_freq, left, right)); - } - } - - Ok(best.map(|(_, left, right)| (left, right))) -} - -#[derive(Clone)] -pub struct TypoConfig<'a> { - pub max_typos: u8, - pub word_len_one_typo: u8, - pub word_len_two_typo: u8, - pub exact_words: Option<&'a fst::Set>>, -} - -/// Return the `QueryKind` of a word depending on `authorize_typos` -/// and the provided word length. 
-fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { - if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) { - let count = word.chars().count().min(u8::MAX as usize) as u8; - if count < config.word_len_one_typo { - QueryKind::exact(word) - } else if count < config.word_len_two_typo { - QueryKind::tolerant(1.min(config.max_typos), word) - } else { - QueryKind::tolerant(2.min(config.max_typos), word) - } - } else { - QueryKind::exact(word) - } -} - -/// Fetch synonyms from the `Context` for the provided word -/// and create the list of operations for the query tree -fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result>> { - let synonyms = ctx.synonyms(word)?; - Ok(synonyms.map(|synonyms| { - synonyms - .into_iter() - .map(|synonym| { - if synonym.len() == 1 { - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact(synonym[0].clone()), - }) - } else { - Operation::Phrase(synonym.into_iter().map(Some).collect()) - } - }) - .collect() - })) -} - -/// Main function that creates the final query tree from the primitive query. -fn create_query_tree( - ctx: &impl Context, - terms_matching_strategy: TermsMatchingStrategy, - authorize_typos: bool, - query: &[PrimitiveQueryPart], -) -> Result { - /// Matches on the `PrimitiveQueryPart` and create an operation from it. - fn resolve_primitive_part( - ctx: &impl Context, - authorize_typos: bool, - part: PrimitiveQueryPart, - ) -> Result { - match part { - // 1. try to split word in 2 - // 2. try to fetch synonyms - // 3. create an operation containing the word - // 4. wrap all in an OR operation - PrimitiveQueryPart::Word(word, prefix) => { - let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); - if let Some((left, right)) = split_best_frequency(ctx, &word)? { - children.push(Operation::Phrase(vec![ - Some(left.to_string()), - Some(right.to_string()), - ])); - } - let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words(); - let config = - TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; - children.push(Operation::Query(Query { - prefix, - kind: typos(word, authorize_typos, config), - })); - Ok(Operation::or(false, children)) - } - // create a CONSECUTIVE operation wrapping all word in the phrase - PrimitiveQueryPart::Phrase(words) => Ok(Operation::phrase(words)), - } - } - - /// Create all ngrams 1..=3 generating query tree branches. 
- fn ngrams( - ctx: &impl Context, - authorize_typos: bool, - query: &[PrimitiveQueryPart], - any_words: bool, - ) -> Result { - const MAX_NGRAM: usize = 3; - let mut op_children = Vec::new(); - - for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { - let mut or_op_children = Vec::new(); - - for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { - if let Some(group) = sub_query.get(..ngram) { - let mut and_op_children = Vec::new(); - let tail = &sub_query[ngram..]; - let is_last = tail.is_empty(); - - match group { - [part] => { - let operation = - resolve_primitive_part(ctx, authorize_typos, part.clone())?; - and_op_children.push(operation); - } - words => { - let is_prefix = words.last().map_or(false, |part| part.is_prefix()); - let words: Vec<_> = words - .iter() - .filter_map(|part| { - if let PrimitiveQueryPart::Word(word, _) = part { - Some(word.as_str()) - } else { - None - } - }) - .collect(); - let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); - let concat = words.concat(); - let (word_len_one_typo, word_len_two_typo) = - ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words(); - let config = TypoConfig { - max_typos: 1, - word_len_one_typo, - word_len_two_typo, - exact_words, - }; - let query = Query { - prefix: is_prefix, - kind: typos(concat, authorize_typos, config), - }; - operations.push(Operation::Query(query)); - and_op_children.push(Operation::or(false, operations)); - } - } - - if !is_last { - let ngrams = ngrams(ctx, authorize_typos, tail, any_words)?; - and_op_children.push(ngrams); - } - - if any_words { - or_op_children.push(Operation::or(false, and_op_children)); - } else { - or_op_children.push(Operation::and(and_op_children)); - } - } - } - op_children.push(Operation::or(false, or_op_children)); - } - - if any_words { - Ok(Operation::or(false, op_children)) - } else { - Ok(Operation::and(op_children)) - } - } - - let number_phrases = query.iter().filter(|p| p.is_phrase()).count(); - let remove_count = query.len() - max(number_phrases, 1); - if remove_count == 0 { - return ngrams(ctx, authorize_typos, query, false); - } - - let mut operation_children = Vec::new(); - let mut query = query.to_vec(); - for _ in 0..=remove_count { - let pos = match terms_matching_strategy { - TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false), - TermsMatchingStrategy::Last => query - .iter() - .enumerate() - .filter(|(_, part)| !part.is_phrase()) - .last() - .map(|(pos, _)| pos), - }; - - // compute and push the current branch on the front - operation_children.insert(0, ngrams(ctx, authorize_typos, &query, false)?); - // remove word from query before creating an new branch - match pos { - Some(pos) => query.remove(pos), - None => break, - }; - } - - Ok(Operation::or(true, operation_children)) -} - -#[derive(Default, Debug)] -struct MatchingWordCache { - all: Vec>, - map: HashMap<(String, u8, bool), Rc>, -} -impl MatchingWordCache { - fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Option> { - match self.map.entry((word.clone(), typo, prefix)) { - Entry::Occupied(idx) => Some(idx.get().clone()), - Entry::Vacant(vacant) => { - let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); - self.all.push(matching_word.clone()); - vacant.insert(matching_word.clone()); - Some(matching_word) - } - } - // To deactivate the cache, for testing purposes, use the following instead: - // let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); - // 
self.all.push(matching_word.clone()); - // Some(matching_word) - } -} - -/// Main function that matchings words used for crop and highlight. -fn create_matching_words( - ctx: &impl Context, - authorize_typos: bool, - query: &[PrimitiveQueryPart], -) -> Result { - /// Matches on the `PrimitiveQueryPart` and create matchings words from it. - fn resolve_primitive_part( - ctx: &impl Context, - authorize_typos: bool, - part: PrimitiveQueryPart, - matching_words: &mut Vec<(Vec>, Vec)>, - matching_word_cache: &mut MatchingWordCache, - id: PrimitiveWordId, - ) -> Result<()> { - match part { - // 1. try to split word in 2 - // 2. try to fetch synonyms - PrimitiveQueryPart::Word(word, prefix) => { - if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? { - for synonym in synonyms { - // Require that all words of the synonym have a corresponding MatchingWord - // before adding any of its words to the matching_words result. - if let Some(synonym_matching_words) = synonym - .into_iter() - .map(|word| matching_word_cache.insert(word, 0, false)) - .collect() - { - matching_words.push((synonym_matching_words, vec![id])); - } - } - } - - if let Some((left, right)) = split_best_frequency(ctx, &word)? { - // Require that both left and right words have a corresponding MatchingWord - // before adding them to the matching_words result - if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) { - if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false) - { - matching_words.push((vec![left, right], vec![id])); - } - } - } - - let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words(); - let config = - TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; - - let matching_word = match typos(word, authorize_typos, config) { - QueryKind::Exact { word, .. } => matching_word_cache.insert(word, 0, prefix), - QueryKind::Tolerant { typo, word } => { - matching_word_cache.insert(word, typo, prefix) - } - }; - if let Some(matching_word) = matching_word { - matching_words.push((vec![matching_word], vec![id])); - } - } - // create a CONSECUTIVE matchings words wrapping all word in the phrase - PrimitiveQueryPart::Phrase(words) => { - let ids: Vec<_> = (0..words.len()).map(|i| id + i as PrimitiveWordId).collect(); - // Require that all words of the phrase have a corresponding MatchingWord - // before adding any of them to the matching_words result - if let Some(phrase_matching_words) = words - .into_iter() - .flatten() - .map(|w| matching_word_cache.insert(w, 0, false)) - .collect() - { - matching_words.push((phrase_matching_words, ids)); - } - } - } - - Ok(()) - } - - /// Create all ngrams 1..=3 generating query tree branches. 
- fn ngrams( - ctx: &impl Context, - authorize_typos: bool, - query: &[PrimitiveQueryPart], - matching_words: &mut Vec<(Vec>, Vec)>, - matching_word_cache: &mut MatchingWordCache, - mut id: PrimitiveWordId, - ) -> Result<()> { - const MAX_NGRAM: usize = 3; - - for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { - for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { - if let Some(group) = sub_query.get(..ngram) { - let tail = &sub_query[ngram..]; - let is_last = tail.is_empty(); - - match group { - [part] => { - resolve_primitive_part( - ctx, - authorize_typos, - part.clone(), - matching_words, - matching_word_cache, - id, - )?; - } - words => { - let is_prefix = words.last().map_or(false, |part| part.is_prefix()); - let words: Vec<_> = words - .iter() - .filter_map(|part| { - if let PrimitiveQueryPart::Word(word, _) = part { - Some(word.as_str()) - } else { - None - } - }) - .collect(); - let ids: Vec<_> = - (0..words.len()).map(|i| id + i as PrimitiveWordId).collect(); - - if let Some(synonyms) = ctx.synonyms(&words)? { - for synonym in synonyms { - if let Some(synonym) = synonym - .into_iter() - .map(|syn| matching_word_cache.insert(syn, 0, false)) - .collect() - { - matching_words.push((synonym, ids.clone())); - } - } - } - let word = words.concat(); - let (word_len_one_typo, word_len_two_typo) = - ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words(); - let config = TypoConfig { - max_typos: 1, - word_len_one_typo, - word_len_two_typo, - exact_words, - }; - let matching_word = match typos(word, authorize_typos, config) { - QueryKind::Exact { word, .. } => { - matching_word_cache.insert(word, 0, is_prefix) - } - QueryKind::Tolerant { typo, word } => { - matching_word_cache.insert(word, typo, is_prefix) - } - }; - if let Some(matching_word) = matching_word { - matching_words.push((vec![matching_word], ids)); - } - } - } - - if !is_last { - ngrams( - ctx, - authorize_typos, - tail, - matching_words, - matching_word_cache, - id + 1, - )?; - } - } - } - id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::(); - } - - Ok(()) - } - - let mut matching_word_cache = MatchingWordCache::default(); - let mut matching_words = Vec::new(); - ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?; - MatchingWords::new(matching_words) -} - -pub type PrimitiveQuery = Vec; - -#[derive(Debug, Clone)] -pub enum PrimitiveQueryPart { - Phrase(Vec>), - Word(String, IsPrefix), -} - -impl PrimitiveQueryPart { - fn is_phrase(&self) -> bool { - matches!(self, Self::Phrase(_)) - } - - fn is_prefix(&self) -> bool { - matches!(self, Self::Word(_, is_prefix) if *is_prefix) - } - - fn len(&self) -> usize { - match self { - Self::Phrase(words) => words.len(), - Self::Word(_, _) => 1, - } - } -} - -/// Create primitive query from tokenized query string, -/// the primitive query is an intermediate state to build the query tree. -fn create_primitive_query( - query: NormalizedTokenIter, - words_limit: Option, -) -> PrimitiveQuery -where - A: AsRef<[u8]>, -{ - let mut primitive_query = Vec::new(); - let mut phrase = Vec::new(); - let mut quoted = false; - - let parts_limit = words_limit.unwrap_or(usize::MAX); - - let mut peekable = query.peekable(); - while let Some(token) = peekable.next() { - // early return if word limit is exceeded - if primitive_query.len() >= parts_limit { - return primitive_query; - } - - match token.kind { - TokenKind::Word | TokenKind::StopWord => { - // 1. 
if the word is quoted we push it in a phrase-buffer waiting for the ending quote, - // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, - // 3. if the word is the last token of the query we push it as a prefix word. - if quoted { - if let TokenKind::StopWord = token.kind { - phrase.push(None) - } else { - phrase.push(Some(token.lemma().to_string())); - } - } else if peekable.peek().is_some() { - if let TokenKind::StopWord = token.kind { - } else { - primitive_query - .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false)); - } - } else { - primitive_query.push(PrimitiveQueryPart::Word(token.lemma().to_string(), true)); - } - } - TokenKind::Separator(separator_kind) => { - let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - // swap quoted state if we encounter a double quote - if quote_count % 2 != 0 { - quoted = !quoted; - } - // if there is a quote or a hard separator we close the phrase. - if quote_count > 0 || separator_kind == SeparatorKind::Hard { - let phrase = mem::take(&mut phrase); - - // if the phrase only contains stop words, we don't keep it in the query. - if phrase.iter().any(|w| w.is_some()) { - primitive_query.push(PrimitiveQueryPart::Phrase(phrase)); - } - } - } - _ => (), - } - } - - // If a quote is never closed, we consider all of the end of the query as a phrase. - if phrase.iter().any(|w| w.is_some()) { - primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); - } - - primitive_query -} - -/// Returns the maximum number of typos that this Operation allows. -pub fn maximum_typo(operation: &Operation) -> usize { - use Operation::{And, Or, Phrase, Query}; - match operation { - Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0), - And(ops) => ops.iter().map(maximum_typo).sum::(), - Query(q) => q.kind.typo() as usize, - // no typo allowed in phrases - Phrase(_) => 0, - } -} - -/// Returns the maximum proximity that this Operation allows. 
-pub fn maximum_proximity(operation: &Operation) -> usize { - use Operation::{And, Or, Phrase, Query}; - match operation { - Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), - And(ops) => { - ops.iter().map(maximum_proximity).sum::() + ops.len().saturating_sub(1) * 7 - } - Query(_) | Phrase(_) => 0, - } -} - -#[cfg(test)] -mod test { - use std::collections::HashMap; - - use charabia::Tokenize; - use maplit::hashmap; - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - - use super::*; - use crate::index::tests::TempIndex; - use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; - - #[derive(Debug)] - struct TestContext { - synonyms: HashMap, Vec>>, - postings: HashMap, - exact_words: Option>>, - } - - impl TestContext { - fn build>( - &self, - terms_matching_strategy: TermsMatchingStrategy, - authorize_typos: bool, - words_limit: Option, - query: NormalizedTokenIter, - ) -> Result> { - let primitive_query = create_primitive_query(query, words_limit); - if !primitive_query.is_empty() { - let qt = create_query_tree( - self, - terms_matching_strategy, - authorize_typos, - &primitive_query, - )?; - Ok(Some((qt, primitive_query))) - } else { - Ok(None) - } - } - } - - impl Context for TestContext { - fn word_docids(&self, word: &str) -> heed::Result> { - Ok(self.postings.get(word).cloned()) - } - - fn synonyms>(&self, words: &[S]) -> heed::Result>>> { - let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); - Ok(self.synonyms.get(&words).cloned()) - } - - fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { - Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) - } - - fn exact_words(&self) -> Option<&fst::Set>> { - self.exact_words.as_ref() - } - - fn word_pair_frequency( - &self, - left_word: &str, - right_word: &str, - _proximity: u8, - ) -> heed::Result> { - match self.word_docids(&format!("{} {}", left_word, right_word))? { - Some(rb) => Ok(Some(rb.len())), - None => Ok(None), - } - } - } - - impl Default for TestContext { - fn default() -> TestContext { - let mut rng = StdRng::seed_from_u64(102); - let rng = &mut rng; - - fn random_postings(rng: &mut R, len: usize) -> RoaringBitmap { - let mut values = Vec::::with_capacity(len); - while values.len() != len { - values.push(rng.gen()); - } - values.sort_unstable(); - RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() - } - - let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap(); - let exact_words = - Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); - - TestContext { - synonyms: hashmap! { - vec![String::from("hello")] => vec![ - vec![String::from("hi")], - vec![String::from("good"), String::from("morning")], - ], - vec![String::from("world")] => vec![ - vec![String::from("earth")], - vec![String::from("nature")], - ], - // new york city - vec![String::from("nyc")] => vec![ - vec![String::from("new"), String::from("york")], - vec![String::from("new"), String::from("york"), String::from("city")], - ], - vec![String::from("new"), String::from("york")] => vec![ - vec![String::from("nyc")], - vec![String::from("new"), String::from("york"), String::from("city")], - ], - vec![String::from("new"), String::from("york"), String::from("city")] => vec![ - vec![String::from("nyc")], - vec![String::from("new"), String::from("york")], - ], - }, - postings: hashmap! 
{ - String::from("hello") => random_postings(rng, 1500), - String::from("hi") => random_postings(rng, 4000), - String::from("word") => random_postings(rng, 2500), - String::from("split") => random_postings(rng, 400), - String::from("ngrams") => random_postings(rng, 1400), - String::from("world") => random_postings(rng, 15_000), - String::from("earth") => random_postings(rng, 8000), - String::from("2021") => random_postings(rng, 100), - String::from("2020") => random_postings(rng, 500), - String::from("is") => random_postings(rng, 50_000), - String::from("this") => random_postings(rng, 50_000), - String::from("good") => random_postings(rng, 1250), - String::from("morning") => random_postings(rng, 125), - String::from("word split") => random_postings(rng, 5000), - String::from("quick brownfox") => random_postings(rng, 7000), - String::from("quickbrown fox") => random_postings(rng, 8000), - }, - exact_words, - } - } - } - - #[test] - fn prefix() { - let query = "hey friends"; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - Exact { word: "hey" } - PrefixTolerant { word: "friends", max typo: 1 } - PrefixTolerant { word: "heyfriends", max typo: 1 } - "###); - } - - #[test] - fn no_prefix() { - let query = "hey friends "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - Exact { word: "hey" } - Tolerant { word: "friends", max typo: 1 } - Tolerant { word: "heyfriends", max typo: 1 } - "###); - } - - #[test] - fn synonyms() { - let query = "hello world "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - OR - Exact { word: "hi" } - PHRASE [Some("good"), Some("morning")] - Tolerant { word: "hello", max typo: 1 } - OR - Exact { word: "earth" } - Exact { word: "nature" } - Tolerant { word: "world", max typo: 1 } - Tolerant { word: "helloworld", max typo: 1 } - "###); - } - - #[test] - fn simple_synonyms() { - let query = "nyc"; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::Last, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - PHRASE [Some("new"), Some("york")] - PHRASE [Some("new"), Some("york"), Some("city")] - PrefixExact { word: "nyc" } - "###); - } - - #[test] - fn complex_synonyms() { - let query = "new york city "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - Exact { word: "new" } - OR - AND - Exact { word: "york" } - Exact { word: "city" } - Tolerant { word: "yorkcity", max typo: 1 } - AND - OR - Exact { word: "nyc" } - PHRASE [Some("new"), Some("york"), Some("city")] - Tolerant { word: "newyork", max typo: 1 } - Exact { word: "city" } - Exact { word: "nyc" } - PHRASE [Some("new"), Some("york")] - Tolerant { word: "newyorkcity", max typo: 1 } - "###); - } - - #[test] - fn ngrams() { - let query = "n grams "; - let tokens = query.tokenize(); - - let 
(query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - Exact { word: "n" } - Tolerant { word: "grams", max typo: 1 } - Tolerant { word: "ngrams", max typo: 1 } - "###); - } - - #[test] - fn word_split() { - let query = "wordsplit fish "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - OR - PHRASE [Some("word"), Some("split")] - Tolerant { word: "wordsplit", max typo: 2 } - Exact { word: "fish" } - Tolerant { word: "wordsplitfish", max typo: 1 } - "###); - } - - #[test] - fn word_split_choose_pair_with_max_freq() { - let query = "quickbrownfox"; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - PHRASE [Some("quickbrown"), Some("fox")] - PrefixTolerant { word: "quickbrownfox", max typo: 2 } - "###); - } - - #[test] - fn phrase() { - let query = "\"hey friends\" \" \" \"wooop"; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - AND - PHRASE [Some("hey"), Some("friends")] - Exact { word: "wooop" } - "###); - } - - #[test] - fn phrase_2() { - // https://github.com/meilisearch/meilisearch/issues/2722 - let query = "coco \"harry\""; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::default(), true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR(WORD) - Exact { word: "harry" } - AND - Exact { word: "coco" } - Exact { word: "harry" } - "###); - } - - #[test] - fn phrase_with_hard_separator() { - let query = "\"hey friends. 
wooop wooop\""; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - AND - PHRASE [Some("hey"), Some("friends")] - PHRASE [Some("wooop"), Some("wooop")] - "###); - } - - #[test] - fn optional_word() { - let query = "hey my friend "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::default(), true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR(WORD) - Exact { word: "hey" } - OR - AND - Exact { word: "hey" } - Exact { word: "my" } - Tolerant { word: "heymy", max typo: 1 } - OR - AND - Exact { word: "hey" } - OR - AND - Exact { word: "my" } - Tolerant { word: "friend", max typo: 1 } - Tolerant { word: "myfriend", max typo: 1 } - AND - Tolerant { word: "heymy", max typo: 1 } - Tolerant { word: "friend", max typo: 1 } - Tolerant { word: "heymyfriend", max typo: 1 } - "###); - } - - #[test] - fn optional_word_phrase() { - let query = "\"hey my\""; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::default(), true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - PHRASE [Some("hey"), Some("my")] - "###); - } - - #[test] - fn optional_word_multiple_phrases() { - let query = r#""hey" my good "friend""#; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::default(), true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR(WORD) - AND - Exact { word: "hey" } - Exact { word: "friend" } - AND - Exact { word: "hey" } - Exact { word: "my" } - Exact { word: "friend" } - AND - Exact { word: "hey" } - OR - AND - Exact { word: "my" } - Exact { word: "good" } - Tolerant { word: "mygood", max typo: 1 } - Exact { word: "friend" } - "###); - } - - #[test] - fn no_typo() { - let query = "hey friends "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, false, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - Exact { word: "hey" } - Exact { word: "friends" } - Exact { word: "heyfriends" } - "###); - } - - #[test] - fn words_limit() { - let query = "\"hey my\" good friend"; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, false, Some(2), tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - AND - PHRASE [Some("hey"), Some("my")] - Exact { word: "good" } - "###); - } - - #[test] - fn test_min_word_len_typo() { - let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap(); - let config = TypoConfig { - max_typos: 2, - word_len_one_typo: 5, - word_len_two_typo: 7, - exact_words: Some(&exact_words), - }; - - assert_eq!( - typos("hello".to_string(), true, config.clone()), - QueryKind::Tolerant { typo: 1, word: "hello".to_string() } - ); - - assert_eq!( - typos("hell".to_string(), true, config.clone()), - QueryKind::exact("hell".to_string()) - ); - - assert_eq!( - typos("verylongword".to_string(), true, config.clone()), - QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() } - ); - } - - #[test] - fn test_dont_create_matching_word_for_long_words() { - let index = 
TempIndex::new(); - let rtxn = index.read_txn().unwrap(); - let query = "what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house"; - let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); - builder.words_limit(10); - let (_, _, matching_words) = builder.build(query.tokenize()).unwrap().unwrap(); - insta::assert_snapshot!(format!("{matching_words:?}"), @r###" - [ - ([MatchingWord { word: "house", typo: 1, prefix: true }], [3]) - ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) - ([MatchingWord { word: "whata", typo: 1, prefix: false }], [0, 1]) - ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) - ([MatchingWord { word: "house", typo: 1, prefix: true }], [1]) - ([MatchingWord { word: "what", typo: 0, prefix: false }], [0]) - ([MatchingWord { word: "a", typo: 0, prefix: false }], [1]) - ] - "###); - } - - #[test] - fn disable_typo_on_word() { - let query = "goodbye"; - let tokens = query.tokenize(); - - let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); - let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); - let context = TestContext { exact_words, ..Default::default() }; - let (query_tree, _) = - context.build(TermsMatchingStrategy::All, true, Some(2), tokens).unwrap().unwrap(); - - assert!(matches!( - query_tree, - Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } }) - )); - } - - // The memory usage test below is disabled because `cargo test` runs multiple tests in parallel, - // which invalidates the measurements of memory usage. Nevertheless, it is a useful test to run - // manually from time to time, so I kept it here, commented-out. 
- - // use std::alloc::{GlobalAlloc, System}; - // use std::sync::atomic::{self, AtomicI64}; - // - // #[global_allocator] - // static ALLOC: CountingAlloc = - // CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) }; - // - // pub struct CountingAlloc { - // pub resident: AtomicI64, - // pub allocated: AtomicI64, - // } - // unsafe impl GlobalAlloc for CountingAlloc { - // unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { - // self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); - // self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); - // - // System.alloc(layout) - // } - // - // unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { - // self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); - // System.dealloc(ptr, layout) - // } - // } - // - // #[test] - // fn memory_usage_of_ten_word_query() { - // let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); - // let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); - // - // let index = TempIndex::new(); - // let rtxn = index.read_txn().unwrap(); - // let query = "a beautiful summer house by the beach overlooking what seems"; - // let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); - // builder.words_limit(10); - // let x = builder.build(query.tokenize()).unwrap().unwrap(); - // let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); - // let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); - // - // // Weak check on the memory usage - // // Don't keep more than 5MB. (Arguably 5MB is already too high) - // assert!(resident_after - resident_before < 5_000_000); - // // Don't allocate more than 10MB. - // assert!(allocated_after - allocated_before < 10_000_000); - // - // // Use these snapshots to measure the exact memory usage. - // // The values below were correct at the time I wrote them. - // // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950"); - // // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502"); - // - // // Note, with the matching word cache deactivated, the memory usage was: - // // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697"); - // // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588"); - // // or about 20x more resident memory (90MB vs 4.5MB) - // - // // Use x - // let _x = x; - // } -} From d18ebe4f3abd0b927b41e3c743c5356225dcd9eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 23 Mar 2023 09:41:18 +0100 Subject: [PATCH 085/234] Remove more warnings --- milli/src/search/mod.rs | 3 +- milli/src/update/facet/incremental.rs | 32 +++++++++---------- milli/src/update/index_documents/mod.rs | 1 - .../update/words_prefix_position_docids.rs | 2 +- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 46829b986..95a83e121 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -109,7 +109,8 @@ impl<'a> Search<'a> { self } - fn is_typo_authorized(&self) -> Result { + // TODO! + fn _is_typo_authorized(&self) -> Result { let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; // only authorize typos if both the index and the query allow it. 
Ok(self.authorize_typos && index_authorizes_typos) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index aaef93b48..a921d4115 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -699,21 +699,21 @@ mod tests { #[test] fn many_field_ids_prepend() { let index = FacetIndex::::new(4, 8, 5); - for i in (0..256).into_iter().rev() { + for i in (0..256).rev() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); let mut txn = index.env.write_txn().unwrap(); index.insert(&mut txn, 0, &(i as f64), &bitmap); txn.commit().unwrap(); } - for i in (0..256).into_iter().rev() { + for i in (0..256).rev() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); let mut txn = index.env.write_txn().unwrap(); index.insert(&mut txn, 2, &(i as f64), &bitmap); txn.commit().unwrap(); } - for i in (0..256).into_iter().rev() { + for i in (0..256).rev() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); let mut txn = index.env.write_txn().unwrap(); @@ -733,7 +733,7 @@ mod tests { let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); - for i in (0..256).into_iter().rev() { + for i in (0..256).rev() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); index.insert(&mut txn, 0, &(i as f64), &bitmap); @@ -749,7 +749,7 @@ mod tests { let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); - let mut keys = (0..256).into_iter().collect::>(); + let mut keys = (0..256).collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); @@ -768,7 +768,7 @@ mod tests { let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); - let mut keys = (0..256).into_iter().collect::>(); + let mut keys = (0..256).collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); @@ -796,7 +796,7 @@ mod tests { index.insert(&mut txn, 0, &(i as f64), &bitmap); } - for i in (200..256).into_iter().rev() { + for i in (200..256).rev() { index.verify_structure_validity(&txn, 0); index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } @@ -805,7 +805,7 @@ mod tests { milli_snap!(format!("{index}"), 200); let mut txn = index.env.write_txn().unwrap(); - for i in (150..200).into_iter().rev() { + for i in (150..200).rev() { index.verify_structure_validity(&txn, 0); index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } @@ -813,7 +813,7 @@ mod tests { txn.commit().unwrap(); milli_snap!(format!("{index}"), 150); let mut txn = index.env.write_txn().unwrap(); - for i in (100..150).into_iter().rev() { + for i in (100..150).rev() { index.verify_structure_validity(&txn, 0); index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } @@ -821,7 +821,7 @@ mod tests { txn.commit().unwrap(); milli_snap!(format!("{index}"), 100); let mut txn = index.env.write_txn().unwrap(); - for i in (17..100).into_iter().rev() { + for i in (17..100).rev() { index.verify_structure_validity(&txn, 0); index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } @@ -829,14 +829,14 @@ mod tests { txn.commit().unwrap(); milli_snap!(format!("{index}"), 17); let mut txn = index.env.write_txn().unwrap(); - for i in (15..17).into_iter().rev() { + for i in (15..17).rev() { index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); milli_snap!(format!("{index}"), 15); let mut txn = index.env.write_txn().unwrap(); - for i in 
(0..15).into_iter().rev() { + for i in (0..15).rev() { index.verify_structure_validity(&txn, 0); index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } @@ -893,7 +893,7 @@ mod tests { index.insert(&mut txn, 0, &(i as f64), &bitmap); } - let mut keys = (0..256).into_iter().collect::>(); + let mut keys = (0..256).collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); @@ -930,7 +930,7 @@ mod tests { let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); - let mut keys = (0..16).into_iter().collect::>(); + let mut keys = (0..16).collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); for i in 0..4 { @@ -951,7 +951,7 @@ mod tests { let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); - let mut keys = (0..64).into_iter().collect::>(); + let mut keys = (0..64).collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); @@ -983,7 +983,7 @@ mod tests { let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); - let mut keys = (1000..1064).into_iter().collect::>(); + let mut keys = (1000..1064).collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2a7930f84..ade217beb 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1613,7 +1613,6 @@ mod tests { // Create 200 documents with a long text let content = { let documents_iter = (0..200i32) - .into_iter() .map(|i| serde_json::json!({ "id": i, "script": script })) .filter_map(|json| match json { serde_json::Value::Object(object) => Some(object), diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 0822d0d26..eb036c52f 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -14,7 +14,7 @@ use crate::update::index_documents::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, CursorClonableMmap, MergeFn, }; -use crate::{bucketed_position, relative_from_absolute_position, Index, Result}; +use crate::{relative_from_absolute_position, Index, Result}; pub struct WordPrefixPositionDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, From 862714a18b37a27eb00c5f1bae865f0b48167736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 23 Mar 2023 09:42:25 +0100 Subject: [PATCH 086/234] Remove criterion_implementation_strategy param of Search --- milli/src/lib.rs | 5 ++-- milli/src/search/mod.rs | 54 +++++++++++++---------------------------- 2 files changed, 19 insertions(+), 40 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index a62c344f9..b2bc9d1a2 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -95,9 +95,8 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - CriterionImplementationStrategy, FacetDistribution, Filter, FormatOptions, MatchBounds, - MatcherBuilder, MatchingWord, MatchingWords, Search, SearchResult, TermsMatchingStrategy, - DEFAULT_VALUES_PER_FACET, + FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, + MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; pub type Result = std::result::Result; diff --git a/milli/src/search/mod.rs 
b/milli/src/search/mod.rs index 95a83e121..0792ce799 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -31,7 +31,6 @@ pub struct Search<'a> { authorize_typos: bool, words_limit: usize, exhaustive_number_hits: bool, - criterion_implementation_strategy: CriterionImplementationStrategy, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } @@ -48,7 +47,6 @@ impl<'a> Search<'a> { authorize_typos: true, exhaustive_number_hits: false, words_limit: 10, - criterion_implementation_strategy: CriterionImplementationStrategy::default(), rtxn, index, } @@ -101,14 +99,6 @@ impl<'a> Search<'a> { self } - pub fn criterion_implementation_strategy( - &mut self, - strategy: CriterionImplementationStrategy, - ) -> &mut Search<'a> { - self.criterion_implementation_strategy = strategy; - self - } - // TODO! fn _is_typo_authorized(&self) -> Result { let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; @@ -144,7 +134,6 @@ impl fmt::Debug for Search<'_> { authorize_typos, words_limit, exhaustive_number_hits, - criterion_implementation_strategy, rtxn: _, index: _, } = self; @@ -157,7 +146,6 @@ impl fmt::Debug for Search<'_> { .field("terms_matching_strategy", terms_matching_strategy) .field("authorize_typos", authorize_typos) .field("exhaustive_number_hits", exhaustive_number_hits) - .field("criterion_implementation_strategy", criterion_implementation_strategy) .field("words_limit", words_limit) .finish() } @@ -171,14 +159,6 @@ pub struct SearchResult { pub documents_ids: Vec, } -#[derive(Debug, Default, Clone, Copy)] -pub enum CriterionImplementationStrategy { - OnlyIterative, - OnlySetBased, - #[default] - Dynamic, -} - #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum TermsMatchingStrategy { // remove last word first @@ -241,30 +221,30 @@ mod test { assert_eq!(documents_ids, vec![1]); } - #[test] - fn test_is_authorized_typos() { - let index = TempIndex::new(); - let mut txn = index.write_txn().unwrap(); + // #[test] + // fn test_is_authorized_typos() { + // let index = TempIndex::new(); + // let mut txn = index.write_txn().unwrap(); - let mut search = Search::new(&txn, &index); + // let mut search = Search::new(&txn, &index); - // default is authorized - assert!(search.is_typo_authorized().unwrap()); + // // default is authorized + // assert!(search.is_typo_authorized().unwrap()); - search.authorize_typos(false); - assert!(!search.is_typo_authorized().unwrap()); + // search.authorize_typos(false); + // assert!(!search.is_typo_authorized().unwrap()); - index.put_authorize_typos(&mut txn, false).unwrap(); - txn.commit().unwrap(); + // index.put_authorize_typos(&mut txn, false).unwrap(); + // txn.commit().unwrap(); - let txn = index.read_txn().unwrap(); - let mut search = Search::new(&txn, &index); + // let txn = index.read_txn().unwrap(); + // let mut search = Search::new(&txn, &index); - assert!(!search.is_typo_authorized().unwrap()); + // assert!(!search.is_typo_authorized().unwrap()); - search.authorize_typos(true); - assert!(!search.is_typo_authorized().unwrap()); - } + // search.authorize_typos(true); + // assert!(!search.is_typo_authorized().unwrap()); + // } // #[test] // fn test_one_typos_tolerance() { From 00bad8c716f73a287fc25357a4d010262fbcb9c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 23 Mar 2023 10:18:24 +0100 Subject: [PATCH 087/234] Add comments suggesting performance improvements --- .../search/new/graph_based_ranking_rule.rs | 37 +++++++++++++++++++ milli/src/search/new/resolve_query_graph.rs | 2 +- 2 files changed, 38 
insertions(+), 1 deletion(-)

diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs
index 5127082f7..194e62c30 100644
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@@ -185,6 +185,43 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
         if universe.is_empty() {
             return Ok(ControlFlow::Break(()));
         }
+
+        /* TODO: there are a couple of ways to improve the speed of path computation.
+
+        1. Since the `visit_paths_of_cost` method uses a depth-first search, we know that
+        consecutive calls to this closure have a high chance of giving paths sharing
+        some prefix. It would be good to reuse `subpath_docids` and `visited_conditions`
+        to find out what this common prefix is, to avoid recomputing it. In a way, doing
+        this serves as the dual of the DeadEndsCache: it takes advantage of our knowledge that
+        some paths *aren't* dead ends. There is, however, a subtlety in that the universe might
+        have changed between the two consecutive calls. This is why we should subtract the docids
+        of the previous path (if successful) from the `subpath_docids`, at the same time as we do
+        it for the universe.
+
+        2. We perform way too many intersections with the universe. For the first visited path,
+        the operation we do is essentially:
+            universe & (c1 & universe) & (c2 & universe) & (c3 & universe) & etc.
+        This is a good idea *only if the universe is very small*. But if the universe is (almost)
+        a superset of each condition, then these intersections serve no purpose and slow down the search.
+        Maybe in the future we could have a `deserialize_within_universe` method, which would speed up
+        these intersections. But for now, we have to be careful.
+
+        3. We could know in advance how many paths of a certain cost exist, and only update the
+        DeadEndsCache if (m)any remaining paths exist. There is a subtlety here because
+        on the next call of `next_bucket`, we will want an updated and correct DeadEndsCache.
+        We need to think about that. We could also avoid putting forbidden edges in this cache
+        if we know, somehow, that we won't visit them again.
+
+        4. Finally (though this would be a long-term, difficult project), we should compute the paths
+        *lazily*. That is, when we do `path_docids &= condition`, we shouldn't *actually* perform the
+        intersection, but simply register that operation. It's only when we ask whether `path_docids`
+        is empty that **the minimum amount of work needed to determine whether the path is empty** is
+        carried out. In practice, that means performing a MultiOps on each container, in order or not,
+        until any resulting container is found to be non-empty. (In fact, when we ask `is_empty`, we
+        should probably find the container that has the highest chance of being non-empty and compute
+        that one first.)
+
+        */
+
         // Accumulate the path for logging purposes only
         considered_paths.push(path.to_vec());
diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs
index f4db260ed..1b7057b51 100644
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -53,7 +53,7 @@ impl QueryTermDocIdsCache {
             return Ok(&self.terms[&term_interned]);
         };
         let mut docids = RoaringBitmap::new();
-
+        // TODO: use a MultiOps?
         let term = term_interner.get(term_interned);
         for word in term.all_single_words_except_prefix_db() {
             if let Some(word_docids) = db_cache.get_word_docids(index, txn, word_interner, word)?
{

From 16fefd364ead573faad167750fad89dadb999cc3 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Mon, 27 Mar 2023 11:04:04 +0200
Subject: [PATCH 088/234] Add TODO notes

---
 milli/src/search/new/mod.rs          |  2 ++
 milli/src/search/new/query_graph.rs  |  3 +++
 milli/src/search/new/query_term.rs   |  5 +++++
 milli/src/search/new/small_bitmap.rs |  3 +++
 milli/src/search/new/sort.rs         | 18 ++++++++++++++++++
 5 files changed, 31 insertions(+)

diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 45cad378a..194ffb035 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -11,6 +11,7 @@ mod resolve_query_graph;
 // TODO: documentation + comments
 mod small_bitmap;
 // TODO: documentation + comments
+// implementation is currently an adaptation of the previous implementation to fit with the new model
 mod sort;
 // TODO: documentation + comments
 mod words;
@@ -42,6 +43,7 @@ pub struct SearchContext<'ctx> {
     pub word_interner: DedupInterner<String>,
     pub phrase_interner: DedupInterner<Phrase>,
     pub term_interner: DedupInterner<QueryTerm>,
+    // TODO: think about the memory usage of that field (roaring bitmaps in a hashmap)
     pub term_docids: QueryTermDocIdsCache,
 }
 impl<'ctx> SearchContext<'ctx> {
diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index 863ec0045..0f06b9b95 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -119,6 +119,8 @@ impl QueryGraph {

 impl QueryGraph {
     /// Build the query graph from the parsed user search query.
+    ///
+    /// The ngrams are built at this point.
     pub fn from_query(ctx: &mut SearchContext, terms: Vec<LocatedQueryTerm>) -> Result<QueryGraph> {
         let nbr_typos = number_of_typos_allowed(ctx)?;
@@ -219,6 +221,7 @@ impl QueryGraph {
     }

     /// Remove the given nodes and all their edges from the query graph.
+    /// TODO: need to check where this is used, and if this is correct.
     pub fn remove_nodes(&mut self, nodes: &[Interned<QueryNode>]) {
         for &node_id in nodes {
             let node = &self.nodes.get(node_id);
diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index e239d4669..0850b2181 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -414,6 +414,7 @@ impl QueryTerm {
 #[derive(Clone)]
 pub struct LocatedQueryTerm {
     pub value: Interned<QueryTerm>,
+    // TODO: consider changing to u8, or even a u16
     pub positions: RangeInclusive<i8>,
 }

@@ -425,6 +426,8 @@ impl LocatedQueryTerm {
 }

 /// Convert the tokenised search query into a list of located query terms.
+// TODO: check that the positions are correct for phrases, separators, ngrams
+// hard-limit the number of tokens that are considered
 pub fn located_query_terms_from_string(
     ctx: &mut SearchContext,
     query: NormalizedTokenIter<&[u8]>,
@@ -484,6 +487,7 @@
             }
         } else {
             let word = token.lemma();
+            // eagerly compute all derivations
             let term = query_term_from_word(ctx, word, nbr_typos(word), true)?;
             let located_term = LocatedQueryTerm {
                 value: ctx.term_interner.insert(term),
@@ -507,6 +511,7 @@
                 quoted = !quoted;
             }
             // if there is a quote or a hard separator we close the phrase.
+            // TODO: limit phrase size?
             if !phrase.is_empty()
                 && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
             {
                 let located_query_term = LocatedQueryTerm {
diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs
index 503bd72f5..24541eb6c 100644
--- a/milli/src/search/new/small_bitmap.rs
+++ b/milli/src/search/new/small_bitmap.rs
@@ -16,6 +16,9 @@ impl<T> SmallBitmap<T> {
     pub fn for_interned_values_in(interner: &FixedSizeInterner<T>) -> Self {
         Self::new(interner.len())
     }
+    // universe_length is not stored anywhere; it is only used to decide between tiny/small
+    // universe_length: if 63 is passed, the actual length will be rounded up to 64
+    // if 66 is passed, the actual length is 64 * xs.len() as u16 = 128; the passed size is rounded up to the next multiple of 64
     pub fn new(universe_length: u16) -> Self {
         if universe_length <= 64 {
             Self { internal: SmallBitmapInternal::Tiny(0), _phantom: PhantomData }
diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs
index 6277149bd..04152f0f0 100644
--- a/milli/src/search/new/sort.rs
+++ b/milli/src/search/new/sort.rs
@@ -28,6 +28,21 @@ impl<'ctx, Query> RankingRuleOutputIter<'ctx, Query> for RankingRuleOutputIterWr
     }
 }

+// `Query` type parameter: the same as the type parameter to bucket_sort
+// implements the RankingRuleQueryTrait trait, either QueryGraph or PlaceholderQuery
+// The sort ranking rule doesn't need the query parameter, it is doing the same thing
+// whether we're doing a query graph or a placeholder search.
+//
+// The query is stored anyway because every ranking rule must return a query from next_bucket.
+// ---
+// "Mismatch" between the new and old implementations:
+// - old impl: roaring bitmap as input, the ranking rule iterates over all the buckets
+// - new impl: still works like that, but it shouldn't, because the universe may change for every call to next_bucket, itself due to:
+//   1. elements that were already returned by the ranking rule are subtracted from the universe, also done in the old impl (subtracted from the candidates)
+//   2. NEW in the new impl.: the distinct rule might have been applied between calls to next_bucket
+// the new impl ignores docs removed in (2), which is a missed performance optimization, see `next_bucket`
+// this perf problem is P2
+// it mostly happens when many documents map to the same distinct attribute value.
 pub struct Sort<'ctx, Query> {
     field_name: String,
     field_id: Option<FieldId>,
@@ -127,6 +142,9 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx,
     ) -> Result<Option<RankingRuleOutput<Query>>> {
         let iter = self.iter.as_mut().unwrap();
         // TODO: we should make use of the universe in the function below
+        // good for correctness, but ideally iter.next_bucket would take the current universe into account,
+        // as right now it could return buckets that don't intersect with the universe, meaning we will make many
+        // unneeded calls.
         if let Some(mut bucket) = iter.next_bucket()?
{
             bucket.candidates &= universe;
             Ok(Some(bucket))

From 5a644054ab5521f78cca63b7ec057b4944cabf17 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Mon, 27 Mar 2023 11:04:27 +0200
Subject: [PATCH 089/234] Removed unused search impl

---
 milli/src/search/new/mod.rs | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 194ffb035..5839787ef 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -33,7 +33,7 @@ use roaring::RoaringBitmap;
 use words::Words;

 use self::ranking_rules::RankingRule;
-use crate::{Filter, Index, MatchingWords, Result, Search, SearchResult, TermsMatchingStrategy};
+use crate::{Filter, Index, MatchingWords, Result, SearchResult, TermsMatchingStrategy};

 /// A structure used throughout the execution of a search query.
 pub struct SearchContext<'ctx> {
@@ -292,10 +292,3 @@ pub fn execute_search(
         documents_ids,
     })
 }
-
-impl<'a> Search<'a> {
-    // TODO
-    pub fn execute_new(&self) -> Result<SearchResult> {
-        todo!()
-    }
-}

From e9eb2714992b6708a0f796d7acd71d38ff92a65a Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Mon, 27 Mar 2023 11:04:43 +0200
Subject: [PATCH 090/234] SmallBitmap: don't expose internal items

---
 milli/src/search/new/small_bitmap.rs | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs
index 24541eb6c..b1cb78c8c 100644
--- a/milli/src/search/new/small_bitmap.rs
+++ b/milli/src/search/new/small_bitmap.rs
@@ -31,6 +31,7 @@ impl<T> SmallBitmap<T> {
         }
     }
+
     pub fn universe_length(&self) -> u16 {
         match &self.internal {
             SmallBitmapInternal::Tiny(_) => 64,
             SmallBitmapInternal::Small(xs) => 64 * xs.len() as u16,
         }
     }
@@ -82,7 +83,7 @@ impl<T> SmallBitmap<T> {
     }
 }
 #[derive(Clone)]
-pub enum SmallBitmapInternal {
+enum SmallBitmapInternal {
     Tiny(u64),
     Small(Box<[u64]>),
 }
@@ -182,11 +183,7 @@ impl SmallBitmapInternal {
         }
     }
-    pub fn all_satisfy_op(
-        &self,
-        other: &SmallBitmapInternal,
-        op: impl Fn(u64, u64) -> bool,
-    ) -> bool {
+    fn all_satisfy_op(&self, other: &SmallBitmapInternal, op: impl Fn(u64, u64) -> bool) -> bool {
         match (self, other) {
             (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(*a, *b),
             (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => {
@@ -203,11 +200,7 @@ impl SmallBitmapInternal {
         }
     }
-    pub fn any_satisfy_op(
-        &self,
-        other: &SmallBitmapInternal,
-        op: impl Fn(u64, u64) -> bool,
-    ) -> bool {
+    fn any_satisfy_op(&self, other: &SmallBitmapInternal, op: impl Fn(u64, u64) -> bool) -> bool {
         match (self, other) {
             (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(*a, *b),
             (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => {

From 9b83b1deb014dcc8e0efe41877d22cdbba22c8db Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Mon, 27 Mar 2023 11:08:03 +0200
Subject: [PATCH 091/234] Remove empty attribute_rank mod

---
 milli/src/search/new/ranking_rule_graph/mod.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs
index 5ceee3f4e..fb9a82d68 100644
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@@ -15,8 +15,6 @@ mod proximity;
 /// Implementation of the `typo` ranking rule
 mod typo;

-mod attribute_rank;
-
 use std::hash::Hash;

 pub use condition_docids_cache::ConditionDocIdsCache;

From 9b83b1deb014dcc8e0efe41877d22cdbba22c8db Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Mon, 27 Mar 2023 17:49:18 +0200
Subject:
[PATCH 092/234] Expose SearchLogger trait

---
 milli/src/lib.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index b2bc9d1a2..eb63c3904 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -77,7 +77,9 @@ use std::hash::BuildHasherDefault;
 pub use filter_parser::{Condition, FilterCondition, Span, Token};
 use fxhash::{FxHasher32, FxHasher64};
 pub use grenad::CompressionType;
-pub use search::new::{execute_search, DefaultSearchLogger, DetailedSearchLogger, SearchContext};
+pub use search::new::{
+    execute_search, DefaultSearchLogger, DetailedSearchLogger, SearchContext, SearchLogger,
+};
 use serde_json::Value;
 pub use {charabia as tokenizer, heed};

From af65fe201aaac2e6e89e49a14babd683884cecfc Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Mon, 27 Mar 2023 17:49:43 +0200
Subject: [PATCH 093/234] Clean-up search example

---
 milli/examples/search.rs | 152 ++++++++++++++++++---------------------
 1 file changed, 70 insertions(+), 82 deletions(-)

diff --git a/milli/examples/search.rs b/milli/examples/search.rs
index 57aac5a02..ecc7f9cb8 100644
--- a/milli/examples/search.rs
+++ b/milli/examples/search.rs
@@ -1,125 +1,113 @@
-// use crate::allocator::ALLOC;
 use std::error::Error;
 use std::io::stdin;
 use std::time::Instant;

 use heed::EnvOpenOptions;
 use milli::{
-    execute_search, DefaultSearchLogger, Index, Search, SearchContext, TermsMatchingStrategy,
+    execute_search, DefaultSearchLogger, Index, SearchContext, SearchLogger, TermsMatchingStrategy,
 };

 #[global_allocator]
 static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;

 fn main() -> Result<(), Box<dyn Error>> {
-    // TODO: command line
     let mut args = std::env::args();
-    let _ = args.next().unwrap();
-    let dataset = args.next().unwrap();
+    let program_name = args.next().expect("No program name");
+    let dataset = args.next().unwrap_or_else(|| {
+        format!(
+            "Missing path to index. Usage: {} <PATH-TO-INDEX> [<logger-dir>] [print-documents]",
+            program_name
+        )
+    });
+    let detailed_logger = args.next();
+    let print_documents: bool =
+        if let Some(arg) = args.next() { arg == "print-documents" } else { false };

     let mut options = EnvOpenOptions::new();
     options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

-    // Query:
-    // disp: 20
-    //
-    // dasp: 70 words
-    // dosp: 80
-    // dasc: 80
-    //
-    //
-    // daspouyerf
-    // daspojewkfb
-
     let index = Index::new(options, dataset)?;
     let txn = index.read_txn()?;
     let mut query = String::new();
     while stdin().read_line(&mut query)? > 0 {
         for _ in 0..2 {
-            let start = Instant::now();
-            let mut s = Search::new(&txn, &index);
-            s.query(
-                // "which a the releases from poison by the government",
-                // "sun flower s are the best",
-                query.trim(),
-            );
-            s.terms_matching_strategy(TermsMatchingStrategy::Last);
-            s.offset(0);
-            // s.limit(1);
-            // s.criterion_implementation_strategy(
-            //     milli::CriterionImplementationStrategy::OnlySetBased,
-            // );
-
-            let docs = s.execute().unwrap();
-            let elapsed = start.elapsed();
-            println!("old: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
+            let mut default_logger = DefaultSearchLogger;
+            // FIXME: consider resetting the state of the logger between search executions, as otherwise panics are possible.
+            // Worked around here by recreating the logger on each iteration of the loop
+            let mut detailed_logger = detailed_logger
+                .as_ref()
+                .map(|logger_dir| milli::DetailedSearchLogger::new(logger_dir));
+            let logger: &mut dyn SearchLogger<_> =
+                if let Some(detailed_logger) = detailed_logger.as_mut() {
+                    detailed_logger
+                } else {
+                    &mut default_logger
+                };

             let start = Instant::now();

-            // let mut logger = milli::DetailedSearchLogger::new("log");
             let mut ctx = SearchContext::new(&index, &txn);
             let docs = execute_search(
                 &mut ctx,
-                query.trim(),
+                &(!query.trim().is_empty()).then(|| query.trim().to_owned()),
                 // what a the from which when there is
                 TermsMatchingStrategy::Last,
-                None,
+                &None,
                 0,
                 20,
+                None,
                 &mut DefaultSearchLogger,
-                &mut DefaultSearchLogger,
-                // &mut logger,
+                logger,
             )?;
-            // logger.write_d2_description(&mut ctx);
+            if let Some(logger) = &detailed_logger {
+                logger.write_d2_description(&mut ctx);
+            }
             let elapsed = start.elapsed();
             println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
+            if print_documents {
+                let documents = index
+                    .documents(&txn, docs.documents_ids.iter().copied())
+                    .unwrap()
+                    .into_iter()
+                    .map(|(id, obkv)| {
+                        let mut object = serde_json::Map::default();
+                        for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
+                            let value = obkv.get(fid).unwrap();
+                            let value: serde_json::Value = serde_json::from_slice(value).unwrap();
+                            object.insert(fid_name.to_owned(), value);
+                        }
+                        (id, serde_json::to_string_pretty(&object).unwrap())
+                    })
+                    .collect::<Vec<_>>();

-            // let documents = index
-            //     .documents(&txn, docs.documents_ids.iter().copied())
-            //     .unwrap()
-            //     .into_iter()
-            //     .map(|(id, obkv)| {
-            //         let mut object = serde_json::Map::default();
-            //         for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-            //             let value = obkv.get(fid).unwrap();
-            //             let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-            //             object.insert(fid_name.to_owned(), value);
-            //         }
-            //         (id, serde_json::to_string_pretty(&object).unwrap())
-            //     })
-            //     .collect::<Vec<_>>();
+                for (id, document) in documents {
+                    println!("{id}:");
+                    println!("{document}");
+                }

-            // println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-            // for (id, document) in documents {
-            //     println!("{id}:");
-            //     println!("{document}");
-            // }
-
-            // let documents = index
-            //     .documents(&txn, docs.documents_ids.iter().copied())
-            //     .unwrap()
-            //     .into_iter()
-            //     .map(|(id, obkv)| {
-            //         let mut object = serde_json::Map::default();
-            //         for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-            //             let value = obkv.get(fid).unwrap();
-            //             let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-            //             object.insert(fid_name.to_owned(), value);
-            //         }
-            //         (id, serde_json::to_string_pretty(&object).unwrap())
-            //     })
-            //     .collect::<Vec<_>>();
-            // println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-            // for (id, document) in documents {
-            //     println!("{id}:");
-            //     println!("{document}");
-            // }
+                let documents = index
+                    .documents(&txn, docs.documents_ids.iter().copied())
+                    .unwrap()
+                    .into_iter()
+                    .map(|(id, obkv)| {
+                        let mut object = serde_json::Map::default();
+                        for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
+                            let value = obkv.get(fid).unwrap();
+                            let value: serde_json::Value = serde_json::from_slice(value).unwrap();
+                            object.insert(fid_name.to_owned(), value);
+                        }
+                        (id, serde_json::to_string_pretty(&object).unwrap())
+                    })
+                    .collect::<Vec<_>>();
+                println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
+                for (id, document) in documents {
+                    println!("{id}:");
+                    println!("{document}");
+                }
+            }
         }
         query.clear();
     }
-    // for (id, document) in documents {
-    //     println!("{id}:");
-    //     // println!("{document}");
-    // }
     Ok(())
 }

From 626a93b348e27d2372edd5346bf68a42f010261f Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Mon, 27 Mar 2023 18:18:01 +0200
Subject: [PATCH 094/234] Search example: panic when missing the index path

---
 milli/examples/search.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/examples/search.rs b/milli/examples/search.rs
index ecc7f9cb8..30ce5eca7 100644
--- a/milli/examples/search.rs
+++ b/milli/examples/search.rs
@@ -14,7 +14,7 @@ fn main() -> Result<(), Box<dyn Error>> {
     let mut args = std::env::args();
     let program_name = args.next().expect("No program name");
     let dataset = args.next().unwrap_or_else(|| {
-        format!(
+        panic!(
             "Missing path to index. Usage: {} <PATH-TO-INDEX> [<logger-dir>] [print-documents]",
             program_name
         )

From 8d7d8cdc2fff73bc85d79688e90e6cb5affef54c Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Mon, 27 Mar 2023 18:34:10 +0200
Subject: [PATCH 095/234] Clean-up index example

---
 milli/examples/index.rs | 87 +++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 47 deletions(-)

diff --git a/milli/examples/index.rs b/milli/examples/index.rs
index 17a62b31f..504763664 100644
--- a/milli/examples/index.rs
+++ b/milli/examples/index.rs
@@ -1,41 +1,55 @@
-use std::{
-    error::Error,
-    fs::File,
-    io::{BufRead, BufReader, Cursor, Seek},
-    time::Duration,
-};
+use std::error::Error;
+use std::fs::File;
+use std::io::{BufRead, BufReader, Cursor, Seek};
+use std::path::Path;

 use heed::EnvOpenOptions;
-use milli::{
-    documents::{DocumentsBatchBuilder, DocumentsBatchReader},
-    update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings},
-    Criterion, Index, Object,
-};
+use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
+use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
+use milli::{Criterion, Index, Object};
+
+fn usage(error: &str, program_name: &str) -> String {
+    format!(
+        "{}. Usage: {} <PATH-TO-INDEX> <PATH-TO-DATASET> [searchable_fields] [filterable_fields]",
+        error, program_name
+    )
+}

 fn main() -> Result<(), Box<dyn Error>> {
+    let mut args = std::env::args();
+    let program_name = args.next().expect("No program name");
+    let index_path =
+        args.next().unwrap_or_else(|| panic!("{}", usage("Missing path to index.", &program_name)));
+    let dataset_path = args
+        .next()
+        .unwrap_or_else(|| panic!("{}", usage("Missing path to source dataset.", &program_name)));
+    let primary_key = args.next().unwrap_or_else(|| "id".into());
+    // "title overview"
+    let searchable_fields: Vec<String> = args
+        .next()
+        .map(|arg| arg.split_whitespace().map(ToString::to_string).collect())
+        .unwrap_or_default();
+    // "release_date genres"
+    let filterable_fields: Vec<String> = args
+        .next()
+        .map(|arg| arg.split_whitespace().map(ToString::to_string).collect())
+        .unwrap_or_default();
+
     let mut options = EnvOpenOptions::new();
     options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-    let index = Index::new(options, "data_organizations").unwrap();
+    std::fs::create_dir_all(&index_path).unwrap();
+    let index = Index::new(options, index_path).unwrap();
     let mut wtxn = index.write_txn().unwrap();

-    let primary_key = "uuid";
-    // let searchable_fields = vec!["body", "title", "url"];
-    // let searchable_fields = vec!["title", "overview"];
-    let searchable_fields =
-        vec!["name", "primary_role", "city", "region", "country_code", "short_description"];
-    // let filterable_fields = vec!["release_date", "genres"];
-
     let config = IndexerConfig::default();
     let mut builder = Settings::new(&mut wtxn, &index, &config);
-    builder.set_primary_key(primary_key.to_owned());
+    builder.set_primary_key(primary_key);
     let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
     builder.set_searchable_fields(searchable_fields);
-    // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
-    // builder.set_filterable_fields(filterable_fields);
+    let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
+    builder.set_filterable_fields(filterable_fields);

-    // builder.set_min_word_len_one_typo(5);
-    // builder.set_min_word_len_two_typos(100);
     builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
     builder.execute(|_| (), || false).unwrap();
@@ -45,35 +59,14 @@ fn main() -> Result<(), Box<dyn Error>> {
     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();

     let documents = documents_from(
-        // "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
-        "/Users/meilisearch/Documents/datasets/organizations.csv",
-        // "json"
-        "csv",
+        &dataset_path,
+        Path::new(&dataset_path).extension().unwrap_or_default().to_str().unwrap_or_default(),
     );
     let (builder, user_error) = builder.add_documents(documents).unwrap();
     user_error.unwrap();
     builder.execute().unwrap();
     wtxn.commit().unwrap();

-    // let rtxn = index.read_txn().unwrap();
-
-    // let mut wtxn = index.write_txn().unwrap();
-    // let config = IndexerConfig::default();
-    // let indexing_config = IndexDocumentsConfig::default();
-    // let builder =
-    //     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();

-    // let documents = documents_from("test_doc.json", "json");
-    // let (builder, user_error) = builder.add_documents(documents).unwrap();
-    // user_error.unwrap();
-    // builder.execute().unwrap();
-    // wtxn.commit().unwrap();

-    // let _ = index.all_documents(&rtxn)?;

-    // println!("done!");
-    // std::thread::sleep(Duration::from_secs(100));
-
     index.prepare_for_closing().wait();
     Ok(())
 }

From b4a52a622e138179d158f66acb849a5e4349c743 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Tue, 28 Mar 2023 12:39:42 +0200
Subject: [PATCH 096/234] BoxRankingRule

---
 milli/src/search/new/logger/detailed.rs |  3 ++-
 milli/src/search/new/logger/mod.rs      |  5 +++--
 milli/src/search/new/mod.rs             | 10 +++++-----
 milli/src/search/new/ranking_rules.rs   |  4 +++-
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs
index d6037aab2..57be61612 100644
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@@ -13,6 +13,7 @@ use crate::search::new::ranking_rule_graph::{
     DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph,
     RankingRuleGraphTrait, TypoCondition, TypoGraph,
 };
+use crate::search::new::ranking_rules::BoxRankingRule;
 use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger};

 pub enum SearchEvents {
@@ -98,7 +99,7 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
     fn initial_universe(&mut self, universe: &RoaringBitmap) {
         self.initial_universe = Some(universe.clone());
     }
-    fn ranking_rules(&mut self, rr: &[Box<dyn RankingRule<QueryGraph>>]) {
+    fn ranking_rules(&mut self, rr: &[BoxRankingRule<QueryGraph>]) {
         self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect());
     }

diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs
index 9ebb4344a..470983017 100644
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@@ -8,6 +8,7 @@ use super::query_graph::QueryNode;
 use super::ranking_rule_graph::{
     DeadEndsCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoCondition, TypoGraph,
 };
+use super::ranking_rules::BoxRankingRule;
 use super::{RankingRule, RankingRuleQueryTrait};

 /// Trait for structure logging the execution of a search query.
@@ -22,7 +23,7 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
     fn initial_universe(&mut self, universe: &RoaringBitmap);

     /// Logs the ranking rules used to perform the search query
-    fn ranking_rules(&mut self, rr: &[Box<dyn RankingRule<Q>>]);
+    fn ranking_rules(&mut self, rr: &[BoxRankingRule<Q>]);

     /// Logs the start of a ranking rule's iteration.
     fn start_iteration_ranking_rule(
@@ -93,7 +94,7 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {

     fn initial_universe(&mut self, _universe: &RoaringBitmap) {}

-    fn ranking_rules(&mut self, _rr: &[Box<dyn RankingRule<Q>>]) {}
+    fn ranking_rules(&mut self, _rr: &[BoxRankingRule<Q>]) {}

     fn start_iteration_ranking_rule(
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 5839787ef..66feeab99 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -32,7 +32,7 @@ use resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache};
 use roaring::RoaringBitmap;
 use words::Words;

-use self::ranking_rules::RankingRule;
+use self::ranking_rules::{BoxRankingRule, RankingRule};
 use crate::{Filter, Index, MatchingWords, Result, SearchResult, TermsMatchingStrategy};

 /// A structure used throughout the execution of a search query.
 pub struct SearchContext<'ctx> {
@@ -106,11 +106,11 @@ fn resolve_maximally_reduced_query_graph(

 /// Return the list of initialised ranking rules to be used for a placeholder search.
 fn get_ranking_rules_for_placeholder_search<'ctx>(
fn get_ranking_rules_for_placeholder_search<'ctx>( ctx: &SearchContext<'ctx>, -) -> Result>>> { +) -> Result>> { // let sort = false; // let mut asc = HashSet::new(); // let mut desc = HashSet::new(); - let /*mut*/ ranking_rules: Vec>> = vec![]; + let /*mut*/ ranking_rules: Vec> = vec![]; let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; for rr in settings_ranking_rules { // Add Words before any of: typo, proximity, attribute, exactness @@ -132,7 +132,7 @@ fn get_ranking_rules_for_placeholder_search<'ctx>( fn get_ranking_rules_for_query_graph_search<'ctx>( ctx: &SearchContext<'ctx>, terms_matching_strategy: TermsMatchingStrategy, -) -> Result>>> { +) -> Result>> { // query graph search let mut words = false; let mut typo = false; @@ -143,7 +143,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut asc = HashSet::new(); let mut desc = HashSet::new(); - let mut ranking_rules: Vec>> = vec![]; + let mut ranking_rules: Vec> = vec![]; let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; for rr in settings_ranking_rules { // Add Words before any of: typo, proximity, attribute, exactness diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 32434248c..1f5f4b366 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -15,6 +15,8 @@ pub struct PlaceholderQuery; impl RankingRuleQueryTrait for PlaceholderQuery {} impl RankingRuleQueryTrait for QueryGraph {} +pub type BoxRankingRule<'ctx, Query> = Box + 'ctx>; + /// A trait that must be implemented by all ranking rules. /// /// It is generic over `'ctx`, the lifetime of the search context @@ -70,7 +72,7 @@ pub struct RankingRuleOutput { pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, - mut ranking_rules: Vec>>, + mut ranking_rules: Vec>, query: &Q, universe: &RoaringBitmap, from: usize, From abb19d368dc110db87676235168b27f3b1c545ab Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 28 Mar 2023 12:40:52 +0200 Subject: [PATCH 097/234] Initialize query time ranking rule for query search --- milli/src/search/mod.rs | 1 + milli/src/search/new/mod.rs | 45 +++++++++++++++++++++++++++++++----- milli/src/search/new/sort.rs | 2 +- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 0792ce799..734671990 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -113,6 +113,7 @@ impl<'a> Search<'a> { &self.query, self.terms_matching_strategy, &self.filter, + &self.sort_criteria, self.offset, self.limit, Some(self.words_limit), diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 66feeab99..5a282891b 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -33,7 +33,11 @@ use roaring::RoaringBitmap; use words::Words; use self::ranking_rules::{BoxRankingRule, RankingRule}; -use crate::{Filter, Index, MatchingWords, Result, SearchResult, TermsMatchingStrategy}; +use self::sort::Sort; +use crate::{ + AscDesc, CriterionError, Filter, Index, MatchingWords, Member, Result, SearchResult, + TermsMatchingStrategy, +}; /// A structure used throughout the execution of a search query. pub struct SearchContext<'ctx> { @@ -106,6 +110,7 @@ fn resolve_maximally_reduced_query_graph( /// Return the list of initialised ranking rules to be used for a placeholder search. 
 fn get_ranking_rules_for_placeholder_search<'ctx>(
     ctx: &SearchContext<'ctx>,
+    sort_criteria: &Option<Vec<AscDesc>>,
 ) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
     // let sort = false;
     // let mut asc = HashSet::new();
     // let mut desc = HashSet::new();
@@ -136,13 +141,14 @@ fn get_ranking_rules_for_placeholder_search<'ctx>(
 /// Return the list of initialised ranking rules to be used for a query graph search.
 fn get_ranking_rules_for_query_graph_search<'ctx>(
     ctx: &SearchContext<'ctx>,
+    sort_criteria: &Option<Vec<AscDesc>>,
     terms_matching_strategy: TermsMatchingStrategy,
 ) -> Result<Vec<BoxRankingRule<'ctx, QueryGraph>>> {
     // query graph search
     let mut words = false;
     let mut typo = false;
     let mut proximity = false;
-    let sort = false;
+    let mut sort = false;
     let attribute = false;
     let exactness = false;
     let mut asc = HashSet::new();
     let mut desc = HashSet::new();
@@ -193,8 +199,33 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
                 if sort {
                     continue;
                 }
-                // todo!();
-                // sort = false;
+
+                for criterion in sort_criteria.clone().unwrap_or_default() {
+                    let sort_ranking_rule = match criterion {
+                        AscDesc::Asc(Member::Field(field_name)) => {
+                            if asc.contains(&field_name) {
+                                continue;
+                            }
+                            asc.insert(field_name.clone());
+                            Sort::new(ctx.index, ctx.txn, field_name, true)?
+                        }
+                        AscDesc::Desc(Member::Field(field_name)) => {
+                            if desc.contains(&field_name) {
+                                continue;
+                            }
+                            desc.insert(field_name.clone());
+                            Sort::new(ctx.index, ctx.txn, field_name, false)?
+                        }
+                        _ => {
+                            return Err(CriterionError::ReservedNameForSort {
+                                name: "_geoPoint".to_string(),
+                            }
+                            .into())
+                        }
+                    };
+                    ranking_rules.push(Box::new(sort_ranking_rule));
+                }
+                sort = true;
             }
             crate::Criterion::Exactness => {
                 if exactness {
@@ -228,6 +259,7 @@ pub fn execute_search(
     query: &Option<String>,
     terms_matching_strategy: TermsMatchingStrategy,
     filters: &Option<Filter>,
+    sort_criteria: &Option<Vec<AscDesc>>,
     from: usize,
     length: usize,
     words_limit: Option<usize>,
@@ -268,11 +300,12 @@ pub fn execute_search(
         query_graph_logger,
     )?;

-    let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, terms_matching_strategy)?;
+    let ranking_rules =
+        get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?;

     bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
 } else {
-        let ranking_rules = get_ranking_rules_for_placeholder_search(ctx)?;
+        let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?;
         bucket_sort(
             ctx,
             ranking_rules,
diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs
index 04152f0f0..53144d00d 100644
--- a/milli/src/search/new/sort.rs
+++ b/milli/src/search/new/sort.rs
@@ -51,7 +51,7 @@ pub struct Sort<'ctx, Query> {
     iter: Option<RankingRuleOutputIterWrapper<'ctx, Query>>,
 }
 impl<'ctx, Query> Sort<'ctx, Query> {
-    pub fn _new(
+    pub fn new(
         index: &Index,
         rtxn: &'ctx heed::RoTxn,
         field_name: String,

From 53afda3237767c45d29e5f05709d914b3c02da45 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Tue, 28 Mar 2023 16:35:46 +0200
Subject: [PATCH 098/234] Update search usage in example

---
 milli/examples/search.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/milli/examples/search.rs b/milli/examples/search.rs
index 30ce5eca7..1757c1c5b 100644
--- a/milli/examples/search.rs
+++ b/milli/examples/search.rs
@@ -53,6 +53,7 @@ fn main() -> Result<(), Box<dyn Error>> {
             // what a the from which when there is
             TermsMatchingStrategy::Last,
             &None,
+            &None,
             0,
             20,
             None,

From 77acafe534f887031f401bfcfcfc913512bdf321 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Tue, 28 Mar 2023 16:41:03 +0200
Subject: [PATCH 099/234] Resolve search time sort criteria for placeholder search

---
 milli/src/search/new/mod.rs | 79 ++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 31 deletions(-)

diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 5a282891b..881151aa3 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -112,10 +112,10 @@ fn get_ranking_rules_for_placeholder_search<'ctx>(
     ctx: &SearchContext<'ctx>,
     sort_criteria: &Option<Vec<AscDesc>>,
 ) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
-    // let sort = false;
-    // let mut asc = HashSet::new();
-    // let mut desc = HashSet::new();
-    let /*mut*/ ranking_rules: Vec<BoxRankingRule<'ctx, PlaceholderQuery>> = vec![];
+    let mut sort = false;
+    let mut asc = HashSet::new();
+    let mut desc = HashSet::new();
+    let mut ranking_rules: Vec<BoxRankingRule<'ctx, PlaceholderQuery>> = vec![];
     let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
     for rr in settings_ranking_rules {
         // Add Words before any of: typo, proximity, attribute, exactness
@@ -125,7 +125,13 @@ fn get_ranking_rules_for_placeholder_search<'ctx>(
             | crate::Criterion::Attribute
             | crate::Criterion::Proximity
             | crate::Criterion::Exactness => continue,
-            crate::Criterion::Sort => todo!(),
+            crate::Criterion::Sort => {
+                if sort {
+                    continue;
+                }
+                resolve_sort_criteria(sort_criteria, ctx, &mut ranking_rules, &mut asc, &mut desc)?;
+                sort = true;
+            }
             crate::Criterion::Asc(_) => todo!(),
             crate::Criterion::Desc(_) => todo!(),
         }
     }
@@ -199,32 +205,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
                 if sort {
                     continue;
                 }
-
-                for criterion in sort_criteria.clone().unwrap_or_default() {
-                    let sort_ranking_rule = match criterion {
-                        AscDesc::Asc(Member::Field(field_name)) => {
-                            if asc.contains(&field_name) {
-                                continue;
-                            }
-                            asc.insert(field_name.clone());
-                            Sort::new(ctx.index, ctx.txn, field_name, true)?
-                        }
-                        AscDesc::Desc(Member::Field(field_name)) => {
-                            if desc.contains(&field_name) {
-                                continue;
-                            }
-                            desc.insert(field_name.clone());
-                            Sort::new(ctx.index, ctx.txn, field_name, false)?
- } - _ => { - return Err(CriterionError::ReservedNameForSort { - name: "_geoPoint".to_string(), - } - .into()) - } - }; - ranking_rules.push(Box::new(sort_ranking_rule)); - } + resolve_sort_criteria(sort_criteria, ctx, &mut ranking_rules, &mut asc, &mut desc)?; sort = true; } crate::Criterion::Exactness => { @@ -253,6 +234,42 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( Ok(ranking_rules) } +fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( + sort_criteria: &Option>, + ctx: &SearchContext<'ctx>, + ranking_rules: &mut Vec>, + asc: &mut HashSet, + desc: &mut HashSet, +) -> Result<()> { + let sort_criteria = sort_criteria.clone().unwrap_or_default(); + ranking_rules.reserve(sort_criteria.len()); + for criterion in sort_criteria { + let sort_ranking_rule = match criterion { + AscDesc::Asc(Member::Field(field_name)) => { + if asc.contains(&field_name) { + continue; + } + asc.insert(field_name.clone()); + Sort::new(ctx.index, ctx.txn, field_name, true)? + } + AscDesc::Desc(Member::Field(field_name)) => { + if desc.contains(&field_name) { + continue; + } + desc.insert(field_name.clone()); + Sort::new(ctx.index, ctx.txn, field_name, false)? + } + _ => { + return Err( + CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() }.into() + ) + } + }; + ranking_rules.push(Box::new(sort_ranking_rule)); + } + Ok(()) +} + #[allow(clippy::too_many_arguments)] pub fn execute_search( ctx: &mut SearchContext, From d4f6216966f20c888f840471bd698657c281c345 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 28 Mar 2023 16:41:25 +0200 Subject: [PATCH 100/234] Resolve rule time sort criteria --- milli/src/search/new/mod.rs | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 881151aa3..a7728bc5a 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -132,8 +132,20 @@ fn get_ranking_rules_for_placeholder_search<'ctx>( resolve_sort_criteria(sort_criteria, ctx, &mut ranking_rules, &mut asc, &mut desc)?; sort = true; } - crate::Criterion::Asc(_) => todo!(), - crate::Criterion::Desc(_) => todo!(), + crate::Criterion::Asc(field_name) => { + if asc.contains(&field_name) { + continue; + } + asc.insert(field_name.clone()); + ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?)); + } + crate::Criterion::Desc(field_name) => { + if desc.contains(&field_name) { + continue; + } + desc.insert(field_name.clone()); + ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?)); + } } } Ok(ranking_rules) @@ -215,19 +227,19 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( // todo!(); // exactness = false; } - crate::Criterion::Asc(field) => { - if asc.contains(&field) { + crate::Criterion::Asc(field_name) => { + if asc.contains(&field_name) { continue; } - asc.insert(field); - // TODO + asc.insert(field_name.clone()); + ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?)); } - crate::Criterion::Desc(field) => { - if desc.contains(&field) { + crate::Criterion::Desc(field_name) => { + if desc.contains(&field_name) { continue; } - desc.insert(field); - // todo!(); + desc.insert(field_name.clone()); + ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?)); } } } From 3524bd1257b624b82c2750409dcbc059ec412806 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 29 Mar 2023 08:44:11 +0200 Subject: [PATCH 101/234] SmallBitmap: Add documentation --- 
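Note: a minimal sketch of the set semantics documented below (the `Color` tag type is hypothetical, and `Interned::from_raw` is called directly purely for illustration; real callers obtain `Interned` values from an interner):

    let mut a: SmallBitmap<Color> = SmallBitmap::new(66);
    assert_eq!(a.universe_length(), 128); // 66 is rounded up to the next multiple of 64
    a.insert(Interned::from_raw(3));
    assert!(a.contains(Interned::from_raw(3)));

    let mut b: SmallBitmap<Color> = SmallBitmap::new(128); // same universe length as `a`
    b.insert(Interned::from_raw(3));
    b.insert(Interned::from_raw(70));

    a.union(&b); // `a` is now {3, 70}
    assert!(b.is_subset(&a));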
milli/src/search/new/small_bitmap.rs | 92 +++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 3 deletions(-) diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index b1cb78c8c..c7ff3009a 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -3,22 +3,37 @@ use std::marker::PhantomData; use super::interner::{FixedSizeInterner, Interned}; /// A compact set of [`Interned<T>`] +/// +/// This set optimizes storage by storing the set of values in a bitmap, and further optimizes +/// for bitmaps where the highest possible index (describing the limits of the "universe") +/// is smaller than 64 by storing them as a `u64`. pub struct SmallBitmap<T> { + // internals are not typed as they only represent the indexes that are set internal: SmallBitmapInternal, + // restores typing with a tag _phantom: PhantomData<T>, } + +// manual implementation for when `T` is not Clone. impl<T> Clone for SmallBitmap<T> { fn clone(&self) -> Self { Self { internal: self.internal.clone(), _phantom: PhantomData } } } + impl<T> SmallBitmap<T> { + /// Constructs a new, **empty**, `SmallBitmap` with a universe large enough to hold all elements + /// from `interner`. + /// + /// The constructed bitmap does not refer to any element in the interner; use [`from_iter`] if there should be + /// some interned values in the bitmap after construction. pub fn for_interned_values_in(interner: &FixedSizeInterner<T>) -> Self { Self::new(interner.len()) } - // universe_length not stored anywhere, only used to decide between tiny/small - // universe_length: passed 63, actual length will be rounded up 64 - // passed 66, actual 64 * xs.len() as u16 = 128, passed sized rounded up to the next 64 + + /// Constructs a new, **empty**, `SmallBitmap` with a universe at least as large as specified. + /// + /// If the passed universe length is not a multiple of 64, it will be rounded up to the next multiple of 64. pub fn new(universe_length: u16) -> Self { if universe_length <= 64 { Self { internal: SmallBitmapInternal::Tiny(0), _phantom: PhantomData } @@ -32,12 +47,24 @@ impl<T> SmallBitmap<T> { } } + /// The highest index that can be set in this bitmap. + /// + /// The universe length is always a multiple of 64, and may be higher than the value passed to [`Self::new`]. pub fn universe_length(&self) -> u16 { match &self.internal { SmallBitmapInternal::Tiny(_) => 64, SmallBitmapInternal::Small(xs) => 64 * xs.len() as u16, } } + + /// Constructs a new `SmallBitmap` with a universe large enough to hold all elements + /// from `for_interner`, and containing all the `Interned<T>` produced by `xs`. + /// + /// It is a logic error to pass an iterator producing `Interned`s that don't belong to the passed interner. + /// + /// # Panics + /// + /// - If `xs` produces an element that doesn't fit the universe length obtained from `for_interner`. pub fn from_iter( xs: impl Iterator<Item = Interned<T>>, for_interner: &FixedSizeInterner<T>, ) -> Self { Self { internal: SmallBitmapInternal::from_iter(xs.map(|x| x.into_raw()), for_interner.len()), _phantom: PhantomData, } } + + /// Returns `true` if this bitmap does not contain any `Interned<T>`. pub fn is_empty(&self) -> bool { self.internal.is_empty() } + + /// Removes all `Interned<T>` from this bitmap, such that [`is_empty`] returns `true` after this call. pub fn clear(&mut self) { self.internal.clear() } + + /// Whether `x` is part of the bitmap. + /// + /// It is a logic error to pass an `Interned` from a different interner than the one this bitmap references.
+ /// + /// # Panics + /// + /// - if `x` does not fit in [`universe_length`] pub fn contains(&self, x: Interned<T>) -> bool { self.internal.contains(x.into_raw()) } + + /// Adds `x` to the bitmap, such that [`contains(x)`] returns `true` after this call. + /// + /// It is a logic error to pass an `Interned` from a different interner than the one this bitmap references. + /// + /// # Panics + /// + /// - if `x` does not fit in [`universe_length`] pub fn insert(&mut self, x: Interned<T>) { self.internal.insert(x.into_raw()) } + + /// Removes `x` from the bitmap, such that [`contains(x)`] returns `false` after this call. + /// + /// It is a logic error to pass an `Interned` from a different interner than the one this bitmap references. + /// + /// # Panics + /// + /// - if `x` does not fit in [`universe_length`] pub fn remove(&mut self, x: Interned<T>) { self.internal.remove(x.into_raw()) } + /// Modifies this bitmap in place to retain only the elements that are also present in `other`. + /// + /// # Panics + /// + /// - if the universe lengths of `self` and `other` differ pub fn intersection(&mut self, other: &Self) { self.internal.intersection(&other.internal) } + + /// Modifies this bitmap in place to add the elements that are present in `other`. + /// + /// # Panics + /// + /// - if the universe lengths of `self` and `other` differ pub fn union(&mut self, other: &Self) { self.internal.union(&other.internal) } + + /// Modifies this bitmap in place to remove the elements that are also present in `other`. + /// + /// # Panics + /// + /// - if the universe lengths of `self` and `other` differ pub fn subtract(&mut self, other: &Self) { self.internal.subtract(&other.internal) } + + /// Whether all the elements of `self` are contained in `other`. + /// + /// # Panics + /// + /// - if the universe lengths of `self` and `other` differ pub fn is_subset(&self, other: &Self) -> bool { self.internal.is_subset(&other.internal) } + + /// Whether any element of `self` is contained in `other`. + /// + /// # Panics + /// + /// - if the universe lengths of `self` and `other` differ pub fn intersects(&self, other: &Self) -> bool { self.internal.intersects(&other.internal) } + + /// Returns an iterator of the `Interned<T>` that are present in this bitmap. pub fn iter(&self) -> impl Iterator<Item = Interned<T>> + '_ { self.internal.iter().map(|x| Interned::from_raw(x)) } From ef084ef0421a3d0c800c1a158441e1ef1f56c368 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 29 Mar 2023 08:45:38 +0200 Subject: [PATCH 102/234] SmallBitmap: Consistently panic on incoherent universe lengths --- milli/src/search/new/small_bitmap.rs | 132 +++++++++++++++++++-------- 1 file changed, 95 insertions(+), 37 deletions(-) diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index c7ff3009a..3fe404622 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -51,10 +51,7 @@ impl<T> SmallBitmap<T> { /// /// The universe length is always a multiple of 64, and may be higher than the value passed to [`Self::new`].
pub fn universe_length(&self) -> u16 { - match &self.internal { - SmallBitmapInternal::Tiny(_) => 64, - SmallBitmapInternal::Small(xs) => 64 * xs.len() as u16, - } + self.internal.universe_length() } /// Constructs a new `SmallBitmap` with an universe large enough to hold all elements @@ -211,37 +208,71 @@ impl SmallBitmapInternal { } } } - pub fn contains(&self, mut x: u16) -> bool { - let set = match self { - SmallBitmapInternal::Tiny(set) => *set, - SmallBitmapInternal::Small(set) => { - let idx = x / 64; - x %= 64; - set[idx as usize] + pub fn universe_length(&self) -> u16 { + match &self { + SmallBitmapInternal::Tiny(_) => 64, + SmallBitmapInternal::Small(xs) => 64 * xs.len() as u16, + } + } + + fn get_set_index(&self, x: u16) -> (u64, u16) { + match self { + SmallBitmapInternal::Tiny(set) => { + assert!( + x < 64, + "index out of bounds: the universe length is 64 but the index is {}", + x + ); + (*set, x) } - }; + SmallBitmapInternal::Small(set) => { + let idx = (x as usize) / 64; + assert!( + idx < set.len(), + "index out of bounds: the universe length is {} but the index is {}", + self.universe_length(), + x + ); + (set[idx], x % 64) + } + } + } + + fn get_set_index_mut(&mut self, x: u16) -> (&mut u64, u16) { + match self { + SmallBitmapInternal::Tiny(set) => { + assert!( + x < 64, + "index out of bounds: the universe length is 64 but the index is {}", + x + ); + (set, x) + } + SmallBitmapInternal::Small(set) => { + let idx = (x as usize) / 64; + assert!( + idx < set.len(), + "index out of bounds: the universe length is {} but the index is {}", + 64 * set.len() as u16, + x + ); + (&mut set[idx], x % 64) + } + } + } + + pub fn contains(&self, x: u16) -> bool { + let (set, x) = self.get_set_index(x); set & 0b1 << x != 0 } - pub fn insert(&mut self, mut x: u16) { - let set = match self { - SmallBitmapInternal::Tiny(set) => set, - SmallBitmapInternal::Small(set) => { - let idx = x / 64; - x %= 64; - &mut set[idx as usize] - } - }; + + pub fn insert(&mut self, x: u16) { + let (set, x) = self.get_set_index_mut(x); *set |= 0b1 << x; } - pub fn remove(&mut self, mut x: u16) { - let set = match self { - SmallBitmapInternal::Tiny(set) => set, - SmallBitmapInternal::Small(set) => { - let idx = x / 64; - x %= 64; - &mut set[idx as usize] - } - }; + + pub fn remove(&mut self, x: u16) { + let (set, x) = self.get_set_index_mut(x); *set &= !(0b1 << x); } @@ -259,13 +290,22 @@ impl SmallBitmapInternal { match (self, other) { (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(a, *b), (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => { - assert!(a.len() == b.len(),); + assert!( + a.len() == b.len(), + "universe length mismatch: left is {}, but right is {}", + a.len() * 64, + other.universe_length() + ); for (a, b) in a.iter_mut().zip(b.iter()) { op(a, *b); } } - _ => { - panic!(); + (this, other) => { + panic!( + "universe length mismatch: left is {}, but right is {}", + this.universe_length(), + other.universe_length() + ); } } } @@ -273,7 +313,12 @@ impl SmallBitmapInternal { match (self, other) { (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(*a, *b), (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => { - assert!(a.len() == b.len()); + assert!( + a.len() == b.len(), + "universe length mismatch: left is {}, but right is {}", + a.len() * 64, + other.universe_length() + ); for (a, b) in a.iter().zip(b.iter()) { if !op(*a, *b) { return false; @@ -282,7 +327,11 @@ impl SmallBitmapInternal { true } _ => { - panic!(); + panic!( + 
"universe length mismatch: left is {}, but right is {}", + self.universe_length(), + other.universe_length() + ); } } } @@ -290,7 +339,12 @@ impl SmallBitmapInternal { match (self, other) { (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(*a, *b), (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => { - assert!(a.len() == b.len()); + assert!( + a.len() == b.len(), + "universe length mismatch: left is {}, but right is {}", + a.len() * 64, + other.universe_length() + ); for (a, b) in a.iter().zip(b.iter()) { if op(*a, *b) { return true; @@ -299,7 +353,11 @@ impl SmallBitmapInternal { false } _ => { - panic!(); + panic!( + "universe length mismatch: left is {}, but right is {}", + self.universe_length(), + other.universe_length() + ); } } } From abb4522f7661796670e3a118bd755e2682b58a4b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 29 Mar 2023 09:11:06 +0200 Subject: [PATCH 103/234] Small comment on ignored rules for placeholder search --- milli/src/search/new/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index a7728bc5a..125752944 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -118,8 +118,8 @@ fn get_ranking_rules_for_placeholder_search<'ctx>( let mut ranking_rules: Vec> = vec![]; let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; for rr in settings_ranking_rules { - // Add Words before any of: typo, proximity, attribute, exactness match rr { + // These rules need a query to have an effect; ignore them in placeholder search crate::Criterion::Words | crate::Criterion::Typo | crate::Criterion::Attribute From 5ac129bfa161a0aceda88382ace169c115c43f32 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 29 Mar 2023 15:20:22 +0200 Subject: [PATCH 104/234] Mark geosearch as currently unimplemented for sort rule --- milli/src/search/new/mod.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 125752944..de7a884e3 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -35,8 +35,8 @@ use words::Words; use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::sort::Sort; use crate::{ - AscDesc, CriterionError, Filter, Index, MatchingWords, Member, Result, SearchResult, - TermsMatchingStrategy, + AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy, + UserError, }; /// A structure used throughout the execution of a search query. @@ -271,10 +271,9 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( desc.insert(field_name.clone()); Sort::new(ctx.index, ctx.txn, field_name, false)? 
} + // geosearch _ => { todo!() } }; ranking_rules.push(Box::new(sort_ranking_rule)); } Ok(()) } From d74134ce3a550459556979c505edcafa255e1ea9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 29 Mar 2023 15:21:54 +0200 Subject: [PATCH 105/234] Check sort criteria --- milli/src/search/new/mod.rs | 44 +++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index de7a884e3..ef3f6c047 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -320,6 +320,8 @@ pub fn execute_search( let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; let graph = QueryGraph::from_query(ctx, query_terms)?; + check_sort_criteria(ctx, sort_criteria.as_ref())?; + universe = resolve_maximally_reduced_query_graph( ctx, &universe, @@ -353,3 +355,45 @@ pub fn execute_search( documents_ids, }) } + +fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>>) -> Result<()> { + let sort_criteria = if let Some(sort_criteria) = sort_criteria { + sort_criteria + } else { + return Ok(()); + }; + + if sort_criteria.is_empty() { + return Ok(()); + } + + // We check that the sort ranking rule exists and throw an + // error if we try to use it when it doesn't. + let sort_ranking_rule_missing = !ctx.index.criteria(ctx.txn)?.contains(&crate::Criterion::Sort); + if sort_ranking_rule_missing { + return Err(UserError::SortRankingRuleMissing.into()); + } + + // We check that we are allowed to use the sort criteria, i.e. + // that they are declared in the sortable fields. + let sortable_fields = ctx.index.sortable_fields(ctx.txn)?; + for asc_desc in sort_criteria { + match asc_desc.member() { + Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => { + return Err(UserError::InvalidSortableAttribute { + field: field.to_string(), + valid_fields: sortable_fields.into_iter().collect(), + })? + } + Member::Geo(_) if !sortable_fields.contains("_geo") => { + return Err(UserError::InvalidSortableAttribute { + field: "_geo".to_string(), + valid_fields: sortable_fields.into_iter().collect(), + })? + } + _ => (), + } + } + + Ok(()) +} From 3a818c5e878054e5d9715de6a4fbb4300fd5a78d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 09:56:18 +0200 Subject: [PATCH 106/234] Add more functionality to interners --- milli/src/search/new/interner.rs | 65 ++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index b8f54d087..e9bfbef86 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -4,6 +4,8 @@ use std::marker::PhantomData; use fxhash::FxHashMap; +use super::small_bitmap::SmallBitmap; + /// An index within an interner ([`FixedSizeInterner`], [`DedupInterner`], or [`MappedInterner`]).
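 /// Since the index is a `u16`, an interner can never hold more than `u16::MAX` values; `Interner::push` below asserts this.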
pub struct Interned { idx: u16, @@ -86,6 +88,13 @@ impl FixedSizeInterner { pub fn from_vec(store: Vec) -> Self { Self { stable_store: store } } + pub fn all_interned_values(&self) -> SmallBitmap { + let mut b = SmallBitmap::for_interned_values_in(self); + for i in self.indexes() { + b.insert(i); + } + b + } pub fn get(&self, interned: Interned) -> &T { &self.stable_store[interned.idx as usize] } @@ -96,12 +105,68 @@ impl FixedSizeInterner { pub fn len(&self) -> u16 { self.stable_store.len() as u16 } + pub fn map_move(self, map_f: impl Fn(T) -> U) -> FixedSizeInterner { + FixedSizeInterner { stable_store: self.stable_store.into_iter().map(map_f).collect() } + } pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { MappedInterner { stable_store: self.stable_store.iter().map(map_f).collect(), _phantom: PhantomData, } } + pub fn map_indexes(&self, map_f: impl Fn(Interned) -> U) -> MappedInterner { + MappedInterner { stable_store: self.indexes().map(map_f).collect(), _phantom: PhantomData } + } + pub fn indexes(&self) -> impl Iterator> { + (0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16)) + } + pub fn iter(&self) -> impl Iterator, &T)> { + self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x)) + } + pub fn iter_mut(&mut self) -> impl Iterator, &mut T)> { + self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x)) + } +} + +/// A fixed-length store for values of type `T`, where each value is identified +/// by an index of type [`Interned`]. +#[derive(Clone)] +pub struct Interner { + stable_store: Vec, +} +impl Default for Interner { + fn default() -> Self { + Self { stable_store: vec![] } + } +} + +impl Interner { + pub fn from_vec(v: Vec) -> Self { + Self { stable_store: v } + } + pub fn get(&self, interned: Interned) -> &T { + &self.stable_store[interned.idx as usize] + } + pub fn get_mut(&mut self, interned: Interned) -> &mut T { + &mut self.stable_store[interned.idx as usize] + } + pub fn push(&mut self, value: T) -> Interned { + assert!(self.stable_store.len() < u16::MAX as usize); + self.stable_store.push(value); + Interned::from_raw(self.stable_store.len() as u16 - 1) + } + pub fn len(&self) -> u16 { + self.stable_store.len() as u16 + } + pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { + MappedInterner { + stable_store: self.stable_store.iter().map(map_f).collect(), + _phantom: PhantomData, + } + } + pub fn map_indexes(&self, map_f: impl Fn(Interned) -> U) -> MappedInterner { + MappedInterner { stable_store: self.indexes().map(map_f).collect(), _phantom: PhantomData } + } pub fn indexes(&self) -> impl Iterator> { (0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16)) } From c2b025946aa6dd64aebc1dfe8e9f591b0bd79290 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 30 Mar 2023 10:42:16 +0200 Subject: [PATCH 107/234] `located_query_terms_from_string`: use u16 for positions, hard limit number of iterated tokens. 
- Refactor phrase logic to reduce number of possible states --- milli/src/search/new/query_graph.rs | 2 +- milli/src/search/new/query_term.rs | 151 ++++++++++++++++++---------- milli/src/search/new/words.rs | 4 +- 3 files changed, 102 insertions(+), 55 deletions(-) diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 0f06b9b95..acadb508c 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -267,7 +267,7 @@ impl QueryGraph { /// Remove all the nodes that correspond to a word starting at the given position, and connect /// the predecessors of these nodes to their successors. /// Return `true` if any node was removed. - pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool { + pub fn remove_words_starting_at_position(&mut self, position: u16) -> bool { let mut nodes_to_remove_keeping_edges = vec![]; for (node_idx, node) in self.nodes.iter() { let QueryNodeData::Term(LocatedQueryTerm { value: _, positions }) = &node.data else { continue }; diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 0850b2181..049b96646 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -1,5 +1,4 @@ use std::collections::HashSet; -use std::mem; use std::ops::RangeInclusive; use charabia::normalizer::NormalizedTokenIter; @@ -414,8 +413,7 @@ impl QueryTerm { #[derive(Clone)] pub struct LocatedQueryTerm { pub value: Interned, - // TODO: consider changing to u8, or even a u16 - pub positions: RangeInclusive, + pub positions: RangeInclusive, } impl LocatedQueryTerm { @@ -425,9 +423,55 @@ impl LocatedQueryTerm { } } +struct PhraseBuilder { + words: Vec>>, + start: u16, + end: u16, +} + +impl PhraseBuilder { + fn empty() -> Self { + Self { words: Default::default(), start: u16::MAX, end: u16::MAX } + } + + fn is_empty(&self) -> bool { + self.words.is_empty() + } + + // precondition: token has kind Word or StopWord + fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) { + if self.is_empty() { + self.start = position; + } + self.end = position; + if let TokenKind::StopWord = token.kind { + self.words.push(None); + } else { + // token has kind Word + let word = ctx.word_interner.insert(token.lemma().to_string()); + // TODO: in a phrase, check that every word exists + // otherwise return an empty term + self.words.push(Some(word)); + } + } + + fn build(self, ctx: &mut SearchContext) -> Option { + if self.is_empty() { + return None; + } + Some(LocatedQueryTerm { + value: ctx.term_interner.insert(QueryTerm::phrase( + &mut ctx.word_interner, + &mut ctx.phrase_interner, + Phrase { words: self.words }, + )), + positions: self.start..=self.end, + }) + } +} + /// Convert the tokenised search query into a list of located query terms. // TODO: checking if the positions are correct for phrases, separators, ngrams -// hard-limit the number of tokens that are considered pub fn located_query_terms_from_string( ctx: &mut SearchContext, query: NormalizedTokenIter<&[u8]>, @@ -437,16 +481,17 @@ pub fn located_query_terms_from_string( let mut located_terms = Vec::new(); - let mut phrase = Vec::new(); - let mut quoted = false; + let mut phrase: Option = None; let parts_limit = words_limit.unwrap_or(usize::MAX); - let mut position = -1i8; - let mut phrase_start = -1i8; - let mut phrase_end = -1i8; + // start with the last position as we will wrap around to position 0 at the beginning of the loop below. 
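+ // (u16 arithmetic wraps explicitly here: u16::MAX.wrapping_add(1) == 0, so the first word token is assigned position 0.)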
+ let mut position = u16::MAX; - let mut peekable = query.peekable(); + // TODO: Loic, find proper value here so we don't overflow the interner. + const MAX_TOKEN_COUNT: usize = 1_000; + + let mut peekable = query.take(MAX_TOKEN_COUNT).peekable(); while let Some(token) = peekable.next() { // early return if word limit is exceeded if located_terms.len() >= parts_limit { @@ -455,23 +500,14 @@ pub fn located_query_terms_from_string( match token.kind { TokenKind::Word | TokenKind::StopWord => { - position += 1; + // On first loop, goes from u16::MAX to 0, then normal increment. + position = position.wrapping_add(1); + // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. - if quoted { - phrase_end = position; - if phrase.is_empty() { - phrase_start = position; - } - if let TokenKind::StopWord = token.kind { - phrase.push(None); - } else { - let word = ctx.word_interner.insert(token.lemma().to_string()); - // TODO: in a phrase, check that every word exists - // otherwise return an empty term - phrase.push(Some(word)); - } + if let Some(phrase) = &mut phrase { + phrase.push_word(ctx, &token, position) } else if peekable.peek().is_some() { match token.kind { TokenKind::Word => { @@ -505,41 +541,52 @@ pub fn located_query_terms_from_string( position += 0; } } - let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - // swap quoted state if we encounter a double quote - if quote_count % 2 != 0 { - quoted = !quoted; - } - // if there is a quote or a hard separator we close the phrase. - // TODO: limit phrase size? - if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) - { - let located_query_term = LocatedQueryTerm { - value: ctx.term_interner.insert(QueryTerm::phrase( - &mut ctx.word_interner, - &mut ctx.phrase_interner, - Phrase { words: mem::take(&mut phrase) }, - )), - positions: phrase_start..=phrase_end, + + phrase = 'phrase: { + let phrase = phrase.take(); + + // If we have a hard separator inside a phrase, we immediately start a new phrase + let phrase = if separator_kind == SeparatorKind::Hard { + if let Some(phrase) = phrase { + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term) + } + Some(PhraseBuilder::empty()) + } else { + None + } + } else { + phrase }; - located_terms.push(located_query_term); - } + + // We close and start a new phrase depending on the number of double quotes + let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + if quote_count == 0 { + break 'phrase phrase; + } + + // Consume the closing quote and the phrase + if let Some(phrase) = phrase { + // Per the check above, quote_count > 0 + quote_count -= 1; + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term) + } + } + + // Start new phrase if the token ends with an opening quote + (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) + }; } _ => (), } } // If a quote is never closed, we consider all of the end of the query as a phrase. 
- if !phrase.is_empty() { - let located_query_term = LocatedQueryTerm { - value: ctx.term_interner.insert(QueryTerm::phrase( - &mut ctx.word_interner, - &mut ctx.phrase_interner, - Phrase { words: mem::take(&mut phrase) }, - )), - positions: phrase_start..=phrase_end, - }; - located_terms.push(located_query_term); + if let Some(phrase) = phrase.take() { + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term); + } } Ok(located_terms) diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index fb2c62f11..ba8086c00 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -12,7 +12,7 @@ pub struct Words { exhausted: bool, // TODO: remove query_graph: Option, iterating: bool, // TODO: remove - positions_to_remove: Vec, + positions_to_remove: Vec, terms_matching_strategy: TermsMatchingStrategy, } impl Words { @@ -52,7 +52,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {} } } - let mut r: Vec = all_positions.into_iter().collect(); + let mut r: Vec = all_positions.into_iter().collect(); // don't remove the first term r.remove(0); r From 9507ff5e311d7b6089e572d4cb0a4718264c4f2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 09:52:47 +0200 Subject: [PATCH 108/234] Update query term structure to allow for laziness --- .../search/new/graph_based_ranking_rule.rs | 2 +- milli/src/search/new/mod.rs | 3 +- milli/src/search/new/query_term.rs | 924 ++++++++++++------ .../search/new/ranking_rule_graph/typo/mod.rs | 2 +- 4 files changed, 625 insertions(+), 306 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 194e62c30..db4310815 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -373,7 +373,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase if new_term.is_empty() { nodes_to_remove.push(node_id); } else { - term.value = ctx.term_interner.insert(new_term); + term.value = ctx.term_interner.push(new_term); } } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index ef3f6c047..4f50fcd29 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -32,6 +32,7 @@ use resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; use roaring::RoaringBitmap; use words::Words; +use self::interner::Interner; use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::sort::Sort; use crate::{ @@ -46,7 +47,7 @@ pub struct SearchContext<'ctx> { pub db_cache: DatabaseCache<'ctx>, pub word_interner: DedupInterner, pub phrase_interner: DedupInterner, - pub term_interner: DedupInterner, + pub term_interner: Interner, // think about memory usage of that field (roaring bitmaps in a hashmap) pub term_docids: QueryTermDocIdsCache, } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 0850b2181..1e9b2852c 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -1,4 +1,5 @@ -use std::collections::HashSet; +use std::borrow::Cow; +use std::collections::BTreeSet; use std::mem; use std::ops::RangeInclusive; @@ -28,143 +29,341 @@ impl Phrase { } } -/// A structure storing all the different ways to match -/// a term in the user's search query. 
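+/// A value that may or may not have been computed yet.
+///
+/// The one-typo and two-typo subterms of a `QueryTerm` start out as `Lazy::Uninit`
+/// and are only filled in by `compute_fully_if_needed` once a ranking rule needs them.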
+#[derive(Clone, PartialEq, Eq, Hash)] +pub enum Lazy { + Uninit, + Init(T), +} +impl Lazy { + pub fn is_init(&self) -> bool { + match self { + Lazy::Uninit => false, + Lazy::Init(_) => true, + } + } + pub fn is_uninit(&self) -> bool { + match self { + Lazy::Uninit => true, + Lazy::Init(_) => false, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum NTypoTermSubset { + All, + Subset { + words: BTreeSet>, + phrases: BTreeSet>, + // TODO: prefixes: BTreeSet>, + }, + Nothing, +} + +impl NTypoTermSubset { + pub fn contains_word(&self, word: Interned) -> bool { + match self { + NTypoTermSubset::All => true, + NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word), + NTypoTermSubset::Nothing => false, + } + } + pub fn contains_phrase(&self, phrase: Interned) -> bool { + match self { + NTypoTermSubset::All => true, + NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase), + NTypoTermSubset::Nothing => false, + } + } + pub fn is_empty(&self) -> bool { + match self { + NTypoTermSubset::All => false, + NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(), + NTypoTermSubset::Nothing => true, + } + } + pub fn union(&mut self, other: &Self) { + match self { + Self::All => {} + Self::Subset { words, phrases } => match other { + Self::All => { + *self = Self::All; + } + Self::Subset { words: w2, phrases: p2 } => { + words.extend(w2); + phrases.extend(p2); + } + Self::Nothing => {} + }, + Self::Nothing => { + *self = other.clone(); + } + } + } + pub fn intersect(&mut self, other: &Self) { + match self { + Self::All => *self = other.clone(), + Self::Subset { words, phrases } => match other { + Self::All => {} + Self::Subset { words: w2, phrases: p2 } => { + let mut ws = BTreeSet::new(); + for w in words.intersection(w2) { + ws.insert(*w); + } + let mut ps = BTreeSet::new(); + for p in phrases.intersection(p2) { + ps.insert(*p); + } + *words = ws; + *phrases = ps; + } + Self::Nothing => *self = Self::Nothing, + }, + Self::Nothing => {} + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct QueryTermSubset { + pub original: Interned, + pub zero_typo_subset: NTypoTermSubset, + pub one_typo_subset: NTypoTermSubset, + pub two_typo_subset: NTypoTermSubset, +} + +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct LocatedQueryTermSubset { + pub term_subset: QueryTermSubset, + pub positions: RangeInclusive, + pub term_ids: RangeInclusive, +} + +impl QueryTermSubset { + pub fn empty(for_term: Interned) -> Self { + Self { + original: for_term, + zero_typo_subset: NTypoTermSubset::Nothing, + one_typo_subset: NTypoTermSubset::Nothing, + two_typo_subset: NTypoTermSubset::Nothing, + } + } + pub fn full(for_term: Interned) -> Self { + Self { + original: for_term, + zero_typo_subset: NTypoTermSubset::All, + one_typo_subset: NTypoTermSubset::All, + two_typo_subset: NTypoTermSubset::All, + } + } + + pub fn union(&mut self, other: &Self) { + assert!(self.original == other.original); + self.zero_typo_subset.union(&other.zero_typo_subset); + self.one_typo_subset.union(&other.one_typo_subset); + self.two_typo_subset.union(&other.two_typo_subset); + } + pub fn intersect(&mut self, other: &Self) { + assert!(self.original == other.original); + self.zero_typo_subset.intersect(&other.zero_typo_subset); + self.one_typo_subset.intersect(&other.one_typo_subset); + self.two_typo_subset.intersect(&other.two_typo_subset); + } + + pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option> { + let original = 
ctx.term_interner.get(self.original); + let Some(use_prefix_db) = original.zero_typo.use_prefix_db else { + return None + }; + match &self.zero_typo_subset { + NTypoTermSubset::All => Some(use_prefix_db), + NTypoTermSubset::Subset { words, phrases: _ } => { + // TODO: use a subset of prefix words instead + if words.contains(&use_prefix_db) { + Some(use_prefix_db) + } else { + None + } + } + NTypoTermSubset::Nothing => None, + } + } + pub fn all_single_words_except_prefix_db( + &self, + ctx: &mut SearchContext, + ) -> Result<BTreeSet<Interned<String>>> { + let original = ctx.term_interner.get_mut(self.original); + let mut result = BTreeSet::default(); + // TODO: a compute_partially function + if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { + original.compute_fully_if_needed( + ctx.index, + ctx.txn, + &mut ctx.word_interner, + &mut ctx.phrase_interner, + )?; + } + + if !self.zero_typo_subset.is_empty() { + let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = + &original.zero_typo; + result.extend(zero_typo.iter().copied()); + result.extend(prefix_of.iter().copied()); + }; + + match &self.one_typo_subset { + NTypoTermSubset::All => { + let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { + panic!() + }; + result.extend(one_typo.iter().copied()) + } + NTypoTermSubset::Subset { words, phrases: _ } => { + let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { + panic!() + }; + result.extend(one_typo.intersection(words)); + } + NTypoTermSubset::Nothing => {} + }; + + match &self.two_typo_subset { + NTypoTermSubset::All => { + let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { + panic!() + }; + result.extend(two_typos.iter().copied()); + } + NTypoTermSubset::Subset { words, phrases: _ } => { + let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { + panic!() + }; + result.extend(two_typos.intersection(words)); + } + NTypoTermSubset::Nothing => {} + }; + + Ok(result) + } + pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> { + let original = ctx.term_interner.get_mut(self.original); + let mut result = BTreeSet::default(); + + if !self.one_typo_subset.is_empty() { + // TODO: compute less than fully if possible + original.compute_fully_if_needed( + ctx.index, + ctx.txn, + &mut ctx.word_interner, + &mut ctx.phrase_interner, + )?; + } + + let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = + &original.zero_typo; + result.extend(phrase.iter().copied()); + result.extend(synonyms.iter().copied()); + + if !self.one_typo_subset.is_empty() { + let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else { + panic!(); + }; + result.extend(split_words.iter().copied()); + } + + Ok(result) + } +} + +impl QueryTerm { + pub fn compute_fully_if_needed( + &mut self, + index: &Index, + txn: &RoTxn, + word_interner: &mut DedupInterner<String>, + phrase_interner: &mut DedupInterner<Phrase>, + ) -> Result<()> { + if self.max_nbr_typos == 0 { + self.one_typo = Lazy::Init(OneTypoTerm::default()); + self.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if self.max_nbr_typos == 1 && self.one_typo.is_uninit() { + assert!(self.two_typo.is_uninit()); + self.initialize_one_typo_subterm(index, txn, word_interner, phrase_interner)?; + assert!(self.one_typo.is_init()); + self.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if self.max_nbr_typos > 1 && self.two_typo.is_uninit() { + assert!(self.two_typo.is_uninit());
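+ // Neither lazy subterm has been computed yet: derive the one-typo and two-typo words together, in a single pass over the words FST.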
self.initialize_one_and_two_typo_subterm(index, txn, word_interner, phrase_interner)?; + assert!(self.one_typo.is_init() && self.two_typo.is_init()); + } + Ok(()) + } +} + #[derive(Clone, PartialEq, Eq, Hash)] pub struct QueryTerm { - /// The original terms, for debugging purposes pub original: Interned, - /// Whether the term is an ngram - pub is_ngram: bool, - /// Whether the term can be only the prefix of a word + pub is_multiple_words: bool, + pub max_nbr_typos: u8, pub is_prefix: bool, + pub zero_typo: ZeroTypoTerm, + // May not be computed yet + pub one_typo: Lazy, + // May not be computed yet + pub two_typo: Lazy, +} + +// SubTerms will be in a dedup interner +#[derive(Default, Clone, PartialEq, Eq, Hash)] +pub struct ZeroTypoTerm { /// The original phrase, if any pub phrase: Option>, /// A single word equivalent to the original term, with zero typos pub zero_typo: Option>, /// All the words that contain the original word as prefix - pub prefix_of: Box<[Interned]>, + pub prefix_of: BTreeSet>, /// All the synonyms of the original word or phrase - pub synonyms: Box<[Interned]>, - - /// The original word split into multiple consecutive words - pub split_words: Option>, - - /// Words that are 1 typo away from the original word - pub one_typo: Box<[Interned]>, - - /// Words that are 2 typos away from the original word - pub two_typos: Box<[Interned]>, - + pub synonyms: BTreeSet>, /// A prefix in the prefix databases matching the original word pub use_prefix_db: Option>, } -impl QueryTerm { - pub fn removing_forbidden_terms( - &self, - allowed_words: &HashSet>, - allowed_phrases: &HashSet>, - ) -> Option { - let QueryTerm { - original, - is_ngram, - is_prefix, - phrase, - zero_typo, - prefix_of, - synonyms, - split_words, - one_typo, - two_typos, - use_prefix_db, - } = self; +#[derive(Default, Clone, PartialEq, Eq, Hash)] +pub struct OneTypoTerm { + /// The original word split into multiple consecutive words + pub split_words: Option>, + /// Words that are 1 typo away from the original word + pub one_typo: BTreeSet>, +} +#[derive(Default, Clone, PartialEq, Eq, Hash)] +pub struct TwoTypoTerm { + /// Words that are 2 typos away from the original word + pub two_typos: BTreeSet>, +} - let mut changed = false; - - let mut new_zero_typo = None; - if let Some(w) = zero_typo { - if allowed_words.contains(w) { - new_zero_typo = Some(*w); - } else { - changed = true; - } - } - // TODO: this is incorrect, prefix DB stuff should be treated separately - let mut new_use_prefix_db = None; - if let Some(w) = use_prefix_db { - if allowed_words.contains(w) { - new_use_prefix_db = Some(*w); - } else { - changed = true; - } - } - let mut new_prefix_of = vec![]; - for w in prefix_of.iter() { - if allowed_words.contains(w) { - new_prefix_of.push(*w); - } else { - changed = true; - } - } - let mut new_one_typo = vec![]; - for w in one_typo.iter() { - if allowed_words.contains(w) { - new_one_typo.push(*w); - } else { - changed = true; - } - } - let mut new_two_typos = vec![]; - for w in two_typos.iter() { - if allowed_words.contains(w) { - new_two_typos.push(*w); - } else { - changed = true; - } - } - // TODO: this is incorrect, prefix DB stuff should be treated separately - let mut new_phrase = None; - if let Some(w) = phrase { - if !allowed_phrases.contains(w) { - new_phrase = Some(*w); - } else { - changed = true; - } - } - let mut new_split_words = None; - if let Some(w) = split_words { - if allowed_phrases.contains(w) { - new_split_words = Some(*w); - } else { - changed = true; - } - } - let mut 
new_synonyms = vec![]; - for w in synonyms.iter() { - if allowed_phrases.contains(w) { - new_synonyms.push(*w); - } else { - changed = true; - } - } - if changed { - Some(QueryTerm { - original: *original, - is_ngram: *is_ngram, - is_prefix: *is_prefix, - phrase: new_phrase, - zero_typo: new_zero_typo, - prefix_of: new_prefix_of.into_boxed_slice(), - synonyms: new_synonyms.into_boxed_slice(), - split_words: new_split_words, - one_typo: new_one_typo.into_boxed_slice(), - two_typos: new_two_typos.into_boxed_slice(), - use_prefix_db: new_use_prefix_db, - }) - } else { - None - } +impl ZeroTypoTerm { + fn is_empty(&self) -> bool { + let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self; + phrase.is_none() + && zero_typo.is_none() + && prefix_of.is_empty() + && synonyms.is_empty() + && use_prefix_db.is_none() } +} +impl OneTypoTerm { + fn is_empty(&self) -> bool { + let OneTypoTerm { split_words, one_typo } = self; + one_typo.is_empty() && split_words.is_none() + } +} +impl TwoTypoTerm { + fn is_empty(&self) -> bool { + let TwoTypoTerm { two_typos } = self; + two_typos.is_empty() + } +} + +impl QueryTerm { pub fn phrase( word_interner: &mut DedupInterner, phrase_interner: &mut DedupInterner, @@ -172,76 +371,172 @@ impl QueryTerm { ) -> Self { Self { original: word_interner.insert(phrase.description(word_interner)), - phrase: Some(phrase_interner.insert(phrase)), + is_multiple_words: false, + max_nbr_typos: 0, is_prefix: false, - zero_typo: None, - prefix_of: Box::new([]), - synonyms: Box::new([]), - split_words: None, - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: None, - is_ngram: false, + zero_typo: ZeroTypoTerm { + phrase: Some(phrase_interner.insert(phrase)), + zero_typo: None, + prefix_of: BTreeSet::default(), + synonyms: BTreeSet::default(), + use_prefix_db: None, + }, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, } } pub fn empty(word_interner: &mut DedupInterner, original: &str) -> Self { Self { original: word_interner.insert(original.to_owned()), - phrase: None, + is_multiple_words: false, is_prefix: false, - zero_typo: None, - prefix_of: Box::new([]), - synonyms: Box::new([]), - split_words: None, - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: None, - is_ngram: false, + max_nbr_typos: 0, + zero_typo: <_>::default(), + one_typo: Lazy::Init(<_>::default()), + two_typo: Lazy::Init(<_>::default()), } } - /// Return an iterator over all the single words derived from the original word. - /// - /// This excludes synonyms, split words, and words stored in the prefix databases. - pub fn all_single_words_except_prefix_db( - &'_ self, - ) -> impl Iterator> + Clone + '_ { - self.zero_typo - .iter() - .chain(self.prefix_of.iter()) - .chain(self.one_typo.iter()) - .chain(self.two_typos.iter()) - .copied() - } - /// Return an iterator over all the single words derived from the original word. - /// - /// This excludes synonyms, split words, and words stored in the prefix databases. 
- pub fn all_phrases(&'_ self) -> impl Iterator> + Clone + '_ { - self.split_words.iter().chain(self.synonyms.iter()).copied() - } + pub fn is_empty(&self) -> bool { - self.zero_typo.is_none() - && self.one_typo.is_empty() - && self.two_typos.is_empty() - && self.prefix_of.is_empty() - && self.synonyms.is_empty() - && self.split_words.is_none() - && self.use_prefix_db.is_none() + let Lazy::Init(one_typo) = &self.one_typo else { + return false; + }; + let Lazy::Init(two_typo) = &self.two_typo else { + return false; + }; + + self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty() } } -/// Compute the query term for the given word -pub fn query_term_from_word( +pub enum ZeroOrOneTypo { + Zero, + One, +} + +fn find_zero_typo_prefix_derivations( + word_interned: Interned, + fst: fst::Set>, + word_interner: &mut DedupInterner, + mut visit: impl FnMut(Interned) -> Result<()>, +) -> Result<()> { + let word = word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + let prefix = Str::new(word).starts_with(); + let mut stream = fst.search(prefix).into_stream(); + + while let Some(derived_word) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?.to_owned(); + let derived_word_interned = word_interner.insert(derived_word); + if derived_word_interned != word_interned { + visit(derived_word_interned)?; + } + } + Ok(()) +} + +fn find_zero_one_typo_derivations( + word_interned: Interned, + is_prefix: bool, + fst: fst::Set>, + word_interner: &mut DedupInterner, + mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result<()>, +) -> Result<()> { + let word = word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + + let dfa = build_dfa(word, 1, is_prefix); + let starts = StartsWith(Str::new(get_first(word))); + let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); + // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them? + + while let Some((derived_word, state)) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?; + let derived_word = word_interner.insert(derived_word.to_owned()); + let d = dfa.distance(state.1); + match d.to_u8() { + 0 => { + if derived_word != word_interned { + visit(derived_word, ZeroOrOneTypo::Zero)?; + } + } + 1 => { + visit(derived_word, ZeroOrOneTypo::One)?; + } + _ => panic!(), + } + } + Ok(()) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NumberOfTypos { + Zero, + One, + Two, +} +fn find_zero_one_two_typo_derivations( + word_interned: Interned, + is_prefix: bool, + fst: fst::Set>, + word_interner: &mut DedupInterner, + mut visit: impl FnMut(Interned, NumberOfTypos) -> Result<()>, +) -> Result<()> { + let word = word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + + let starts = StartsWith(Str::new(get_first(word))); + let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); + let second_dfa = build_dfa(word, 2, is_prefix); + let second = Intersection(&second_dfa, &starts); + let automaton = Union(first, &second); + + let mut stream = fst.search_with_state(automaton).into_stream(); + // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them? 
+ + while let Some((derived_word, state)) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?; + let derived_word_interned = word_interner.insert(derived_word.to_owned()); + // in the case the typo is on the first letter, we know the number of typo + // is two + if get_first(derived_word) != get_first(word) { + visit(derived_word_interned, NumberOfTypos::Two)?; + } else { + // Else, we know that it is the second dfa that matched and compute the + // correct distance + let d = second_dfa.distance((state.1).0); + match d.to_u8() { + 0 => { + if derived_word_interned != word_interned { + visit(derived_word_interned, NumberOfTypos::Zero)?; + } + } + 1 => { + visit(derived_word_interned, NumberOfTypos::One)?; + } + 2 => { + visit(derived_word_interned, NumberOfTypos::Two)?; + } + _ => panic!(), + } + } + } + Ok(()) +} + +fn partially_initialized_term_from_word( ctx: &mut SearchContext, word: &str, max_typo: u8, is_prefix: bool, ) -> Result { + let word_interned = ctx.word_interner.insert(word.to_owned()); + if word.len() > MAX_WORD_LENGTH { return Ok(QueryTerm::empty(&mut ctx.word_interner, word)); } let fst = ctx.index.words_fst(ctx.txn)?; - let word_interned = ctx.word_interner.insert(word.to_owned()); let use_prefix_db = is_prefix && ctx @@ -253,94 +548,23 @@ pub fn query_term_from_word( let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None }; let mut zero_typo = None; - let mut prefix_of = vec![]; - let mut one_typo = vec![]; - let mut two_typos = vec![]; + let mut prefix_of = BTreeSet::new(); if fst.contains(word) { zero_typo = Some(word_interned); } - if max_typo == 0 { - if is_prefix && use_prefix_db.is_none() { - let prefix = Str::new(word).starts_with(); - let mut stream = fst.search(prefix).into_stream(); - - while let Some(derived_word) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?.to_owned(); - let derived_word_interned = ctx.word_interner.insert(derived_word); - if derived_word_interned != word_interned { - prefix_of.push(derived_word_interned); - } - } - } - } else if max_typo == 1 { - let dfa = build_dfa(word, 1, is_prefix); - let starts = StartsWith(Str::new(get_first(word))); - let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); - // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them? - - while let Some((derived_word, state)) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?; - - let d = dfa.distance(state.1); - let derived_word_interned = ctx.word_interner.insert(derived_word.to_owned()); - match d.to_u8() { - 0 => { - if derived_word_interned != word_interned { - prefix_of.push(derived_word_interned); - } - } - 1 => { - one_typo.push(derived_word_interned); - } - _ => panic!(), - } - } - } else { - let starts = StartsWith(Str::new(get_first(word))); - let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); - let second_dfa = build_dfa(word, 2, is_prefix); - let second = Intersection(&second_dfa, &starts); - let automaton = Union(first, &second); - - let mut stream = fst.search_with_state(automaton).into_stream(); - // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them? 
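+ // The union automaton built above matches (a) one-typo words whose first letter changed, counted as two typos by the loop below, and (b) words within distance 2 that keep the first letter, whose exact distance is read back from the second DFA's state.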
- - while let Some((derived_word, state)) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?; - let derived_word_interned = ctx.word_interner.insert(derived_word.to_owned()); - // in the case the typo is on the first letter, we know the number of typo - // is two - if get_first(derived_word) != get_first(word) { - two_typos.push(derived_word_interned); - } else { - // Else, we know that it is the second dfa that matched and compute the - // correct distance - let d = second_dfa.distance((state.1).0); - match d.to_u8() { - 0 => { - if derived_word_interned != word_interned { - prefix_of.push(derived_word_interned); - } - } - 1 => { - one_typo.push(derived_word_interned); - } - 2 => { - two_typos.push(derived_word_interned); - } - _ => panic!(), - } - } - } + if is_prefix && use_prefix_db.is_none() { + find_zero_typo_prefix_derivations( + word_interned, + fst, + &mut ctx.word_interner, + |derived_word| { + prefix_of.insert(derived_word); + Ok(()) + }, + )?; } - let split_words = split_best_frequency(ctx.index, ctx.txn, word)?.map(|(l, r)| { - ctx.phrase_interner.insert(Phrase { - words: vec![Some(ctx.word_interner.insert(l)), Some(ctx.word_interner.insert(r))], - }) - }); - let synonyms = ctx.index.synonyms(ctx.txn)?; let synonyms = synonyms @@ -353,22 +577,118 @@ pub fn query_term_from_word( ctx.phrase_interner.insert(Phrase { words }) }) .collect(); + let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db }; Ok(QueryTerm { original: word_interned, - phrase: None, + is_multiple_words: false, + max_nbr_typos: max_typo, is_prefix, zero_typo, - prefix_of: prefix_of.into_boxed_slice(), - synonyms, - split_words, - one_typo: one_typo.into_boxed_slice(), - two_typos: two_typos.into_boxed_slice(), - use_prefix_db, - is_ngram: false, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, }) } +fn find_split_words( + index: &Index, + txn: &RoTxn, + word_interner: &mut DedupInterner, + phrase_interner: &mut DedupInterner, + word: &str, +) -> Result>> { + let split_words = split_best_frequency(index, txn, word)?.map(|(l, r)| { + phrase_interner.insert(Phrase { + words: vec![Some(word_interner.insert(l)), Some(word_interner.insert(r))], + }) + }); + Ok(split_words) +} + +impl QueryTerm { + fn initialize_one_typo_subterm( + &mut self, + index: &Index, + txn: &RoTxn, + word_interner: &mut DedupInterner, + phrase_interner: &mut DedupInterner, + ) -> Result<()> { + let QueryTerm { original, is_prefix, one_typo, .. } = self; + let original_str = word_interner.get(*original).to_owned(); + if one_typo.is_init() { + return Ok(()); + } + let mut one_typo_words = BTreeSet::new(); + + find_zero_one_typo_derivations( + *original, + *is_prefix, + index.words_fst(txn)?, + word_interner, + |derived_word, nbr_typos| { + match nbr_typos { + ZeroOrOneTypo::Zero => {} + ZeroOrOneTypo::One => { + one_typo_words.insert(derived_word); + } + } + Ok(()) + }, + )?; + let split_words = + find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?; + let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; + + self.one_typo = Lazy::Init(one_typo); + + Ok(()) + } + fn initialize_one_and_two_typo_subterm( + &mut self, + index: &Index, + txn: &RoTxn, + word_interner: &mut DedupInterner, + phrase_interner: &mut DedupInterner, + ) -> Result<()> { + let QueryTerm { original, is_prefix, two_typo, .. 
} = self; + let original_str = word_interner.get(*original).to_owned(); + if two_typo.is_init() { + return Ok(()); + } + let mut one_typo_words = BTreeSet::new(); + let mut two_typo_words = BTreeSet::new(); + + find_zero_one_two_typo_derivations( + *original, + *is_prefix, + index.words_fst(txn)?, + word_interner, + |derived_word, nbr_typos| { + match nbr_typos { + NumberOfTypos::Zero => {} + NumberOfTypos::One => { + one_typo_words.insert(derived_word); + } + NumberOfTypos::Two => { + two_typo_words.insert(derived_word); + } + } + Ok(()) + }, + )?; + let split_words = + find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?; + let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words }; + + let two_typo = TwoTypoTerm { two_typos: two_typo_words }; + + self.one_typo = Lazy::Init(one_typo); + self.two_typo = Lazy::Init(two_typo); + + Ok(()) + } +} + /// Split the original word into the two words that appear the /// most next to each other in the index. /// @@ -402,7 +722,7 @@ fn split_best_frequency( impl QueryTerm { /// Return the original word from the given query term pub fn original_single_word(&self) -> Option<Interned<String>> { - if self.phrase.is_some() || self.is_ngram { + if self.is_multiple_words { None } else { Some(self.original) } @@ -413,21 +733,13 @@ impl QueryTerm { /// A query term coupled with its position in the user's search query. #[derive(Clone)] pub struct LocatedQueryTerm { + // should the query term subset really be interned? + // possibly, yes pub value: Interned<QueryTerm>, - // TODO: consider changing to u8, or even a u16 pub positions: RangeInclusive<u16>, } -impl LocatedQueryTerm { - /// Return `true` iff the term is empty - pub fn is_empty(&self, interner: &DedupInterner<QueryTerm>) -> bool { - interner.get(self.value).is_empty() - } -} - /// Convert the tokenised search query into a list of located query terms. -// TODO: checking if the positions are correct for phrases, separators, ngrams -// hard-limit the number of tokens that are considered pub fn located_query_terms_from_string( ctx: &mut SearchContext, query: NormalizedTokenIter<&[u8]>, @@ -476,9 +788,14 @@ pub fn located_query_terms_from_string( match token.kind { TokenKind::Word => { let word = token.lemma(); - let term = query_term_from_word(ctx, word, nbr_typos(word), false)?; + let term = partially_initialized_term_from_word( + ctx, + word, + nbr_typos(word), + false, + )?; let located_term = LocatedQueryTerm { - value: ctx.term_interner.insert(term), + value: ctx.term_interner.push(term), positions: position..=position, }; located_terms.push(located_term); @@ -487,10 +804,10 @@ pub fn located_query_terms_from_string( } } else { let word = token.lemma(); - // eagerly compute all derivations - let term = query_term_from_word(ctx, word, nbr_typos(word), true)?; + let term = + partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?; let located_term = LocatedQueryTerm { - value: ctx.term_interner.insert(term), + value: ctx.term_interner.push(term), positions: position..=position, }; located_terms.push(located_term); @@ -511,11 +828,10 @@ pub fn located_query_terms_from_string( quoted = !quoted; } // if there is a quote or a hard separator we close the phrase. - // TODO: limit phrase size?
if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) { let located_query_term = LocatedQueryTerm { - value: ctx.term_interner.insert(QueryTerm::phrase( + value: ctx.term_interner.push(QueryTerm::phrase( &mut ctx.word_interner, &mut ctx.phrase_interner, Phrase { words: mem::take(&mut phrase) }, @@ -532,7 +848,7 @@ pub fn located_query_terms_from_string( // If a quote is never closed, we consider all of the end of the query as a phrase. if !phrase.is_empty() { let located_query_term = LocatedQueryTerm { - value: ctx.term_interner.insert(QueryTerm::phrase( + value: ctx.term_interner.push(QueryTerm::phrase( &mut ctx.word_interner, &mut ctx.phrase_interner, Phrase { words: mem::take(&mut phrase) }, @@ -600,34 +916,36 @@ pub fn make_ngram( return Ok(None); } - let mut term = query_term_from_word( - ctx, - &ngram_str, - number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8), - is_prefix, - )?; - term.original = ctx.word_interner.insert(words.join(" ")); + let max_nbr_typos = + number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1); + + let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; + + // let (_, mut zero_typo, mut one_typo, two_typo) = + // all_subterms_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; + let original = ctx.word_interner.insert(words.join(" ")); + // Now add the synonyms let index_synonyms = ctx.index.synonyms(ctx.txn)?; - let mut term_synonyms = term.synonyms.to_vec(); - term_synonyms.extend(index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map( - |words| { + + term.zero_typo.synonyms.extend( + index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| { let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); ctx.phrase_interner.insert(Phrase { words }) - }, - )); - term.synonyms = term_synonyms.into_boxed_slice(); - if let Some(split_words) = term.split_words { - let split_words = ctx.phrase_interner.get(split_words); - if split_words.words == words_interned.iter().map(|&i| Some(i)).collect::>() { - term.split_words = None; - } - } - if term.is_empty() { - return Ok(None); - } - term.is_ngram = true; - let term = LocatedQueryTerm { value: ctx.term_interner.insert(term), positions: start..=end }; + }), + ); + + let term = QueryTerm { + original, + is_multiple_words: true, + is_prefix, + max_nbr_typos, + zero_typo: term.zero_typo, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, + }; + + let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end }; Ok(Some(term)) } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index d20523cc9..c27051de0 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -128,7 +128,7 @@ impl RankingRuleGraphTrait for TypoGraph { nbr_typos as u8 + base_cost, Some( conditions_interner - .insert(TypoCondition { term: term_interner.insert(new_term) }), + .insert(TypoCondition { term: term_interner.push(new_term) }), ), )) } From 223e82a10d5f99c89b8e948999d7d97be7052cda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 11:05:51 +0200 Subject: [PATCH 109/234] Update QueryGraph to use new lazy query terms + build from paths --- milli/src/search/new/query_graph.rs | 383 +++++++++++++++++----------- 1 file changed, 241 insertions(+), 142 deletions(-) diff 
--git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 0f06b9b95..8f87dfd8c 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,10 +1,14 @@ -use std::collections::HashSet; - use super::interner::{FixedSizeInterner, Interned}; -use super::query_term::{self, number_of_typos_allowed, LocatedQueryTerm}; +use super::query_term::{ + self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, NTypoTermSubset, + QueryTermSubset, +}; use super::small_bitmap::SmallBitmap; use super::SearchContext; +use crate::search::new::interner::DedupInterner; use crate::Result; +use std::cmp::Ordering; +use std::collections::BTreeMap; /// A node of the [`QueryGraph`]. /// @@ -21,9 +25,9 @@ pub struct QueryNode { pub predecessors: SmallBitmap, pub successors: SmallBitmap, } -#[derive(Clone)] +#[derive(Clone, PartialEq, Eq, Hash)] pub enum QueryNodeData { - Term(LocatedQueryTerm), + Term(LocatedQueryTermSubset), Deleted, Start, End, @@ -83,51 +87,15 @@ pub struct QueryGraph { pub nodes: FixedSizeInterner, } -// impl Default for QueryGraph { -// /// Create a new QueryGraph with two disconnected nodes: the root and end nodes. -// fn default() -> Self { -// let nodes = vec![ -// QueryNode { -// data: QueryNodeData::Start, -// predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), -// successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), -// }, -// QueryNode { -// data: QueryNodeData::End, -// predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), -// successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), -// }, -// ]; - -// Self { root_node: 0, end_node: 1, nodes } -// } -// } - -impl QueryGraph { - /// Connect all the given predecessor nodes to the given successor node - fn connect_to_node( - &mut self, - from_nodes: &[Interned], - to_node: Interned, - ) { - for &from_node in from_nodes { - self.nodes.get_mut(from_node).successors.insert(to_node); - self.nodes.get_mut(to_node).predecessors.insert(from_node); - } - } -} - impl QueryGraph { /// Build the query graph from the parsed user search query. - /// - /// The ngrams are made at this point. 
- pub fn from_query(ctx: &mut SearchContext, terms: Vec) -> Result { + pub fn from_query( + ctx: &mut SearchContext, + // NOTE: the terms here must be consecutive + terms: &[LocatedQueryTerm], + ) -> Result { let nbr_typos = number_of_typos_allowed(ctx)?; - let mut empty_nodes = vec![]; - - let mut predecessors: Vec> = vec![HashSet::new(), HashSet::new()]; - let mut successors: Vec> = vec![HashSet::new(), HashSet::new()]; let mut nodes_data: Vec = vec![QueryNodeData::Start, QueryNodeData::End]; let root_node = 0; let end_node = 1; @@ -136,21 +104,23 @@ impl QueryGraph { let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![root_node]); - for term_idx in 0..terms.len() { - let term0 = &terms[term_idx]; - + let original_terms_len = terms.len(); + for term_idx in 0..original_terms_len { let mut new_nodes = vec![]; let new_node_idx = add_node( &mut nodes_data, - QueryNodeData::Term(term0.clone()), - &prev0, - &mut successors, - &mut predecessors, + QueryNodeData::Term(LocatedQueryTermSubset { + term_subset: QueryTermSubset { + original: Interned::from_raw(term_idx as u16), + zero_typo_subset: NTypoTermSubset::All, + one_typo_subset: NTypoTermSubset::All, + two_typo_subset: NTypoTermSubset::All, + }, + positions: terms[term_idx].positions.clone(), + term_ids: term_idx as u8..=term_idx as u8, + }), ); new_nodes.push(new_node_idx); - if term0.is_empty(&ctx.term_interner) { - empty_nodes.push(new_node_idx); - } if !prev1.is_empty() { if let Some(ngram) = @@ -158,10 +128,16 @@ impl QueryGraph { { let ngram_idx = add_node( &mut nodes_data, - QueryNodeData::Term(ngram), - &prev1, - &mut successors, - &mut predecessors, + QueryNodeData::Term(LocatedQueryTermSubset { + term_subset: QueryTermSubset { + original: ngram.value, + zero_typo_subset: NTypoTermSubset::All, + one_typo_subset: NTypoTermSubset::All, + two_typo_subset: NTypoTermSubset::All, + }, + positions: ngram.positions, + term_ids: term_idx as u8 - 1..=term_idx as u8, + }), ); new_nodes.push(ngram_idx); } @@ -172,10 +148,16 @@ impl QueryGraph { { let ngram_idx = add_node( &mut nodes_data, - QueryNodeData::Term(ngram), - &prev2, - &mut successors, - &mut predecessors, + QueryNodeData::Term(LocatedQueryTermSubset { + term_subset: QueryTermSubset { + original: ngram.value, + zero_typo_subset: NTypoTermSubset::All, + one_typo_subset: NTypoTermSubset::All, + two_typo_subset: NTypoTermSubset::All, + }, + positions: ngram.positions, + term_ids: term_idx as u8 - 2..=term_idx as u8, + }), ); new_nodes.push(ngram_idx); } @@ -193,35 +175,17 @@ impl QueryGraph { successors: SmallBitmap::new(nodes_data.len() as u16), }, ); - for (node_idx, ((node_data, predecessors), successors)) in nodes_data - .into_iter() - .zip(predecessors.into_iter()) - .zip(successors.into_iter()) - .enumerate() - { + for (node_idx, node_data) in nodes_data.into_iter().enumerate() { let node = nodes.get_mut(Interned::from_raw(node_idx as u16)); node.data = node_data; - for x in predecessors { - node.predecessors.insert(Interned::from_raw(x)); - } - for x in successors { - node.successors.insert(Interned::from_raw(x)); - } } let mut graph = QueryGraph { root_node, end_node, nodes }; - - graph.connect_to_node( - prev0.into_iter().map(Interned::from_raw).collect::>().as_slice(), - end_node, - ); - let empty_nodes = empty_nodes.into_iter().map(Interned::from_raw).collect::>(); - graph.remove_nodes_keep_edges(&empty_nodes); + graph.rebuild_edges(); Ok(graph) } /// Remove the given nodes and all their edges from the query graph. 
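    /// (After the removal, `rebuild_edges` recomputes every node's
    /// predecessor and successor bitmaps from the terms' positions.)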
- /// TODO: need to check where this is used, and if this is correct. pub fn remove_nodes(&mut self, nodes: &[Interned]) { for &node_id in nodes { let node = &self.nodes.get(node_id); @@ -240,85 +204,220 @@ impl QueryGraph { node.predecessors.clear(); node.successors.clear(); } + self.rebuild_edges(); } - /// Remove the given nodes, connecting all their predecessors to all their successors. - pub fn remove_nodes_keep_edges(&mut self, nodes: &[Interned]) { - for &node_id in nodes { - let node = self.nodes.get(node_id); - let old_node_pred = node.predecessors.clone(); - let old_node_succ = node.successors.clone(); - for pred in old_node_pred.iter() { - let pred_successors = &mut self.nodes.get_mut(pred).successors; - pred_successors.remove(node_id); - pred_successors.union(&old_node_succ); - } - for succ in old_node_succ.iter() { - let succ_predecessors = &mut self.nodes.get_mut(succ).predecessors; - succ_predecessors.remove(node_id); - succ_predecessors.union(&old_node_pred); - } - let node = self.nodes.get_mut(node_id); - node.data = QueryNodeData::Deleted; - node.predecessors.clear(); + + fn rebuild_edges(&mut self) { + for (_, node) in self.nodes.iter_mut() { node.successors.clear(); + node.predecessors.clear(); + } + for node_id in self.nodes.indexes() { + let node = self.nodes.get(node_id); + let end_position = match &node.data { + QueryNodeData::Term(term) => *term.positions.end(), + QueryNodeData::Start => -1, + QueryNodeData::Deleted => continue, + QueryNodeData::End => continue, + }; + let successors = { + let mut successors = SmallBitmap::for_interned_values_in(&self.nodes); + let mut min = i8::MAX; + for (node_id, node) in self.nodes.iter() { + let start_position = match &node.data { + QueryNodeData::Term(term) => *term.positions.start(), + QueryNodeData::End => i8::MAX, + QueryNodeData::Start => continue, + QueryNodeData::Deleted => continue, + }; + if start_position <= end_position { + continue; + } + match start_position.cmp(&min) { + Ordering::Less => { + min = start_position; + successors.clear(); + successors.insert(node_id); + } + Ordering::Equal => { + successors.insert(node_id); + } + Ordering::Greater => continue, + } + } + successors + }; + let node = self.nodes.get_mut(node_id); + node.successors = successors.clone(); + for successor in successors.iter() { + let successor = self.nodes.get_mut(successor); + successor.predecessors.insert(node_id); + } } } - /// Remove all the nodes that correspond to a word starting at the given position, and connect - /// the predecessors of these nodes to their successors. - /// Return `true` if any node was removed. + /// Remove all the nodes that correspond to a word starting at the given position and rebuild + /// the edges of the graph appropriately. 
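+    ///
+    /// A minimal sketch of the intent, assuming the query `the sun flowers`
+    /// with word positions 0, 1 and 2:
+    /// ```txt
+    /// remove_words_starting_at_position(2)
+    ///   -> drops the `flowers` node; rebuild_edges() then links `sun`
+    ///      directly to the END node
+    /// ```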
pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool { - let mut nodes_to_remove_keeping_edges = vec![]; + let mut nodes_to_remove = vec![]; for (node_idx, node) in self.nodes.iter() { - let QueryNodeData::Term(LocatedQueryTerm { value: _, positions }) = &node.data else { continue }; + let QueryNodeData::Term(LocatedQueryTermSubset { term_subset: _, positions, term_ids: _ }) = &node.data else { continue }; if positions.start() == &position { - nodes_to_remove_keeping_edges.push(node_idx); + nodes_to_remove.push(node_idx); } } - self.remove_nodes_keep_edges(&nodes_to_remove_keeping_edges); + self.remove_nodes(&nodes_to_remove); - self.simplify(); - !nodes_to_remove_keeping_edges.is_empty() + !nodes_to_remove.is_empty() } - /// Simplify the query graph by removing all nodes that are disconnected from - /// the start or end nodes. - pub fn simplify(&mut self) { - loop { - let mut nodes_to_remove = vec![]; - for (node_idx, node) in self.nodes.iter() { - if (!matches!(node.data, QueryNodeData::End | QueryNodeData::Deleted) - && node.successors.is_empty()) - || (!matches!(node.data, QueryNodeData::Start | QueryNodeData::Deleted) - && node.predecessors.is_empty()) - { - nodes_to_remove.push(node_idx); + pub fn removal_order_for_terms_matching_strategy_last(&self) -> Vec> { + let (first_term_idx, last_term_idx) = { + let mut first_term_idx = u8::MAX; + let mut last_term_idx = 0u8; + for (_, node) in self.nodes.iter() { + match &node.data { + QueryNodeData::Term(t) => { + if *t.term_ids.end() > last_term_idx { + last_term_idx = *t.term_ids.end(); + } + if *t.term_ids.start() < first_term_idx { + first_term_idx = *t.term_ids.start(); + } + } + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, } } - if nodes_to_remove.is_empty() { - break; - } else { - self.remove_nodes(&nodes_to_remove); - } + (first_term_idx, last_term_idx) + }; + if first_term_idx >= last_term_idx { + return vec![]; } + let cost_of_term_idx = |term_idx: u8| { + if term_idx == first_term_idx { + None + } else { + let rank = 1 + last_term_idx - term_idx; + Some(rank as u16) + } + }; + let mut nodes_to_remove = BTreeMap::>::new(); + for (node_id, node) in self.nodes.iter() { + let QueryNodeData::Term(t) = &node.data else { continue }; + let mut cost = 0; + for id in t.term_ids.clone() { + if let Some(t_cost) = cost_of_term_idx(id) { + cost += t_cost; + } else { + continue; + } + } + nodes_to_remove + .entry(cost) + .or_insert_with(|| SmallBitmap::for_interned_values_in(&self.nodes)) + .insert(node_id); + } + nodes_to_remove.into_values().collect() } } -fn add_node( - nodes_data: &mut Vec, - node_data: QueryNodeData, - from_nodes: &Vec, - successors: &mut Vec>, - predecessors: &mut Vec>, -) -> u16 { - successors.push(HashSet::new()); - predecessors.push(HashSet::new()); +fn add_node(nodes_data: &mut Vec, node_data: QueryNodeData) -> u16 { let new_node_idx = nodes_data.len() as u16; nodes_data.push(node_data); - for &from_node in from_nodes { - successors[from_node as usize].insert(new_node_idx); - predecessors[new_node_idx as usize].insert(from_node); - } new_node_idx } + +impl QueryGraph { + /* + Build a query graph from a list of paths + + The paths are composed of source and dest terms. + If the source term is `None`, then the last dest term is used + as the predecessor of the dest term. If the source is Some(_), + then an edge is built between the last dest term and the source, + and between the source and new dest term. 
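+
+    For example, a path element `(None, b)` simply appends `b` after the
+    previous dest term, while `(Some(a), b)` creates `prev -> a -> b`.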
+ + Note that the resulting graph will not correspond to a perfect + representation of the set of paths. + For example, consider the following paths: + ```txt + PATH 1 : a -> b1 -> c1 -> d -> e1 + PATH 2 : a -> b2 -> c2 -> d -> e2 + ``` + Then the resulting graph will be: + ```txt + ┌────┐ ┌────┐ ┌────┐ + ┌──│ b1 │──│ c1 │─┐ ┌──│ e1 │ + ┌────┐ │ └────┘ └────┘ │ ┌────┐ │ └────┘ + │ a │─┤ ├─│ d │─┤ + └────┘ │ ┌────┐ ┌────┐ │ └────┘ │ ┌────┐ + └──│ b2 │──│ c2 │─┘ └──│ e2 │ + └────┘ └────┘ └────┘ + ``` + which is different from the fully correct representation: + ```txt + ┌────┐ ┌────┐ ┌────┐ ┌────┐ + ┌──│ b1 │──│ c1 │───│ d │───│ e1 │ + ┌────┐ │ └────┘ └────┘ └────┘ └────┘ + │ a │─┤ + └────┘ │ ┌────┐ ┌────┐ ┌────┐ ┌────┐ + └──│ b2 │──│ c2 │───│ d │───│ e2 │ + └────┘ └────┘ └────┘ └────┘ + ``` + But we accept the first representation as it reduces the size + of the graph and shouldn't cause much problems. + */ + pub fn build_from_paths( + paths: Vec, LocatedQueryTermSubset)>>, + ) -> Self { + let mut node_data = DedupInterner::default(); + let root_node = node_data.insert(QueryNodeData::Start); + let end_node = node_data.insert(QueryNodeData::End); + + let mut paths_with_ids = vec![]; + for path in paths { + let mut path_with_ids = vec![]; + for node in path { + let (start_term, end_term) = node; + let src_node_id = start_term.map(|x| node_data.insert(QueryNodeData::Term(x))); + let dest_node_id = node_data.insert(QueryNodeData::Term(end_term)); + path_with_ids.push((src_node_id, dest_node_id)); + } + paths_with_ids.push(path_with_ids); + } + let nodes_data = node_data.freeze(); + let nodes_data_len = nodes_data.len(); + let mut nodes = nodes_data.map_move(|n| QueryNode { + data: n, + predecessors: SmallBitmap::new(nodes_data_len), + successors: SmallBitmap::new(nodes_data_len), + }); + + let root_node = Interned::from_raw(root_node.into_raw()); + let end_node = Interned::from_raw(end_node.into_raw()); + + for path in paths_with_ids { + let mut prev_node = root_node; + for node in path { + let (start_term, dest_term) = node; + let end_term = Interned::from_raw(dest_term.into_raw()); + let src = if let Some(start_term) = start_term { + let start_term = Interned::from_raw(start_term.into_raw()); + nodes.get_mut(prev_node).successors.insert(start_term); + nodes.get_mut(start_term).predecessors.insert(prev_node); + start_term + } else { + prev_node + }; + nodes.get_mut(src).successors.insert(end_term); + nodes.get_mut(end_term).predecessors.insert(src); + prev_node = end_term; + } + nodes.get_mut(prev_node).successors.insert(end_node); + nodes.get_mut(end_node).predecessors.insert(prev_node); + } + + QueryGraph { root_node, end_node, nodes } + } +} From d0f048c068403fd146cc98f0e9e76a82915ece8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 11:08:17 +0200 Subject: [PATCH 110/234] Simplify the API of the DatabaseCache --- milli/src/search/new/db_cache.rs | 95 +++++++++++++++----------------- 1 file changed, 43 insertions(+), 52 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 2fa92900c..af94108e2 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -5,8 +5,8 @@ use fxhash::FxHashMap; use heed::types::ByteSlice; use heed::{BytesEncode, Database, RoTxn}; -use super::interner::{DedupInterner, Interned}; -use crate::{Index, Result}; +use super::interner::Interned; +use crate::{Result, SearchContext}; /// A cache storing pointers to values in the LMDB databases. 
/// @@ -47,94 +47,85 @@ impl<'ctx> DatabaseCache<'ctx> { }; Ok(bitmap_ptr) } - +} +impl<'ctx> SearchContext<'ctx> { /// Retrieve or insert the given value in the `word_docids` database. - pub fn get_word_docids( - &mut self, - index: &Index, - txn: &'ctx RoTxn, - word_interner: &DedupInterner, - word: Interned, - ) -> Result> { - Self::get_value( - txn, + pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { + DatabaseCache::get_value( + self.txn, word, - word_interner.get(word).as_str(), - &mut self.word_docids, - index.word_docids.remap_data_type::(), + self.word_interner.get(word).as_str(), + &mut self.db_cache.word_docids, + self.index.word_docids.remap_data_type::(), ) } /// Retrieve or insert the given value in the `word_prefix_docids` database. - pub fn get_word_prefix_docids( + pub fn get_db_word_prefix_docids( &mut self, - index: &Index, - txn: &'ctx RoTxn, - word_interner: &DedupInterner, prefix: Interned, ) -> Result> { - Self::get_value( - txn, + DatabaseCache::get_value( + self.txn, prefix, - word_interner.get(prefix).as_str(), - &mut self.word_prefix_docids, - index.word_prefix_docids.remap_data_type::(), + self.word_interner.get(prefix).as_str(), + &mut self.db_cache.word_prefix_docids, + self.index.word_prefix_docids.remap_data_type::(), ) } - pub fn get_word_pair_proximity_docids( + pub fn get_db_word_pair_proximity_docids( &mut self, - index: &Index, - txn: &'ctx RoTxn, - word_interner: &DedupInterner, word1: Interned, word2: Interned, proximity: u8, ) -> Result> { - Self::get_value( - txn, + DatabaseCache::get_value( + self.txn, (proximity, word1, word2), - &(proximity, word_interner.get(word1).as_str(), word_interner.get(word2).as_str()), - &mut self.word_pair_proximity_docids, - index.word_pair_proximity_docids.remap_data_type::(), + &( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(word2).as_str(), + ), + &mut self.db_cache.word_pair_proximity_docids, + self.index.word_pair_proximity_docids.remap_data_type::(), ) } - pub fn get_word_prefix_pair_proximity_docids( + pub fn get_db_word_prefix_pair_proximity_docids( &mut self, - index: &Index, - txn: &'ctx RoTxn, - word_interner: &DedupInterner, word1: Interned, prefix2: Interned, proximity: u8, ) -> Result> { - Self::get_value( - txn, + DatabaseCache::get_value( + self.txn, (proximity, word1, prefix2), - &(proximity, word_interner.get(word1).as_str(), word_interner.get(prefix2).as_str()), - &mut self.word_prefix_pair_proximity_docids, - index.word_prefix_pair_proximity_docids.remap_data_type::(), + &( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(prefix2).as_str(), + ), + &mut self.db_cache.word_prefix_pair_proximity_docids, + self.index.word_prefix_pair_proximity_docids.remap_data_type::(), ) } - pub fn get_prefix_word_pair_proximity_docids( + pub fn get_db_prefix_word_pair_proximity_docids( &mut self, - index: &Index, - txn: &'ctx RoTxn, - word_interner: &DedupInterner, left_prefix: Interned, right: Interned, proximity: u8, ) -> Result> { - Self::get_value( - txn, + DatabaseCache::get_value( + self.txn, (proximity, left_prefix, right), &( proximity, - word_interner.get(left_prefix).as_str(), - word_interner.get(right).as_str(), + self.word_interner.get(left_prefix).as_str(), + self.word_interner.get(right).as_str(), ), - &mut self.prefix_word_pair_proximity_docids, - index.prefix_word_pair_proximity_docids.remap_data_type::(), + &mut self.db_cache.prefix_word_pair_proximity_docids, + 
self.index.prefix_word_pair_proximity_docids.remap_data_type::(), ) } } From b96a682f1647f3ce54d8b01e8681f0d334d92914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 11:10:38 +0200 Subject: [PATCH 111/234] Update resolve_graph module to work with lazy query terms --- milli/src/search/new/mod.rs | 12 +- milli/src/search/new/resolve_query_graph.rs | 169 ++++++-------------- milli/src/search/new/words.rs | 4 +- 3 files changed, 61 insertions(+), 124 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4f50fcd29..b3c828048 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -28,12 +28,13 @@ pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode, QueryNodeData}; use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; -use resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; +use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; use self::interner::Interner; use self::ranking_rules::{BoxRankingRule, RankingRule}; +use self::resolve_query_graph::compute_query_graph_docids; use self::sort::Sort; use crate::{ AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy, @@ -48,8 +49,7 @@ pub struct SearchContext<'ctx> { pub word_interner: DedupInterner, pub phrase_interner: DedupInterner, pub term_interner: Interner, - // think about memory usage of that field (roaring bitmaps in a hashmap) - pub term_docids: QueryTermDocIdsCache, + pub phrase_docids: PhraseDocIdsCache, } impl<'ctx> SearchContext<'ctx> { pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { @@ -60,7 +60,7 @@ impl<'ctx> SearchContext<'ctx> { word_interner: <_>::default(), phrase_interner: <_>::default(), term_interner: <_>::default(), - term_docids: <_>::default(), + phrase_docids: <_>::default(), } } } @@ -103,7 +103,7 @@ fn resolve_maximally_reduced_query_graph( } } logger.query_for_universe(&graph); - let docids = resolve_query_graph(ctx, &graph, universe)?; + let docids = compute_query_graph_docids(ctx, &graph, universe)?; Ok(docids) } @@ -319,7 +319,7 @@ pub fn execute_search( let tokens = tokenizer.tokenize(query); let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; - let graph = QueryGraph::from_query(ctx, query_terms)?; + let graph = QueryGraph::from_query(ctx, &query_terms)?; check_sort_criteria(ctx, sort_criteria.as_ref())?; diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 1b7057b51..707082cab 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -3,106 +3,63 @@ use std::collections::VecDeque; use fxhash::FxHashMap; -use heed::{BytesDecode, RoTxn}; +use heed::BytesDecode; use roaring::RoaringBitmap; -use super::db_cache::DatabaseCache; -use super::interner::{DedupInterner, Interned}; +use super::interner::Interned; use super::query_graph::QueryNodeData; -use super::query_term::{Phrase, QueryTerm}; +use super::query_term::{Phrase, QueryTermSubset}; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, SearchContext}; -use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; +use crate::search::new::query_term::LocatedQueryTermSubset; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; #[derive(Default)] -pub struct 
QueryTermDocIdsCache { - pub phrases: FxHashMap, RoaringBitmap>, - pub terms: FxHashMap, RoaringBitmap>, +pub struct PhraseDocIdsCache { + pub cache: FxHashMap, RoaringBitmap>, } -impl QueryTermDocIdsCache { +impl<'ctx> SearchContext<'ctx> { /// Get the document ids associated with the given phrase - pub fn get_phrase_docids<'s, 'ctx>( - &'s mut self, - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - word_interner: &DedupInterner, - phrase_interner: &DedupInterner, - phrase: Interned, - ) -> Result<&'s RoaringBitmap> { - if self.phrases.contains_key(&phrase) { - return Ok(&self.phrases[&phrase]); + pub fn get_phrase_docids(&mut self, phrase: Interned) -> Result<&RoaringBitmap> { + if self.phrase_docids.cache.contains_key(&phrase) { + return Ok(&self.phrase_docids.cache[&phrase]); }; - let docids = resolve_phrase(index, txn, db_cache, word_interner, phrase_interner, phrase)?; - let _ = self.phrases.insert(phrase, docids); - let docids = &self.phrases[&phrase]; - Ok(docids) - } - /// Get the document ids associated with the given term - pub fn get_query_term_docids<'s, 'ctx>( - &'s mut self, - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - word_interner: &DedupInterner, - term_interner: &DedupInterner, - phrase_interner: &DedupInterner, - term_interned: Interned, - ) -> Result<&'s RoaringBitmap> { - if self.terms.contains_key(&term_interned) { - return Ok(&self.terms[&term_interned]); - }; - let mut docids = RoaringBitmap::new(); - // TODO: use a MultiOps? - let term = term_interner.get(term_interned); - for word in term.all_single_words_except_prefix_db() { - if let Some(word_docids) = db_cache.get_word_docids(index, txn, word_interner, word)? { - docids |= - RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?; - } - } - for phrase in term.all_phrases() { - docids |= self.get_phrase_docids( - index, - txn, - db_cache, - word_interner, - phrase_interner, - phrase, - )?; - } - - if let Some(prefix) = term.use_prefix_db { - if let Some(prefix_docids) = - db_cache.get_word_prefix_docids(index, txn, word_interner, prefix)? - { - docids |= - RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?; - } - } - - let _ = self.terms.insert(term_interned, docids); - let docids = &self.terms[&term_interned]; + let docids = compute_phrase_docids(self, phrase)?; + let _ = self.phrase_docids.cache.insert(phrase, docids); + let docids = &self.phrase_docids.cache[&phrase]; Ok(docids) } } +pub fn compute_query_term_subset_docids( + ctx: &mut SearchContext, + term: &QueryTermSubset, +) -> Result { + let mut docids = RoaringBitmap::new(); + for word in term.all_single_words_except_prefix_db(ctx)? { + if let Some(word_docids) = ctx.get_db_word_docids(word)? { + docids |= RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?; + } + } + for phrase in term.all_phrases(ctx)? { + docids |= ctx.get_phrase_docids(phrase)?; + } -pub fn resolve_query_graph( + if let Some(prefix) = term.use_prefix_db(ctx) { + if let Some(prefix_docids) = ctx.get_db_word_prefix_docids(prefix)? { + docids |= + RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?; + } + } + + Ok(docids) +} + +pub fn compute_query_graph_docids( ctx: &mut SearchContext, q: &QueryGraph, universe: &RoaringBitmap, ) -> Result { - let SearchContext { - index, - txn, - db_cache, - word_interner, - phrase_interner, - term_interner, - term_docids: query_term_docids, - .. 
- } = ctx; - // TODO: there is a faster way to compute this big + // TODO: there must be a faster way to compute this big // roaring bitmap expression let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes); @@ -125,17 +82,13 @@ pub fn resolve_query_graph( } let node_docids = match &node.data { - QueryNodeData::Term(located_term) => { - let term_docids = query_term_docids.get_query_term_docids( - index, - txn, - db_cache, - word_interner, - term_interner, - phrase_interner, - located_term.value, - )?; - predecessors_docids & term_docids + QueryNodeData::Term(LocatedQueryTermSubset { + term_subset, + positions: _, + term_ids: _, + }) => { + let phrase_docids = compute_query_term_subset_docids(ctx, term_subset)?; + predecessors_docids & phrase_docids } QueryNodeData::Deleted => { panic!() @@ -163,15 +116,11 @@ pub fn resolve_query_graph( panic!() } -pub fn resolve_phrase<'ctx>( - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - word_interner: &DedupInterner, - phrase_interner: &DedupInterner, +pub fn compute_phrase_docids( + ctx: &mut SearchContext, phrase: Interned, ) -> Result { - let Phrase { words } = phrase_interner.get(phrase).clone(); + let Phrase { words } = ctx.phrase_interner.get(phrase).clone(); let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = words.len().min(3); @@ -195,14 +144,7 @@ pub fn resolve_phrase<'ctx>( .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) { if dist == 0 { - match db_cache.get_word_pair_proximity_docids( - index, - txn, - word_interner, - s1, - s2, - 1, - )? { + match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? { Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?), // If there are no documents for this pair, there will be no // results for the phrase query. @@ -211,14 +153,9 @@ pub fn resolve_phrase<'ctx>( } else { let mut bitmap = RoaringBitmap::new(); for dist in 0..=dist { - if let Some(m) = db_cache.get_word_pair_proximity_docids( - index, - txn, - word_interner, - s1, - s2, - dist as u8 + 1, - )? { + if let Some(m) = + ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)? 
+ { bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?; } } diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index fb2c62f11..dc798e55d 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -4,7 +4,7 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::query_graph::QueryNodeData; -use super::resolve_query_graph::resolve_query_graph; +use super::resolve_query_graph::compute_query_graph_docids; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::{Result, TermsMatchingStrategy}; @@ -80,7 +80,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { logger.log_words_state(query_graph); - let this_bucket = resolve_query_graph(ctx, query_graph, universe)?; + let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?; let child_query_graph = query_graph.clone(); loop { From fa8138186591bbba6f7dcde2d10db69098afba82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 11:19:45 +0200 Subject: [PATCH 112/234] Update the trait requirements of ranking-rule graphs --- .../src/search/new/ranking_rule_graph/mod.rs | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index fb9a82d68..7c40008c8 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -17,32 +17,42 @@ mod typo; use std::hash::Hash; +pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; -use fxhash::FxHashSet; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner}; use super::logger::SearchLogger; -use super::query_term::Phrase; +use super::query_term::LocatedQueryTermSubset; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::Result; +pub struct ComputedCondition { + pub docids: RoaringBitmap, + pub universe_len: u64, + pub start_term_subset: Option, + pub end_term_subset: LocatedQueryTermSubset, +} + /// An edge in the ranking rule graph. /// /// It contains: /// 1. The source and destination nodes /// 2. The cost of traversing this edge /// 3. The condition associated with it +/// 4. The list of nodes that have to be skipped +/// if this edge is traversed. 
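+///
+/// (A `condition` of `None` marks an unconditional edge, such as the
+/// zero-cost edges into the END node that `build.rs` creates.)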
#[derive(Clone)] pub struct Edge { pub source_node: Interned, pub dest_node: Interned, - pub cost: u8, + pub cost: u32, pub condition: Option>, + pub nodes_to_skip: SmallBitmap, } impl Hash for Edge { @@ -83,23 +93,23 @@ pub trait RankingRuleGraphTrait: Sized { ctx: &mut SearchContext, condition: &Self::Condition, universe: &RoaringBitmap, - ) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)>; + ) -> Result; /// Return the costs and conditions of the edges going from the source node to the destination node fn build_edges( ctx: &mut SearchContext, conditions_interner: &mut DedupInterner, - source_node: &QueryNode, - dest_node: &QueryNode, - ) -> Result>)>>; + source_node: Option<&LocatedQueryTermSubset>, + dest_node: &LocatedQueryTermSubset, + ) -> Result)>>; fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u16, + costs: &MappedInterner>, + cost: u64, logger: &mut dyn SearchLogger, ); } From 728710d63af5b5cb85fd466280c3da274fcbf002 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 11:32:19 +0200 Subject: [PATCH 113/234] Update typo ranking rule to use new query term structure --- milli/src/search/new/logger/mod.rs | 16 +- .../search/new/ranking_rule_graph/typo/mod.rs | 235 ++++-------------- 2 files changed, 63 insertions(+), 188 deletions(-) diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 470983017..889e811ad 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -68,8 +68,8 @@ pub trait SearchLogger { paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u16, + distances: &MappedInterner>, + cost: u64, ); /// Logs the internal state of the typo ranking rule @@ -79,8 +79,8 @@ pub trait SearchLogger { paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u16, + distances: &MappedInterner>, + cost: u64, ); } @@ -139,8 +139,8 @@ impl SearchLogger for DefaultSearchLogger { _paths_map: &[Vec>], _dead_ends_cache: &DeadEndsCache, _universe: &RoaringBitmap, - _distances: &MappedInterner>, - _cost: u16, + _distances: &MappedInterner>, + _cost: u64, ) { } @@ -150,8 +150,8 @@ impl SearchLogger for DefaultSearchLogger { _paths: &[Vec>], _dead_ends_cache: &DeadEndsCache, _universe: &RoaringBitmap, - _distances: &MappedInterner>, - _cost: u16, + _distances: &MappedInterner>, + _cost: u64, ) { } } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index c27051de0..de02b67a4 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,21 +1,17 @@ -// use std::collections::HashSet; -use std::fmt::Write; -use std::iter::FromIterator; - -use fxhash::FxHashSet; use roaring::RoaringBitmap; -use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; +use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; -use crate::search::new::query_graph::QueryNodeData; -use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; +use crate::search::new::query_term::{LocatedQueryTermSubset, NTypoTermSubset}; +use 
crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] pub struct TypoCondition { - term: Interned, + term: LocatedQueryTermSubset, + nbr_typos: u8, } pub enum TypoGraph {} @@ -23,121 +19,63 @@ pub enum TypoGraph {} impl RankingRuleGraphTrait for TypoGraph { type Condition = TypoCondition; - fn resolve_condition<'db_cache>( + fn resolve_condition( ctx: &mut SearchContext, condition: &Self::Condition, universe: &RoaringBitmap, - ) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)> { - let SearchContext { - index, - txn, - db_cache, - word_interner, - phrase_interner, - term_interner, - term_docids: query_term_docids, - } = ctx; + ) -> Result { + let TypoCondition { term, .. } = condition; + // maybe compute_query_term_subset_docids should accept a universe as argument + let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?; + docids &= universe; - let docids = universe - & query_term_docids.get_query_term_docids( - index, - txn, - db_cache, - word_interner, - term_interner, - phrase_interner, - condition.term, - )?; - - let term = term_interner.get(condition.term); - Ok(( + Ok(ComputedCondition { docids, - FxHashSet::from_iter(term.all_single_words_except_prefix_db()), - FxHashSet::from_iter(term.all_phrases()), - )) + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: term.clone(), + }) } fn build_edges( ctx: &mut SearchContext, conditions_interner: &mut DedupInterner, - _from_node: &QueryNode, - to_node: &QueryNode, - ) -> Result>)>> { - let SearchContext { term_interner, .. } = ctx; - match &to_node.data { - QueryNodeData::Term(LocatedQueryTerm { value, positions }) => { - let mut edges = vec![]; - // Ngrams have a base typo cost - // 2-gram -> equivalent to 1 typo - // 3-gram -> equivalent to 2 typos - let base_cost = positions.len().min(2) as u8; + _from: Option<&LocatedQueryTermSubset>, + to_term: &LocatedQueryTermSubset, + ) -> Result)>> { + let term = to_term; // LocatedQueryTermSubset { term_subset, positions: _, term_ids } = to_term; + let original_full_term = ctx.term_interner.get(term.term_subset.original); - for nbr_typos in 0..=2 { - let term = term_interner.get(*value).clone(); - let new_term = match nbr_typos { - 0 => QueryTerm { - original: term.original, - is_prefix: term.is_prefix, - zero_typo: term.zero_typo, - prefix_of: term.prefix_of, - // TOOD: debatable - synonyms: term.synonyms, - split_words: None, - one_typo: Box::new([]), - two_typos: Box::new([]), - use_prefix_db: term.use_prefix_db, - is_ngram: term.is_ngram, - phrase: term.phrase, - }, - 1 => { - // What about split words and synonyms here? - QueryTerm { - original: term.original, - is_prefix: false, - zero_typo: None, - prefix_of: Box::new([]), - synonyms: Box::new([]), - split_words: term.split_words, - one_typo: term.one_typo, - two_typos: Box::new([]), - use_prefix_db: None, // false because all items from use_prefix_db have 0 typos - is_ngram: term.is_ngram, - phrase: None, - } - } - 2 => { - // What about split words and synonyms here? 
- QueryTerm { - original: term.original, - zero_typo: None, - is_prefix: false, - prefix_of: Box::new([]), - synonyms: Box::new([]), - split_words: None, - one_typo: Box::new([]), - two_typos: term.two_typos, - use_prefix_db: None, // false because all items from use_prefix_db have 0 typos - is_ngram: term.is_ngram, - phrase: None, - } - } - _ => panic!(), - }; - if !new_term.is_empty() { - edges.push(( - nbr_typos as u8 + base_cost, - Some( - conditions_interner - .insert(TypoCondition { term: term_interner.push(new_term) }), - ), - )) - } + let mut edges = vec![]; + // Ngrams have a base typo cost + // 2-gram -> equivalent to 1 typo + // 3-gram -> equivalent to 2 typos + let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; + + for nbr_typos in 0..=original_full_term.max_nbr_typos { + let mut term = term.clone(); + match nbr_typos { + 0 => { + term.term_subset.one_typo_subset = NTypoTermSubset::Nothing; + term.term_subset.two_typo_subset = NTypoTermSubset::Nothing; } - Ok(edges) - } - QueryNodeData::End => Ok(vec![(0, None)]), - QueryNodeData::Deleted | QueryNodeData::Start => panic!(), + 1 => { + term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing; + term.term_subset.two_typo_subset = NTypoTermSubset::Nothing; + } + 2 => { + term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing; + term.term_subset.one_typo_subset = NTypoTermSubset::Nothing; + } + _ => panic!(), + }; + + edges.push(( + nbr_typos as u32 + base_cost, + conditions_interner.insert(TypoCondition { term, nbr_typos }), + )); } + Ok(edges) } fn log_state( @@ -145,81 +83,18 @@ impl RankingRuleGraphTrait for TypoGraph { paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u16, + distances: &MappedInterner>, + cost: u64, logger: &mut dyn SearchLogger, ) { logger.log_typo_state(graph, paths, dead_ends_cache, universe, distances, cost); } fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { - let TypoCondition { term } = condition; - let term = ctx.term_interner.get(*term); - let QueryTerm { - original: _, - is_ngram: _, - is_prefix: _, - phrase, - zero_typo, - prefix_of, - synonyms, - split_words, - one_typo, - two_typos, - use_prefix_db, - } = term; - let mut s = String::new(); - if let Some(phrase) = phrase { - let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner); - writeln!(&mut s, "\"{phrase}\" : phrase").unwrap(); - } - if let Some(w) = zero_typo { - let w = ctx.word_interner.get(*w); - writeln!(&mut s, "\"{w}\" : 0 typo").unwrap(); - } - for w in prefix_of.iter() { - let w = ctx.word_interner.get(*w); - writeln!(&mut s, "\"{w}\" : prefix").unwrap(); - } - for w in one_typo.iter() { - let w = ctx.word_interner.get(*w); - writeln!(&mut s, "\"{w}\" : 1 typo").unwrap(); - } - for w in two_typos.iter() { - let w = ctx.word_interner.get(*w); - writeln!(&mut s, "\"{w}\" : 2 typos").unwrap(); - } - if let Some(phrase) = split_words { - let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner); - writeln!(&mut s, "\"{phrase}\" : split words").unwrap(); - } - for phrase in synonyms.iter() { - let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner); - writeln!(&mut s, "\"{phrase}\" : synonym").unwrap(); - } - if let Some(w) = use_prefix_db { - let w = ctx.word_interner.get(*w); - writeln!(&mut s, "\"{w}\" : use prefix db").unwrap(); - } + let TypoCondition { term, nbr_typos } = condition; + let original_term = 
ctx.term_interner.get(term.term_subset.original); + let original = ctx.word_interner.get(original_term.original); - Ok(s) + Ok(format!("{original}: {nbr_typos}")) } - - // fn words_used_by_condition<'ctx>( - // ctx: &mut SearchContext<'ctx>, - // condition: &Self::Condition, - // ) -> Result>> { - // let TypoCondition { term, .. } = condition; - // let term = ctx.term_interner.get(*term); - // Ok(HashSet::from_iter(term.all_single_words_except_prefix_db())) - // } - - // fn phrases_used_by_condition<'ctx>( - // ctx: &mut SearchContext<'ctx>, - // condition: &Self::Condition, - // ) -> Result>> { - // let TypoCondition { term, .. } = condition; - // let term = ctx.term_interner.get(*term); - // Ok(HashSet::from_iter(term.all_phrases())) - // } } From 5fd28620cd60e91ee16d30c3c03c46627bcfd751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 11:32:55 +0200 Subject: [PATCH 114/234] Build ranking rule graph correctly after changes to trait definition --- .../search/new/ranking_rule_graph/build.rs | 65 ++++++++++++++----- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index cd622a7ba..c92eac526 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,20 +1,18 @@ -use std::collections::HashSet; - use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::interner::DedupInterner; +use crate::search::new::interner::{DedupInterner, MappedInterner}; +use crate::search::new::query_graph::{QueryNode, QueryNodeData}; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, SearchContext}; use crate::Result; +use std::collections::HashSet; impl RankingRuleGraph { - // TODO: here, the docids of all the edges should already be computed! - // an edge condition would then be reduced to a (ptr to) a roaring bitmap? - // we could build fewer of them by directly comparing them with the universe - // (e.g. for each word pairs?) with `deserialize_within_universe` maybe - // - /// Build the ranking rule graph from the given query graph - pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { + pub fn build( + ctx: &mut SearchContext, + query_graph: QueryGraph, + cost_of_ignoring_node: MappedInterner)>>, + ) -> Result { let QueryGraph { nodes: graph_nodes, .. 
} = &query_graph; let mut conditions_interner = DedupInterner::default(); @@ -26,8 +24,41 @@ impl RankingRuleGraph { let new_edges = edges_of_node.get_mut(source_id); for dest_idx in source_node.successors.iter() { + let src_term = match &source_node.data { + QueryNodeData::Term(t) => Some(t), + QueryNodeData::Start => None, + QueryNodeData::Deleted | QueryNodeData::End => panic!(), + }; let dest_node = graph_nodes.get(dest_idx); - let edges = G::build_edges(ctx, &mut conditions_interner, source_node, dest_node)?; + let dest_term = match &dest_node.data { + QueryNodeData::Term(t) => t, + QueryNodeData::End => { + let new_edge_id = edges_store.insert(Some(Edge { + source_node: source_id, + dest_node: dest_idx, + cost: 0, + condition: None, + nodes_to_skip: SmallBitmap::for_interned_values_in(graph_nodes), + })); + new_edges.insert(new_edge_id); + continue; + } + QueryNodeData::Deleted | QueryNodeData::Start => panic!(), + }; + if let Some((cost_of_ignoring, forbidden_nodes)) = + cost_of_ignoring_node.get(dest_idx) + { + let new_edge_id = edges_store.insert(Some(Edge { + source_node: source_id, + dest_node: dest_idx, + cost: *cost_of_ignoring, + condition: None, + nodes_to_skip: forbidden_nodes.clone(), + })); + new_edges.insert(new_edge_id); + } + + let edges = G::build_edges(ctx, &mut conditions_interner, src_term, dest_term)?; if edges.is_empty() { continue; } @@ -37,7 +68,8 @@ impl RankingRuleGraph { source_node: source_id, dest_node: dest_idx, cost, - condition, + condition: Some(condition), + nodes_to_skip: SmallBitmap::for_interned_values_in(graph_nodes), })); new_edges.insert(new_edge_id); } @@ -47,11 +79,8 @@ impl RankingRuleGraph { let edges_of_node = edges_of_node.map(|edges| SmallBitmap::from_iter(edges.iter().copied(), &edges_store)); - Ok(RankingRuleGraph { - query_graph, - edges_store, - edges_of_node, - conditions_interner: conditions_interner.freeze(), - }) + let conditions_interner = conditions_interner.freeze(); + + Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node, conditions_interner }) } } From ae6bb1ce172d19140003f5c46dc5c0a3fac8a29f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 11:41:20 +0200 Subject: [PATCH 115/234] Update the ConditionDocidsCache after change to RankingRuleGraphTrait --- .../condition_docids_cache.rs | 55 +++++++------------ 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs index 67e9be6a4..d0fcd8bd8 100644 --- a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs @@ -1,27 +1,20 @@ use std::marker::PhantomData; -use fxhash::{FxHashMap, FxHashSet}; +use fxhash::FxHashMap; use roaring::RoaringBitmap; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::{ComputedCondition, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::Interned; -use crate::search::new::query_term::Phrase; +use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::SearchContext; use crate::Result; // TODO: give a generation to each universe, then be able to get the exact // delta of docids between two universes of different generations! 
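 // (For now, `get_computed_condition` below simply re-intersects the cached
 // docids when the length of the universe it is given has changed.)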
-#[derive(Default)]
-pub struct ComputedCondition {
-    docids: RoaringBitmap,
-    universe_len: u64,
-    used_words: FxHashSet<Interned<String>>,
-    used_phrases: FxHashSet<Interned<Phrase>>,
-}
-
 /// A cache storing the document ids associated with each ranking rule edge
 pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
+    // TODO: should be a mapped interner?
     pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
     _phantom: PhantomData<G>,
 }
@@ -31,45 +24,39 @@ impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
     }
 }
 impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
-    pub fn get_condition_used_words_and_phrases(
+    pub fn get_subsets_used_by_condition(
         &mut self,
         interned_condition: Interned<G::Condition>,
-    ) -> (&FxHashSet<Interned<String>>, &FxHashSet<Interned<Phrase>>) {
-        let ComputedCondition { used_words, used_phrases, .. } = &self.cache[&interned_condition];
-        (used_words, used_phrases)
+    ) -> (&Option<LocatedQueryTermSubset>, &LocatedQueryTermSubset) {
+        let c = &self.cache[&interned_condition];
+        (&c.start_term_subset, &c.end_term_subset)
     }
-
     /// Retrieve the document ids for the given edge condition.
     ///
     /// If the cache does not yet contain these docids, they are computed
     /// and inserted in the cache.
-    pub fn get_condition_docids<'s>(
+    pub fn get_computed_condition<'s>(
         &'s mut self,
         ctx: &mut SearchContext,
         interned_condition: Interned<G::Condition>,
         graph: &mut RankingRuleGraph<G>,
         universe: &RoaringBitmap,
-    ) -> Result<&'s RoaringBitmap> {
+    ) -> Result<&'s ComputedCondition> {
         if self.cache.contains_key(&interned_condition) {
-            // TODO compare length of universe compared to the one in self
-            // if it is smaller, then update the value
-            let ComputedCondition { docids, universe_len, .. } =
-                self.cache.entry(interned_condition).or_default();
-            if *universe_len == universe.len() {
-                return Ok(docids);
+            let computed = self.cache.get_mut(&interned_condition).unwrap();
+            if computed.universe_len == universe.len() {
+                return Ok(computed);
             } else {
-                *docids &= universe;
-                *universe_len = universe.len();
-                return Ok(docids);
+                computed.docids &= universe;
+                computed.universe_len = universe.len();
+                return Ok(computed);
             }
         }
         let condition = graph.conditions_interner.get_mut(interned_condition);
-        let (docids, used_words, used_phrases) = G::resolve_condition(ctx, condition, universe)?;
-        let _ = self.cache.insert(
-            interned_condition,
-            ComputedCondition { docids, universe_len: universe.len(), used_words, used_phrases },
-        );
-        let ComputedCondition { docids, .. } = &self.cache[&interned_condition];
-        Ok(docids)
+        let computed = G::resolve_condition(ctx, condition, universe)?;
+        // TODO: if computed.universe_len != universe.len() ?
+ let _ = self.cache.insert(interned_condition, computed); + let computed = &self.cache[&interned_condition]; + Ok(computed) } } From 01e24dd630d173986d5f7303f4dfe4f1df1ec9a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 11:59:06 +0200 Subject: [PATCH 116/234] Rewrite proximity ranking rule --- .../new/ranking_rule_graph/proximity/build.rs | 68 ++-- .../proximity/compute_docids.rs | 298 +++++++++--------- .../new/ranking_rule_graph/proximity/mod.rs | 36 +-- 3 files changed, 181 insertions(+), 221 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 4d42463e8..660d59b3e 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -2,44 +2,26 @@ use super::ProximityCondition; use crate::search::new::interner::{DedupInterner, Interned}; -use crate::search::new::query_graph::QueryNodeData; -use crate::search::new::query_term::LocatedQueryTerm; -use crate::search::new::{QueryNode, SearchContext}; +use crate::search::new::query_term::LocatedQueryTermSubset; +use crate::search::new::SearchContext; use crate::Result; pub fn build_edges( _ctx: &mut SearchContext, conditions_interner: &mut DedupInterner, - from_node: &QueryNode, - to_node: &QueryNode, -) -> Result>)>> { - let right_term = match &to_node.data { - QueryNodeData::End => return Ok(vec![(0, None)]), - QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]), - QueryNodeData::Term(term) => term, + left_term: Option<&LocatedQueryTermSubset>, + right_term: &LocatedQueryTermSubset, +) -> Result)>> { + let right_ngram_length = right_term.term_ids.len(); + + let Some(left_term) = left_term else { + return Ok(vec![( + (right_ngram_length - 1) as u32, + conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }), + )]) }; - let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term; - - let (right_start_position, right_ngram_length) = - (*right_positions.start(), right_positions.len()); - - let (left_term_interned, left_end_position) = match &from_node.data { - QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()), - QueryNodeData::Deleted => return Ok(vec![]), - QueryNodeData::Start => { - return Ok(vec![( - (right_ngram_length - 1) as u8, - Some( - conditions_interner - .insert(ProximityCondition::Term { term: *right_term_interned }), - ), - )]) - } - QueryNodeData::End => return Ok(vec![]), - }; - - if left_end_position + 1 != right_start_position { + if left_term.positions.end() + 1 != *right_term.positions.start() { // We want to ignore this pair of terms // Unconditionally walk through the edge without computing the docids // This can happen when, in a query like `the sun flowers are beautiful`, the term @@ -47,30 +29,26 @@ pub fn build_edges( // The remaining query graph represents `the sun .. 
are beautiful` // but `sun` and `are` have no proximity condition between them return Ok(vec![( - (right_ngram_length - 1) as u8, - Some( - conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }), - ), + (right_ngram_length - 1) as u32, + conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }), )]); } let mut conditions = vec![]; for cost in right_ngram_length..(7 + right_ngram_length) { - let cost = cost as u8; conditions.push(( - cost, - Some(conditions_interner.insert(ProximityCondition::Uninit { - left_term: left_term_interned, - right_term: *right_term_interned, - right_term_ngram_len: right_ngram_length as u8, - cost, - })), + cost as u32, + conditions_interner.insert(ProximityCondition::Uninit { + left_term: left_term.clone(), + right_term: right_term.clone(), + cost: cost as u8, + }), )) } conditions.push(( - (7 + right_ngram_length) as u8, - Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })), + (7 + right_ngram_length) as u32, + conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }), )); Ok(conditions) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 6f56e6221..8496054b7 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,49 +1,37 @@ #![allow(clippy::too_many_arguments)] -use std::iter::FromIterator; - use super::ProximityCondition; -use crate::search::new::db_cache::DatabaseCache; -use crate::search::new::interner::{DedupInterner, Interned}; -use crate::search::new::query_term::{Phrase, QueryTerm}; -use crate::search::new::resolve_query_graph::QueryTermDocIdsCache; +use crate::search::new::interner::Interned; +use crate::search::new::query_term::{Phrase, QueryTermSubset}; +use crate::search::new::ranking_rule_graph::ComputedCondition; +use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Index, Result}; -use fxhash::FxHashSet; -use heed::RoTxn; +use crate::{CboRoaringBitmapCodec, Result}; use roaring::RoaringBitmap; +use std::collections::BTreeSet; pub fn compute_docids( ctx: &mut SearchContext, condition: &ProximityCondition, universe: &RoaringBitmap, -) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)> { - let (left_term, right_term, right_term_ngram_len, cost) = match condition { - ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => { - (*left_term, *right_term, *right_term_ngram_len, *cost) +) -> Result { + let (left_term, right_term, cost) = match condition { + ProximityCondition::Uninit { left_term, right_term, cost } => { + (left_term, right_term, *cost) } ProximityCondition::Term { term } => { - let term_v = ctx.term_interner.get(*term); - return Ok(( - ctx.term_docids - .get_query_term_docids( - ctx.index, - ctx.txn, - &mut ctx.db_cache, - &ctx.word_interner, - &ctx.term_interner, - &ctx.phrase_interner, - *term, - )? 
- .clone(), - FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()), - FxHashSet::from_iter(term_v.all_phrases()), - )); + let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?; + docids &= universe; + return Ok(ComputedCondition { + docids, + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: term.clone(), + }); } }; - let left_term = ctx.term_interner.get(left_term); - let right_term = ctx.term_interner.get(right_term); + let right_term_ngram_len = right_term.term_ids.len() as u8; // e.g. for the simple words `sun .. flower` // the cost is 5 @@ -57,20 +45,13 @@ pub fn compute_docids( let forward_proximity = 1 + cost - right_term_ngram_len; let backward_proximity = cost - right_term_ngram_len; - let mut used_words = FxHashSet::default(); - let mut used_phrases = FxHashSet::default(); - let mut docids = RoaringBitmap::new(); - if let Some(right_prefix) = right_term.use_prefix_db { - for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) { + if let Some(right_prefix) = right_term.term_subset.use_prefix_db(ctx) { + for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? + { compute_prefix_edges( - ctx.index, - ctx.txn, - &mut ctx.db_cache, - &mut ctx.term_docids, - &ctx.word_interner, - &ctx.phrase_interner, + ctx, left_word, right_prefix, left_phrase, @@ -78,8 +59,6 @@ pub fn compute_docids( backward_proximity, &mut docids, universe, - &mut used_words, - &mut used_phrases, )?; } } @@ -91,39 +70,60 @@ pub fn compute_docids( // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been // reached - for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) { - for (right_word, right_phrase) in first_word_of_term_iter(right_term, &ctx.phrase_interner) - { + for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? { + // Before computing the edges, check that the left word and left phrase + // aren't disjoint with the universe, but only do it if there is more than + // one word derivation to the right. + // + // This is an optimisation to avoid checking for an excessive number of + // pairs. + // WAIT, NO. + // This should only be done once per node. + // Here, we'll potentially do is.. 16 times? + // Maybe we should do it at edge-build time instead. + // Same for the future attribute ranking rule. + let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?; + if right_derivs.len() > 1 { + let universe = &universe; + if let Some(left_phrase) = left_phrase { + if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) { + continue; + } + } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? 
{ + let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?; + if universe.is_disjoint(&left_word_docids) { + continue; + } + } + } + + for (right_word, right_phrase) in right_derivs { compute_non_prefix_edges( - ctx.index, - ctx.txn, - &mut ctx.db_cache, - &mut ctx.term_docids, - &ctx.word_interner, - &ctx.phrase_interner, + ctx, left_word, right_word, - &[left_phrase, right_phrase].iter().copied().flatten().collect::>(), + left_phrase, + right_phrase, forward_proximity, backward_proximity, &mut docids, universe, - &mut used_words, - &mut used_phrases, )?; } } - Ok((docids, used_words, used_phrases)) + Ok(ComputedCondition { + docids, + universe_len: universe.len(), + // TODO: think about whether we want to reduce the subset, + // we probably should! + start_term_subset: Some(left_term.clone()), + end_term_subset: right_term.clone(), + }) } -fn compute_prefix_edges<'ctx>( - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - term_docids: &mut QueryTermDocIdsCache, - word_interner: &DedupInterner, - phrase_interner: &DedupInterner, +fn compute_prefix_edges( + ctx: &mut SearchContext, left_word: Interned, right_prefix: Interned, left_phrase: Option>, @@ -131,21 +131,16 @@ fn compute_prefix_edges<'ctx>( backward_proximity: u8, docids: &mut RoaringBitmap, universe: &RoaringBitmap, - used_words: &mut FxHashSet>, - used_phrases: &mut FxHashSet>, ) -> Result<()> { + let mut used_left_words = BTreeSet::new(); + let mut used_left_phrases = BTreeSet::new(); + let mut used_right_prefix = BTreeSet::new(); + let mut universe = universe.clone(); if let Some(phrase) = left_phrase { - let phrase_docids = term_docids.get_phrase_docids( - index, - txn, - db_cache, - word_interner, - phrase_interner, - phrase, - )?; + let phrase_docids = ctx.get_phrase_docids(phrase)?; if !phrase_docids.is_empty() { - used_phrases.insert(phrase); + used_left_phrases.insert(phrase); } universe &= phrase_docids; if universe.is_empty() { @@ -153,36 +148,28 @@ fn compute_prefix_edges<'ctx>( } } - if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids( - index, - txn, - word_interner, - left_word, - right_prefix, - forward_proximity, - )? { + if let Some(new_docids) = + ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)? + { let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { - used_words.insert(left_word); - used_words.insert(right_prefix); + used_left_words.insert(left_word); + used_right_prefix.insert(right_prefix); *docids |= new_docids; } } // No swapping when computing the proximity between a phrase and a word if left_phrase.is_none() { - if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids( - index, - txn, - word_interner, + if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids( right_prefix, left_word, backward_proximity, )? 
{ let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { - used_words.insert(left_word); - used_words.insert(right_prefix); + used_left_words.insert(left_word); + used_right_prefix.insert(right_prefix); *docids |= new_docids; } } @@ -191,72 +178,59 @@ fn compute_prefix_edges<'ctx>( Ok(()) } -fn compute_non_prefix_edges<'ctx>( - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - term_docids: &mut QueryTermDocIdsCache, - word_interner: &DedupInterner, - phrase_interner: &DedupInterner, +fn compute_non_prefix_edges( + ctx: &mut SearchContext, word1: Interned, word2: Interned, - phrases: &[Interned], + left_phrase: Option>, + right_phrase: Option>, forward_proximity: u8, backward_proximity: u8, docids: &mut RoaringBitmap, universe: &RoaringBitmap, - used_words: &mut FxHashSet>, - used_phrases: &mut FxHashSet>, ) -> Result<()> { + let mut used_left_phrases = BTreeSet::new(); + let mut used_right_phrases = BTreeSet::new(); + let mut used_left_words = BTreeSet::new(); + let mut used_right_words = BTreeSet::new(); + let mut universe = universe.clone(); - for phrase in phrases { - let phrase_docids = term_docids.get_phrase_docids( - index, - txn, - db_cache, - word_interner, - phrase_interner, - *phrase, - )?; - if !phrase_docids.is_empty() { - used_phrases.insert(*phrase); - } + + for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() { + let phrase_docids = ctx.get_phrase_docids(phrase)?; universe &= phrase_docids; if universe.is_empty() { return Ok(()); } } - if let Some(new_docids) = db_cache.get_word_pair_proximity_docids( - index, - txn, - word_interner, - word1, - word2, - forward_proximity, - )? { + if let Some(left_phrase) = left_phrase { + used_left_phrases.insert(left_phrase); + } + if let Some(right_phrase) = right_phrase { + used_right_phrases.insert(right_phrase); + } + + if let Some(new_docids) = + ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)? + { let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { - used_words.insert(word1); - used_words.insert(word2); + used_left_words.insert(word1); + used_right_words.insert(word2); *docids |= new_docids; } } if backward_proximity >= 1 // no swapping when either term is a phrase - && phrases.is_empty() + && left_phrase.is_none() && right_phrase.is_none() { - if let Some(new_docids) = db_cache.get_word_pair_proximity_docids( - index, - txn, - word_interner, - word2, - word1, - backward_proximity, - )? { + if let Some(new_docids) = + ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)? + { let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { - used_words.insert(word1); - used_words.insert(word2); + used_left_words.insert(word2); + used_right_words.insert(word1); *docids |= new_docids; } } @@ -265,25 +239,41 @@ fn compute_non_prefix_edges<'ctx>( Ok(()) } -fn last_word_of_term_iter<'t>( - t: &'t QueryTerm, - phrase_interner: &'t DedupInterner, -) -> impl Iterator>, Interned)> + 't { - t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map( - move |p| { - let phrase = phrase_interner.get(p); - phrase.words.last().unwrap().map(|last| (Some(p), last)) - }, - )) +fn last_words_of_term_derivations( + ctx: &mut SearchContext, + t: &QueryTermSubset, +) -> Result>, Interned)>> { + let mut result = BTreeSet::new(); + + for w in t.all_single_words_except_prefix_db(ctx)? 
{ + result.insert((None, w)); + } + for p in t.all_phrases(ctx)? { + let phrase = ctx.phrase_interner.get(p); + let last_term_of_phrase = phrase.words.last().unwrap(); + if let Some(last_word) = last_term_of_phrase { + result.insert((Some(p), *last_word)); + } + } + + Ok(result) } -fn first_word_of_term_iter<'t>( - t: &'t QueryTerm, - phrase_interner: &'t DedupInterner, -) -> impl Iterator, Option>)> + 't { - t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map( - move |p| { - let phrase = phrase_interner.get(p); - phrase.words.first().unwrap().map(|first| (first, Some(p))) - }, - )) +fn first_word_of_term_iter( + ctx: &mut SearchContext, + t: &QueryTermSubset, +) -> Result, Option>)>> { + let mut result = BTreeSet::new(); + let all_words = t.all_single_words_except_prefix_db(ctx)?; + for w in all_words { + result.insert((w, None)); + } + for p in t.all_phrases(ctx)? { + let phrase = ctx.phrase_interner.get(p); + let first_term_of_phrase = phrase.words.first().unwrap(); + if let Some(first_word) = first_term_of_phrase { + result.insert((*first_word, Some(p))); + } + } + + Ok(result) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 3b98ed5b5..81c99fd9a 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -1,27 +1,19 @@ pub mod build; pub mod compute_docids; -use fxhash::FxHashSet; use roaring::RoaringBitmap; -use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; +use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; -use crate::search::new::query_term::{Phrase, QueryTerm}; +use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] pub enum ProximityCondition { - Uninit { - left_term: Interned, - right_term: Interned, - right_term_ngram_len: u8, - cost: u8, - }, - Term { - term: Interned, - }, + Uninit { left_term: LocatedQueryTermSubset, right_term: LocatedQueryTermSubset, cost: u8 }, + Term { term: LocatedQueryTermSubset }, } pub enum ProximityGraph {} @@ -33,18 +25,17 @@ impl RankingRuleGraphTrait for ProximityGraph { ctx: &mut SearchContext, condition: &Self::Condition, universe: &RoaringBitmap, - ) -> Result<(roaring::RoaringBitmap, FxHashSet>, FxHashSet>)> - { + ) -> Result { compute_docids::compute_docids(ctx, condition, universe) } fn build_edges( ctx: &mut SearchContext, conditions_interner: &mut DedupInterner, - source_node: &QueryNode, - dest_node: &QueryNode, - ) -> Result>)>> { - build::build_edges(ctx, conditions_interner, source_node, dest_node) + source_term: Option<&LocatedQueryTermSubset>, + dest_term: &LocatedQueryTermSubset, + ) -> Result)>> { + build::build_edges(ctx, conditions_interner, source_term, dest_term) } fn log_state( @@ -52,8 +43,8 @@ impl RankingRuleGraphTrait for ProximityGraph { paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u16, + distances: &MappedInterner>, + cost: u64, logger: &mut dyn SearchLogger, ) { logger.log_proximity_state(graph, paths, dead_ends_cache, universe, distances, cost); @@ -66,8 +57,9 @@ impl RankingRuleGraphTrait for ProximityGraph { Ok(format!("{cost}: cost")) } 
ProximityCondition::Term { term } => { - let term = ctx.term_interner.get(*term); - Ok(format!("{} : exists", ctx.word_interner.get(term.original))) + let original_term = ctx.term_interner.get(term.term_subset.original); + let original_word = ctx.word_interner.get(original_term.original); + Ok(format!("{original_word} : exists")) } } } From aa9592455c05928b6932e6c6e7a16759f04d3c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 12:11:11 +0200 Subject: [PATCH 117/234] Refactor the paths_of_cost algorithm Support conditions that require certain nodes to be skipped --- .../new/ranking_rule_graph/cheapest_paths.rs | 309 +++++++++++------- .../new/ranking_rule_graph/dead_ends_cache.rs | 12 +- 2 files changed, 191 insertions(+), 130 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index c09f6e5e0..443ab0ec4 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -9,141 +9,202 @@ use crate::search::new::query_graph::QueryNode; use crate::search::new::small_bitmap::SmallBitmap; use crate::Result; -impl RankingRuleGraph { - pub fn visit_paths_of_cost( - &mut self, - from: Interned, - cost: u16, - all_distances: &MappedInterner>, - dead_ends_cache: &mut DeadEndsCache, - mut visit: impl FnMut( - &[Interned], - &mut Self, - &mut DeadEndsCache, - ) -> Result>, - ) -> Result<()> { - let _ = self.visit_paths_of_cost_rec( - from, - cost, - all_distances, - dead_ends_cache, - &mut visit, - &mut vec![], - &mut SmallBitmap::for_interned_values_in(&self.conditions_interner), - dead_ends_cache.forbidden.clone(), - )?; +type VisitFn<'f, G> = &'f mut dyn FnMut( + &[Interned<::Condition>], + &mut RankingRuleGraph, + &mut DeadEndsCache<::Condition>, +) -> Result>; + +struct VisitorContext<'a, G: RankingRuleGraphTrait> { + graph: &'a mut RankingRuleGraph, + all_costs_from_node: &'a MappedInterner>, + dead_ends_cache: &'a mut DeadEndsCache, +} + +struct VisitorState { + remaining_cost: u64, + + path: Vec>, + + visited_conditions: SmallBitmap, + visited_nodes: SmallBitmap, + + forbidden_conditions: SmallBitmap, + forbidden_conditions_to_nodes: SmallBitmap, +} + +pub struct PathVisitor<'a, G: RankingRuleGraphTrait> { + state: VisitorState, + ctx: VisitorContext<'a, G>, +} +impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> { + pub fn new( + cost: u64, + graph: &'a mut RankingRuleGraph, + all_costs_from_node: &'a MappedInterner>, + dead_ends_cache: &'a mut DeadEndsCache, + ) -> Self { + Self { + state: VisitorState { + remaining_cost: cost, + path: vec![], + visited_conditions: SmallBitmap::for_interned_values_in(&graph.conditions_interner), + visited_nodes: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes), + forbidden_conditions: SmallBitmap::for_interned_values_in( + &graph.conditions_interner, + ), + forbidden_conditions_to_nodes: SmallBitmap::for_interned_values_in( + &graph.query_graph.nodes, + ), + }, + ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache }, + } + } + + pub fn visit_paths(mut self, visit: VisitFn) -> Result<()> { + let _ = + self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?; Ok(()) } - pub fn visit_paths_of_cost_rec( +} + +impl VisitorState { + fn visit_node( &mut self, - from: Interned, - cost: u16, - all_distances: &MappedInterner>, - dead_ends_cache: &mut DeadEndsCache, - visit: &mut impl FnMut( - &[Interned], - &mut 
Self, - &mut DeadEndsCache, - ) -> Result>, - prev_conditions: &mut Vec>, - cur_path: &mut SmallBitmap, - mut forbidden_conditions: SmallBitmap, - ) -> Result { + from_node: Interned, + visit: VisitFn, + ctx: &mut VisitorContext, + ) -> Result> { let mut any_valid = false; - let edges = self.edges_of_node.get(from).clone(); - 'edges_loop: for edge_idx in edges.iter() { - let Some(edge) = self.edges_store.get(edge_idx).as_ref() else { continue }; - if cost < edge.cost as u16 { + let edges = ctx.graph.edges_of_node.get(from_node).clone(); + for edge_idx in edges.iter() { + let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue }; + + if self.remaining_cost < edge.cost as u64 { continue; } - let next_any_valid = match edge.condition { - None => { - if edge.dest_node == self.query_graph.end_node { - any_valid = true; - let control_flow = visit(prev_conditions, self, dead_ends_cache)?; - match control_flow { - ControlFlow::Continue(_) => {} - ControlFlow::Break(_) => return Ok(true), - } - true - } else { - self.visit_paths_of_cost_rec( - edge.dest_node, - cost - edge.cost as u16, - all_distances, - dead_ends_cache, - visit, - prev_conditions, - cur_path, - forbidden_conditions.clone(), - )? - } - } - Some(condition) => { - if forbidden_conditions.contains(condition) - || all_distances - .get(edge.dest_node) - .iter() - .all(|next_cost| *next_cost != cost - edge.cost as u16) - { - continue; - } - cur_path.insert(condition); - prev_conditions.push(condition); - let mut new_forbidden_conditions = forbidden_conditions.clone(); - if let Some(next_forbidden) = - dead_ends_cache.forbidden_conditions_after_prefix(prev_conditions) - { - new_forbidden_conditions.union(&next_forbidden); - } - - let next_any_valid = if edge.dest_node == self.query_graph.end_node { - any_valid = true; - let control_flow = visit(prev_conditions, self, dead_ends_cache)?; - match control_flow { - ControlFlow::Continue(_) => {} - ControlFlow::Break(_) => return Ok(true), - } - true - } else { - self.visit_paths_of_cost_rec( - edge.dest_node, - cost - edge.cost as u16, - all_distances, - dead_ends_cache, - visit, - prev_conditions, - cur_path, - new_forbidden_conditions, - )? 
- }; - cur_path.remove(condition); - prev_conditions.pop(); - next_any_valid - } + self.remaining_cost -= edge.cost as u64; + let cf = match edge.condition { + Some(condition) => self.visit_condition( + condition, + edge.dest_node, + &edge.nodes_to_skip, + visit, + ctx, + )?, + None => self.visit_no_condition(edge.dest_node, &edge.nodes_to_skip, visit, ctx)?, }; - any_valid |= next_any_valid; + self.remaining_cost += edge.cost as u64; + let ControlFlow::Continue(next_any_valid) = cf else { + return Ok(ControlFlow::Break(())); + }; if next_any_valid { - forbidden_conditions = - dead_ends_cache.forbidden_conditions_for_all_prefixes_up_to(prev_conditions); - if cur_path.intersects(&forbidden_conditions) { - break 'edges_loop; + self.forbidden_conditions = ctx + .dead_ends_cache + .forbidden_conditions_for_all_prefixes_up_to(self.path.iter().copied()); + if self.visited_conditions.intersects(&self.forbidden_conditions) { + break; } } + any_valid |= next_any_valid; } - Ok(any_valid) + Ok(ControlFlow::Continue(any_valid)) } - pub fn initialize_distances_with_necessary_edges(&self) -> MappedInterner> { - let mut distances_to_end = self.query_graph.nodes.map(|_| vec![]); + fn visit_no_condition( + &mut self, + dest_node: Interned, + edge_forbidden_nodes: &SmallBitmap, + visit: VisitFn, + ctx: &mut VisitorContext, + ) -> Result> { + if ctx + .all_costs_from_node + .get(dest_node) + .iter() + .all(|next_cost| *next_cost != self.remaining_cost) + { + return Ok(ControlFlow::Continue(false)); + } + if dest_node == ctx.graph.query_graph.end_node { + let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?; + match control_flow { + ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)), + ControlFlow::Break(_) => Ok(ControlFlow::Break(())), + } + } else { + let old_fbct = self.forbidden_conditions_to_nodes.clone(); + self.forbidden_conditions_to_nodes.union(edge_forbidden_nodes); + let cf = self.visit_node(dest_node, visit, ctx)?; + self.forbidden_conditions_to_nodes = old_fbct; + Ok(cf) + } + } + fn visit_condition( + &mut self, + condition: Interned, + dest_node: Interned, + edge_forbidden_nodes: &SmallBitmap, + visit: VisitFn, + ctx: &mut VisitorContext, + ) -> Result> { + assert!(dest_node != ctx.graph.query_graph.end_node); + + if self.forbidden_conditions_to_nodes.contains(dest_node) + || edge_forbidden_nodes.intersects(&self.visited_nodes) + { + return Ok(ControlFlow::Continue(false)); + } + if self.forbidden_conditions.contains(condition) { + return Ok(ControlFlow::Continue(false)); + } + + if ctx + .all_costs_from_node + .get(dest_node) + .iter() + .all(|next_cost| *next_cost != self.remaining_cost) + { + return Ok(ControlFlow::Continue(false)); + } + + self.path.push(condition); + self.visited_nodes.insert(dest_node); + self.visited_conditions.insert(condition); + + let old_fc = self.forbidden_conditions.clone(); + if let Some(next_forbidden) = + ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied()) + { + self.forbidden_conditions.union(&next_forbidden); + } + let old_fctn = self.forbidden_conditions_to_nodes.clone(); + self.forbidden_conditions_to_nodes.union(edge_forbidden_nodes); + + let cf = self.visit_node(dest_node, visit, ctx)?; + + self.forbidden_conditions_to_nodes = old_fctn; + self.forbidden_conditions = old_fc; + + self.visited_conditions.remove(condition); + self.visited_nodes.remove(dest_node); + self.path.pop(); + + Ok(cf) + } +} + +impl RankingRuleGraph { + pub fn find_all_costs_to_end(&self) -> MappedInterner> { + let mut 
costs_to_end = self.query_graph.nodes.map(|_| vec![]); let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); let mut node_stack = VecDeque::new(); - *distances_to_end.get_mut(self.query_graph.end_node) = vec![0]; + *costs_to_end.get_mut(self.query_graph.end_node) = vec![0]; for prev_node in self.query_graph.nodes.get(self.query_graph.end_node).predecessors.iter() { node_stack.push_back(prev_node); @@ -151,22 +212,22 @@ impl RankingRuleGraph { } while let Some(cur_node) = node_stack.pop_front() { - let mut self_distances = BTreeSet::::new(); + let mut self_costs = BTreeSet::::new(); let cur_node_edges = &self.edges_of_node.get(cur_node); for edge_idx in cur_node_edges.iter() { let edge = self.edges_store.get(edge_idx).as_ref().unwrap(); let succ_node = edge.dest_node; - let succ_distances = distances_to_end.get(succ_node); - for succ_distance in succ_distances { - self_distances.insert(edge.cost as u16 + succ_distance); + let succ_costs = costs_to_end.get(succ_node); + for succ_distance in succ_costs { + self_costs.insert(edge.cost as u64 + succ_distance); } } - let distances_to_end_cur_node = distances_to_end.get_mut(cur_node); - for cost in self_distances.iter() { - distances_to_end_cur_node.push(*cost); + let costs_to_end_cur_node = costs_to_end.get_mut(cur_node); + for cost in self_costs.iter() { + costs_to_end_cur_node.push(*cost); } - *distances_to_end.get_mut(cur_node) = self_distances.into_iter().collect(); + *costs_to_end.get_mut(cur_node) = self_costs.into_iter().collect(); for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { if !enqueued.contains(prev_node) { node_stack.push_back(prev_node); @@ -174,6 +235,6 @@ impl RankingRuleGraph { } } } - distances_to_end + costs_to_end } } diff --git a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs index d25c69c23..f3bb25d56 100644 --- a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs @@ -36,12 +36,12 @@ impl DeadEndsCache { } pub fn forbidden_conditions_for_all_prefixes_up_to( &mut self, - prefix: &[Interned], + prefix: impl Iterator>, ) -> SmallBitmap { let mut forbidden = self.forbidden.clone(); let mut cursor = self; - for c in prefix.iter() { - if let Some(next) = cursor.advance(*c) { + for c in prefix { + if let Some(next) = cursor.advance(c) { cursor = next; forbidden.union(&cursor.forbidden); } else { @@ -52,11 +52,11 @@ impl DeadEndsCache { } pub fn forbidden_conditions_after_prefix( &mut self, - prefix: &[Interned], + prefix: impl Iterator>, ) -> Option> { let mut cursor = self; - for c in prefix.iter() { - if let Some(next) = cursor.advance(*c) { + for c in prefix { + if let Some(next) = cursor.advance(c) { cursor = next; } else { return None; From fdd02105acb7da7cf481f56bb5fcce33330bf9d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 12:12:41 +0200 Subject: [PATCH 118/234] Graph-based ranking rule + term matching strategy support --- .../search/new/graph_based_ranking_rule.rs | 392 ++++++++---------- milli/src/search/new/mod.rs | 4 +- 2 files changed, 185 insertions(+), 211 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index db4310815..4bb31fc43 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -36,12 +36,11 @@ That is we find the documents where either: - OR: 
`pretty` is 2-close to `house` AND `house` is 1-close to `by` */ -use std::collections::HashSet; use std::ops::ControlFlow; use roaring::RoaringBitmap; -use super::interner::MappedInterner; +use super::interner::{Interned, MappedInterner}; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ @@ -50,33 +49,35 @@ use super::ranking_rule_graph::{ }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; -use crate::search::new::query_graph::QueryNodeData; -use crate::Result; +use crate::search::new::query_term::LocatedQueryTermSubset; +use crate::search::new::ranking_rule_graph::PathVisitor; +use crate::{Result, TermsMatchingStrategy}; pub type Proximity = GraphBasedRankingRule; -impl Default for GraphBasedRankingRule { - fn default() -> Self { - Self::new("proximity".to_owned()) +impl GraphBasedRankingRule { + pub fn new(terms_matching_strategy: Option) -> Self { + Self::new_with_id("proximity".to_owned(), terms_matching_strategy) } } pub type Typo = GraphBasedRankingRule; -impl Default for GraphBasedRankingRule { - fn default() -> Self { - Self::new("typo".to_owned()) +impl GraphBasedRankingRule { + pub fn new(terms_matching_strategy: Option) -> Self { + Self::new_with_id("typo".to_owned(), terms_matching_strategy) } } /// A generic graph-based ranking rule pub struct GraphBasedRankingRule { id: String, + terms_matching_strategy: Option, // When the ranking rule is not iterating over its buckets, // its state is `None`. state: Option>, } impl GraphBasedRankingRule { /// Creates the ranking rule with the given identifier - pub fn new(id: String) -> Self { - Self { id, state: None } + pub fn new_with_id(id: String, terms_matching_strategy: Option) -> Self { + Self { id, terms_matching_strategy, state: None } } } @@ -89,7 +90,7 @@ pub struct GraphBasedRankingRuleState { /// Cache used to optimistically discard paths that resolve to no documents. dead_ends_cache: DeadEndsCache, /// A structure giving the list of possible costs from each node to the end node - all_distances: MappedInterner>, + all_costs: MappedInterner>, /// An index in the first element of `all_distances`, giving the cost of the next bucket cur_distance_idx: usize, } @@ -105,18 +106,45 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase _universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { - let graph = RankingRuleGraph::build(ctx, query_graph.clone())?; + let removal_cost = if let Some(terms_matching_strategy) = self.terms_matching_strategy { + // oh no this is wrong! 
+ // because + // skipping the second node should require that the first one be skipped too + match terms_matching_strategy { + TermsMatchingStrategy::Last => { + let removal_order = + query_graph.removal_order_for_terms_matching_strategy_last(); + let mut forbidden_nodes = + SmallBitmap::for_interned_values_in(&query_graph.nodes); + let mut costs = query_graph.nodes.map(|_| None); + let mut cost = 100; + for ns in removal_order { + for n in ns.iter() { + *costs.get_mut(n) = Some((cost, forbidden_nodes.clone())); + } + forbidden_nodes.union(&ns); + cost = 1000; + } + costs + } + TermsMatchingStrategy::All => query_graph.nodes.map(|_| None), + } + } else { + query_graph.nodes.map(|_| None) + }; + + let graph = RankingRuleGraph::build(ctx, query_graph.clone(), removal_cost)?; let condition_docids_cache = ConditionDocIdsCache::default(); let dead_ends_cache = DeadEndsCache::new(&graph.conditions_interner); // Then pre-compute the cost of all paths from each node to the end node - let all_distances = graph.initialize_distances_with_necessary_edges(); + let all_costs = graph.find_all_costs_to_end(); let state = GraphBasedRankingRuleState { graph, conditions_cache: condition_docids_cache, dead_ends_cache, - all_distances, + all_costs, cur_distance_idx: 0, }; @@ -140,16 +168,13 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // If the cur_distance_idx does not point to a valid cost in the `all_distances` // structure, then we have computed all the buckets and can return. - if state.cur_distance_idx - >= state.all_distances.get(state.graph.query_graph.root_node).len() - { + if state.cur_distance_idx >= state.all_costs.get(state.graph.query_graph.root_node).len() { self.state = None; return Ok(None); } // Retrieve the cost of the paths to compute - let cost = - state.all_distances.get(state.graph.query_graph.root_node)[state.cur_distance_idx]; + let cost = state.all_costs.get(state.graph.query_graph.root_node)[state.cur_distance_idx]; state.cur_distance_idx += 1; let mut bucket = RoaringBitmap::new(); @@ -158,7 +183,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase graph, conditions_cache: condition_docids_cache, dead_ends_cache, - all_distances, + all_costs, cur_distance_idx: _, } = &mut state; @@ -167,8 +192,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let original_graph = graph.clone(); let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner); - let mut considered_paths = vec![]; let mut good_paths = vec![]; + let mut considered_paths = vec![]; // For each path of the given cost, we will compute its associated // document ids. @@ -176,168 +201,80 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // and update the `dead_ends_cache` accordingly. // Updating the dead_ends_cache helps speed up the execution of `visit_paths_of_cost` and reduces // the number of future candidate paths given by that same function. - graph.visit_paths_of_cost( - graph.query_graph.root_node, - cost, - all_distances, - dead_ends_cache, - |path, graph, dead_ends_cache| { - if universe.is_empty() { - return Ok(ControlFlow::Break(())); - } - /* TODO: there are a couple ways to improve the speed of path computation. + let mut subpaths_docids: Vec<(Interned, RoaringBitmap)> = vec![]; - 1. 
Since the `visit_paths_of_cost` method uses a depth-first-search, we know that - consecutive calls to this closure have a high chance of giving paths sharing - some prefix. It would be good to reuse `subpath_docids` and `visited_conditions` - to find out what this common prefix is, to avoid recomputing it. In a way, doing - this serves as the dual of the DeadEndsCache: it takes advantage of our knowledge that - some paths *aren't* deadends. There is however a subtlety in that the universe might - have changed between the two consecutive calls. This is why we should subtract the docids - of the previous path (if successful) to the `subpath_docids`, at the same time as we do - it for the universe. - - 2. We perform way too many intersections with the universe. For the first visited path, - the operation we do is essentially: - universe & (c1 & universe) & (c2 & universe) & (c3 & universe) & etc. - This is a good idea *only if the universe is very small*. But if the universe is (almost) - a superset of each condition, then these intersections serve no purpose and slow down the search. - Maybe in the future we have a `deserialize_within_universe` method, which would speed up - these intersections. But for now, we have to be careful. - - 3. We could know in advance how many paths of a certain cost exist, and only update the - DeadEndsCache if (m)any remaining paths exist. There is a subtlety here because - on the next call of `next_bucket`, we will want an updated and correct DeadEndsCache. - We need to think about that. We could also avoid putting forbidden edges in this cache - if we know, somehow, that we won't visit this edge again. - - 4. Finally, but that will be a long term difficult project. We should compute the path *lazily*. - That is, when we do `path_docids &= condition`. We shouldn't *actually* perform the intersection, - but simply register that operation. It's only when we ask if the path_docids is empty that - **the minimum amount of work to determine whether the path is empty** is carried out. In practice, - that means performing a MultiOps on each container, in order or not, until any resulting container - is found to be non-empty. (In fact, when we ask `is_empty`, we should probably find the container - that has the highest chance of being non-empty and compute that one first). - - */ - - // Accumulate the path for logging purposes only - considered_paths.push(path.to_vec()); - - let mut path_docids = universe.clone(); - - // We store the edges and their docids in vectors in case the path turns out to be - // empty and we need to figure out why it was empty. - let mut visited_conditions = vec![]; - // let mut cached_condition_docids = vec![]; - let mut subpath_docids = vec![]; - - for (latest_condition_path_idx, &latest_condition) in path.iter().enumerate() { - visited_conditions.push(latest_condition); - - let condition_docids = condition_docids_cache.get_condition_docids( - ctx, - latest_condition, - graph, - &universe, - )?; - - // If the edge is empty, then the path will be empty as well, we update the graph - // and caches accordingly and skip to the next candidate path. - if condition_docids.is_empty() { - // 1. Store in the cache that this edge is empty for this universe - dead_ends_cache.forbid_condition(latest_condition); - // 2. 
remove all the edges with this condition from the ranking rule graph - graph.remove_edges_with_condition(latest_condition); - return Ok(ControlFlow::Continue(())); - } - path_docids &= condition_docids; - subpath_docids.push(path_docids.clone()); - - // If the (sub)path is empty, we try to figure out why and update the caches accordingly. - if path_docids.is_empty() { - let len_prefix = subpath_docids.len() - 1; - // First, we know that this path is empty, and thus any path - // that is a superset of it will also be empty. - dead_ends_cache.forbid_condition_after_prefix( - visited_conditions[..len_prefix].iter().copied(), - latest_condition, - ); - - if visited_conditions.len() > 1 { - let mut subprefix = vec![]; - // Deadend if the intersection between this edge and any - // previous prefix is disjoint with the universe - for (past_condition, subpath_docids) in visited_conditions[..len_prefix] - .iter() - .zip(subpath_docids[..len_prefix].iter()) - { - if *past_condition == latest_condition { - todo!(); - }; - subprefix.push(*past_condition); - if condition_docids.is_disjoint(subpath_docids) { - dead_ends_cache.forbid_condition_after_prefix( - subprefix.iter().copied(), - latest_condition, - ); - } - } - - // keep the same prefix and check the intersection with - // all the remaining conditions - let mut forbidden = dead_ends_cache.forbidden.clone(); - let mut cursor = dead_ends_cache; - for &c in visited_conditions[..len_prefix].iter() { - cursor = cursor.advance(c).unwrap(); - forbidden.union(&cursor.forbidden); - } - - let past_path_docids = &subpath_docids[subpath_docids.len() - 2]; - - let remaining_conditions = - path[latest_condition_path_idx..].iter().skip(1); - for next_condition in remaining_conditions { - if forbidden.contains(*next_condition) { - continue; - } - let next_condition_docids = condition_docids_cache - .get_condition_docids(ctx, *next_condition, graph, &universe)?; - - if past_path_docids.is_disjoint(next_condition_docids) { - cursor.forbid_condition(*next_condition); - } - } - } - - return Ok(ControlFlow::Continue(())); + let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache); + visitor.visit_paths(&mut |path, graph, dead_ends_cache| { + considered_paths.push(path.to_vec()); + // If the universe is empty, stop exploring the graph, since no docids will ever be found anymore. + if universe.is_empty() { + return Ok(ControlFlow::Break(())); + } + // `visit_paths` performs a depth-first search, so the previously visited path + // is likely to share a prefix with the current one. + // We stored the previous path and the docids associated to each of its prefixes in `subpaths_docids`. + // We take advantage of this to avoid computing the docids associated with the common prefix between + // the old and current path. + let idx_of_first_different_condition = { + let mut idx = 0; + for (&last_c, cur_c) in path.iter().zip(subpaths_docids.iter().map(|x| x.0)) { + if last_c == cur_c { + idx += 1; + } else { + break; } } - assert!(!path_docids.is_empty()); - // Accumulate the path for logging purposes only - good_paths.push(path.to_vec()); - for condition in path { - used_conditions.insert(*condition); + subpaths_docids.truncate(idx); + idx + }; + // Then for the remaining of the path, we continue computing docids. 
+ for latest_condition in path[idx_of_first_different_condition..].iter().copied() { + // The visit_path_condition will stop + let success = visit_path_condition( + ctx, + graph, + &universe, + dead_ends_cache, + condition_docids_cache, + &mut subpaths_docids, + latest_condition, + )?; + if !success { + return Ok(ControlFlow::Continue(())); } - bucket |= &path_docids; - // Reduce the size of the universe so that we can more optimistically discard candidate paths - universe -= path_docids; + } + assert!(subpaths_docids.iter().map(|x| x.0).eq(path.iter().copied())); + + let path_docids = + subpaths_docids.pop().map(|x| x.1).unwrap_or_else(|| universe.clone()); + assert!(!path_docids.is_empty()); + + // Accumulate the path for logging purposes only + good_paths.push(path.to_vec()); + for &condition in path { + used_conditions.insert(condition); + } + bucket |= &path_docids; + // Reduce the size of the universe so that we can more optimistically discard candidate paths + universe -= &path_docids; + for (_, docids) in subpaths_docids.iter_mut() { + *docids -= &path_docids; + } + + if universe.is_empty() { + Ok(ControlFlow::Break(())) + } else { + Ok(ControlFlow::Continue(())) + } + })?; - if universe.is_empty() { - Ok(ControlFlow::Break(())) - } else { - Ok(ControlFlow::Continue(())) - } - }, - )?; - // println!(" {} paths of cost {} in {}", paths.len(), cost, self.id); G::log_state( &original_graph, - &good_paths, + &considered_paths, dead_ends_cache, original_universe, - all_distances, + all_costs, cost, logger, ); @@ -346,40 +283,21 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // that was used to compute this bucket // But we only do it in case the bucket length is >1, because otherwise // we know the child ranking rule won't be called anyway - let mut next_query_graph = original_graph.query_graph; - if bucket.len() > 1 { - next_query_graph.simplify(); - // 1. Gather all the words and phrases used in the computation of this bucket - let mut used_words = HashSet::new(); - let mut used_phrases = HashSet::new(); - for condition in used_conditions.iter() { - let (ws, ps) = - condition_docids_cache.get_condition_used_words_and_phrases(condition); - used_words.extend(ws); - used_phrases.extend(ps); - } - // 2. Remove the unused words and phrases from all the nodes in the graph - let mut nodes_to_remove = vec![]; - for (node_id, node) in next_query_graph.nodes.iter_mut() { - let term = match &mut node.data { - QueryNodeData::Term(term) => term, - QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, - }; - if let Some(new_term) = ctx - .term_interner - .get(term.value) - .removing_forbidden_terms(&used_words, &used_phrases) - { - if new_term.is_empty() { - nodes_to_remove.push(node_id); - } else { - term.value = ctx.term_interner.push(new_term); - } - } - } - // 3. 
Remove the empty nodes from the graph - next_query_graph.remove_nodes(&nodes_to_remove); - } + + let paths: Vec, LocatedQueryTermSubset)>> = good_paths + .into_iter() + .map(|path| { + path.into_iter() + .map(|condition| { + let (a, b) = + condition_docids_cache.get_subsets_used_by_condition(condition); + (a.clone(), b.clone()) + }) + .collect() + }) + .collect(); + + let next_query_graph = QueryGraph::build_from_paths(paths); self.state = Some(state); @@ -394,3 +312,59 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase self.state = None; } } + +/// Returns false if the intersection between the condition +/// docids and the previous path docids is empty. +fn visit_path_condition( + ctx: &mut SearchContext, + graph: &mut RankingRuleGraph, + universe: &RoaringBitmap, + dead_ends_cache: &mut DeadEndsCache, + condition_docids_cache: &mut ConditionDocIdsCache, + subpath: &mut Vec<(Interned, RoaringBitmap)>, + latest_condition: Interned, +) -> Result { + let condition_docids = &condition_docids_cache + .get_computed_condition(ctx, latest_condition, graph, universe)? + .docids; + if condition_docids.is_empty() { + // 1. Store in the cache that this edge is empty for this universe + dead_ends_cache.forbid_condition(latest_condition); + // 2. remove all the edges with this condition from the ranking rule graph + graph.remove_edges_with_condition(latest_condition); + return Ok(false); + } + + let latest_path_docids = if let Some((_, prev_docids)) = subpath.last() { + prev_docids & condition_docids + } else { + condition_docids.clone() + }; + if !latest_path_docids.is_empty() { + subpath.push((latest_condition, latest_path_docids)); + return Ok(true); + } + // If the (sub)path is empty, we try to figure out why and update the caches accordingly. + + // First, we know that this path is empty, and thus any path + // that is a superset of it will also be empty. 
+ dead_ends_cache.forbid_condition_after_prefix(subpath.iter().map(|x| x.0), latest_condition); + + if subpath.len() <= 1 { + return Ok(false); + } + let mut subprefix = vec![]; + // Deadend if the intersection between this edge and any + // previous prefix is disjoint with the universe + // We already know that the intersection with the last one + // is empty, + for (past_condition, sp_docids) in subpath[..subpath.len() - 1].iter() { + subprefix.push(*past_condition); + if condition_docids.is_disjoint(sp_docids) { + dead_ends_cache + .forbid_condition_after_prefix(subprefix.iter().copied(), latest_condition); + } + } + + Ok(false) +} diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index b3c828048..1f3302a40 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -198,14 +198,14 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( continue; } typo = true; - ranking_rules.push(Box::::default()); + ranking_rules.push(Box::new(Typo::new(None))); } crate::Criterion::Proximity => { if proximity { continue; } proximity = true; - ranking_rules.push(Box::::default()); + ranking_rules.push(Box::new(Proximity::new(None))); } crate::Criterion::Attribute => { if attribute { From 3b0737a0926ae1c6a8316e943527303cd38743e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 12:20:44 +0200 Subject: [PATCH 119/234] Fix detailed logger --- milli/src/search/new/logger/detailed.rs | 159 ++++++++++++++---------- 1 file changed, 94 insertions(+), 65 deletions(-) diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 57be61612..3a02950a8 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -8,7 +8,9 @@ use roaring::RoaringBitmap; use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNodeData; -use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; +use crate::search::new::query_term::{ + Lazy, LocatedQueryTermSubset, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm, +}; use crate::search::new::ranking_rule_graph::{ DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph, @@ -45,16 +47,16 @@ pub enum SearchEvents { paths: Vec>>, dead_ends_cache: DeadEndsCache, universe: RoaringBitmap, - distances: MappedInterner>, - cost: u16, + costs: MappedInterner>, + cost: u64, }, TypoState { graph: RankingRuleGraph, paths: Vec>>, dead_ends_cache: DeadEndsCache, universe: RoaringBitmap, - distances: MappedInterner>, - cost: u16, + costs: MappedInterner>, + cost: u64, }, RankingRuleSkipBucket { ranking_rule_idx: usize, @@ -171,15 +173,15 @@ impl SearchLogger for DetailedSearchLogger { paths_map: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u16, + costs: &MappedInterner>, + cost: u64, ) { self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), dead_ends_cache: dead_ends_cache.clone(), universe: universe.clone(), - distances: distances.clone(), + costs: costs.clone(), cost, }) } @@ -190,15 +192,15 @@ impl SearchLogger for DetailedSearchLogger { paths_map: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u16, + costs: &MappedInterner>, + cost: u64, ) { self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), dead_ends_cache: 
dead_ends_cache.clone(), universe: universe.clone(), - distances: distances.clone(), + costs: costs.clone(), cost, }) } @@ -358,7 +360,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ paths, dead_ends_cache, universe, - distances, + costs, cost, } => { let cur_ranking_rule = timestamp.len() - 1; @@ -373,15 +375,15 @@ results.{cur_ranking_rule}{cur_activated_id} {{ graph, paths, dead_ends_cache, - distances.clone(), + costs.clone(), &mut new_file, ); writeln!( &mut file, "{id} {{ - link: \"{id}.d2.svg\" - tooltip: \"cost {cost}, universe len: {}\" -}}", + link: \"{id}.d2.svg\" + tooltip: \"cost {cost}, universe len: {}\" + }}", universe.len() ) .unwrap(); @@ -391,7 +393,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ paths, dead_ends_cache, universe, - distances, + costs, cost, } => { let cur_ranking_rule = timestamp.len() - 1; @@ -406,7 +408,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ graph, paths, dead_ends_cache, - distances.clone(), + costs.clone(), &mut new_file, ); writeln!( @@ -424,74 +426,101 @@ results.{cur_ranking_rule}{cur_activated_id} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc( + fn query_node_d2_desc( ctx: &mut SearchContext, node_idx: Interned, node: &QueryNode, - _distances: &[u16], + _costs: &[u64], file: &mut File, ) { match &node.data { - QueryNodeData::Term(LocatedQueryTerm { value, .. }) => { + QueryNodeData::Term(LocatedQueryTermSubset { + term_subset, + positions: _, + term_ids: _, + }) => { let QueryTerm { original, + is_multiple_words: _, + is_prefix: _, + max_nbr_typos, zero_typo, one_typo, - two_typos, - use_prefix_db, - synonyms, - split_words, - prefix_of, - is_prefix: _, - is_ngram: _, - phrase, - } = ctx.term_interner.get(*value); + two_typo, + } = ctx.term_interner.get(term_subset.original); let original = ctx.word_interner.get(*original); writeln!( file, "{node_idx} : \"{original}\" {{ -shape: class" + shape: class + max_nbr_typo: {max_nbr_typos}" ) .unwrap(); + + let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = + zero_typo; + for w in zero_typo.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 0").unwrap(); + if term_subset.zero_typo_subset.contains_word(w) { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 0").unwrap(); + } } for w in prefix_of.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 0P").unwrap(); - } - for w in one_typo.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 1").unwrap(); - } - for w in two_typos.iter().copied() { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 2").unwrap(); + if term_subset.zero_typo_subset.contains_word(w) { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 0P").unwrap(); + } } + if let Some(phrase) = phrase { - let phrase = ctx.phrase_interner.get(*phrase); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : phrase").unwrap(); - } - if let Some(split_words) = split_words { - let phrase = ctx.phrase_interner.get(*split_words); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); + if term_subset.zero_typo_subset.contains_phrase(*phrase) { + let phrase = ctx.phrase_interner.get(*phrase); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : phrase").unwrap(); + } } + for synonym in synonyms.iter().copied() { - let phrase = ctx.phrase_interner.get(synonym); - 
let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); + if term_subset.zero_typo_subset.contains_phrase(synonym) { + let phrase = ctx.phrase_interner.get(synonym); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); + } } if let Some(use_prefix_db) = use_prefix_db { - let p = ctx.word_interner.get(*use_prefix_db); - writeln!(file, "use prefix DB : {p}").unwrap(); + if term_subset.zero_typo_subset.contains_word(*use_prefix_db) { + let p = ctx.word_interner.get(*use_prefix_db); + writeln!(file, "use prefix DB : {p}").unwrap(); + } + } + if let Lazy::Init(one_typo) = one_typo { + let OneTypoTerm { split_words, one_typo } = one_typo; + + for w in one_typo.iter().copied() { + if term_subset.one_typo_subset.contains_word(w) { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 1").unwrap(); + } + } + if let Some(split_words) = split_words { + if term_subset.one_typo_subset.contains_phrase(*split_words) { + let phrase = ctx.phrase_interner.get(*split_words); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); + } + } + } + if let Lazy::Init(two_typo) = two_typo { + let TwoTypoTerm { two_typos } = two_typo; + for w in two_typos.iter().copied() { + if term_subset.two_typo_subset.contains_word(w) { + let w = ctx.word_interner.get(w); + writeln!(file, "\"{w}\" : 2").unwrap(); + } + } } - // for d in distances.iter() { - // writeln!(file, "\"d_{d}\" : distance").unwrap(); - // } writeln!(file, "}}").unwrap(); } @@ -514,7 +543,7 @@ shape: class" if matches!(node.data, QueryNodeData::Deleted) { continue; } - Self::query_node_d2_desc::(ctx, node_id, node, &[], file); + Self::query_node_d2_desc(ctx, node_id, node, &[], file); for edge in node.successors.iter() { writeln!(file, "{node_id} -> {edge};\n").unwrap(); @@ -526,7 +555,7 @@ shape: class" graph: &RankingRuleGraph, paths: &[Vec>], _dead_ends_cache: &DeadEndsCache, - distances: MappedInterner>, + costs: MappedInterner>, file: &mut File, ) { writeln!(file, "direction: right").unwrap(); @@ -536,12 +565,12 @@ shape: class" if matches!(&node.data, QueryNodeData::Deleted) { continue; } - let distances = &distances.get(node_idx); - Self::query_node_d2_desc::(ctx, node_idx, node, distances, file); + let costs = &costs.get(node_idx); + Self::query_node_d2_desc(ctx, node_idx, node, costs, file); } for (_edge_id, edge) in graph.edges_store.iter() { let Some(edge) = edge else { continue }; - let Edge { source_node, dest_node, condition: details, cost } = edge; + let Edge { source_node, dest_node, condition: details, cost, nodes_to_skip: _ } = edge; match &details { None => { @@ -561,7 +590,7 @@ shape: class" } writeln!(file, "}}").unwrap(); - // writeln!(file, "Distances {{").unwrap(); + // writeln!(file, "costs {{").unwrap(); // Self::paths_d2_description(graph, paths, file); // writeln!(file, "}}").unwrap(); From ee8a9e0bad6fbca8728b22e0307ed713d80bc209 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 12:22:24 +0200 Subject: [PATCH 120/234] Remove outdated sentence in documentation --- milli/src/search/new/query_graph.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 8f87dfd8c..ce3732927 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -36,9 +36,6 @@ pub enum QueryNodeData { /** A graph 
representing all the ways to interpret the user's search query. -## Important -At the moment, a query graph has a hardcoded limit of [`QUERY_GRAPH_NODE_LENGTH_LIMIT`] nodes. - ## Example 1 For the search query `sunflower`, we need to register the following things: - we need to look for the exact word `sunflower` From 2a5997fb20a3cdea154b59e81925859b734df350 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 13:07:17 +0200 Subject: [PATCH 121/234] Avoid expensive assert! in bucket sort function --- milli/src/search/new/ranking_rules.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 1f5f4b366..9dc6018e6 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -206,7 +206,9 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( &next_bucket.candidates, ); - assert!(ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)); + debug_assert!( + ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates) + ); ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; if cur_ranking_rule_index == ranking_rules_len - 1 From 2997d1f1861034f62c59401398eb44ddab445113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 13:12:51 +0200 Subject: [PATCH 122/234] Use new term matching strategy logic in resolve_maximally_reduced_... --- milli/src/search/new/mod.rs | 38 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 1f3302a40..5628ee1a9 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -16,7 +16,7 @@ mod sort; // TODO: documentation + comments mod words; -use std::collections::{BTreeSet, HashSet}; +use std::collections::HashSet; use charabia::TokenizerBuilder; use db_cache::DatabaseCache; @@ -25,7 +25,7 @@ use heed::RoTxn; use interner::DedupInterner; pub use logger::detailed::DetailedSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; -use query_graph::{QueryGraph, QueryNode, QueryNodeData}; +use query_graph::{QueryGraph, QueryNode}; use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; use resolve_query_graph::PhraseDocIdsCache; @@ -75,33 +75,17 @@ fn resolve_maximally_reduced_query_graph( logger: &mut dyn SearchLogger, ) -> Result { let mut graph = query_graph.clone(); - let mut positions_to_remove = match matching_strategy { - TermsMatchingStrategy::Last => { - let mut all_positions = BTreeSet::new(); - for (_, n) in query_graph.nodes.iter() { - match &n.data { - QueryNodeData::Term(term) => { - all_positions.extend(term.positions.clone()); - } - QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {} - } - } - all_positions.into_iter().collect() - } + + let nodes_to_remove = match matching_strategy { + TermsMatchingStrategy::Last => query_graph + .removal_order_for_terms_matching_strategy_last() + .iter() + .flat_map(|x| x.iter()) + .collect(), TermsMatchingStrategy::All => vec![], }; - // don't remove the first term - if !positions_to_remove.is_empty() { - positions_to_remove.remove(0); - } - loop { - if positions_to_remove.is_empty() { - break; - } else { - let position_to_remove = positions_to_remove.pop().unwrap(); - let _ = 
graph.remove_words_starting_at_position(position_to_remove); - } - } + graph.remove_nodes(&nodes_to_remove); + logger.query_for_universe(&graph); let docids = compute_query_graph_docids(ctx, &graph, universe)?; From 35c16ad047f2f7ff082ffa627bd25e621698029c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 13:15:43 +0200 Subject: [PATCH 123/234] Use new term matching strategy logic in words ranking rule --- milli/src/search/new/words.rs | 54 +++++++++++------------------------ 1 file changed, 17 insertions(+), 37 deletions(-) diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index dc798e55d..263e9220c 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -1,18 +1,16 @@ -use std::collections::BTreeSet; - -use roaring::RoaringBitmap; - use super::logger::SearchLogger; -use super::query_graph::QueryNodeData; +use super::query_graph::QueryNode; use super::resolve_query_graph::compute_query_graph_docids; +use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::{Result, TermsMatchingStrategy}; +use roaring::RoaringBitmap; pub struct Words { exhausted: bool, // TODO: remove query_graph: Option, iterating: bool, // TODO: remove - positions_to_remove: Vec, + nodes_to_remove: Vec>, terms_matching_strategy: TermsMatchingStrategy, } impl Words { @@ -21,7 +19,7 @@ impl Words { exhausted: true, query_graph: None, iterating: false, - positions_to_remove: vec![], + nodes_to_remove: vec![], terms_matching_strategy, } } @@ -40,26 +38,14 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { ) -> Result<()> { self.exhausted = false; self.query_graph = Some(parent_query_graph.clone()); - - let positions_to_remove = match self.terms_matching_strategy { + self.nodes_to_remove = match self.terms_matching_strategy { TermsMatchingStrategy::Last => { - let mut all_positions = BTreeSet::new(); - for (_, n) in parent_query_graph.nodes.iter() { - match &n.data { - QueryNodeData::Term(term) => { - all_positions.extend(term.positions.clone()); - } - QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {} - } - } - let mut r: Vec = all_positions.into_iter().collect(); - // don't remove the first term - r.remove(0); - r + parent_query_graph.removal_order_for_terms_matching_strategy_last() + } + TermsMatchingStrategy::All => { + vec![] } - TermsMatchingStrategy::All => vec![], }; - self.positions_to_remove = positions_to_remove; self.iterating = true; Ok(()) } @@ -83,18 +69,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?; let child_query_graph = query_graph.clone(); - loop { - if self.positions_to_remove.is_empty() { - self.exhausted = true; - break; - } else { - let position_to_remove = self.positions_to_remove.pop().unwrap(); - let did_delete_any_node = - query_graph.remove_words_starting_at_position(position_to_remove); - if did_delete_any_node { - break; - } - } + + if self.nodes_to_remove.is_empty() { + self.exhausted = true; + } else { + let nodes_to_remove = self.nodes_to_remove.pop().unwrap(); + query_graph.remove_nodes(&nodes_to_remove.iter().collect::>()); } Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket })) @@ -107,7 +87,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { ) { self.iterating = false; self.exhausted = true; - self.positions_to_remove = vec![]; + self.nodes_to_remove = vec![]; self.query_graph = None; } } 
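
Taken together, the two patches above replace the old position-based word removal with a precomputed removal order: a list of node buckets, cheapest bucket first, which the words ranking rule then consumes one bucket per iteration. Below is a minimal, self-contained sketch of that consumption pattern. It is an illustration only: plain `u8` term indices and `Vec`s stand in for the real `Interned<QueryNode>` ids and `SmallBitmap` buckets, and `removal_order_last` is an invented name.

    // One bucket per term, cheapest (= last term of the query) first; term 0
    // is deliberately left out so that the first word is never removed.
    fn removal_order_last(n_terms: u8) -> Vec<Vec<u8>> {
        (1..n_terms).rev().map(|t| vec![t]).collect()
    }

    fn main() {
        let mut buckets = removal_order_last(4); // [[3], [2], [1]]
        buckets.reverse(); // the ranking rule pops from the back of the Vec
        let mut query: Vec<u8> = (0..4).collect();
        while let Some(bucket) = buckets.pop() {
            query.retain(|t| !bucket.contains(t));
            println!("removed {bucket:?}, query is now {query:?}");
        }
        // removes term 3, then term 2, then term 1; term 0 always survives
    }

Because `Vec::pop` takes buckets from the back, the order must be reversed before iteration starts; that is the `ns.reverse()` call the next patch adds.
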
From d48cdc67a07b0e6dcdfdff2097e26865453db473 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 14:01:52 +0200 Subject: [PATCH 124/234] Fix term matching strategy bugs --- milli/src/search/new/graph_based_ranking_rule.rs | 5 +---- milli/src/search/new/query_graph.rs | 6 +++--- milli/src/search/new/words.rs | 4 +++- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 4bb31fc43..88df56448 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -107,9 +107,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase query_graph: &QueryGraph, ) -> Result<()> { let removal_cost = if let Some(terms_matching_strategy) = self.terms_matching_strategy { - // oh no this is wrong! - // because - // skipping the second node should require that the first one be skipped too match terms_matching_strategy { TermsMatchingStrategy::Last => { let removal_order = @@ -123,7 +120,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase *costs.get_mut(n) = Some((cost, forbidden_nodes.clone())); } forbidden_nodes.union(&ns); - cost = 1000; + cost += 100; } costs } diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index ce3732927..368e22fda 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -300,14 +300,14 @@ impl QueryGraph { } }; let mut nodes_to_remove = BTreeMap::>::new(); - for (node_id, node) in self.nodes.iter() { + 'outer: for (node_id, node) in self.nodes.iter() { let QueryNodeData::Term(t) = &node.data else { continue }; let mut cost = 0; for id in t.term_ids.clone() { if let Some(t_cost) = cost_of_term_idx(id) { - cost += t_cost; + cost = std::cmp::max(cost, t_cost); } else { - continue; + continue 'outer; } } nodes_to_remove diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 263e9220c..b5ee5e085 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -40,7 +40,9 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { self.query_graph = Some(parent_query_graph.clone()); self.nodes_to_remove = match self.terms_matching_strategy { TermsMatchingStrategy::Last => { - parent_query_graph.removal_order_for_terms_matching_strategy_last() + let mut ns = parent_query_graph.removal_order_for_terms_matching_strategy_last(); + ns.reverse(); + ns } TermsMatchingStrategy::All => { vec![] From 0d6e8b5c31233d4548c90344f09c5b17178a5b1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 14:48:12 +0200 Subject: [PATCH 125/234] Fix phrase search bug when the phrase has only one word --- milli/src/search/new/resolve_query_graph.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 707082cab..ef7adad14 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -121,14 +121,27 @@ pub fn compute_phrase_docids( phrase: Interned, ) -> Result { let Phrase { words } = ctx.phrase_interner.get(phrase).clone(); + + if words.is_empty() { + return Ok(RoaringBitmap::new()); + } + if words.len() == 1 { + if let Some(word) = &words[0] { + if let Some(word_docids) = ctx.get_db_word_docids(*word)? 
{ + return RoaringBitmapCodec::bytes_decode(word_docids) + .ok_or(heed::Error::Decoding.into()); + } else { + return Ok(RoaringBitmap::new()); + } + } else { + return Ok(RoaringBitmap::new()); + } + } + let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = words.len().min(3); - if words.is_empty() { - return Ok(candidates); - } - for win in words.windows(winsize) { // Get all the documents with the matching distance for each word pairs. let mut bitmaps = Vec::with_capacity(winsize.pow(2)); From 061b1e6d7cb35b46a563b0f9b8505beed063b7f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 14:49:25 +0200 Subject: [PATCH 126/234] Tiny refactor of query graph remove_nodes method --- milli/src/search/new/mod.rs | 2 +- milli/src/search/new/query_graph.rs | 65 ++++++++++++++++++++--------- milli/src/search/new/words.rs | 2 +- 3 files changed, 48 insertions(+), 21 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 5628ee1a9..92e8882be 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -84,7 +84,7 @@ fn resolve_maximally_reduced_query_graph( .collect(), TermsMatchingStrategy::All => vec![], }; - graph.remove_nodes(&nodes_to_remove); + graph.remove_nodes_keep_edges(&nodes_to_remove); logger.query_for_universe(&graph); let docids = compute_query_graph_docids(ctx, &graph, universe)?; diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 368e22fda..faa487299 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -177,11 +177,34 @@ impl QueryGraph { node.data = node_data; } let mut graph = QueryGraph { root_node, end_node, nodes }; - graph.rebuild_edges(); + graph.build_initial_edges(); Ok(graph) } + /// Remove the given nodes, connecting all their predecessors to all their successors. + pub fn remove_nodes_keep_edges(&mut self, nodes: &[Interned]) { + for &node_id in nodes { + let node = self.nodes.get(node_id); + let old_node_pred = node.predecessors.clone(); + let old_node_succ = node.successors.clone(); + for pred in old_node_pred.iter() { + let pred_successors = &mut self.nodes.get_mut(pred).successors; + pred_successors.remove(node_id); + pred_successors.union(&old_node_succ); + } + for succ in old_node_succ.iter() { + let succ_predecessors = &mut self.nodes.get_mut(succ).predecessors; + succ_predecessors.remove(node_id); + succ_predecessors.union(&old_node_pred); + } + let node = self.nodes.get_mut(node_id); + node.data = QueryNodeData::Deleted; + node.predecessors.clear(); + node.successors.clear(); + } + } + /// Remove the given nodes and all their edges from the query graph. pub fn remove_nodes(&mut self, nodes: &[Interned]) { for &node_id in nodes { @@ -201,10 +224,30 @@ impl QueryGraph { node.predecessors.clear(); node.successors.clear(); } - self.rebuild_edges(); + } + /// Simplify the query graph by removing all nodes that are disconnected from + /// the start or end nodes. 
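// (Illustration, not part of the patch: in a chain `start -> A -> B -> end`,
// deleting B and its edges with `remove_nodes` leaves A with no successors,
// so the next pass of the loop below prunes A as well; passes repeat until
// nothing is left dangling. The Start node is exempt from the "no
// predecessors" check and the End node from the "no successors" check,
// because those sets are legitimately empty for them.)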
+ pub fn simplify(&mut self) { + loop { + let mut nodes_to_remove = vec![]; + for (node_idx, node) in self.nodes.iter() { + if (!matches!(node.data, QueryNodeData::End | QueryNodeData::Deleted) + && node.successors.is_empty()) + || (!matches!(node.data, QueryNodeData::Start | QueryNodeData::Deleted) + && node.predecessors.is_empty()) + { + nodes_to_remove.push(node_idx); + } + } + if nodes_to_remove.is_empty() { + break; + } else { + self.remove_nodes(&nodes_to_remove); + } + } } - fn rebuild_edges(&mut self) { + fn build_initial_edges(&mut self) { for (_, node) in self.nodes.iter_mut() { node.successors.clear(); node.predecessors.clear(); @@ -253,22 +296,6 @@ impl QueryGraph { } } - /// Remove all the nodes that correspond to a word starting at the given position and rebuild - /// the edges of the graph appropriately. - pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool { - let mut nodes_to_remove = vec![]; - for (node_idx, node) in self.nodes.iter() { - let QueryNodeData::Term(LocatedQueryTermSubset { term_subset: _, positions, term_ids: _ }) = &node.data else { continue }; - if positions.start() == &position { - nodes_to_remove.push(node_idx); - } - } - - self.remove_nodes(&nodes_to_remove); - - !nodes_to_remove.is_empty() - } - pub fn removal_order_for_terms_matching_strategy_last(&self) -> Vec> { let (first_term_idx, last_term_idx) = { let mut first_term_idx = u8::MAX; diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index b5ee5e085..710af2243 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -76,7 +76,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { self.exhausted = true; } else { let nodes_to_remove = self.nodes_to_remove.pop().unwrap(); - query_graph.remove_nodes(&nodes_to_remove.iter().collect::>()); + query_graph.remove_nodes_keep_edges(&nodes_to_remove.iter().collect::>()); } Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket })) From 12b26cd54e5e6b9f23f32f86fe3e2a109db95dff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 14:54:08 +0200 Subject: [PATCH 127/234] Don't remove phrases from the query with term matching strategy Last --- milli/src/search/new/graph_based_ranking_rule.rs | 2 +- milli/src/search/new/mod.rs | 2 +- milli/src/search/new/query_graph.rs | 8 +++++++- milli/src/search/new/words.rs | 4 ++-- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 88df56448..b8c58c726 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -110,7 +110,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase match terms_matching_strategy { TermsMatchingStrategy::Last => { let removal_order = - query_graph.removal_order_for_terms_matching_strategy_last(); + query_graph.removal_order_for_terms_matching_strategy_last(ctx); let mut forbidden_nodes = SmallBitmap::for_interned_values_in(&query_graph.nodes); let mut costs = query_graph.nodes.map(|_| None); diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 92e8882be..588b89123 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -78,7 +78,7 @@ fn resolve_maximally_reduced_query_graph( let nodes_to_remove = match matching_strategy { TermsMatchingStrategy::Last => query_graph - .removal_order_for_terms_matching_strategy_last() + 
.removal_order_for_terms_matching_strategy_last(ctx)
             .iter()
             .flat_map(|x| x.iter())
             .collect(),
         TermsMatchingStrategy::All => vec![],
     };
diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index faa487299..fc0b5e4d3 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -296,7 +296,10 @@ impl QueryGraph {
         }
     }
 
-    pub fn removal_order_for_terms_matching_strategy_last(&self) -> Vec<SmallBitmap<QueryNode>> {
+    pub fn removal_order_for_terms_matching_strategy_last(
+        &self,
+        ctx: &SearchContext,
+    ) -> Vec<SmallBitmap<QueryNode>> {
         let (first_term_idx, last_term_idx) = {
             let mut first_term_idx = u8::MAX;
             let mut last_term_idx = 0u8;
@@ -329,6 +332,9 @@ impl QueryGraph {
         let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
         'outer: for (node_id, node) in self.nodes.iter() {
             let QueryNodeData::Term(t) = &node.data else { continue };
+            if ctx.term_interner.get(t.term_subset.original).zero_typo.phrase.is_some() {
+                continue;
+            }
             let mut cost = 0;
             for id in t.term_ids.clone() {
                 if let Some(t_cost) = cost_of_term_idx(id) {
diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs
index 710af2243..0036694c3 100644
--- a/milli/src/search/new/words.rs
+++ b/milli/src/search/new/words.rs
@@ -31,7 +31,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words {
     }
     fn start_iteration(
         &mut self,
-        _ctx: &mut SearchContext<'ctx>,
+        ctx: &mut SearchContext<'ctx>,
         _logger: &mut dyn SearchLogger<QueryGraph>,
         _parent_candidates: &RoaringBitmap,
         parent_query_graph: &QueryGraph,
@@ -40,7 +40,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words {
         self.query_graph = Some(parent_query_graph.clone());
         self.nodes_to_remove = match self.terms_matching_strategy {
             TermsMatchingStrategy::Last => {
-                let mut ns = parent_query_graph.removal_order_for_terms_matching_strategy_last();
+                let mut ns = parent_query_graph.removal_order_for_terms_matching_strategy_last(ctx);
                 ns.reverse();
                 ns
             }

From 9b87c36200e6db60a5519f5a72feaa2062694cc7 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Fri, 31 Mar 2023 09:19:18 +0200
Subject: [PATCH 128/234] Limit the number of derivations for a single word.

---
 milli/src/search/new/limits.rs     | 18 +++++++++
 milli/src/search/new/mod.rs        |  1 +
 milli/src/search/new/query_term.rs | 62 +++++++++++++++++++++---------
 3 files changed, 63 insertions(+), 18 deletions(-)
 create mode 100644 milli/src/search/new/limits.rs

diff --git a/milli/src/search/new/limits.rs b/milli/src/search/new/limits.rs
new file mode 100644
index 000000000..33a5a4a6c
--- /dev/null
+++ b/milli/src/search/new/limits.rs
@@ -0,0 +1,18 @@
+/// Maximum number of tokens we consider in a single search.
+// TODO: Loic, find proper value here so we don't overflow the interner.
+pub const MAX_TOKEN_COUNT: usize = 1_000;
+
+/// Maximum number of prefixes that can be derived from a single word.
+pub const MAX_PREFIX_COUNT: usize = 1_000;
+/// Maximum number of words that can be derived from a single word with a distance of one to that word.
+pub const MAX_ONE_TYPO_COUNT: usize = 150;
+/// Maximum number of words that can be derived from a single word with a distance of two to that word.
+pub const MAX_TWO_TYPOS_COUNT: usize = 50;
+
+/// Maximum number of synonym phrases that can be derived from a single word.
+pub const MAX_SYNONYM_PHRASE_COUNT: usize = 50;
+
+/// Maximum number of words inside all the synonym phrases that can be derived from a single word.
+///
+/// This limit is meant to gracefully handle the case where a word would have very long phrases as synonyms.
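/// (Illustration: with the cap below at 100 and the phrase cap above at 50, a
/// word whose synonyms are ten 12-word phrases stops accumulating after the
/// eighth phrase, since 8 * 12 = 96 still fits while 96 + 12 = 108 does not,
/// long before the 50-phrase cap would bind.)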
+pub const MAX_SYNONYM_WORD_COUNT: usize = 100; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index ef3f6c047..707ba4ea6 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -2,6 +2,7 @@ mod db_cache; mod distinct; mod graph_based_ranking_rule; mod interner; +mod limits; mod logger; mod query_graph; mod query_term; diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 049b96646..2c1b52414 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -10,7 +10,7 @@ use heed::RoTxn; use itertools::Itertools; use super::interner::{DedupInterner, Interned}; -use super::SearchContext; +use super::{limits, SearchContext}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; use crate::{CboRoaringBitmapLenCodec, Index, Result, MAX_WORD_LENGTH}; @@ -266,6 +266,9 @@ pub fn query_term_from_word( let mut stream = fst.search(prefix).into_stream(); while let Some(derived_word) = stream.next() { + if prefix_of.len() >= limits::MAX_PREFIX_COUNT { + break; + } let derived_word = std::str::from_utf8(derived_word)?.to_owned(); let derived_word_interned = ctx.word_interner.insert(derived_word); if derived_word_interned != word_interned { @@ -277,23 +280,31 @@ pub fn query_term_from_word( let dfa = build_dfa(word, 1, is_prefix); let starts = StartsWith(Str::new(get_first(word))); let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); - // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them? while let Some((derived_word, state)) = stream.next() { + if prefix_of.len() >= limits::MAX_PREFIX_COUNT + && one_typo.len() >= limits::MAX_ONE_TYPO_COUNT + { + break; + } let derived_word = std::str::from_utf8(derived_word)?; let d = dfa.distance(state.1); let derived_word_interned = ctx.word_interner.insert(derived_word.to_owned()); match d.to_u8() { 0 => { - if derived_word_interned != word_interned { + if derived_word_interned != word_interned + && prefix_of.len() < limits::MAX_PREFIX_COUNT + { prefix_of.push(derived_word_interned); } } 1 => { - one_typo.push(derived_word_interned); + if one_typo.len() < limits::MAX_PREFIX_COUNT { + one_typo.push(derived_word_interned); + } } - _ => panic!(), + _ => unreachable!("One typo dfa produced multiple typos"), } } } else { @@ -304,14 +315,21 @@ pub fn query_term_from_word( let automaton = Union(first, &second); let mut stream = fst.search_with_state(automaton).into_stream(); - // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them? 
while let Some((derived_word, state)) = stream.next() { + if prefix_of.len() >= limits::MAX_PREFIX_COUNT + && one_typo.len() >= limits::MAX_ONE_TYPO_COUNT + && two_typos.len() >= limits::MAX_TWO_TYPOS_COUNT + { + break; + } let derived_word = std::str::from_utf8(derived_word)?; let derived_word_interned = ctx.word_interner.insert(derived_word.to_owned()); // in the case the typo is on the first letter, we know the number of typo // is two - if get_first(derived_word) != get_first(word) { + if get_first(derived_word) != get_first(word) + && two_typos.len() < limits::MAX_TWO_TYPOS_COUNT + { two_typos.push(derived_word_interned); } else { // Else, we know that it is the second dfa that matched and compute the @@ -319,17 +337,23 @@ pub fn query_term_from_word( let d = second_dfa.distance((state.1).0); match d.to_u8() { 0 => { - if derived_word_interned != word_interned { + if derived_word_interned != word_interned + && prefix_of.len() < limits::MAX_PREFIX_COUNT + { prefix_of.push(derived_word_interned); } } 1 => { - one_typo.push(derived_word_interned); + if one_typo.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo.push(derived_word_interned); + } } 2 => { - two_typos.push(derived_word_interned); + if two_typos.len() < limits::MAX_TWO_TYPOS_COUNT { + two_typos.push(derived_word_interned); + } } - _ => panic!(), + _ => unreachable!("2 typos DFA produced a distance greater than 2"), } } } @@ -341,15 +365,20 @@ pub fn query_term_from_word( }); let synonyms = ctx.index.synonyms(ctx.txn)?; - + let mut synonym_word_count = 0; let synonyms = synonyms .get(&vec![word.to_owned()]) .cloned() .unwrap_or_default() .into_iter() - .map(|words| { + .take(limits::MAX_SYNONYM_PHRASE_COUNT) + .filter_map(|words| { + if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT { + return None; + } + synonym_word_count += words.len(); let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); - ctx.phrase_interner.insert(Phrase { words }) + Some(ctx.phrase_interner.insert(Phrase { words })) }) .collect(); @@ -488,10 +517,7 @@ pub fn located_query_terms_from_string( // start with the last position as we will wrap around to position 0 at the beginning of the loop below. let mut position = u16::MAX; - // TODO: Loic, find proper value here so we don't overflow the interner. 
- const MAX_TOKEN_COUNT: usize = 1_000; - - let mut peekable = query.take(MAX_TOKEN_COUNT).peekable(); + let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable(); while let Some(token) = peekable.next() { // early return if word limit is exceeded if located_terms.len() >= parts_limit { From 24e5f6f7a917ab5da6715b8a91295d877ecd5f4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 3 Apr 2023 09:17:33 +0200 Subject: [PATCH 129/234] Don't remove phrases with "last" term matching strategy --- milli/src/search/new/query_term.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 1e9b2852c..f0c4c3921 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -891,6 +891,11 @@ pub fn make_ngram( number_of_typos_allowed: &impl Fn(&str) -> u8, ) -> Result> { assert!(!terms.is_empty()); + for t in terms { + if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() { + return Ok(None); + } + } for ts in terms.windows(2) { let [t1, t2] = ts else { panic!() }; if *t1.positions.end() != t2.positions.start() - 1 { From 58fe260c720d3abf6df21b1ec14d24311c80cca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 3 Apr 2023 09:18:02 +0200 Subject: [PATCH 130/234] Allow removing all the terms from a query if it contains a phrase --- milli/src/search/new/query_graph.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index fc0b5e4d3..f0930eb01 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -322,33 +322,31 @@ impl QueryGraph { return vec![]; } let cost_of_term_idx = |term_idx: u8| { - if term_idx == first_term_idx { - None - } else { - let rank = 1 + last_term_idx - term_idx; - Some(rank as u16) - } + let rank = 1 + last_term_idx - term_idx; + rank as u16 }; let mut nodes_to_remove = BTreeMap::>::new(); - 'outer: for (node_id, node) in self.nodes.iter() { + let mut at_least_one_phrase = false; + for (node_id, node) in self.nodes.iter() { let QueryNodeData::Term(t) = &node.data else { continue }; if ctx.term_interner.get(t.term_subset.original).zero_typo.phrase.is_some() { + at_least_one_phrase = true; continue; } let mut cost = 0; for id in t.term_ids.clone() { - if let Some(t_cost) = cost_of_term_idx(id) { - cost = std::cmp::max(cost, t_cost); - } else { - continue 'outer; - } + cost = std::cmp::max(cost, cost_of_term_idx(id)); } nodes_to_remove .entry(cost) .or_insert_with(|| SmallBitmap::for_interned_values_in(&self.nodes)) .insert(node_id); } - nodes_to_remove.into_values().collect() + let mut res: Vec<_> = nodes_to_remove.into_values().collect(); + if !at_least_one_phrase { + res.pop(); + } + res } } From 0d2e7bcc130ce187721d275752c94ed9f02fb395 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 Mar 2023 16:10:10 +0200 Subject: [PATCH 131/234] Implement the previous way for the exhaustive distinct candidates --- milli/src/search/mod.rs | 11 +++++++---- milli/src/search/new/mod.rs | 16 ++++++++++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 734671990..08803b73f 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,3 +1,9 @@ +use std::fmt; + +use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; +use 
once_cell::sync::Lazy; +use roaring::bitmap::RoaringBitmap; + pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, @@ -5,10 +11,6 @@ pub use self::matches::{ use crate::{ execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext, }; -use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; -use once_cell::sync::Lazy; -use roaring::bitmap::RoaringBitmap; -use std::fmt; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -112,6 +114,7 @@ impl<'a> Search<'a> { &mut ctx, &self.query, self.terms_matching_strategy, + self.exhaustive_number_hits, &self.filter, &self.sort_criteria, self.offset, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4061df162..4d2805fef 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -37,6 +37,7 @@ use self::interner::Interner; use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::resolve_query_graph::compute_query_graph_docids; use self::sort::Sort; +use crate::search::new::distinct::{apply_distinct_rule, DistinctOutput}; use crate::{ AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy, UserError, @@ -272,6 +273,7 @@ pub fn execute_search( ctx: &mut SearchContext, query: &Option, terms_matching_strategy: TermsMatchingStrategy, + exhaustive_number_hits: bool, filters: &Option, sort_criteria: &Option>, from: usize, @@ -333,11 +335,21 @@ pub fn execute_search( )? }; + // The candidates is the universe unless the exhaustive number of hits + // is requested and a distinct attribute is set. + let mut candidates = universe; + if exhaustive_number_hits { + if let Some(f) = ctx.index.distinct_field(ctx.txn)? 
{ + if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) { + candidates = apply_distinct_rule(ctx, distinct_fid, &candidates)?.remaining; + } + } + } + Ok(SearchResult { // TODO: correct matching words matching_words: MatchingWords::default(), - // TODO: candidates with distinct - candidates: universe, + candidates, documents_ids, }) } From 4708d9b01633b88fa25baaafc4d6155631775578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 3 Apr 2023 10:09:27 +0200 Subject: [PATCH 132/234] Fix compiler warnings/errors --- milli/examples/search.rs | 1 + milli/src/search/new/mod.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 1757c1c5b..ff7d564c6 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -52,6 +52,7 @@ fn main() -> Result<(), Box> { &(!query.trim().is_empty()).then(|| query.trim().to_owned()), // what a the from which when there is TermsMatchingStrategy::Last, + false, &None, &None, 0, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4d2805fef..4d561d25b 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -37,7 +37,7 @@ use self::interner::Interner; use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::resolve_query_graph::compute_query_graph_docids; use self::sort::Sort; -use crate::search::new::distinct::{apply_distinct_rule, DistinctOutput}; +use crate::search::new::distinct::apply_distinct_rule; use crate::{ AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy, UserError, From 3f13608002355ed547590af925f283f6406e590c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 3 Apr 2023 15:27:49 +0200 Subject: [PATCH 133/234] Fix computation of ngram derivations --- milli/src/search/new/query_term.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index d19ab6135..9d59b9999 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -1017,16 +1017,13 @@ pub fn make_ngram( if ngram_str.len() > MAX_WORD_LENGTH { return Ok(None); } + let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone()); let max_nbr_typos = number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1); let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; - // let (_, mut zero_typo, mut one_typo, two_typo) = - // all_subterms_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; - let original = ctx.word_interner.insert(words.join(" ")); - // Now add the synonyms let index_synonyms = ctx.index.synonyms(ctx.txn)?; @@ -1038,7 +1035,7 @@ pub fn make_ngram( ); let term = QueryTerm { - original, + original: ngram_str_interned, is_multiple_words: true, is_prefix, max_nbr_typos, From 4129d657e2f36c173dcf89f4a5aa6f7da052d1f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Apr 2023 15:01:42 +0200 Subject: [PATCH 134/234] Simplify query_term module a bit --- milli/src/search/new/db_cache.rs | 13 ++ milli/src/search/new/query_term.rs | 274 ++++++++++++----------------- 2 files changed, 129 insertions(+), 158 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index af94108e2..b780af39f 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -1,3 +1,4 @@ +use 
std::borrow::Cow; use std::collections::hash_map::Entry; use std::hash::Hash; @@ -24,6 +25,8 @@ pub struct DatabaseCache<'ctx> { pub word_docids: FxHashMap, Option<&'ctx [u8]>>, pub exact_word_docids: FxHashMap, Option<&'ctx [u8]>>, pub word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, + + pub words_fst: Option>>, } impl<'ctx> DatabaseCache<'ctx> { fn get_value<'v, K1, KC>( @@ -49,6 +52,16 @@ impl<'ctx> DatabaseCache<'ctx> { } } impl<'ctx> SearchContext<'ctx> { + pub fn get_words_fst(&mut self) -> Result>> { + if let Some(fst) = self.db_cache.words_fst.clone() { + Ok(fst) + } else { + let fst = self.index.words_fst(self.txn)?; + self.db_cache.words_fst = Some(fst.clone()); + Ok(fst) + } + } + /// Retrieve or insert the given value in the `word_docids` database. pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { DatabaseCache::get_value( diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 9d59b9999..15e106e06 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -7,14 +7,14 @@ use charabia::{SeparatorKind, TokenKind}; use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use heed::types::DecodeIgnore; -use heed::RoTxn; +use heed::BytesDecode; use itertools::Itertools; use super::interner::{DedupInterner, Interned}; use super::{limits, SearchContext}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; -use crate::{CboRoaringBitmapLenCodec, Index, Result, MAX_WORD_LENGTH}; +use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH}; /// A phrase in the user's search query, consisting of several words /// that must appear side-by-side in the search results. @@ -191,18 +191,13 @@ impl QueryTermSubset { &self, ctx: &mut SearchContext, ) -> Result>> { - let original = ctx.term_interner.get_mut(self.original); let mut result = BTreeSet::default(); // TODO: a compute_partially funtion if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { - original.compute_fully_if_needed( - ctx.index, - ctx.txn, - &mut ctx.word_interner, - &mut ctx.phrase_interner, - )?; + self.original.compute_fully_if_needed(ctx)?; } + let original = ctx.term_interner.get_mut(self.original); if !self.zero_typo_subset.is_empty() { let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = &original.zero_typo; @@ -245,18 +240,13 @@ impl QueryTermSubset { Ok(result) } pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result>> { - let original = ctx.term_interner.get_mut(self.original); let mut result = BTreeSet::default(); if !self.one_typo_subset.is_empty() { // TODO: compute less than fully if possible - original.compute_fully_if_needed( - ctx.index, - ctx.txn, - &mut ctx.word_interner, - &mut ctx.phrase_interner, - )?; + self.original.compute_fully_if_needed(ctx)?; } + let original = ctx.term_interner.get_mut(self.original); let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = &original.zero_typo; @@ -274,26 +264,23 @@ impl QueryTermSubset { } } -impl QueryTerm { - pub fn compute_fully_if_needed( - &mut self, - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - ) -> Result<()> { - if self.max_nbr_typos == 0 { - self.one_typo = Lazy::Init(OneTypoTerm::default()); - self.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if self.max_nbr_typos == 1 && self.one_typo.is_uninit() { - 
assert!(self.two_typo.is_uninit()); - self.initialize_one_typo_subterm(index, txn, word_interner, phrase_interner)?; - assert!(self.one_typo.is_init()); - self.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if self.max_nbr_typos > 1 && self.two_typo.is_uninit() { - assert!(self.two_typo.is_uninit()); - self.initialize_one_and_two_typo_subterm(index, txn, word_interner, phrase_interner)?; - assert!(self.one_typo.is_init() && self.two_typo.is_init()); +impl Interned { + pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { + let s = ctx.term_interner.get_mut(self); + if s.max_nbr_typos == 0 { + s.one_typo = Lazy::Init(OneTypoTerm::default()); + s.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() { + assert!(s.two_typo.is_uninit()); + self.initialize_one_typo_subterm(ctx)?; + let s = ctx.term_interner.get_mut(self); + assert!(s.one_typo.is_init()); + s.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { + assert!(s.two_typo.is_uninit()); + self.initialize_one_and_two_typo_subterm(ctx)?; + let s = ctx.term_interner.get_mut(self); + assert!(s.one_typo.is_init() && s.two_typo.is_init()); } Ok(()) } @@ -302,7 +289,7 @@ impl QueryTerm { #[derive(Clone, PartialEq, Eq, Hash)] pub struct QueryTerm { pub original: Interned, - pub is_multiple_words: bool, + pub ngram_words: Option>>, pub max_nbr_typos: u8, pub is_prefix: bool, pub zero_typo: ZeroTypoTerm, @@ -363,39 +350,6 @@ impl TwoTypoTerm { } impl QueryTerm { - pub fn phrase( - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - phrase: Phrase, - ) -> Self { - Self { - original: word_interner.insert(phrase.description(word_interner)), - is_multiple_words: false, - max_nbr_typos: 0, - is_prefix: false, - zero_typo: ZeroTypoTerm { - phrase: Some(phrase_interner.insert(phrase)), - zero_typo: None, - prefix_of: BTreeSet::default(), - synonyms: BTreeSet::default(), - use_prefix_db: None, - }, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - } - } - pub fn empty(word_interner: &mut DedupInterner, original: &str) -> Self { - Self { - original: word_interner.insert(original.to_owned()), - is_multiple_words: false, - is_prefix: false, - max_nbr_typos: 0, - zero_typo: <_>::default(), - one_typo: Lazy::Init(<_>::default()), - two_typo: Lazy::Init(<_>::default()), - } - } - pub fn is_empty(&self) -> bool { let Lazy::Init(one_typo) = &self.one_typo else { return false; @@ -438,13 +392,13 @@ fn find_zero_typo_prefix_derivations( } fn find_zero_one_typo_derivations( + ctx: &mut SearchContext, word_interned: Interned, is_prefix: bool, - fst: fst::Set>, - word_interner: &mut DedupInterner, mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, ) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); + let fst = ctx.get_words_fst()?; + let word = ctx.word_interner.get(word_interned).to_owned(); let word = word.as_str(); let dfa = build_dfa(word, 1, is_prefix); @@ -453,7 +407,7 @@ fn find_zero_one_typo_derivations( while let Some((derived_word, state)) = stream.next() { let derived_word = std::str::from_utf8(derived_word)?; - let derived_word = word_interner.insert(derived_word.to_owned()); + let derived_word = ctx.word_interner.insert(derived_word.to_owned()); let d = dfa.distance(state.1); match d.to_u8() { 0 => { @@ -553,7 +507,17 @@ fn partially_initialized_term_from_word( let word_interned = ctx.word_interner.insert(word.to_owned()); if word.len() > MAX_WORD_LENGTH { - 
return Ok(QueryTerm::empty(&mut ctx.word_interner, word)); + return Ok({ + QueryTerm { + original: ctx.word_interner.insert(word.to_owned()), + ngram_words: None, + is_prefix: false, + max_nbr_typos: 0, + zero_typo: <_>::default(), + one_typo: Lazy::Init(<_>::default()), + two_typo: Lazy::Init(<_>::default()), + } + }); } let fst = ctx.index.words_fst(ctx.txn)?; @@ -610,7 +574,7 @@ fn partially_initialized_term_from_word( Ok(QueryTerm { original: word_interned, - is_multiple_words: false, + ngram_words: None, max_nbr_typos: max_typo, is_prefix, zero_typo, @@ -619,72 +583,52 @@ fn partially_initialized_term_from_word( }) } -fn find_split_words( - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - word: &str, -) -> Result>> { - let split_words = split_best_frequency(index, txn, word)?.map(|(l, r)| { - phrase_interner.insert(Phrase { - words: vec![Some(word_interner.insert(l)), Some(word_interner.insert(r))], - }) - }); - Ok(split_words) +fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result>> { + if let Some((l, r)) = split_best_frequency(ctx, word)? { + Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] }))) + } else { + Ok(None) + } } -impl QueryTerm { - fn initialize_one_typo_subterm( - &mut self, - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - ) -> Result<()> { - let QueryTerm { original, is_prefix, one_typo, .. } = self; - let original_str = word_interner.get(*original).to_owned(); +impl Interned { + fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { + let self_mut = ctx.term_interner.get_mut(self); + let QueryTerm { original, is_prefix, one_typo, .. } = self_mut; + let original = *original; + let is_prefix = *is_prefix; + // let original_str = ctx.word_interner.get(*original).to_owned(); if one_typo.is_init() { return Ok(()); } let mut one_typo_words = BTreeSet::new(); - find_zero_one_typo_derivations( - *original, - *is_prefix, - index.words_fst(txn)?, - word_interner, - |derived_word, nbr_typos| { - match nbr_typos { - ZeroOrOneTypo::Zero => {} - ZeroOrOneTypo::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } else { - return Ok(ControlFlow::Break(())); - } + find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { + match nbr_typos { + ZeroOrOneTypo::Zero => {} + ZeroOrOneTypo::One => { + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + } else { + return Ok(ControlFlow::Break(())); } } - Ok(ControlFlow::Continue(())) - }, - )?; - let split_words = - find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?; + } + Ok(ControlFlow::Continue(())) + })?; + let original_str = ctx.word_interner.get(original).to_owned(); + let split_words = find_split_words(ctx, original_str.as_str())?; let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; - self.one_typo = Lazy::Init(one_typo); + let self_mut = ctx.term_interner.get_mut(self); + self_mut.one_typo = Lazy::Init(one_typo); Ok(()) } - fn initialize_one_and_two_typo_subterm( - &mut self, - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - ) -> Result<()> { - let QueryTerm { original, is_prefix, two_typo, .. 
} = self; - let original_str = word_interner.get(*original).to_owned(); + fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { + let self_mut = ctx.term_interner.get_mut(self); + let QueryTerm { original, is_prefix, two_typo, .. } = self_mut; + let original_str = ctx.word_interner.get(*original).to_owned(); if two_typo.is_init() { return Ok(()); } @@ -694,8 +638,8 @@ impl QueryTerm { find_zero_one_two_typo_derivations( *original, *is_prefix, - index.words_fst(txn)?, - word_interner, + ctx.index.words_fst(ctx.txn)?, + &mut ctx.word_interner, |derived_word, nbr_typos| { if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT @@ -719,14 +663,15 @@ impl QueryTerm { Ok(ControlFlow::Continue(())) }, )?; - let split_words = - find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?; + let split_words = find_split_words(ctx, original_str.as_str())?; + let self_mut = ctx.term_interner.get_mut(self); + let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words }; let two_typo = TwoTypoTerm { two_typos: two_typo_words }; - self.one_typo = Lazy::Init(one_typo); - self.two_typo = Lazy::Init(two_typo); + self_mut.one_typo = Lazy::Init(one_typo); + self_mut.two_typo = Lazy::Init(two_typo); Ok(()) } @@ -737,38 +682,37 @@ impl QueryTerm { /// /// Return `None` if the original word cannot be split. fn split_best_frequency( - index: &Index, - txn: &RoTxn, + ctx: &mut SearchContext, original: &str, -) -> Result> { +) -> Result, Interned)>> { let chars = original.char_indices().skip(1); let mut best = None; for (i, _) in chars { let (left, right) = original.split_at(i); + let left = ctx.word_interner.insert(left.to_owned()); + let right = ctx.word_interner.insert(right.to_owned()); - let key = (1, left, right); - let frequency = index - .word_pair_proximity_docids - .remap_data_type::() - .get(txn, &key)? - .unwrap_or(0); - - if frequency != 0 && best.map_or(true, |(old, _, _)| frequency > old) { - best = Some((frequency, left, right)); + if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? 
{ + let frequency = + CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?; + if best.map_or(true, |(old, _, _)| frequency > old) { + best = Some((frequency, left, right)); + } } } - Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned()))) + Ok(best.map(|(_, left, right)| (left, right))) } -impl QueryTerm { +impl Interned { /// Return the original word from the given query term - pub fn original_single_word(&self) -> Option> { - if self.is_multiple_words { + pub fn original_single_word(self, ctx: &SearchContext) -> Option> { + let self_ = ctx.term_interner.get(self); + if self_.ngram_words.is_some() { None } else { - Some(self.original) + Some(self_.original) } } } @@ -824,11 +768,25 @@ impl PhraseBuilder { return None; } Some(LocatedQueryTerm { - value: ctx.term_interner.push(QueryTerm::phrase( - &mut ctx.word_interner, - &mut ctx.phrase_interner, - Phrase { words: self.words }, - )), + value: ctx.term_interner.push({ + let phrase = Phrase { words: self.words }; + let phrase_desc = phrase.description(&ctx.word_interner); + QueryTerm { + original: ctx.word_interner.insert(phrase_desc), + ngram_words: None, + max_nbr_typos: 0, + is_prefix: false, + zero_typo: ZeroTypoTerm { + phrase: Some(ctx.phrase_interner.insert(phrase)), + zero_typo: None, + prefix_of: BTreeSet::default(), + synonyms: BTreeSet::default(), + use_prefix_db: None, + }, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, + } + }), positions: self.start..=self.end, }) } @@ -1001,7 +959,7 @@ pub fn make_ngram( } let mut words_interned = vec![]; for term in terms { - if let Some(original_term_word) = ctx.term_interner.get(term.value).original_single_word() { + if let Some(original_term_word) = term.value.original_single_word(ctx) { words_interned.push(original_term_word); } else { return Ok(None); @@ -1036,7 +994,7 @@ pub fn make_ngram( let term = QueryTerm { original: ngram_str_interned, - is_multiple_words: true, + ngram_words: Some(words_interned), is_prefix, max_nbr_typos, zero_typo: term.zero_typo, From faceb661e301c4dbc0590c3b5c26e3c9870f90f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Apr 2023 15:02:01 +0200 Subject: [PATCH 135/234] Add note that a part of the code needs fixing --- milli/src/search/new/query_graph.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index cba9e590f..1eede33c2 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -431,6 +431,9 @@ impl QueryGraph { let (start_term, dest_term) = node; let end_term = Interned::from_raw(dest_term.into_raw()); let src = if let Some(start_term) = start_term { + // TODO: this is incorrect! should take the intersection + // between the prev node and the start term if they refer to the same + // original query term! 
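// (A sketch of the fix this TODO asks for; hypothetical, not part of the
// patch. If the previous node holds a subset of the same original term, the
// two subsets could be combined with the `QueryTermSubset::intersect` method
// from query_term.rs, for example:
//     if prev_term_subset.original == start_term_subset.original {
//         start_term_subset.intersect(&prev_term_subset);
//     }
// so that only the interpretations allowed by both nodes survive.)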
let start_term = Interned::from_raw(start_term.into_raw()); nodes.get_mut(prev_node).successors.insert(start_term); nodes.get_mut(start_term).predecessors.insert(prev_node); From b439d36807f663d29953a1fe7dfc6488524a136b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Apr 2023 15:38:30 +0200 Subject: [PATCH 136/234] Split query_term module into multiple submodules --- milli/src/search/new/logger/detailed.rs | 89 +- milli/src/search/new/logger/mod.rs | 2 + milli/src/search/new/mod.rs | 3 + milli/src/search/new/query_graph.rs | 26 +- milli/src/search/new/query_term.rs | 1008 ----------------- .../new/query_term/compute_derivations.rs | 380 +++++++ milli/src/search/new/query_term/mod.rs | 331 ++++++ .../src/search/new/query_term/ntypo_subset.rs | 80 ++ .../src/search/new/query_term/parse_query.rs | 281 +++++ milli/src/search/new/query_term/phrase.rs | 16 + .../new/ranking_rule_graph/proximity/mod.rs | 4 +- .../search/new/ranking_rule_graph/typo/mod.rs | 24 +- 12 files changed, 1122 insertions(+), 1122 deletions(-) delete mode 100644 milli/src/search/new/query_term.rs create mode 100644 milli/src/search/new/query_term/compute_derivations.rs create mode 100644 milli/src/search/new/query_term/mod.rs create mode 100644 milli/src/search/new/query_term/ntypo_subset.rs create mode 100644 milli/src/search/new/query_term/parse_query.rs create mode 100644 milli/src/search/new/query_term/phrase.rs diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 3a02950a8..86568d5d2 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -8,9 +8,7 @@ use roaring::RoaringBitmap; use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNodeData; -use crate::search::new::query_term::{ - Lazy, LocatedQueryTermSubset, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm, -}; +use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::ranking_rule_graph::{ DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph, @@ -439,87 +437,26 @@ results.{cur_ranking_rule}{cur_activated_id} {{ positions: _, term_ids: _, }) => { - let QueryTerm { - original, - is_multiple_words: _, - is_prefix: _, - max_nbr_typos, - zero_typo, - one_typo, - two_typo, - } = ctx.term_interner.get(term_subset.original); - - let original = ctx.word_interner.get(*original); writeln!( file, - "{node_idx} : \"{original}\" {{ + "{node_idx} : \"{}\" {{ shape: class - max_nbr_typo: {max_nbr_typos}" + max_nbr_typo: {}", + term_subset.description(ctx), + term_subset.max_nbr_typos(ctx) ) .unwrap(); - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = - zero_typo; - - for w in zero_typo.iter().copied() { - if term_subset.zero_typo_subset.contains_word(w) { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 0").unwrap(); - } + for w in term_subset.all_single_words_except_prefix_db(ctx).unwrap() { + let w = ctx.word_interner.get(w); + writeln!(file, "{w}: word").unwrap(); } - for w in prefix_of.iter().copied() { - if term_subset.zero_typo_subset.contains_word(w) { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 0P").unwrap(); - } + for p in term_subset.all_phrases(ctx).unwrap() { + writeln!(file, "{}: phrase", p.description(ctx)).unwrap(); } - - if let Some(phrase) = phrase { - if term_subset.zero_typo_subset.contains_phrase(*phrase) { - let phrase = 
ctx.phrase_interner.get(*phrase); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : phrase").unwrap(); - } - } - - for synonym in synonyms.iter().copied() { - if term_subset.zero_typo_subset.contains_phrase(synonym) { - let phrase = ctx.phrase_interner.get(synonym); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); - } - } - if let Some(use_prefix_db) = use_prefix_db { - if term_subset.zero_typo_subset.contains_word(*use_prefix_db) { - let p = ctx.word_interner.get(*use_prefix_db); - writeln!(file, "use prefix DB : {p}").unwrap(); - } - } - if let Lazy::Init(one_typo) = one_typo { - let OneTypoTerm { split_words, one_typo } = one_typo; - - for w in one_typo.iter().copied() { - if term_subset.one_typo_subset.contains_word(w) { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 1").unwrap(); - } - } - if let Some(split_words) = split_words { - if term_subset.one_typo_subset.contains_phrase(*split_words) { - let phrase = ctx.phrase_interner.get(*split_words); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); - } - } - } - if let Lazy::Init(two_typo) = two_typo { - let TwoTypoTerm { two_typos } = two_typo; - for w in two_typos.iter().copied() { - if term_subset.two_typo_subset.contains_word(w) { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 2").unwrap(); - } - } + if let Some(w) = term_subset.use_prefix_db(ctx) { + let w = ctx.word_interner.get(w); + writeln!(file, "{w}: prefix db").unwrap(); } writeln!(file, "}}").unwrap(); diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 889e811ad..15cb78784 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -1,6 +1,8 @@ // #[cfg(test)] pub mod detailed; +pub mod test_logger; + use roaring::RoaringBitmap; use super::interner::{Interned, MappedInterner}; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4d561d25b..4456d693d 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -17,6 +17,9 @@ mod sort; // TODO: documentation + comments mod words; +#[cfg(test)] +mod tests; + use std::collections::HashSet; use charabia::TokenizerBuilder; diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 1eede33c2..33e178494 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,7 +1,6 @@ use super::interner::{FixedSizeInterner, Interned}; use super::query_term::{ - self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, NTypoTermSubset, - QueryTermSubset, + self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset, }; use super::small_bitmap::SmallBitmap; use super::SearchContext; @@ -107,12 +106,7 @@ impl QueryGraph { let new_node_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { - term_subset: QueryTermSubset { - original: Interned::from_raw(term_idx as u16), - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - }, + term_subset: QueryTermSubset::full(Interned::from_raw(term_idx as u16)), positions: terms[term_idx].positions.clone(), term_ids: term_idx as u8..=term_idx as u8, }), @@ -126,12 +120,7 @@ impl QueryGraph { let ngram_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { - 
term_subset: QueryTermSubset { - original: ngram.value, - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - }, + term_subset: QueryTermSubset::full(ngram.value), positions: ngram.positions, term_ids: term_idx as u8 - 1..=term_idx as u8, }), @@ -146,12 +135,7 @@ impl QueryGraph { let ngram_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { - term_subset: QueryTermSubset { - original: ngram.value, - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - }, + term_subset: QueryTermSubset::full(ngram.value), positions: ngram.positions, term_ids: term_idx as u8 - 2..=term_idx as u8, }), @@ -329,7 +313,7 @@ impl QueryGraph { let mut at_least_one_phrase = false; for (node_id, node) in self.nodes.iter() { let QueryNodeData::Term(t) = &node.data else { continue }; - if ctx.term_interner.get(t.term_subset.original).zero_typo.phrase.is_some() { + if t.term_subset.original_phrase(ctx).is_some() { at_least_one_phrase = true; continue; } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs deleted file mode 100644 index 15e106e06..000000000 --- a/milli/src/search/new/query_term.rs +++ /dev/null @@ -1,1008 +0,0 @@ -use std::borrow::Cow; -use std::collections::BTreeSet; -use std::ops::{ControlFlow, RangeInclusive}; - -use charabia::normalizer::NormalizedTokenIter; -use charabia::{SeparatorKind, TokenKind}; -use fst::automaton::Str; -use fst::{Automaton, IntoStreamer, Streamer}; -use heed::types::DecodeIgnore; -use heed::BytesDecode; -use itertools::Itertools; - -use super::interner::{DedupInterner, Interned}; -use super::{limits, SearchContext}; -use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; -use crate::search::{build_dfa, get_first}; -use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH}; - -/// A phrase in the user's search query, consisting of several words -/// that must appear side-by-side in the search results. 
-#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct Phrase { - pub words: Vec>>, -} -impl Phrase { - pub fn description(&self, interner: &DedupInterner) -> String { - self.words.iter().flatten().map(|w| interner.get(*w)).join(" ") - } -} - -#[derive(Clone, PartialEq, Eq, Hash)] -pub enum Lazy { - Uninit, - Init(T), -} -impl Lazy { - pub fn is_init(&self) -> bool { - match self { - Lazy::Uninit => false, - Lazy::Init(_) => true, - } - } - pub fn is_uninit(&self) -> bool { - match self { - Lazy::Uninit => true, - Lazy::Init(_) => false, - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum NTypoTermSubset { - All, - Subset { - words: BTreeSet>, - phrases: BTreeSet>, - // TODO: prefixes: BTreeSet>, - }, - Nothing, -} - -impl NTypoTermSubset { - pub fn contains_word(&self, word: Interned) -> bool { - match self { - NTypoTermSubset::All => true, - NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word), - NTypoTermSubset::Nothing => false, - } - } - pub fn contains_phrase(&self, phrase: Interned) -> bool { - match self { - NTypoTermSubset::All => true, - NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase), - NTypoTermSubset::Nothing => false, - } - } - pub fn is_empty(&self) -> bool { - match self { - NTypoTermSubset::All => false, - NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(), - NTypoTermSubset::Nothing => true, - } - } - pub fn union(&mut self, other: &Self) { - match self { - Self::All => {} - Self::Subset { words, phrases } => match other { - Self::All => { - *self = Self::All; - } - Self::Subset { words: w2, phrases: p2 } => { - words.extend(w2); - phrases.extend(p2); - } - Self::Nothing => {} - }, - Self::Nothing => { - *self = other.clone(); - } - } - } - pub fn intersect(&mut self, other: &Self) { - match self { - Self::All => *self = other.clone(), - Self::Subset { words, phrases } => match other { - Self::All => {} - Self::Subset { words: w2, phrases: p2 } => { - let mut ws = BTreeSet::new(); - for w in words.intersection(w2) { - ws.insert(*w); - } - let mut ps = BTreeSet::new(); - for p in phrases.intersection(p2) { - ps.insert(*p); - } - *words = ws; - *phrases = ps; - } - Self::Nothing => *self = Self::Nothing, - }, - Self::Nothing => {} - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct QueryTermSubset { - pub original: Interned, - pub zero_typo_subset: NTypoTermSubset, - pub one_typo_subset: NTypoTermSubset, - pub two_typo_subset: NTypoTermSubset, -} - -#[derive(Clone, PartialEq, Eq, Hash)] -pub struct LocatedQueryTermSubset { - pub term_subset: QueryTermSubset, - pub positions: RangeInclusive, - pub term_ids: RangeInclusive, -} - -impl QueryTermSubset { - pub fn empty(for_term: Interned) -> Self { - Self { - original: for_term, - zero_typo_subset: NTypoTermSubset::Nothing, - one_typo_subset: NTypoTermSubset::Nothing, - two_typo_subset: NTypoTermSubset::Nothing, - } - } - pub fn full(for_term: Interned) -> Self { - Self { - original: for_term, - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - } - } - - pub fn union(&mut self, other: &Self) { - assert!(self.original == other.original); - self.zero_typo_subset.union(&other.zero_typo_subset); - self.one_typo_subset.union(&other.one_typo_subset); - self.two_typo_subset.union(&other.two_typo_subset); - } - pub fn intersect(&mut self, other: &Self) { - assert!(self.original == other.original); - 
self.zero_typo_subset.intersect(&other.zero_typo_subset); - self.one_typo_subset.intersect(&other.one_typo_subset); - self.two_typo_subset.intersect(&other.two_typo_subset); - } - - pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option> { - let original = ctx.term_interner.get(self.original); - let Some(use_prefix_db) = original.zero_typo.use_prefix_db else { - return None - }; - match &self.zero_typo_subset { - NTypoTermSubset::All => Some(use_prefix_db), - NTypoTermSubset::Subset { words, phrases: _ } => { - // TODO: use a subset of prefix words instead - if words.contains(&use_prefix_db) { - Some(use_prefix_db) - } else { - None - } - } - NTypoTermSubset::Nothing => None, - } - } - pub fn all_single_words_except_prefix_db( - &self, - ctx: &mut SearchContext, - ) -> Result>> { - let mut result = BTreeSet::default(); - // TODO: a compute_partially funtion - if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { - self.original.compute_fully_if_needed(ctx)?; - } - - let original = ctx.term_interner.get_mut(self.original); - if !self.zero_typo_subset.is_empty() { - let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = - &original.zero_typo; - result.extend(zero_typo.iter().copied()); - result.extend(prefix_of.iter().copied()); - }; - - match &self.one_typo_subset { - NTypoTermSubset::All => { - let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { - panic!() - }; - result.extend(one_typo.iter().copied()) - } - NTypoTermSubset::Subset { words, phrases: _ } => { - let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { - panic!() - }; - result.extend(one_typo.intersection(words)); - } - NTypoTermSubset::Nothing => {} - }; - - match &self.two_typo_subset { - NTypoTermSubset::All => { - let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { - panic!() - }; - result.extend(two_typos.iter().copied()); - } - NTypoTermSubset::Subset { words, phrases: _ } => { - let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { - panic!() - }; - result.extend(two_typos.intersection(words)); - } - NTypoTermSubset::Nothing => {} - }; - - Ok(result) - } - pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result>> { - let mut result = BTreeSet::default(); - - if !self.one_typo_subset.is_empty() { - // TODO: compute less than fully if possible - self.original.compute_fully_if_needed(ctx)?; - } - let original = ctx.term_interner.get_mut(self.original); - - let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = - &original.zero_typo; - result.extend(phrase.iter().copied()); - result.extend(synonyms.iter().copied()); - - if !self.one_typo_subset.is_empty() { - let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else { - panic!(); - }; - result.extend(split_words.iter().copied()); - } - - Ok(result) - } -} - -impl Interned { - pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { - let s = ctx.term_interner.get_mut(self); - if s.max_nbr_typos == 0 { - s.one_typo = Lazy::Init(OneTypoTerm::default()); - s.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() { - assert!(s.two_typo.is_uninit()); - self.initialize_one_typo_subterm(ctx)?; - let s = ctx.term_interner.get_mut(self); - assert!(s.one_typo.is_init()); - s.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { - 
assert!(s.two_typo.is_uninit()); - self.initialize_one_and_two_typo_subterm(ctx)?; - let s = ctx.term_interner.get_mut(self); - assert!(s.one_typo.is_init() && s.two_typo.is_init()); - } - Ok(()) - } -} - -#[derive(Clone, PartialEq, Eq, Hash)] -pub struct QueryTerm { - pub original: Interned, - pub ngram_words: Option>>, - pub max_nbr_typos: u8, - pub is_prefix: bool, - pub zero_typo: ZeroTypoTerm, - // May not be computed yet - pub one_typo: Lazy, - // May not be computed yet - pub two_typo: Lazy, -} - -// SubTerms will be in a dedup interner -#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct ZeroTypoTerm { - /// The original phrase, if any - pub phrase: Option>, - /// A single word equivalent to the original term, with zero typos - pub zero_typo: Option>, - /// All the words that contain the original word as prefix - pub prefix_of: BTreeSet>, - /// All the synonyms of the original word or phrase - pub synonyms: BTreeSet>, - /// A prefix in the prefix databases matching the original word - pub use_prefix_db: Option>, -} -#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct OneTypoTerm { - /// The original word split into multiple consecutive words - pub split_words: Option>, - /// Words that are 1 typo away from the original word - pub one_typo: BTreeSet>, -} -#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct TwoTypoTerm { - /// Words that are 2 typos away from the original word - pub two_typos: BTreeSet>, -} - -impl ZeroTypoTerm { - fn is_empty(&self) -> bool { - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self; - phrase.is_none() - && zero_typo.is_none() - && prefix_of.is_empty() - && synonyms.is_empty() - && use_prefix_db.is_none() - } -} -impl OneTypoTerm { - fn is_empty(&self) -> bool { - let OneTypoTerm { split_words, one_typo } = self; - one_typo.is_empty() && split_words.is_none() - } -} -impl TwoTypoTerm { - fn is_empty(&self) -> bool { - let TwoTypoTerm { two_typos } = self; - two_typos.is_empty() - } -} - -impl QueryTerm { - pub fn is_empty(&self) -> bool { - let Lazy::Init(one_typo) = &self.one_typo else { - return false; - }; - let Lazy::Init(two_typo) = &self.two_typo else { - return false; - }; - - self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty() - } -} - -pub enum ZeroOrOneTypo { - Zero, - One, -} - -fn find_zero_typo_prefix_derivations( - word_interned: Interned, - fst: fst::Set>, - word_interner: &mut DedupInterner, - mut visit: impl FnMut(Interned) -> Result>, -) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); - let word = word.as_str(); - let prefix = Str::new(word).starts_with(); - let mut stream = fst.search(prefix).into_stream(); - - while let Some(derived_word) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?.to_owned(); - let derived_word_interned = word_interner.insert(derived_word); - if derived_word_interned != word_interned { - let cf = visit(derived_word_interned)?; - if cf.is_break() { - break; - } - } - } - Ok(()) -} - -fn find_zero_one_typo_derivations( - ctx: &mut SearchContext, - word_interned: Interned, - is_prefix: bool, - mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, -) -> Result<()> { - let fst = ctx.get_words_fst()?; - let word = ctx.word_interner.get(word_interned).to_owned(); - let word = word.as_str(); - - let dfa = build_dfa(word, 1, is_prefix); - let starts = StartsWith(Str::new(get_first(word))); - let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); - - while let 
Some((derived_word, state)) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?; - let derived_word = ctx.word_interner.insert(derived_word.to_owned()); - let d = dfa.distance(state.1); - match d.to_u8() { - 0 => { - if derived_word != word_interned { - let cf = visit(derived_word, ZeroOrOneTypo::Zero)?; - if cf.is_break() { - break; - } - } - } - 1 => { - let cf = visit(derived_word, ZeroOrOneTypo::One)?; - if cf.is_break() { - break; - } - } - _ => { - unreachable!("One typo dfa produced multiple typos") - } - } - } - Ok(()) -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum NumberOfTypos { - Zero, - One, - Two, -} -fn find_zero_one_two_typo_derivations( - word_interned: Interned, - is_prefix: bool, - fst: fst::Set>, - word_interner: &mut DedupInterner, - mut visit: impl FnMut(Interned, NumberOfTypos) -> Result>, -) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); - let word = word.as_str(); - - let starts = StartsWith(Str::new(get_first(word))); - let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); - let second_dfa = build_dfa(word, 2, is_prefix); - let second = Intersection(&second_dfa, &starts); - let automaton = Union(first, &second); - - let mut stream = fst.search_with_state(automaton).into_stream(); - - while let Some((derived_word, state)) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?; - let derived_word_interned = word_interner.insert(derived_word.to_owned()); - // in the case the typo is on the first letter, we know the number of typo - // is two - if get_first(derived_word) != get_first(word) { - let cf = visit(derived_word_interned, NumberOfTypos::Two)?; - if cf.is_break() { - break; - } - } else { - // Else, we know that it is the second dfa that matched and compute the - // correct distance - let d = second_dfa.distance((state.1).0); - match d.to_u8() { - 0 => { - if derived_word_interned != word_interned { - let cf = visit(derived_word_interned, NumberOfTypos::Zero)?; - if cf.is_break() { - break; - } - } - } - 1 => { - let cf = visit(derived_word_interned, NumberOfTypos::One)?; - if cf.is_break() { - break; - } - } - 2 => { - let cf = visit(derived_word_interned, NumberOfTypos::Two)?; - if cf.is_break() { - break; - } - } - _ => unreachable!("2 typos DFA produced a distance greater than 2"), - } - } - } - Ok(()) -} - -fn partially_initialized_term_from_word( - ctx: &mut SearchContext, - word: &str, - max_typo: u8, - is_prefix: bool, -) -> Result { - let word_interned = ctx.word_interner.insert(word.to_owned()); - - if word.len() > MAX_WORD_LENGTH { - return Ok({ - QueryTerm { - original: ctx.word_interner.insert(word.to_owned()), - ngram_words: None, - is_prefix: false, - max_nbr_typos: 0, - zero_typo: <_>::default(), - one_typo: Lazy::Init(<_>::default()), - two_typo: Lazy::Init(<_>::default()), - } - }); - } - - let fst = ctx.index.words_fst(ctx.txn)?; - - let use_prefix_db = is_prefix - && ctx - .index - .word_prefix_docids - .remap_data_type::() - .get(ctx.txn, word)? 
- .is_some(); - let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None }; - - let mut zero_typo = None; - let mut prefix_of = BTreeSet::new(); - - if fst.contains(word) { - zero_typo = Some(word_interned); - } - - if is_prefix && use_prefix_db.is_none() { - find_zero_typo_prefix_derivations( - word_interned, - fst, - &mut ctx.word_interner, - |derived_word| { - if prefix_of.len() < limits::MAX_PREFIX_COUNT { - prefix_of.insert(derived_word); - Ok(ControlFlow::Continue(())) - } else { - Ok(ControlFlow::Break(())) - } - }, - )?; - } - let synonyms = ctx.index.synonyms(ctx.txn)?; - let mut synonym_word_count = 0; - let synonyms = synonyms - .get(&vec![word.to_owned()]) - .cloned() - .unwrap_or_default() - .into_iter() - .take(limits::MAX_SYNONYM_PHRASE_COUNT) - .filter_map(|words| { - if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT { - return None; - } - synonym_word_count += words.len(); - let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); - Some(ctx.phrase_interner.insert(Phrase { words })) - }) - .collect(); - let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db }; - - Ok(QueryTerm { - original: word_interned, - ngram_words: None, - max_nbr_typos: max_typo, - is_prefix, - zero_typo, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - }) -} - -fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result>> { - if let Some((l, r)) = split_best_frequency(ctx, word)? { - Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] }))) - } else { - Ok(None) - } -} - -impl Interned { - fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { - let self_mut = ctx.term_interner.get_mut(self); - let QueryTerm { original, is_prefix, one_typo, .. } = self_mut; - let original = *original; - let is_prefix = *is_prefix; - // let original_str = ctx.word_interner.get(*original).to_owned(); - if one_typo.is_init() { - return Ok(()); - } - let mut one_typo_words = BTreeSet::new(); - - find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { - match nbr_typos { - ZeroOrOneTypo::Zero => {} - ZeroOrOneTypo::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } else { - return Ok(ControlFlow::Break(())); - } - } - } - Ok(ControlFlow::Continue(())) - })?; - let original_str = ctx.word_interner.get(original).to_owned(); - let split_words = find_split_words(ctx, original_str.as_str())?; - let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; - - let self_mut = ctx.term_interner.get_mut(self); - self_mut.one_typo = Lazy::Init(one_typo); - - Ok(()) - } - fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { - let self_mut = ctx.term_interner.get_mut(self); - let QueryTerm { original, is_prefix, two_typo, .. } = self_mut; - let original_str = ctx.word_interner.get(*original).to_owned(); - if two_typo.is_init() { - return Ok(()); - } - let mut one_typo_words = BTreeSet::new(); - let mut two_typo_words = BTreeSet::new(); - - find_zero_one_two_typo_derivations( - *original, - *is_prefix, - ctx.index.words_fst(ctx.txn)?, - &mut ctx.word_interner, - |derived_word, nbr_typos| { - if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT - && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT - { - // No chance we will add either one- or two-typo derivations anymore, stop iterating. 
- return Ok(ControlFlow::Break(())); - } - match nbr_typos { - NumberOfTypos::Zero => {} - NumberOfTypos::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } - } - NumberOfTypos::Two => { - if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { - two_typo_words.insert(derived_word); - } - } - } - Ok(ControlFlow::Continue(())) - }, - )?; - let split_words = find_split_words(ctx, original_str.as_str())?; - let self_mut = ctx.term_interner.get_mut(self); - - let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words }; - - let two_typo = TwoTypoTerm { two_typos: two_typo_words }; - - self_mut.one_typo = Lazy::Init(one_typo); - self_mut.two_typo = Lazy::Init(two_typo); - - Ok(()) - } -} - -/// Split the original word into the two words that appear the -/// most next to each other in the index. -/// -/// Return `None` if the original word cannot be split. -fn split_best_frequency( - ctx: &mut SearchContext, - original: &str, -) -> Result, Interned)>> { - let chars = original.char_indices().skip(1); - let mut best = None; - - for (i, _) in chars { - let (left, right) = original.split_at(i); - let left = ctx.word_interner.insert(left.to_owned()); - let right = ctx.word_interner.insert(right.to_owned()); - - if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? { - let frequency = - CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?; - if best.map_or(true, |(old, _, _)| frequency > old) { - best = Some((frequency, left, right)); - } - } - } - - Ok(best.map(|(_, left, right)| (left, right))) -} - -impl Interned { - /// Return the original word from the given query term - pub fn original_single_word(self, ctx: &SearchContext) -> Option> { - let self_ = ctx.term_interner.get(self); - if self_.ngram_words.is_some() { - None - } else { - Some(self_.original) - } - } -} - -/// A query term term coupled with its position in the user's search query. 
-#[derive(Clone)] -pub struct LocatedQueryTerm { - pub value: Interned, - pub positions: RangeInclusive, -} - -impl LocatedQueryTerm { - /// Return `true` iff the term is empty - pub fn is_empty(&self, interner: &DedupInterner) -> bool { - interner.get(self.value).is_empty() - } -} - -struct PhraseBuilder { - words: Vec>>, - start: u16, - end: u16, -} - -impl PhraseBuilder { - fn empty() -> Self { - Self { words: Default::default(), start: u16::MAX, end: u16::MAX } - } - - fn is_empty(&self) -> bool { - self.words.is_empty() - } - - // precondition: token has kind Word or StopWord - fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) { - if self.is_empty() { - self.start = position; - } - self.end = position; - if let TokenKind::StopWord = token.kind { - self.words.push(None); - } else { - // token has kind Word - let word = ctx.word_interner.insert(token.lemma().to_string()); - // TODO: in a phrase, check that every word exists - // otherwise return an empty term - self.words.push(Some(word)); - } - } - - fn build(self, ctx: &mut SearchContext) -> Option { - if self.is_empty() { - return None; - } - Some(LocatedQueryTerm { - value: ctx.term_interner.push({ - let phrase = Phrase { words: self.words }; - let phrase_desc = phrase.description(&ctx.word_interner); - QueryTerm { - original: ctx.word_interner.insert(phrase_desc), - ngram_words: None, - max_nbr_typos: 0, - is_prefix: false, - zero_typo: ZeroTypoTerm { - phrase: Some(ctx.phrase_interner.insert(phrase)), - zero_typo: None, - prefix_of: BTreeSet::default(), - synonyms: BTreeSet::default(), - use_prefix_db: None, - }, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - } - }), - positions: self.start..=self.end, - }) - } -} - -/// Convert the tokenised search query into a list of located query terms. -// TODO: checking if the positions are correct for phrases, separators, ngrams -pub fn located_query_terms_from_string( - ctx: &mut SearchContext, - query: NormalizedTokenIter<&[u8]>, - words_limit: Option, -) -> Result> { - let nbr_typos = number_of_typos_allowed(ctx)?; - - let mut located_terms = Vec::new(); - - let mut phrase: Option = None; - - let parts_limit = words_limit.unwrap_or(usize::MAX); - - // start with the last position as we will wrap around to position 0 at the beginning of the loop below. - let mut position = u16::MAX; - - let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable(); - while let Some(token) = peekable.next() { - // early return if word limit is exceeded - if located_terms.len() >= parts_limit { - return Ok(located_terms); - } - - match token.kind { - TokenKind::Word | TokenKind::StopWord => { - // On first loop, goes from u16::MAX to 0, then normal increment. - position = position.wrapping_add(1); - - // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, - // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, - // 3. if the word is the last token of the query we push it as a prefix word. 
- if let Some(phrase) = &mut phrase { - phrase.push_word(ctx, &token, position) - } else if peekable.peek().is_some() { - match token.kind { - TokenKind::Word => { - let word = token.lemma(); - let term = partially_initialized_term_from_word( - ctx, - word, - nbr_typos(word), - false, - )?; - let located_term = LocatedQueryTerm { - value: ctx.term_interner.push(term), - positions: position..=position, - }; - located_terms.push(located_term); - } - TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} - } - } else { - let word = token.lemma(); - let term = - partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?; - let located_term = LocatedQueryTerm { - value: ctx.term_interner.push(term), - positions: position..=position, - }; - located_terms.push(located_term); - } - } - TokenKind::Separator(separator_kind) => { - match separator_kind { - SeparatorKind::Hard => { - position += 1; - } - SeparatorKind::Soft => { - position += 0; - } - } - - phrase = 'phrase: { - let phrase = phrase.take(); - - // If we have a hard separator inside a phrase, we immediately start a new phrase - let phrase = if separator_kind == SeparatorKind::Hard { - if let Some(phrase) = phrase { - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) - } - Some(PhraseBuilder::empty()) - } else { - None - } - } else { - phrase - }; - - // We close and start a new phrase depending on the number of double quotes - let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - if quote_count == 0 { - break 'phrase phrase; - } - - // Consume the closing quote and the phrase - if let Some(phrase) = phrase { - // Per the check above, quote_count > 0 - quote_count -= 1; - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) - } - } - - // Start new phrase if the token ends with an opening quote - (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) - }; - } - _ => (), - } - } - - // If a quote is never closed, we consider all of the end of the query as a phrase. - if let Some(phrase) = phrase.take() { - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term); - } - } - - Ok(located_terms) -} - -pub fn number_of_typos_allowed<'ctx>( - ctx: &SearchContext<'ctx>, -) -> Result u8 + 'ctx> { - let authorize_typos = ctx.index.authorize_typos(ctx.txn)?; - let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; - let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; - - // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms? 
- let exact_words = ctx.index.exact_words(ctx.txn)?; - - Ok(Box::new(move |word: &str| { - if !authorize_typos - || word.len() < min_len_one_typo as usize - || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) - { - 0 - } else if word.len() < min_len_two_typos as usize { - 1 - } else { - 2 - } - })) -} - -pub fn make_ngram( - ctx: &mut SearchContext, - terms: &[LocatedQueryTerm], - number_of_typos_allowed: &impl Fn(&str) -> u8, -) -> Result> { - assert!(!terms.is_empty()); - for t in terms { - if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() { - return Ok(None); - } - } - for ts in terms.windows(2) { - let [t1, t2] = ts else { panic!() }; - if *t1.positions.end() != t2.positions.start() - 1 { - return Ok(None); - } - } - let mut words_interned = vec![]; - for term in terms { - if let Some(original_term_word) = term.value.original_single_word(ctx) { - words_interned.push(original_term_word); - } else { - return Ok(None); - } - } - let words = - words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::>(); - - let start = *terms.first().as_ref().unwrap().positions.start(); - let end = *terms.last().as_ref().unwrap().positions.end(); - let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix; - let ngram_str = words.join(""); - if ngram_str.len() > MAX_WORD_LENGTH { - return Ok(None); - } - let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone()); - - let max_nbr_typos = - number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1); - - let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; - - // Now add the synonyms - let index_synonyms = ctx.index.synonyms(ctx.txn)?; - - term.zero_typo.synonyms.extend( - index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| { - let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); - ctx.phrase_interner.insert(Phrase { words }) - }), - ); - - let term = QueryTerm { - original: ngram_str_interned, - ngram_words: Some(words_interned), - is_prefix, - max_nbr_typos, - zero_typo: term.zero_typo, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - }; - - let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end }; - - Ok(Some(term)) -} diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs new file mode 100644 index 000000000..f95956fbf --- /dev/null +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -0,0 +1,380 @@ +use fst::automaton::Str; +use fst::{Automaton, IntoStreamer, Streamer}; +use heed::types::DecodeIgnore; +use heed::BytesDecode; +use std::borrow::Cow; +use std::collections::BTreeSet; +use std::ops::ControlFlow; + +use super::*; +use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; +use crate::search::new::query_term::TwoTypoTerm; +use crate::search::new::{limits, SearchContext}; +use crate::search::{build_dfa, get_first}; +use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NumberOfTypos { + Zero, + One, + Two, +} + +pub enum ZeroOrOneTypo { + Zero, + One, +} + +impl Interned { + pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { + let s = ctx.term_interner.get_mut(self); + if s.max_nbr_typos == 0 { + s.one_typo = Lazy::Init(OneTypoTerm::default()); + s.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if 
s.max_nbr_typos == 1 && s.one_typo.is_uninit() { + assert!(s.two_typo.is_uninit()); + self.initialize_one_typo_subterm(ctx)?; + let s = ctx.term_interner.get_mut(self); + assert!(s.one_typo.is_init()); + s.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { + assert!(s.two_typo.is_uninit()); + self.initialize_one_and_two_typo_subterm(ctx)?; + let s = ctx.term_interner.get_mut(self); + assert!(s.one_typo.is_init() && s.two_typo.is_init()); + } + Ok(()) + } +} + +fn find_zero_typo_prefix_derivations( + word_interned: Interned, + fst: fst::Set>, + word_interner: &mut DedupInterner, + mut visit: impl FnMut(Interned) -> Result>, +) -> Result<()> { + let word = word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + let prefix = Str::new(word).starts_with(); + let mut stream = fst.search(prefix).into_stream(); + + while let Some(derived_word) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?.to_owned(); + let derived_word_interned = word_interner.insert(derived_word); + if derived_word_interned != word_interned { + let cf = visit(derived_word_interned)?; + if cf.is_break() { + break; + } + } + } + Ok(()) +} + +fn find_zero_one_typo_derivations( + ctx: &mut SearchContext, + word_interned: Interned, + is_prefix: bool, + mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, +) -> Result<()> { + let fst = ctx.get_words_fst()?; + let word = ctx.word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + + let dfa = build_dfa(word, 1, is_prefix); + let starts = StartsWith(Str::new(get_first(word))); + let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); + + while let Some((derived_word, state)) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?; + let derived_word = ctx.word_interner.insert(derived_word.to_owned()); + let d = dfa.distance(state.1); + match d.to_u8() { + 0 => { + if derived_word != word_interned { + let cf = visit(derived_word, ZeroOrOneTypo::Zero)?; + if cf.is_break() { + break; + } + } + } + 1 => { + let cf = visit(derived_word, ZeroOrOneTypo::One)?; + if cf.is_break() { + break; + } + } + _ => { + unreachable!("One typo dfa produced multiple typos") + } + } + } + Ok(()) +} + +fn find_zero_one_two_typo_derivations( + word_interned: Interned, + is_prefix: bool, + fst: fst::Set>, + word_interner: &mut DedupInterner, + mut visit: impl FnMut(Interned, NumberOfTypos) -> Result>, +) -> Result<()> { + let word = word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + + let starts = StartsWith(Str::new(get_first(word))); + let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); + let second_dfa = build_dfa(word, 2, is_prefix); + let second = Intersection(&second_dfa, &starts); + let automaton = Union(first, &second); + + let mut stream = fst.search_with_state(automaton).into_stream(); + + while let Some((derived_word, state)) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?; + let derived_word_interned = word_interner.insert(derived_word.to_owned()); + // in the case the typo is on the first letter, we know the number of typo + // is two + if get_first(derived_word) != get_first(word) { + let cf = visit(derived_word_interned, NumberOfTypos::Two)?; + if cf.is_break() { + break; + } + } else { + // Else, we know that it is the second dfa that matched and compute the + // correct distance + let d = second_dfa.distance((state.1).0); + match d.to_u8() { + 0 => { + 
if derived_word_interned != word_interned { + let cf = visit(derived_word_interned, NumberOfTypos::Zero)?; + if cf.is_break() { + break; + } + } + } + 1 => { + let cf = visit(derived_word_interned, NumberOfTypos::One)?; + if cf.is_break() { + break; + } + } + 2 => { + let cf = visit(derived_word_interned, NumberOfTypos::Two)?; + if cf.is_break() { + break; + } + } + _ => unreachable!("2 typos DFA produced a distance greater than 2"), + } + } + } + Ok(()) +} + +pub fn partially_initialized_term_from_word( + ctx: &mut SearchContext, + word: &str, + max_typo: u8, + is_prefix: bool, +) -> Result { + let word_interned = ctx.word_interner.insert(word.to_owned()); + + if word.len() > MAX_WORD_LENGTH { + return Ok({ + QueryTerm { + original: ctx.word_interner.insert(word.to_owned()), + ngram_words: None, + is_prefix: false, + max_nbr_typos: 0, + zero_typo: <_>::default(), + one_typo: Lazy::Init(<_>::default()), + two_typo: Lazy::Init(<_>::default()), + } + }); + } + + let fst = ctx.index.words_fst(ctx.txn)?; + + let use_prefix_db = is_prefix + && ctx + .index + .word_prefix_docids + .remap_data_type::() + .get(ctx.txn, word)? + .is_some(); + let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None }; + + let mut zero_typo = None; + let mut prefix_of = BTreeSet::new(); + + if fst.contains(word) { + zero_typo = Some(word_interned); + } + + if is_prefix && use_prefix_db.is_none() { + find_zero_typo_prefix_derivations( + word_interned, + fst, + &mut ctx.word_interner, + |derived_word| { + if prefix_of.len() < limits::MAX_PREFIX_COUNT { + prefix_of.insert(derived_word); + Ok(ControlFlow::Continue(())) + } else { + Ok(ControlFlow::Break(())) + } + }, + )?; + } + let synonyms = ctx.index.synonyms(ctx.txn)?; + let mut synonym_word_count = 0; + let synonyms = synonyms + .get(&vec![word.to_owned()]) + .cloned() + .unwrap_or_default() + .into_iter() + .take(limits::MAX_SYNONYM_PHRASE_COUNT) + .filter_map(|words| { + if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT { + return None; + } + synonym_word_count += words.len(); + let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); + Some(ctx.phrase_interner.insert(Phrase { words })) + }) + .collect(); + let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db }; + + Ok(QueryTerm { + original: word_interned, + ngram_words: None, + max_nbr_typos: max_typo, + is_prefix, + zero_typo, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, + }) +} + +fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result>> { + if let Some((l, r)) = split_best_frequency(ctx, word)? { + Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] }))) + } else { + Ok(None) + } +} + +impl Interned { + fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { + let self_mut = ctx.term_interner.get_mut(self); + let QueryTerm { original, is_prefix, one_typo, .. 
} = self_mut; + let original = *original; + let is_prefix = *is_prefix; + // let original_str = ctx.word_interner.get(*original).to_owned(); + if one_typo.is_init() { + return Ok(()); + } + let mut one_typo_words = BTreeSet::new(); + + find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { + match nbr_typos { + ZeroOrOneTypo::Zero => {} + ZeroOrOneTypo::One => { + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + } else { + return Ok(ControlFlow::Break(())); + } + } + } + Ok(ControlFlow::Continue(())) + })?; + let original_str = ctx.word_interner.get(original).to_owned(); + let split_words = find_split_words(ctx, original_str.as_str())?; + let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; + + let self_mut = ctx.term_interner.get_mut(self); + self_mut.one_typo = Lazy::Init(one_typo); + + Ok(()) + } + fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { + let self_mut = ctx.term_interner.get_mut(self); + let QueryTerm { original, is_prefix, two_typo, .. } = self_mut; + let original_str = ctx.word_interner.get(*original).to_owned(); + if two_typo.is_init() { + return Ok(()); + } + let mut one_typo_words = BTreeSet::new(); + let mut two_typo_words = BTreeSet::new(); + + find_zero_one_two_typo_derivations( + *original, + *is_prefix, + ctx.index.words_fst(ctx.txn)?, + &mut ctx.word_interner, + |derived_word, nbr_typos| { + if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT + && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT + { + // No chance we will add either one- or two-typo derivations anymore, stop iterating. + return Ok(ControlFlow::Break(())); + } + match nbr_typos { + NumberOfTypos::Zero => {} + NumberOfTypos::One => { + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + } + } + NumberOfTypos::Two => { + if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { + two_typo_words.insert(derived_word); + } + } + } + Ok(ControlFlow::Continue(())) + }, + )?; + let split_words = find_split_words(ctx, original_str.as_str())?; + let self_mut = ctx.term_interner.get_mut(self); + + let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words }; + + let two_typo = TwoTypoTerm { two_typos: two_typo_words }; + + self_mut.one_typo = Lazy::Init(one_typo); + self_mut.two_typo = Lazy::Init(two_typo); + + Ok(()) + } +} + +/// Split the original word into the two words that appear the +/// most next to each other in the index. +/// +/// Return `None` if the original word cannot be split. +fn split_best_frequency( + ctx: &mut SearchContext, + original: &str, +) -> Result, Interned)>> { + let chars = original.char_indices().skip(1); + let mut best = None; + + for (i, _) in chars { + let (left, right) = original.split_at(i); + let left = ctx.word_interner.insert(left.to_owned()); + let right = ctx.word_interner.insert(right.to_owned()); + + if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? 
{ + let frequency = + CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?; + if best.map_or(true, |(old, _, _)| frequency > old) { + best = Some((frequency, left, right)); + } + } + } + + Ok(best.map(|(_, left, right)| (left, right))) +} diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs new file mode 100644 index 000000000..50977395b --- /dev/null +++ b/milli/src/search/new/query_term/mod.rs @@ -0,0 +1,331 @@ +mod compute_derivations; +mod ntypo_subset; +mod parse_query; +mod phrase; + +use super::interner::{DedupInterner, Interned}; +use super::{limits, SearchContext}; +use crate::Result; +use std::collections::BTreeSet; +use std::ops::RangeInclusive; + +pub use ntypo_subset::NTypoTermSubset; +pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed}; +pub use phrase::Phrase; + +use compute_derivations::partially_initialized_term_from_word; + +/** +A set of word derivations attached to a location in the search query. + +*/ +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct LocatedQueryTermSubset { + pub term_subset: QueryTermSubset, + pub positions: RangeInclusive, + pub term_ids: RangeInclusive, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct QueryTermSubset { + original: Interned, + zero_typo_subset: NTypoTermSubset, + one_typo_subset: NTypoTermSubset, + two_typo_subset: NTypoTermSubset, +} + +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct QueryTerm { + original: Interned, + ngram_words: Option>>, + max_nbr_typos: u8, + is_prefix: bool, + zero_typo: ZeroTypoTerm, + // May not be computed yet + one_typo: Lazy, + // May not be computed yet + two_typo: Lazy, +} + +// SubTerms will be in a dedup interner +#[derive(Default, Clone, PartialEq, Eq, Hash)] +struct ZeroTypoTerm { + /// The original phrase, if any + phrase: Option>, + /// A single word equivalent to the original term, with zero typos + zero_typo: Option>, + /// All the words that contain the original word as prefix + prefix_of: BTreeSet>, + /// All the synonyms of the original word or phrase + synonyms: BTreeSet>, + /// A prefix in the prefix databases matching the original word + use_prefix_db: Option>, +} +#[derive(Default, Clone, PartialEq, Eq, Hash)] +struct OneTypoTerm { + /// The original word split into multiple consecutive words + split_words: Option>, + /// Words that are 1 typo away from the original word + one_typo: BTreeSet>, +} +#[derive(Default, Clone, PartialEq, Eq, Hash)] +struct TwoTypoTerm { + /// Words that are 2 typos away from the original word + two_typos: BTreeSet>, +} + +#[derive(Clone, PartialEq, Eq, Hash)] +pub enum Lazy { + Uninit, + Init(T), +} +impl Lazy { + pub fn is_init(&self) -> bool { + match self { + Lazy::Uninit => false, + Lazy::Init(_) => true, + } + } + pub fn is_uninit(&self) -> bool { + match self { + Lazy::Uninit => true, + Lazy::Init(_) => false, + } + } +} + +impl QueryTermSubset { + pub fn empty(for_term: Interned) -> Self { + Self { + original: for_term, + zero_typo_subset: NTypoTermSubset::Nothing, + one_typo_subset: NTypoTermSubset::Nothing, + two_typo_subset: NTypoTermSubset::Nothing, + } + } + pub fn full(for_term: Interned) -> Self { + Self { + original: for_term, + zero_typo_subset: NTypoTermSubset::All, + one_typo_subset: NTypoTermSubset::All, + two_typo_subset: NTypoTermSubset::All, + } + } + + pub fn union(&mut self, other: &Self) { + assert!(self.original == other.original); + self.zero_typo_subset.union(&other.zero_typo_subset); + 
self.one_typo_subset.union(&other.one_typo_subset);
+        self.two_typo_subset.union(&other.two_typo_subset);
+    }
+    pub fn intersect(&mut self, other: &Self) {
+        assert!(self.original == other.original);
+        self.zero_typo_subset.intersect(&other.zero_typo_subset);
+        self.one_typo_subset.intersect(&other.one_typo_subset);
+        self.two_typo_subset.intersect(&other.two_typo_subset);
+    }
+
+    pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> {
+        let original = ctx.term_interner.get(self.original);
+        let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
+            return None
+        };
+        match &self.zero_typo_subset {
+            NTypoTermSubset::All => Some(use_prefix_db),
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                // TODO: use a subset of prefix words instead
+                if words.contains(&use_prefix_db) {
+                    Some(use_prefix_db)
+                } else {
+                    None
+                }
+            }
+            NTypoTermSubset::Nothing => None,
+        }
+    }
+    pub fn all_single_words_except_prefix_db(
+        &self,
+        ctx: &mut SearchContext,
+    ) -> Result<BTreeSet<Interned<String>>> {
+        let mut result = BTreeSet::default();
+        // TODO: a compute_partially function
+        if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
+            self.original.compute_fully_if_needed(ctx)?;
+        }
+
+        let original = ctx.term_interner.get_mut(self.original);
+        if !self.zero_typo_subset.is_empty() {
+            let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
+                &original.zero_typo;
+            result.extend(zero_typo.iter().copied());
+            result.extend(prefix_of.iter().copied());
+        };
+
+        match &self.one_typo_subset {
+            NTypoTermSubset::All => {
+                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
+                    panic!()
+                };
+                result.extend(one_typo.iter().copied())
+            }
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
+                    panic!()
+                };
+                result.extend(one_typo.intersection(words));
+            }
+            NTypoTermSubset::Nothing => {}
+        };
+
+        match &self.two_typo_subset {
+            NTypoTermSubset::All => {
+                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
+                    panic!()
+                };
+                result.extend(two_typos.iter().copied());
+            }
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
+                    panic!()
+                };
+                result.extend(two_typos.intersection(words));
+            }
+            NTypoTermSubset::Nothing => {}
+        };
+
+        Ok(result)
+    }
+    pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
+        let mut result = BTreeSet::default();
+
+        if !self.one_typo_subset.is_empty() {
+            // TODO: compute less than fully if possible
+            self.original.compute_fully_if_needed(ctx)?;
+        }
+        let original = ctx.term_interner.get_mut(self.original);
+
+        let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
+            &original.zero_typo;
+        result.extend(phrase.iter().copied());
+        result.extend(synonyms.iter().copied());
+
+        if !self.one_typo_subset.is_empty() {
+            let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
+                panic!();
+            };
+            result.extend(split_words.iter().copied());
+        }
+
+        Ok(result)
+    }
+
+    pub fn original_phrase(&self, ctx: &SearchContext) -> Option<Interned<Phrase>> {
+        let t = ctx.term_interner.get(self.original);
+        if let Some(p) = t.zero_typo.phrase {
+            if self.zero_typo_subset.contains_phrase(p) {
+                return Some(p);
+            }
+        }
+        None
+    }
+    pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
+        let t = ctx.term_interner.get(self.original);
+        match t.max_nbr_typos {
+            0 => 0,
+            1 => {
+                if self.one_typo_subset.is_empty() {
+                    0
+                } else {
+                    1
+                }
+            }
+            2 => {
+                if self.two_typo_subset.is_empty() {
+                    if self.one_typo_subset.is_empty() {
+                        0
+                    } else {
+                        1
+                    }
+                } else {
+                    2
+                }
+            }
+            _ => panic!(),
+        }
+    }
+    pub fn clear_zero_typo_subset(&mut self) {
+        self.zero_typo_subset = NTypoTermSubset::Nothing;
+    }
+    pub fn clear_one_typo_subset(&mut self) {
+        self.one_typo_subset = NTypoTermSubset::Nothing;
+    }
+    pub fn clear_two_typo_subset(&mut self) {
+        self.two_typo_subset = NTypoTermSubset::Nothing;
+    }
+    pub fn description(&self, ctx: &SearchContext) -> String {
+        let t = ctx.term_interner.get(self.original);
+        ctx.word_interner.get(t.original).to_owned()
+    }
+}
+
+impl ZeroTypoTerm {
+    fn is_empty(&self) -> bool {
+        let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self;
+        phrase.is_none()
+            && zero_typo.is_none()
+            && prefix_of.is_empty()
+            && synonyms.is_empty()
+            && use_prefix_db.is_none()
+    }
+}
+impl OneTypoTerm {
+    fn is_empty(&self) -> bool {
+        let OneTypoTerm { split_words, one_typo } = self;
+        one_typo.is_empty() && split_words.is_none()
+    }
+}
+impl TwoTypoTerm {
+    fn is_empty(&self) -> bool {
+        let TwoTypoTerm { two_typos } = self;
+        two_typos.is_empty()
+    }
+}
+
+impl QueryTerm {
+    fn is_empty(&self) -> bool {
+        let Lazy::Init(one_typo) = &self.one_typo else {
+            return false;
+        };
+        let Lazy::Init(two_typo) = &self.two_typo else {
+            return false;
+        };
+
+        self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
+    }
+}
+
+impl Interned<QueryTerm> {
+    /// Return the original word from the given query term
+    fn original_single_word(self, ctx: &SearchContext) -> Option<Interned<String>> {
+        let self_ = ctx.term_interner.get(self);
+        if self_.ngram_words.is_some() {
+            None
+        } else {
+            Some(self_.original)
+        }
+    }
+}
+
+/// A query term coupled with its position in the user's search query.
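+/// For a single word the range covers one position (`i..=i`); an ngram built by
+/// `make_ngram` spans the positions of every word it merges.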
+#[derive(Clone)] +pub struct LocatedQueryTerm { + pub value: Interned, + pub positions: RangeInclusive, +} + +impl LocatedQueryTerm { + /// Return `true` iff the term is empty + pub fn is_empty(&self, interner: &DedupInterner) -> bool { + interner.get(self.value).is_empty() + } +} diff --git a/milli/src/search/new/query_term/ntypo_subset.rs b/milli/src/search/new/query_term/ntypo_subset.rs new file mode 100644 index 000000000..ad25d73c7 --- /dev/null +++ b/milli/src/search/new/query_term/ntypo_subset.rs @@ -0,0 +1,80 @@ +use std::collections::BTreeSet; + +use crate::search::new::interner::Interned; + +use super::Phrase; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum NTypoTermSubset { + All, + Subset { + words: BTreeSet>, + phrases: BTreeSet>, + // TODO: prefixes: BTreeSet>, + }, + Nothing, +} + +impl NTypoTermSubset { + pub fn contains_word(&self, word: Interned) -> bool { + match self { + NTypoTermSubset::All => true, + NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word), + NTypoTermSubset::Nothing => false, + } + } + pub fn contains_phrase(&self, phrase: Interned) -> bool { + match self { + NTypoTermSubset::All => true, + NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase), + NTypoTermSubset::Nothing => false, + } + } + pub fn is_empty(&self) -> bool { + match self { + NTypoTermSubset::All => false, + NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(), + NTypoTermSubset::Nothing => true, + } + } + pub fn union(&mut self, other: &Self) { + match self { + Self::All => {} + Self::Subset { words, phrases } => match other { + Self::All => { + *self = Self::All; + } + Self::Subset { words: w2, phrases: p2 } => { + words.extend(w2); + phrases.extend(p2); + } + Self::Nothing => {} + }, + Self::Nothing => { + *self = other.clone(); + } + } + } + pub fn intersect(&mut self, other: &Self) { + match self { + Self::All => *self = other.clone(), + Self::Subset { words, phrases } => match other { + Self::All => {} + Self::Subset { words: w2, phrases: p2 } => { + let mut ws = BTreeSet::new(); + for w in words.intersection(w2) { + ws.insert(*w); + } + let mut ps = BTreeSet::new(); + for p in phrases.intersection(p2) { + ps.insert(*p); + } + *words = ws; + *phrases = ps; + } + Self::Nothing => *self = Self::Nothing, + }, + Self::Nothing => {} + } + } +} diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs new file mode 100644 index 000000000..e0f6d971b --- /dev/null +++ b/milli/src/search/new/query_term/parse_query.rs @@ -0,0 +1,281 @@ +use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind}; + +use crate::{Result, SearchContext, MAX_WORD_LENGTH}; + +use super::*; + +/// Convert the tokenised search query into a list of located query terms. +// TODO: checking if the positions are correct for phrases, separators, ngrams +pub fn located_query_terms_from_string( + ctx: &mut SearchContext, + query: NormalizedTokenIter<&[u8]>, + words_limit: Option, +) -> Result> { + let nbr_typos = number_of_typos_allowed(ctx)?; + + let mut located_terms = Vec::new(); + + let mut phrase: Option = None; + + let parts_limit = words_limit.unwrap_or(usize::MAX); + + // start with the last position as we will wrap around to position 0 at the beginning of the loop below. 
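+    // For instance, the first Word token below does `u16::MAX.wrapping_add(1)`,
+    // which is 0, so positions effectively start at 0 without needing a special
+    // case for the first token.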
+ let mut position = u16::MAX; + + let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable(); + while let Some(token) = peekable.next() { + // early return if word limit is exceeded + if located_terms.len() >= parts_limit { + return Ok(located_terms); + } + + match token.kind { + TokenKind::Word | TokenKind::StopWord => { + // On first loop, goes from u16::MAX to 0, then normal increment. + position = position.wrapping_add(1); + + // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, + // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, + // 3. if the word is the last token of the query we push it as a prefix word. + if let Some(phrase) = &mut phrase { + phrase.push_word(ctx, &token, position) + } else if peekable.peek().is_some() { + match token.kind { + TokenKind::Word => { + let word = token.lemma(); + let term = partially_initialized_term_from_word( + ctx, + word, + nbr_typos(word), + false, + )?; + let located_term = LocatedQueryTerm { + value: ctx.term_interner.push(term), + positions: position..=position, + }; + located_terms.push(located_term); + } + TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} + } + } else { + let word = token.lemma(); + let term = + partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?; + let located_term = LocatedQueryTerm { + value: ctx.term_interner.push(term), + positions: position..=position, + }; + located_terms.push(located_term); + } + } + TokenKind::Separator(separator_kind) => { + match separator_kind { + SeparatorKind::Hard => { + position += 1; + } + SeparatorKind::Soft => { + position += 0; + } + } + + phrase = 'phrase: { + let phrase = phrase.take(); + + // If we have a hard separator inside a phrase, we immediately start a new phrase + let phrase = if separator_kind == SeparatorKind::Hard { + if let Some(phrase) = phrase { + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term) + } + Some(PhraseBuilder::empty()) + } else { + None + } + } else { + phrase + }; + + // We close and start a new phrase depending on the number of double quotes + let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + if quote_count == 0 { + break 'phrase phrase; + } + + // Consume the closing quote and the phrase + if let Some(phrase) = phrase { + // Per the check above, quote_count > 0 + quote_count -= 1; + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term) + } + } + + // Start new phrase if the token ends with an opening quote + (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) + }; + } + _ => (), + } + } + + // If a quote is never closed, we consider all of the end of the query as a phrase. + if let Some(phrase) = phrase.take() { + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term); + } + } + + Ok(located_terms) +} + +pub fn number_of_typos_allowed<'ctx>( + ctx: &SearchContext<'ctx>, +) -> Result u8 + 'ctx> { + let authorize_typos = ctx.index.authorize_typos(ctx.txn)?; + let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; + let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; + + // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms? 
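+    // Note: as the code stands, a word listed in `exact_words` is only given a typo
+    // budget of 0 by the closure below. That indirectly disables split words for it
+    // (they are computed with the one-typo derivations) but leaves prefix search,
+    // ngrams, and synonyms untouched.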
+ let exact_words = ctx.index.exact_words(ctx.txn)?; + + Ok(Box::new(move |word: &str| { + if !authorize_typos + || word.len() < min_len_one_typo as usize + || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) + { + 0 + } else if word.len() < min_len_two_typos as usize { + 1 + } else { + 2 + } + })) +} + +pub fn make_ngram( + ctx: &mut SearchContext, + terms: &[LocatedQueryTerm], + number_of_typos_allowed: &impl Fn(&str) -> u8, +) -> Result> { + assert!(!terms.is_empty()); + for t in terms { + if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() { + return Ok(None); + } + } + for ts in terms.windows(2) { + let [t1, t2] = ts else { panic!() }; + if *t1.positions.end() != t2.positions.start() - 1 { + return Ok(None); + } + } + let mut words_interned = vec![]; + for term in terms { + if let Some(original_term_word) = term.value.original_single_word(ctx) { + words_interned.push(original_term_word); + } else { + return Ok(None); + } + } + let words = + words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::>(); + + let start = *terms.first().as_ref().unwrap().positions.start(); + let end = *terms.last().as_ref().unwrap().positions.end(); + let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix; + let ngram_str = words.join(""); + if ngram_str.len() > MAX_WORD_LENGTH { + return Ok(None); + } + let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone()); + + let max_nbr_typos = + number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1); + + let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; + + // Now add the synonyms + let index_synonyms = ctx.index.synonyms(ctx.txn)?; + + term.zero_typo.synonyms.extend( + index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| { + let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); + ctx.phrase_interner.insert(Phrase { words }) + }), + ); + + let term = QueryTerm { + original: ngram_str_interned, + ngram_words: Some(words_interned), + is_prefix, + max_nbr_typos, + zero_typo: term.zero_typo, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, + }; + + let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end }; + + Ok(Some(term)) +} + +struct PhraseBuilder { + words: Vec>>, + start: u16, + end: u16, +} + +impl PhraseBuilder { + fn empty() -> Self { + Self { words: Default::default(), start: u16::MAX, end: u16::MAX } + } + + fn is_empty(&self) -> bool { + self.words.is_empty() + } + + // precondition: token has kind Word or StopWord + fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) { + if self.is_empty() { + self.start = position; + } + self.end = position; + if let TokenKind::StopWord = token.kind { + self.words.push(None); + } else { + // token has kind Word + let word = ctx.word_interner.insert(token.lemma().to_string()); + // TODO: in a phrase, check that every word exists + // otherwise return an empty term + self.words.push(Some(word)); + } + } + + fn build(self, ctx: &mut SearchContext) -> Option { + if self.is_empty() { + return None; + } + Some(LocatedQueryTerm { + value: ctx.term_interner.push({ + let phrase = ctx.phrase_interner.insert(Phrase { words: self.words }); + let phrase_desc = phrase.description(ctx); + QueryTerm { + original: ctx.word_interner.insert(phrase_desc), + ngram_words: None, + max_nbr_typos: 0, + is_prefix: false, + zero_typo: ZeroTypoTerm { + phrase: 
Some(phrase), + zero_typo: None, + prefix_of: BTreeSet::default(), + synonyms: BTreeSet::default(), + use_prefix_db: None, + }, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, + } + }), + positions: self.start..=self.end, + }) + } +} diff --git a/milli/src/search/new/query_term/phrase.rs b/milli/src/search/new/query_term/phrase.rs new file mode 100644 index 000000000..2ea8e0d39 --- /dev/null +++ b/milli/src/search/new/query_term/phrase.rs @@ -0,0 +1,16 @@ +use itertools::Itertools; + +use crate::{search::new::interner::Interned, SearchContext}; + +/// A phrase in the user's search query, consisting of several words +/// that must appear side-by-side in the search results. +#[derive(Default, Clone, PartialEq, Eq, Hash)] +pub struct Phrase { + pub words: Vec>>, +} +impl Interned { + pub fn description(self, ctx: &SearchContext) -> String { + let p = ctx.phrase_interner.get(self); + p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ") + } +} diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 81c99fd9a..cfd3f62bf 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -57,9 +57,7 @@ impl RankingRuleGraphTrait for ProximityGraph { Ok(format!("{cost}: cost")) } ProximityCondition::Term { term } => { - let original_term = ctx.term_interner.get(term.term_subset.original); - let original_word = ctx.word_interner.get(original_term.original); - Ok(format!("{original_word} : exists")) + Ok(format!("{} : exists", term.term_subset.description(ctx))) } } } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index de02b67a4..5d7e0f874 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -3,7 +3,7 @@ use roaring::RoaringBitmap; use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; -use crate::search::new::query_term::{LocatedQueryTermSubset, NTypoTermSubset}; +use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; @@ -43,8 +43,7 @@ impl RankingRuleGraphTrait for TypoGraph { _from: Option<&LocatedQueryTermSubset>, to_term: &LocatedQueryTermSubset, ) -> Result)>> { - let term = to_term; // LocatedQueryTermSubset { term_subset, positions: _, term_ids } = to_term; - let original_full_term = ctx.term_interner.get(term.term_subset.original); + let term = to_term; let mut edges = vec![]; // Ngrams have a base typo cost @@ -52,20 +51,20 @@ impl RankingRuleGraphTrait for TypoGraph { // 3-gram -> equivalent to 2 typos let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; - for nbr_typos in 0..=original_full_term.max_nbr_typos { + for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) { let mut term = term.clone(); match nbr_typos { 0 => { - term.term_subset.one_typo_subset = NTypoTermSubset::Nothing; - term.term_subset.two_typo_subset = NTypoTermSubset::Nothing; + term.term_subset.clear_one_typo_subset(); + term.term_subset.clear_two_typo_subset(); } 1 => { - term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing; - 
term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.clear_zero_typo_subset();
+                    term.term_subset.clear_two_typo_subset();
                 }
                 2 => {
-                    term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
-                    term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.clear_zero_typo_subset();
+                    term.term_subset.clear_one_typo_subset();
                 }
                 _ => panic!(),
             };
@@ -92,9 +91,6 @@ impl RankingRuleGraphTrait for TypoGraph {
 
     fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
         let TypoCondition { term, nbr_typos } = condition;
-        let original_term = ctx.term_interner.get(term.term_subset.original);
-        let original = ctx.word_interner.get(original_term.original);
-
-        Ok(format!("{original}: {nbr_typos}"))
+        Ok(format!("{}: {nbr_typos}", term.term_subset.description(ctx)))
     }
 }

From 62b9c6fbee82cb0a1bf600457f74a49457f9bde2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 4 Apr 2023 16:18:22 +0200
Subject: [PATCH 137/234] Add search tests

---
 milli/src/search/new/tests/mod.rs             |   3 +
 .../src/search/new/tests/ngram_split_words.rs | 255 ++++++++++++
 milli/src/search/new/tests/typo.rs            | 363 ++++++++++++++++++
 milli/src/search/new/tests/words_tms.rs       | 266 +++++++++++++
 4 files changed, 887 insertions(+)
 create mode 100644 milli/src/search/new/tests/mod.rs
 create mode 100644 milli/src/search/new/tests/ngram_split_words.rs
 create mode 100644 milli/src/search/new/tests/typo.rs
 create mode 100644 milli/src/search/new/tests/words_tms.rs

diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs
new file mode 100644
index 000000000..eec4c62ec
--- /dev/null
+++ b/milli/src/search/new/tests/mod.rs
@@ -0,0 +1,3 @@
+pub mod ngram_split_words;
+pub mod typo;
+pub mod words_tms;
diff --git a/milli/src/search/new/tests/ngram_split_words.rs b/milli/src/search/new/tests/ngram_split_words.rs
new file mode 100644
index 000000000..06c49274c
--- /dev/null
+++ b/milli/src/search/new/tests/ngram_split_words.rs
@@ -0,0 +1,255 @@
+/*!
+This module tests the following properties:
+
+1. Two consecutive words from a query can be combined into a "2gram"
+2. Three consecutive words from a query can be combined into a "3gram"
+3. A word from the query can be split into two consecutive words (split words)
+4. A 2gram can be split into two words
+5. A 3gram cannot be split into two words
+6. 2grams can contain up to 1 typo
+7. 3grams cannot have typos
+8. 2grams and 3grams can be prefix tolerant
+9. Disabling typo tolerance also disables the split words feature
+10. Disabling typo tolerance does not disable prefix tolerance
+11. Disabling typo tolerance does not disable ngram tolerance
+12. Prefix tolerance is disabled for the last word if a space follows it
+13. 
Ngrams cannot be formed by combining a phrase and a word or two phrases
+*/
+
+use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "the sun flowers are pretty"
+            },
+            {
+                "id": 1,
+                "text": "the sun flower is tall"
+            },
+            {
+                "id": 2,
+                "text": "the sunflowers are pretty"
+            },
+            {
+                "id": 3,
+                "text": "the sunflower is tall"
+            }
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_2gram_simple() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // will also match documents with "sunflower"
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
+}
+#[test]
+fn test_3gram_simple() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower s are");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2]");
+}
+
+#[test]
+fn test_2gram_typo() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flawer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
+}
+
+#[test]
+fn test_no_disable_ngrams() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower ");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // documents containing `sunflower`
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
+}
+
+#[test]
+fn test_2gram_prefix() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flow");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // documents containing words beginning with `sunflow`
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
+}
+
+#[test]
+fn test_3gram_prefix() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("su nf l");
+    let SearchResult { documents_ids, .. 
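/* "su", "nf" and "l" are merged into the 3gram "sunfl", which is then used as a prefix (properties 2 and 8) */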
} = s.execute().unwrap(); + + // documents containing a word beginning with sunfl + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]"); +} + +#[test] +fn test_split_words() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunflower "); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // all the documents with either `sunflower` or `sun flower` + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]"); +} + +#[test] +fn test_disable_split_words() { + let index = create_index(); + index + .update_settings(|s| { + s.set_autorize_typos(false); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunflower "); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + // no document containing `sun flower` + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); +} + +#[test] +fn test_2gram_split_words() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunf lower"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // all the documents with "sunflower", "sun flower", or (sunflower + 1 typo) + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]"); +} + +#[test] +fn test_3gram_no_split_words() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunf lo wer"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // no document with `sun flower` + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]"); +} + +#[test] +fn test_3gram_no_typos() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunf la wer"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} + +#[test] +fn test_no_ngram_phrases() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("\"sun\" flower"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]"); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("\"sun\" \"flower\""); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]"); +} diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs new file mode 100644 index 000000000..6ac8f5516 --- /dev/null +++ b/milli/src/search/new/tests/typo.rs @@ -0,0 +1,363 @@ +/*! +This module tests the following properties: + +1. The `words` ranking rule is typo-tolerant +2. Typo-tolerance handles missing letters, extra letters, replaced letters, and swapped letters (at least) +3. Words which are < `min_word_len_one_typo` are not typo tolerant +4. Words which are >= `min_word_len_one_typo` but < `min_word_len_two_typos` can have one typo +5. 
Words which are >= `min_word_len_two_typos` can have two typos +6. A typo on the first letter of a word counts as two typos +7. Phrases are not typo tolerant +8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos` +9. 3grams are not typo tolerant +10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly +if `words` doesn't exist before it. +11. The `typo` ranking rule places documents with the same number of typos in the same bucket +12. Prefix tolerance costs nothing according to the typo ranking rule +13. Split words cost 1 typo according to the typo ranking rule +14. Synonyms cost nothing according to the typo ranking rule +*/ + +use std::collections::HashMap; + +use crate::{ + index::tests::TempIndex, Criterion, + Search, SearchResult, TermsMatchingStrategy, +}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "the quick brown fox jumps over the lazy dog" + }, + { + "id": 1, + "text": "the quick brown foxes jump over the lazy dog" + }, + { + "id": 2, + "text": "the quick brown fax sends a letter to the dog" + }, + { + "id": 3, + "text": "the quickest brownest fox jumps over the laziest dog" + }, + { + "id": 4, + "text": "a fox doesn't quack, that crown goes to the duck." + }, + { + "id": 5, + "text": "the quicker browner fox jumped over the lazier dog" + }, + { + "id": 6, + "text": "the extravagant fox skyrocketed over the languorous dog" // thanks thesaurus + }, + { + "id": 7, + "text": "the quick brown fox jumps over the lazy" + }, + { + "id": 8, + "text": "the quick brown fox jumps over the" + }, + { + "id": 9, + "text": "the quick brown fox jumps over" + }, + { + "id": 10, + "text": "the quick brown fox jumps" + }, + { + "id": 11, + "text": "the quick brown fox" + }, + { + "id": 12, + "text": "the quick brown" + }, + { + "id": 13, + "text": "the quick" + }, + { + "id": 14, + "text": "netwolk interconections sunflawar" + }, + { + "id": 15, + "text": "network interconnections sunflawer" + }, + { + "id": 16, + "text": "network interconnection sunflower" + }, + { + "id": 17, + "text": "network interconnection sun flower" + }, + { + "id": 18, + "text": "network interconnection sunflowering" + }, + { + "id": 19, + "text": "network interconnection sun flowering" + }, + { + "id": 20, + "text": "network interconnection sunflowar" + }, + { + "id": 21, + "text": "the fast brownish fox jumps over the lackadaisical dog" + }, + { + "id": 22, + "text": "the quick brown fox jumps over the lackadaisical dog" + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_no_typo() { + let index = create_index(); + index + .update_settings(|s| { + s.set_autorize_typos(false); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. 
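/* with typo tolerance disabled, only the document matching every query word exactly can be returned */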
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); +} + +#[test] +fn test_default_typo() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let ot = index.min_word_len_one_typo(&txn).unwrap(); + let tt = index.min_word_len_two_typos(&txn).unwrap(); + insta::assert_debug_snapshot!(ot, @"5"); + insta::assert_debug_snapshot!(tt, @"9"); + + // 0 typo + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + + // 1 typo on one word, replaced letter + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quack brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + + // 1 typo on one word, missing letter, extra letter + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quicest brownest fox jummps over the laziest dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); + + // 1 typo on one word, swapped letters + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quikc borwn fox jupms over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + + // 1 first letter typo on a word <5 bytes, replaced letter + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the nuick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + + // 1 first letter typo on a word <5 bytes, missing letter + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the uick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + + // 1 typo on all words >=5 bytes, replaced letters + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quack brawn fox junps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + + // 2 typos on words < 9 bytes + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quckest brawnert fox jumps over the aziest dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + + // 2 typos on words >= 9 bytes: missing letters, missing first letter, replaced letters + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the extravant fox kyrocketed over the lamguorout dog"); + let SearchResult { documents_ids, .. 
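/* "extravant" -> "extravagant", "kyrocketed" -> "skyrocketed", "lamguorout" -> "languorous": each within the 2-typo budget allowed for words of 9+ bytes */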
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + // 2 typos on words >= 9 bytes: 2 extra letters in a single word, swapped letters + extra letter, replaced letters + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the extravaganttt fox sktyrocnketed over the lagnuorrous dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); +} + +#[test] +fn test_phrase_no_typo_allowed() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the \"quick brewn\" fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} + +#[test] +fn test_ngram_typos() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the extra lagant fox skyrocketed over the languorous dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the ex tra lagant fox skyrocketed over the languorous dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} +#[test] +fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Typo]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids: ids_1, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Typo]); + }) + .unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids: ids_2, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); + + assert_eq!(ids_1, ids_2); +} + +#[test] +fn test_typo_bucketing() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + // First do the search with just the Words ranking rule + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("network interconnection sunflower"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 15, 16, 17, 18, 20]"); + + // Then with the typo ranking rule + drop(txn); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Typo]); + }) + .unwrap(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("network interconnection sunflower"); + let SearchResult { documents_ids, .. 
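/* with the typo ranking rule, documents come back bucket by bucket, ordered by increasing number of typos */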
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18, 17, 20, 15, 14]"); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("network interconnection sun flower"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[17, 19, 16, 18, 20, 15]"); +} + +#[test] +fn test_typo_synonyms() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Typo]); + + let mut synonyms = HashMap::new(); + synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]); + synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]); + + s.set_synonyms(synonyms); + }) + .unwrap(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lackadaisical dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]"); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the fast brownish fox jumps over the lackadaisical dog"); + + // TODO: is this correct? interaction of ngrams + synonyms means that the + // multi-word synonyms end up having a typo cost. This is probably not what we want. + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]"); +} diff --git a/milli/src/search/new/tests/words_tms.rs b/milli/src/search/new/tests/words_tms.rs new file mode 100644 index 000000000..8b5c0153f --- /dev/null +++ b/milli/src/search/new/tests/words_tms.rs @@ -0,0 +1,266 @@ +/*! +This module tests the following properties: + +1. The `last` term matching strategy starts removing terms from the query +starting from the end if no more results match it. +2. Phrases are never deleted by the `last` term matching strategy +3. Duplicate words don't affect the ranking of a document according to the `words` ranking rule +4. The proximity of the first and last word of a phrase to its adjacent terms is taken into +account by the proximity ranking rule. +5. Unclosed double quotes still make a phrase +6. The `all` term matching strategy does not remove any term from the query +7. 
The search is capable of returning no results if no documents match the query +*/ + +use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; + +fn create_quick_brown_fox_trivial_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "", + }, + { + "id": 1, + "text": "the", + }, + { + "id": 2, + "text": "the quick", + }, + { + "id": 3, + "text": "the quick brown", + }, + { + "id": 4, + "text": "the quick brown fox", + }, + { + "id": 5, + "text": "the quick brown fox jumps", + }, + { + "id": 6, + "text": "the quick brown fox jumps over", + }, + { + "id": 7, + "text": "the quick brown fox jumps over the", + }, + { + "id": 8, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 9, + "text": "the quick brown fox jumps over the lazy dog", + }, + { + "id": 10, + "text": "the brown quick fox jumps over the lazy dog", + }, + { + "id": 11, + "text": "the quick brown fox talks to the lazy and slow dog", + }, + { + "id": 12, + "text": "the quick brown fox talks to the lazy dog", + }, + { + "id": 13, + "text": "the mighty and quick brown fox jumps over the lazy dog", + }, + { + "id": 14, + "text": "the great quick brown fox jumps over the lazy dog", + }, + { + "id": 15, + "text": "this quick brown and very scary fox jumps over the lazy dog", + }, + { + "id": 16, + "text": "this quick brown and scary fox jumps over the lazy dog", + }, + { + "id": 17, + "text": "the quick brown fox jumps over the really lazy dog", + }, + { + "id": 18, + "text": "the brown quick fox jumps over the really lazy dog", + }, + { + "id": 19, + "text": "the brown quick fox immediately jumps over the really lazy dog", + }, + { + "id": 20, + "text": "the brown quick fox immediately jumps over the really lazy blue dog", + }, + { + "id": 21, + "text": "the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.", + }, + { + "id": 22, + "text": "the, quick, brown, fox, jumps, over, the, lazy, dog", + } + ])) + .unwrap(); + index +} + +#[test] +fn test_words_tms_last_simple() { + let index = create_quick_brown_fox_trivial_index(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // 6 and 7 have the same score because "the" appears twice + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 8, 6, 7, 5, 4, 11, 12, 3]"); + + let mut s = Search::new(&txn, &index); + s.query("extravagant the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} + +#[test] +fn test_words_tms_last_phrase() { + let index = create_quick_brown_fox_trivial_index(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox\" jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. 
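/* term removal still starts from the end of the query, but the quoted phrase itself is never removed */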
} = s.execute().unwrap(); + + // "The quick brown fox" is a phrase, not deleted by this term matching strategy + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 6, 7, 5, 4, 11, 12]"); + + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox\" jumps over the \"lazy\" dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // "lazy" is a phrase, not deleted by this term matching strategy + // but words before it can be deleted + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 11, 12]"); + + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox jumps over the lazy dog\""); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // The whole query is a phrase, no terms are removed + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]"); + + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // The whole query is still a phrase, even without closing quotes, so no terms are removed + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]"); +} + +#[test] +fn test_words_proximity_tms_last_simple() { + let index = create_quick_brown_fox_trivial_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // 7 is better than 6 because of the proximity between "the" and its surrounding terms + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]"); + + let mut s = Search::new(&txn, &index); + s.query("the brown quick fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // 10 is better than 9 because of the proximity between "quick" and "brown" + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]"); +} + +#[test] +fn test_words_proximity_tms_last_phrase() { + let index = create_quick_brown_fox_trivial_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("the \"quick brown\" fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // "quick brown" is a phrase. The proximity of its first and last words + // to their adjacent query words should be taken into account + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5, 4, 11, 12, 3]"); + + let mut s = Search::new(&txn, &index); + s.query("the \"quick brown\" \"fox jumps\" over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // "quick brown" is a phrase. 
The proximity of its first and last words
+    // to their adjacent query words should be taken into account.
+    // The same applies to `fox jumps`.
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5]");
+}
+
+#[test]
+fn test_words_tms_all() {
+    let index = create_quick_brown_fox_trivial_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+    let mut s = Search::new(&txn, &index);
+    s.query("the quick brown fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("extravagant");
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+}

From 406b8bd2489931afb2373ed60011e73d9530922e Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Tue, 4 Apr 2023 17:04:46 +0200
Subject: [PATCH 138/234] Add new db caches

---
 milli/src/search/new/db_cache.rs | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index af94108e2..effd123be 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -24,6 +24,8 @@ pub struct DatabaseCache<'ctx> {
     pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
     pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
     pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
+    pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
+    pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
 }
 impl<'ctx> DatabaseCache<'ctx> {
     fn get_value<'v, K1, KC>(
@@ -128,4 +130,32 @@ impl<'ctx> SearchContext<'ctx> {
             self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
         )
     }
+
+    pub fn get_db_word_position_docids(
+        &mut self,
+        word: Interned<String>,
+        position: u16,
+    ) -> Result<Option<&'ctx [u8]>> {
+        DatabaseCache::get_value(
+            self.txn,
+            (word, position),
+            &(self.word_interner.get(word).as_str(), position),
+            &mut self.db_cache.word_position_docids,
+            self.index.word_position_docids.remap_data_type::<ByteSlice>(),
+        )
+    }
+
+    pub fn get_db_word_fid_docids(
+        &mut self,
+        word: Interned<String>,
+        fid: u16,
+    ) -> Result<Option<&'ctx [u8]>> {
+        DatabaseCache::get_value(
+            self.txn,
+            (word, fid),
+            &(self.word_interner.get(word).as_str(), fid),
+            &mut self.db_cache.word_fid_docids,
+            self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
+        )
+    }
 }

From ec2f8e804003f9ece9d49f3f616a60152b5c0ed2 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Tue, 4 Apr 2023 17:06:07 +0200
Subject: [PATCH 139/234] Rename `is_multiple_words` to `is_ngram` and
 `zero_typo` to `exact`

---
 milli/src/search/new/logger/detailed.rs |  4 ++--
 milli/src/search/new/query_term.rs      | 32 +++++++++++++++----------
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs
index 3a02950a8..3c4779ad9 100644
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@@ -441,7 +441,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{
             }) => {
                 let QueryTerm {
                     original,
-                    is_multiple_words: _,
+                    is_ngram: _,
                     is_prefix: _,
                     max_nbr_typos,
                     zero_typo,
@@ -458,7 +458,7 
results.{cur_ranking_rule}{cur_activated_id} {{
             )
             .unwrap();
 
-                let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } =
+                let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } =
                     zero_typo;
 
                 for w in zero_typo.iter().copied() {
diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index d19ab6135..90b03d194 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -204,8 +204,13 @@ impl QueryTermSubset {
         }
 
         if !self.zero_typo_subset.is_empty() {
-            let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
-                &original.zero_typo;
+            let ZeroTypoTerm {
+                phrase: _,
+                exact: zero_typo,
+                prefix_of,
+                synonyms: _,
+                use_prefix_db: _,
+            } = &original.zero_typo;
             result.extend(zero_typo.iter().copied());
             result.extend(prefix_of.iter().copied());
         };
@@ -258,7 +263,7 @@ impl QueryTermSubset {
             )?;
         }
 
-        let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
+        let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } =
             &original.zero_typo;
         result.extend(phrase.iter().copied());
         result.extend(synonyms.iter().copied());
@@ -302,7 +307,7 @@ impl QueryTerm {
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub struct QueryTerm {
     pub original: Interned<String>,
-    pub is_multiple_words: bool,
+    pub is_ngram: bool,
     pub max_nbr_typos: u8,
     pub is_prefix: bool,
     pub zero_typo: ZeroTypoTerm,
@@ -318,7 +323,7 @@ pub struct ZeroTypoTerm {
     /// The original phrase, if any
     pub phrase: Option<Interned<Phrase>>,
     /// A single word equivalent to the original term, with zero typos
-    pub zero_typo: Option<Interned<String>>,
+    pub exact: Option<Interned<String>>,
     /// All the words that contain the original word as prefix
     pub prefix_of: BTreeSet<Interned<String>>,
     /// All the synonyms of the original word or phrase
@@ -341,7 +346,7 @@ pub struct TwoTypoTerm {
 
 impl ZeroTypoTerm {
     fn is_empty(&self) -> bool {
-        let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self;
+        let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self;
         phrase.is_none()
             && zero_typo.is_none()
             && prefix_of.is_empty()
@@ -370,12 +375,12 @@ impl QueryTerm {
     ) -> Self {
         Self {
             original: word_interner.insert(phrase.description(word_interner)),
-            is_multiple_words: false,
+            is_ngram: false,
             max_nbr_typos: 0,
             is_prefix: false,
             zero_typo: ZeroTypoTerm {
                 phrase: Some(phrase_interner.insert(phrase)),
-                zero_typo: None,
+                exact: None,
                 prefix_of: BTreeSet::default(),
                 synonyms: BTreeSet::default(),
                 use_prefix_db: None,
@@ -387,7 +392,7 @@ impl QueryTerm {
     pub fn empty(word_interner: &mut DedupInterner<String>, original: &str) -> Self {
         Self {
             original: word_interner.insert(original.to_owned()),
-            is_multiple_words: false,
+            is_ngram: false,
             is_prefix: false,
             max_nbr_typos: 0,
             zero_typo: <_>::default(),
@@ -606,11 +611,12 @@ fn partially_initialized_term_from_word(
             Some(ctx.phrase_interner.insert(Phrase { words }))
         })
         .collect();
-    let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db };
+    let zero_typo =
+        ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db };
 
     Ok(QueryTerm {
         original: word_interned,
-        is_multiple_words: false,
+        is_ngram: false,
         max_nbr_typos: max_typo,
         is_prefix,
         zero_typo,
@@ -765,7 +771,7 @@ fn split_best_frequency(
 impl QueryTerm {
     /// Return the original word from the given query term
     pub fn original_single_word(&self) -> Option<Interned<String>> {
-        if self.is_multiple_words {
+        if self.is_ngram {
             None
         } else {
             Some(self.original)
@@ -1039,7 +1045,7 
@@ pub fn make_ngram(
     let term = QueryTerm {
         original,
-        is_multiple_words: true,
+        is_ngram: true,
         is_prefix,
         max_nbr_typos,
         zero_typo: term.zero_typo,

From 4d5bc9df4c4f3145ebc72ce73d3e51325f6fba1c Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Tue, 4 Apr 2023 17:07:26 +0200
Subject: [PATCH 140/234] Increase position by 8 on hard separator when
 building query terms

---
 milli/src/search/new/query_term.rs                            | 2 +-
 .../index_documents/extract/extract_docid_word_positions.rs   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index 90b03d194..005c0a2e3 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -907,7 +907,7 @@ pub fn located_query_terms_from_string(
             TokenKind::Separator(separator_kind) => {
                 match separator_kind {
                     SeparatorKind::Hard => {
-                        position += 1;
+                        position += 8;
                     }
                     SeparatorKind::Soft => {
                         position += 0;
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 2d51fcc1a..c362f8f1b 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -153,7 +153,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str>
 
 /// take an iterator on tokens and compute their relative position depending on separator kinds
 /// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
-/// else we keep the standart proximity of 1 between words.
+/// else we keep the standard proximity of 1 between words.
 fn process_tokens<'a>(
     tokens: impl Iterator<Item = Token<'a>>,
 ) -> impl Iterator<Item = (usize, Token<'a>)> {

From 3951fe22ab72e9d9e44498c9d95ad29a0449a8dc Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Tue, 4 Apr 2023 17:09:32 +0200
Subject: [PATCH 141/234] Add ExactTerm and helper method

---
 milli/src/search/new/query_term.rs | 37 ++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index 005c0a2e3..4e3922980 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -4,6 +4,7 @@ use std::ops::{ControlFlow, RangeInclusive};
 
 use charabia::normalizer::NormalizedTokenIter;
 use charabia::{SeparatorKind, TokenKind};
+use either::Either;
 use fst::automaton::Str;
 use fst::{Automaton, IntoStreamer, Streamer};
 use heed::types::DecodeIgnore;
@@ -138,7 +139,43 @@ pub struct LocatedQueryTermSubset {
     pub term_ids: RangeInclusive<u8>,
 }
 
+#[derive(Clone, Copy)]
+pub enum ExactTerm {
+    Phrase(Interned<Phrase>),
+    Word(Interned<String>),
+}
+
+impl ExactTerm {
+    pub fn interned_words<'ctx>(
+        &self,
+        ctx: &'ctx SearchContext<'ctx>,
+    ) -> impl Iterator<Item = Option<Interned<String>>> + 'ctx {
+        match *self {
+            ExactTerm::Phrase(phrase) => {
+                let phrase = ctx.phrase_interner.get(phrase);
+                Either::Left(phrase.words.iter().copied())
+            }
+            ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))),
+        }
+    }
+}
+
 impl QueryTermSubset {
+    pub fn exact_term(&self, ctx: &SearchContext) -> Option<ExactTerm> {
+        let full_query_term = ctx.term_interner.get(self.original);
+        if full_query_term.is_ngram {
+            return None;
+        }
+        // TODO: included in subset
+        if let Some(phrase) = full_query_term.zero_typo.phrase {
+            self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
+        } else if let Some(word) = full_query_term.zero_typo.exact {
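/* a single word counts as exact only if the subset still allows the zero-typo derivation */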
self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word))
+        } else {
+            None
+        }
+    }
+
     pub fn empty(for_term: Interned<QueryTerm>) -> Self {
         Self {
             original: for_term,

From 4b4ffb8ec993729fb53467a2899b198a14d320f9 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Tue, 4 Apr 2023 17:12:07 +0200
Subject: [PATCH 142/234] Add exactness ranking rules

---
 milli/src/search/new/exact_attribute.rs      | 175 ++++++++++++++
 .../search/new/graph_based_ranking_rule.rs   |  10 +-
 milli/src/search/new/mod.rs                  |  12 +-
 .../new/ranking_rule_graph/exactness/mod.rs  | 107 +++++++++++
 .../src/search/new/ranking_rule_graph/mod.rs |   3 +
 5 files changed, 301 insertions(+), 6 deletions(-)
 create mode 100644 milli/src/search/new/exact_attribute.rs
 create mode 100644 milli/src/search/new/ranking_rule_graph/exactness/mod.rs

diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs
new file mode 100644
index 000000000..bb6299e28
--- /dev/null
+++ b/milli/src/search/new/exact_attribute.rs
@@ -0,0 +1,175 @@
+use heed::BytesDecode;
+use roaring::MultiOps;
+
+use super::query_graph::QueryGraph;
+use super::ranking_rules::{RankingRule, RankingRuleOutput};
+use crate::search::new::query_graph::QueryNodeData;
+use crate::search::new::query_term::ExactTerm;
+use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
+
+/// FIXME:
+///
+/// - A lot of work is done in next_bucket that start_iteration could do.
+/// - Consider calling the graph based rule directly from this one.
+/// - Currently we only handle exact terms; prefixes still need to be handled.
+/// - Some tests are still needed.
+pub struct ExactAttribute {
+    query_graph: Option<QueryGraph>,
+}
+
+impl ExactAttribute {
+    pub fn new() -> Self {
+        Self { query_graph: None }
+    }
+}
+
+impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
+    fn id(&self) -> String {
+        "exact_attribute".to_owned()
+    }
+
+    fn start_iteration(
+        &mut self,
+        _ctx: &mut SearchContext<'ctx>,
+        _logger: &mut dyn SearchLogger<QueryGraph>,
+        _universe: &roaring::RoaringBitmap,
+        query: &QueryGraph,
+    ) -> Result<()> {
+        self.query_graph = Some(query.clone());
+        Ok(())
+    }
+
+    fn next_bucket(
+        &mut self,
+        ctx: &mut SearchContext<'ctx>,
+        _logger: &mut dyn SearchLogger<QueryGraph>,
+        universe: &roaring::RoaringBitmap,
+    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
+        // iterate on the nodes of the graph, retain LocatedQueryTermSubset
+        let query_graph = self.query_graph.as_ref().unwrap();
+        let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
+            Vec::with_capacity(query_graph.nodes.len() as usize);
+        for (_, node) in query_graph.nodes.iter() {
+            match &node.data {
+                QueryNodeData::Term(term) => {
+                    let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
+                        exact_term
+                    } else {
+                        // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
+                        return Ok(Some(RankingRuleOutput {
+                            query: query_graph.clone(),
+                            candidates: universe.clone(),
+                        }));
+                    };
+                    exact_term_position_ids.push((
+                        exact_term,
+                        *term.positions.start(),
+                        *term.term_ids.start(),
+                    ))
+                }
+                QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
+            }
+        }
+
+        exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
+        // bail if there is a "hole" (missing word) in remaining query graph
+        let mut previous_id = 0;
+        for (_, _, id) in exact_term_position_ids.iter().copied() {
+            if id < previous_id || id - previous_id > 1 {
+                // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
+                return Ok(Some(RankingRuleOutput {
+                    query: 
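/* a hole means a previous rule removed a term: pass the bucket through unchanged */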
query_graph.clone(),
+                    candidates: universe.clone(),
+                }));
+            } else {
+                previous_id = id;
+            }
+        }
+
+        // sample query: "sunflower are pretty"
+        // sunflower at pos 0 in attr A
+        // are at pos 1 in attr B
+        // pretty at pos 2 in attr C
+        // We want to eliminate such documents
+
+        // first check that for each term, there exists some attribute that has this term at the correct position
+        //"word-position-docids";
+        let mut candidates = universe.clone();
+        let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids
+            .iter()
+            .copied()
+            .map(|(term, position, _)| (term.interned_words(ctx).collect(), position))
+            .collect();
+        for (words, position) in &words_positions {
+            if candidates.is_empty() {
+                // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
+                return Ok(Some(RankingRuleOutput {
+                    query: query_graph.clone(),
+                    candidates: universe.clone(),
+                }));
+            }
+
+            'words: for (offset, word) in words.iter().enumerate() {
+                let offset = offset as u16;
+                let word = if let Some(word) = word {
+                    word
+                } else {
+                    continue 'words;
+                };
+                let word_position_docids = CboRoaringBitmapCodec::bytes_decode(
+                    ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(),
+                )
+                .unwrap_or_default();
+                candidates &= word_position_docids;
+            }
+        }
+
+        let candidates = candidates;
+
+        if candidates.is_empty() {
+            // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
+            return Ok(Some(RankingRuleOutput {
+                query: query_graph.clone(),
+                candidates: universe.clone(),
+            }));
+        }
+
+        let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default();
+
+        let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len());
+
+        // then check that there exists at least one attribute that has all of the terms
+        for fid in searchable_fields_ids {
+            let mut intersection = MultiOps::intersection(
+                words_positions
+                    .iter()
+                    .flat_map(|(words, ..)| words.iter())
+                    // ignore stop words in phrases
+                    .flatten()
+                    .map(|word| -> Result<_> {
+                        Ok(ctx
+                            .get_db_word_fid_docids(*word, fid)?
+                            .map(CboRoaringBitmapCodec::bytes_decode)
+                            .unwrap_or_default()
+                            .unwrap_or_default())
+                    }),
+            )?;
+            intersection &= &candidates;
+            if !intersection.is_empty() {
+                candidates_per_attributes.push(intersection);
+            }
+        }
+        // note we could have "false positives" where different attributes collectively
+        // have the terms in the correct order while a single attribute has all the terms, but in the incorrect order.
+
+        let candidates = MultiOps::union(candidates_per_attributes.into_iter());
+        Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates }))
+    }
+
+    fn end_iteration(
+        &mut self,
+        _ctx: &mut SearchContext<'ctx>,
+        _logger: &mut dyn SearchLogger<QueryGraph>,
+    ) {
+    }
+}
diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs
index b8c58c726..28b4ed1f4 100644
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner};
 use super::logger::SearchLogger;
 use super::query_graph::QueryNode;
 use super::ranking_rule_graph::{
-    ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait,
-    TypoGraph,
+    ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph,
+    RankingRuleGraphTrait, TypoGraph,
 };
 use super::small_bitmap::SmallBitmap;
 use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
@@ -65,6 +65,12 @@ impl GraphBasedRankingRule<TypoGraph> {
         Self::new_with_id("typo".to_owned(), terms_matching_strategy)
     }
 }
+pub type Exactness = GraphBasedRankingRule<ExactnessGraph>;
+impl GraphBasedRankingRule<ExactnessGraph> {
+    pub fn new() -> Self {
+        Self::new_with_id("exactness".to_owned(), None)
+    }
+}
 
 /// A generic graph-based ranking rule
 pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 4d561d25b..779e589b3 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -9,8 +9,9 @@ mod query_term;
 mod ranking_rule_graph;
 mod ranking_rules;
 mod resolve_query_graph;
-// TODO: documentation + comments
 mod small_bitmap;
+
+mod exact_attribute;
 // TODO: documentation + comments
 // implementation is currently an adaptation of the previous implementation to fit with the new model
 mod sort;
@@ -33,6 +34,8 @@ use resolve_query_graph::PhraseDocIdsCache;
 use roaring::RoaringBitmap;
 use words::Words;
 
+use self::exact_attribute::ExactAttribute;
+use self::graph_based_ranking_rule::Exactness;
 use self::interner::Interner;
 use self::ranking_rules::{BoxRankingRule, RankingRule};
 use self::resolve_query_graph::compute_query_graph_docids;
@@ -150,7 +153,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
     let mut proximity = false;
     let mut sort = false;
     let attribute = false;
-    let exactness = false;
+    let mut exactness = false;
     let mut asc = HashSet::new();
     let mut desc = HashSet::new();
@@ -211,8 +214,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
                 if exactness {
                     continue;
                 }
-                // todo!();
-                // exactness = false;
+                ranking_rules.push(Box::new(ExactAttribute::new()));
+                ranking_rules.push(Box::new(Exactness::new()));
+                exactness = true;
             }
             crate::Criterion::Asc(field_name) => {
                 if asc.contains(&field_name) {
diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
new file mode 100644
index 000000000..a1e19a015
--- /dev/null
+++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
@@ -0,0 +1,107 @@
+use roaring::RoaringBitmap;
+
+use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
+use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
+use crate::search::new::query_graph::{QueryGraph, QueryNode};
+use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
+use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
+
+/// - Exactness as first ranking rule: 
TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other
+/// word over a doc that matches 9 words inexactly but none exactly
+/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider
+///
+/// "naive vision"
+/// condition from one node to another:
+/// - word exactly present: cost 0
+/// - word typo/ngram/prefix/missing: cost 1, do not remove it from the query graph; add an edge between the two nodes, return the universe without condition when resolving, the destination query term is inside
+///
+/// Three strategies:
+/// 1. ExactAttribute: word position / word_fid_docid
+/// 2. AttributeStart:
+/// 3. AttributeContainsExact => implementable via `RankingRuleGraphTrait`
+
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub enum ExactnessCondition {
+    ExactInAttribute(LocatedQueryTermSubset),
+    Skip(LocatedQueryTermSubset),
+}
+
+pub enum ExactnessGraph {}
+
+fn compute_docids(
+    ctx: &mut SearchContext,
+    dest_node: &LocatedQueryTermSubset,
+    universe: &RoaringBitmap,
+) -> Result<RoaringBitmap> {
+    let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) {
+        exact_term
+    } else {
+        return Ok(Default::default());
+    };
+    let mut candidates = match exact_term {
+        ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
+        ExactTerm::Word(word) => {
+            if let Some(word_candidates) = ctx.get_db_word_docids(word)? {
+                CboRoaringBitmapCodec::deserialize_from(word_candidates)?
+            } else {
+                return Ok(Default::default());
+            }
+        }
+    };
+    // TODO: synonyms?
+    candidates &= universe;
+    Ok(candidates)
+}
+
+impl RankingRuleGraphTrait for ExactnessGraph {
+    type Condition = ExactnessCondition;
+
+    fn resolve_condition(
+        ctx: &mut SearchContext,
+        condition: &Self::Condition,
+        universe: &RoaringBitmap,
+    ) -> Result<ComputedCondition> {
+        let (docids, dest_node) = match condition {
+            ExactnessCondition::ExactInAttribute(dest_node) => {
+                (compute_docids(ctx, dest_node, universe)?, dest_node)
+            }
+            ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node),
+        };
+        Ok(ComputedCondition {
+            docids,
+            universe_len: universe.len(),
+            start_term_subset: None,
+            end_term_subset: dest_node.clone(),
+        })
+    }
+
+    fn build_edges(
+        _ctx: &mut SearchContext,
+        conditions_interner: &mut DedupInterner<Self::Condition>,
+        _source_node: Option<&LocatedQueryTermSubset>,
+        dest_node: &LocatedQueryTermSubset,
+    ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
+        let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone());
+        let exact_condition = conditions_interner.insert(exact_condition);
+
+        let skip_condition = ExactnessCondition::Skip(dest_node.clone());
+        let skip_condition = conditions_interner.insert(skip_condition);
+        Ok(vec![(0, exact_condition), (1, skip_condition)])
+    }
+
+    fn log_state(
+        graph: &RankingRuleGraph<Self>,
+        paths: &[Vec<Interned<Self::Condition>>],
+        dead_ends_cache: &DeadEndsCache<Self::Condition>,
+        universe: &RoaringBitmap,
+        costs: &MappedInterner<QueryNode, Vec<u64>>,
+        cost: u64,
+        logger: &mut dyn SearchLogger<QueryGraph>,
+    ) {
+        todo!()
+    }
+
+    fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
+        todo!()
+    }
+}
diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs
index 7c40008c8..936c3e942 100644
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@@ -10,6 +10,8 @@ mod cheapest_paths;
 mod condition_docids_cache;
 mod dead_ends_cache;
 
+/// Implementation of the `exactness` ranking rule
+mod exactness;
 /// Implementation of the `proximity` ranking rule
 mod proximity;
 /// Implementation of the `typo` ranking rule
 mod typo;
@@ -20,6 
+22,7 @@ use std::hash::Hash; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; +pub use exactness::{ExactnessCondition, ExactnessGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; From 959e4607bb11c684463ddf1149895d9aecb08a7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Apr 2023 18:02:46 +0200 Subject: [PATCH 143/234] Add more search tests --- milli/src/search/new/distinct.rs | 2 +- milli/src/search/new/mod.rs | 4 +- milli/src/search/new/tests/distinct.rs | 590 ++++++++++++++++++++++++ milli/src/search/new/tests/language.rs | 22 + milli/src/search/new/tests/mod.rs | 25 + milli/src/search/new/tests/proximity.rs | 0 milli/src/search/new/tests/sort.rs | 316 +++++++++++++ 7 files changed, 957 insertions(+), 2 deletions(-) create mode 100644 milli/src/search/new/tests/distinct.rs create mode 100644 milli/src/search/new/tests/language.rs create mode 100644 milli/src/search/new/tests/proximity.rs create mode 100644 milli/src/search/new/tests/sort.rs diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs index ad4b46659..7b77adf49 100644 --- a/milli/src/search/new/distinct.rs +++ b/milli/src/search/new/distinct.rs @@ -41,7 +41,7 @@ pub fn apply_distinct_rule( } /// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id. -fn distinct_single_docid( +pub fn distinct_single_docid( index: &Index, txn: &RoTxn, field_id: u16, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4456d693d..e7e38fe89 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,3 +1,4 @@ +mod bucket_sort; mod db_cache; mod distinct; mod graph_based_ranking_rule; @@ -31,7 +32,8 @@ pub use logger::detailed::DetailedSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; -use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; +use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; +use bucket_sort::bucket_sort; use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; diff --git a/milli/src/search/new/tests/distinct.rs b/milli/src/search/new/tests/distinct.rs new file mode 100644 index 000000000..4073cf585 --- /dev/null +++ b/milli/src/search/new/tests/distinct.rs @@ -0,0 +1,590 @@ +/*! +This module tests the "distinct attribute" feature, and its +interaction with other ranking rules. + +1. no duplicate distinct attributes are ever returned +2. only the best document (according to the search rules) for each distinct value appears in the result +3. 
if a document does not have a distinct attribute, then the distinct rule does not apply to it + +It doesn't test properly: +- combination of distinct + exhaustive_nbr_hits (because we know it's incorrect) +- distinct attributes with arrays (because we know it's incorrect as well) +*/ + +use std::collections::HashSet; + +use big_s::S; +use heed::RoTxn; +use maplit::hashset; + +use crate::{ + index::tests::TempIndex, AscDesc, Criterion, Index, Member, Search, SearchResult, + TermsMatchingStrategy, +}; + +use super::collect_field_values; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_sortable_fields(hashset! { S("rank1"), S("letter") }); + s.set_distinct_field("letter".to_owned()); + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "letter": "A", + "rank1": 0, + "text": "the quick brown fox jamps over the lazy dog", + }, + { + "id": 1, + "letter": "A", + "rank1": 1, + "text": "the quick brown fox jumpes over the lazy dog", + }, + { + "id": 2, + "letter": "B", + "rank1": 0, + "text": "the quick brown foxjumps over the lazy dog", + }, + { + "id": 3, + "letter": "B", + "rank1": 1, + "text": "the quick brown fox jumps over the lazy dog", + }, + { + "id": 4, + "letter": "B", + "rank1": 2, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 5, + "letter": "C", + "rank1": 0, + "text": "the quickbrownfox jumps over the lazy", + }, + { + "id": 6, + "letter": "C", + "rank1": 1, + "text": "the quick brown fox jumpss over the lazy", + }, + { + "id": 7, + "letter": "C", + "rank1": 2, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 8, + "letter": "D", + "rank1": 0, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 9, + "letter": "E", + "rank1": 0, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 10, + "letter": "E", + "rank1": 1, + "text": "the quackbrown foxjunps over", + }, + { + "id": 11, + "letter": "E", + "rank1": 2, + "text": "the quicko browno fox junps over", + }, + { + "id": 12, + "letter": "E", + "rank1": 3, + "text": "the quicko browno fox jumps over", + }, + { + "id": 13, + "letter": "E", + "rank1": 4, + "text": "the quick brewn fox jumps over", + }, + { + "id": 14, + "letter": "E", + "rank1": 5, + "text": "the quick brown fox jumps over", + }, + { + "id": 15, + "letter": "F", + "rank1": 0, + "text": "the quick brownf fox jumps over", + }, + { + "id": 16, + "letter": "F", + "rank1": 1, + "text": "the quic brown fox jamps over", + }, + { + "id": 17, + "letter": "F", + "rank1": 2, + "text": "thequick browns fox jimps", + }, + { + "id": 18, + "letter": "G", + "rank1": 0, + "text": "the qick brown fox jumps", + }, + { + "id": 19, + "letter": "G", + "rank1": 1, + "text": "the quick brownfoxjumps", + }, + { + "id": 20, + "letter": "H", + "rank1": 0, + "text": "the quick brow fox jumps", + }, + { + "id": 21, + "letter": "I", + "rank1": 0, + "text": "the quick brown fox jpmps", + }, + { + "id": 22, + "letter": "I", + "rank1": 1, + "text": "the quick brown fox jumps", + }, + { + "id": 23, + "letter": "I", + "rank1": 2, + "text": "the quick", + }, + { + "id": 24, + "rank1": 0, + "text": "the quick", + }, + { + "id": 25, + "rank1": 1, + "text": "the quick brown", + }, + { + "id": 26, + "rank1": 2, + "text": "the quick brown fox", + }, + { + "id": 26, + "rank1": 3, + "text": "the quick brown fox jumps over the 
lazy dog", + }, + ])) + .unwrap(); + index +} + +fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec { + let vs = collect_field_values(index, txn, index.distinct_field(txn).unwrap().unwrap(), docids); + + let mut unique = HashSet::new(); + for v in vs.iter() { + if v == "__does_not_exist__" { + continue; + } + assert!(unique.insert(v.clone())); + } + + vs +} + +#[test] +fn test_distinct_placeholder_no_ranking_rules() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let s = Search::new(&txn, &index); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"A\"", + "\"B\"", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"G\"", + "\"H\"", + "\"I\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); +} + +#[test] +fn test_distinct_placeholder_sort() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank1")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"E\"", + "__does_not_exist__", + "\"B\"", + "\"C\"", + "\"F\"", + "\"I\"", + "\"A\"", + "\"G\"", + "__does_not_exist__", + "\"D\"", + "\"H\"", + "__does_not_exist__", + ] + "###); + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "5", + "3", + "2", + "2", + "2", + "2", + "1", + "1", + "1", + "0", + "0", + "0", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"I\"", + "\"H\"", + "\"G\"", + "\"F\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"B\"", + "\"A\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "1", + "3", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.sort_criteria(vec![ + AscDesc::Desc(Member::Field(S("letter"))), + AscDesc::Desc(Member::Field(S("rank1"))), + ]); + + let SearchResult { documents_ids, .. 
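+    // With both sort criteria and a distinct attribute, each letter should now be
+    // represented by its document with the highest `rank1`: "I" is represented by
+    // document 23 (rank1: 2) instead of document 21 (rank1: 0).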
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"I\"", + "\"H\"", + "\"G\"", + "\"F\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"B\"", + "\"A\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "2", + "0", + "1", + "2", + "5", + "0", + "2", + "2", + "1", + "3", + "1", + "0", + ] + "###); +} + +#[test] +fn test_distinct_words() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"A\"", + "\"B\"", + "__does_not_exist__", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"G\"", + "\"H\"", + "\"I\"", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let text_values = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(text_values, @r###" + [ + "\"the quick brown fox jamps over the lazy dog\"", + "\"the quick brown foxjumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quickbrownfox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brownf fox jumps over\"", + "\"the qick brown fox jumps\"", + "\"the quick brow fox jumps\"", + "\"the quick brown fox jpmps\"", + "\"the quick brown\"", + "\"the quick\"", + ] + "###); +} + +#[test] +fn test_distinct_sort_words() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Sort, Criterion::Words, Criterion::Desc(S("rank1"))]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]); + + let SearchResult { documents_ids, .. 
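+    // `sort` runs before `words` here, so documents are first bucketed by letter
+    // (descending); within each letter, `words` then keeps the document matching
+    // the longest prefix of the query: 22 ("the quick brown fox jumps") rather
+    // than 23 ("the quick") for the letter "I".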
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"I\"", + "\"H\"", + "\"G\"", + "\"F\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"B\"", + "\"A\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "1", + "0", + "1", + "1", + "0", + "0", + "2", + "1", + "1", + "3", + "1", + "0", + ] + "###); + + let text_values = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(text_values, @r###" + [ + "\"the quick brown fox jumps\"", + "\"the quick brow fox jumps\"", + "\"the quick brownfoxjumps\"", + "\"the quic brown fox jamps over\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumpes over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown\"", + "\"the quick\"", + ] + "###); +} + +#[test] +fn test_distinct_all_candidates() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank1")))]); + s.exhaustive_number_hits(true); + + let SearchResult { documents_ids, candidates, .. } = s.execute().unwrap(); + let candidates = candidates.iter().collect::>(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); + // TODO: this is incorrect! + insta::assert_snapshot!(format!("{candidates:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); +} + +#[test] +fn test_distinct_typo() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Typo]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.query("the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + + let SearchResult { documents_ids, .. 
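+    // Only the best document per letter under `words` then `typo` is kept:
+    // "B" is represented by 3 (all query words, no typo) and "A" by 0, which
+    // matches all the words with a single typo ("jamps").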
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]"); + + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"B\"", + "__does_not_exist__", + "\"A\"", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"I\"", + "\"G\"", + "\"H\"", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + + let text_values = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(text_values, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jamps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brownf fox jumps over\"", + "\"the quick brown fox jumps\"", + "\"the qick brown fox jumps\"", + "\"the quick brow fox jumps\"", + "\"the quick brown\"", + "\"the quick\"", + ] + "###); +} diff --git a/milli/src/search/new/tests/language.rs b/milli/src/search/new/tests/language.rs new file mode 100644 index 000000000..6adad748c --- /dev/null +++ b/milli/src/search/new/tests/language.rs @@ -0,0 +1,22 @@ +use crate::{index::tests::TempIndex, Search, SearchResult}; + +#[test] +fn test_kanji_language_detection() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": 1, "title": "東京のお寿司。" }, + { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" } + ])) + .unwrap(); + + let txn = index.write_txn().unwrap(); + let mut search = Search::new(&txn, &index); + + search.query("東京"); + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + + assert_eq!(documents_ids, vec![1]); +} diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index eec4c62ec..0fd5013db 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -1,3 +1,28 @@ +pub mod distinct; +#[cfg(feature = "default")] +pub mod language; pub mod ngram_split_words; +pub mod proximity; +pub mod sort; pub mod typo; pub mod words_tms; + +fn collect_field_values( + index: &crate::Index, + txn: &heed::RoTxn, + fid: &str, + docids: &[u32], +) -> Vec { + let mut values = vec![]; + let fid = index.fields_ids_map(txn).unwrap().id(fid).unwrap(); + for doc in index.documents(txn, docids.iter().copied()).unwrap() { + if let Some(v) = doc.1.get(fid) { + let v: serde_json::Value = serde_json::from_slice(v).unwrap(); + let v = v.to_string(); + values.push(v); + } else { + values.push("__does_not_exist__".to_owned()); + } + } + values +} diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs new file mode 100644 index 000000000..e69de29bb diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs new file mode 100644 index 000000000..d3a952a24 --- /dev/null +++ b/milli/src/search/new/tests/sort.rs @@ -0,0 +1,316 @@ +/*! +This module tests the `sort` ranking rule: + +1. an error is returned if the sort ranking rule exists but no fields-to-sort were given at search time +2. an error is returned if the fields-to-sort are not sortable +3. it is possible to add multiple fields-to-sort at search time +4. 
custom sort ranking rules can be added to the settings, they interact with the generic `sort` ranking rule as expected +5. numbers appear before strings +6. documents with either: (1) no value, (2) null, or (3) an object for the field-to-sort appear at the end of the bucket +7. boolean values are translated to strings +8. if a field contains an array, it is sorted by the best value in the array according to the sort rule +*/ + +use big_s::S; +use maplit::hashset; + +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, AscDesc, Criterion, Member, + Search, SearchResult, TermsMatchingStrategy, +}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_sortable_fields(hashset! { S("rank"), S("vague"), S("letter") }); + s.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "letter": "A", + "rank": 0, + "vague": 0, + }, + { + "id": 1, + "letter": "A", + "rank": 1, + "vague": "0", + }, + { + "id": 2, + "letter": "B", + "rank": 0, + "vague": 1, + }, + { + "id": 3, + "letter": "B", + "rank": 1, + "vague": "1", + }, + { + "id": 4, + "letter": "B", + "rank": 2, + "vague": [1, 2], + }, + { + "id": 5, + "letter": "C", + "rank": 0, + "vague": [1, "2"], + }, + { + "id": 6, + "letter": "C", + "rank": 1, + }, + { + "id": 7, + "letter": "C", + "rank": 2, + "vague": null, + }, + { + "id": 8, + "letter": "D", + "rank": 0, + "vague": [null, null, ""] + }, + { + "id": 9, + "letter": "E", + "rank": 0, + "vague": "" + }, + { + "id": 10, + "letter": "E", + "rank": 1, + "vague": { + "sub": 0, + } + }, + { + "id": 11, + "letter": "E", + "rank": 2, + "vague": true, + }, + { + "id": 12, + "letter": "E", + "rank": 3, + "vague": false, + }, + { + "id": 13, + "letter": "E", + "rank": 4, + "vague": 1.5673, + }, + { + "id": 14, + "letter": "E", + "rank": 5, + }, + { + "id": 15, + "letter": "F", + "rank": 0, + }, + { + "id": 16, + "letter": "F", + "rank": 1, + }, + { + "id": 17, + "letter": "F", + "rank": 2, + }, + { + "id": 18, + "letter": "G", + "rank": 0, + }, + { + "id": 19, + "letter": "G", + "rank": 1, + }, + { + "id": 20, + "letter": "H", + "rank": 0, + "vague": true, + }, + { + "id": 21, + "letter": "I", + "rank": 0, + "vague": false, + }, + { + "id": 22, + "letter": "I", + "rank": 1, + "vague": [1.1367, "help", null] + }, + { + "id": 23, + "letter": "I", + "rank": 2, + "vague": [1.2367, "hello"] + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_sort() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]); + + let SearchResult { documents_ids, .. 
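+    // Descending sort on `letter`; documents sharing the same letter keep their
+    // ascending docid order (21, 22, 23 all have letter "I").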
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 22, 23, 20, 18, 19, 15, 16, 17, 9, 10, 11, 12, 13, 14, 8, 5, 6, 7, 2]"); + + let letter_values = collect_field_values(&index, &txn, "letter", &documents_ids); + insta::assert_debug_snapshot!(letter_values, @r###" + [ + "\"I\"", + "\"I\"", + "\"I\"", + "\"H\"", + "\"G\"", + "\"G\"", + "\"F\"", + "\"F\"", + "\"F\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"C\"", + "\"C\"", + "\"B\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 13, 12, 4, 7, 11, 17, 23, 1, 3, 6, 10, 16, 19, 22, 0, 2, 5, 8, 9]"); + + let rank_values = collect_field_values(&index, &txn, "rank", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "5", + "4", + "3", + "2", + "2", + "2", + "2", + "2", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "0", + "0", + "0", + "0", + "0", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Asc(Member::Field(S("vague")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 4, 5, 22, 23, 13, 1, 3, 12, 21, 11, 20, 6, 7, 8, 9, 10, 14, 15]"); + + let vague_values = collect_field_values(&index, &txn, "vague", &documents_ids); + insta::assert_debug_snapshot!(vague_values, @r###" + [ + "0", + "1", + "[1,2]", + "[1,\"2\"]", + "[1.1367,\"help\",null]", + "[1.2367,\"hello\"]", + "1.5673", + "\"0\"", + "\"1\"", + "false", + "false", + "true", + "true", + "__does_not_exist___", + "null", + "[null,null,\"\"]", + "\"\"", + "{\"sub\":0}", + "__does_not_exist___", + "__does_not_exist___", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("vague")))]); + + let SearchResult { documents_ids, .. 
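+    // Descending sort over heterogeneous values: an array is ranked by its best
+    // value for the current direction ([1, 2] counts as 2), numbers come before
+    // strings, booleans are compared as the strings "true"/"false", and documents
+    // with null, object, or missing values sink to the end of the bucket.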
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 13, 23, 22, 2, 5, 0, 11, 20, 12, 21, 3, 1, 6, 7, 8, 9, 10, 14, 15]"); + + let vague_values = collect_field_values(&index, &txn, "vague", &documents_ids); + insta::assert_debug_snapshot!(vague_values, @r###" + [ + "[1,2]", + "1.5673", + "[1.2367,\"hello\"]", + "[1.1367,\"help\",null]", + "1", + "[1,\"2\"]", + "0", + "true", + "true", + "false", + "false", + "\"1\"", + "\"0\"", + "__does_not_exist___", + "null", + "[null,null,\"\"]", + "\"\"", + "{\"sub\":0}", + "__does_not_exist___", + "__does_not_exist___", + ] + "###); +} From ce328c329d354aa1ae168377eecf800d18cc163a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Apr 2023 18:02:16 +0200 Subject: [PATCH 144/234] Move bucket sort function to its own module and fix a bug --- milli/src/search/new/bucket_sort.rs | 195 ++++++++++++++++++++++++++ milli/src/search/new/ranking_rules.rs | 170 ---------------------- 2 files changed, 195 insertions(+), 170 deletions(-) create mode 100644 milli/src/search/new/bucket_sort.rs diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs new file mode 100644 index 000000000..712825c31 --- /dev/null +++ b/milli/src/search/new/bucket_sort.rs @@ -0,0 +1,195 @@ +use roaring::RoaringBitmap; + +use super::logger::SearchLogger; +use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait}; +use super::SearchContext; +use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput}; +use crate::Result; + +pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( + ctx: &mut SearchContext<'ctx>, + mut ranking_rules: Vec>, + query: &Q, + universe: &RoaringBitmap, + from: usize, + length: usize, + logger: &mut dyn SearchLogger, +) -> Result> { + logger.initial_query(query); + logger.ranking_rules(&ranking_rules); + logger.initial_universe(universe); + + let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? { + ctx.index.fields_ids_map(ctx.txn)?.id(field) + } else { + None + }; + + if universe.len() < from as u64 { + return Ok(vec![]); + } + if ranking_rules.is_empty() { + if let Some(distinct_fid) = distinct_fid { + let mut excluded = RoaringBitmap::new(); + let mut results = vec![]; + for docid in universe.iter() { + if results.len() >= from + length { + break; + } + if excluded.contains(docid) { + continue; + } + distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?; + results.push(docid); + } + return Ok(results); + } else { + return Ok(universe.iter().skip(from).take(length).collect()); + }; + } + + let ranking_rules_len = ranking_rules.len(); + + logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe); + ranking_rules[0].start_iteration(ctx, logger, universe, query)?; + + let mut ranking_rule_universes: Vec = + vec![RoaringBitmap::default(); ranking_rules_len]; + ranking_rule_universes[0] = universe.clone(); + + let mut cur_ranking_rule_index = 0; + + /// Finish iterating over the current ranking rule, yielding + /// control to the parent (or finishing the search if not possible). + /// Update the candidates accordingly and inform the logger. + macro_rules! 
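+    // (a macro rather than a helper function, because it must `break` out of the
+    // `while` loop below and update `cur_ranking_rule_index` in place)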
back { + () => { + assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); + logger.end_iteration_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &ranking_rule_universes[cur_ranking_rule_index], + ); + ranking_rule_universes[cur_ranking_rule_index].clear(); + ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger); + if cur_ranking_rule_index == 0 { + break; + } else { + cur_ranking_rule_index -= 1; + } + }; + } + + let mut results = vec![]; + let mut cur_offset = 0usize; + + /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset` + /// into account and inform the logger. + macro_rules! maybe_add_to_results { + ($candidates:expr) => { + // First apply the distinct rule on the candidates, reducing the universes if necessary + let candidates = if let Some(distinct_fid) = distinct_fid { + let DistinctOutput { remaining, excluded } = apply_distinct_rule(ctx, distinct_fid, $candidates)?; + for universe in ranking_rule_universes.iter_mut() { + *universe -= &excluded; + } + remaining + } else { + $candidates.clone() + }; + let len = candidates.len(); + // if the candidates are empty, there is nothing to do; + if !candidates.is_empty() { + // if we still haven't reached the first document to return + if cur_offset < from { + // and if no document from this bucket can be returned + if cur_offset + (candidates.len() as usize) < from { + // then just skip the bucket + logger.skip_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &candidates, + ); + } else { + // otherwise, skip some of the documents and add some of the rest, in order of ids + let all_candidates = candidates.iter().collect::>(); + let (skipped_candidates, candidates) = + all_candidates.split_at(from - cur_offset); + logger.skip_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &skipped_candidates.into_iter().collect(), + ); + let candidates = candidates + .iter() + .take(length - results.len()) + .copied() + .collect::>(); + logger.add_to_results(&candidates); + results.extend(&candidates); + } + } else { + // if we have passed the offset already, add some of the documents (up to the limit) + let candidates = + candidates.iter().take(length - results.len()).collect::>(); + logger.add_to_results(&candidates); + results.extend(&candidates); + } + } + cur_offset += len as usize; + }; + } + + while results.len() < length { + // The universe for this bucket is zero or one element, so we don't need to sort + // anything, just extend the results and go back to the parent ranking rule. + if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { + maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); + ranking_rule_universes[cur_ranking_rule_index].clear(); + back!(); + continue; + } + + let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? 
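+        // `None` means this rule has no more buckets for the current universe,
+        // so control goes back to the parent ranking rule: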
else { + back!(); + continue; + }; + + logger.next_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &ranking_rule_universes[cur_ranking_rule_index], + &next_bucket.candidates, + ); + + debug_assert!( + ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates) + ); + ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; + + if cur_ranking_rule_index == ranking_rules_len - 1 + || next_bucket.candidates.len() <= 1 + || cur_offset + (next_bucket.candidates.len() as usize) < from + { + maybe_add_to_results!(&next_bucket.candidates); + continue; + } + + cur_ranking_rule_index += 1; + ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); + logger.start_iteration_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &next_bucket.query, + &ranking_rule_universes[cur_ranking_rule_index], + ); + ranking_rules[cur_ranking_rule_index].start_iteration( + ctx, + logger, + &next_bucket.candidates, + &next_bucket.query, + )?; + } + + Ok(results) +} diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 9dc6018e6..a771d3768 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -2,8 +2,6 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::{QueryGraph, SearchContext}; -// use crate::search::new::sort::Sort; -use crate::search::new::distinct::{apply_distinct_rule, DistinctOutput}; use crate::Result; /// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`] @@ -69,171 +67,3 @@ pub struct RankingRuleOutput { /// The allowed candidates for the child ranking rule pub candidates: RoaringBitmap, } - -pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( - ctx: &mut SearchContext<'ctx>, - mut ranking_rules: Vec>, - query: &Q, - universe: &RoaringBitmap, - from: usize, - length: usize, - logger: &mut dyn SearchLogger, -) -> Result> { - logger.initial_query(query); - logger.ranking_rules(&ranking_rules); - logger.initial_universe(universe); - - let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? { - ctx.index.fields_ids_map(ctx.txn)?.id(field) - } else { - None - }; - - if universe.len() < from as u64 { - return Ok(vec![]); - } - - let ranking_rules_len = ranking_rules.len(); - logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe); - ranking_rules[0].start_iteration(ctx, logger, universe, query)?; - - let mut ranking_rule_universes: Vec = - vec![RoaringBitmap::default(); ranking_rules_len]; - ranking_rule_universes[0] = universe.clone(); - - let mut cur_ranking_rule_index = 0; - - /// Finish iterating over the current ranking rule, yielding - /// control to the parent (or finishing the search if not possible). - /// Update the candidates accordingly and inform the logger. - macro_rules! back { - () => { - assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); - logger.end_iteration_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &ranking_rule_universes[cur_ranking_rule_index], - ); - ranking_rule_universes[cur_ranking_rule_index].clear(); - ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger); - if cur_ranking_rule_index == 0 { - break; - } else { - cur_ranking_rule_index -= 1; - } - }; - } - - let mut results = vec![]; - let mut cur_offset = 0usize; - - /// Add the candidates to the results. 
Take `distinct`, `from`, `length`, and `cur_offset` - /// into account and inform the logger. - macro_rules! maybe_add_to_results { - ($candidates:expr) => { - // First apply the distinct rule on the candidates, reducing the universes if necessary - let candidates = if let Some(distinct_fid) = distinct_fid { - let DistinctOutput { remaining, excluded } = apply_distinct_rule(ctx, distinct_fid, $candidates)?; - for universe in ranking_rule_universes.iter_mut() { - *universe -= &excluded; - } - remaining - } else { - $candidates.clone() - }; - let len = candidates.len(); - // if the candidates are empty, there is nothing to do; - if !candidates.is_empty() { - // if we still haven't reached the first document to return - if cur_offset < from { - // and if no document from this bucket can be returned - if cur_offset + (candidates.len() as usize) < from { - // then just skip the bucket - logger.skip_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &candidates, - ); - } else { - // otherwise, skip some of the documents and add some of the rest, in order of ids - let all_candidates = candidates.iter().collect::>(); - let (skipped_candidates, candidates) = - all_candidates.split_at(from - cur_offset); - logger.skip_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &skipped_candidates.into_iter().collect(), - ); - let candidates = candidates - .iter() - .take(length - results.len()) - .copied() - .collect::>(); - logger.add_to_results(&candidates); - results.extend(&candidates); - } - } else { - // if we have passed the offset already, add some of the documents (up to the limit) - let candidates = - candidates.iter().take(length - results.len()).collect::>(); - logger.add_to_results(&candidates); - results.extend(&candidates); - } - } - cur_offset += len as usize; - }; - } - - while results.len() < length { - // The universe for this bucket is zero or one element, so we don't need to sort - // anything, just extend the results and go back to the parent ranking rule. - if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { - maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); - ranking_rule_universes[cur_ranking_rule_index].clear(); - back!(); - continue; - } - - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? 
else { - back!(); - continue; - }; - - logger.next_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &ranking_rule_universes[cur_ranking_rule_index], - &next_bucket.candidates, - ); - - debug_assert!( - ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates) - ); - ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; - - if cur_ranking_rule_index == ranking_rules_len - 1 - || next_bucket.candidates.len() <= 1 - || cur_offset + (next_bucket.candidates.len() as usize) < from - { - maybe_add_to_results!(&next_bucket.candidates); - continue; - } - - cur_ranking_rule_index += 1; - ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); - logger.start_iteration_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &next_bucket.query, - &ranking_rule_universes[cur_ranking_rule_index], - ); - ranking_rules[cur_ranking_rule_index].start_iteration( - ctx, - logger, - &next_bucket.candidates, - &next_bucket.query, - )?; - } - - Ok(results) -} From c69cbec64a516629e22029737b35cf3dae10c8c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 11:20:04 +0200 Subject: [PATCH 145/234] Add more search tests --- milli/src/search/new/tests/language.rs | 2 +- .../src/search/new/tests/ngram_split_words.rs | 141 +++++++- milli/src/search/new/tests/proximity.rs | 317 ++++++++++++++++++ milli/src/search/new/tests/typo.rs | 148 +++++++- milli/src/search/new/tests/words_tms.rs | 185 +++++++++- 5 files changed, 766 insertions(+), 27 deletions(-) diff --git a/milli/src/search/new/tests/language.rs b/milli/src/search/new/tests/language.rs index 6adad748c..e16544fdb 100644 --- a/milli/src/search/new/tests/language.rs +++ b/milli/src/search/new/tests/language.rs @@ -18,5 +18,5 @@ fn test_kanji_language_detection() { search.query("東京"); let SearchResult { documents_ids, .. } = search.execute().unwrap(); - assert_eq!(documents_ids, vec![1]); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]"); } diff --git a/milli/src/search/new/tests/ngram_split_words.rs b/milli/src/search/new/tests/ngram_split_words.rs index 06c49274c..b78bbe763 100644 --- a/milli/src/search/new/tests/ngram_split_words.rs +++ b/milli/src/search/new/tests/ngram_split_words.rs @@ -16,7 +16,10 @@ This module tests the following properties: 13. Ngrams cannot be formed by combining a phrase and a word or two phrases */ -use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -46,6 +49,14 @@ fn create_index() -> TempIndex { { "id": 3, "text": "the sunflower is tall" + }, + { + "id": 4, + "text": "the sunflawer is tall" + }, + { + "id": 5, + "text": "sunflowering is not a verb" } ])) .unwrap(); @@ -67,8 +78,18 @@ fn test_2gram_simple() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("sun flower"); let SearchResult { documents_ids, .. 
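+    // "sun" and "flower" are adjacent, so they are also merged into the 2-gram
+    // "sunflower"; the ngram keeps the prefix tolerance of the trailing term,
+    // which is how "sunflowering" (document 5) matches as well.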
} = s.execute().unwrap(); - // will also match documents with "sun flower" - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]"); + // will also match documents with "sunflower" + prefix tolerance + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] fn test_3gram_simple() { @@ -87,6 +108,13 @@ fn test_3gram_simple() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sunflowers are pretty\"", + ] + "###); } #[test] @@ -99,7 +127,18 @@ fn test_2gram_typo() { s.query("sun flawer"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] @@ -119,6 +158,13 @@ fn test_no_disable_ngrams() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); // documents containing `sunflower` insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + "\"the sunflower is tall\"", + ] + "###); } #[test] @@ -137,7 +183,17 @@ fn test_2gram_prefix() { s.query("sun flow"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); // documents containing words beginning with `sunflow` - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] @@ -157,7 +213,16 @@ fn test_3gram_prefix() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); // documents containing a word beginning with sunfl - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] @@ -170,8 +235,17 @@ fn test_split_words() { s.query("sunflower "); let SearchResult { documents_ids, .. 
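+    // Conversely, the single word "sunflower" is split into the phrase
+    // "sun flower", so document 1 ("the sun flower is tall") matches, and typo
+    // derivations also let "sunflawer" (document 4) match.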
} = s.execute().unwrap(); - // all the documents with either `sunflower` or `sun flower` - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]"); + // all the documents with either `sunflower` or `sun flower` + eventual typo + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + ] + "###); } #[test] @@ -191,6 +265,12 @@ fn test_disable_split_words() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); // no document containing `sun flower` insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflower is tall\"", + ] + "###); } #[test] @@ -203,8 +283,18 @@ fn test_2gram_split_words() { s.query("sunf lower"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - // all the documents with "sunflower", "sun flower", or (sunflower + 1 typo) - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]"); + // all the documents with "sunflower", "sun flower", (sunflower + 1 typo), or (sunflower as prefix) + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] @@ -218,7 +308,15 @@ fn test_3gram_no_split_words() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); // no document with `sun flower` - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] @@ -231,7 +329,13 @@ fn test_3gram_no_typos() { s.query("sunf la wer"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflawer is tall\"", + ] + "###); } #[test] @@ -245,6 +349,13 @@ fn test_no_ngram_phrases() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sun flower is tall\"", + ] + "###); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); @@ -252,4 +363,10 @@ fn test_no_ngram_phrases() { let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + ] + "###); } diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index e69de29bb..f6e071572 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -0,0 +1,317 @@ +/*! +This module tests the Proximity ranking rule: + +1. A proximity of >7 always has the same cost. + +2. Phrase terms can be in proximity to other terms via their start and end words, +but we need to make sure that the phrase exists in the document that meets this +proximity condition. This is especially relevant with split words and synonyms. + +3. An ngram has the same proximity cost as its component words being consecutive. +e.g. `sunflower` equivalent to `sun flower`. + +4. The prefix databases can be used to find the proximity between two words, but +they store fewer proximities than the regular word proximity DB. + +*/ + +use std::collections::HashMap; + +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; + +fn create_simple_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "the very quick dark brown and smart fox did jump over the terribly lazy and small dog" + }, + { + "id": 1, + "text": "the. quick brown fox jumps over the lazy. dog" + }, + { + "id": 2, + "text": "the quick brown fox jumps over the lazy. 
dog" + }, + { + "id": 3, + "text": "dog the quick brown fox jumps over the lazy" + }, + { + "id": 4, + "text": "the quickbrown fox jumps over the lazy dog" + }, + { + "id": 5, + "text": "brown quick fox jumps over the lazy dog" + }, + { + "id": 6, + "text": "the really quick brown fox jumps over the very lazy dog" + }, + { + "id": 7, + "text": "the really quick brown fox jumps over the lazy dog" + }, + { + "id": 8, + "text": "the quick brown fox jumps over the lazy" + }, + { + "id": 9, + "text": "the quack brown fox jumps over the lazy" + }, + { + "id": 9, + "text": "the quack brown fox jumps over the lazy dog" + }, + { + "id": 10, + "text": "the quick brown fox jumps over the lazy dog" + } + ])) + .unwrap(); + index +} + +fn create_edge_cases_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + index.add_documents(documents!([ + { + // This document will insert "s" in the prefix database + "id": 0, + "text": " + saa sab sac sae saf sag sah sai saj sak sal sam san sao sap saq sar sasa sat sau sav saw sax say saz + sba sbb sbc sbe sbf sbg sbh sbi sbj sbk sbl sbm sbn sbo sbp sbq sbr sbsb sbt sbu sbv sbw sbx sby sbz + sca scb scc sce scf scg sch sci scj sck scl scm scn sco scp scq scr scsc sct scu scv scw scx scy scz + sda sdb sdc sde sdf sdg sdh sdi sdj sdk sdl sdm sdn sdo sdp sdq sdr sdsd sdt sdu sdv sdw sdx sdy sdz + sea seb sec see sef seg seh sei sej sek sel sem sen seo sep seq ser sese set seu sev sew sex sey sez + sfa sfb sfc sfe sff sfg sfh sfi sfj sfk sfl sfm sfn sfo sfp sfq sfr sfsf sft sfu sfv sfw sfx sfy sfz + sga sgb sgc sge sgf sgg sgh sgi sgj sgk sgl sgm sgn sgo sgp sgq sgr sgsg sgt sgu sgv sgw sgx sgy sgz + ska skb skc ske skf skg skh ski skj skk skl skm skn sko skp skq skr sksk skt sku skv skw skx sky skz + sla slb slc sle slf slg slh sli slj slk sll slm sln slo slp slq slr slsl slt slu slv slw slx sly slz + sma smb smc sme smf smg smh smi smj smk sml smm smn smo smp smq smr smsm smt smu smv smw smx smy smz + sna snb snc sne snf sng snh sni snj snk snl snm snn sno snp snq snr snsn snt snu snv snw snx sny snz + soa sob soc soe sof sog soh soi soj sok sol som son soo sop soq sor soso sot sou sov sow sox soy soz + spa spb spc spe spf spg sph spi spj spk spl spm spn spo spp spq spr spsp spt spu spv spw spx spy spz + sqa sqb sqc sqe sqf sqg sqh sqi sqj sqk sql sqm sqn sqo sqp sqq sqr sqsq sqt squ sqv sqw sqx sqy sqz + sra srb src sre srf srg srh sri srj srk srl srm srn sro srp srq srr srsr srt sru srv srw srx sry srz + ssa ssb ssc sse ssf ssg ssh ssi ssj ssk ssl ssm ssn sso ssp ssq ssr ssss sst ssu ssv ssw ssx ssy ssz + sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz + " + }, + // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`. + // If the search query is "sunflower", the split word "Sun Flower" will match some documents. + // If the query is `sunflower wilting`, then we should make sure that + // the proximity condition `flower wilting: prox N` also comes with the condition + // `sun wilting: prox N+1`. TODO: this is not the exact condition we use for now. + // We only check that the phrase `sun flower` exists and `flower wilting: prox N`, which + // is better than nothing but not the best. 
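+            // Concretely: for the query `sunflower wilting`, document 1 below can
+            // only match through the split phrase `sun flower`, so it should rank
+            // below document 2, where `flower` is genuinely close to `wilting`.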
+ { + "id": 1, + "text": "Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat." + }, + { + "id": 2, + "text": "Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat." + }, + { + "id": 3, + // This document matches the query `sunflower wilting`, but the proximity condition + // between `sunflower` and `wilting` cannot be through the split-word `Sun Flower` + // which would reduce to only `flower` and `wilting` being in proximity. + "text": "A flower wilting under the sun, unlike a sunflower" + }, + { + // This should be the best document for `sunflower wilting` + "id": 4, + "text": "sun flower wilting under the heat" + }, + { + // This is also the best document for `sunflower wilting` + "id": 5, + "text": "sunflower wilting under the heat" + }, + { + // Prox MAX between `best` and `s` prefix + "id": 6, + "text": "this is the best meal I have ever had in such a beautiful summer day" + }, + { + // Prox 5 between `best` and `s` prefix + "id": 7, + "text": "this is the best cooked meal of the summer" + }, + { + // Prox 4 between `best` and `s` prefix + "id": 8, + "text": "this is the best meal of the summer" + }, + { + // Prox 3 between `best` and `s` prefix + "id": 9, + "text": "this is the best meal of summer" + }, + { + // Prox 1 between `best` and `s` prefix + "id": 10, + "text": "this is the best summer meal" + }, + { + // Reverse Prox 3 between `best` and `s` prefix + "id": 11, + "text": "summer x y best" + }, + { + // Reverse Prox 2 between `best` and `s` prefix + "id": 12, + "text": "summer x best" + }, + { + // Reverse Prox 1 between `best` and `s` prefix + "id": 13, + "text": "summer best" + }, + ])).unwrap(); + index +} + +#[test] +fn test_proximity_simple() { + let index = create_simple_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 9, 10, 7, 6, 5, 2, 3, 0, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quickbrown fox jumps over the lazy dog\"", + "\"the quack brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the really quick brown fox jumps over the lazy dog\"", + "\"the really quick brown fox jumps over the very lazy dog\"", + "\"brown quick fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy. dog\"", + "\"dog the quick brown fox jumps over the lazy\"", + "\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"", + "\"the. quick brown fox jumps over the lazy. dog\"", + ] + "###); +} + +#[test] +fn test_proximity_split_word() { + let index = create_edge_cases_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunflower wilting"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: "2" and "4" should be swapped ideally + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", + "\"sun flower wilting under the heat\"", + "\"sunflower wilting under the heat\"", + "\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"", + "\"A flower wilting under the sun, unlike a sunflower\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("\"sun flower\" wilting"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: "2" and "4" should be swapped ideally + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", + "\"sun flower wilting under the heat\"", + "\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"", + ] + "###); + drop(txn); + + index + .update_settings(|s| { + let mut syns = HashMap::new(); + syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]); + s.set_synonyms(syns); + }) + .unwrap(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("xyz wilting"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: "2" and "4" should be swapped ideally + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", + "\"sun flower wilting under the heat\"", + "\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"", + ] + "###); +} + +#[test] +fn test_proximity_prefix_db() { + let index = create_edge_cases_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best s"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + // This test illustrates the loss of precision from using the prefix DB + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best summer meal\"", + "\"summer best\"", + "\"this is the best meal of summer\"", + "\"summer x best\"", + "\"this is the best meal of the summer\"", + "\"this is the best meal I have ever had in such a beautiful summer day\"", + "\"this is the best cooked meal of the summer\"", + "\"summer x y best\"", + ] + "###); +} diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs index 6ac8f5516..4df340e9b 100644 --- a/milli/src/search/new/tests/typo.rs +++ b/milli/src/search/new/tests/typo.rs @@ -21,8 +21,8 @@ if `words` doesn't exist before it. 
use std::collections::HashMap; use crate::{ - index::tests::TempIndex, Criterion, - Search, SearchResult, TermsMatchingStrategy, + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, }; fn create_index() -> TempIndex { @@ -130,6 +130,10 @@ fn create_index() -> TempIndex { "id": 22, "text": "the quick brown fox jumps over the lackadaisical dog" }, + { + "id": 23, + "text": "the quivk brown fox jumps over the lazy dog" + }, ])) .unwrap(); index @@ -151,6 +155,12 @@ fn test_no_typo() { s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); } #[test] @@ -168,7 +178,14 @@ fn test_default_typo() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 23]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quivk brown fox jumps over the lazy dog\"", + ] + "###); // 1 typo on one word, replaced letter let mut s = Search::new(&txn, &index); @@ -176,6 +193,12 @@ fn test_default_typo() { s.query("the quack brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); // 1 typo on one word, missing letter, extra letter let mut s = Search::new(&txn, &index); @@ -183,6 +206,12 @@ fn test_default_typo() { s.query("the quicest brownest fox jummps over the laziest dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quickest brownest fox jumps over the laziest dog\"", + ] + "###); // 1 typo on one word, swapped letters let mut s = Search::new(&txn, &index); @@ -190,6 +219,12 @@ fn test_default_typo() { s.query("the quikc borwn fox jupms over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); // 1 first letter typo on a word <5 bytes, replaced letter let mut s = Search::new(&txn, &index); @@ -211,6 +246,12 @@ fn test_default_typo() { s.query("the quack brawn fox junps over the lazy dog"); let SearchResult { documents_ids, .. 
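+    // "quack", "brawn" and "junps" each replace one letter in the middle of a
+    // five-byte word, so every word presumably stays within its one-typo budget
+    // and document 0 still matches.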
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); // 2 typos on words < 9 bytes let mut s = Search::new(&txn, &index); @@ -225,6 +266,12 @@ fn test_default_typo() { s.query("the extravant fox kyrocketed over the lamguorout dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the extravagant fox skyrocketed over the languorous dog\"", + ] + "###); // 2 typos on words >= 9 bytes: 2 extra letters in a single word, swapped letters + extra letter, replaced letters let mut s = Search::new(&txn, &index); @@ -232,6 +279,12 @@ fn test_default_typo() { s.query("the extravaganttt fox sktyrocnketed over the lagnuorrous dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the extravagant fox skyrocketed over the languorous dog\"", + ] + "###); } #[test] @@ -244,6 +297,8 @@ fn test_phrase_no_typo_allowed() { s.query("the \"quick brewn\" fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @"[]"); } #[test] @@ -256,12 +311,20 @@ fn test_ngram_typos() { s.query("the extra lagant fox skyrocketed over the languorous dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the extravagant fox skyrocketed over the languorous dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("the ex tra lagant fox skyrocketed over the languorous dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @"[]"); } #[test] fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() { @@ -278,7 +341,29 @@ fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids: ids_1, .. 
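+    // The same query is re-run below with `typo` alone, and the orderings are
+    // asserted to be identical (`assert_eq!(ids_1, ids_2)`): `words` is implied
+    // before `typo` when it doesn't exist in the criteria.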
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); + insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 23, 7, 8, 9, 22, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); + let texts = collect_field_values(&index, &txn, "text", &ids_1); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quivk brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the lackadaisical dog\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown foxes jump over the lazy dog\"", + "\"the quick brown fax sends a letter to the dog\"", + "\"the quick brown\"", + "\"the quick\"", + "\"a fox doesn't quack, that crown goes to the duck.\"", + "\"the quickest brownest fox jumps over the laziest dog\"", + "\"the quicker browner fox jumped over the lazier dog\"", + "\"the extravagant fox skyrocketed over the languorous dog\"", + "\"the fast brownish fox jumps over the lackadaisical dog\"", + ] + "###); index .update_settings(|s| { @@ -290,7 +375,7 @@ fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids: ids_2, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); + insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 23, 7, 8, 9, 22, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); assert_eq!(ids_1, ids_2); } @@ -307,6 +392,17 @@ fn test_typo_bucketing() { s.query("network interconnection sunflower"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 15, 16, 17, 18, 20]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"netwolk interconections sunflawar\"", + "\"network interconnections sunflawer\"", + "\"network interconnection sunflower\"", + "\"network interconnection sun flower\"", + "\"network interconnection sunflowering\"", + "\"network interconnection sunflowar\"", + ] + "###); // Then with the typo ranking rule drop(txn); @@ -322,12 +418,34 @@ fn test_typo_bucketing() { s.query("network interconnection sunflower"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18, 17, 20, 15, 14]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"network interconnection sunflower\"", + "\"network interconnection sunflowering\"", + "\"network interconnection sun flower\"", + "\"network interconnection sunflowar\"", + "\"network interconnections sunflawer\"", + "\"netwolk interconections sunflawar\"", + ] + "###); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("network interconnection sun flower"); let SearchResult { documents_ids, .. 
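// The typo ranking rule buckets candidates by increasing typo cost
// (0-typo matches first, then one typo, then two; ngrams, prefixes and
// split words carry their own costs), which is the ordering the
// assertions below depend on.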
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[17, 19, 16, 18, 20, 15]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"network interconnection sun flower\"", + "\"network interconnection sun flowering\"", + "\"network interconnection sunflower\"", + "\"network interconnection sunflowering\"", + "\"network interconnection sunflowar\"", + "\"network interconnections sunflawer\"", + ] + "###); } #[test] @@ -350,7 +468,15 @@ fn test_typo_synonyms() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("the quick brown fox jumps over the lackadaisical dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 22, 23]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lackadaisical dog\"", + "\"the quivk brown fox jumps over the lazy dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); @@ -359,5 +485,13 @@ fn test_typo_synonyms() { // TODO: is this correct? interaction of ngrams + synonyms means that the // multi-word synonyms end up having a typo cost. This is probably not what we want. let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the fast brownish fox jumps over the lackadaisical dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lackadaisical dog\"", + ] + "###); } diff --git a/milli/src/search/new/tests/words_tms.rs b/milli/src/search/new/tests/words_tms.rs index 8b5c0153f..74748ea5a 100644 --- a/milli/src/search/new/tests/words_tms.rs +++ b/milli/src/search/new/tests/words_tms.rs @@ -12,9 +12,12 @@ account by the proximity ranking rule. 7. 
The search is capable of returning no results if no documents match the query */ -use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; -fn create_quick_brown_fox_trivial_index() -> TempIndex { +fn create_index() -> TempIndex { let index = TempIndex::new(); index @@ -126,7 +129,7 @@ fn create_quick_brown_fox_trivial_index() -> TempIndex { #[test] fn test_words_tms_last_simple() { - let index = create_quick_brown_fox_trivial_index(); + let index = create_index(); let txn = index.read_txn().unwrap(); let mut s = Search::new(&txn, &index); @@ -136,6 +139,31 @@ fn test_words_tms_last_simple() { // 6 and 7 have the same score because "the" appears twice insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 8, 6, 7, 5, 4, 11, 12, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the brown quick fox jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy blue dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + "\"the quick brown\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("extravagant the quick brown fox jumps over the lazy dog"); @@ -146,7 +174,7 @@ fn test_words_tms_last_simple() { #[test] fn test_words_tms_last_phrase() { - let index = create_quick_brown_fox_trivial_index(); + let index = create_index(); let txn = index.read_txn().unwrap(); let mut s = Search::new(&txn, &index); @@ -156,6 +184,21 @@ fn test_words_tms_last_phrase() { // "The quick brown fox" is a phrase, not deleted by this term matching strategy insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 6, 7, 5, 4, 11, 12]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. 
the lazy dog.\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("\"the quick brown fox\" jumps over the \"lazy\" dog"); @@ -165,6 +208,17 @@ fn test_words_tms_last_phrase() { // "lazy" is a phrase, not deleted by this term matching strategy // but words before it can be deleted insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 11, 12]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("\"the quick brown fox jumps over the lazy dog\""); @@ -173,6 +227,12 @@ fn test_words_tms_last_phrase() { // The whole query is a phrase, no terms are removed insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("\"the quick brown fox jumps over the lazy dog"); @@ -181,11 +241,17 @@ fn test_words_tms_last_phrase() { // The whole query is still a phrase, even without closing quotes, so no terms are removed insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); } #[test] fn test_words_proximity_tms_last_simple() { - let index = create_quick_brown_fox_trivial_index(); + let index = create_index(); index .update_settings(|s| { s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); @@ -200,6 +266,31 @@ fn test_words_proximity_tms_last_simple() { // 7 is better than 6 because of the proximity between "the" and its surrounding terms insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. 
the lazy dog.\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy blue dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + "\"the quick brown\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("the brown quick fox jumps over the lazy dog"); @@ -208,11 +299,36 @@ fn test_words_proximity_tms_last_simple() { // 10 is better than 9 because of the proximity between "quick" and "brown" insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the brown quick fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy blue dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + "\"the quick brown\"", + ] + "###); } #[test] fn test_words_proximity_tms_last_phrase() { - let index = create_quick_brown_fox_trivial_index(); + let index = create_index(); index .update_settings(|s| { s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); @@ -228,6 +344,26 @@ fn test_words_proximity_tms_last_phrase() { // "quick brown" is a phrase. The proximity of its first and last words // to their adjacent query words should be taken into account insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5, 4, 11, 12, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. 
the lazy dog.\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + "\"the quick brown\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("the \"quick brown\" \"fox jumps\" over the lazy dog"); @@ -238,11 +374,27 @@ fn test_words_proximity_tms_last_phrase() { // to their adjacent query words should be taken into account. // The same applies to `fox jumps`. insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps\"", + ] + "###); } #[test] fn test_words_tms_all() { - let index = create_quick_brown_fox_trivial_index(); + let index = create_index(); index .update_settings(|s| { s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); @@ -256,6 +408,23 @@ fn test_words_tms_all() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy blue dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("extravagant"); @@ -263,4 +432,6 @@ fn test_words_tms_all() { let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @"[]"); } From 4c8a0179ba1c243791a3b24fae2f37e12e50e8cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 11:30:49 +0200 Subject: [PATCH 146/234] Add more search tests --- milli/src/search/new/tests/proximity.rs | 176 ++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 11 deletions(-) diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index f6e071572..44ff94f1d 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -1,17 +1,17 @@ /*! This module tests the Proximity ranking rule: -1. A proximity of >7 always has the same cost. +1. A sprximity of >7 always has the same cost. -2. Phrase terms can be in proximity to other terms via their start and end words, +2. Phrase terms can be in sprximity to other terms via their start and end words, but we need to make sure that the phrase exists in the document that meets this proximity condition. This is especially relevant with split words and synonyms. -3. An ngram has the same proximity cost as its component words being consecutive. +3. An ngram has the same sprximity cost as its component words being consecutive. e.g. `sunflower` equivalent to `sun flower`. -4. The prefix databases can be used to find the proximity between two words, but -they store fewer proximities than the regular word proximity DB. +4. The prefix databases can be used to find the sprximity between two words, but +they store fewer sprximities than the regular word sprximity DB. */ @@ -126,9 +126,9 @@ fn create_edge_cases_index() -> TempIndex { // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`. // If the search query is "sunflower", the split word "Sun Flower" will match some documents. // If the query is `sunflower wilting`, then we should make sure that - // the proximity condition `flower wilting: prox N` also comes with the condition - // `sun wilting: prox N+1`. TODO: this is not the exact condition we use for now. - // We only check that the phrase `sun flower` exists and `flower wilting: prox N`, which + // the sprximity condition `flower wilting: sprx N` also comes with the condition + // `sun wilting: sprx N+1`. TODO: this is not the exact condition we use for now. + // We only check that the phrase `sun flower` exists and `flower wilting: sprx N`, which // is better than nothing but not the best. { "id": 1, @@ -140,9 +140,9 @@ fn create_edge_cases_index() -> TempIndex { }, { "id": 3, - // This document matches the query `sunflower wilting`, but the proximity condition + // This document matches the query `sunflower wilting`, but the sprximity condition // between `sunflower` and `wilting` cannot be through the split-word `Sun Flower` - // which would reduce to only `flower` and `wilting` being in proximity. + // which would reduce to only `flower` and `wilting` being in sprximity. 
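// A sketch of the check described in the module doc, reusing
// `get_phrase_docids` (seen elsewhere in this series); the other names
// are illustrative only:
//
//     // documents where the phrase "sun flower" actually occurs
//     let phrase_docids = ctx.get_phrase_docids(sun_flower)?.clone();
//     // documents where `flower` and `wilting` are within proximity n
//     let pair_docids = prox_docids(ctx, "flower", "wilting", n)?;
//     // the split word may only bridge the proximity through documents
//     // belonging to both sets
//     let candidates = phrase_docids & pair_docids;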
"text": "A flower wilting under the sun, unlike a sunflower" }, { @@ -195,6 +195,69 @@ fn create_edge_cases_index() -> TempIndex { "id": 13, "text": "summer best" }, + { + // This document will insert "win" in the prefix database + "id": 14, + "text": " + winaa winab winac winae winaf winag winah winai winaj winak winal winam winan winao winap winaq winar winasa winat winau winav winaw winax winay winaz + winba winbb winbc winbe winbf winbg winbh winbi winbj winbk winbl winbm winbn winbo winbp winbq winbr winbsb winbt winbu winbv winbw winbx winby winbz + winca wincb wincc wince wincf wincg winch winci wincj winck wincl wincm wincn winco wincp wincq wincr wincsc winct wincu wincv wincw wincx wincy wincz + winda windb windc winde windf windg windh windi windj windk windl windm windn windo windp windq windr windsd windt windu windv windw windx windy windz + winea wineb winec winee winef wineg wineh winei winej winek winel winem winen wineo winep wineq winer winese winet wineu winev winew winex winey winez + winfa winfb winfc winfe winff winfg winfh winfi winfj winfk winfl winfm winfn winfo winfp winfq winfr winfsf winft winfu winfv winfw winfx winfy winfz + winga wingb wingc winge wingf wingg wingh wingi wingj wingk wingl wingm wingn wingo wingp wingq wingr wingsg wingt wingu wingv wingw wingx wingy wingz + winka winkb winkc winke winkf winkg winkh winki winkj winkk winkl winkm winkn winko winkp winkq winkr winksk winkt winku winkv winkw winkx winky winkz + winla winlb winlc winle winlf winlg winlh winli winlj winlk winll winlm winln winlo winlp winlq winlr winlsl winlt winlu winlv winlw winlx winly winlz + winma winmb winmc winme winmf winmg winmh winmi winmj winmk winml winmm winmn winmo winmp winmq winmr winmsm winmt winmu winmv winmw winmx winmy winmz + winna winnb winnc winne winnf winng winnh winni winnj winnk winnl winnm winnn winno winnp winnq winnr winnsn winnt winnu winnv winnw winnx winny winnz + winoa winob winoc winoe winof winog winoh winoi winoj winok winol winom winon winoo winop winoq winor winoso winot winou winov winow winox winoy winoz + winpa winpb winpc winpe winpf winpg winph winpi winpj winpk winpl winpm winpn winpo winpp winpq winpr winpsp winpt winpu winpv winpw winpx winpy winpz + winqa winqb winqc winqe winqf winqg winqh winqi winqj winqk winql winqm winqn winqo winqp winqq winqr winqsq winqt winqu winqv winqw winqx winqy winqz + winra winrb winrc winre winrf winrg winrh winri winrj winrk winrl winrm winrn winro winrp winrq winrr winrsr winrt winru winrv winrw winrx winry winrz + winsa winsb winsc winse winsf winsg winsh winsi winsj winsk winsl winsm winsn winso winsp winsq winsr winsss winst winsu winsv winsw winsx winsy winsz + winta wintb wintc winte wintf wintg winth winti wintj wintk wintl wintm wintn winto wintp wintq wintr wintst wintt wintu wintv wintw wintx winty wintz + " + }, + { + // Prox MAX between `best` and `win` prefix + "id": 15, + "text": "this is the best meal I have ever had in such a beautiful winter day" + }, + { + // Prox 5 between `best` and `win` prefix + "id": 16, + "text": "this is the best cooked meal of the winter" + }, + { + // Prox 4 between `best` and `win` prefix + "id": 17, + "text": "this is the best meal of the winter" + }, + { + // Prox 3 between `best` and `win` prefix + "id": 18, + "text": "this is the best meal of winter" + }, + { + // Prox 1 between `best` and `win` prefix + "id": 19, + "text": "this is the best winter meal" + }, + { + // Reverse Prox 3 between `best` and `win` prefix + "id": 20, + "text": "winter x y 
best" + }, + { + // Reverse Prox 2 between `best` and `win` prefix + "id": 21, + "text": "winter x best" + }, + { + // Reverse Prox 1 between `best` and `win` prefix + "id": 22, + "text": "winter best" + }, ])).unwrap(); index } @@ -298,7 +361,7 @@ fn test_proximity_prefix_db() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("best s"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11, 15]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); // This test illustrates the loss of precision from using the prefix DB @@ -312,6 +375,97 @@ fn test_proximity_prefix_db() { "\"this is the best meal I have ever had in such a beautiful summer day\"", "\"this is the best cooked meal of the summer\"", "\"summer x y best\"", + "\"this is the best meal I have ever had in such a beautiful winter day\"", + ] + "###); + + // Difference when using the `su` prefix, which is not in the prefix DB + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best su"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 11, 7, 6, 15]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best summer meal\"", + "\"summer best\"", + "\"this is the best meal of summer\"", + "\"summer x best\"", + "\"this is the best meal of the summer\"", + "\"summer x y best\"", + "\"this is the best cooked meal of the summer\"", + "\"this is the best meal I have ever had in such a beautiful summer day\"", + "\"this is the best meal I have ever had in such a beautiful winter day\"", + ] + "###); + + // Note that there is a case where a prefix is in the prefix DB but not in the + // **proximity** prefix DB. In that case, its sprximity score will always be + // the maximum. This happens for prefixes that are larger than 2 bytes. + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best win"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best meal I have ever had in such a beautiful winter day\"", + "\"this is the best cooked meal of the winter\"", + "\"this is the best meal of the winter\"", + "\"this is the best meal of winter\"", + "\"this is the best winter meal\"", + "\"winter x y best\"", + "\"winter x best\"", + "\"winter best\"", + ] + "###); + + // Now using `wint`, which is not in the prefix DB: + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best wint"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 20, 16, 15]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best winter meal\"", + "\"winter best\"", + "\"this is the best meal of winter\"", + "\"winter x best\"", + "\"this is the best meal of the winter\"", + "\"winter x y best\"", + "\"this is the best cooked meal of the winter\"", + "\"this is the best meal I have ever had in such a beautiful winter day\"", + ] + "###); + + // and using `wi` which is in the prefix DB and proximity prefix DB + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best wi"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 15, 16, 20]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best winter meal\"", + "\"winter best\"", + "\"this is the best meal of winter\"", + "\"winter x best\"", + "\"this is the best meal of the winter\"", + "\"this is the best meal I have ever had in such a beautiful winter day\"", + "\"this is the best cooked meal of the winter\"", + "\"winter x y best\"", ] "###); } From 6e50f23896418c5d363a69cc13813fc6bad548f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 13:33:23 +0200 Subject: [PATCH 147/234] Add more search tests --- milli/src/search/new/tests/mod.rs | 2 + milli/src/search/new/tests/proximity.rs | 2 +- milli/src/search/new/tests/proximity_typo.rs | 68 ++++++++++ milli/src/search/new/tests/typo_proximity.rs | 126 +++++++++++++++++++ 4 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 milli/src/search/new/tests/proximity_typo.rs create mode 100644 milli/src/search/new/tests/typo_proximity.rs diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 0fd5013db..898276858 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -3,8 +3,10 @@ pub mod distinct; pub mod language; pub mod ngram_split_words; pub mod proximity; +pub mod proximity_typo; pub mod sort; pub mod typo; +pub mod typo_proximity; pub mod words_tms; fn collect_field_values( diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index 44ff94f1d..880f933f0 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -1,7 +1,7 @@ /*! This module tests the Proximity ranking rule: -1. A sprximity of >7 always has the same cost. +1. A proximity of >7 always has the same cost. 2. Phrase terms can be in sprximity to other terms via their start and end words, but we need to make sure that the phrase exists in the document that meets this diff --git a/milli/src/search/new/tests/proximity_typo.rs b/milli/src/search/new/tests/proximity_typo.rs new file mode 100644 index 000000000..3bf869d1d --- /dev/null +++ b/milli/src/search/new/tests/proximity_typo.rs @@ -0,0 +1,68 @@ +/*! +This module tests the interactions between the proximity and typo ranking rules. + +The proximity ranking rule should transform the query graph such that it +only contains the word pairs that it used to compute its bucket. 
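On the `summer holiday` query used below, this means the graph handed down
to the typo ranking rule should be reduced to just the pairs the proximity
rule actually used, along the lines of (a sketch):

```txt
START ── sommer ──┬── holiday ── END
                  └── holidty ── END
```

so that the typo rule can no longer find its zero-typo bucket
`summer + holiday` in either document.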
+*/ + +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words, Criterion::Proximity, Criterion::Typo]); + }) + .unwrap(); + + index + .add_documents(documents!([ + // Basic trap. + // + // We have one document with the perfect word pair: `sommer - holiday` + // and another with the perfect word pair: `sommer holidty`. + // + // The proximity ranking rule will put them both in the same bucket, and it + // should minify the query graph to make it represent: + // EITHER: + // sommer + holiday + // OR: + // sommer + holidty + // + // Such that the child typo ranking rule does not find any match + // for its zero-typo bucket `summer + holiday`, even though both documents + // contain these two exact words. + { + "id": 0, + "text": "summer. holiday. sommer holidty" + }, + { + "id": 1, + "text": "summer. holiday. sommer holiday" + }, + + ])) + .unwrap(); + index +} + +#[test] +fn test_trap_basic() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("summer holiday"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + "###); +} diff --git a/milli/src/search/new/tests/typo_proximity.rs b/milli/src/search/new/tests/typo_proximity.rs new file mode 100644 index 000000000..ba6371544 --- /dev/null +++ b/milli/src/search/new/tests/typo_proximity.rs @@ -0,0 +1,126 @@ +/*! +This module tests the interactions between the typo and proximity ranking rules. + +The typo ranking rule should transform the query graph such that it only contains +the combinations of word derivations that it used to compute its bucket. + +The proximity ranking rule should then look for proximities only between those specific derivations. +For example, given the the search query `beautiful summer` and the dataset: +```text +{ "id": 0, "text": "beautigul summer...... beautiful day in the summer" } +{ "id": 1, "text": "beautiful summer" } +``` +Then the document with id `1` should be returned before `0`. +The proximity ranking rule is not allowed to look for the proximity between `beautigul` and `summer` +because the typo ranking rule before it only used the derivation `beautiful`. +*/ + +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]); + }) + .unwrap(); + + index + .add_documents(documents!([ + // trap explained in the module documentation + { + "id": 0, + "text": "beautigul summer. beautiful x y z summer" + }, + { + "id": 1, + "text": "beautiful summer" + }, + // the next 2 documents set up a more complicated trap + // with the query `beautiful summer`, we will have: + // 1. documents with no typos, id 0 and 1 + // 2. 
documents with 1 typos: id 2 and 3, those are interpreted as EITHER + // - id 2: "beautigul + summer" ; OR + // - id 3: "beautiful + sommer" + // To sort these two documents, the proximity ranking rule must use only the + // word pairs: `beautigul -- summer` and `beautiful -- sommer` even though + // all variations of `beautiful` and `sommer` were used by the typo ranking rule. + { + "id": 2, + "text": "beautigul sommer. beautigul x summer" + }, + { + "id": 3, + "text": "beautiful sommer" + }, + // The next two documents lay out an even more complex trap, which the current implementation + // fails to handle properly. + // With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of: + // - id 4: delicitous + sweet + dessert + // - id 5: beautiful + sweet + desgert + // The word pairs that the proximity ranking rules is allowed to use are + // EITHER: + // delicitous -- sweet AND sweet -- dessert + // OR + // delicious -- sweet AND sweet -- desgert + // So the word pair to use for the terms `summer` and `dessert` depend on the + // word pairs explored before them. + { + "id": 4, + "text": "delicitous. sweet. dessert. delicitous sweet desgert", + }, + { + "id": 5, + "text": "delicious. sweet desgert. delicious sweet desgert", + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_trap_basic_and_complex1() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("beautiful summer"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"beautiful summer\"", + "\"beautigul summer. beautiful x y z summer\"", + "\"beautiful sommer\"", + "\"beautigul sommer. beautigul x summer\"", + ] + "###); +} + +#[test] +fn test_trap_complex2() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("delicious sweet dessert"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: this is incorrect. 5 should appear before 4 + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"delicitous. sweet. dessert. delicitous sweet desgert\"", + "\"delicious. sweet desgert. 
delicious sweet desgert\"", + ] + "###); +} From b5691802a32fd63697a83882124ecf645ae4df86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 14:10:01 +0200 Subject: [PATCH 148/234] Add new tests and fix construction of query graph from paths --- milli/src/search/new/interner.rs | 3 + milli/src/search/new/query_graph.rs | 101 ++++++++++--------- milli/src/search/new/tests/proximity_typo.rs | 9 +- milli/src/search/new/tests/sort.rs | 12 +-- milli/src/search/new/tests/typo_proximity.rs | 8 +- 5 files changed, 73 insertions(+), 60 deletions(-) diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index e9bfbef86..ebf18f38c 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -176,6 +176,9 @@ impl Interner { pub fn iter_mut(&mut self) -> impl Iterator, &mut T)> { self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x)) } + pub fn freeze(self) -> FixedSizeInterner { + FixedSizeInterner { stable_store: self.stable_store } + } } /// A store of values of type `T`, each linked to a value of type `From` diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 33e178494..2662ef730 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -4,7 +4,7 @@ use super::query_term::{ }; use super::small_bitmap::SmallBitmap; use super::SearchContext; -use crate::search::new::interner::DedupInterner; +use crate::search::new::interner::Interner; use crate::Result; use std::cmp::Ordering; use std::collections::BTreeMap; @@ -345,29 +345,13 @@ impl QueryGraph { Build a query graph from a list of paths The paths are composed of source and dest terms. - If the source term is `None`, then the last dest term is used - as the predecessor of the dest term. If the source is Some(_), - then an edge is built between the last dest term and the source, - and between the source and new dest term. - Note that the resulting graph will not correspond to a perfect - representation of the set of paths. 
For example, consider the following paths: ```txt PATH 1 : a -> b1 -> c1 -> d -> e1 PATH 2 : a -> b2 -> c2 -> d -> e2 ``` Then the resulting graph will be: - ```txt - ┌────┐ ┌────┐ ┌────┐ - ┌──│ b1 │──│ c1 │─┐ ┌──│ e1 │ - ┌────┐ │ └────┘ └────┘ │ ┌────┐ │ └────┘ - │ a │─┤ ├─│ d │─┤ - └────┘ │ ┌────┐ ┌────┐ │ └────┘ │ ┌────┐ - └──│ b2 │──│ c2 │─┘ └──│ e2 │ - └────┘ └────┘ └────┘ - ``` - which is different from the fully correct representation: ```txt ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌──│ b1 │──│ c1 │───│ d │───│ e1 │ @@ -383,21 +367,51 @@ impl QueryGraph { pub fn build_from_paths( paths: Vec, LocatedQueryTermSubset)>>, ) -> Self { - let mut node_data = DedupInterner::default(); - let root_node = node_data.insert(QueryNodeData::Start); - let end_node = node_data.insert(QueryNodeData::End); + let mut node_data = Interner::default(); + let root_node = node_data.push(QueryNodeData::Start); + let end_node = node_data.push(QueryNodeData::End); + + let mut paths_with_single_terms = vec![]; + + for path in paths { + let mut processed_path = vec![]; + let mut prev_dest_term: Option = None; + for (start_term, dest_term) in path { + if let Some(prev_dest_term) = prev_dest_term.take() { + if let Some(mut start_term) = start_term { + if start_term.term_ids == prev_dest_term.term_ids { + start_term.term_subset.intersect(&prev_dest_term.term_subset); + processed_path.push(start_term); + } else { + processed_path.push(prev_dest_term); + processed_path.push(start_term); + } + } else { + processed_path.push(prev_dest_term); + } + } else if let Some(start_term) = start_term { + processed_path.push(start_term); + } + prev_dest_term = Some(dest_term); + } + if let Some(prev_dest_term) = prev_dest_term { + processed_path.push(prev_dest_term); + } + paths_with_single_terms.push(processed_path); + } + + // TODO: make a prefix tree of the processed paths to avoid uselessly duplicating nodes let mut paths_with_ids = vec![]; - for path in paths { + for path in paths_with_single_terms { let mut path_with_ids = vec![]; - for node in path { - let (start_term, end_term) = node; - let src_node_id = start_term.map(|x| node_data.insert(QueryNodeData::Term(x))); - let dest_node_id = node_data.insert(QueryNodeData::Term(end_term)); - path_with_ids.push((src_node_id, dest_node_id)); + for term in path { + let id = node_data.push(QueryNodeData::Term(term)); + path_with_ids.push(Interned::from_raw(id.into_raw())); } paths_with_ids.push(path_with_ids); } + let nodes_data = node_data.freeze(); let nodes_data_len = nodes_data.len(); let mut nodes = nodes_data.map_move(|n| QueryNode { @@ -406,31 +420,22 @@ impl QueryGraph { successors: SmallBitmap::new(nodes_data_len), }); - let root_node = Interned::from_raw(root_node.into_raw()); - let end_node = Interned::from_raw(end_node.into_raw()); + let root_node = Interned::::from_raw(root_node.into_raw()); + let end_node = Interned::::from_raw(end_node.into_raw()); for path in paths_with_ids { - let mut prev_node = root_node; - for node in path { - let (start_term, dest_term) = node; - let end_term = Interned::from_raw(dest_term.into_raw()); - let src = if let Some(start_term) = start_term { - // TODO: this is incorrect! should take the intersection - // between the prev node and the start term if they refer to the same - // original query term! 
- let start_term = Interned::from_raw(start_term.into_raw()); - nodes.get_mut(prev_node).successors.insert(start_term); - nodes.get_mut(start_term).predecessors.insert(prev_node); - start_term - } else { - prev_node - }; - nodes.get_mut(src).successors.insert(end_term); - nodes.get_mut(end_term).predecessors.insert(src); - prev_node = end_term; + let mut prev_node_id = root_node; + for node_id in path { + let prev_node = nodes.get_mut(prev_node_id); + prev_node.successors.insert(node_id); + let node = nodes.get_mut(node_id); + node.predecessors.insert(prev_node_id); + prev_node_id = node_id; } - nodes.get_mut(prev_node).successors.insert(end_node); - nodes.get_mut(end_node).predecessors.insert(prev_node); + let prev_node = nodes.get_mut(prev_node_id); + prev_node.successors.insert(end_node); + let node = nodes.get_mut(end_node); + node.predecessors.insert(prev_node_id); } QueryGraph { root_node, end_node, nodes } diff --git a/milli/src/search/new/tests/proximity_typo.rs b/milli/src/search/new/tests/proximity_typo.rs index 3bf869d1d..9f9601e3f 100644 --- a/milli/src/search/new/tests/proximity_typo.rs +++ b/milli/src/search/new/tests/proximity_typo.rs @@ -3,6 +3,8 @@ This module tests the interactions between the proximity and typo ranking rules. The proximity ranking rule should transform the query graph such that it only contains the word pairs that it used to compute its bucket. + +TODO: This is not currently implemented. */ use crate::{ @@ -61,8 +63,13 @@ fn test_trap_basic() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("summer holiday"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: this is incorrect, 1 should come before 0 insta::assert_debug_snapshot!(texts, @r###" + [ + "\"summer. holiday. sommer holidty\"", + "\"summer. holiday. sommer holiday\"", + ] "###); } diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs index d3a952a24..d2201f55b 100644 --- a/milli/src/search/new/tests/sort.rs +++ b/milli/src/search/new/tests/sort.rs @@ -271,13 +271,13 @@ fn test_sort() { "false", "true", "true", - "__does_not_exist___", + "__does_not_exist__", "null", "[null,null,\"\"]", "\"\"", "{\"sub\":0}", - "__does_not_exist___", - "__does_not_exist___", + "__does_not_exist__", + "__does_not_exist__", ] "###); @@ -304,13 +304,13 @@ fn test_sort() { "false", "\"1\"", "\"0\"", - "__does_not_exist___", + "__does_not_exist__", "null", "[null,null,\"\"]", "\"\"", "{\"sub\":0}", - "__does_not_exist___", - "__does_not_exist___", + "__does_not_exist__", + "__does_not_exist__", ] "###); } diff --git a/milli/src/search/new/tests/typo_proximity.rs b/milli/src/search/new/tests/typo_proximity.rs index ba6371544..220fc69e1 100644 --- a/milli/src/search/new/tests/typo_proximity.rs +++ b/milli/src/search/new/tests/typo_proximity.rs @@ -59,8 +59,7 @@ fn create_index() -> TempIndex { "id": 3, "text": "beautiful sommer" }, - // The next two documents lay out an even more complex trap, which the current implementation - // fails to handle properly. + // The next two documents lay out an even more complex trap. 
// With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of: // - id 4: delicitous + sweet + dessert // - id 5: beautiful + sweet + desgert @@ -114,13 +113,12 @@ fn test_trap_complex2() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("delicious sweet dessert"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); - // TODO: this is incorrect. 5 should appear before 4 insta::assert_debug_snapshot!(texts, @r###" [ - "\"delicitous. sweet. dessert. delicitous sweet desgert\"", "\"delicious. sweet desgert. delicious sweet desgert\"", + "\"delicitous. sweet. dessert. delicitous sweet desgert\"", ] "###); } From 337e75b0e4fa63566ed5ac516f504a723117ecb2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:42:51 +0200 Subject: [PATCH 149/234] Exact attribute with state --- milli/src/search/new/exact_attribute.rs | 166 +++++++++++++++++------- 1 file changed, 122 insertions(+), 44 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index bb6299e28..fa837272b 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -1,5 +1,5 @@ use heed::BytesDecode; -use roaring::MultiOps; +use roaring::{MultiOps, RoaringBitmap}; use super::query_graph::QueryGraph; use super::ranking_rules::{RankingRule, RankingRuleOutput}; @@ -7,19 +7,18 @@ use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::ExactTerm; use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; -/// FIXME: +/// A ranking rule that produces 3 disjoint buckets: /// -/// - A lot of work done in next_bucket that start_iteration could do. -/// - Consider calling the graph based rule directly from this one. -/// - currently we did exact term, don't forget about prefix -/// - some tests +/// 1. Documents from the universe whose value is exactly the query. +/// 2. Documents from the universe not in (1) whose value starts with the query. +/// 3. Documents from the universe not in (1) or (2). 
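///
/// For example (an illustration, not a case from the test suite), with the
/// query `lazy dog`: bucket (1) holds documents where some attribute is
/// exactly "lazy dog", bucket (2) those where an attribute merely starts
/// with "lazy dog", and bucket (3) all remaining documents of the universe.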
pub struct ExactAttribute { - query_graph: Option, + state: State, } impl ExactAttribute { pub fn new() -> Self { - Self { query_graph: None } + Self { state: Default::default() } } } @@ -30,23 +29,69 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { fn start_iteration( &mut self, - _ctx: &mut SearchContext<'ctx>, + ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, - _universe: &roaring::RoaringBitmap, + universe: &roaring::RoaringBitmap, query: &QueryGraph, ) -> Result<()> { - self.query_graph = Some(query.clone()); + self.state = State::start_iteration(ctx, universe, query)?; + Ok(()) } fn next_bucket( &mut self, - ctx: &mut SearchContext<'ctx>, + _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, universe: &roaring::RoaringBitmap, ) -> Result>> { - // iterate on the nodes of the graph, retain LocatedQueryTermSubset - let query_graph = self.query_graph.as_ref().unwrap(); + let state = std::mem::take(&mut self.state); + let (state, output) = State::next(state, universe); + self.state = state; + + Ok(output) + } + + fn end_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + ) { + self.state = Default::default(); + } +} + +/// Inner state of the ranking rule. +#[derive(Default)] +enum State { + /// State between two iterations + #[default] + Uninitialized, + /// The next call to `next` will output the documents in the universe that have an attribute that is the exact query + ExactAttribute(QueryGraph, Vec), + /// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query, + /// but isn't the exact query. + AttributeStarts(QueryGraph, Vec), + /// The next calls to `next` will output the input universe. + Empty(QueryGraph), +} + +/// The candidates sorted by attributes +/// +/// Each of the bitmap in a single `FieldCandidates` struct applies to the same field. 
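///
/// In rough bitmap terms, the two fields combine as follows (a sketch
/// mirroring `State::next` below):
///
/// ```text
/// exactly_the_query = start_with_exact & exact_word_count
/// starts_with_query = start_with_exact - exact_word_count
/// ```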
+struct FieldCandidates { + /// The candidates that start with all the words of the query in the field + start_with_exact: RoaringBitmap, + /// The candidates that have the same number of words as the query in the field + exact_word_count: RoaringBitmap, +} + +impl State { + fn start_iteration( + ctx: &mut SearchContext<'_>, + universe: &RoaringBitmap, + query_graph: &QueryGraph, + ) -> Result { let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = Vec::with_capacity(query_graph.nodes.len() as usize); for (_, node) in query_graph.nodes.iter() { @@ -55,11 +100,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { exact_term } else { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + continue; }; exact_term_position_ids.push(( exact_term, @@ -73,14 +114,17 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { exact_term_position_ids.sort_by_key(|(_, _, id)| *id); // bail if there is a "hole" (missing word) in remaining query graph + if let Some((_, _, first_id)) = exact_term_position_ids.first() { + if *first_id != 0 { + return Ok(State::Empty(query_graph.clone())); + } + } else { + return Ok(State::Empty(query_graph.clone())); + } let mut previous_id = 0; for (_, _, id) in exact_term_position_ids.iter().copied() { if id < previous_id || id - previous_id > 1 { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } else { previous_id = id; } @@ -102,11 +146,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { .collect(); for (words, position) in &words_positions { if candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } 'words: for (offset, word) in words.iter().enumerate() { @@ -116,8 +156,11 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { } else { continue 'words; }; + // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of + // longer phrases we'll be losing on precision here. 
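// In other words, `bucketed_position` collapses many large positions into
// a single stored bucket, so a word that is merely close to the expected
// offset can still pass this check. A sketch of the shape of such a
// bucketing function (the real boundaries live in
// `crate::bucketed_position`; the ones here are an assumption):
//
//     fn bucketed(pos: u16) -> u16 {
//         if pos < 16 { pos }           // small positions kept exact
//         else { 16 + (pos - 16) / 8 }  // larger ones coarsened (assumed)
//     }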
+ let bucketed_position = crate::bucketed_position(position + offset); let word_position_docids = CboRoaringBitmapCodec::bytes_decode( - ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), + ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(), ) .unwrap_or_default(); candidates &= word_position_docids; @@ -127,16 +170,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let candidates = candidates; if candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); - let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); + let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms for fid in searchable_fields_ids { @@ -156,20 +195,59 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { )?; intersection &= &candidates; if !intersection.is_empty() { - candidates_per_attributes.push(intersection); + let candidates_with_exact_word_count = ctx + .index + .field_id_word_count_docids + .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))? + .unwrap_or_default(); + candidates_per_attribute.push(FieldCandidates { + start_with_exact: intersection, + exact_word_count: candidates_with_exact_word_count, + }); } } // note we could have "false positives" where there both exist different attributes that collectively // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. 
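// A hypothetical illustration of such a false positive, for the query
// `quick brown` (which expects quick@0 and brown@1):
//
//     f1: "quick"        -> puts the document in `candidates` via quick@0
//     f2: "x brown"      -> puts it in `candidates` via brown@1
//     f3: "brown quick"  -> contains every query word, so this field's
//                           intersection with `candidates` is non-empty
//
// and the document would be reported for f3 even though no single
// attribute starts with the query in the right order.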
- let candidates = MultiOps::union(candidates_per_attributes.into_iter()); - Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) + Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute)) } - fn end_iteration( - &mut self, - _ctx: &mut SearchContext<'ctx>, - _logger: &mut dyn SearchLogger, - ) { + fn next( + state: State, + universe: &RoaringBitmap, + ) -> (State, Option>) { + let (state, output) = match state { + State::Uninitialized => (state, None), + State::ExactAttribute(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.iter().map( + |FieldCandidates { start_with_exact, exact_word_count }| { + start_with_exact & exact_word_count + }, + )); + candidates &= universe; + ( + State::AttributeStarts(query_graph.clone(), candidates_per_attribute), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::AttributeStarts(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map( + |FieldCandidates { mut start_with_exact, exact_word_count }| { + start_with_exact -= exact_word_count; + start_with_exact + }, + )); + candidates &= universe; + ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::Empty(query_graph) => ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }), + ), + }; + (state, output) } } From f7ecea142ec3c3d1403ad00969e37d211c861125 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:43:16 +0200 Subject: [PATCH 150/234] Fix panics and issues in exactness graph ranking rule --- .../new/ranking_rule_graph/exactness/mod.rs | 48 ++++++++----------- .../extract/extract_word_position_docids.rs | 5 +- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index a1e19a015..3d558e87b 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -1,24 +1,11 @@ +use heed::BytesDecode; use roaring::RoaringBitmap; use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::query_graph::{QueryGraph, QueryNode}; use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; -use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; - -/// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other -/// word than a doc that matches 9 words non exactly but none exactly -/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider -/// -/// "naive vision" -/// condition from one node to another: -/// - word exactly present: cost 0 -/// - word typo/ngram/prefix/missing: cost 1, not remove from query graph, edge btwn the two nodes, return the universe without condition when resolving, destination query term is inside -/// -/// Three strategies: -/// 1. ExactAttribute: word position / word_fid_docid -/// 2. AttributeStart: -/// 3. 
AttributeContainsExact => implementable via `RankingRuleGraphTrait`
+use crate::{Result, RoaringBitmapCodec, SearchContext, SearchLogger};

 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum ExactnessCondition {
@@ -42,7 +29,7 @@ fn compute_docids(
         ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
         ExactTerm::Word(word) => {
             if let Some(word_candidates) = ctx.get_db_word_docids(word)? {
-                CboRoaringBitmapCodec::deserialize_from(word_candidates)?
+                RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)?
             } else {
                 return Ok(Default::default());
             }
@@ -86,22 +73,29 @@ impl RankingRuleGraphTrait for ExactnessGraph {
         let skip_condition = ExactnessCondition::Skip(dest_node.clone());
         let skip_condition = conditions_interner.insert(skip_condition);
-        Ok(vec![(0, exact_condition), (1, skip_condition)])
+
+        Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)])
     }

     fn log_state(
-        graph: &RankingRuleGraph<Self>,
-        paths: &[Vec<Interned<Self::Condition>>],
-        dead_ends_cache: &DeadEndsCache<Self::Condition>,
-        universe: &RoaringBitmap,
-        costs: &MappedInterner<QueryNode, Vec<u64>>,
-        cost: u64,
-        logger: &mut dyn SearchLogger<QueryGraph>,
+        _graph: &RankingRuleGraph<Self>,
+        _paths: &[Vec<Interned<Self::Condition>>],
+        _dead_ends_cache: &DeadEndsCache<Self::Condition>,
+        _universe: &RoaringBitmap,
+        _costs: &MappedInterner<QueryNode, Vec<u64>>,
+        _cost: u64,
+        _logger: &mut dyn SearchLogger<QueryGraph>,
     ) {
-        todo!()
     }

-    fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
-        todo!()
+    fn label_for_condition(
+        _ctx: &mut SearchContext,
+        condition: &Self::Condition,
+    ) -> Result<String> {
+        Ok(match condition {
+            ExactnessCondition::ExactInAttribute(_) => "exact",
+            ExactnessCondition::Skip(_) => "skip",
+        }
+        .to_owned())
+    }
 }
diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index cd3ec691b..eef5089bc 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -7,10 +7,7 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{
-    absolute_from_relative_position, bucketed_position, relative_from_absolute_position,
-    DocumentId, Result,
-};
+use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};

 /// Extracts the word positions and the documents ids where this word appears.
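// Illustrative note (not part of the patch): the codec swap above matters
// because `word_docids` values are serialized with `RoaringBitmapCodec`, so
// decoding them through `CboRoaringBitmapCodec`, which stores small sets as a
// bare list of u32s, can misread the bytes. A helper mirroring the new call,
// assuming the heed 0.12-style `Option`-returning `bytes_decode`:
fn decode_word_docids_sketch(bytes: &[u8]) -> crate::Result<RoaringBitmap> {
    Ok(RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?)
}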
/// From d1ddaa223d39b7ba74fde7f9a72b04662931935f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 18:05:44 +0200 Subject: [PATCH 151/234] Use correct codec in proximity --- .../ranking_rule_graph/proximity/compute_docids.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 8496054b7..07bd102ca 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,14 +1,17 @@ #![allow(clippy::too_many_arguments)] +use std::collections::BTreeSet; + +use heed::BytesDecode; +use roaring::RoaringBitmap; + use super::ProximityCondition; use crate::search::new::interner::Interned; use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Result}; -use roaring::RoaringBitmap; -use std::collections::BTreeSet; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; pub fn compute_docids( ctx: &mut SearchContext, @@ -90,7 +93,8 @@ pub fn compute_docids( continue; } } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? { - let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?; + let left_word_docids = + RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?; if universe.is_disjoint(&left_word_docids) { continue; } From d9460a76f43fa7b35cb2e4d423148c2a0ab174e5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:43:42 +0200 Subject: [PATCH 152/234] Fix word_position_docids indexing --- .../index_documents/extract/extract_word_position_docids.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index eef5089bc..734cf8778 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -39,9 +39,8 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (fid, position) = relative_from_absolute_position(position); + let (_fid, position) = relative_from_absolute_position(position); let position = bucketed_position(position); - let position = absolute_from_relative_position(fid, position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } From 5440f43fd3be28981933bd55e126ee88ed3324e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 14:55:02 +0200 Subject: [PATCH 153/234] Fix indexing of word_position_docid and fid --- milli/src/snapshot_tests.rs | 8 +++ .../extract/extract_word_fid_docids.rs | 48 ++++++++++++++++ .../extract/extract_word_position_docids.rs | 4 +- .../src/update/index_documents/extract/mod.rs | 17 +++++- milli/src/update/index_documents/mod.rs | 57 +++++++++++++++++++ .../src/update/index_documents/typed_chunk.rs | 12 ++++ 6 files changed, 141 insertions(+), 5 deletions(-) create mode 100644 
milli/src/update/index_documents/extract/extract_word_fid_docids.rs diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f7f1a97e6..eb94c4be9 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) }) } +pub fn snap_word_fid_docids(index: &Index) -> String { + make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| { + &format!("{word:<16} {fid:<3} {}", display_bitmap(&b)) + }) +} pub fn snap_field_id_word_count_docids(index: &Index) -> String { make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| { &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) @@ -477,6 +482,9 @@ macro_rules! full_snap_of_db { ($index:ident, word_position_docids) => {{ $crate::snapshot_tests::snap_word_position_docids(&$index) }}; + ($index:ident, word_fid_docids) => {{ + $crate::snapshot_tests::snap_word_fid_docids(&$index) + }}; ($index:ident, field_id_word_count_docids) => {{ $crate::snapshot_tests::snap_field_id_word_count_docids(&$index) }}; diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs new file mode 100644 index 000000000..72b30cddf --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs @@ -0,0 +1,48 @@ +use std::fs::File; +use std::io; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::{relative_from_absolute_position, DocumentId, Result}; + +/// Extracts the word, field id, and the documents ids where this word appear at this field id. +#[logging_timer::time] +pub fn extract_word_fid_docids( + docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_fid_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = DocumentId::from_be_bytes(document_id_bytes); + + for position in read_u32_ne_bytes(value) { + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + let (fid, _) = relative_from_absolute_position(position); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + } + + let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; + + Ok(word_fid_docids_reader) +} diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 734cf8778..80a36c308 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -14,7 +14,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_word_fid_and_position_docids( +pub fn extract_word_position_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { @@ -39,7 +39,7 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (_fid, position) = relative_from_absolute_position(position); + let (_, position) = relative_from_absolute_position(position); let position = bucketed_position(position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 844efed36..db041de6f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_word_docids; +mod extract_word_fid_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; @@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; +use self::extract_word_fid_docids::extract_word_fid_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; -use self::extract_word_position_docids::extract_word_fid_and_position_docids; +use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, @@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>( - docid_word_positions_chunks, + docid_word_positions_chunks.clone(), indexer, lmdb_writer_sx.clone(), - extract_word_fid_and_position_docids, + extract_word_position_docids, merge_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_word_positions_chunks, + indexer, + 
lmdb_writer_sx.clone(), + extract_word_fid_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordFidDocids, + "word-fid-docids", + ); spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ade217beb..235b35fc8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2255,4 +2255,61 @@ mod tests { {"id":1,"catto":"jorts"} "###); } + + #[test] + fn test_word_fid_position() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + {"id": 0, "text": "sun flowers are looking at the sun" }, + {"id": 1, "text": "sun flowers are looking at the sun" }, + {"id": 2, "text": "the sun is shining today" }, + { + "id": 3, + "text": "a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a " + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9"); + db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f"); + + index + .add_documents(documents!([ + {"id": 4, "text": "sun flowers are looking at the sun" }, + {"id": 5, "text2": "sun flowers are looking at the sun" }, + {"id": 6, "text": "b b b" }, + { + "id": 7, + "text2": "a a a a" + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4"); + db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete not all of the documents but some of them. 
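// Illustrative sketch (not part of the patch): the `word_fid_docids` keys
// written by the new extractor are the word's UTF-8 bytes followed by the
// field id as two big-endian bytes (`FieldId` is a u16). A hypothetical
// decoder for such a key:
fn decode_word_fid_key_sketch(key: &[u8]) -> Option<(&str, u16)> {
    let (word_bytes, fid_bytes) = key.split_at(key.len().checked_sub(2)?);
    let fid = u16::from_be_bytes([fid_bytes[0], fid_bytes[1]]);
    std::str::from_utf8(word_bytes).ok().map(|word| (word, fid))
}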
+ let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.strategy(DeletionStrategy::AlwaysHard); + builder.delete_external_id("0"); + builder.delete_external_id("3"); + let result = builder.execute().unwrap(); + println!("{result:?}"); + + wtxn.commit().unwrap(); + + db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); + db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); + db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1"); + } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b9b11cfa8..14ba021bd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,6 +35,7 @@ pub(crate) enum TypedChunk { exact_word_docids_reader: grenad::Reader, }, WordPositionDocids(grenad::Reader), + WordFidDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), @@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::WordFidDocids(word_fid_docids_iter) => { + append_entries_into_database( + word_fid_docids_iter, + &index.word_fid_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + is_merged_database = true; + } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?; From 13b7c826c14140283494a8c85407cf8cc197f184 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 12:15:37 +0200 Subject: [PATCH 154/234] add new highlighter --- .../src/search/new/matches/matching_words.rs | 334 +++++++ milli/src/search/new/matches/mod.rs | 848 ++++++++++++++++++ milli/src/search/new/mod.rs | 1 + 3 files changed, 1183 insertions(+) create mode 100644 milli/src/search/new/matches/matching_words.rs create mode 100644 milli/src/search/new/matches/mod.rs diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs new file mode 100644 index 000000000..a47a08c68 --- /dev/null +++ b/milli/src/search/new/matches/matching_words.rs @@ -0,0 +1,334 @@ +use std::cmp::Reverse; +use std::ops::RangeInclusive; + +use charabia::Token; + +use super::super::interner::Interned; +use super::super::query_term::{ + Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm, +}; +use super::super::{DedupInterner, Phrase}; +use crate::SearchContext; + +pub struct LocatedMatchingPhrase { + pub value: Interned, + pub positions: RangeInclusive, +} + +pub struct LocatedMatchingWords { + pub value: Vec>, + pub positions: RangeInclusive, + pub is_prefix: bool, +} + +/// Structure created from a query tree +/// referencing words that match the given query tree. +pub struct MatchingWords<'ctx> { + word_interner: &'ctx DedupInterner, + phrase_interner: &'ctx DedupInterner, + phrases: Vec, + words: Vec, +} + +/// Extract and centralize the different phrases and words to match stored in a QueryTerm. +fn extract_matching_terms(term: &QueryTerm) -> (Vec>, Vec>) { + let mut matching_words = Vec::new(); + let mut matching_phrases = Vec::new(); + + // the structure is exhaustively extracted to ensure that no field is missing. 
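// Illustrative note (not part of the patch): the exhaustive destructuring
// used just below is a compile-time completeness check, since adding a field
// to `QueryTerm` breaks the pattern until the new field is handled or
// explicitly ignored. The idiom in miniature, over a hypothetical type:
struct Example {
    kept: u32,
    ignored: u32,
}
fn use_every_field(e: &Example) -> u32 {
    // a new `Example` field would make this pattern fail to compile
    let Example { kept, ignored: _ } = e;
    *kept
}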
+    let QueryTerm {
+        original: _,
+        is_multiple_words: _,
+        max_nbr_typos: _,
+        is_prefix: _,
+        zero_typo,
+        one_typo,
+        two_typo,
+    } = term;
+
+    // the structure is exhaustively extracted to ensure that no field is missing.
+    let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo;
+
+    // zero typo
+    if let Some(phrase) = phrase {
+        matching_phrases.push(*phrase);
+    }
+    if let Some(zero_typo) = zero_typo {
+        matching_words.push(*zero_typo);
+    }
+    for synonym in synonyms {
+        matching_phrases.push(*synonym);
+    }
+
+    // one typo
+    // the structure is exhaustively extracted to ensure that no field is missing.
+    if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo {
+        if let Some(split_words) = split_words {
+            matching_phrases.push(*split_words);
+        }
+        for one_typo in one_typo {
+            matching_words.push(*one_typo);
+        }
+    }
+
+    // two typos
+    // the structure is exhaustively extracted to ensure that no field is missing.
+    if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo {
+        for two_typos in two_typos {
+            matching_words.push(*two_typos);
+        }
+    }
+
+    (matching_phrases, matching_words)
+}
+
+impl<'ctx> MatchingWords<'ctx> {
+    pub fn new(ctx: &'ctx SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
+        let mut phrases = Vec::new();
+        let mut words = Vec::new();
+
+        // Extract and centralize the different phrases and words to match stored in a QueryTerm
+        // using extract_matching_terms, and wrap them in dedicated structures.
+        for located_term in located_terms {
+            let term = ctx.term_interner.get(located_term.value);
+            let (matching_phrases, matching_words) = extract_matching_terms(term);
+
+            for matching_phrase in matching_phrases {
+                phrases.push(LocatedMatchingPhrase {
+                    value: matching_phrase,
+                    positions: located_term.positions.clone(),
+                });
+            }
+            words.push(LocatedMatchingWords {
+                value: matching_words,
+                positions: located_term.positions.clone(),
+                is_prefix: term.is_prefix,
+            });
+        }
+
+        // Sort words to put prefixes at the bottom, prioritizing the exact matches.
+        words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
+
+        Self {
+            phrases,
+            words,
+            word_interner: &ctx.word_interner,
+            phrase_interner: &ctx.phrase_interner,
+        }
+    }
+
+    /// Returns an iterator over terms that match or partially match the given token.
+    pub fn match_token<'b>(&'ctx self, token: &'b Token<'b>) -> MatchesIter<'ctx, 'b> {
+        MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
+    }
+
+    /// Try to match the token with one of the located_words.
+    fn match_unique_words(&'ctx self, token: &Token) -> Option<MatchType<'ctx>> {
+        for located_words in &self.words {
+            for word in &located_words.value {
+                let word = self.word_interner.get(*word);
+                // if the word is a prefix, we match using starts_with.
+                if located_words.is_prefix && token.lemma().starts_with(word) {
+                    let char_len = token.original_lengths(word.len()).0;
+                    let ids = &located_words.positions;
+                    return Some(MatchType::Full { char_len, ids });
+                // else we exact match the token.
+                } else if token.lemma() == word {
+                    let char_len = token.char_end - token.char_start;
+                    let ids = &located_words.positions;
+                    return Some(MatchType::Full { char_len, ids });
+                }
+            }
+        }
+
+        None
+    }
+}
+
+/// Iterator over terms that match the given token;
+/// this allows matches to be evaluated lazily.
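// Illustrative sketch (not part of the patch): how a `MatchType` yielded by
// the iterator below is meant to be driven. Consecutive tokens are fed to a
// `Partial` match until the phrase either completes (`Full`) or breaks
// (`None`); the driver function itself is hypothetical.
fn drive_match_sketch<'a>(mut current: MatchType<'a>, next_tokens: &[Token<'a>]) -> Option<usize> {
    for token in next_tokens {
        current = match current {
            // the phrase completed: report the length of the closing token
            MatchType::Full { char_len, .. } => return Some(char_len),
            // still partial: consume one more token, bail out if it breaks
            MatchType::Partial(partial) => partial.match_token(token)?,
        };
    }
    match current {
        MatchType::Full { char_len, .. } => Some(char_len),
        MatchType::Partial(_) => None, // ran out of tokens before completing
    }
}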
+pub struct MatchesIter<'a, 'b> {
+    matching_words: &'a MatchingWords<'a>,
+    phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
+    token: &'b Token<'b>,
+}
+
+impl<'a> Iterator for MatchesIter<'a, '_> {
+    type Item = MatchType<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.phrases.next() {
+            // Try to match all the phrases first.
+            Some(located_phrase) => {
+                let phrase = self.matching_words.phrase_interner.get(located_phrase.value);
+
+                // create a PartialMatch struct to make it compute the first match
+                // instead of duplicating the code.
+                let ids = &located_phrase.positions;
+                // collect the references of words from the interner.
+                let words = phrase
+                    .words
+                    .iter()
+                    .map(|word| {
+                        word.map(|word| self.matching_words.word_interner.get(word).as_str())
+                    })
+                    .collect();
+                let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
+
+                partial.match_token(self.token).or_else(|| self.next())
+            }
+            // If no phrase matches, try to match unique words.
+            None => self.matching_words.match_unique_words(self.token),
+        }
+    }
+}
+
+/// Id of a matching term corresponding to a word written by the end user.
+pub type WordId = u16;
+
+/// A given token can partially match a query word for several reasons:
+/// - split words
+/// - multi-word synonyms
+/// In these cases we need to match consecutively several tokens to consider that the match is full.
+#[derive(Debug, PartialEq)]
+pub enum MatchType<'a> {
+    Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
+    Partial(PartialMatch<'a>),
+}
+
+/// Structure helper to match several tokens in a row in order to complete a partial match.
+#[derive(Debug, PartialEq)]
+pub struct PartialMatch<'a> {
+    matching_words: Vec<Option<&'a str>>,
+    ids: &'a RangeInclusive<WordId>,
+    char_len: usize,
+}
+
+impl<'a> PartialMatch<'a> {
+    /// Returns:
+    /// - None if the given token breaks the partial match
+    /// - Partial if the given token matches the partial match but doesn't complete it
+    /// - Full if the given token completes the partial match
+    pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
+        let Self { mut matching_words, ids, .. } = self;
+
+        let is_matching = match matching_words.first()? {
+            Some(word) => &token.lemma() == word,
+            // a None value in the phrase corresponds to a stop word,
+            // the value is considered a match if the current token is categorized as a stop word.
+            None => token.is_stopword(),
+        };
+
+        let char_len = token.char_end - token.char_start;
+        // if there are remaining words to match in the phrase and the current token is matching,
+        // return a new Partial match allowing the highlighter to continue.
+        if is_matching && matching_words.len() > 1 {
+            matching_words.remove(0);
+            Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
+        // if there is no remaining word to match in the phrase and the current token is matching,
+        // return a Full match.
+        } else if is_matching {
+            Some(MatchType::Full { char_len, ids })
+        // if the current token doesn't match, return None to break the match sequence.
+        } else {
+            None
+        }
+    }
+
+    pub fn char_len(&self) -> usize {
+        self.char_len
+    }
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use std::borrow::Cow;
+
+    use charabia::{TokenKind, TokenizerBuilder};
+
+    use super::super::super::located_query_terms_from_string;
+    use super::*;
+    use crate::index::tests::TempIndex;
+
+    pub(crate) fn temp_index_with_documents() -> TempIndex {
+        let temp_index = TempIndex::new();
+        temp_index
+            .add_documents(documents!([
+                { "id": 1, "name": "split this world westfali westfalia the" },
+            ]))
+            .unwrap();
+        temp_index
+    }
+
+    #[test]
+    fn matching_words() {
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let mut ctx = SearchContext::new(&temp_index, &rtxn);
+        let tokenizer = TokenizerBuilder::new().build();
+        let tokens = tokenizer.tokenize("split this world");
+        let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
+        let matching_words = MatchingWords::new(&ctx, query_terms);
+
+        assert_eq!(
+            matching_words
+                .match_token(&Token {
+                    kind: TokenKind::Word,
+                    lemma: Cow::Borrowed("split"),
+                    char_end: "split".chars().count(),
+                    byte_end: "split".len(),
+                    ..Default::default()
+                })
+                .next(),
+            Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
+        );
+        assert_eq!(
+            matching_words
+                .match_token(&Token {
+                    kind: TokenKind::Word,
+                    lemma: Cow::Borrowed("nyc"),
+                    char_end: "nyc".chars().count(),
+                    byte_end: "nyc".len(),
+                    ..Default::default()
+                })
+                .next(),
+            None
+        );
+        assert_eq!(
+            matching_words
+                .match_token(&Token {
+                    kind: TokenKind::Word,
+                    lemma: Cow::Borrowed("world"),
+                    char_end: "world".chars().count(),
+                    byte_end: "world".len(),
+                    ..Default::default()
+                })
+                .next(),
+            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+        );
+        assert_eq!(
+            matching_words
+                .match_token(&Token {
+                    kind: TokenKind::Word,
+                    lemma: Cow::Borrowed("worlded"),
+                    char_end: "worlded".chars().count(),
+                    byte_end: "worlded".len(),
+                    ..Default::default()
+                })
+                .next(),
+            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+        );
+        assert_eq!(
+            matching_words
+                .match_token(&Token {
+                    kind: TokenKind::Word,
+                    lemma: Cow::Borrowed("thisnew"),
+                    char_end: "thisnew".chars().count(),
+                    byte_end: "thisnew".len(),
+                    ..Default::default()
+                })
+                .next(),
+            None
+        );
+    }
+}
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
new file mode 100644
index 000000000..33d0591a6
--- /dev/null
+++ b/milli/src/search/new/matches/mod.rs
@@ -0,0 +1,848 @@
+use std::borrow::Cow;
+
+use charabia::{SeparatorKind, Token, Tokenizer};
+use matching_words::{MatchType, MatchingWords, PartialMatch, WordId};
+use serde::Serialize;
+
+use super::query_term::LocatedQueryTerm;
+use crate::SearchContext;
+
+pub mod matching_words;
+
+const DEFAULT_CROP_MARKER: &str = "…";
+const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
+const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
+
+/// Structure used to build a Matcher allowing to customize formatting tags.
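// Illustrative sketch (not part of the patch): intended use of the builder
// defined below. The markers are configured once, then one `Matcher` is built
// per field to format; the helper and its values are hypothetical.
fn format_field_sketch<A: AsRef<[u8]>>(builder: &MatcherBuilder<'_, '_, A>, text: &str) -> String {
    let mut matcher = builder.build(text);
    // highlight the matches and crop around the best 10-word window
    let options = FormatOptions { highlight: true, crop: Some(10) };
    matcher.format(options).into_owned()
}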
+pub struct MatcherBuilder<'a, 'ctx, A> {
+    matching_words: MatchingWords<'ctx>,
+    tokenizer: Tokenizer<'a, 'a, A>,
+    crop_marker: Option<String>,
+    highlight_prefix: Option<String>,
+    highlight_suffix: Option<String>,
+}
+
+impl<'a, 'ctx, A> MatcherBuilder<'a, 'ctx, A> {
+    pub fn new(
+        ctx: &'ctx SearchContext,
+        located_terms: Vec<LocatedQueryTerm>,
+        tokenizer: Tokenizer<'a, 'a, A>,
+    ) -> Self {
+        let matching_words = MatchingWords::new(ctx, located_terms);
+        Self {
+            matching_words,
+            tokenizer,
+            crop_marker: None,
+            highlight_prefix: None,
+            highlight_suffix: None,
+        }
+    }
+
+    pub fn crop_marker(&mut self, marker: String) -> &Self {
+        self.crop_marker = Some(marker);
+        self
+    }
+
+    pub fn highlight_prefix(&mut self, prefix: String) -> &Self {
+        self.highlight_prefix = Some(prefix);
+        self
+    }
+
+    pub fn highlight_suffix(&mut self, suffix: String) -> &Self {
+        self.highlight_suffix = Some(suffix);
+        self
+    }
+
+    pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> {
+        let crop_marker = match &self.crop_marker {
+            Some(marker) => marker.as_str(),
+            None => DEFAULT_CROP_MARKER,
+        };
+
+        let highlight_prefix = match &self.highlight_prefix {
+            Some(marker) => marker.as_str(),
+            None => DEFAULT_HIGHLIGHT_PREFIX,
+        };
+        let highlight_suffix = match &self.highlight_suffix {
+            Some(marker) => marker.as_str(),
+            None => DEFAULT_HIGHLIGHT_SUFFIX,
+        };
+        Matcher {
+            text,
+            matching_words: &self.matching_words,
+            tokenizer: &self.tokenizer,
+            crop_marker,
+            highlight_prefix,
+            highlight_suffix,
+            matches: None,
+        }
+    }
+}
+
+#[derive(Copy, Clone, Default)]
+pub struct FormatOptions {
+    pub highlight: bool,
+    pub crop: Option<usize>,
+}
+
+impl FormatOptions {
+    pub fn merge(self, other: Self) -> Self {
+        Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct Match {
+    match_len: usize,
+    // ids of the query words that match.
+    ids: Vec<WordId>,
+    // position of the word in the whole text.
+    word_position: usize,
+    // position of the token in the whole text.
+    token_position: usize,
+}
+
+#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
+pub struct MatchBounds {
+    pub start: usize,
+    pub length: usize,
+}
+
+/// Structure used to analyze a string, compute words that match,
+/// and format the source string, returning a highlighted and cropped sub-string.
+pub struct Matcher<'t, 'm, A> {
+    text: &'t str,
+    matching_words: &'m MatchingWords<'m>,
+    tokenizer: &'m Tokenizer<'m, 'm, A>,
+    crop_marker: &'m str,
+    highlight_prefix: &'m str,
+    highlight_suffix: &'m str,
+    matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
+}
+
+impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
+    /// Iterates over tokens and saves any of them that matches the query.
+    fn compute_matches(&mut self) -> &mut Self {
+        /// some words are counted as matches only if they are close together and in the good order,
+        /// compute_partial_match peeks into next words to validate if the match is complete.
+        fn compute_partial_match<'a>(
+            mut partial: PartialMatch,
+            token_position: usize,
+            word_position: usize,
+            words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
+            matches: &mut Vec<Match>,
+        ) -> bool {
+            let mut potential_matches = vec![(token_position, word_position, partial.char_len())];
+
+            for (token_position, word_position, word) in words_positions {
+                partial = match partial.match_token(word) {
+                    // token matches the partial match, but the match is not full,
+                    // we temporarily save the current token then we try to match the next one.
+ Some(MatchType::Partial(partial)) => { + potential_matches.push((token_position, word_position, partial.char_len())); + partial + } + // partial match is now full, we keep this matches and we advance positions + Some(MatchType::Full { char_len, ids }) => { + let ids: Vec<_> = ids.clone().into_iter().collect(); + // save previously matched tokens as matches. + let iter = potential_matches.into_iter().map( + |(token_position, word_position, match_len)| Match { + match_len, + ids: ids.clone(), + word_position, + token_position, + }, + ); + matches.extend(iter); + + // save the token that closes the partial match as a match. + matches.push(Match { + match_len: char_len, + ids, + word_position, + token_position, + }); + + // the match is complete, we return true. + return true; + } + // no match, continue to next match. + None => break, + }; + } + + // the match is not complete, we return false. + false + } + + let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect(); + let mut matches = Vec::new(); + + let mut words_positions = tokens + .iter() + .scan((0, 0), |(token_position, word_position), token| { + let current_token_position = *token_position; + let current_word_position = *word_position; + *token_position += 1; + if !token.is_separator() { + *word_position += 1; + } + + Some((current_token_position, current_word_position, token)) + }) + .filter(|(_, _, token)| !token.is_separator()); + + while let Some((token_position, word_position, word)) = words_positions.next() { + for match_type in self.matching_words.match_token(word) { + match match_type { + // we match, we save the current token as a match, + // then we continue the rest of the tokens. + MatchType::Full { char_len, ids } => { + let ids: Vec<_> = ids.clone().into_iter().collect(); + matches.push(Match { + match_len: char_len, + ids, + word_position, + token_position, + }); + break; + } + // we match partially, iterate over next tokens to check if we can complete the match. + MatchType::Partial(partial) => { + // if match is completed, we break the matching loop over the current token, + // then we continue the rest of the tokens. + let mut wp = words_positions.clone(); + if compute_partial_match( + partial, + token_position, + word_position, + &mut wp, + &mut matches, + ) { + words_positions = wp; + break; + } + } + } + } + } + + self.matches = Some((tokens, matches)); + self + } + + /// Returns boundaries of the words that match the query. + pub fn matches(&mut self) -> Vec { + match &self.matches { + None => self.compute_matches().matches(), + Some((tokens, matches)) => matches + .iter() + .map(|m| MatchBounds { + start: tokens[m.token_position].byte_start, + length: m.match_len, + }) + .collect(), + } + } + + /// Returns the bounds in byte index of the crop window. + fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) { + // if there is no match, we start from the beginning of the string by default. + let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); + let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); + let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); + let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); + + // matches needs to be counted in the crop len. 
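// Illustrative sketch (not part of the patch): the `scan` in `compute_matches`
// above keeps two counters, where token_position counts every token and
// word_position only counts non-separator tokens. A standalone restatement
// over hypothetical (lemma, is_separator) pairs:
fn word_positions_sketch<'a>(tokens: &[(&'a str, bool)]) -> Vec<(usize, usize, &'a str)> {
    let mut out = Vec::new();
    let (mut token_position, mut word_position) = (0, 0);
    for &(lemma, is_separator) in tokens {
        if !is_separator {
            out.push((token_position, word_position, lemma));
            word_position += 1;
        }
        token_position += 1;
    }
    out
}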
+        let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
+
+        // create the initial state of the crop window: 2 iterators starting from the match positions,
+        // a reverse iterator starting from the first match token position and going towards the beginning of the text,
+        let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
+        // an iterator starting from the last match token position and going towards the end of the text.
+        let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
+
+        // grows the crop window peeking in both directions
+        // until the window contains the good number of words:
+        while remaining_words > 0 {
+            let before_token = before_tokens.peek().map(|t| t.separator_kind());
+            let after_token = after_tokens.peek().map(|t| t.separator_kind());
+
+            match (before_token, after_token) {
+                // we can expand both sides.
+                (Some(before_token), Some(after_token)) => {
+                    match (before_token, after_token) {
+                        // if they are both separators and are the same kind then advance both,
+                        // or expand on the soft separator side.
+                        (Some(before_token_kind), Some(after_token_kind)) => {
+                            if before_token_kind == after_token_kind {
+                                before_tokens.next();
+
+                                // this avoids having an ending separator before the crop marker.
+                                if remaining_words > 1 {
+                                    after_tokens.next();
+                                }
+                            } else if before_token_kind == SeparatorKind::Hard {
+                                after_tokens.next();
+                            } else {
+                                before_tokens.next();
+                            }
+                        }
+                        // if one of the tokens is a word, we expand on the side of the word.
+                        // left is a word, advance left.
+                        (None, Some(_)) => {
+                            before_tokens.next();
+                            remaining_words -= 1;
+                        }
+                        // right is a word, advance right.
+                        (Some(_), None) => {
+                            after_tokens.next();
+                            remaining_words -= 1;
+                        }
+                        // both are words, advance left then right if remaining_words > 0.
+                        (None, None) => {
+                            before_tokens.next();
+                            remaining_words -= 1;
+
+                            if remaining_words > 0 {
+                                after_tokens.next();
+                                remaining_words -= 1;
+                            }
+                        }
+                    }
+                }
+                // the end of the text is reached, advance left.
+                (Some(before_token), None) => {
+                    before_tokens.next();
+                    if before_token.is_none() {
+                        remaining_words -= 1;
+                    }
+                }
+                // the start of the text is reached, advance right.
+                (None, Some(after_token)) => {
+                    after_tokens.next();
+                    if after_token.is_none() {
+                        remaining_words -= 1;
+                    }
+                }
+                // no more token to add.
+                (None, None) => break,
+            }
+        }
+
+        // finally, keep the byte index of each bound of the crop window.
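// Illustrative model (not part of the patch): stripped of the separator
// handling, the window-growing loop above reduces to alternately taking one
// word on each side of the match until `crop_size` words are in the window;
// the separator kinds only bias which side advances first.
fn grow_window_sketch(words_before: usize, words_after: usize, mut remaining: usize) -> (usize, usize) {
    let (mut taken_before, mut taken_after) = (0, 0);
    while remaining > 0 && (taken_before < words_before || taken_after < words_after) {
        if taken_before < words_before {
            taken_before += 1;
            remaining -= 1;
        }
        if remaining > 0 && taken_after < words_after {
            taken_after += 1;
            remaining -= 1;
        }
    }
    (taken_before, taken_after)
}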
+        let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
+        let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
+
+        (crop_byte_start, crop_byte_end)
+    }
+
+    /// Compute the score of a match interval:
+    /// 1) count unique matches
+    /// 2) calculate distance between matches
+    /// 3) count ordered matches
+    fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
+        let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
+        let mut order_score = 0;
+        let mut distance_score = 0;
+
+        let mut iter = matches.iter().peekable();
+        while let Some(m) = iter.next() {
+            if let Some(next_match) = iter.peek() {
+                // if matches are ordered
+                if next_match.ids.iter().min() > m.ids.iter().min() {
+                    order_score += 1;
+                }
+
+                // compute distance between matches
+                distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
+            }
+
+            ids.extend(m.ids.iter());
+        }
+
+        ids.sort_unstable();
+        ids.dedup();
+        let uniq_score = ids.len() as i16;
+
+        // rank by unique match count, then by distance between matches, then by ordered match count.
+        (uniq_score, distance_score, order_score)
+    }
+
+    /// Returns the matches interval where the score computed by match_interval_score is the best.
+    fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
+        // we compute the matches interval if we have at least 2 matches.
+        if matches.len() > 1 {
+            // positions of the first and the last match of the best matches interval in `matches`.
+            let mut best_interval = (0, 0);
+            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
+            // current interval positions.
+            let mut interval_first = 0;
+            let mut interval_last = 0;
+            for (index, next_match) in matches.iter().enumerate().skip(1) {
+                // if the next match would make the interval grow larger than crop_size,
+                // we compare the current interval with the best one,
+                // then we increase `interval_first` until the next match can be added.
+                if next_match.word_position - matches[interval_first].word_position >= crop_size {
+                    let interval_score =
+                        self.match_interval_score(&matches[interval_first..=interval_last]);
+
+                    // keep the interval if it's the best.
+                    if interval_score > best_interval_score {
+                        best_interval = (interval_first, interval_last);
+                        best_interval_score = interval_score;
+                    }
+
+                    // advance the start of the interval while the interval is longer than crop_size.
+                    while next_match.word_position - matches[interval_first].word_position
+                        >= crop_size
+                    {
+                        interval_first += 1;
+                    }
+                }
+                interval_last = index;
+            }
+
+            // compute the last interval score and compare it to the best one.
+            let interval_score =
+                self.match_interval_score(&matches[interval_first..=interval_last]);
+            if interval_score > best_interval_score {
+                best_interval = (interval_first, interval_last);
+            }
+
+            &matches[best_interval.0..=best_interval.1]
+        } else {
+            matches
+        }
+    }
+
+    // Returns the formatted version of the original text.
+    pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> {
+        if !format_options.highlight && format_options.crop.is_none() {
+            // compute_matches is not needed if neither highlight nor crop is requested.
+            Cow::Borrowed(self.text)
+        } else {
+            match &self.matches {
+                Some((tokens, matches)) => {
+                    // If the text has to be cropped,
+                    // compute the best interval to crop around.
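// Illustrative sketch (not part of the patch): `match_interval_score` above,
// restated over plain (id, word_position) pairs, assuming a single id per
// match for brevity, to make the three criteria concrete:
fn interval_score_sketch(matches: &[(u16, usize)]) -> (i16, i16, i16) {
    let (mut distance, mut order) = (0i16, 0i16);
    for pair in matches.windows(2) {
        let ((id_a, pos_a), (id_b, pos_b)) = (pair[0], pair[1]);
        if id_b > id_a {
            order += 1; // query words appear in query order
        }
        distance -= (pos_b - pos_a).min(7) as i16; // closer matches score higher
    }
    let mut ids: Vec<u16> = matches.iter().map(|&(id, _)| id).collect();
    ids.sort_unstable();
    ids.dedup();
    // tuples compare lexicographically, so the unique-match count dominates,
    // e.g. interval_score_sketch(&[(0, 3), (2, 5)]) == (2, -2, 1)
    (ids.len() as i16, distance, order)
}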
+ let matches = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.find_best_match_interval(matches, crop_size) + } + _ => matches, + }; + + // If the text has to be cropped, + // crop around the best interval. + let (byte_start, byte_end) = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.crop_bounds(tokens, matches, crop_size) + } + _ => (0, self.text.len()), + }; + + let mut formatted = Vec::new(); + + // push crop marker if it's not the start of the text. + if byte_start > 0 && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + let mut byte_index = byte_start; + + if format_options.highlight { + // insert highlight markers around matches. + for m in matches { + let token = &tokens[m.token_position]; + + if byte_index < token.byte_start { + formatted.push(&self.text[byte_index..token.byte_start]); + } + + let highlight_byte_index = self.text[token.byte_start..] + .char_indices() + .enumerate() + .find(|(i, _)| *i == m.match_len) + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + formatted.push(self.highlight_prefix); + formatted.push(&self.text[token.byte_start..highlight_byte_index]); + formatted.push(self.highlight_suffix); + // if it's a prefix highlight, we put the end of the word after the highlight marker. + if highlight_byte_index < token.byte_end { + formatted.push(&self.text[highlight_byte_index..token.byte_end]); + } + + byte_index = token.byte_end; + } + } + + // push the rest of the text between last match and the end of crop. + if byte_index < byte_end { + formatted.push(&self.text[byte_index..byte_end]); + } + + // push crop marker if it's not the end of the text. + if byte_end < self.text.len() && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + if formatted.len() == 1 { + // avoid concatenating if there is already 1 slice. + Cow::Borrowed(&self.text[byte_start..byte_end]) + } else { + Cow::Owned(formatted.concat()) + } + } + None => self.compute_matches().format(format_options), + } + } + } +} + +#[cfg(test)] +mod tests { + use charabia::TokenizerBuilder; + use matching_words::tests::temp_index_with_documents; + + use super::super::located_query_terms_from_string; + use super::*; + + impl<'a, 'ctx> MatcherBuilder<'a, 'ctx, &[u8]> { + pub fn new_test(ctx: &'ctx mut SearchContext, query: &'a str) -> Self { + let tokenizer = TokenizerBuilder::new().build(); + let tokens = tokenizer.tokenize(query); + let query_terms = located_query_terms_from_string(ctx, tokens, None).unwrap(); + Self::new(ctx, query_terms, TokenizerBuilder::new().build()) + } + } + + #[test] + fn format_identity() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let format_options = FormatOptions { highlight: false, crop: None }; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. 
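// Illustrative note (not part of the patch): the `char_indices` logic in
// `format` above turns a match length counted in characters into a byte
// offset, so highlighting a prefix of a multi-byte word never splits a UTF-8
// character. A standalone equivalent:
fn byte_offset_of_char_sketch(text: &str, n_chars: usize) -> usize {
    text.char_indices().nth(n_chars).map_or(text.len(), |(byte_index, _)| byte_index)
}
// e.g. byte_offset_of_char_sketch("Ŵôřlḑôle", 5) is the byte index just after "Ŵôřlḑ"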
+ assert_eq!(&matcher.format(format_options), &text); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + } + + #[test] + fn format_highlight() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let format_options = FormatOptions { highlight: true, crop: None }; + + // empty text. + let text = ""; + let mut matcher = builder.build(text); + assert_eq!(&matcher.format(format_options), ""); + + // text containing only separators. + let text = ":-)"; + let mut matcher = builder.build(text); + assert_eq!(&matcher.format(format_options), ":-)"); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let mut matcher = builder.build(text); + // no crop should return complete text, because there is no matches. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves." + ); + } + + #[test] + fn highlight_unicode() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "world"); + let format_options = FormatOptions { highlight: true, crop: None }; + + // Text containing prefix match. + let text = "Ŵôřlḑôle"; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Ŵôřlḑôle" + ); + + // Text containing unicode match. + let text = "Ŵôřlḑ"; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Ŵôřlḑ" + ); + + let builder = MatcherBuilder::new_test(&mut ctx, "westfali"); + let format_options = FormatOptions { highlight: true, crop: None }; + + // Text containing unicode match. + let text = "Westfália"; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. 
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"Westfália"
+        );
+    }
+
+    #[test]
+    fn format_crop() {
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let mut ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");
+
+        let format_options = FormatOptions { highlight: false, crop: Some(10) };
+
+        // empty text.
+        let text = "";
+        let mut matcher = builder.build(text);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @""
+        );
+
+        // text containing only separators.
+        let text = ":-)";
+        let mut matcher = builder.build(text);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @":-)"
+        );
+
+        // Text without any match.
+        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
+        let mut matcher = builder.build(text);
+        // no highlight should return 10 first words with a marker at the end.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"A quick brown fox can not jump 32 feet, right…"
+        );
+
+        // Text without any match starting by a separator.
+        let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
+        let mut matcher = builder.build(text);
+        // no highlight should return 10 first words with a marker at the end.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"(A quick brown fox can not jump 32 feet, right…"
+        );
+
+        // Test phrase propagation
+        let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
+        let mut matcher = builder.build(text);
+        // should crop the phrase instead of cropping around the match.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"… Split The World is a book written by Emily Henry…"
+        );
+
+        // Text containing some matches.
+        let text = "Natalie risk her future to build a world with the boy she loves.";
+        let mut matcher = builder.build(text);
+        // no highlight should return 10 last words with a marker at the start.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…future to build a world with the boy she loves…"
+        );
+
+        // Text containing all matches.
+        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
+        let mut matcher = builder.build(text);
+        // no highlight should return 10 last words with a marker at the start.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…she loves. Emily Henry: The Love That Split The World."
+        );
+
+        // Text containing a match unordered and a match ordered.
+        let text = "The world split void void void void void void void void void split the world void void";
+        let mut matcher = builder.build(text);
+        // crop should return 10 last words with a marker at the start.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…void void void void void split the world void void"
+        );
+
+        // Text containing matches with different density.
+        let text = "split void the void void world void void void void void void void void void void split the world void void";
+        let mut matcher = builder.build(text);
+        // crop should return 10 last words with a marker at the start.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…void void void void void split the world void void"
+        );
+
+        // Text containing matches with same word.
+ let text = "split split split split split split void void void void void void void void void void split the world void void"; + let mut matcher = builder.build(text); + // crop should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" + ); + } + + #[test] + fn format_highlight_crop() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let format_options = FormatOptions { highlight: true, crop: Some(10) }; + + // empty text. + let text = ""; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @"" + ); + + // text containing only separators. + let text = ":-)"; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @":-)" + ); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let mut matcher = builder.build(text); + // both should return 10 first words with a marker at the end. + insta::assert_snapshot!( + matcher.format(format_options), + @"A quick brown fox can not jump 32 feet, right…" + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // both should return 10 last words with a marker at the start and highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…future to build a world with the boy she loves…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // both should return 10 last words with a marker at the start and highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…she loves. Emily Henry: The Love That Split The World." + ); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let mut matcher = builder.build(text); + // crop should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" + ); + } + + #[test] + fn smaller_crop_size() { + //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let text = "void void split the world void void."; + + // set a smaller crop size + let format_options = FormatOptions { highlight: false, crop: Some(2) }; + let mut matcher = builder.build(text); + // because crop size < query size, partially format matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…split the…" + ); + + // set a smaller crop size + let format_options = FormatOptions { highlight: false, crop: Some(1) }; + let mut matcher = builder.build(text); + // because crop size < query size, partially format matches. 
+ insta::assert_snapshot!( + matcher.format(format_options), + @"…split…" + ); + + // set crop size to 0 + let format_options = FormatOptions { highlight: false, crop: Some(0) }; + let mut matcher = builder.build(text); + // because crop size is 0, crop is ignored. + insta::assert_snapshot!( + matcher.format(format_options), + @"void void split the world void void." + ); + } + + #[test] + fn partial_matches() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let mut builder = MatcherBuilder::new_test(&mut ctx, "the \"t he\" door \"do or\""); + builder.highlight_prefix("_".to_string()); + builder.highlight_suffix("_".to_string()); + + let format_options = FormatOptions { highlight: true, crop: None }; + + let text = "the do or die can't be he do and or isn't he"; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_" + ); + } +} diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4d561d25b..ef7e61ee1 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -4,6 +4,7 @@ mod graph_based_ranking_rule; mod interner; mod limits; mod logger; +mod matches; mod query_graph; mod query_term; mod ranking_rule_graph; From ebe23b04c9eaa2784f83a3718853534380ddc3b1 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 12:28:28 +0200 Subject: [PATCH 155/234] Make the matcher consume the search context --- .../src/search/new/matches/matching_words.rs | 22 ++++----- milli/src/search/new/matches/mod.rs | 47 ++++++++++--------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index a47a08c68..e737dc942 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -23,9 +23,9 @@ pub struct LocatedMatchingWords { /// Structure created from a query tree /// referencing words that match the given query tree. -pub struct MatchingWords<'ctx> { - word_interner: &'ctx DedupInterner, - phrase_interner: &'ctx DedupInterner, +pub struct MatchingWords { + word_interner: DedupInterner, + phrase_interner: DedupInterner, phrases: Vec, words: Vec, } @@ -82,8 +82,8 @@ fn extract_matching_terms(term: &QueryTerm) -> (Vec>, Vec MatchingWords<'ctx> { - pub fn new(ctx: &'ctx SearchContext, located_terms: Vec) -> Self { +impl MatchingWords { + pub fn new(ctx: SearchContext, located_terms: Vec) -> Self { let mut phrases = Vec::new(); let mut words = Vec::new(); @@ -112,18 +112,18 @@ impl<'ctx> MatchingWords<'ctx> { Self { phrases, words, - word_interner: &ctx.word_interner, - phrase_interner: &ctx.phrase_interner, + word_interner: ctx.word_interner, + phrase_interner: ctx.phrase_interner, } } /// Returns an iterator over terms that match or partially match the given token. - pub fn match_token<'b>(&'ctx self, token: &'b Token<'b>) -> MatchesIter<'ctx, 'b> { + pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> { MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token } } /// Try to match the token with one of the located_words. 
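// Illustrative note (not part of the patch): the shape of this refactor is
// that `MatchingWords` now takes `SearchContext` by value and moves the
// interners it needs into itself instead of borrowing them, which removes the
// 'ctx lifetime from the public type. The idiom in miniature, over
// hypothetical types:
struct Context {
    interner: Vec<String>,
}
struct Words {
    interner: Vec<String>, // owned, so no lifetime parameter is needed
}
impl Words {
    fn new(ctx: Context) -> Self {
        Words { interner: ctx.interner } // move out of the consumed context
    }
}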
- fn match_unique_words(&'ctx self, token: &Token) -> Option> { + fn match_unique_words<'a>(&'a self, token: &Token) -> Option> { for located_words in &self.words { for word in &located_words.value { let word = self.word_interner.get(*word); @@ -148,7 +148,7 @@ impl<'ctx> MatchingWords<'ctx> { /// Iterator over terms that match the given token, /// This allow to lazily evaluate matches. pub struct MatchesIter<'a, 'b> { - matching_words: &'a MatchingWords<'a>, + matching_words: &'a MatchingWords, phrases: Box + 'a>, token: &'b Token<'b>, } @@ -268,7 +268,7 @@ pub(crate) mod tests { let tokenizer = TokenizerBuilder::new().build(); let tokens = tokenizer.tokenize("split this world"); let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap(); - let matching_words = MatchingWords::new(&ctx, query_terms); + let matching_words = MatchingWords::new(ctx, query_terms); assert_eq!( matching_words diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 33d0591a6..9b73c2098 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -14,17 +14,17 @@ const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; /// Structure used to build a Matcher allowing to customize formating tags. -pub struct MatcherBuilder<'a, 'ctx, A> { - matching_words: MatchingWords<'ctx>, +pub struct MatcherBuilder<'a, A> { + matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>, crop_marker: Option, highlight_prefix: Option, highlight_suffix: Option, } -impl<'a, 'ctx, A> MatcherBuilder<'a, 'ctx, A> { +impl<'a, A> MatcherBuilder<'a, A> { pub fn new( - ctx: &'ctx SearchContext, + ctx: SearchContext, located_terms: Vec, tokenizer: Tokenizer<'a, 'a, A>, ) -> Self { @@ -112,7 +112,7 @@ pub struct MatchBounds { /// and format the source string, returning a highlighted and cropped sub-string. 
pub struct Matcher<'t, 'm, A> { text: &'t str, - matching_words: &'m MatchingWords<'m>, + matching_words: &'m MatchingWords, tokenizer: &'m Tokenizer<'m, 'm, A>, crop_marker: &'m str, highlight_prefix: &'m str, @@ -509,11 +509,11 @@ mod tests { use super::super::located_query_terms_from_string; use super::*; - impl<'a, 'ctx> MatcherBuilder<'a, 'ctx, &[u8]> { - pub fn new_test(ctx: &'ctx mut SearchContext, query: &'a str) -> Self { + impl<'a, 'ctx> MatcherBuilder<'a, &[u8]> { + pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self { let tokenizer = TokenizerBuilder::new().build(); let tokens = tokenizer.tokenize(query); - let query_terms = located_query_terms_from_string(ctx, tokens, None).unwrap(); + let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap(); Self::new(ctx, query_terms, TokenizerBuilder::new().build()) } } @@ -522,8 +522,8 @@ mod tests { fn format_identity() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "split the world"); let format_options = FormatOptions { highlight: false, crop: None }; @@ -550,8 +550,8 @@ mod tests { fn format_highlight() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "split the world"); let format_options = FormatOptions { highlight: true, crop: None }; @@ -594,8 +594,8 @@ mod tests { fn highlight_unicode() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "world"); let format_options = FormatOptions { highlight: true, crop: None }; // Text containing prefix match. @@ -616,7 +616,8 @@ mod tests { @"Ŵôřlḑ" ); - let builder = MatcherBuilder::new_test(&mut ctx, "westfali"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "westfali"); let format_options = FormatOptions { highlight: true, crop: None }; // Text containing unicode match. @@ -633,8 +634,8 @@ mod tests { fn format_crop() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "split the world"); let format_options = FormatOptions { highlight: false, crop: Some(10) }; @@ -731,8 +732,8 @@ mod tests { fn format_highlight_crop() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "split the world"); let format_options = FormatOptions { highlight: true, crop: Some(10) }; @@ -794,8 +795,8 @@ mod tests { //! 
testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "split the world"); let text = "void void split the world void void."; @@ -831,8 +832,8 @@ mod tests { fn partial_matches() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let mut builder = MatcherBuilder::new_test(&mut ctx, "the \"t he\" door \"do or\""); + let ctx = SearchContext::new(&temp_index, &rtxn); + let mut builder = MatcherBuilder::new_test(ctx, "the \"t he\" door \"do or\""); builder.highlight_prefix("_".to_string()); builder.highlight_suffix("_".to_string()); From 9c5f64769a38d766fb96c350ed7396cc57cefe3f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 13:58:56 +0200 Subject: [PATCH 156/234] Integrate the new Highlighter in the search --- milli/src/lib.rs | 4 +- milli/src/search/matches/matching_words.rs | 3 +- milli/src/search/mod.rs | 8 ++- .../src/search/new/matches/matching_words.rs | 36 +++++++++++++ milli/src/search/new/matches/mod.rs | 13 ++--- milli/src/search/new/mod.rs | 54 ++++++++++--------- 6 files changed, 77 insertions(+), 41 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index eb63c3904..13e23a5bd 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -97,8 +97,8 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, - MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, + FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search, + SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; pub type Result = std::result::Result; diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 22241c457..5ccf0286f 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -289,8 +289,7 @@ mod tests { use charabia::TokenKind; - use super::*; - use crate::MatchingWords; + use super::{MatchingWords, *}; #[test] fn test_bytes_to_highlight() { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 08803b73f..3683a5cf0 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -5,9 +5,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; -pub use self::matches::{ - FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, -}; +pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords}; use crate::{ execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext, }; @@ -109,9 +107,9 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> Result { - let mut ctx = SearchContext::new(self.index, self.rtxn); + let ctx = SearchContext::new(self.index, self.rtxn); execute_search( - &mut ctx, + ctx, &self.query, self.terms_matching_strategy, self.exhaustive_number_hits, diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 
e737dc942..4ca04884a 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -1,4 +1,5 @@ use std::cmp::Reverse; +use std::fmt; use std::ops::RangeInclusive; use charabia::Token; @@ -23,6 +24,7 @@ pub struct LocatedMatchingWords { /// Structure created from a query tree /// referencing words that match the given query tree. +#[derive(Default)] pub struct MatchingWords { word_interner: DedupInterner, phrase_interner: DedupInterner, @@ -240,6 +242,40 @@ impl<'a> PartialMatch<'a> { } } +impl fmt::Debug for MatchingWords { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let MatchingWords { word_interner, phrase_interner, phrases, words } = self; + + let phrases: Vec<_> = phrases + .iter() + .map(|p| { + ( + phrase_interner + .get(p.value) + .words + .iter() + .map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w))) + .collect::>() + .join(" "), + p.positions.clone(), + ) + }) + .collect(); + + let words: Vec<_> = words + .iter() + .flat_map(|w| { + w.value + .iter() + .map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix)) + .collect::>() + }) + .collect(); + + f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish() + } +} + #[cfg(test)] pub(crate) mod tests { use std::borrow::Cow; diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 9b73c2098..2a9596902 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -1,7 +1,8 @@ use std::borrow::Cow; use charabia::{SeparatorKind, Token, Tokenizer}; -use matching_words::{MatchType, MatchingWords, PartialMatch, WordId}; +pub use matching_words::MatchingWords; +use matching_words::{MatchType, PartialMatch, WordId}; use serde::Serialize; use super::query_term::LocatedQueryTerm; @@ -23,12 +24,7 @@ pub struct MatcherBuilder<'a, A> { } impl<'a, A> MatcherBuilder<'a, A> { - pub fn new( - ctx: SearchContext, - located_terms: Vec, - tokenizer: Tokenizer<'a, 'a, A>, - ) -> Self { - let matching_words = MatchingWords::new(ctx, located_terms); + pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { Self { matching_words, tokenizer, @@ -514,7 +510,8 @@ mod tests { let tokenizer = TokenizerBuilder::new().build(); let tokens = tokenizer.tokenize(query); let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap(); - Self::new(ctx, query_terms, TokenizerBuilder::new().build()) + let matching_words = MatchingWords::new(ctx, query_terms); + Self::new(matching_words, TokenizerBuilder::new().build()) } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index ef7e61ee1..0bb454c06 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -4,7 +4,7 @@ mod graph_based_ranking_rule; mod interner; mod limits; mod logger; -mod matches; +pub mod matches; mod query_graph; mod query_term; mod ranking_rule_graph; @@ -271,7 +271,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( #[allow(clippy::too_many_arguments)] pub fn execute_search( - ctx: &mut SearchContext, + mut ctx: SearchContext, query: &Option, terms_matching_strategy: TermsMatchingStrategy, exhaustive_number_hits: bool, @@ -284,21 +284,22 @@ pub fn execute_search( query_graph_logger: &mut dyn SearchLogger, ) -> Result { let mut universe = if let Some(filters) = filters { - filters.evaluate(ctx.txn, ctx.index)? + filters.evaluate(&mut ctx.txn, &mut ctx.index)? } else { - ctx.index.documents_ids(ctx.txn)? 
+ ctx.index.documents_ids(&mut ctx.txn)? }; + let mut located_query_terms = None; let documents_ids = if let Some(query) = query { // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. let mut tokbuilder = TokenizerBuilder::new(); - let stop_words = ctx.index.stop_words(ctx.txn)?; + let stop_words = &mut ctx.index.stop_words(&mut ctx.txn)?; if let Some(ref stop_words) = stop_words { tokbuilder.stop_words(stop_words); } - let script_lang_map = ctx.index.script_language(ctx.txn)?; + let script_lang_map = &mut ctx.index.script_language(&mut ctx.txn)?; if !script_lang_map.is_empty() { tokbuilder.allow_list(&script_lang_map); } @@ -306,27 +307,31 @@ pub fn execute_search( let tokenizer = tokbuilder.build(); let tokens = tokenizer.tokenize(query); - let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; - let graph = QueryGraph::from_query(ctx, &query_terms)?; + let query_terms = located_query_terms_from_string(&mut ctx, tokens, words_limit)?; + let graph = QueryGraph::from_query(&mut ctx, &query_terms)?; + located_query_terms = Some(query_terms); - check_sort_criteria(ctx, sort_criteria.as_ref())?; + check_sort_criteria(&mut ctx, sort_criteria.as_ref())?; universe = resolve_maximally_reduced_query_graph( - ctx, + &mut ctx, &universe, &graph, terms_matching_strategy, query_graph_logger, )?; - let ranking_rules = - get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; + let ranking_rules = get_ranking_rules_for_query_graph_search( + &mut ctx, + sort_criteria, + terms_matching_strategy, + )?; - bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? + bucket_sort(&mut ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? } else { - let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?; + let ranking_rules = get_ranking_rules_for_placeholder_search(&mut ctx, sort_criteria)?; bucket_sort( - ctx, + &mut ctx, ranking_rules, &PlaceholderQuery, &universe, @@ -340,19 +345,20 @@ pub fn execute_search( // is requested and a distinct attribute is set. let mut candidates = universe; if exhaustive_number_hits { - if let Some(f) = ctx.index.distinct_field(ctx.txn)? { - if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) { - candidates = apply_distinct_rule(ctx, distinct_fid, &candidates)?.remaining; + if let Some(f) = &mut ctx.index.distinct_field(&mut ctx.txn)? { + if let Some(distinct_fid) = ctx.index.fields_ids_map(&mut ctx.txn)?.id(f) { + candidates = apply_distinct_rule(&mut ctx, distinct_fid, &candidates)?.remaining; } } } - Ok(SearchResult { - // TODO: correct matching words - matching_words: MatchingWords::default(), - candidates, - documents_ids, - }) + // consume context and located_query_terms to build MatchingWords. 
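// Seen from the caller's side, the flow introduced by this patch looks roughly
// like the sketch below (argument lists are elided, and `document_text` is an
// illustrative name, not something from the patch):
//
//     let ctx = SearchContext::new(&index, &rtxn);
//     let results = execute_search(ctx, &query, /* … */)?; // `ctx` is moved here
//     let builder = MatcherBuilder::new(results.matching_words, tokenizer);
//     let mut matcher = builder.build(document_text);
//     let formatted = matcher.format(FormatOptions { highlight: true, crop: None });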
+ let matching_words = match located_query_terms { + Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms), + None => MatchingWords::default(), + }; + + Ok(SearchResult { matching_words, candidates, documents_ids }) } fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec>) -> Result<()> { From a1148c09c2b6e56f23428a04530a81edc9c657e8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 14:00:21 +0200 Subject: [PATCH 157/234] remove old matcher --- milli/src/search/matches/matching_words.rs | 457 ----------- milli/src/search/matches/mod.rs | 865 --------------------- milli/src/search/mod.rs | 1 - 3 files changed, 1323 deletions(-) delete mode 100644 milli/src/search/matches/matching_words.rs delete mode 100644 milli/src/search/matches/mod.rs diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs deleted file mode 100644 index 5ccf0286f..000000000 --- a/milli/src/search/matches/matching_words.rs +++ /dev/null @@ -1,457 +0,0 @@ -use std::cmp::{min, Reverse}; -use std::collections::BTreeMap; -use std::fmt; -use std::ops::{Index, IndexMut}; -use std::rc::Rc; - -use charabia::Token; -use levenshtein_automata::{Distance, DFA}; - -use crate::error::InternalError; -use crate::search::build_dfa; -use crate::MAX_WORD_LENGTH; - -type IsPrefix = bool; - -/// Structure created from a query tree -/// referencing words that match the given query tree. -#[derive(Default)] -pub struct MatchingWords { - inner: Vec<(Vec>, Vec)>, -} - -impl fmt::Debug for MatchingWords { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - writeln!(f, "[")?; - for (matching_words, primitive_word_id) in self.inner.iter() { - writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?; - } - writeln!(f, "]")?; - Ok(()) - } -} - -impl MatchingWords { - pub fn new( - mut matching_words: Vec<(Vec>, Vec)>, - ) -> crate::Result { - // if one of the matching_words vec doesn't contain a word. - if matching_words.iter().any(|(mw, _)| mw.is_empty()) { - return Err(InternalError::InvalidMatchingWords.into()); - } - - // Sort word by len in DESC order prioritizing the longuest matches, - // in order to highlight the longuest part of the matched word. - matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len()))); - - Ok(Self { inner: matching_words }) - } - - /// Returns an iterator over terms that match or partially match the given token. - pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> { - MatchesIter { inner: Box::new(self.inner.iter()), token } - } -} - -/// Iterator over terms that match the given token, -/// This allow to lazily evaluate matches. -pub struct MatchesIter<'a, 'b> { - #[allow(clippy::type_complexity)] - inner: Box>, Vec)> + 'a>, - token: &'b Token<'b>, -} - -impl<'a> Iterator for MatchesIter<'a, '_> { - type Item = MatchType<'a>; - - fn next(&mut self) -> Option { - match self.inner.next() { - Some((matching_words, ids)) => match matching_words[0].match_token(self.token) { - Some(char_len) => { - if matching_words.len() > 1 { - Some(MatchType::Partial(PartialMatch { - matching_words: &matching_words[1..], - ids, - char_len, - })) - } else { - Some(MatchType::Full { char_len, ids }) - } - } - None => self.next(), - }, - None => None, - } - } -} - -/// Id of a matching term corespounding to a word written by the end user. -pub type PrimitiveWordId = u8; - -/// Structure used to match a specific term. 
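// A small sketch of how this struct is used, following the shape of the tests
// kept further down (the word, typo budget, and prefix flag are illustrative
// values, and `token` stands for a charabia `Token` built elsewhere):
//
//     let word = MatchingWord::new("world".to_string(), 1, true).unwrap();
//     // a one-typo prefix match such as "worlds" returns the number of
//     // matching chars; anything further away returns None:
//     let char_len: Option<usize> = word.match_token(&token);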
-pub struct MatchingWord { - pub dfa: DFA, - pub word: String, - pub typo: u8, - pub prefix: IsPrefix, -} - -impl fmt::Debug for MatchingWord { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("MatchingWord") - .field("word", &self.word) - .field("typo", &self.typo) - .field("prefix", &self.prefix) - .finish() - } -} - -impl PartialEq for MatchingWord { - fn eq(&self, other: &Self) -> bool { - self.prefix == other.prefix && self.typo == other.typo && self.word == other.word - } -} - -impl MatchingWord { - pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option { - if word.len() > MAX_WORD_LENGTH { - return None; - } - let dfa = build_dfa(&word, typo, prefix); - - Some(Self { dfa, word, typo, prefix }) - } - - /// Returns the lenght in chars of the match in case of the token matches the term. - pub fn match_token(&self, token: &Token) -> Option { - match self.dfa.eval(token.lemma()) { - Distance::Exact(t) if t <= self.typo => { - if self.prefix { - let len = bytes_to_highlight(token.lemma(), &self.word); - Some(token.original_lengths(len).0) - } else { - Some(token.original_lengths(token.lemma().len()).0) - } - } - _otherwise => None, - } - } -} - -/// A given token can partially match a query word for several reasons: -/// - split words -/// - multi-word synonyms -/// In these cases we need to match consecutively several tokens to consider that the match is full. -#[derive(Debug, PartialEq)] -pub enum MatchType<'a> { - Full { char_len: usize, ids: &'a [PrimitiveWordId] }, - Partial(PartialMatch<'a>), -} - -/// Structure helper to match several tokens in a row in order to complete a partial match. -#[derive(Debug, PartialEq)] -pub struct PartialMatch<'a> { - matching_words: &'a [Rc], - ids: &'a [PrimitiveWordId], - char_len: usize, -} - -impl<'a> PartialMatch<'a> { - /// Returns: - /// - None if the given token breaks the partial match - /// - Partial if the given token matches the partial match but doesn't complete it - /// - Full if the given token completes the partial match - pub fn match_token(self, token: &Token) -> Option> { - self.matching_words[0].match_token(token).map(|char_len| { - if self.matching_words.len() > 1 { - MatchType::Partial(PartialMatch { - matching_words: &self.matching_words[1..], - ids: self.ids, - char_len, - }) - } else { - MatchType::Full { char_len, ids: self.ids } - } - }) - } - - pub fn char_len(&self) -> usize { - self.char_len - } -} - -// A simple wrapper around vec so we can get contiguous but index it like it's 2D array. -struct N2Array { - y_size: usize, - buf: Vec, -} - -impl N2Array { - fn new(x: usize, y: usize, value: T) -> N2Array { - N2Array { y_size: y, buf: vec![value; x * y] } - } -} - -impl Index<(usize, usize)> for N2Array { - type Output = T; - - #[inline] - fn index(&self, (x, y): (usize, usize)) -> &T { - &self.buf[(x * self.y_size) + y] - } -} - -impl IndexMut<(usize, usize)> for N2Array { - #[inline] - fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T { - &mut self.buf[(x * self.y_size) + y] - } -} - -/// Returns the number of **bytes** we want to highlight in the `source` word. 
-/// Basically we want to highlight as much characters as possible in the source until it has too much -/// typos (= 2) -/// The algorithm is a modified -/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) -fn bytes_to_highlight(source: &str, target: &str) -> usize { - let n = source.chars().count(); - let m = target.chars().count(); - - if n == 0 { - return 0; - } - // since we allow two typos we can send two characters even if it's completely wrong - if m < 3 { - return source.chars().take(m).map(|c| c.len_utf8()).sum(); - } - if n == m && source == target { - return source.len(); - } - - let inf = n + m; - let mut matrix = N2Array::new(n + 2, m + 2, 0); - - matrix[(0, 0)] = inf; - for i in 0..=n { - matrix[(i + 1, 0)] = inf; - matrix[(i + 1, 1)] = i; - } - for j in 0..=m { - matrix[(0, j + 1)] = inf; - matrix[(1, j + 1)] = j; - } - - let mut last_row = BTreeMap::new(); - - for (row, char_s) in source.chars().enumerate() { - let mut last_match_col = 0; - let row = row + 1; - - for (col, char_t) in target.chars().enumerate() { - let col = col + 1; - let last_match_row = *last_row.get(&char_t).unwrap_or(&0); - let cost = usize::from(char_s != char_t); - - let dist_add = matrix[(row, col + 1)] + 1; - let dist_del = matrix[(row + 1, col)] + 1; - let dist_sub = matrix[(row, col)] + cost; - let dist_trans = matrix[(last_match_row, last_match_col)] - + (row - last_match_row - 1) - + 1 - + (col - last_match_col - 1); - let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans)); - matrix[(row + 1, col + 1)] = dist; - - if cost == 0 { - last_match_col = col; - } - } - - last_row.insert(char_s, row); - } - - let mut minimum = (u32::max_value(), 0); - for x in 0..=m { - let dist = matrix[(n + 1, x + 1)] as u32; - if dist < minimum.0 { - minimum = (dist, x); - } - } - - // everything was done characters wise and now we want to returns a number of bytes - source.chars().take(minimum.1).map(|c| c.len_utf8()).sum() -} - -#[cfg(test)] -mod tests { - use std::borrow::Cow; - use std::str::from_utf8; - - use charabia::TokenKind; - - use super::{MatchingWords, *}; - - #[test] - fn test_bytes_to_highlight() { - struct TestBytesToHighlight { - query: &'static str, - text: &'static str, - length: usize, - } - let tests = [ - TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() }, - TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() }, - TestBytesToHighlight { - query: "Levenshtein", - text: "Levenshtein", - length: "Levenshtein".len(), - }, - // we get to the end of our word with only one typo - TestBytesToHighlight { - query: "Levenste", - text: "Levenshtein", - length: "Levenste".len(), - }, - // we get our third and last authorized typo right on the last character - TestBytesToHighlight { - query: "Levenstein", - text: "Levenshte", - length: "Levenste".len(), - }, - // we get to the end of our word with only two typos at the beginning - TestBytesToHighlight { - query: "Bavenshtein", - text: "Levenshtein", - length: "Bavenshtein".len(), - }, - TestBytesToHighlight { - query: "Альфа", text: "Альфой", length: "Альф".len() - }, - TestBytesToHighlight { - query: "Go💼", text: "Go💼od luck.", length: "Go💼".len() - }, - TestBytesToHighlight { - query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len() - }, - TestBytesToHighlight { - query: "chäräcters", - text: "chäräcters", - length: "chäräcters".len(), - }, - TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() }, - TestBytesToHighlight { query: "chär", 
text: "chäräcters", length: "chär".len() }, - ]; - - for test in &tests { - let length = bytes_to_highlight(test.text, test.query); - assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text); - assert!( - from_utf8(&test.query.as_bytes()[..length]).is_ok(), - r#"converting {}[..{}] to an utf8 str failed"#, - test.query, - length - ); - } - } - - #[test] - fn matching_words() { - let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()), - Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), - ]; - let matching_words = vec![ - (vec![all[0].clone()], vec![0]), - (vec![all[1].clone()], vec![1]), - (vec![all[2].clone()], vec![2]), - ]; - - let matching_words = MatchingWords::new(matching_words).unwrap(); - - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("word"), - char_end: "word".chars().count(), - byte_end: "word".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 3, ids: &[2] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("nyc"), - char_end: "nyc".chars().count(), - byte_end: "nyc".len(), - ..Default::default() - }) - .next(), - None - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("world"), - char_end: "world".chars().count(), - byte_end: "world".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 5, ids: &[2] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("splitted"), - char_end: "splitted".chars().count(), - byte_end: "splitted".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 5, ids: &[0] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("thisnew"), - char_end: "thisnew".chars().count(), - byte_end: "thisnew".len(), - ..Default::default() - }) - .next(), - None - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("borld"), - char_end: "borld".chars().count(), - byte_end: "borld".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 5, ids: &[2] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("wordsplit"), - char_end: "wordsplit".chars().count(), - byte_end: "wordsplit".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 4, ids: &[2] }) - ); - } -} diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs deleted file mode 100644 index c634ae297..000000000 --- a/milli/src/search/matches/mod.rs +++ /dev/null @@ -1,865 +0,0 @@ -use std::borrow::Cow; - -use charabia::{SeparatorKind, Token, Tokenizer}; -use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; -pub use matching_words::{MatchingWord, MatchingWords}; -use serde::Serialize; - -pub mod matching_words; - -const DEFAULT_CROP_MARKER: &str = "…"; -const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; -const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; - -/// Structure used to build a Matcher allowing to customize formating tags. 
-pub struct MatcherBuilder<'a, A> { - matching_words: MatchingWords, - tokenizer: Tokenizer<'a, 'a, A>, - crop_marker: Option, - highlight_prefix: Option, - highlight_suffix: Option, -} - -impl<'a, A> MatcherBuilder<'a, A> { - pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { - Self { - matching_words, - tokenizer, - crop_marker: None, - highlight_prefix: None, - highlight_suffix: None, - } - } - - pub fn crop_marker(&mut self, marker: String) -> &Self { - self.crop_marker = Some(marker); - self - } - - pub fn highlight_prefix(&mut self, prefix: String) -> &Self { - self.highlight_prefix = Some(prefix); - self - } - - pub fn highlight_suffix(&mut self, suffix: String) -> &Self { - self.highlight_suffix = Some(suffix); - self - } - - pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { - let crop_marker = match &self.crop_marker { - Some(marker) => marker.as_str(), - None => DEFAULT_CROP_MARKER, - }; - - let highlight_prefix = match &self.highlight_prefix { - Some(marker) => marker.as_str(), - None => DEFAULT_HIGHLIGHT_PREFIX, - }; - let highlight_suffix = match &self.highlight_suffix { - Some(marker) => marker.as_str(), - None => DEFAULT_HIGHLIGHT_SUFFIX, - }; - Matcher { - text, - matching_words: &self.matching_words, - tokenizer: &self.tokenizer, - crop_marker, - highlight_prefix, - highlight_suffix, - matches: None, - } - } -} - -#[derive(Copy, Clone, Default)] -pub struct FormatOptions { - pub highlight: bool, - pub crop: Option, -} - -impl FormatOptions { - pub fn merge(self, other: Self) -> Self { - Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } - } -} - -#[derive(Clone, Debug)] -pub struct Match { - match_len: usize, - // ids of the query words that matches. - ids: Vec, - // position of the word in the whole text. - word_position: usize, - // position of the token in the whole text. - token_position: usize, -} - -#[derive(Serialize, Debug, Clone, PartialEq, Eq)] -pub struct MatchBounds { - pub start: usize, - pub length: usize, -} - -/// Structure used to analize a string, compute words that match, -/// and format the source string, returning a highlighted and cropped sub-string. -pub struct Matcher<'t, 'm, A> { - text: &'t str, - matching_words: &'m MatchingWords, - tokenizer: &'m Tokenizer<'m, 'm, A>, - crop_marker: &'m str, - highlight_prefix: &'m str, - highlight_suffix: &'m str, - matches: Option<(Vec>, Vec)>, -} - -impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { - /// Iterates over tokens and save any of them that matches the query. - fn compute_matches(&mut self) -> &mut Self { - /// some words are counted as matches only if they are close together and in the good order, - /// compute_partial_match peek into next words to validate if the match is complete. - fn compute_partial_match<'a>( - mut partial: PartialMatch, - token_position: usize, - word_position: usize, - words_positions: &mut impl Iterator)>, - matches: &mut Vec, - ) -> bool { - let mut potential_matches = vec![(token_position, word_position, partial.char_len())]; - - for (token_position, word_position, word) in words_positions { - partial = match partial.match_token(word) { - // token matches the partial match, but the match is not full, - // we temporarly save the current token then we try to match the next one. 
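// For example, with the split phrase "do or", the token "do" on its own only
// yields a Partial match; the positions saved here are promoted to real
// matches only if the next token turns out to be "or", completing the match
// (see the `partial_matches` test at the end of this file).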
- Some(MatchType::Partial(partial)) => { - potential_matches.push((token_position, word_position, partial.char_len())); - partial - } - // partial match is now full, we keep this matches and we advance positions - Some(MatchType::Full { char_len, ids }) => { - // save previously matched tokens as matches. - let iter = potential_matches.into_iter().map( - |(token_position, word_position, match_len)| Match { - match_len, - ids: ids.to_vec(), - word_position, - token_position, - }, - ); - matches.extend(iter); - - // save the token that closes the partial match as a match. - matches.push(Match { - match_len: char_len, - ids: ids.to_vec(), - word_position, - token_position, - }); - - // the match is complete, we return true. - return true; - } - // no match, continue to next match. - None => break, - }; - } - - // the match is not complete, we return false. - false - } - - let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect(); - let mut matches = Vec::new(); - - let mut words_positions = tokens - .iter() - .scan((0, 0), |(token_position, word_position), token| { - let current_token_position = *token_position; - let current_word_position = *word_position; - *token_position += 1; - if !token.is_separator() { - *word_position += 1; - } - - Some((current_token_position, current_word_position, token)) - }) - .filter(|(_, _, token)| !token.is_separator()); - - while let Some((token_position, word_position, word)) = words_positions.next() { - for match_type in self.matching_words.match_token(word) { - match match_type { - // we match, we save the current token as a match, - // then we continue the rest of the tokens. - MatchType::Full { char_len, ids } => { - matches.push(Match { - match_len: char_len, - ids: ids.to_vec(), - word_position, - token_position, - }); - break; - } - // we match partially, iterate over next tokens to check if we can complete the match. - MatchType::Partial(partial) => { - // if match is completed, we break the matching loop over the current token, - // then we continue the rest of the tokens. - let mut wp = words_positions.clone(); - if compute_partial_match( - partial, - token_position, - word_position, - &mut wp, - &mut matches, - ) { - words_positions = wp; - break; - } - } - } - } - } - - self.matches = Some((tokens, matches)); - self - } - - /// Returns boundaries of the words that match the query. - pub fn matches(&mut self) -> Vec { - match &self.matches { - None => self.compute_matches().matches(), - Some((tokens, matches)) => matches - .iter() - .map(|m| MatchBounds { - start: tokens[m.token_position].byte_start, - length: m.match_len, - }) - .collect(), - } - } - - /// Returns the bounds in byte index of the crop window. - fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) { - // if there is no match, we start from the beginning of the string by default. - let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); - let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); - let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); - let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); - - // matches needs to be counted in the crop len. 
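// Worked example for the line below, with assumed values: for crop_size = 10,
// a first match at word position 4 and a last match at word position 7, the
// matched interval spans 7 - 4 = 3 word slots, so
// remaining_words = 10 + 4 - 7 = 7 words are left to grow the window around
// the matches.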
- let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; - - // create the initial state of the crop window: 2 iterators starting from the matches positions, - // a reverse iterator starting from the first match token position and going towards the beginning of the text, - let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); - // an iterator starting from the last match token position and going towards the end of the text. - let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); - - // grows the crop window peeking in both directions - // until the window contains the good number of words: - while remaining_words > 0 { - let before_token = before_tokens.peek().map(|t| t.separator_kind()); - let after_token = after_tokens.peek().map(|t| t.separator_kind()); - - match (before_token, after_token) { - // we can expand both sides. - (Some(before_token), Some(after_token)) => { - match (before_token, after_token) { - // if they are both separators and are the same kind then advance both, - // or expand in the soft separator separator side. - (Some(before_token_kind), Some(after_token_kind)) => { - if before_token_kind == after_token_kind { - before_tokens.next(); - - // this avoid having an ending separator before crop marker. - if remaining_words > 1 { - after_tokens.next(); - } - } else if before_token_kind == SeparatorKind::Hard { - after_tokens.next(); - } else { - before_tokens.next(); - } - } - // if one of the tokens is a word, we expend in the side of the word. - // left is a word, advance left. - (None, Some(_)) => { - before_tokens.next(); - remaining_words -= 1; - } - // right is a word, advance right. - (Some(_), None) => { - after_tokens.next(); - remaining_words -= 1; - } - // both are words, advance left then right if remaining_word > 0. - (None, None) => { - before_tokens.next(); - remaining_words -= 1; - - if remaining_words > 0 { - after_tokens.next(); - remaining_words -= 1; - } - } - } - } - // the end of the text is reached, advance left. - (Some(before_token), None) => { - before_tokens.next(); - if before_token.is_none() { - remaining_words -= 1; - } - } - // the start of the text is reached, advance right. - (None, Some(after_token)) => { - after_tokens.next(); - if after_token.is_none() { - remaining_words -= 1; - } - } - // no more token to add. - (None, None) => break, - } - } - - // finally, keep the byte index of each bound of the crop window. 
- let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); - let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); - - (crop_byte_start, crop_byte_end) - } - - /// Compute the score of a match interval: - /// 1) count unique matches - /// 2) calculate distance between matches - /// 3) count ordered matches - fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { - let mut ids: Vec = Vec::with_capacity(matches.len()); - let mut order_score = 0; - let mut distance_score = 0; - - let mut iter = matches.iter().peekable(); - while let Some(m) = iter.next() { - if let Some(next_match) = iter.peek() { - // if matches are ordered - if next_match.ids.iter().min() > m.ids.iter().min() { - order_score += 1; - } - - // compute distance between matches - distance_score -= (next_match.word_position - m.word_position).min(7) as i16; - } - - ids.extend(m.ids.iter()); - } - - ids.sort_unstable(); - ids.dedup(); - let uniq_score = ids.len() as i16; - - // rank by unique match count, then by distance between matches, then by ordered match count. - (uniq_score, distance_score, order_score) - } - - /// Returns the matches interval where the score computed by match_interval_score is the best. - fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { - // we compute the matches interval if we have at least 2 matches. - if matches.len() > 1 { - // positions of the first and the last match of the best matches interval in `matches`. - let mut best_interval = (0, 0); - let mut best_interval_score = self.match_interval_score(&matches[0..=0]); - // current interval positions. - let mut interval_first = 0; - let mut interval_last = 0; - for (index, next_match) in matches.iter().enumerate().skip(1) { - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. - if next_match.word_position - matches[interval_first].word_position >= crop_size { - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - - // keep interval if it's the best - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - best_interval_score = interval_score; - } - - // advance start of the interval while interval is longer than crop_size. - while next_match.word_position - matches[interval_first].word_position - >= crop_size - { - interval_first += 1; - } - } - interval_last = index; - } - - // compute the last interval score and compare it to the best one. - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - } - - &matches[best_interval.0..=best_interval.1] - } else { - matches - } - } - - // Returns the formatted version of the original text. - pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { - if !format_options.highlight && format_options.crop.is_none() { - // compute matches is not needed if no highlight nor crop is requested. - Cow::Borrowed(self.text) - } else { - match &self.matches { - Some((tokens, matches)) => { - // If the text has to be cropped, - // compute the best interval to crop around. 
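// The "best" interval is the one whose (uniq_score, distance_score,
// order_score) triple from match_interval_score compares greatest; Rust tuples
// compare lexicographically, so for example (3, -12, 0) beats (2, -1, 2):
// matching three unique query words dominates any distance or ordering bonus.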
- let matches = match format_options.crop { - Some(crop_size) if crop_size > 0 => { - self.find_best_match_interval(matches, crop_size) - } - _ => matches, - }; - - // If the text has to be cropped, - // crop around the best interval. - let (byte_start, byte_end) = match format_options.crop { - Some(crop_size) if crop_size > 0 => { - self.crop_bounds(tokens, matches, crop_size) - } - _ => (0, self.text.len()), - }; - - let mut formatted = Vec::new(); - - // push crop marker if it's not the start of the text. - if byte_start > 0 && !self.crop_marker.is_empty() { - formatted.push(self.crop_marker); - } - - let mut byte_index = byte_start; - - if format_options.highlight { - // insert highlight markers around matches. - for m in matches { - let token = &tokens[m.token_position]; - - if byte_index < token.byte_start { - formatted.push(&self.text[byte_index..token.byte_start]); - } - - let highlight_byte_index = self.text[token.byte_start..] - .char_indices() - .enumerate() - .find(|(i, _)| *i == m.match_len) - .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); - formatted.push(self.highlight_prefix); - formatted.push(&self.text[token.byte_start..highlight_byte_index]); - formatted.push(self.highlight_suffix); - // if it's a prefix highlight, we put the end of the word after the highlight marker. - if highlight_byte_index < token.byte_end { - formatted.push(&self.text[highlight_byte_index..token.byte_end]); - } - - byte_index = token.byte_end; - } - } - - // push the rest of the text between last match and the end of crop. - if byte_index < byte_end { - formatted.push(&self.text[byte_index..byte_end]); - } - - // push crop marker if it's not the end of the text. - if byte_end < self.text.len() && !self.crop_marker.is_empty() { - formatted.push(self.crop_marker); - } - - if formatted.len() == 1 { - // avoid concatenating if there is already 1 slice. - Cow::Borrowed(&self.text[byte_start..byte_end]) - } else { - Cow::Owned(formatted.concat()) - } - } - None => self.compute_matches().format(format_options), - } - } - } -} - -#[cfg(test)] -mod tests { - use std::rc::Rc; - - use charabia::TokenizerBuilder; - - use super::*; - use crate::search::matches::matching_words::MatchingWord; - - fn matching_words() -> MatchingWords { - let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), - ]; - let matching_words = vec![ - (vec![all[0].clone()], vec![0]), - (vec![all[1].clone()], vec![1]), - (vec![all[2].clone()], vec![2]), - ]; - - MatchingWords::new(matching_words).unwrap() - } - - impl MatcherBuilder<'_, Vec> { - pub fn from_matching_words(matching_words: MatchingWords) -> Self { - Self::new(matching_words, TokenizerBuilder::default().build()) - } - } - - #[test] - fn format_identity() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: false, crop: None }; - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. 
Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - } - - #[test] - fn format_highlight() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: true, crop: None }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ""); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ":-)"); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // no crop should return complete text, because there is no matches. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Natalie risk her future to build a world with the boy she loves." - ); - } - - #[test] - fn highlight_unicode() { - let all = vec![ - Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()), - Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), - ]; - let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])]; - - let matching_words = MatchingWords::new(matching_words).unwrap(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: true, crop: None }; - - // Text containing prefix match. - let text = "Ŵôřlḑôle"; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Ŵôřlḑôle" - ); - - // Text containing unicode match. - let text = "Ŵôřlḑ"; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Ŵôřlḑ" - ); - - // Text containing unicode match. - let text = "Westfália"; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. 
- insta::assert_snapshot!( - matcher.format(format_options), - @"Westfália" - ); - } - - #[test] - fn format_crop() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: false, crop: Some(10) }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @"" - ); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @":-)" - ); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // no highlight should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"A quick brown fox can not jump 32 feet, right…" - ); - - // Text without any match starting by a separator. - let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; - let mut matcher = builder.build(text); - // no highlight should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"(A quick brown fox can not jump 32 feet, right…" - ); - - // Test phrase propagation - let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; - let mut matcher = builder.build(text); - // should crop the phrase instead of croping around the match. - insta::assert_snapshot!( - matcher.format(format_options), - @"… Split The World is a book written by Emily Henry…" - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // no highlight should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…future to build a world with the boy she loves…" - ); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // no highlight should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing a match unordered and a match ordered. - let text = "The world split void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - - // Text containing matches with diferent density. - let text = "split void the void void world void void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - - // Text containing matches with same word. - let text = "split split split split split split void void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. 
- insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - } - - #[test] - fn format_highlight_crop() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: true, crop: Some(10) }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @"" - ); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @":-)" - ); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // both should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"A quick brown fox can not jump 32 feet, right…" - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // both should return 10 last words with a marker at the start and highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…future to build a world with the boy she loves…" - ); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // both should return 10 last words with a marker at the start and highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing a match unordered and a match ordered. - let text = "The world split void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - } - - #[test] - fn smaller_crop_size() { - //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let text = "void void split the world void void."; - - // set a smaller crop size - let format_options = FormatOptions { highlight: false, crop: Some(2) }; - let mut matcher = builder.build(text); - // because crop size < query size, partially format matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…split the…" - ); - - // set a smaller crop size - let format_options = FormatOptions { highlight: false, crop: Some(1) }; - let mut matcher = builder.build(text); - // because crop size < query size, partially format matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…split…" - ); - - // set crop size to 0 - let format_options = FormatOptions { highlight: false, crop: Some(0) }; - let mut matcher = builder.build(text); - // because crop size is 0, crop is ignored. - insta::assert_snapshot!( - matcher.format(format_options), - @"void void split the world void void." 
- );
- }
-
- #[test]
- fn partial_matches() {
- let all = vec![
- Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
- Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()),
- Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()),
- Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()),
- Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()),
- Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()),
- ];
- let matching_words = vec![
- (vec![all[0].clone()], vec![0]),
- (vec![all[1].clone(), all[2].clone()], vec![0]),
- (vec![all[3].clone()], vec![1]),
- (vec![all[4].clone(), all[5].clone()], vec![1]),
- (vec![all[4].clone()], vec![2]),
- ];
-
- let matching_words = MatchingWords::new(matching_words).unwrap();
-
- let mut builder = MatcherBuilder::from_matching_words(matching_words);
- builder.highlight_prefix("_".to_string());
- builder.highlight_suffix("_".to_string());
-
- let format_options = FormatOptions { highlight: true, crop: None };
-
- let text = "the do or die can't be he do and or isn't he";
- let mut matcher = builder.build(text);
- insta::assert_snapshot!(
- matcher.format(format_options),
- @"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_"
- );
- }
-}
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 3683a5cf0..3e372e551 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -17,7 +17,6 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true));
 
 pub mod facet;
 mod fst_utils;
-mod matches;
 pub mod new;
 
 pub struct Search<'a> {
From ae17c62e24583cbc12d79701937803d42d7707a1 Mon Sep 17 00:00:00 2001
From: ManyTheFish 
Date: Thu, 6 Apr 2023 14:07:18 +0200
Subject: [PATCH 158/234] Remove warnings

---
 milli/src/search/new/matches/mod.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 2a9596902..2b87963ab 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -5,9 +5,6 @@ pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch, WordId};
 use serde::Serialize;
 
-use super::query_term::LocatedQueryTerm;
-use crate::SearchContext;
-
 pub mod matching_words;
 
 const DEFAULT_CROP_MARKER: &str = "…";
From 47f6a3ad3df3b1e6beff46821d4f0ad906cef1c6 Mon Sep 17 00:00:00 2001
From: ManyTheFish 
Date: Thu, 6 Apr 2023 15:02:23 +0200
Subject: [PATCH 159/234] Take into account that a logger needs the search
 context

---
 milli/src/search/mod.rs     | 38 ++++++++++++++---------
 milli/src/search/new/mod.rs | 61 +++++++++++++++++--------------------
 2 files changed, 52 insertions(+), 47 deletions(-)

diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 3e372e551..c4dfdd6b3 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -6,6 +6,7 @@ use roaring::bitmap::RoaringBitmap;
 
 pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
 pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
+use self::new::PartialSearchResult;
 use crate::{
 execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
 };
@@ -106,20 +107,29 @@ impl<'a> Search<'a> {
 }
 
 pub fn execute(&self) -> Result {
- let ctx =
SearchContext::new(self.index, self.rtxn); - execute_search( - ctx, - &self.query, - self.terms_matching_strategy, - self.exhaustive_number_hits, - &self.filter, - &self.sort_criteria, - self.offset, - self.limit, - Some(self.words_limit), - &mut DefaultSearchLogger, - &mut DefaultSearchLogger, - ) + let mut ctx = SearchContext::new(self.index, self.rtxn); + let PartialSearchResult { located_query_terms, candidates, documents_ids } = + execute_search( + &mut ctx, + &self.query, + self.terms_matching_strategy, + self.exhaustive_number_hits, + &self.filter, + &self.sort_criteria, + self.offset, + self.limit, + Some(self.words_limit), + &mut DefaultSearchLogger, + &mut DefaultSearchLogger, + )?; + + // consume context and located_query_terms to build MatchingWords. + let matching_words = match located_query_terms { + Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms), + None => MatchingWords::default(), + }; + + Ok(SearchResult { matching_words, candidates, documents_ids }) } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 0bb454c06..fc4d3b64c 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -28,7 +28,7 @@ use interner::DedupInterner; pub use logger::detailed::DetailedSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; -use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; +use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm}; use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; @@ -39,10 +39,7 @@ use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::resolve_query_graph::compute_query_graph_docids; use self::sort::Sort; use crate::search::new::distinct::apply_distinct_rule; -use crate::{ - AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy, - UserError, -}; +use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; /// A structure used throughout the execution of a search query. pub struct SearchContext<'ctx> { @@ -54,6 +51,7 @@ pub struct SearchContext<'ctx> { pub term_interner: Interner, pub phrase_docids: PhraseDocIdsCache, } + impl<'ctx> SearchContext<'ctx> { pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { Self { @@ -271,7 +269,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( #[allow(clippy::too_many_arguments)] pub fn execute_search( - mut ctx: SearchContext, + ctx: &mut SearchContext, query: &Option, terms_matching_strategy: TermsMatchingStrategy, exhaustive_number_hits: bool, @@ -282,11 +280,11 @@ pub fn execute_search( words_limit: Option, placeholder_search_logger: &mut dyn SearchLogger, query_graph_logger: &mut dyn SearchLogger, -) -> Result { +) -> Result { let mut universe = if let Some(filters) = filters { - filters.evaluate(&mut ctx.txn, &mut ctx.index)? + filters.evaluate(ctx.txn, ctx.index)? } else { - ctx.index.documents_ids(&mut ctx.txn)? + ctx.index.documents_ids(ctx.txn)? }; let mut located_query_terms = None; @@ -294,12 +292,12 @@ pub fn execute_search( // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. 
let mut tokbuilder = TokenizerBuilder::new(); - let stop_words = &mut ctx.index.stop_words(&mut ctx.txn)?; + let stop_words = ctx.index.stop_words(ctx.txn)?; if let Some(ref stop_words) = stop_words { tokbuilder.stop_words(stop_words); } - let script_lang_map = &mut ctx.index.script_language(&mut ctx.txn)?; + let script_lang_map = ctx.index.script_language(ctx.txn)?; if !script_lang_map.is_empty() { tokbuilder.allow_list(&script_lang_map); } @@ -307,31 +305,28 @@ pub fn execute_search( let tokenizer = tokbuilder.build(); let tokens = tokenizer.tokenize(query); - let query_terms = located_query_terms_from_string(&mut ctx, tokens, words_limit)?; - let graph = QueryGraph::from_query(&mut ctx, &query_terms)?; + let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; + let graph = QueryGraph::from_query(ctx, &query_terms)?; located_query_terms = Some(query_terms); - check_sort_criteria(&mut ctx, sort_criteria.as_ref())?; + check_sort_criteria(ctx, sort_criteria.as_ref())?; universe = resolve_maximally_reduced_query_graph( - &mut ctx, + ctx, &universe, &graph, terms_matching_strategy, query_graph_logger, )?; - let ranking_rules = get_ranking_rules_for_query_graph_search( - &mut ctx, - sort_criteria, - terms_matching_strategy, - )?; + let ranking_rules = + get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; - bucket_sort(&mut ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? + bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? } else { - let ranking_rules = get_ranking_rules_for_placeholder_search(&mut ctx, sort_criteria)?; + let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?; bucket_sort( - &mut ctx, + ctx, ranking_rules, &PlaceholderQuery, &universe, @@ -345,20 +340,14 @@ pub fn execute_search( // is requested and a distinct attribute is set. let mut candidates = universe; if exhaustive_number_hits { - if let Some(f) = &mut ctx.index.distinct_field(&mut ctx.txn)? { - if let Some(distinct_fid) = ctx.index.fields_ids_map(&mut ctx.txn)?.id(f) { - candidates = apply_distinct_rule(&mut ctx, distinct_fid, &candidates)?.remaining; + if let Some(f) = ctx.index.distinct_field(ctx.txn)? { + if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) { + candidates = apply_distinct_rule(ctx, distinct_fid, &candidates)?.remaining; } } } - // consume context and located_query_terms to build MatchingWords. 
-        let matching_words = match located_query_terms {
-            Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
-            None => MatchingWords::default(),
-        };
-
-        Ok(SearchResult { matching_words, candidates, documents_ids })
+    Ok(PartialSearchResult { located_query_terms, candidates, documents_ids })
 }

 fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec>) -> Result<()> {
@@ -402,3 +391,9 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec>

     Ok(())
 }
+
+pub struct PartialSearchResult {
+    pub located_query_terms: Option>,
+    pub candidates: RoaringBitmap,
+    pub documents_ids: Vec,
+}

From 1ba8a40d61ffa363a7c08ec084c863c8780970d6 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Thu, 6 Apr 2023 15:10:16 +0200
Subject: [PATCH 160/234] Remove formatting benchmarks because they can't be
 isolated easily anymore

---
 benchmarks/Cargo.toml            |  4 --
 benchmarks/benches/formatting.rs | 67 --------------------------------
 2 files changed, 71 deletions(-)
 delete mode 100644 benchmarks/benches/formatting.rs

diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index 5203a7601..f0ed054df 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -48,7 +48,3 @@ harness = false
 [[bench]]
 name = "indexing"
 harness = false
-
-[[bench]]
-name = "formatting"
-harness = false

diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs
deleted file mode 100644
index 2e0fa0ce7..000000000
--- a/benchmarks/benches/formatting.rs
+++ /dev/null
@@ -1,67 +0,0 @@
-use std::rc::Rc;
-
-use criterion::{criterion_group, criterion_main};
-use milli::tokenizer::TokenizerBuilder;
-use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords};
-
-#[global_allocator]
-static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
-
-struct Conf<'a> {
-    name: &'a str,
-    text: &'a str,
-    matching_words: MatcherBuilder<'a, Vec>,
-}
-
-fn bench_formatting(c: &mut criterion::Criterion) {
-    #[rustfmt::skip]
-    let confs = &[
-        Conf {
-            name: "'the door d'",
-            text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter.
But Theodor said that the doors don't work."#, - matching_words: MatcherBuilder::new(MatchingWords::new(vec![ - (vec![Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap())], vec![0]), - (vec![Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap())], vec![0]), - (vec![Rc::new(MatchingWord::new("door".to_string(), 1, false).unwrap())], vec![1]), - (vec![Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap())], vec![0]), - (vec![Rc::new(MatchingWord::new("thedoor".to_string(), 1, false).unwrap())], vec![0, 1]), - (vec![Rc::new(MatchingWord::new("d".to_string(), 0, true).unwrap())], vec![2]), - (vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]), - (vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]), - ] - ).unwrap(), TokenizerBuilder::default().build()), - }, - ]; - - let format_options = &[ - FormatOptions { highlight: false, crop: None }, - FormatOptions { highlight: true, crop: None }, - FormatOptions { highlight: false, crop: Some(10) }, - FormatOptions { highlight: true, crop: Some(10) }, - FormatOptions { highlight: false, crop: Some(20) }, - FormatOptions { highlight: true, crop: Some(20) }, - ]; - - for option in format_options { - let highlight = if option.highlight { "highlight" } else { "no-highlight" }; - - let name = match option.crop { - Some(size) => format!("{}-crop({})", highlight, size), - None => format!("{}-no-crop", highlight), - }; - - let mut group = c.benchmark_group(&name); - for conf in confs { - group.bench_function(conf.name, |b| { - b.iter(|| { - let mut matcher = conf.matching_words.build(conf.text); - matcher.format(*option); - }) - }); - } - group.finish(); - } -} - -criterion_group!(benches, bench_formatting); -criterion_main!(benches); From ba8dcc2d78dc6aca5adfdee3be9ef0804a37b372 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 15:50:47 +0200 Subject: [PATCH 161/234] Fix clippy --- milli/src/search/new/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 2b87963ab..8dded0cab 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -503,7 +503,7 @@ mod tests { use super::*; use crate::SearchContext; - impl<'a, 'ctx> MatcherBuilder<'a, &[u8]> { + impl<'a> MatcherBuilder<'a, &[u8]> { pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self { let tokenizer = TokenizerBuilder::new().build(); let tokens = tokenizer.tokenize(query); From f7e7f438f89e40890fb3f2964c239ec609a0e508 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 17:22:31 +0200 Subject: [PATCH 162/234] Patch prefix match --- milli/src/search/new/matches/matching_words.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 4ca04884a..d5d1b6906 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -20,6 +20,7 @@ pub struct LocatedMatchingWords { pub value: Vec>, pub positions: RangeInclusive, pub is_prefix: bool, + pub original_char_count: usize, } /// Structure created from a query tree @@ -101,10 +102,12 @@ impl MatchingWords { positions: located_term.positions.clone(), }); } + words.push(LocatedMatchingWords { 
value: matching_words, positions: located_term.positions.clone(), is_prefix: term.is_prefix, + original_char_count: ctx.word_interner.get(term.original).chars().count(), }); } @@ -131,7 +134,11 @@ impl MatchingWords { let word = self.word_interner.get(*word); // if the word is a prefix we match using starts_with. if located_words.is_prefix && token.lemma().starts_with(word) { - let char_len = token.original_lengths(word.len()).0; + let Some((char_index, c)) = word.char_indices().take(located_words.original_char_count).last() else { + continue; + }; + let prefix_length = char_index + c.len_utf8(); + let char_len = token.original_lengths(prefix_length).0; let ids = &located_words.positions; return Some(MatchType::Full { char_len, ids }); // else we exact match the token. From 7276deee0a6202dfccaafe0dacaa32fa60264c64 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:04:46 +0200 Subject: [PATCH 163/234] Add new db caches --- milli/src/search/new/db_cache.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index af94108e2..effd123be 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -24,6 +24,8 @@ pub struct DatabaseCache<'ctx> { pub word_docids: FxHashMap, Option<&'ctx [u8]>>, pub exact_word_docids: FxHashMap, Option<&'ctx [u8]>>, pub word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, + pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, } impl<'ctx> DatabaseCache<'ctx> { fn get_value<'v, K1, KC>( @@ -128,4 +130,32 @@ impl<'ctx> SearchContext<'ctx> { self.index.prefix_word_pair_proximity_docids.remap_data_type::(), ) } + + pub fn get_db_word_position_docids( + &mut self, + word: Interned, + position: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, position), + &(self.word_interner.get(word).as_str(), position), + &mut self.db_cache.word_position_docids, + self.index.word_position_docids.remap_data_type::(), + ) + } + + pub fn get_db_word_fid_docids( + &mut self, + word: Interned, + fid: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, fid), + &(self.word_interner.get(word).as_str(), fid), + &mut self.db_cache.word_fid_docids, + self.index.word_fid_docids.remap_data_type::(), + ) + } } From 2c9822a3371088025db976aadba05b8e958d9a2c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:06:07 +0200 Subject: [PATCH 164/234] Rename `is_multiple_words` to `is_ngram` and `zero_typo` to `exact` --- milli/src/search/new/logger/detailed.rs | 4 ++-- milli/src/search/new/query_term.rs | 32 +++++++++++++++---------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 3a02950a8..3c4779ad9 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -441,7 +441,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ }) => { let QueryTerm { original, - is_multiple_words: _, + is_ngram: _, is_prefix: _, max_nbr_typos, zero_typo, @@ -458,7 +458,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ ) .unwrap(); - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = zero_typo; for w in zero_typo.iter().copied() { diff --git a/milli/src/search/new/query_term.rs 
b/milli/src/search/new/query_term.rs index d19ab6135..90b03d194 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -204,8 +204,13 @@ impl QueryTermSubset { } if !self.zero_typo_subset.is_empty() { - let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = - &original.zero_typo; + let ZeroTypoTerm { + phrase: _, + exact: zero_typo, + prefix_of, + synonyms: _, + use_prefix_db: _, + } = &original.zero_typo; result.extend(zero_typo.iter().copied()); result.extend(prefix_of.iter().copied()); }; @@ -258,7 +263,7 @@ impl QueryTermSubset { )?; } - let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = + let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } = &original.zero_typo; result.extend(phrase.iter().copied()); result.extend(synonyms.iter().copied()); @@ -302,7 +307,7 @@ impl QueryTerm { #[derive(Clone, PartialEq, Eq, Hash)] pub struct QueryTerm { pub original: Interned, - pub is_multiple_words: bool, + pub is_ngram: bool, pub max_nbr_typos: u8, pub is_prefix: bool, pub zero_typo: ZeroTypoTerm, @@ -318,7 +323,7 @@ pub struct ZeroTypoTerm { /// The original phrase, if any pub phrase: Option>, /// A single word equivalent to the original term, with zero typos - pub zero_typo: Option>, + pub exact: Option>, /// All the words that contain the original word as prefix pub prefix_of: BTreeSet>, /// All the synonyms of the original word or phrase @@ -341,7 +346,7 @@ pub struct TwoTypoTerm { impl ZeroTypoTerm { fn is_empty(&self) -> bool { - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self; + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self; phrase.is_none() && zero_typo.is_none() && prefix_of.is_empty() @@ -370,12 +375,12 @@ impl QueryTerm { ) -> Self { Self { original: word_interner.insert(phrase.description(word_interner)), - is_multiple_words: false, + is_ngram: false, max_nbr_typos: 0, is_prefix: false, zero_typo: ZeroTypoTerm { phrase: Some(phrase_interner.insert(phrase)), - zero_typo: None, + exact: None, prefix_of: BTreeSet::default(), synonyms: BTreeSet::default(), use_prefix_db: None, @@ -387,7 +392,7 @@ impl QueryTerm { pub fn empty(word_interner: &mut DedupInterner, original: &str) -> Self { Self { original: word_interner.insert(original.to_owned()), - is_multiple_words: false, + is_ngram: false, is_prefix: false, max_nbr_typos: 0, zero_typo: <_>::default(), @@ -606,11 +611,12 @@ fn partially_initialized_term_from_word( Some(ctx.phrase_interner.insert(Phrase { words })) }) .collect(); - let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db }; + let zero_typo = + ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db }; Ok(QueryTerm { original: word_interned, - is_multiple_words: false, + is_ngram: false, max_nbr_typos: max_typo, is_prefix, zero_typo, @@ -765,7 +771,7 @@ fn split_best_frequency( impl QueryTerm { /// Return the original word from the given query term pub fn original_single_word(&self) -> Option> { - if self.is_multiple_words { + if self.is_ngram { None } else { Some(self.original) @@ -1039,7 +1045,7 @@ pub fn make_ngram( let term = QueryTerm { original, - is_multiple_words: true, + is_ngram: true, is_prefix, max_nbr_typos, zero_typo: term.zero_typo, From 996619b22a024777a08483740458d48cc73f5725 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:07:26 +0200 Subject: [PATCH 165/234] Increase 
position by 8 on hard separator when building query terms --- milli/src/search/new/query_term.rs | 2 +- .../index_documents/extract/extract_docid_word_positions.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 90b03d194..005c0a2e3 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -907,7 +907,7 @@ pub fn located_query_terms_from_string( TokenKind::Separator(separator_kind) => { match separator_kind { SeparatorKind::Hard => { - position += 1; + position += 8; } SeparatorKind::Soft => { position += 0; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 2d51fcc1a..c362f8f1b 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -153,7 +153,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st /// take an iterator on tokens and compute their relative position depending on separator kinds /// if it's an `Hard` separator we add an additional relative proximity of 8 between words, -/// else we keep the standart proximity of 1 between words. +/// else we keep the standard proximity of 1 between words. fn process_tokens<'a>( tokens: impl Iterator>, ) -> impl Iterator)> { From 1b8e4d0301f81fa4349a0d047b164c46b6e12fd9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:09:32 +0200 Subject: [PATCH 166/234] Add ExactTerm and helper method --- milli/src/search/new/query_term.rs | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 005c0a2e3..4e3922980 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -4,6 +4,7 @@ use std::ops::{ControlFlow, RangeInclusive}; use charabia::normalizer::NormalizedTokenIter; use charabia::{SeparatorKind, TokenKind}; +use either::Either; use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use heed::types::DecodeIgnore; @@ -138,7 +139,43 @@ pub struct LocatedQueryTermSubset { pub term_ids: RangeInclusive, } +#[derive(Clone, Copy)] +pub enum ExactTerm { + Phrase(Interned), + Word(Interned), +} + +impl ExactTerm { + pub fn interned_words<'ctx>( + &self, + ctx: &'ctx SearchContext<'ctx>, + ) -> impl Iterator>> + 'ctx { + match *self { + ExactTerm::Phrase(phrase) => { + let phrase = ctx.phrase_interner.get(phrase); + Either::Left(phrase.words.iter().copied()) + } + ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))), + } + } +} + impl QueryTermSubset { + pub fn exact_term(&self, ctx: &SearchContext) -> Option { + let full_query_term = ctx.term_interner.get(self.original); + if full_query_term.is_ngram { + return None; + } + // TODO: included in subset + if let Some(phrase) = full_query_term.zero_typo.phrase { + self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase)) + } else if let Some(word) = full_query_term.zero_typo.exact { + self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word)) + } else { + None + } + } + pub fn empty(for_term: Interned) -> Self { Self { original: for_term, From 8a13ed7e3fd0df9c163fdbdc7d7b56e6a5b0fbe4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:12:07 +0200 Subject: [PATCH 167/234] Add exactness 
ranking rules --- milli/src/search/new/exact_attribute.rs | 175 ++++++++++++++++++ .../search/new/graph_based_ranking_rule.rs | 10 +- milli/src/search/new/mod.rs | 12 +- .../new/ranking_rule_graph/exactness/mod.rs | 107 +++++++++++ .../src/search/new/ranking_rule_graph/mod.rs | 3 + 5 files changed, 301 insertions(+), 6 deletions(-) create mode 100644 milli/src/search/new/exact_attribute.rs create mode 100644 milli/src/search/new/ranking_rule_graph/exactness/mod.rs diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs new file mode 100644 index 000000000..bb6299e28 --- /dev/null +++ b/milli/src/search/new/exact_attribute.rs @@ -0,0 +1,175 @@ +use heed::BytesDecode; +use roaring::MultiOps; + +use super::query_graph::QueryGraph; +use super::ranking_rules::{RankingRule, RankingRuleOutput}; +use crate::search::new::query_graph::QueryNodeData; +use crate::search::new::query_term::ExactTerm; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; + +/// FIXME: +/// +/// - A lot of work done in next_bucket that start_iteration could do. +/// - Consider calling the graph based rule directly from this one. +/// - currently we did exact term, don't forget about prefix +/// - some tests +pub struct ExactAttribute { + query_graph: Option, +} + +impl ExactAttribute { + pub fn new() -> Self { + Self { query_graph: None } + } +} + +impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { + fn id(&self) -> String { + "exact_attribute".to_owned() + } + + fn start_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + _universe: &roaring::RoaringBitmap, + query: &QueryGraph, + ) -> Result<()> { + self.query_graph = Some(query.clone()); + Ok(()) + } + + fn next_bucket( + &mut self, + ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + universe: &roaring::RoaringBitmap, + ) -> Result>> { + // iterate on the nodes of the graph, retain LocatedQueryTermSubset + let query_graph = self.query_graph.as_ref().unwrap(); + let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = + Vec::with_capacity(query_graph.nodes.len() as usize); + for (_, node) in query_graph.nodes.iter() { + match &node.data { + QueryNodeData::Term(term) => { + let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { + exact_term + } else { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + }; + exact_term_position_ids.push(( + exact_term, + *term.positions.start(), + *term.term_ids.start(), + )) + } + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, + } + } + + exact_term_position_ids.sort_by_key(|(_, _, id)| *id); + // bail if there is a "hole" (missing word) in remaining query graph + let mut previous_id = 0; + for (_, _, id) in exact_term_position_ids.iter().copied() { + if id < previous_id || id - previous_id > 1 { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } else { + previous_id = id; + } + } + + // sample query: "sunflower are pretty" + // sunflower at pos 0 in attr A + // are at pos 1 in attr B + // pretty at pos 2 in attr C + // We want to eliminate such document + + // first check that for each term, there exists some attribute that has this 
term at the correct position + //"word-position-docids"; + let mut candidates = universe.clone(); + let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids + .iter() + .copied() + .map(|(term, position, _)| (term.interned_words(ctx).collect(), position)) + .collect(); + for (words, position) in &words_positions { + if candidates.is_empty() { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } + + 'words: for (offset, word) in words.iter().enumerate() { + let offset = offset as u16; + let word = if let Some(word) = word { + word + } else { + continue 'words; + }; + let word_position_docids = CboRoaringBitmapCodec::bytes_decode( + ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), + ) + .unwrap_or_default(); + candidates &= word_position_docids; + } + } + + let candidates = candidates; + + if candidates.is_empty() { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } + + let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); + + let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); + + // then check that there exists at least one attribute that has all of the terms + for fid in searchable_fields_ids { + let mut intersection = MultiOps::intersection( + words_positions + .iter() + .flat_map(|(words, ..)| words.iter()) + // ignore stop words words in phrases + .flatten() + .map(|word| -> Result<_> { + Ok(ctx + .get_db_word_fid_docids(*word, fid)? + .map(CboRoaringBitmapCodec::bytes_decode) + .unwrap_or_default() + .unwrap_or_default()) + }), + )?; + intersection &= &candidates; + if !intersection.is_empty() { + candidates_per_attributes.push(intersection); + } + } + // note we could have "false positives" where there both exist different attributes that collectively + // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. 
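
A toy reproduction of that false positive, with hypothetical attribute
contents (each attribute reduced to a list of words):

    fn main() {
        let query = ["sun", "flower"];
        let attrs: &[&[&str]] = &[&["sun"], &["the", "flower"], &["flower", "sun"]];

        // pass 1: every query word appears at its query position in some attribute
        let positional_ok = query
            .iter()
            .enumerate()
            .all(|(pos, w)| attrs.iter().any(|a| a.get(pos) == Some(w)));

        // pass 2: some single attribute contains all the query words (order ignored)
        let per_attribute_ok = attrs.iter().any(|a| query.iter().all(|w| a.contains(w)));

        // ...yet no attribute starts with the query words in order
        let truly_exact = attrs.iter().any(|a| a.starts_with(&query));

        assert!(positional_ok && per_attribute_ok && !truly_exact);
    }
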
+ + let candidates = MultiOps::union(candidates_per_attributes.into_iter()); + Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) + } + + fn end_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + ) { + } +} diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index b8c58c726..28b4ed1f4 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner}; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, - TypoGraph, + ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph, + RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; @@ -65,6 +65,12 @@ impl GraphBasedRankingRule { Self::new_with_id("typo".to_owned(), terms_matching_strategy) } } +pub type Exactness = GraphBasedRankingRule; +impl GraphBasedRankingRule { + pub fn new() -> Self { + Self::new_with_id("exactness".to_owned(), None) + } +} /// A generic graph-based ranking rule pub struct GraphBasedRankingRule { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4d561d25b..779e589b3 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -9,8 +9,9 @@ mod query_term; mod ranking_rule_graph; mod ranking_rules; mod resolve_query_graph; -// TODO: documentation + comments mod small_bitmap; + +mod exact_attribute; // TODO: documentation + comments // implementation is currently an adaptation of the previous implementation to fit with the new model mod sort; @@ -33,6 +34,8 @@ use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; +use self::exact_attribute::ExactAttribute; +use self::graph_based_ranking_rule::Exactness; use self::interner::Interner; use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::resolve_query_graph::compute_query_graph_docids; @@ -150,7 +153,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut proximity = false; let mut sort = false; let attribute = false; - let exactness = false; + let mut exactness = false; let mut asc = HashSet::new(); let mut desc = HashSet::new(); @@ -211,8 +214,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( if exactness { continue; } - // todo!(); - // exactness = false; + ranking_rules.push(Box::new(ExactAttribute::new())); + ranking_rules.push(Box::new(Exactness::new())); + exactness = true; } crate::Criterion::Asc(field_name) => { if asc.contains(&field_name) { diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs new file mode 100644 index 000000000..a1e19a015 --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -0,0 +1,107 @@ +use roaring::RoaringBitmap; + +use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; +use crate::search::new::query_graph::{QueryGraph, QueryNode}; +use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; + +/// - Exactness as first ranking rule: 
TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other +/// word than a doc that matches 9 words non exactly but none exactly +/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider +/// +/// "naive vision" +/// condition from one node to another: +/// - word exactly present: cost 0 +/// - word typo/ngram/prefix/missing: cost 1, not remove from query graph, edge btwn the two nodes, return the universe without condition when resolving, destination query term is inside +/// +/// Three strategies: +/// 1. ExactAttribute: word position / word_fid_docid +/// 2. AttributeStart: +/// 3. AttributeContainsExact => implementable via `RankingRuleGraphTrait` + +#[derive(Clone, PartialEq, Eq, Hash)] +pub enum ExactnessCondition { + ExactInAttribute(LocatedQueryTermSubset), + Skip(LocatedQueryTermSubset), +} + +pub enum ExactnessGraph {} + +fn compute_docids( + ctx: &mut SearchContext, + dest_node: &LocatedQueryTermSubset, + universe: &RoaringBitmap, +) -> Result { + let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) { + exact_term + } else { + return Ok(Default::default()); + }; + let mut candidates = match exact_term { + ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), + ExactTerm::Word(word) => { + if let Some(word_candidates) = ctx.get_db_word_docids(word)? { + CboRoaringBitmapCodec::deserialize_from(word_candidates)? + } else { + return Ok(Default::default()); + } + } + }; + // TODO: synonyms? + candidates &= universe; + Ok(candidates) +} + +impl RankingRuleGraphTrait for ExactnessGraph { + type Condition = ExactnessCondition; + + fn resolve_condition( + ctx: &mut SearchContext, + condition: &Self::Condition, + universe: &RoaringBitmap, + ) -> Result { + let (docids, dest_node) = match condition { + ExactnessCondition::ExactInAttribute(dest_node) => { + (compute_docids(ctx, dest_node, universe)?, dest_node) + } + ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node), + }; + Ok(ComputedCondition { + docids, + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: dest_node.clone(), + }) + } + + fn build_edges( + _ctx: &mut SearchContext, + conditions_interner: &mut DedupInterner, + _source_node: Option<&LocatedQueryTermSubset>, + dest_node: &LocatedQueryTermSubset, + ) -> Result)>> { + let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone()); + let exact_condition = conditions_interner.insert(exact_condition); + + let skip_condition = ExactnessCondition::Skip(dest_node.clone()); + let skip_condition = conditions_interner.insert(skip_condition); + Ok(vec![(0, exact_condition), (1, skip_condition)]) + } + + fn log_state( + graph: &RankingRuleGraph, + paths: &[Vec>], + dead_ends_cache: &DeadEndsCache, + universe: &RoaringBitmap, + costs: &MappedInterner>, + cost: u64, + logger: &mut dyn SearchLogger, + ) { + todo!() + } + + fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { + todo!() + } +} diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 7c40008c8..936c3e942 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -10,6 +10,8 @@ mod cheapest_paths; mod condition_docids_cache; mod dead_ends_cache; +/// Implementation of the `exactness` ranking rule +mod exactness; /// Implementation of the `proximity` ranking rule mod proximity; /// Implementation of the `typo` ranking rule @@ -20,6 
+22,7 @@ use std::hash::Hash; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; +pub use exactness::{ExactnessCondition, ExactnessGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; From f513cf930acd3cb403cb58dc5bffcc278a5a214f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:42:51 +0200 Subject: [PATCH 168/234] Exact attribute with state --- milli/src/search/new/exact_attribute.rs | 166 +++++++++++++++++------- 1 file changed, 122 insertions(+), 44 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index bb6299e28..fa837272b 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -1,5 +1,5 @@ use heed::BytesDecode; -use roaring::MultiOps; +use roaring::{MultiOps, RoaringBitmap}; use super::query_graph::QueryGraph; use super::ranking_rules::{RankingRule, RankingRuleOutput}; @@ -7,19 +7,18 @@ use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::ExactTerm; use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; -/// FIXME: +/// A ranking rule that produces 3 disjoint buckets: /// -/// - A lot of work done in next_bucket that start_iteration could do. -/// - Consider calling the graph based rule directly from this one. -/// - currently we did exact term, don't forget about prefix -/// - some tests +/// 1. Documents from the universe whose value is exactly the query. +/// 2. Documents from the universe not in (1) whose value starts with the query. +/// 3. Documents from the universe not in (1) or (2). pub struct ExactAttribute { - query_graph: Option, + state: State, } impl ExactAttribute { pub fn new() -> Self { - Self { query_graph: None } + Self { state: Default::default() } } } @@ -30,23 +29,69 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { fn start_iteration( &mut self, - _ctx: &mut SearchContext<'ctx>, + ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, - _universe: &roaring::RoaringBitmap, + universe: &roaring::RoaringBitmap, query: &QueryGraph, ) -> Result<()> { - self.query_graph = Some(query.clone()); + self.state = State::start_iteration(ctx, universe, query)?; + Ok(()) } fn next_bucket( &mut self, - ctx: &mut SearchContext<'ctx>, + _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, universe: &roaring::RoaringBitmap, ) -> Result>> { - // iterate on the nodes of the graph, retain LocatedQueryTermSubset - let query_graph = self.query_graph.as_ref().unwrap(); + let state = std::mem::take(&mut self.state); + let (state, output) = State::next(state, universe); + self.state = state; + + Ok(output) + } + + fn end_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + ) { + self.state = Default::default(); + } +} + +/// Inner state of the ranking rule. +#[derive(Default)] +enum State { + /// State between two iterations + #[default] + Uninitialized, + /// The next call to `next` will output the documents in the universe that have an attribute that is the exact query + ExactAttribute(QueryGraph, Vec), + /// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query, + /// but isn't the exact query. + AttributeStarts(QueryGraph, Vec), + /// The next calls to `next` will output the input universe. 
+ Empty(QueryGraph), +} + +/// The candidates sorted by attributes +/// +/// Each of the bitmap in a single `FieldCandidates` struct applies to the same field. +struct FieldCandidates { + /// The candidates that start with all the words of the query in the field + start_with_exact: RoaringBitmap, + /// The candidates that have the same number of words as the query in the field + exact_word_count: RoaringBitmap, +} + +impl State { + fn start_iteration( + ctx: &mut SearchContext<'_>, + universe: &RoaringBitmap, + query_graph: &QueryGraph, + ) -> Result { let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = Vec::with_capacity(query_graph.nodes.len() as usize); for (_, node) in query_graph.nodes.iter() { @@ -55,11 +100,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { exact_term } else { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + continue; }; exact_term_position_ids.push(( exact_term, @@ -73,14 +114,17 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { exact_term_position_ids.sort_by_key(|(_, _, id)| *id); // bail if there is a "hole" (missing word) in remaining query graph + if let Some((_, _, first_id)) = exact_term_position_ids.first() { + if *first_id != 0 { + return Ok(State::Empty(query_graph.clone())); + } + } else { + return Ok(State::Empty(query_graph.clone())); + } let mut previous_id = 0; for (_, _, id) in exact_term_position_ids.iter().copied() { if id < previous_id || id - previous_id > 1 { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } else { previous_id = id; } @@ -102,11 +146,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { .collect(); for (words, position) in &words_positions { if candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } 'words: for (offset, word) in words.iter().enumerate() { @@ -116,8 +156,11 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { } else { continue 'words; }; + // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of + // longer phrases we'll be losing on precision here. 
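
For illustration, the position bucketing referenced in the note below can look
like this sketch. It is a hypothetical stand-in for `crate::bucketed_position`
(the real thresholds are not shown in this series), chosen only to show why
precision is lost on long phrases: past a small cutoff, nearby positions
collapse into the same key.

    // hypothetical scheme: small positions stay exact, larger ones share
    // coarse buckets, trading precision for fewer distinct DB entries
    fn bucketed_position(relative: u16) -> u16 {
        if relative < 16 {
            relative
        } else {
            16 + (relative as f64).log2() as u16
        }
    }

    fn main() {
        for p in [0u16, 5, 15, 16, 100, 1000] {
            println!("position {p} -> bucket {}", bucketed_position(p));
        }
    }
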
+ let bucketed_position = crate::bucketed_position(position + offset); let word_position_docids = CboRoaringBitmapCodec::bytes_decode( - ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), + ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(), ) .unwrap_or_default(); candidates &= word_position_docids; @@ -127,16 +170,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let candidates = candidates; if candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); - let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); + let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms for fid in searchable_fields_ids { @@ -156,20 +195,59 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { )?; intersection &= &candidates; if !intersection.is_empty() { - candidates_per_attributes.push(intersection); + let candidates_with_exact_word_count = ctx + .index + .field_id_word_count_docids + .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))? + .unwrap_or_default(); + candidates_per_attribute.push(FieldCandidates { + start_with_exact: intersection, + exact_word_count: candidates_with_exact_word_count, + }); } } // note we could have "false positives" where there both exist different attributes that collectively // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. 
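
The two bitmaps gathered per field above are consumed in `State::next` just
below; per field, the split it performs amounts to the following sketch
(roaring implements `&` and `-` on borrowed bitmaps):

    use roaring::RoaringBitmap;

    // documents whose attribute starts with the whole query AND has exactly
    // as many words as the query are "exact"; the rest of the starts-with
    // set only begins with the query
    fn split_buckets(
        start_with_exact: &RoaringBitmap,
        exact_word_count: &RoaringBitmap,
    ) -> (RoaringBitmap, RoaringBitmap) {
        let is_exactly_query = start_with_exact & exact_word_count;
        let starts_with_query = start_with_exact - exact_word_count;
        (is_exactly_query, starts_with_query)
    }

    fn main() {
        let starts: RoaringBitmap = (0u32..10).collect();
        let exact_count: RoaringBitmap = (5u32..15).collect();
        let (exact, starts_only) = split_buckets(&starts, &exact_count);
        assert_eq!(exact.len(), 5);
        assert_eq!(starts_only.len(), 5);
    }
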
- let candidates = MultiOps::union(candidates_per_attributes.into_iter()); - Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) + Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute)) } - fn end_iteration( - &mut self, - _ctx: &mut SearchContext<'ctx>, - _logger: &mut dyn SearchLogger, - ) { + fn next( + state: State, + universe: &RoaringBitmap, + ) -> (State, Option>) { + let (state, output) = match state { + State::Uninitialized => (state, None), + State::ExactAttribute(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.iter().map( + |FieldCandidates { start_with_exact, exact_word_count }| { + start_with_exact & exact_word_count + }, + )); + candidates &= universe; + ( + State::AttributeStarts(query_graph.clone(), candidates_per_attribute), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::AttributeStarts(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map( + |FieldCandidates { mut start_with_exact, exact_word_count }| { + start_with_exact -= exact_word_count; + start_with_exact + }, + )); + candidates &= universe; + ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::Empty(query_graph) => ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }), + ), + }; + (state, output) } } From e58426109a6fc1b3cb5ba544d8e3f1020a74fbd8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:43:16 +0200 Subject: [PATCH 169/234] Fix panics and issues in exactness graph ranking rule --- .../new/ranking_rule_graph/exactness/mod.rs | 48 ++++++++----------- .../extract/extract_word_position_docids.rs | 5 +- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index a1e19a015..3d558e87b 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -1,24 +1,11 @@ +use heed::BytesDecode; use roaring::RoaringBitmap; use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::query_graph::{QueryGraph, QueryNode}; use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; -use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; - -/// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other -/// word than a doc that matches 9 words non exactly but none exactly -/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider -/// -/// "naive vision" -/// condition from one node to another: -/// - word exactly present: cost 0 -/// - word typo/ngram/prefix/missing: cost 1, not remove from query graph, edge btwn the two nodes, return the universe without condition when resolving, destination query term is inside -/// -/// Three strategies: -/// 1. ExactAttribute: word position / word_fid_docid -/// 2. AttributeStart: -/// 3. 
AttributeContainsExact => implementable via `RankingRuleGraphTrait` +use crate::{Result, RoaringBitmapCodec, SearchContext, SearchLogger}; #[derive(Clone, PartialEq, Eq, Hash)] pub enum ExactnessCondition { @@ -42,7 +29,7 @@ fn compute_docids( ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), ExactTerm::Word(word) => { if let Some(word_candidates) = ctx.get_db_word_docids(word)? { - CboRoaringBitmapCodec::deserialize_from(word_candidates)? + RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)? } else { return Ok(Default::default()); } @@ -86,22 +73,29 @@ impl RankingRuleGraphTrait for ExactnessGraph { let skip_condition = ExactnessCondition::Skip(dest_node.clone()); let skip_condition = conditions_interner.insert(skip_condition); - Ok(vec![(0, exact_condition), (1, skip_condition)]) + + Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)]) } fn log_state( - graph: &RankingRuleGraph, - paths: &[Vec>], - dead_ends_cache: &DeadEndsCache, - universe: &RoaringBitmap, - costs: &MappedInterner>, - cost: u64, - logger: &mut dyn SearchLogger, + _graph: &RankingRuleGraph, + _paths: &[Vec>], + _dead_ends_cache: &DeadEndsCache, + _niverse: &RoaringBitmap, + _costs: &MappedInterner>, + _cost: u64, + _logger: &mut dyn SearchLogger, ) { - todo!() } - fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { - todo!() + fn label_for_condition( + _ctx: &mut SearchContext, + condition: &Self::Condition, + ) -> Result { + Ok(match condition { + ExactnessCondition::ExactInAttribute(_) => "exact", + ExactnessCondition::Skip(_) => "skip", + } + .to_owned()) } } diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index cd3ec691b..eef5089bc 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -7,10 +7,7 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{ - absolute_from_relative_position, bucketed_position, relative_from_absolute_position, - DocumentId, Result, -}; +use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result}; /// Extracts the word positions and the documents ids where this word appear. 
/// From 90a6c0149537212422ddff41d78177eba1cdb916 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 18:05:44 +0200 Subject: [PATCH 170/234] Use correct codec in proximity --- .../ranking_rule_graph/proximity/compute_docids.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 8496054b7..07bd102ca 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,14 +1,17 @@ #![allow(clippy::too_many_arguments)] +use std::collections::BTreeSet; + +use heed::BytesDecode; +use roaring::RoaringBitmap; + use super::ProximityCondition; use crate::search::new::interner::Interned; use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Result}; -use roaring::RoaringBitmap; -use std::collections::BTreeSet; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; pub fn compute_docids( ctx: &mut SearchContext, @@ -90,7 +93,8 @@ pub fn compute_docids( continue; } } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? { - let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?; + let left_word_docids = + RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?; if universe.is_disjoint(&left_word_docids) { continue; } From 66ddee4390092bb43ed4cbf95e9719dddbc350c0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:43:42 +0200 Subject: [PATCH 171/234] Fix word_position_docids indexing --- .../index_documents/extract/extract_word_position_docids.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index eef5089bc..734cf8778 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -39,9 +39,8 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (fid, position) = relative_from_absolute_position(position); + let (_fid, position) = relative_from_absolute_position(position); let position = bucketed_position(position); - let position = absolute_from_relative_position(fid, position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } From 130d2061bde64796a54d4b36b6e668d55f1e8fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 14:55:02 +0200 Subject: [PATCH 172/234] Fix indexing of word_position_docid and fid --- milli/src/snapshot_tests.rs | 8 +++ .../extract/extract_word_fid_docids.rs | 48 ++++++++++++++++ .../extract/extract_word_position_docids.rs | 4 +- .../src/update/index_documents/extract/mod.rs | 17 +++++- milli/src/update/index_documents/mod.rs | 57 +++++++++++++++++++ .../src/update/index_documents/typed_chunk.rs | 12 ++++ 6 files changed, 141 insertions(+), 5 deletions(-) create mode 100644 
milli/src/update/index_documents/extract/extract_word_fid_docids.rs diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f7f1a97e6..eb94c4be9 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) }) } +pub fn snap_word_fid_docids(index: &Index) -> String { + make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| { + &format!("{word:<16} {fid:<3} {}", display_bitmap(&b)) + }) +} pub fn snap_field_id_word_count_docids(index: &Index) -> String { make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| { &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) @@ -477,6 +482,9 @@ macro_rules! full_snap_of_db { ($index:ident, word_position_docids) => {{ $crate::snapshot_tests::snap_word_position_docids(&$index) }}; + ($index:ident, word_fid_docids) => {{ + $crate::snapshot_tests::snap_word_fid_docids(&$index) + }}; ($index:ident, field_id_word_count_docids) => {{ $crate::snapshot_tests::snap_field_id_word_count_docids(&$index) }}; diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs new file mode 100644 index 000000000..72b30cddf --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs @@ -0,0 +1,48 @@ +use std::fs::File; +use std::io; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::{relative_from_absolute_position, DocumentId, Result}; + +/// Extracts the word, field id, and the documents ids where this word appear at this field id. +#[logging_timer::time] +pub fn extract_word_fid_docids( + docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_fid_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = DocumentId::from_be_bytes(document_id_bytes); + + for position in read_u32_ne_bytes(value) { + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + let (fid, _) = relative_from_absolute_position(position); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + } + + let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; + + Ok(word_fid_docids_reader) +} diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 734cf8778..80a36c308 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -14,7 +14,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_word_fid_and_position_docids( +pub fn extract_word_position_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { @@ -39,7 +39,7 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (_fid, position) = relative_from_absolute_position(position); + let (_, position) = relative_from_absolute_position(position); let position = bucketed_position(position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 844efed36..db041de6f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_word_docids; +mod extract_word_fid_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; @@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; +use self::extract_word_fid_docids::extract_word_fid_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; -use self::extract_word_position_docids::extract_word_fid_and_position_docids; +use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, @@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>( - docid_word_positions_chunks, + docid_word_positions_chunks.clone(), indexer, lmdb_writer_sx.clone(), - extract_word_fid_and_position_docids, + extract_word_position_docids, merge_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_word_positions_chunks, + indexer, + 
lmdb_writer_sx.clone(), + extract_word_fid_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordFidDocids, + "word-fid-docids", + ); spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ade217beb..235b35fc8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2255,4 +2255,61 @@ mod tests { {"id":1,"catto":"jorts"} "###); } + + #[test] + fn test_word_fid_position() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + {"id": 0, "text": "sun flowers are looking at the sun" }, + {"id": 1, "text": "sun flowers are looking at the sun" }, + {"id": 2, "text": "the sun is shining today" }, + { + "id": 3, + "text": "a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a " + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9"); + db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f"); + + index + .add_documents(documents!([ + {"id": 4, "text": "sun flowers are looking at the sun" }, + {"id": 5, "text2": "sun flowers are looking at the sun" }, + {"id": 6, "text": "b b b" }, + { + "id": 7, + "text2": "a a a a" + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4"); + db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete not all of the documents but some of them. 
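
The `word_fid_docids` snapshots checked above hash entries keyed by the layout
that `extract_word_fid_docids` writes: the raw word bytes followed by the
big-endian field id. A minimal sketch of that key construction:

    // mirrors the extractor: key_buffer receives the UTF-8 word bytes,
    // then fid.to_be_bytes()
    fn word_fid_key(word: &str, fid: u16) -> Vec<u8> {
        let mut key = Vec::with_capacity(word.len() + 2);
        key.extend_from_slice(word.as_bytes());
        key.extend_from_slice(&fid.to_be_bytes());
        key
    }

    fn main() {
        assert_eq!(word_fid_key("sun", 1), b"sun\x00\x01");
    }
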
+        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
+        builder.strategy(DeletionStrategy::AlwaysHard);
+        builder.delete_external_id("0");
+        builder.delete_external_id("3");
+        let result = builder.execute().unwrap();
+        println!("{result:?}");
+
+        wtxn.commit().unwrap();
+
+        db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
+        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
+        db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
+    }
 }
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index b9b11cfa8..14ba021bd 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -35,6 +35,7 @@ pub(crate) enum TypedChunk {
         exact_word_docids_reader: grenad::Reader<File>,
     },
     WordPositionDocids(grenad::Reader<File>),
+    WordFidDocids(grenad::Reader<File>),
     WordPairProximityDocids(grenad::Reader<File>),
     FieldIdFacetStringDocids(grenad::Reader<File>),
     FieldIdFacetNumberDocids(grenad::Reader<File>),
@@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index(
             )?;
             is_merged_database = true;
         }
+        TypedChunk::WordFidDocids(word_fid_docids_iter) => {
+            append_entries_into_database(
+                word_fid_docids_iter,
+                &index.word_fid_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
+            )?;
+            is_merged_database = true;
+        }
         TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
             let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
             indexer.execute(wtxn)?;

From 618c54915d57989f27c705dcf674dde35b9b73a1 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Thu, 6 Apr 2023 15:58:29 +0200
Subject: [PATCH 173/234] exact_attribute: dedup nodes after sorting them

---
 milli/src/search/new/exact_attribute.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs
index fa837272b..b82a60597 100644
--- a/milli/src/search/new/exact_attribute.rs
+++ b/milli/src/search/new/exact_attribute.rs
@@ -113,6 +113,8 @@ impl State {
         }
         exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
+        exact_term_position_ids.dedup_by_key(|(_, _, id)| *id);
+
         // bail if there is a "hole" (missing word) in remaining query graph
         if let Some((_, _, first_id)) = exact_term_position_ids.first() {
             if *first_id != 0 {

From ab09dc0167162752620546a8b12466273eef6f1b Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Thu, 6 Apr 2023 15:59:00 +0200
Subject: [PATCH 174/234] exact_attributes: Add TODOs and additional check
 after review

---
 milli/src/search/new/exact_attribute.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs
index b82a60597..3a31f6a75 100644
--- a/milli/src/search/new/exact_attribute.rs
+++ b/milli/src/search/new/exact_attribute.rs
@@ -166,6 +166,9 @@ impl State {
                 )
                 .unwrap_or_default();
             candidates &= word_position_docids;
+            if candidates.is_empty() {
+                return Ok(State::Empty(query_graph.clone()));
+            }
         }
     }

@@ -197,11 +200,15 @@ impl State {
             )?;
             intersection &= &candidates;
             if !intersection.is_empty() {
+                // TODO: although not really worth it in terms of performance,
+                // it would be good to put this in cache for the sake of consistency
                 let candidates_with_exact_word_count = ctx
                     .index
                     .field_id_word_count_docids
                     .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))?
.unwrap_or_default(); + // TODO: consider if we must store the candidates as arrays, or if there is a way to perform the union + // here. candidates_per_attribute.push(FieldCandidates { start_with_exact: intersection, exact_word_count: candidates_with_exact_word_count, From 31630c85d0f596762ee9157f4932a0a132fc9b3b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 6 Apr 2023 15:59:21 +0200 Subject: [PATCH 175/234] exactness graph rr: Add important TODO/FIXME after review --- milli/src/search/new/ranking_rule_graph/exactness/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 3d558e87b..6639299a1 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -58,6 +58,7 @@ impl RankingRuleGraphTrait for ExactnessGraph { docids, universe_len: universe.len(), start_term_subset: None, + // TODO/FIXME: modify `end_term_subset` to signal to the next ranking rules that the term cannot be removed end_term_subset: dest_node.clone(), }) } From d6585eb10bb83be437bec6108e67280bf59bba9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 6 Apr 2023 15:50:11 +0200 Subject: [PATCH 176/234] Avoid splitting ngrams into their original component words --- .../new/query_term/compute_derivations.rs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index 695c57f08..03d92572e 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -293,9 +293,26 @@ impl Interned { })?; let original_str = ctx.word_interner.get(original).to_owned(); let split_words = find_split_words(ctx, original_str.as_str())?; - let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; let self_mut = ctx.term_interner.get_mut(self); + + // Only add the split words to the derivations if: + // 1. the term is not an ngram; OR + // 2. the term is an ngram, but the split words are different from the ngram's component words + let split_words = if let Some((ngram_words, split_words)) = + self_mut.ngram_words.as_ref().zip(split_words.as_ref()) + { + let Phrase { words } = ctx.phrase_interner.get(*split_words); + if ngram_words.iter().ne(words.iter().flatten()) { + Some(*split_words) + } else { + None + } + } else { + split_words + }; + let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; + self_mut.one_typo = Lazy::Init(one_typo); Ok(()) From 540a396e49c626b35067b1ff112cd664382ae504 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Fri, 7 Apr 2023 11:08:39 +0200 Subject: [PATCH 177/234] Fix indexing bug in words_prefix_position --- milli/src/update/words_prefix_position_docids.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index eb036c52f..2846c76f7 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -81,8 +81,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? 
{ - let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - let (_fid, pos) = relative_from_absolute_position(pos); + let (word, pos) = StrBEU16Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), From d0e9d65025410fe14bff4c283a416a8da1d89a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Fri, 7 Apr 2023 11:09:01 +0200 Subject: [PATCH 178/234] Fix distinct attribute bugs --- milli/src/search/new/bucket_sort.rs | 172 ++++++++++++++++--------- milli/src/search/new/distinct.rs | 9 +- milli/src/search/new/mod.rs | 14 +- milli/src/search/new/tests/distinct.rs | 2 +- 4 files changed, 121 insertions(+), 76 deletions(-) diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index 712825c31..6413ff811 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -6,6 +6,11 @@ use super::SearchContext; use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput}; use crate::Result; +pub struct BucketSortOutput { + pub docids: Vec, + pub all_candidates: RoaringBitmap, +} + pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, mut ranking_rules: Vec>, @@ -14,7 +19,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( from: usize, length: usize, logger: &mut dyn SearchLogger, -) -> Result> { +) -> Result { logger.initial_query(query); logger.ranking_rules(&ranking_rules); logger.initial_universe(universe); @@ -26,7 +31,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( }; if universe.len() < from as u64 { - return Ok(vec![]); + return Ok(BucketSortOutput { docids: vec![], all_candidates: universe.clone() }); } if ranking_rules.is_empty() { if let Some(distinct_fid) = distinct_fid { @@ -42,9 +47,12 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?; results.push(docid); } - return Ok(results); + let mut all_candidates = universe - excluded; + all_candidates.extend(results.iter().copied()); + return Ok(BucketSortOutput { docids: results, all_candidates }); } else { - return Ok(universe.iter().skip(from).take(length).collect()); + let docids = universe.iter().skip(from).take(length).collect(); + return Ok(BucketSortOutput { docids, all_candidates: universe.clone() }); }; } @@ -61,7 +69,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( /// Finish iterating over the current ranking rule, yielding /// control to the parent (or finishing the search if not possible). - /// Update the candidates accordingly and inform the logger. + /// Update the universes accordingly and inform the logger. macro_rules! back { () => { assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); @@ -80,72 +88,35 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( }; } - let mut results = vec![]; + let mut all_candidates = RoaringBitmap::new(); + let mut valid_docids = vec![]; let mut cur_offset = 0usize; - /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset` - /// into account and inform the logger. macro_rules! 
maybe_add_to_results { ($candidates:expr) => { - // First apply the distinct rule on the candidates, reducing the universes if necessary - let candidates = if let Some(distinct_fid) = distinct_fid { - let DistinctOutput { remaining, excluded } = apply_distinct_rule(ctx, distinct_fid, $candidates)?; - for universe in ranking_rule_universes.iter_mut() { - *universe -= &excluded; - } - remaining - } else { - $candidates.clone() - }; - let len = candidates.len(); - // if the candidates are empty, there is nothing to do; - if !candidates.is_empty() { - // if we still haven't reached the first document to return - if cur_offset < from { - // and if no document from this bucket can be returned - if cur_offset + (candidates.len() as usize) < from { - // then just skip the bucket - logger.skip_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &candidates, - ); - } else { - // otherwise, skip some of the documents and add some of the rest, in order of ids - let all_candidates = candidates.iter().collect::>(); - let (skipped_candidates, candidates) = - all_candidates.split_at(from - cur_offset); - logger.skip_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &skipped_candidates.into_iter().collect(), - ); - let candidates = candidates - .iter() - .take(length - results.len()) - .copied() - .collect::>(); - logger.add_to_results(&candidates); - results.extend(&candidates); - } - } else { - // if we have passed the offset already, add some of the documents (up to the limit) - let candidates = - candidates.iter().take(length - results.len()).collect::>(); - logger.add_to_results(&candidates); - results.extend(&candidates); - } - } - cur_offset += len as usize; + maybe_add_to_results( + ctx, + from, + length, + logger, + &mut valid_docids, + &mut all_candidates, + &mut ranking_rule_universes, + &mut ranking_rules, + cur_ranking_rule_index, + &mut cur_offset, + distinct_fid, + $candidates, + )?; }; } - while results.len() < length { + while valid_docids.len() < length { // The universe for this bucket is zero or one element, so we don't need to sort // anything, just extend the results and go back to the parent ranking rule. if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { - maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); - ranking_rule_universes[cur_ranking_rule_index].clear(); + let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]); + maybe_add_to_results!(bucket); back!(); continue; } @@ -171,7 +142,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( || next_bucket.candidates.len() <= 1 || cur_offset + (next_bucket.candidates.len() as usize) < from { - maybe_add_to_results!(&next_bucket.candidates); + maybe_add_to_results!(next_bucket.candidates); continue; } @@ -191,5 +162,80 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( )?; } - Ok(results) + all_candidates |= &ranking_rule_universes[0]; + + Ok(BucketSortOutput { docids: valid_docids, all_candidates }) +} + +/// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset` +/// into account and inform the logger. 
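The offset/limit bookkeeping that this macro now delegates to the function below is the part that is easiest to get wrong. As a standalone sketch of just that windowing arithmetic (hypothetical helper, ignoring the distinct rule and the logger calls that the real function also handles):

    // Keep only the slice of `bucket` that falls inside the requested
    // [from, from + length) window, tracking in `cur_offset` how many
    // documents of earlier buckets have already been seen.
    fn window_bucket(
        bucket: &[u32],
        from: usize,
        length: usize,
        cur_offset: &mut usize,
        out: &mut Vec<u32>,
    ) {
        // documents of this bucket that still fall before `from`
        let skip = from.saturating_sub(*cur_offset).min(bucket.len());
        // never exceed `length` results in total
        let take = length.saturating_sub(out.len());
        out.extend(bucket[skip..].iter().take(take).copied());
        *cur_offset += bucket.len();
    }
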
+#[allow(clippy::too_many_arguments)] +fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>( + ctx: &mut SearchContext<'ctx>, + from: usize, + length: usize, + logger: &mut dyn SearchLogger, + + valid_docids: &mut Vec, + all_candidates: &mut RoaringBitmap, + + ranking_rule_universes: &mut [RoaringBitmap], + ranking_rules: &mut [BoxRankingRule<'ctx, Q>], + cur_ranking_rule_index: usize, + + cur_offset: &mut usize, + distinct_fid: Option, + candidates: RoaringBitmap, +) -> Result<()> { + // First apply the distinct rule on the candidates, reducing the universes if necessary + let candidates = if let Some(distinct_fid) = distinct_fid { + let DistinctOutput { remaining, excluded } = + apply_distinct_rule(ctx, distinct_fid, &candidates)?; + for universe in ranking_rule_universes.iter_mut() { + *universe -= &excluded; + } + remaining + } else { + candidates.clone() + }; + *all_candidates |= &candidates; + // if the candidates are empty, there is nothing to do; + if candidates.is_empty() { + return Ok(()); + } + + // if we still haven't reached the first document to return + if *cur_offset < from { + // and if no document from this bucket can be returned + if *cur_offset + (candidates.len() as usize) < from { + // then just skip the bucket + logger.skip_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &candidates, + ); + } else { + // otherwise, skip some of the documents and add some of the rest, in order of ids + let all_candidates = candidates.iter().collect::>(); + let (skipped_candidates, candidates) = all_candidates.split_at(from - *cur_offset); + + logger.skip_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &skipped_candidates.iter().collect(), + ); + let candidates = + candidates.iter().take(length - valid_docids.len()).copied().collect::>(); + logger.add_to_results(&candidates); + valid_docids.extend(&candidates); + } + } else { + // if we have passed the offset already, add some of the documents (up to the limit) + let candidates = candidates.iter().take(length - valid_docids.len()).collect::>(); + logger.add_to_results(&candidates); + valid_docids.extend(&candidates); + } + + *cur_offset += candidates.len() as usize; + Ok(()) } diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs index 7b77adf49..fbb7550a9 100644 --- a/milli/src/search/new/distinct.rs +++ b/milli/src/search/new/distinct.rs @@ -61,12 +61,9 @@ pub fn distinct_single_docid( } for item in facet_number_values(docid, field_id, index, txn)? { let ((_, _, facet_value), _) = item?; - if let Some(facet_docids) = facet_value_docids( - index.facet_id_string_docids.remap_types(), - txn, - field_id, - facet_value, - )? { + if let Some(facet_docids) = + facet_value_docids(index.facet_id_f64_docids.remap_types(), txn, field_id, facet_value)? 
+ { *excluded |= facet_docids; } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index b307b2434..3beda526b 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -24,6 +24,7 @@ mod tests; use std::collections::HashSet; +use bucket_sort::bucket_sort; use charabia::TokenizerBuilder; use db_cache::DatabaseCache; use graph_based_ranking_rule::{Proximity, Typo}; @@ -34,11 +35,11 @@ pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; -use bucket_sort::bucket_sort; use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; +use self::bucket_sort::BucketSortOutput; use self::exact_attribute::ExactAttribute; use self::graph_based_ranking_rule::Exactness; use self::interner::Interner; @@ -297,7 +298,7 @@ pub fn execute_search( ctx.index.documents_ids(ctx.txn)? }; - let documents_ids = if let Some(query) = query { + let bucket_sort_output = if let Some(query) = query { // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. let mut tokbuilder = TokenizerBuilder::new(); @@ -344,13 +345,14 @@ pub fn execute_search( )? }; + let BucketSortOutput { docids, mut all_candidates } = bucket_sort_output; + // The candidates is the universe unless the exhaustive number of hits // is requested and a distinct attribute is set. - let mut candidates = universe; if exhaustive_number_hits { if let Some(f) = ctx.index.distinct_field(ctx.txn)? { if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) { - candidates = apply_distinct_rule(ctx, distinct_fid, &candidates)?.remaining; + all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining; } } } @@ -358,8 +360,8 @@ pub fn execute_search( Ok(SearchResult { // TODO: correct matching words matching_words: MatchingWords::default(), - candidates, - documents_ids, + candidates: all_candidates, + documents_ids: docids, }) } diff --git a/milli/src/search/new/tests/distinct.rs b/milli/src/search/new/tests/distinct.rs index 4073cf585..74e0cdca0 100644 --- a/milli/src/search/new/tests/distinct.rs +++ b/milli/src/search/new/tests/distinct.rs @@ -531,7 +531,7 @@ fn test_distinct_all_candidates() { let candidates = candidates.iter().collect::>(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); // TODO: this is incorrect! 
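The snapshot change just below encodes the property that the previous value of `candidates` violated: once the distinct rule has run, every returned document id must itself be a member of the reported candidates. A minimal statement of that invariant (hypothetical helper, not part of the test suite):

    use roaring::RoaringBitmap;

    // Every document of the returned page must belong to the candidates.
    fn distinct_invariant(documents_ids: &[u32], candidates: &RoaringBitmap) -> bool {
        documents_ids.iter().all(|docid| candidates.contains(*docid))
    }
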
- insta::assert_snapshot!(format!("{candidates:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); + insta::assert_snapshot!(format!("{candidates:?}"), @"[1, 4, 7, 8, 14, 17, 19, 20, 23, 24, 25, 26]"); } #[test] From 8cb85294ef3ea720baef5f9db6b4b678685e6ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Fri, 7 Apr 2023 11:09:30 +0200 Subject: [PATCH 179/234] Remove unused import warning --- milli/src/update/words_prefix_position_docids.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 2846c76f7..b09555264 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -8,13 +8,13 @@ use heed::{BytesDecode, BytesEncode}; use log::debug; use crate::error::SerializationError; -use crate::heed_codec::{StrBEU16Codec, StrBEU32Codec}; +use crate::heed_codec::StrBEU16Codec; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, CursorClonableMmap, MergeFn, }; -use crate::{relative_from_absolute_position, Index, Result}; +use crate::{Index, Result}; pub struct WordPrefixPositionDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, From 7ab48ed8c772c8ff49eb0cb95759fc69de53d925 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 11 Apr 2023 15:41:44 +0200 Subject: [PATCH 180/234] Matching words fixes --- milli/src/search/new/logger/mod.rs | 2 - .../src/search/new/matches/matching_words.rs | 64 +------------ milli/src/search/new/matches/mod.rs | 4 +- milli/src/search/new/mod.rs | 17 ++-- milli/src/search/new/query_term/mod.rs | 94 +++++++++++++++---- 5 files changed, 94 insertions(+), 87 deletions(-) diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 15cb78784..889e811ad 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -1,8 +1,6 @@ // #[cfg(test)] pub mod detailed; -pub mod test_logger; - use roaring::RoaringBitmap; use super::interner::{Interned, MappedInterner}; diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index d5d1b6906..0da1b3a78 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -5,9 +5,7 @@ use std::ops::RangeInclusive; use charabia::Token; use super::super::interner::Interned; -use super::super::query_term::{ - Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm, -}; +use super::super::query_term::LocatedQueryTerm; use super::super::{DedupInterner, Phrase}; use crate::SearchContext; @@ -33,68 +31,16 @@ pub struct MatchingWords { words: Vec, } -/// Extract and centralize the different phrases and words to match stored in a QueryTerm. -fn extract_matching_terms(term: &QueryTerm) -> (Vec>, Vec>) { - let mut matching_words = Vec::new(); - let mut matching_phrases = Vec::new(); - - // the structure is exhaustively extracted to ensure that no field is missing. - let QueryTerm { - original: _, - is_multiple_words: _, - max_nbr_typos: _, - is_prefix: _, - zero_typo, - one_typo, - two_typo, - } = term; - - // the structure is exhaustively extracted to ensure that no field is missing. 
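The idiom that this removed code relied on, and that `all_computed_derivations` now centralizes, is exhaustive destructuring: every field is bound by name, with no `..` catch-all, so adding a field to the struct turns each extraction site into a compile error. A small illustration with made-up types:

    struct Derivations {
        zero_typo: Vec<u32>,
        one_typo: Vec<u32>,
    }

    fn collect_all(d: &Derivations) -> Vec<u32> {
        // no `..` here: a new field in `Derivations` breaks this line
        // until it is explicitly handled
        let Derivations { zero_typo, one_typo } = d;
        zero_typo.iter().chain(one_typo.iter()).copied().collect()
    }
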
- let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo; - - // zero typo - if let Some(phrase) = phrase { - matching_phrases.push(*phrase); - } - if let Some(zero_typo) = zero_typo { - matching_words.push(*zero_typo); - } - for synonym in synonyms { - matching_phrases.push(*synonym); - } - - // one typo - // the structure is exhaustively extracted to ensure that no field is missing. - if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo { - if let Some(split_words) = split_words { - matching_phrases.push(*split_words); - } - for one_typo in one_typo { - matching_words.push(*one_typo); - } - } - - // two typos - // the structure is exhaustively extracted to ensure that no field is missing. - if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo { - for two_typos in two_typos { - matching_words.push(*two_typos); - } - } - - (matching_phrases, matching_words) -} - impl MatchingWords { pub fn new(ctx: SearchContext, located_terms: Vec) -> Self { let mut phrases = Vec::new(); let mut words = Vec::new(); - // Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms + // Extract and centralize the different phrases and words to match stored in a QueryTerm // and wrap them in dedicated structures. for located_term in located_terms { let term = ctx.term_interner.get(located_term.value); - let (matching_phrases, matching_words) = extract_matching_terms(term); + let (matching_words, matching_phrases) = term.all_computed_derivations(); for matching_phrase in matching_phrases { phrases.push(LocatedMatchingPhrase { @@ -106,8 +52,8 @@ impl MatchingWords { words.push(LocatedMatchingWords { value: matching_words, positions: located_term.positions.clone(), - is_prefix: term.is_prefix, - original_char_count: ctx.word_interner.get(term.original).chars().count(), + is_prefix: term.is_cached_prefix(), + original_char_count: term.original_word(&ctx).chars().count(), }); } diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 8dded0cab..84bdea7ab 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -137,7 +137,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { } // partial match is now full, we keep this matches and we advance positions Some(MatchType::Full { char_len, ids }) => { - let ids: Vec<_> = ids.clone().into_iter().collect(); + let ids: Vec<_> = ids.clone().collect(); // save previously matched tokens as matches. let iter = potential_matches.into_iter().map( |(token_position, word_position, match_len)| Match { @@ -192,7 +192,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { // we match, we save the current token as a match, // then we continue the rest of the tokens. 
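Both hunks in this file are the same simplification: `ids` is already an iterator at this point (a cloned range of word ids, assuming the types introduced by the matching-words rewrite earlier in this patch), so the extra `.into_iter()` was a no-op adaptor. The equivalence on a plain range:

    fn main() {
        let ids = 2u16..=5;
        let a: Vec<u16> = ids.clone().into_iter().collect(); // redundant adaptor
        let b: Vec<u16> = ids.clone().collect();             // equivalent
        assert_eq!(a, b);
    }
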
MatchType::Full { char_len, ids } => { - let ids: Vec<_> = ids.clone().into_iter().collect(); + let ids: Vec<_> = ids.clone().collect(); matches.push(Match { match_len: char_len, ids, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 0445b3e94..e07a17029 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -35,20 +35,20 @@ pub use logger::detailed::DetailedSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm}; -use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; +use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; -use self::bucket_sort::BucketSortOutput; -use self::exact_attribute::ExactAttribute; -use self::graph_based_ranking_rule::Exactness; -use self::interner::Interner; -use self::ranking_rules::{BoxRankingRule, RankingRule}; -use self::resolve_query_graph::compute_query_graph_docids; -use self::sort::Sort; use crate::search::new::distinct::apply_distinct_rule; use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; +use bucket_sort::BucketSortOutput; +use exact_attribute::ExactAttribute; +use graph_based_ranking_rule::Exactness; +use interner::Interner; +use ranking_rules::{BoxRankingRule, RankingRule}; +use resolve_query_graph::compute_query_graph_docids; +use sort::Sort; /// A structure used throughout the execution of a search query. pub struct SearchContext<'ctx> { @@ -361,6 +361,7 @@ pub fn execute_search( Ok(PartialSearchResult { candidates: all_candidates, documents_ids: docids, + located_query_terms, }) } diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 896c70e1b..83320139b 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -188,17 +188,35 @@ impl QueryTermSubset { } let original = ctx.term_interner.get_mut(self.original); - if !self.zero_typo_subset.is_empty() { - let ZeroTypoTerm { - phrase: _, - exact: zero_typo, - prefix_of, - synonyms: _, - use_prefix_db: _, - } = &original.zero_typo; - result.extend(zero_typo.iter().copied()); - result.extend(prefix_of.iter().copied()); - }; + match &self.zero_typo_subset { + NTypoTermSubset::All => { + let ZeroTypoTerm { + phrase: _, + exact: zero_typo, + prefix_of, + synonyms: _, + use_prefix_db: _, + } = &original.zero_typo; + result.extend(zero_typo.iter().copied()); + result.extend(prefix_of.iter().copied()); + } + NTypoTermSubset::Subset { words, phrases: _ } => { + let ZeroTypoTerm { + phrase: _, + exact: zero_typo, + prefix_of, + synonyms: _, + use_prefix_db: _, + } = &original.zero_typo; + if let Some(zero_typo) = zero_typo { + if words.contains(zero_typo) { + result.insert(*zero_typo); + } + } + result.extend(prefix_of.intersection(words).copied()); + } + NTypoTermSubset::Nothing => {} + } match &self.one_typo_subset { NTypoTermSubset::All => { @@ -248,11 +266,24 @@ impl QueryTermSubset { result.extend(phrase.iter().copied()); result.extend(synonyms.iter().copied()); - if !self.one_typo_subset.is_empty() { - let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else { - panic!(); - }; - result.extend(split_words.iter().copied()); + match &self.one_typo_subset { + NTypoTermSubset::All => { + let 
Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else { + panic!(); + }; + result.extend(split_words.iter().copied()); + } + NTypoTermSubset::Subset { phrases, .. } => { + let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else { + panic!(); + }; + if let Some(split_words) = split_words { + if phrases.contains(split_words) { + result.insert(*split_words); + } + } + } + NTypoTermSubset::Nothing => {} } Ok(result) @@ -368,3 +399,34 @@ impl LocatedQueryTerm { interner.get(self.value).is_empty() } } + +impl QueryTerm { + pub fn is_cached_prefix(&self) -> bool { + self.zero_typo.use_prefix_db.is_some() + } + pub fn original_word(&self, ctx: &SearchContext) -> String { + ctx.word_interner.get(self.original).clone() + } + pub fn all_computed_derivations(&self) -> (Vec>, Vec>) { + let mut words = BTreeSet::new(); + let mut phrases = BTreeSet::new(); + + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } = + &self.zero_typo; + words.extend(zero_typo.iter().copied()); + words.extend(prefix_of.iter().copied()); + phrases.extend(phrase.iter().copied()); + phrases.extend(synonyms.iter().copied()); + + if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = &self.one_typo { + words.extend(one_typo.iter().copied()); + phrases.extend(split_words.iter().copied()); + }; + + if let Lazy::Init(TwoTypoTerm { two_typos }) = &self.two_typo { + words.extend(two_typos.iter().copied()); + }; + + (words.into_iter().collect(), phrases.into_iter().collect()) + } +} From 96183e804a10d3b1337f695da822623d5861e6f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 6 Apr 2023 16:24:44 +0200 Subject: [PATCH 181/234] Simplify the logger --- .../search/new/graph_based_ranking_rule.rs | 12 --- milli/src/search/new/logger/detailed.rs | 55 +++----------- milli/src/search/new/logger/mod.rs | 73 ++++--------------- milli/src/search/new/mod.rs | 2 +- .../new/ranking_rule_graph/exactness/mod.rs | 29 +------- .../src/search/new/ranking_rule_graph/mod.rs | 15 ---- .../new/ranking_rule_graph/proximity/mod.rs | 31 +------- .../search/new/ranking_rule_graph/typo/mod.rs | 24 +----- milli/src/search/new/words.rs | 2 - 9 files changed, 38 insertions(+), 205 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 28b4ed1f4..154bcd3b2 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -190,10 +190,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase cur_distance_idx: _, } = &mut state; - let original_universe = universe; let mut universe = universe.clone(); - let original_graph = graph.clone(); let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner); let mut good_paths = vec![]; let mut considered_paths = vec![]; @@ -272,16 +270,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase } })?; - G::log_state( - &original_graph, - &considered_paths, - dead_ends_cache, - original_universe, - all_costs, - cost, - logger, - ); - // We modify the next query graph so that it only contains the subgraph // that was used to compute this bucket // But we only do it in case the bucket length is >1, because otherwise diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 86568d5d2..0efc457f6 100644 --- a/milli/src/search/new/logger/detailed.rs +++ 
b/milli/src/search/new/logger/detailed.rs @@ -1,3 +1,4 @@ +use std::any::Any; use std::fs::File; use std::io::Write; use std::path::PathBuf; @@ -6,6 +7,7 @@ use std::time::Instant; // use rand::random; use roaring::RoaringBitmap; +use crate::search::new::graph_based_ranking_rule::Typo; use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::LocatedQueryTermSubset; @@ -14,6 +16,8 @@ use crate::search::new::ranking_rule_graph::{ RankingRuleGraphTrait, TypoCondition, TypoGraph, }; use crate::search::new::ranking_rules::BoxRankingRule; +use crate::search::new::sort::Sort; +use crate::search::new::words::Words; use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger}; pub enum SearchEvents { @@ -92,7 +96,7 @@ impl SearchLogger for DetailedSearchLogger { self.initial_query_time = Some(Instant::now()); } - fn query_for_universe(&mut self, query: &QueryGraph) { + fn query_for_initial_universe(&mut self, query: &QueryGraph) { self.query_for_universe = Some(query.clone()); } @@ -161,46 +165,12 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::ExtendResults { new: docids.to_vec() }); } - fn log_words_state(&mut self, query_graph: &QueryGraph) { - self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); - } - - fn log_proximity_state( - &mut self, - query_graph: &RankingRuleGraph, - paths_map: &[Vec>], - dead_ends_cache: &DeadEndsCache, - universe: &RoaringBitmap, - costs: &MappedInterner>, - cost: u64, - ) { - self.events.push(SearchEvents::ProximityState { - graph: query_graph.clone(), - paths: paths_map.to_vec(), - dead_ends_cache: dead_ends_cache.clone(), - universe: universe.clone(), - costs: costs.clone(), - cost, - }) - } - - fn log_typo_state( - &mut self, - query_graph: &RankingRuleGraph, - paths_map: &[Vec>], - dead_ends_cache: &DeadEndsCache, - universe: &RoaringBitmap, - costs: &MappedInterner>, - cost: u64, - ) { - self.events.push(SearchEvents::TypoState { - graph: query_graph.clone(), - paths: paths_map.to_vec(), - dead_ends_cache: dead_ends_cache.clone(), - universe: universe.clone(), - costs: costs.clone(), - cost, - }) + /// Logs the internal state of the ranking rule + fn log_ranking_rule_state<'ctx>(&mut self, state: &(dyn Any + 'ctx)) { + if let Some(_words) = state.downcast_ref::() { + } else if let Some(_sort) = state.downcast_ref::>() { + } else if let Some(_typo) = state.downcast_ref::() { + } } } @@ -567,9 +537,8 @@ results.{cur_ranking_rule}{cur_activated_id} {{ file, "{condition_id} {{ shape: class -{} +label }}", - R::label_for_condition(ctx, condition).unwrap() ) .unwrap(); } diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 889e811ad..1ef048a8e 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -1,14 +1,14 @@ // #[cfg(test)] pub mod detailed; +use std::any::Any; + use roaring::RoaringBitmap; -use super::interner::{Interned, MappedInterner}; -use super::query_graph::QueryNode; -use super::ranking_rule_graph::{ - DeadEndsCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoCondition, TypoGraph, -}; +use super::graph_based_ranking_rule::Typo; use super::ranking_rules::BoxRankingRule; +use super::sort::Sort; +use super::words::Words; use super::{RankingRule, RankingRuleQueryTrait}; /// Trait for structure logging the execution of a search query. 
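The default `log_ranking_rule_state` below is built on `std::any::Any` downcasting: a ranking rule hands over its state as `&dyn Any`, and the logger probes for the concrete types it knows how to render, ignoring everything else. The pattern in isolation, with stand-in unit types:

    use std::any::Any;

    struct Words;
    struct Typo;

    fn log_state(state: &dyn Any) {
        if let Some(_words) = state.downcast_ref::<Words>() {
            // render words-specific state here
        } else if let Some(_typo) = state.downcast_ref::<Typo>() {
            // render typo-specific state here
        }
        // states of unknown rule types are silently ignored
    }
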
@@ -16,12 +16,12 @@ pub trait SearchLogger { /// Logs the initial query fn initial_query(&mut self, query: &Q); - /// Logs the query that was used to compute the set of all candidates - fn query_for_universe(&mut self, query: &Q); - /// Logs the value of the initial set of all candidates fn initial_universe(&mut self, universe: &RoaringBitmap); + /// Logs the query that was used to compute the set of all candidates + fn query_for_initial_universe(&mut self, query: &Q); + /// Logs the ranking rules used to perform the search query fn ranking_rules(&mut self, rr: &[BoxRankingRule]); @@ -58,30 +58,13 @@ pub trait SearchLogger { /// Logs the addition of document ids to the final results fn add_to_results(&mut self, docids: &[u32]); - /// Logs the internal state of the words ranking rule - fn log_words_state(&mut self, query_graph: &Q); - - /// Logs the internal state of the proximity ranking rule - fn log_proximity_state( - &mut self, - query_graph: &RankingRuleGraph, - paths: &[Vec>], - dead_ends_cache: &DeadEndsCache, - universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u64, - ); - - /// Logs the internal state of the typo ranking rule - fn log_typo_state( - &mut self, - query_graph: &RankingRuleGraph, - paths: &[Vec>], - dead_ends_cache: &DeadEndsCache, - universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u64, - ); + /// Logs the internal state of the ranking rule + fn log_ranking_rule_state<'ctx>(&mut self, rr: &(dyn Any + 'ctx)) { + if let Some(_words) = rr.downcast_ref::() { + } else if let Some(_sort) = rr.downcast_ref::>() { + } else if let Some(_typo) = rr.downcast_ref::() { + } + } } /// A dummy [`SearchLogger`] which does nothing. @@ -90,7 +73,7 @@ pub struct DefaultSearchLogger; impl SearchLogger for DefaultSearchLogger { fn initial_query(&mut self, _query: &Q) {} - fn query_for_universe(&mut self, _query: &Q) {} + fn query_for_initial_universe(&mut self, _query: &Q) {} fn initial_universe(&mut self, _universe: &RoaringBitmap) {} @@ -130,28 +113,4 @@ impl SearchLogger for DefaultSearchLogger { } fn add_to_results(&mut self, _docids: &[u32]) {} - - fn log_words_state(&mut self, _query_graph: &Q) {} - - fn log_proximity_state( - &mut self, - _query_graph: &RankingRuleGraph, - _paths_map: &[Vec>], - _dead_ends_cache: &DeadEndsCache, - _universe: &RoaringBitmap, - _distances: &MappedInterner>, - _cost: u64, - ) { - } - - fn log_typo_state( - &mut self, - _query_graph: &RankingRuleGraph, - _paths: &[Vec>], - _dead_ends_cache: &DeadEndsCache, - _universe: &RoaringBitmap, - _distances: &MappedInterner>, - _cost: u64, - ) { - } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index e07a17029..5bebd3bff 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -96,7 +96,7 @@ fn resolve_maximally_reduced_query_graph( }; graph.remove_nodes_keep_edges(&nodes_to_remove); - logger.query_for_universe(&graph); + logger.query_for_initial_universe(&graph); let docids = compute_query_graph_docids(ctx, &graph, universe)?; Ok(docids) diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 6639299a1..55c4497dd 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -1,11 +1,10 @@ use heed::BytesDecode; use roaring::RoaringBitmap; -use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::interner::{DedupInterner, Interned, 
MappedInterner}; -use crate::search::new::query_graph::{QueryGraph, QueryNode}; +use super::{ComputedCondition, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; -use crate::{Result, RoaringBitmapCodec, SearchContext, SearchLogger}; +use crate::{Result, RoaringBitmapCodec, SearchContext}; #[derive(Clone, PartialEq, Eq, Hash)] pub enum ExactnessCondition { @@ -77,26 +76,4 @@ impl RankingRuleGraphTrait for ExactnessGraph { Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)]) } - - fn log_state( - _graph: &RankingRuleGraph, - _paths: &[Vec>], - _dead_ends_cache: &DeadEndsCache, - _niverse: &RoaringBitmap, - _costs: &MappedInterner>, - _cost: u64, - _logger: &mut dyn SearchLogger, - ) { - } - - fn label_for_condition( - _ctx: &mut SearchContext, - condition: &Self::Condition, - ) -> Result { - Ok(match condition { - ExactnessCondition::ExactInAttribute(_) => "exact", - ExactnessCondition::Skip(_) => "skip", - } - .to_owned()) - } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 936c3e942..00e759a28 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -28,7 +28,6 @@ use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner}; -use super::logger::SearchLogger; use super::query_term::LocatedQueryTermSubset; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; @@ -86,10 +85,6 @@ impl PartialEq for Edge { pub trait RankingRuleGraphTrait: Sized { type Condition: Sized + Clone + PartialEq + Eq + Hash; - /// Return the label of the given edge condition, to be used when visualising - /// the ranking rule graph. - fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result; - /// Compute the document ids associated with the given edge condition, /// restricted to the given universe. fn resolve_condition( @@ -105,16 +100,6 @@ pub trait RankingRuleGraphTrait: Sized { source_node: Option<&LocatedQueryTermSubset>, dest_node: &LocatedQueryTermSubset, ) -> Result)>>; - - fn log_state( - graph: &RankingRuleGraph, - paths: &[Vec>], - dead_ends_cache: &DeadEndsCache, - universe: &RoaringBitmap, - costs: &MappedInterner>, - cost: u64, - logger: &mut dyn SearchLogger, - ); } /// The graph used by graph-based ranking rules. 
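With `label_for_condition` and `log_state` gone, `RankingRuleGraphTrait` is reduced to its two essential responsibilities: building the weighted edges between adjacent query-term nodes and resolving an edge condition to document ids. Roughly, with heavily simplified signatures (the real methods take a `SearchContext`, a condition interner and located term subsets):

    use std::hash::Hash;

    trait GraphTrait: Sized {
        type Condition: Clone + PartialEq + Eq + Hash;

        /// Document ids matching `condition`, restricted to `universe`.
        fn resolve_condition(condition: &Self::Condition, universe: &[u32]) -> Vec<u32>;

        /// All `(cost, condition)` edges between two adjacent query nodes.
        fn build_edges(source: Option<&str>, dest: &str) -> Vec<(u32, Self::Condition)>;
    }
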
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index cfd3f62bf..ead717a6f 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -3,11 +3,10 @@ pub mod compute_docids; use roaring::RoaringBitmap; -use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; -use crate::search::new::logger::SearchLogger; +use super::{ComputedCondition, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::LocatedQueryTermSubset; -use crate::search::new::{QueryGraph, QueryNode, SearchContext}; +use crate::search::new::SearchContext; use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] @@ -37,28 +36,4 @@ impl RankingRuleGraphTrait for ProximityGraph { ) -> Result)>> { build::build_edges(ctx, conditions_interner, source_term, dest_term) } - - fn log_state( - graph: &RankingRuleGraph, - paths: &[Vec>], - dead_ends_cache: &DeadEndsCache, - universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u64, - logger: &mut dyn SearchLogger, - ) { - logger.log_proximity_state(graph, paths, dead_ends_cache, universe, distances, cost); - } - - fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { - match condition { - ProximityCondition::Uninit { cost, .. } => { - // TODO - Ok(format!("{cost}: cost")) - } - ProximityCondition::Term { term } => { - Ok(format!("{} : exists", term.term_subset.description(ctx))) - } - } - } } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 5d7e0f874..da5198c23 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,11 +1,10 @@ use roaring::RoaringBitmap; -use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; -use crate::search::new::logger::SearchLogger; +use super::{ComputedCondition, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; -use crate::search::new::{QueryGraph, QueryNode, SearchContext}; +use crate::search::new::SearchContext; use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] @@ -76,21 +75,4 @@ impl RankingRuleGraphTrait for TypoGraph { } Ok(edges) } - - fn log_state( - graph: &RankingRuleGraph, - paths: &[Vec>], - dead_ends_cache: &DeadEndsCache, - universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u64, - logger: &mut dyn SearchLogger, - ) { - logger.log_typo_state(graph, paths, dead_ends_cache, universe, distances, cost); - } - - fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { - let TypoCondition { term, nbr_typos } = condition; - Ok(format!("{}: {nbr_typos}", term.term_subset.description(ctx))) - } } diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 0036694c3..c3ae07bcb 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -66,8 +66,6 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { } let Some(query_graph) = &mut self.query_graph else { panic!() }; - 
logger.log_words_state(query_graph); - let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?; let child_query_graph = query_graph.clone(); From 1f813a6f3bcec8d259b1b18e2ec04be9831f1c6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 11 Apr 2023 11:56:31 +0200 Subject: [PATCH 182/234] Simplify implementation of the detailed (=visual) logger --- milli/examples/search.rs | 14 +- milli/src/lib.rs | 2 +- .../search/new/graph_based_ranking_rule.rs | 3 + milli/src/search/new/logger/detailed.rs | 563 ------------------ milli/src/search/new/logger/mod.rs | 91 +-- milli/src/search/new/logger/visual.rs | 525 ++++++++++++++++ milli/src/search/new/mod.rs | 2 +- .../src/search/new/ranking_rule_graph/mod.rs | 2 +- milli/src/search/new/words.rs | 1 + 9 files changed, 567 insertions(+), 636 deletions(-) delete mode 100644 milli/src/search/new/logger/detailed.rs create mode 100644 milli/src/search/new/logger/visual.rs diff --git a/milli/examples/search.rs b/milli/examples/search.rs index ff7d564c6..c9a3c1438 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -1,6 +1,6 @@ -use std::error::Error; use std::io::stdin; use std::time::Instant; +use std::{error::Error, path::Path}; use heed::EnvOpenOptions; use milli::{ @@ -19,7 +19,7 @@ fn main() -> Result<(), Box> { program_name ) }); - let detailed_logger = args.next(); + let detailed_logger_dir = args.next(); let print_documents: bool = if let Some(arg) = args.next() { arg == "print-documents" } else { false }; @@ -34,11 +34,11 @@ fn main() -> Result<(), Box> { let mut default_logger = DefaultSearchLogger; // FIXME: consider resetting the state of the logger between search executions as otherwise panics are possible. // Workaround'd here by recreating the logger on each iteration of the loop - let mut detailed_logger = detailed_logger + let mut detailed_logger = detailed_logger_dir .as_ref() - .map(|logger_dir| milli::DetailedSearchLogger::new(logger_dir)); + .map(|logger_dir| (milli::VisualSearchLogger::default(), logger_dir)); let logger: &mut dyn SearchLogger<_> = - if let Some(detailed_logger) = detailed_logger.as_mut() { + if let Some((detailed_logger, _)) = detailed_logger.as_mut() { detailed_logger } else { &mut default_logger @@ -61,8 +61,8 @@ fn main() -> Result<(), Box> { &mut DefaultSearchLogger, logger, )?; - if let Some(logger) = &detailed_logger { - logger.write_d2_description(&mut ctx); + if let Some((logger, dir)) = detailed_logger { + logger.finish(&mut ctx, Path::new(dir))?; } let elapsed = start.elapsed(); println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index e134c8ceb..868df74e8 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -79,7 +79,7 @@ pub use filter_parser::{Condition, FilterCondition, Span, Token}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; pub use search::new::{ - execute_search, DefaultSearchLogger, DetailedSearchLogger, SearchContext, SearchLogger, + execute_search, DefaultSearchLogger, SearchContext, SearchLogger, VisualSearchLogger, }; use serde_json::Value; pub use {charabia as tokenizer, heed}; diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 154bcd3b2..41a96dd9e 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -270,6 +270,9 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase 
} })?; + logger.log_internal_state(graph); + logger.log_internal_state(&good_paths); + // We modify the next query graph so that it only contains the subgraph // that was used to compute this bucket // But we only do it in case the bucket length is >1, because otherwise diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs deleted file mode 100644 index 0efc457f6..000000000 --- a/milli/src/search/new/logger/detailed.rs +++ /dev/null @@ -1,563 +0,0 @@ -use std::any::Any; -use std::fs::File; -use std::io::Write; -use std::path::PathBuf; -use std::time::Instant; - -// use rand::random; -use roaring::RoaringBitmap; - -use crate::search::new::graph_based_ranking_rule::Typo; -use crate::search::new::interner::{Interned, MappedInterner}; -use crate::search::new::query_graph::QueryNodeData; -use crate::search::new::query_term::LocatedQueryTermSubset; -use crate::search::new::ranking_rule_graph::{ - DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, - RankingRuleGraphTrait, TypoCondition, TypoGraph, -}; -use crate::search::new::ranking_rules::BoxRankingRule; -use crate::search::new::sort::Sort; -use crate::search::new::words::Words; -use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger}; - -pub enum SearchEvents { - RankingRuleStartIteration { - ranking_rule_idx: usize, - query: QueryGraph, - universe: RoaringBitmap, - time: Instant, - }, - RankingRuleNextBucket { - ranking_rule_idx: usize, - universe: RoaringBitmap, - candidates: RoaringBitmap, - time: Instant, - }, - RankingRuleEndIteration { - ranking_rule_idx: usize, - universe: RoaringBitmap, - time: Instant, - }, - ExtendResults { - new: Vec, - }, - WordsState { - query_graph: QueryGraph, - }, - ProximityState { - graph: RankingRuleGraph, - paths: Vec>>, - dead_ends_cache: DeadEndsCache, - universe: RoaringBitmap, - costs: MappedInterner>, - cost: u64, - }, - TypoState { - graph: RankingRuleGraph, - paths: Vec>>, - dead_ends_cache: DeadEndsCache, - universe: RoaringBitmap, - costs: MappedInterner>, - cost: u64, - }, - RankingRuleSkipBucket { - ranking_rule_idx: usize, - candidates: RoaringBitmap, - time: Instant, - }, -} - -pub struct DetailedSearchLogger { - folder_path: PathBuf, - initial_query: Option, - initial_query_time: Option, - query_for_universe: Option, - initial_universe: Option, - ranking_rules_ids: Option>, - events: Vec, -} -impl DetailedSearchLogger { - pub fn new(folder_path: &str) -> Self { - Self { - folder_path: PathBuf::new().join(folder_path), - initial_query: None, - initial_query_time: None, - query_for_universe: None, - initial_universe: None, - ranking_rules_ids: None, - events: vec![], - } - } -} - -impl SearchLogger for DetailedSearchLogger { - fn initial_query(&mut self, query: &QueryGraph) { - self.initial_query = Some(query.clone()); - self.initial_query_time = Some(Instant::now()); - } - - fn query_for_initial_universe(&mut self, query: &QueryGraph) { - self.query_for_universe = Some(query.clone()); - } - - fn initial_universe(&mut self, universe: &RoaringBitmap) { - self.initial_universe = Some(universe.clone()); - } - fn ranking_rules(&mut self, rr: &[BoxRankingRule]) { - self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect()); - } - - fn start_iteration_ranking_rule( - &mut self, - ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule, - query: &QueryGraph, - universe: &RoaringBitmap, - ) { - self.events.push(SearchEvents::RankingRuleStartIteration { - ranking_rule_idx, - query: 
query.clone(), - universe: universe.clone(), - time: Instant::now(), - }) - } - - fn next_bucket_ranking_rule( - &mut self, - ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule, - universe: &RoaringBitmap, - candidates: &RoaringBitmap, - ) { - self.events.push(SearchEvents::RankingRuleNextBucket { - ranking_rule_idx, - universe: universe.clone(), - candidates: candidates.clone(), - time: Instant::now(), - }) - } - fn skip_bucket_ranking_rule( - &mut self, - ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule, - candidates: &RoaringBitmap, - ) { - self.events.push(SearchEvents::RankingRuleSkipBucket { - ranking_rule_idx, - candidates: candidates.clone(), - time: Instant::now(), - }) - } - - fn end_iteration_ranking_rule( - &mut self, - ranking_rule_idx: usize, - _ranking_rule: &dyn RankingRule, - universe: &RoaringBitmap, - ) { - self.events.push(SearchEvents::RankingRuleEndIteration { - ranking_rule_idx, - universe: universe.clone(), - time: Instant::now(), - }) - } - fn add_to_results(&mut self, docids: &[u32]) { - self.events.push(SearchEvents::ExtendResults { new: docids.to_vec() }); - } - - /// Logs the internal state of the ranking rule - fn log_ranking_rule_state<'ctx>(&mut self, state: &(dyn Any + 'ctx)) { - if let Some(_words) = state.downcast_ref::() { - } else if let Some(_sort) = state.downcast_ref::>() { - } else if let Some(_typo) = state.downcast_ref::() { - } - } -} - -impl DetailedSearchLogger { - pub fn write_d2_description(&self, ctx: &mut SearchContext) { - let mut prev_time = self.initial_query_time.unwrap(); - let mut timestamp = vec![]; - fn activated_id(timestamp: &[usize]) -> String { - let mut s = String::new(); - s.push('0'); - for t in timestamp.iter() { - s.push_str(&format!("{t}")); - } - s - } - - let index_path = self.folder_path.join("index.d2"); - let mut file = std::fs::File::create(index_path).unwrap(); - writeln!(&mut file, "direction: right").unwrap(); - writeln!(&mut file, "Initial Query Graph: {{").unwrap(); - let initial_query_graph = self.initial_query.as_ref().unwrap(); - Self::query_graph_d2_description(ctx, initial_query_graph, &mut file); - writeln!(&mut file, "}}").unwrap(); - - writeln!(&mut file, "Query Graph Used To Compute Universe: {{").unwrap(); - let query_graph_for_universe = self.query_for_universe.as_ref().unwrap(); - Self::query_graph_d2_description(ctx, query_graph_for_universe, &mut file); - writeln!(&mut file, "}}").unwrap(); - - let initial_universe = self.initial_universe.as_ref().unwrap(); - writeln!(&mut file, "Initial Universe Length {}", initial_universe.len()).unwrap(); - - writeln!(&mut file, "Control Flow Between Ranking Rules: {{").unwrap(); - writeln!(&mut file, "shape: sequence_diagram").unwrap(); - for (idx, rr_id) in self.ranking_rules_ids.as_ref().unwrap().iter().enumerate() { - writeln!(&mut file, "{idx}: {rr_id}").unwrap(); - } - writeln!(&mut file, "results").unwrap(); - // writeln!(&mut file, "time").unwrap(); - for event in self.events.iter() { - match event { - SearchEvents::RankingRuleStartIteration { ranking_rule_idx, time, .. 
} => { - let _elapsed = time.duration_since(prev_time); - prev_time = *time; - let parent_activated_id = activated_id(×tamp); - timestamp.push(0); - let self_activated_id = activated_id(×tamp); - // writeln!(&mut file, "time.{self_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); - if *ranking_rule_idx != 0 { - let parent_ranking_rule_idx = ranking_rule_idx - 1; - writeln!( - &mut file, - "{parent_ranking_rule_idx}.{parent_activated_id} -> {ranking_rule_idx}.{self_activated_id} : start iteration", - ) - .unwrap(); - } - writeln!( - &mut file, - "{ranking_rule_idx}.{self_activated_id} {{ - style {{ - fill: \"#D8A7B1\" - }} -}}" - ) - .unwrap(); - } - SearchEvents::RankingRuleNextBucket { - ranking_rule_idx, - time, - universe, - candidates, - } => { - let _elapsed = time.duration_since(prev_time); - prev_time = *time; - let old_activated_id = activated_id(×tamp); - // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); - *timestamp.last_mut().unwrap() += 1; - let next_activated_id = activated_id(×tamp); - writeln!(&mut file, - "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket {}/{}", candidates.len(), universe.len()) - .unwrap(); - } - SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates, time } => { - let _elapsed = time.duration_since(prev_time); - prev_time = *time; - let old_activated_id = activated_id(×tamp); - // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); - *timestamp.last_mut().unwrap() += 1; - let next_activated_id = activated_id(×tamp); - let len = candidates.len(); - writeln!(&mut file, - "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : skip bucket ({len})",) - .unwrap(); - } - SearchEvents::RankingRuleEndIteration { ranking_rule_idx, time, .. 
} => { - let _elapsed = time.duration_since(prev_time); - prev_time = *time; - let cur_activated_id = activated_id(×tamp); - // writeln!(&mut file, "time.{cur_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); - - timestamp.pop(); - let parent_activated_id = activated_id(×tamp); - let parent_ranking_rule = if *ranking_rule_idx == 0 { - "start".to_owned() - } else { - format!("{}.{parent_activated_id}", ranking_rule_idx - 1) - }; - writeln!( - &mut file, - "{ranking_rule_idx}.{cur_activated_id} -> {parent_ranking_rule} : end iteration", - ) - .unwrap(); - } - SearchEvents::ExtendResults { new } => { - if new.is_empty() { - continue; - } - let cur_ranking_rule = timestamp.len() - 1; - let cur_activated_id = activated_id(×tamp); - let docids = new.iter().collect::>(); - let len = new.len(); - - writeln!( - &mut file, - "{cur_ranking_rule}.{cur_activated_id} -> results.{cur_ranking_rule}{cur_activated_id} : \"add {len}\" -results.{cur_ranking_rule}{cur_activated_id} {{ - tooltip: \"{docids:?}\" - style {{ - fill: \"#B6E2D3\" - }} -}} -" - ) - .unwrap(); - } - SearchEvents::WordsState { query_graph } => { - let cur_ranking_rule = timestamp.len() - 1; - *timestamp.last_mut().unwrap() += 1; - let cur_activated_id = activated_id(×tamp); - *timestamp.last_mut().unwrap() -= 1; - let id = format!("{cur_ranking_rule}.{cur_activated_id}"); - let new_file_path = self.folder_path.join(format!("{id}.d2")); - let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::query_graph_d2_description(ctx, query_graph, &mut new_file); - writeln!( - &mut file, - "{id} {{ - link: \"{id}.d2.svg\" -}}" - ) - .unwrap(); - } - SearchEvents::ProximityState { - graph, - paths, - dead_ends_cache, - universe, - costs, - cost, - } => { - let cur_ranking_rule = timestamp.len() - 1; - *timestamp.last_mut().unwrap() += 1; - let cur_activated_id = activated_id(×tamp); - *timestamp.last_mut().unwrap() -= 1; - let id = format!("{cur_ranking_rule}.{cur_activated_id}"); - let new_file_path = self.folder_path.join(format!("{id}.d2")); - let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description( - ctx, - graph, - paths, - dead_ends_cache, - costs.clone(), - &mut new_file, - ); - writeln!( - &mut file, - "{id} {{ - link: \"{id}.d2.svg\" - tooltip: \"cost {cost}, universe len: {}\" - }}", - universe.len() - ) - .unwrap(); - } - SearchEvents::TypoState { - graph, - paths, - dead_ends_cache, - universe, - costs, - cost, - } => { - let cur_ranking_rule = timestamp.len() - 1; - *timestamp.last_mut().unwrap() += 1; - let cur_activated_id = activated_id(×tamp); - *timestamp.last_mut().unwrap() -= 1; - let id = format!("{cur_ranking_rule}.{cur_activated_id}"); - let new_file_path = self.folder_path.join(format!("{id}.d2")); - let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description( - ctx, - graph, - paths, - dead_ends_cache, - costs.clone(), - &mut new_file, - ); - writeln!( - &mut file, - "{id} {{ - link: \"{id}.d2.svg\" - tooltip: \"cost {cost}, universe len: {}\" -}}", - universe.len() - ) - .unwrap(); - } - } - } - writeln!(&mut file, "}}").unwrap(); - } - - fn query_node_d2_desc( - ctx: &mut SearchContext, - node_idx: Interned, - node: &QueryNode, - _costs: &[u64], - file: &mut File, - ) { - match &node.data { - QueryNodeData::Term(LocatedQueryTermSubset { - term_subset, - positions: _, - term_ids: _, - }) => { - writeln!( - file, - "{node_idx} : \"{}\" {{ - shape: class - max_nbr_typo: {}", - 
term_subset.description(ctx), - term_subset.max_nbr_typos(ctx) - ) - .unwrap(); - - for w in term_subset.all_single_words_except_prefix_db(ctx).unwrap() { - let w = ctx.word_interner.get(w); - writeln!(file, "{w}: word").unwrap(); - } - for p in term_subset.all_phrases(ctx).unwrap() { - writeln!(file, "{}: phrase", p.description(ctx)).unwrap(); - } - if let Some(w) = term_subset.use_prefix_db(ctx) { - let w = ctx.word_interner.get(w); - writeln!(file, "{w}: prefix db").unwrap(); - } - - writeln!(file, "}}").unwrap(); - } - QueryNodeData::Deleted => panic!(), - QueryNodeData::Start => { - writeln!(file, "{node_idx} : START").unwrap(); - } - QueryNodeData::End => { - writeln!(file, "{node_idx} : END").unwrap(); - } - } - } - fn query_graph_d2_description( - ctx: &mut SearchContext, - query_graph: &QueryGraph, - file: &mut File, - ) { - writeln!(file, "direction: right").unwrap(); - for (node_id, node) in query_graph.nodes.iter() { - if matches!(node.data, QueryNodeData::Deleted) { - continue; - } - Self::query_node_d2_desc(ctx, node_id, node, &[], file); - - for edge in node.successors.iter() { - writeln!(file, "{node_id} -> {edge};\n").unwrap(); - } - } - } - fn ranking_rule_graph_d2_description( - ctx: &mut SearchContext, - graph: &RankingRuleGraph, - paths: &[Vec>], - _dead_ends_cache: &DeadEndsCache, - costs: MappedInterner>, - file: &mut File, - ) { - writeln!(file, "direction: right").unwrap(); - - writeln!(file, "Proximity Graph {{").unwrap(); - for (node_idx, node) in graph.query_graph.nodes.iter() { - if matches!(&node.data, QueryNodeData::Deleted) { - continue; - } - let costs = &costs.get(node_idx); - Self::query_node_d2_desc(ctx, node_idx, node, costs, file); - } - for (_edge_id, edge) in graph.edges_store.iter() { - let Some(edge) = edge else { continue }; - let Edge { source_node, dest_node, condition: details, cost, nodes_to_skip: _ } = edge; - - match &details { - None => { - writeln!(file, "{source_node} -> {dest_node} : \"always cost {cost}\"",) - .unwrap(); - } - Some(condition) => { - // let condition = graph.conditions_interner.get(*condition); - writeln!( - file, - "{source_node} -> {dest_node} : \"{condition} cost {cost}\"", - cost = edge.cost, - ) - .unwrap(); - } - } - } - writeln!(file, "}}").unwrap(); - - // writeln!(file, "costs {{").unwrap(); - // Self::paths_d2_description(graph, paths, file); - // writeln!(file, "}}").unwrap(); - - writeln!(file, "Paths {{").unwrap(); - Self::paths_d2_description(ctx, graph, paths, file); - writeln!(file, "}}").unwrap(); - - // writeln!(file, "Dead-end couples of conditions {{").unwrap(); - // for (i, (e1, e2)) in dead_end_paths_cache.condition_couples.iter().enumerate() { - // writeln!(file, "{i} : \"\" {{").unwrap(); - // Self::condition_d2_description(ctx, graph, e1, file); - // for e2 in e2.iter() { - // Self::condition_d2_description(ctx, graph, e2, file); - // writeln!(file, "{e1} -- {e2}").unwrap(); - // } - // writeln!(file, "}}").unwrap(); - // } - // writeln!(file, "}}").unwrap(); - - // writeln!(file, "Dead-end edges {{").unwrap(); - // for condition in dead_end_paths_cache.conditions.iter() { - // writeln!(file, "{condition}").unwrap(); - // } - // writeln!(file, "}}").unwrap(); - - // writeln!(file, "Dead-end prefixes {{").unwrap(); - // writeln!(file, "}}").unwrap(); - } - fn condition_d2_description( - ctx: &mut SearchContext, - graph: &RankingRuleGraph, - condition_id: Interned, - file: &mut File, - ) { - let condition = graph.conditions_interner.get(condition_id); - writeln!( - file, - "{condition_id} {{ 
-shape: class -label -}}", - ) - .unwrap(); - } - fn paths_d2_description( - ctx: &mut SearchContext, - graph: &RankingRuleGraph, - paths: &[Vec>], - file: &mut File, - ) { - for (path_idx, condition_indexes) in paths.iter().enumerate() { - writeln!(file, "{path_idx} {{").unwrap(); - for condition in condition_indexes.iter() { - Self::condition_d2_description(ctx, graph, *condition, file); - } - for couple_edges in condition_indexes.windows(2) { - let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() }; - writeln!(file, "{src_edge_idx} -> {dest_edge_idx}").unwrap(); - } - writeln!(file, "}}").unwrap(); - } - } -} diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 1ef048a8e..d516a5db9 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -1,84 +1,28 @@ // #[cfg(test)] -pub mod detailed; +pub mod visual; use std::any::Any; use roaring::RoaringBitmap; -use super::graph_based_ranking_rule::Typo; use super::ranking_rules::BoxRankingRule; -use super::sort::Sort; -use super::words::Words; use super::{RankingRule, RankingRuleQueryTrait}; /// Trait for structure logging the execution of a search query. pub trait SearchLogger { /// Logs the initial query - fn initial_query(&mut self, query: &Q); + fn initial_query(&mut self, _query: &Q); /// Logs the value of the initial set of all candidates - fn initial_universe(&mut self, universe: &RoaringBitmap); + fn initial_universe(&mut self, _universe: &RoaringBitmap); /// Logs the query that was used to compute the set of all candidates - fn query_for_initial_universe(&mut self, query: &Q); + fn query_for_initial_universe(&mut self, _query: &Q); /// Logs the ranking rules used to perform the search query - fn ranking_rules(&mut self, rr: &[BoxRankingRule]); + fn ranking_rules(&mut self, _rr: &[BoxRankingRule]); /// Logs the start of a ranking rule's iteration. - fn start_iteration_ranking_rule( - &mut self, - ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule, - query: &Q, - universe: &RoaringBitmap, - ); - /// Logs the end of the computation of a ranking rule bucket - fn next_bucket_ranking_rule( - &mut self, - ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule, - universe: &RoaringBitmap, - candidates: &RoaringBitmap, - ); - /// Logs the skipping of a ranking rule bucket - fn skip_bucket_ranking_rule( - &mut self, - ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule, - candidates: &RoaringBitmap, - ); - /// Logs the end of a ranking rule's iteration. - fn end_iteration_ranking_rule( - &mut self, - ranking_rule_idx: usize, - ranking_rule: &dyn RankingRule, - universe: &RoaringBitmap, - ); - /// Logs the addition of document ids to the final results - fn add_to_results(&mut self, docids: &[u32]); - - /// Logs the internal state of the ranking rule - fn log_ranking_rule_state<'ctx>(&mut self, rr: &(dyn Any + 'ctx)) { - if let Some(_words) = rr.downcast_ref::() { - } else if let Some(_sort) = rr.downcast_ref::>() { - } else if let Some(_typo) = rr.downcast_ref::() { - } - } -} - -/// A dummy [`SearchLogger`] which does nothing. 
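The removed `log_ranking_rule_state` above and the `log_internal_state` hook that replaces it below both rely on `Any` downcasting: the ranking rule hands the logger an opaque `&dyn Any`, and the logger recovers whichever concrete state types it knows how to render. A minimal, self-contained sketch of that pattern, with a hypothetical `FakeGraph` state and `CountingLogger` standing in for milli's real types:

use std::any::Any;

struct FakeGraph {
    nodes: usize,
}

struct CountingLogger {
    graphs_seen: usize,
}

impl CountingLogger {
    // Mirrors the shape of `SearchLogger::log_internal_state`: accept an
    // opaque `&dyn Any`, then try to recover each concrete type of interest.
    fn log_internal_state(&mut self, state: &dyn Any) {
        if let Some(graph) = state.downcast_ref::<FakeGraph>() {
            self.graphs_seen += 1;
            println!("saw a graph with {} nodes", graph.nodes);
        }
    }
}

fn main() {
    let mut logger = CountingLogger { graphs_seen: 0 };
    logger.log_internal_state(&FakeGraph { nodes: 3 });
    logger.log_internal_state(&"not a graph"); // silently ignored
    assert_eq!(logger.graphs_seen, 1);
}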
-pub struct DefaultSearchLogger; - -impl SearchLogger for DefaultSearchLogger { - fn initial_query(&mut self, _query: &Q) {} - - fn query_for_initial_universe(&mut self, _query: &Q) {} - - fn initial_universe(&mut self, _universe: &RoaringBitmap) {} - - fn ranking_rules(&mut self, _rr: &[BoxRankingRule]) {} - fn start_iteration_ranking_rule( &mut self, _ranking_rule_idx: usize, @@ -87,7 +31,7 @@ impl SearchLogger for DefaultSearchLogger { _universe: &RoaringBitmap, ) { } - + /// Logs the end of the computation of a ranking rule bucket fn next_bucket_ranking_rule( &mut self, _ranking_rule_idx: usize, @@ -96,6 +40,7 @@ impl SearchLogger for DefaultSearchLogger { _candidates: &RoaringBitmap, ) { } + /// Logs the skipping of a ranking rule bucket fn skip_bucket_ranking_rule( &mut self, _ranking_rule_idx: usize, @@ -103,7 +48,7 @@ impl SearchLogger for DefaultSearchLogger { _candidates: &RoaringBitmap, ) { } - + /// Logs the end of a ranking rule's iteration. fn end_iteration_ranking_rule( &mut self, _ranking_rule_idx: usize, @@ -111,6 +56,26 @@ impl SearchLogger for DefaultSearchLogger { _universe: &RoaringBitmap, ) { } + /// Logs the addition of document ids to the final results + fn add_to_results(&mut self, _docids: &[u32]); + + /// Logs an internal state in the search algorithms + fn log_internal_state(&mut self, _rr: &dyn Any); +} + +/// A dummy [`SearchLogger`] which does nothing. +pub struct DefaultSearchLogger; + +impl SearchLogger for DefaultSearchLogger { + fn initial_query(&mut self, _query: &Q) {} + + fn initial_universe(&mut self, _universe: &RoaringBitmap) {} + + fn query_for_initial_universe(&mut self, _query: &Q) {} + + fn ranking_rules(&mut self, _rr: &[BoxRankingRule]) {} fn add_to_results(&mut self, _docids: &[u32]) {} + + fn log_internal_state(&mut self, _rr: &dyn Any) {} } diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs new file mode 100644 index 000000000..17f7ef76c --- /dev/null +++ b/milli/src/search/new/logger/visual.rs @@ -0,0 +1,525 @@ +use std::any::Any; +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::{Path, PathBuf}; +use std::time::Instant; + +// use rand::random; +use roaring::RoaringBitmap; + +use crate::search::new::interner::Interned; +use crate::search::new::query_graph::QueryNodeData; +use crate::search::new::query_term::LocatedQueryTermSubset; +use crate::search::new::ranking_rule_graph::{ + Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, + TypoCondition, TypoGraph, +}; +use crate::search::new::ranking_rules::BoxRankingRule; +use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger}; +use crate::Result; + +pub enum SearchEvents { + RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 }, + RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 }, + RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 }, + RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 }, + ExtendResults { new: Vec }, + WordsGraph { query_graph: QueryGraph }, + ProximityGraph { graph: RankingRuleGraph }, + ProximityPaths { paths: Vec>> }, + TypoGraph { graph: RankingRuleGraph }, + TypoPaths { paths: Vec>> }, +} + +enum Location { + Words, + Typo, + Proximity, + Other, +} + +#[derive(Default)] +pub struct VisualSearchLogger { + initial_query: Option, + initial_query_time: Option, + query_for_universe: Option, + initial_universe: Option, + ranking_rules_ids: Option>, + events: Vec, + 
location: Vec, +} + +impl SearchLogger for VisualSearchLogger { + fn initial_query(&mut self, query: &QueryGraph) { + self.initial_query = Some(query.clone()); + self.initial_query_time = Some(Instant::now()); + } + + fn query_for_initial_universe(&mut self, query: &QueryGraph) { + self.query_for_universe = Some(query.clone()); + } + + fn initial_universe(&mut self, universe: &RoaringBitmap) { + self.initial_universe = Some(universe.clone()); + } + fn ranking_rules(&mut self, rr: &[BoxRankingRule]) { + self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect()); + } + + fn start_iteration_ranking_rule( + &mut self, + ranking_rule_idx: usize, + ranking_rule: &dyn RankingRule, + _query: &QueryGraph, + universe: &RoaringBitmap, + ) { + self.events.push(SearchEvents::RankingRuleStartIteration { + ranking_rule_idx, + universe_len: universe.len(), + }); + self.location.push(match ranking_rule.id().as_str() { + "words" => Location::Words, + "typo" => Location::Typo, + "proximity" => Location::Proximity, + _ => Location::Other, + }); + } + + fn next_bucket_ranking_rule( + &mut self, + ranking_rule_idx: usize, + _ranking_rule: &dyn RankingRule, + universe: &RoaringBitmap, + bucket: &RoaringBitmap, + ) { + self.events.push(SearchEvents::RankingRuleNextBucket { + ranking_rule_idx, + universe_len: universe.len(), + bucket_len: bucket.len(), + }); + } + fn skip_bucket_ranking_rule( + &mut self, + ranking_rule_idx: usize, + _ranking_rule: &dyn RankingRule, + bucket: &RoaringBitmap, + ) { + self.events.push(SearchEvents::RankingRuleSkipBucket { + ranking_rule_idx, + bucket_len: bucket.len(), + }) + } + + fn end_iteration_ranking_rule( + &mut self, + ranking_rule_idx: usize, + _ranking_rule: &dyn RankingRule, + universe: &RoaringBitmap, + ) { + self.events.push(SearchEvents::RankingRuleEndIteration { + ranking_rule_idx, + universe_len: universe.len(), + }); + self.location.pop(); + } + fn add_to_results(&mut self, docids: &[u32]) { + self.events.push(SearchEvents::ExtendResults { new: docids.to_vec() }); + } + + /// Logs the internal state of the ranking rule + fn log_internal_state(&mut self, state: &dyn Any) { + let Some(location) = self.location.last() else { return }; + match location { + Location::Words => { + if let Some(query_graph) = state.downcast_ref::() { + self.events.push(SearchEvents::WordsGraph { query_graph: query_graph.clone() }); + } + } + Location::Typo => { + if let Some(graph) = state.downcast_ref::>() { + self.events.push(SearchEvents::TypoGraph { graph: graph.clone() }); + } + if let Some(paths) = state.downcast_ref::>>>() { + self.events.push(SearchEvents::TypoPaths { paths: paths.clone() }); + } + } + Location::Proximity => { + if let Some(graph) = state.downcast_ref::>() { + self.events.push(SearchEvents::ProximityGraph { graph: graph.clone() }); + } + if let Some(paths) = state.downcast_ref::>>>() + { + self.events.push(SearchEvents::ProximityPaths { paths: paths.clone() }); + } + } + Location::Other => {} + } + } +} + +impl VisualSearchLogger { + pub fn finish<'ctx>(self, ctx: &'ctx mut SearchContext<'ctx>, folder: &Path) -> Result<()> { + let mut f = DetailedLoggerFinish::new(ctx, folder)?; + f.finish(self)?; + Ok(()) + } +} + +struct DetailedLoggerFinish<'ctx> { + ctx: &'ctx mut SearchContext<'ctx>, + /// The folder where all the files should be printed + folder_path: PathBuf, + /// The main file visualising the search request + index_file: BufWriter, + /// A vector of counters where each counter at index i represents the number of times + /// that the ranking rule 
at idx i-1 was called since its last call to `start_iteration`. + /// This is used to uniquely identify a point in the sequence diagram. + rr_action_counter: Vec, + /// The file storing information about the internal state of the latest active ranking rule + file_for_internal_state: Option>, +} + +impl<'ctx> DetailedLoggerFinish<'ctx> { + fn cur_file(&mut self) -> &mut BufWriter { + if let Some(file) = self.file_for_internal_state.as_mut() { + file + } else { + &mut self.index_file + } + } + fn pop_rr_action(&mut self) { + self.file_for_internal_state = None; + self.rr_action_counter.pop(); + } + fn push_new_rr_action(&mut self) { + self.file_for_internal_state = None; + self.rr_action_counter.push(0); + } + fn increment_cur_rr_action(&mut self) { + self.file_for_internal_state = None; + if let Some(c) = self.rr_action_counter.last_mut() { + *c += 1; + } + } + fn id_of_timestamp(&self) -> String { + let mut s = String::new(); + for t in self.rr_action_counter.iter() { + s.push_str(&format!("{t}_")); + } + s + } + fn id_of_extend_results(&self) -> String { + let mut s = String::new(); + s.push_str("results.\""); + s.push_str(&self.id_of_timestamp()); + s.push('"'); + s + } + fn id_of_last_rr_action(&self) -> String { + let mut s = String::new(); + let rr_id = if self.rr_action_counter.is_empty() { + "start.\"".to_owned() + } else { + format!("{}.\"", self.rr_action_counter.len() - 1) + }; + s.push_str(&rr_id); + s.push_str(&self.id_of_timestamp()); + s.push('"'); + s + } + fn make_new_file_for_internal_state_if_needed(&mut self) -> Result<()> { + if self.file_for_internal_state.is_some() { + return Ok(()); + } + let timestamp = self.id_of_timestamp(); + let id = self.id_of_last_rr_action(); + let new_file_path = self.folder_path.join(format!("{timestamp}.d2")); + self.file_for_internal_state = Some(BufWriter::new(File::create(new_file_path)?)); + + writeln!( + &mut self.index_file, + "{id} {{ + link: \"{timestamp}.d2.svg\" +}}" + )?; + Ok(()) + } + fn new(ctx: &'ctx mut SearchContext<'ctx>, folder_path: &Path) -> Result { + let index_path = folder_path.join("index.d2"); + let index_file = BufWriter::new(File::create(index_path)?); + + Ok(Self { + ctx, + folder_path: folder_path.to_owned(), + index_file, + rr_action_counter: vec![], + file_for_internal_state: None, + }) + } + + fn finish(&mut self, logger: VisualSearchLogger) -> Result<()> { + writeln!(&mut self.index_file, "direction: right")?; + if let Some(qg) = logger.initial_query { + writeln!(&mut self.index_file, "Initial Query Graph: {{")?; + self.write_query_graph(&qg)?; + writeln!(&mut self.index_file, "}}")?; + } + if let Some(qg) = logger.query_for_universe { + writeln!(&mut self.index_file, "Query Graph Used To Compute Universe: {{")?; + self.write_query_graph(&qg)?; + writeln!(&mut self.index_file, "}}")?; + } + let Some(ranking_rules_ids) = logger.ranking_rules_ids else { return Ok(()) }; + writeln!(&mut self.index_file, "Control Flow Between Ranking Rules: {{")?; + writeln!(&mut self.index_file, "shape: sequence_diagram")?; + writeln!(&mut self.index_file, "start")?; + for (idx, rr_id) in ranking_rules_ids.iter().enumerate() { + writeln!(&mut self.index_file, "{idx}: {rr_id}")?; + } + writeln!(&mut self.index_file, "results")?; + for event in logger.events { + self.write_event(event)?; + } + writeln!(&mut self.index_file, "}}")?; + Ok(()) + } + + fn write_event(&mut self, e: SearchEvents) -> Result<()> { + match e { + SearchEvents::RankingRuleStartIteration { ranking_rule_idx, universe_len } => { + assert!(ranking_rule_idx == 
self.rr_action_counter.len()); + self.write_start_iteration(universe_len)?; + } + SearchEvents::RankingRuleNextBucket { ranking_rule_idx, universe_len, bucket_len } => { + assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); + self.write_next_bucket(bucket_len, universe_len)?; + } + SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, bucket_len } => { + assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); + self.write_skip_bucket(bucket_len)?; + } + SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => { + assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); + self.write_end_iteration()?; + } + SearchEvents::ExtendResults { new } => { + self.write_extend_results(new)?; + } + SearchEvents::WordsGraph { query_graph } => self.write_words_graph(query_graph)?, + SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?, + SearchEvents::ProximityPaths { paths } => { + self.write_rr_graph_paths::(paths)?; + } + SearchEvents::TypoGraph { graph } => self.write_rr_graph(&graph)?, + SearchEvents::TypoPaths { paths } => { + self.write_rr_graph_paths::(paths)?; + } + } + Ok(()) + } + fn write_query_graph(&mut self, qg: &QueryGraph) -> Result<()> { + writeln!(self.cur_file(), "direction: right")?; + for (node_id, node) in qg.nodes.iter() { + if matches!(node.data, QueryNodeData::Deleted) { + continue; + } + self.write_query_node(node_id, node)?; + + for edge in node.successors.iter() { + writeln!(self.cur_file(), "{node_id} -> {edge};\n").unwrap(); + } + } + Ok(()) + } + + fn write_start_iteration(&mut self, _universe_len: u64) -> Result<()> { + let parent_action_id = self.id_of_last_rr_action(); + self.push_new_rr_action(); + let self_action_id = self.id_of_last_rr_action(); + writeln!(&mut self.index_file, "{parent_action_id} -> {self_action_id} : start iteration")?; + writeln!( + &mut self.index_file, + "{self_action_id} {{ +style {{ +fill: \"#D8A7B1\" +}} +}}" + )?; + + Ok(()) + } + fn write_next_bucket(&mut self, bucket_len: u64, universe_len: u64) -> Result<()> { + let cur_action_id = self.id_of_last_rr_action(); + self.increment_cur_rr_action(); + let next_action_id = self.id_of_last_rr_action(); + writeln!( + &mut self.index_file, + "{cur_action_id} -> {next_action_id} : next bucket {bucket_len}/{universe_len}" + )?; + + Ok(()) + } + fn write_skip_bucket(&mut self, bucket_len: u64) -> Result<()> { + let cur_action_id = self.id_of_last_rr_action(); + self.increment_cur_rr_action(); + let next_action_id = self.id_of_last_rr_action(); + writeln!( + &mut self.index_file, + "{cur_action_id} -> {next_action_id} : skip bucket ({bucket_len})" + )?; + + Ok(()) + } + fn write_end_iteration(&mut self) -> Result<()> { + let cur_action_id = self.id_of_last_rr_action(); + self.pop_rr_action(); + let parent_action_id = self.id_of_last_rr_action(); + + writeln!(&mut self.index_file, "{cur_action_id} -> {parent_action_id} : end iteration",)?; + Ok(()) + } + fn write_extend_results(&mut self, new: Vec) -> Result<()> { + if new.is_empty() { + return Ok(()); + } + + let cur_action_id = self.id_of_last_rr_action(); + let results_id = self.id_of_extend_results(); + let docids = new.iter().collect::>(); + let len = new.len(); + + writeln!( + &mut self.index_file, + "{cur_action_id} -> {results_id} : \"add {len}\" +{results_id} {{ +tooltip: \"{docids:?}\" +style {{ +fill: \"#B6E2D3\" +}} +}} +" + )?; + Ok(()) + } + + fn write_query_node(&mut self, node_idx: Interned, node: &QueryNode) -> Result<()> { + let Self { + ctx, index_file, 
file_for_internal_state: active_ranking_rule_state_file, .. + } = self; + let file = if let Some(file) = active_ranking_rule_state_file.as_mut() { + file + } else { + index_file + }; + match &node.data { + QueryNodeData::Term(LocatedQueryTermSubset { + term_subset, + positions: _, + term_ids: _, + }) => { + writeln!( + file, + "{node_idx} : \"{}\" {{ + shape: class + max_nbr_typo: {}", + term_subset.description(ctx), + term_subset.max_nbr_typos(ctx) + )?; + + for w in term_subset.all_single_words_except_prefix_db(ctx)? { + let w = ctx.word_interner.get(w); + writeln!(file, "{w}: word")?; + } + for p in term_subset.all_phrases(ctx)? { + writeln!(file, "{}: phrase", p.description(ctx))?; + } + if let Some(w) = term_subset.use_prefix_db(ctx) { + let w = ctx.word_interner.get(w); + writeln!(file, "{w}: prefix db")?; + } + + writeln!(file, "}}")?; + } + QueryNodeData::Deleted => panic!(), + QueryNodeData::Start => { + writeln!(file, "{node_idx} : START")?; + } + QueryNodeData::End => { + writeln!(file, "{node_idx} : END")?; + } + } + Ok(()) + } + fn write_words_graph(&mut self, qg: QueryGraph) -> Result<()> { + self.make_new_file_for_internal_state_if_needed()?; + + self.write_query_graph(&qg)?; + + Ok(()) + } + fn write_rr_graph( + &mut self, + graph: &RankingRuleGraph, + ) -> Result<()> { + self.make_new_file_for_internal_state_if_needed()?; + + writeln!(self.cur_file(), "direction: right")?; + + writeln!(self.cur_file(), "Graph {{")?; + for (node_idx, node) in graph.query_graph.nodes.iter() { + if matches!(&node.data, QueryNodeData::Deleted) { + continue; + } + self.write_query_node(node_idx, node)?; + } + for (_edge_id, edge) in graph.edges_store.iter() { + let Some(edge) = edge else { continue }; + let Edge { source_node, dest_node, condition: details, cost, nodes_to_skip: _ } = edge; + + match &details { + None => { + writeln!( + self.cur_file(), + "{source_node} -> {dest_node} : \"always cost {cost}\"", + )?; + } + Some(condition) => { + writeln!( + self.cur_file(), + "{source_node} -> {dest_node} : \"{condition} cost {cost}\"", + cost = edge.cost, + )?; + } + } + } + writeln!(self.cur_file(), "}}")?; + + Ok(()) + } + + fn write_rr_graph_paths( + &mut self, + paths: Vec>>, + ) -> Result<()> { + self.make_new_file_for_internal_state_if_needed()?; + let file = if let Some(file) = self.file_for_internal_state.as_mut() { + file + } else { + &mut self.index_file + }; + writeln!(file, "Path {{")?; + for (path_idx, condition_indexes) in paths.iter().enumerate() { + writeln!(file, "{path_idx} {{")?; + for condition in condition_indexes.iter() { + writeln!(file, "{condition}")?; + } + for couple_edges in condition_indexes.windows(2) { + let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() }; + writeln!(file, "{src_edge_idx} -> {dest_edge_idx}")?; + } + writeln!(file, "}}")?; + } + writeln!(file, "}}")?; + Ok(()) + } +} diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 5bebd3bff..7b15bcaab 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -31,7 +31,7 @@ use db_cache::DatabaseCache; use graph_based_ranking_rule::{Proximity, Typo}; use heed::RoTxn; use interner::DedupInterner; -pub use logger::detailed::DetailedSearchLogger; +pub use logger::visual::VisualSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm}; diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs 
b/milli/src/search/new/ranking_rule_graph/mod.rs index 00e759a28..6a9bfff93 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -82,7 +82,7 @@ impl PartialEq for Edge { /// It mostly describes how to: /// 1. Retrieve the set of edges (their cost and condition) between two nodes. /// 2. Compute the document ids satisfying a condition -pub trait RankingRuleGraphTrait: Sized { +pub trait RankingRuleGraphTrait: Sized + 'static { type Condition: Sized + Clone + PartialEq + Eq + Hash; /// Compute the document ids associated with the given edge condition, diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index c3ae07bcb..39bbc823d 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -65,6 +65,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { return Ok(None); } let Some(query_graph) = &mut self.query_graph else { panic!() }; + logger.log_internal_state(query_graph); let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?; From 244003e36f4ef872f6b96bdb1d870a5b344c9d18 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 15:26:47 +0200 Subject: [PATCH 183/234] Refactor DB cache to return Roaring Bitmaps directly instead of byte slices --- milli/src/search/new/db_cache.rs | 72 +++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index a0dde4686..c1862244a 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -4,10 +4,13 @@ use std::hash::Hash; use fxhash::FxHashMap; use heed::types::ByteSlice; -use heed::{BytesEncode, Database, RoTxn}; +use heed::{BytesDecode, BytesEncode, Database, RoTxn}; +use roaring::RoaringBitmap; use super::interner::Interned; -use crate::{Result, SearchContext}; +use crate::{ + CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, +}; /// A cache storing pointers to values in the LMDB databases. /// @@ -65,27 +68,31 @@ impl<'ctx> SearchContext<'ctx> { } /// Retrieve or insert the given value in the `word_docids` database. - pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { + pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { DatabaseCache::get_value( self.txn, word, self.word_interner.get(word).as_str(), &mut self.db_cache.word_docids, self.index.word_docids.remap_data_type::(), - ) + )? + .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } /// Retrieve or insert the given value in the `word_prefix_docids` database. pub fn get_db_word_prefix_docids( &mut self, prefix: Interned, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, prefix, self.word_interner.get(prefix).as_str(), &mut self.db_cache.word_prefix_docids, self.index.word_prefix_docids.remap_data_type::(), - ) + )? + .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } pub fn get_db_word_pair_proximity_docids( @@ -93,7 +100,7 @@ impl<'ctx> SearchContext<'ctx> { word1: Interned, word2: Interned, proximity: u8, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (proximity, word1, word2), @@ -104,7 +111,32 @@ impl<'ctx> SearchContext<'ctx> { ), &mut self.db_cache.word_pair_proximity_docids, self.index.word_pair_proximity_docids.remap_data_type::(), - ) + )? 
+ .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() + } + + pub fn get_db_word_pair_proximity_docids_len( + &mut self, + word1: Interned, + word2: Interned, + proximity: u8, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (proximity, word1, word2), + &( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(word2).as_str(), + ), + &mut self.db_cache.word_pair_proximity_docids, + self.index.word_pair_proximity_docids.remap_data_type::(), + )? + .map(|bytes| { + CboRoaringBitmapLenCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()) + }) + .transpose() } pub fn get_db_word_prefix_pair_proximity_docids( @@ -112,7 +144,7 @@ impl<'ctx> SearchContext<'ctx> { word1: Interned, prefix2: Interned, proximity: u8, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (proximity, word1, prefix2), @@ -123,14 +155,16 @@ impl<'ctx> SearchContext<'ctx> { ), &mut self.db_cache.word_prefix_pair_proximity_docids, self.index.word_prefix_pair_proximity_docids.remap_data_type::(), - ) + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } pub fn get_db_prefix_word_pair_proximity_docids( &mut self, left_prefix: Interned, right: Interned, proximity: u8, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (proximity, left_prefix, right), @@ -141,34 +175,40 @@ impl<'ctx> SearchContext<'ctx> { ), &mut self.db_cache.prefix_word_pair_proximity_docids, self.index.prefix_word_pair_proximity_docids.remap_data_type::(), - ) + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } pub fn get_db_word_position_docids( &mut self, word: Interned, position: u16, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (word, position), &(self.word_interner.get(word).as_str(), position), &mut self.db_cache.word_position_docids, self.index.word_position_docids.remap_data_type::(), - ) + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } pub fn get_db_word_fid_docids( &mut self, word: Interned, fid: u16, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (word, fid), &(self.word_interner.get(word).as_str(), fid), &mut self.db_cache.word_fid_docids, self.index.word_fid_docids.remap_data_type::(), - ) + )? 
+ .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } } From e7ff987c46ee5d28caeb7c2c53435a3f4524510f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 15:31:40 +0200 Subject: [PATCH 184/234] Update call sites --- milli/src/search/new/exact_attribute.rs | 15 ++++----------- .../search/new/query_term/compute_derivations.rs | 14 ++++++-------- .../new/ranking_rule_graph/exactness/mod.rs | 3 +-- .../proximity/compute_docids.rs | 15 ++++++--------- milli/src/search/new/resolve_query_graph.rs | 15 ++++++--------- 5 files changed, 23 insertions(+), 39 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 3a31f6a75..bc0195ebc 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -1,11 +1,10 @@ -use heed::BytesDecode; use roaring::{MultiOps, RoaringBitmap}; use super::query_graph::QueryGraph; use super::ranking_rules::{RankingRule, RankingRuleOutput}; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::ExactTerm; -use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; +use crate::{Result, SearchContext, SearchLogger}; /// A ranking rule that produces 3 disjoint buckets: /// @@ -161,10 +160,8 @@ impl State { // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of // longer phrases we'll be losing on precision here. let bucketed_position = crate::bucketed_position(position + offset); - let word_position_docids = CboRoaringBitmapCodec::bytes_decode( - ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(), - ) - .unwrap_or_default(); + let word_position_docids = + ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(); candidates &= word_position_docids; if candidates.is_empty() { return Ok(State::Empty(query_graph.clone())); @@ -191,11 +188,7 @@ impl State { // ignore stop words words in phrases .flatten() .map(|word| -> Result<_> { - Ok(ctx - .get_db_word_fid_docids(*word, fid)? 
- .map(CboRoaringBitmapCodec::bytes_decode) - .unwrap_or_default() - .unwrap_or_default()) + Ok(ctx.get_db_word_fid_docids(*word, fid)?.unwrap_or_default()) }), )?; intersection &= &candidates; diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index 03d92572e..12b8c3832 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -1,17 +1,17 @@ -use fst::automaton::Str; -use fst::{Automaton, IntoStreamer, Streamer}; -use heed::types::DecodeIgnore; -use heed::BytesDecode; use std::borrow::Cow; use std::collections::BTreeSet; use std::ops::ControlFlow; +use fst::automaton::Str; +use fst::{Automaton, IntoStreamer, Streamer}; +use heed::types::DecodeIgnore; + use super::*; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::new::query_term::TwoTypoTerm; use crate::search::new::{limits, SearchContext}; use crate::search::{build_dfa, get_first}; -use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH}; +use crate::{Result, MAX_WORD_LENGTH}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum NumberOfTypos { @@ -385,9 +385,7 @@ fn split_best_frequency( let left = ctx.word_interner.insert(left.to_owned()); let right = ctx.word_interner.insert(right.to_owned()); - if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? { - let frequency = - CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?; + if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? { if best.map_or(true, |(old, _, _)| frequency > old) { best = Some((frequency, left, right)); } diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 55c4497dd..4a3dd6549 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -1,4 +1,3 @@ -use heed::BytesDecode; use roaring::RoaringBitmap; use super::{ComputedCondition, RankingRuleGraphTrait}; @@ -28,7 +27,7 @@ fn compute_docids( ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), ExactTerm::Word(word) => { if let Some(word_candidates) = ctx.get_db_word_docids(word)? { - RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)? + word_candidates } else { return Ok(Default::default()); } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 07bd102ca..b6f164f16 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -2,7 +2,6 @@ use std::collections::BTreeSet; -use heed::BytesDecode; use roaring::RoaringBitmap; use super::ProximityCondition; @@ -11,7 +10,7 @@ use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; +use crate::Result; pub fn compute_docids( ctx: &mut SearchContext, @@ -92,9 +91,7 @@ pub fn compute_docids( if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) { continue; } - } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? 
{ - let left_word_docids = - RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?; + } else if let Some(left_word_docids) = ctx.get_db_word_docids(left_word)? { if universe.is_disjoint(&left_word_docids) { continue; } @@ -155,7 +152,7 @@ fn compute_prefix_edges( if let Some(new_docids) = ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)? { - let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(left_word); used_right_prefix.insert(right_prefix); @@ -170,7 +167,7 @@ fn compute_prefix_edges( left_word, backward_proximity, )? { - let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(left_word); used_right_prefix.insert(right_prefix); @@ -217,7 +214,7 @@ fn compute_non_prefix_edges( if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)? { - let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(word1); used_right_words.insert(word2); @@ -231,7 +228,7 @@ fn compute_non_prefix_edges( if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)? { - let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(word2); used_right_words.insert(word1); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index ef7adad14..bca8b6268 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -3,7 +3,6 @@ use std::collections::VecDeque; use fxhash::FxHashMap; -use heed::BytesDecode; use roaring::RoaringBitmap; use super::interner::Interned; @@ -12,7 +11,7 @@ use super::query_term::{Phrase, QueryTermSubset}; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, SearchContext}; use crate::search::new::query_term::LocatedQueryTermSubset; -use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; +use crate::Result; #[derive(Default)] pub struct PhraseDocIdsCache { @@ -37,7 +36,7 @@ pub fn compute_query_term_subset_docids( let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { if let Some(word_docids) = ctx.get_db_word_docids(word)? { - docids |= RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?; + docids |= word_docids; } } for phrase in term.all_phrases(ctx)? { @@ -46,8 +45,7 @@ pub fn compute_query_term_subset_docids( if let Some(prefix) = term.use_prefix_db(ctx) { if let Some(prefix_docids) = ctx.get_db_word_prefix_docids(prefix)? { - docids |= - RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?; + docids |= prefix_docids; } } @@ -128,8 +126,7 @@ pub fn compute_phrase_docids( if words.len() == 1 { if let Some(word) = &words[0] { if let Some(word_docids) = ctx.get_db_word_docids(*word)? { - return RoaringBitmapCodec::bytes_decode(word_docids) - .ok_or(heed::Error::Decoding.into()); + return Ok(word_docids); } else { return Ok(RoaringBitmap::new()); } @@ -158,7 +155,7 @@ pub fn compute_phrase_docids( { if dist == 0 { match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? 
{ - Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?), + Some(m) => bitmaps.push(m), // If there are no documents for this pair, there will be no // results for the phrase query. None => return Ok(RoaringBitmap::new()), @@ -169,7 +166,7 @@ pub fn compute_phrase_docids( if let Some(m) = ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?; + bitmap |= m; } } if bitmap.is_empty() { From 325f17488aa142d634b3aa3eb537b5eeee3e0b9c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 18:26:44 +0200 Subject: [PATCH 185/234] Add SearchContext::word_docids() method --- milli/src/search/new/db_cache.rs | 37 ++++++++++++++++++++++++++++++++ milli/src/search/new/mod.rs | 15 +++++++++++++ 2 files changed, 52 insertions(+) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index c1862244a..09845377c 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -8,6 +8,7 @@ use heed::{BytesDecode, BytesEncode, Database, RoTxn}; use roaring::RoaringBitmap; use super::interner::Interned; +use super::Word; use crate::{ CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, }; @@ -67,6 +68,26 @@ impl<'ctx> SearchContext<'ctx> { } } + pub fn word_docids(&mut self, word: Word) -> Result> { + match word { + Word::Original(word) => { + let exact = self.get_db_exact_word_docids(word)?; + let tolerant = self.get_db_word_docids(word)?; + Ok(match (exact, tolerant) { + (None, None) => None, + (None, Some(tolerant)) => Some(tolerant), + (Some(exact), None) => Some(exact), + (Some(exact), Some(tolerant)) => { + let mut both = exact; + both |= tolerant; + Some(both) + } + }) + } + Word::Derived(word) => self.get_db_word_docids(word), + } + } + /// Retrieve or insert the given value in the `word_docids` database. pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { DatabaseCache::get_value( @@ -79,6 +100,22 @@ impl<'ctx> SearchContext<'ctx> { .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) .transpose() } + + fn get_db_exact_word_docids( + &mut self, + word: Interned, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + word, + self.word_interner.get(word).as_str(), + &mut self.db_cache.exact_word_docids, + self.index.exact_word_docids.remap_data_type::(), + )? + .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() + } + /// Retrieve or insert the given value in the `word_prefix_docids` database. pub fn get_db_word_prefix_docids( &mut self, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 7b15bcaab..f51d3771d 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -75,6 +75,21 @@ impl<'ctx> SearchContext<'ctx> { } } +#[derive(Clone, Copy, PartialEq, PartialOrd, Ord, Eq)] +pub enum Word { + Original(Interned), + Derived(Interned), +} + +impl Word { + pub fn interned(&self) -> Interned { + match self { + Word::Original(word) => *word, + Word::Derived(word) => *word, + } + } +} + /// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it. 
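The new `word_docids` method above unions the `exact_word_docids` and `word_docids` databases so that callers see a single bitmap per word. A runnable sketch of that merge, using plain `Option<RoaringBitmap>` values in place of the cached database lookups:

use roaring::RoaringBitmap;

fn merge_word_docids(
    exact: Option<RoaringBitmap>,
    tolerant: Option<RoaringBitmap>,
) -> Option<RoaringBitmap> {
    match (exact, tolerant) {
        (None, None) => None,
        (None, Some(tolerant)) => Some(tolerant),
        (Some(exact), None) => Some(exact),
        (Some(mut both), Some(tolerant)) => {
            // `|=` on RoaringBitmap is an in-place set union.
            both |= tolerant;
            Some(both)
        }
    }
}

fn main() {
    let exact: RoaringBitmap = [1, 2].into_iter().collect();
    let tolerant: RoaringBitmap = [2, 3].into_iter().collect();
    let both = merge_word_docids(Some(exact), Some(tolerant)).unwrap();
    assert_eq!(both.iter().collect::<Vec<u32>>(), vec![1, 2, 3]);
}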
#[allow(clippy::too_many_arguments)] fn resolve_maximally_reduced_query_graph( From 5ab46324c4fc4bcce2d7b250f82ad4175a1bb6b0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 18:27:41 +0200 Subject: [PATCH 186/234] Everyone uses the SearchContext::word_docids instead of get_db_word_docids make get_db_word_docids private --- milli/src/search/new/db_cache.rs | 2 +- milli/src/search/new/logger/visual.rs | 2 +- milli/src/search/new/mod.rs | 2 + milli/src/search/new/query_term/mod.rs | 42 +++++++++++++------ .../new/ranking_rule_graph/exactness/mod.rs | 5 ++- .../proximity/compute_docids.rs | 14 +++---- milli/src/search/new/resolve_query_graph.rs | 6 +-- 7 files changed, 46 insertions(+), 27 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 09845377c..aa1c11773 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -89,7 +89,7 @@ impl<'ctx> SearchContext<'ctx> { } /// Retrieve or insert the given value in the `word_docids` database. - pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { + fn get_db_word_docids(&mut self, word: Interned) -> Result> { DatabaseCache::get_value( self.txn, word, diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index 17f7ef76c..068b5ad68 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -427,7 +427,7 @@ fill: \"#B6E2D3\" )?; for w in term_subset.all_single_words_except_prefix_db(ctx)? { - let w = ctx.word_interner.get(w); + let w = ctx.word_interner.get(w.interned()); writeln!(file, "{w}: word")?; } for p in term_subset.all_phrases(ctx)? { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index f51d3771d..9f8d8699f 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -50,6 +50,8 @@ use ranking_rules::{BoxRankingRule, RankingRule}; use resolve_query_graph::compute_query_graph_docids; use sort::Sort; +use self::interner::Interned; + /// A structure used throughout the execution of a search query. pub struct SearchContext<'ctx> { pub index: &'ctx Index, diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 83320139b..0a0d1a7eb 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -3,18 +3,18 @@ mod ntypo_subset; mod parse_query; mod phrase; -use super::interner::{DedupInterner, Interned}; -use super::{limits, SearchContext}; -use crate::Result; use std::collections::BTreeSet; use std::ops::RangeInclusive; +use compute_derivations::partially_initialized_term_from_word; use either::Either; pub use ntypo_subset::NTypoTermSubset; pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed}; pub use phrase::Phrase; -use compute_derivations::partially_initialized_term_from_word; +use super::interner::{DedupInterner, Interned}; +use super::{limits, SearchContext, Word}; +use crate::Result; /// A set of word derivations attached to a location in the search query. 
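This patch threads the `Word` enum through the call sites: a word that comes from an ngram term is tagged `Derived`, everything else stays `Original`, and only `Original` words consult the exact-word databases introduced in the previous commit. A small sketch of that tagging rule, with `String` standing in for `Interned<String>`:

#[derive(Debug, PartialEq)]
enum Word {
    Original(String),
    Derived(String),
}

// Mirrors the `if original.ngram_words.is_some() { Derived } else { Original }`
// branches added below.
fn tag_word(word: &str, term_is_ngram: bool) -> Word {
    if term_is_ngram {
        Word::Derived(word.to_owned())
    } else {
        Word::Original(word.to_owned())
    }
}

fn main() {
    // "sunflower" typed as a single query word is original...
    assert_eq!(tag_word("sunflower", false), Word::Original("sunflower".into()));
    // ...while "sunflower" built from the 2-gram "sun flower" is derived.
    assert_eq!(tag_word("sunflower", true), Word::Derived("sunflower".into()));
}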
#[derive(Clone, PartialEq, Eq, Hash)] @@ -180,7 +180,7 @@ impl QueryTermSubset { pub fn all_single_words_except_prefix_db( &self, ctx: &mut SearchContext, - ) -> Result>> { + ) -> Result> { let mut result = BTreeSet::default(); // TODO: a compute_partially funtion if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { @@ -197,8 +197,20 @@ impl QueryTermSubset { synonyms: _, use_prefix_db: _, } = &original.zero_typo; - result.extend(zero_typo.iter().copied()); - result.extend(prefix_of.iter().copied()); + result.extend(zero_typo.iter().copied().map(|w| { + if original.ngram_words.is_some() { + Word::Derived(w) + } else { + Word::Original(w) + } + })); + result.extend(prefix_of.iter().copied().map(|w| { + if original.ngram_words.is_some() { + Word::Derived(w) + } else { + Word::Original(w) + } + })); } NTypoTermSubset::Subset { words, phrases: _ } => { let ZeroTypoTerm { @@ -210,10 +222,14 @@ impl QueryTermSubset { } = &original.zero_typo; if let Some(zero_typo) = zero_typo { if words.contains(zero_typo) { - result.insert(*zero_typo); + if original.ngram_words.is_some() { + result.insert(Word::Derived(*zero_typo)); + } else { + result.insert(Word::Original(*zero_typo)); + } } } - result.extend(prefix_of.intersection(words).copied()); + result.extend(prefix_of.intersection(words).copied().map(Word::Derived)); } NTypoTermSubset::Nothing => {} } @@ -223,13 +239,13 @@ impl QueryTermSubset { let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { panic!() }; - result.extend(one_typo.iter().copied()) + result.extend(one_typo.iter().copied().map(Word::Derived)) } NTypoTermSubset::Subset { words, phrases: _ } => { let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { panic!() }; - result.extend(one_typo.intersection(words)); + result.extend(one_typo.intersection(words).copied().map(Word::Derived)); } NTypoTermSubset::Nothing => {} }; @@ -239,13 +255,13 @@ impl QueryTermSubset { let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { panic!() }; - result.extend(two_typos.iter().copied()); + result.extend(two_typos.iter().copied().map(Word::Derived)); } NTypoTermSubset::Subset { words, phrases: _ } => { let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { panic!() }; - result.extend(two_typos.intersection(words)); + result.extend(two_typos.intersection(words).copied().map(Word::Derived)); } NTypoTermSubset::Nothing => {} }; diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 4a3dd6549..7455a7a17 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -3,7 +3,8 @@ use roaring::RoaringBitmap; use super::{ComputedCondition, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; -use crate::{Result, RoaringBitmapCodec, SearchContext}; +use crate::search::new::Word; +use crate::{Result, SearchContext}; #[derive(Clone, PartialEq, Eq, Hash)] pub enum ExactnessCondition { @@ -26,7 +27,7 @@ fn compute_docids( let mut candidates = match exact_term { ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), ExactTerm::Word(word) => { - if let Some(word_candidates) = ctx.get_db_word_docids(word)? { + if let Some(word_candidates) = ctx.word_docids(Word::Original(word))? 
{ word_candidates } else { return Ok(Default::default()); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index b6f164f16..760c7272c 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -9,7 +9,7 @@ use crate::search::new::interner::Interned; use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; -use crate::search::new::SearchContext; +use crate::search::new::{SearchContext, Word}; use crate::Result; pub fn compute_docids( @@ -54,7 +54,7 @@ pub fn compute_docids( { compute_prefix_edges( ctx, - left_word, + left_word.interned(), right_prefix, left_phrase, forward_proximity, @@ -91,7 +91,7 @@ pub fn compute_docids( if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) { continue; } - } else if let Some(left_word_docids) = ctx.get_db_word_docids(left_word)? { + } else if let Some(left_word_docids) = ctx.word_docids(left_word)? { if universe.is_disjoint(&left_word_docids) { continue; } @@ -101,7 +101,7 @@ pub fn compute_docids( for (right_word, right_phrase) in right_derivs { compute_non_prefix_edges( ctx, - left_word, + left_word.interned(), right_word, left_phrase, right_phrase, @@ -243,7 +243,7 @@ fn compute_non_prefix_edges( fn last_words_of_term_derivations( ctx: &mut SearchContext, t: &QueryTermSubset, -) -> Result>, Interned)>> { +) -> Result>, Word)>> { let mut result = BTreeSet::new(); for w in t.all_single_words_except_prefix_db(ctx)? { @@ -253,7 +253,7 @@ fn last_words_of_term_derivations( let phrase = ctx.phrase_interner.get(p); let last_term_of_phrase = phrase.words.last().unwrap(); if let Some(last_word) = last_term_of_phrase { - result.insert((Some(p), *last_word)); + result.insert((Some(p), Word::Original(*last_word))); } } @@ -266,7 +266,7 @@ fn first_word_of_term_iter( let mut result = BTreeSet::new(); let all_words = t.all_single_words_except_prefix_db(ctx)?; for w in all_words { - result.insert((w, None)); + result.insert((w.interned(), None)); } for p in t.all_phrases(ctx)? { let phrase = ctx.phrase_interner.get(p); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index bca8b6268..c78f0c5ee 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -9,7 +9,7 @@ use super::interner::Interned; use super::query_graph::QueryNodeData; use super::query_term::{Phrase, QueryTermSubset}; use super::small_bitmap::SmallBitmap; -use super::{QueryGraph, SearchContext}; +use super::{QueryGraph, SearchContext, Word}; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::Result; @@ -35,7 +35,7 @@ pub fn compute_query_term_subset_docids( ) -> Result { let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { - if let Some(word_docids) = ctx.get_db_word_docids(word)? { + if let Some(word_docids) = ctx.word_docids(word)? { docids |= word_docids; } } @@ -125,7 +125,7 @@ pub fn compute_phrase_docids( } if words.len() == 1 { if let Some(word) = &words[0] { - if let Some(word_docids) = ctx.get_db_word_docids(*word)? { + if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? 
{ return Ok(word_docids); } else { return Ok(RoaringBitmap::new()); From c20c38a7fa37cf9babc79018f1410958ca329f07 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 22:04:38 +0200 Subject: [PATCH 187/234] Add SearchContext::word_prefix_docids() method --- milli/src/search/new/db_cache.rs | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index aa1c11773..fb36c0d9f 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -29,6 +29,7 @@ pub struct DatabaseCache<'ctx> { pub word_docids: FxHashMap, Option<&'ctx [u8]>>, pub exact_word_docids: FxHashMap, Option<&'ctx [u8]>>, pub word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, + pub exact_word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, pub words_fst: Option>>, pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, @@ -116,6 +117,26 @@ impl<'ctx> SearchContext<'ctx> { .transpose() } + pub fn word_prefix_docids(&mut self, prefix: Word) -> Result> { + match prefix { + Word::Original(prefix) => { + let exact = self.get_db_exact_word_prefix_docids(prefix)?; + let tolerant = self.get_db_word_prefix_docids(prefix)?; + Ok(match (exact, tolerant) { + (None, None) => None, + (None, Some(tolerant)) => Some(tolerant), + (Some(exact), None) => Some(exact), + (Some(exact), Some(tolerant)) => { + let mut both = exact; + both |= tolerant; + Some(both) + } + }) + } + Word::Derived(prefix) => self.get_db_word_prefix_docids(prefix), + } + } + /// Retrieve or insert the given value in the `word_prefix_docids` database. pub fn get_db_word_prefix_docids( &mut self, @@ -132,6 +153,21 @@ impl<'ctx> SearchContext<'ctx> { .transpose() } + fn get_db_exact_word_prefix_docids( + &mut self, + prefix: Interned, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + prefix, + self.word_interner.get(prefix).as_str(), + &mut self.db_cache.exact_word_prefix_docids, + self.index.exact_word_prefix_docids.remap_data_type::(), + )? + .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() + } + pub fn get_db_word_pair_proximity_docids( &mut self, word1: Interned, From 7a01f20df746997886aa57a1e74d9f9ffef82a5d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 22:06:10 +0200 Subject: [PATCH 188/234] Use word_prefix_docids, make get_word_prefix_docids private --- milli/src/search/new/db_cache.rs | 2 +- milli/src/search/new/logger/visual.rs | 2 +- milli/src/search/new/query_term/mod.rs | 13 ++++++++++--- .../ranking_rule_graph/proximity/compute_docids.rs | 2 +- milli/src/search/new/resolve_query_graph.rs | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index fb36c0d9f..6193f4c58 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -138,7 +138,7 @@ impl<'ctx> SearchContext<'ctx> { } /// Retrieve or insert the given value in the `word_prefix_docids` database. 
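All of these refactored getters share one shape: the cache lookup yields a `Result<Option<&[u8]>>`, decoding the present value yields an `Option<Result<RoaringBitmap>>`, and `transpose` flips that back into the `Result<Option<RoaringBitmap>>` callers want. A self-contained sketch of the flip, using roaring's own (de)serialization in place of milli's `RoaringBitmapCodec`:

use roaring::RoaringBitmap;

fn decode(bytes: &[u8]) -> Result<RoaringBitmap, String> {
    RoaringBitmap::deserialize_from(bytes).map_err(|e| e.to_string())
}

// None stays None; Some(bytes) is decoded, and a decoding failure becomes Err.
fn lookup(raw: Option<&[u8]>) -> Result<Option<RoaringBitmap>, String> {
    raw.map(decode).transpose()
}

fn main() -> Result<(), String> {
    // A missing key is not an error, it is simply an absent bitmap.
    assert!(lookup(None)?.is_none());

    let bitmap: RoaringBitmap = [7u32, 8].into_iter().collect();
    let mut bytes = Vec::new();
    bitmap.serialize_into(&mut bytes).map_err(|e| e.to_string())?;
    assert_eq!(lookup(Some(&bytes))?, Some(bitmap));
    Ok(())
}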
- pub fn get_db_word_prefix_docids( + fn get_db_word_prefix_docids( &mut self, prefix: Interned, ) -> Result> { diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index 068b5ad68..72e33f339 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -434,7 +434,7 @@ fill: \"#B6E2D3\" writeln!(file, "{}: phrase", p.description(ctx))?; } if let Some(w) = term_subset.use_prefix_db(ctx) { - let w = ctx.word_interner.get(w); + let w = ctx.word_interner.get(w.interned()); writeln!(file, "{w}: prefix db")?; } diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 0a0d1a7eb..d8c2bb0c7 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -159,12 +159,12 @@ impl QueryTermSubset { self.two_typo_subset.intersect(&other.two_typo_subset); } - pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option> { + pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option { let original = ctx.term_interner.get(self.original); let Some(use_prefix_db) = original.zero_typo.use_prefix_db else { return None }; - match &self.zero_typo_subset { + let word = match &self.zero_typo_subset { NTypoTermSubset::All => Some(use_prefix_db), NTypoTermSubset::Subset { words, phrases: _ } => { // TODO: use a subset of prefix words instead @@ -175,7 +175,14 @@ impl QueryTermSubset { } } NTypoTermSubset::Nothing => None, - } + }; + word.map(|word| { + if original.ngram_words.is_some() { + Word::Derived(word) + } else { + Word::Original(word) + } + }) } pub fn all_single_words_except_prefix_db( &self, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 760c7272c..3e75f948e 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -55,7 +55,7 @@ pub fn compute_docids( compute_prefix_edges( ctx, left_word.interned(), - right_prefix, + right_prefix.interned(), left_phrase, forward_proximity, backward_proximity, diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index c78f0c5ee..f4938ca12 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -44,7 +44,7 @@ pub fn compute_query_term_subset_docids( } if let Some(prefix) = term.use_prefix_db(ctx) { - if let Some(prefix_docids) = ctx.get_db_word_prefix_docids(prefix)? { + if let Some(prefix_docids) = ctx.word_prefix_docids(prefix)? 
{ docids |= prefix_docids; } } From 38b7b31beb4e1e0631aee05f97367d38593508d9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Apr 2023 15:14:00 +0200 Subject: [PATCH 189/234] Decide to use prefix DB if the word is not an ngram --- .../new/query_term/compute_derivations.rs | 12 ++++++++++-- .../src/search/new/query_term/parse_query.rs | 19 +++++++++++++------ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index 12b8c3832..0da841890 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -177,6 +177,7 @@ pub fn partially_initialized_term_from_word( word: &str, max_typo: u8, is_prefix: bool, + is_ngram: bool, ) -> Result { let word_interned = ctx.word_interner.insert(word.to_owned()); @@ -197,12 +198,19 @@ pub fn partially_initialized_term_from_word( let fst = ctx.index.words_fst(ctx.txn)?; let use_prefix_db = is_prefix - && ctx + && (ctx .index .word_prefix_docids .remap_data_type::() .get(ctx.txn, word)? - .is_some(); + .is_some() + || (!is_ngram + && ctx + .index + .exact_word_prefix_docids + .remap_data_type::() + .get(ctx.txn, word)? + .is_some())); let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None }; let mut zero_typo = None; diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 5663f6b4b..91b888dcf 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -1,8 +1,8 @@ -use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind}; - -use crate::{Result, SearchContext, MAX_WORD_LENGTH}; +use charabia::normalizer::NormalizedTokenIter; +use charabia::{SeparatorKind, TokenKind}; use super::*; +use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. 
// TODO: checking if the positions are correct for phrases, separators, ngrams @@ -51,6 +51,7 @@ pub fn located_query_terms_from_string( word, nbr_typos(word), false, + false, )?; let located_term = LocatedQueryTerm { value: ctx.term_interner.push(term), @@ -62,8 +63,13 @@ pub fn located_query_terms_from_string( } } else { let word = token.lemma(); - let term = - partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?; + let term = partially_initialized_term_from_word( + ctx, + word, + nbr_typos(word), + true, + false, + )?; let located_term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: position..=position, @@ -195,7 +201,8 @@ pub fn make_ngram( let max_nbr_typos = number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1); - let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; + let mut term = + partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?; // Now add the synonyms let index_synonyms = ctx.index.synonyms(ctx.txn)?; From e55efc419e715d691e6f6e1742b5b1f233e31ac0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 12 Apr 2023 11:12:14 +0200 Subject: [PATCH 190/234] Introduce a new cache for the words fids --- .../new/ranking_rule_graph/attribute/mod.rs | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 milli/src/search/new/ranking_rule_graph/attribute/mod.rs diff --git a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs new file mode 100644 index 000000000..cfa8a1fbc --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs @@ -0,0 +1,82 @@ +use roaring::RoaringBitmap; + +use super::{ComputedCondition, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned}; +use crate::search::new::query_term::LocatedQueryTermSubset; +use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; +use crate::search::new::SearchContext; +use crate::Result; + +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct AttributeCondition { + term: LocatedQueryTermSubset, + nbr_typos: u8, +} + +pub enum AttributeGraph {} + +impl RankingRuleGraphTrait for AttributeGraph { + type Condition = AttributeCondition; + + fn resolve_condition( + ctx: &mut SearchContext, + condition: &Self::Condition, + universe: &RoaringBitmap, + ) -> Result { + let AttributeCondition { term, .. } = condition; + // maybe compute_query_term_subset_docids should accept a universe as argument + let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?; + docids &= universe; + + Ok(ComputedCondition { + docids, + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: term.clone(), + }) + } + + fn build_edges( + ctx: &mut SearchContext, + conditions_interner: &mut DedupInterner, + _from: Option<&LocatedQueryTermSubset>, + to_term: &LocatedQueryTermSubset, + ) -> Result)>> { + let term = to_term; + + let mut edges = vec![]; + for word in term.term_subset.all_single_words_except_prefix_db(ctx)? { + // ... 
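+ // placeholder body: the next commit in this series fills this loop in
+ // by collecting the field ids in which each word derivation appears,
+ // roughly:
+ //     let fields = ctx.get_db_word_fids(word)?;
+ //     all_fields.extend(fields);
+ // so that one `AttributeCondition` edge can then be built per field id.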
+ } + + // Ngrams have a base typo cost + // 2-gram -> equivalent to 1 typo + // 3-gram -> equivalent to 2 typos + let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; + + for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) { + let mut term = term.clone(); + match nbr_typos { + 0 => { + term.term_subset.clear_one_typo_subset(); + term.term_subset.clear_two_typo_subset(); + } + 1 => { + term.term_subset.clear_zero_typo_subset(); + term.term_subset.clear_two_typo_subset(); + } + 2 => { + term.term_subset.clear_zero_typo_subset(); + term.term_subset.clear_one_typo_subset(); + } + _ => panic!(), + }; + + edges.push(( + nbr_typos as u32 + base_cost, + conditions_interner.insert(AttributeCondition { term, nbr_typos }), + )); + } + Ok(edges) + } +} From d6a7c28e4d902f0b6bb46845c2431f562400d5d4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 12 Apr 2023 11:40:44 +0200 Subject: [PATCH 191/234] Implement the attribute ranking rule edge computation --- milli/src/search/new/db_cache.rs | 67 +++++++++++++++++++ milli/src/search/new/query_term/phrase.rs | 4 ++ .../new/ranking_rule_graph/attribute/mod.rs | 51 +++++++------- .../src/search/new/ranking_rule_graph/mod.rs | 2 + 4 files changed, 98 insertions(+), 26 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 6193f4c58..c32c7ba79 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -34,6 +34,9 @@ pub struct DatabaseCache<'ctx> { pub words_fst: Option>>, pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, pub word_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_prefix_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_fids: FxHashMap, Vec>, + pub word_prefix_fids: FxHashMap, Vec>, } impl<'ctx> DatabaseCache<'ctx> { fn get_value<'v, K1, KC>( @@ -284,4 +287,68 @@ impl<'ctx> SearchContext<'ctx> { .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) .transpose() } + + pub fn get_db_word_prefix_fid_docids( + &mut self, + word_prefix: Interned, + fid: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word_prefix, fid), + &(self.word_interner.get(word_prefix).as_str(), fid), + &mut self.db_cache.word_prefix_fid_docids, + self.index.word_prefix_fid_docids.remap_data_type::(), + ) + } + + pub fn get_db_word_fids(&mut self, word: Interned) -> Result> { + let fids = match self.db_cache.word_fids.entry(word) { + Entry::Occupied(fids) => fids.get().clone(), + Entry::Vacant(entry) => { + let key = self.word_interner.get(word).as_bytes(); + let mut fids = vec![]; + let remap_key_type = self + .index + .word_fid_docids + .remap_types::() + .prefix_iter(self.txn, key)? + .remap_key_type::(); + for result in remap_key_type { + let ((_, fid), value) = result?; + // filling other caches to avoid searching for them again + self.db_cache.word_fid_docids.insert((word, fid), Some(value)); + fids.push(fid); + } + entry.insert(fids.clone()); + fids + } + }; + Ok(fids) + } + + pub fn get_db_word_prefix_fids(&mut self, word_prefix: Interned) -> Result> { + let fids = match self.db_cache.word_prefix_fids.entry(word_prefix) { + Entry::Occupied(fids) => fids.get().clone(), + Entry::Vacant(entry) => { + let key = self.word_interner.get(word_prefix).as_bytes(); + let mut fids = vec![]; + let remap_key_type = self + .index + .word_prefix_fid_docids + .remap_types::() + .prefix_iter(self.txn, key)? 
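+ // iterate over the `(word, fid)` keys of `word_fid_docids` that start
+ // with this word's bytes to learn which field ids the word appears in,
+ // caching each docids entry encountered along the way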
+ .remap_key_type::(); + for result in remap_key_type { + let ((_, fid), value) = result?; + // filling other caches to avoid searching for them again + self.db_cache.word_prefix_fid_docids.insert((word_prefix, fid), Some(value)); + fids.push(fid); + } + entry.insert(fids.clone()); + fids + } + }; + Ok(fids) + } } diff --git a/milli/src/search/new/query_term/phrase.rs b/milli/src/search/new/query_term/phrase.rs index 2ea8e0d39..033c5cf12 100644 --- a/milli/src/search/new/query_term/phrase.rs +++ b/milli/src/search/new/query_term/phrase.rs @@ -13,4 +13,8 @@ impl Interned { let p = ctx.phrase_interner.get(self); p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ") } + pub fn words(self, ctx: &SearchContext) -> Vec>> { + let p = ctx.phrase_interner.get(self); + p.words.clone() + } } diff --git a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs index cfa8a1fbc..2b25adc7e 100644 --- a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs @@ -1,3 +1,4 @@ +use fxhash::FxHashSet; use roaring::RoaringBitmap; use super::{ComputedCondition, RankingRuleGraphTrait}; @@ -10,7 +11,7 @@ use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] pub struct AttributeCondition { term: LocatedQueryTermSubset, - nbr_typos: u8, + fid: u16, } pub enum AttributeGraph {} @@ -44,39 +45,37 @@ impl RankingRuleGraphTrait for AttributeGraph { ) -> Result)>> { let term = to_term; - let mut edges = vec![]; + let mut all_fields = FxHashSet::default(); for word in term.term_subset.all_single_words_except_prefix_db(ctx)? { - // ... + let fields = ctx.get_db_word_fids(word)?; + all_fields.extend(fields); } - // Ngrams have a base typo cost - // 2-gram -> equivalent to 1 typo - // 3-gram -> equivalent to 2 typos - let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; + for phrase in term.term_subset.all_phrases(ctx)? { + for &word in phrase.words(ctx).iter().flatten() { + let fields = ctx.get_db_word_fids(word)?; + all_fields.extend(fields); + } + } - for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) { - let mut term = term.clone(); - match nbr_typos { - 0 => { - term.term_subset.clear_one_typo_subset(); - term.term_subset.clear_two_typo_subset(); - } - 1 => { - term.term_subset.clear_zero_typo_subset(); - term.term_subset.clear_two_typo_subset(); - } - 2 => { - term.term_subset.clear_zero_typo_subset(); - term.term_subset.clear_one_typo_subset(); - } - _ => panic!(), - }; + if let Some(word_prefix) = term.term_subset.use_prefix_db(ctx) { + let fields = ctx.get_db_word_prefix_fids(word_prefix)?; + all_fields.extend(fields); + } + let mut edges = vec![]; + for fid in all_fields { + // TODO: We can improve performances and relevancy by storing + // the term subsets associated to each field ids fetched. edges.push(( - nbr_typos as u32 + base_cost, - conditions_interner.insert(AttributeCondition { term, nbr_typos }), + fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. 
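+ // hypothetical example: a two-word term found in the field with fid 3
+ // gets an edge of cost 3 * 2 = 6, so matches in lower field ids produce
+ // cheaper, earlier-explored paths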
+ conditions_interner.insert(AttributeCondition { + term: term.clone(), // TODO remove this ugly clone + fid, + }), )); } + Ok(edges) } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 6a9bfff93..cccc0643a 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -16,6 +16,8 @@ mod exactness; mod proximity; /// Implementation of the `typo` ranking rule mod typo; +/// Implementation of the `attribute` ranking rule +mod attribute; use std::hash::Hash; From 5230ddb3ea7b2d86bcbca5df2822969351077fcc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 12 Apr 2023 11:52:56 +0200 Subject: [PATCH 192/234] Resolve the attribute ranking rule conditions --- .../new/ranking_rule_graph/attribute/mod.rs | 10 ++++-- milli/src/search/new/resolve_query_graph.rs | 36 +++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs index 2b25adc7e..4ca0b7dc4 100644 --- a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs @@ -4,7 +4,9 @@ use roaring::RoaringBitmap; use super::{ComputedCondition, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::LocatedQueryTermSubset; -use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; +use crate::search::new::resolve_query_graph::{ + compute_query_term_subset_docids, compute_query_term_subset_docids_within_field_id, +}; use crate::search::new::SearchContext; use crate::Result; @@ -26,7 +28,11 @@ impl RankingRuleGraphTrait for AttributeGraph { ) -> Result { let AttributeCondition { term, .. } = condition; // maybe compute_query_term_subset_docids should accept a universe as argument - let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?; + let mut docids = compute_query_term_subset_docids_within_field_id( + ctx, + &term.term_subset, + condition.fid, + )?; docids &= universe; Ok(ComputedCondition { diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index f4938ca12..711497009 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -33,6 +33,8 @@ pub fn compute_query_term_subset_docids( ctx: &mut SearchContext, term: &QueryTermSubset, ) -> Result { + // TODO Use the roaring::MultiOps trait + let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { if let Some(word_docids) = ctx.word_docids(word)? { @@ -52,6 +54,40 @@ pub fn compute_query_term_subset_docids( Ok(docids) } +pub fn compute_query_term_subset_docids_within_field_id( + ctx: &mut SearchContext, + term: &QueryTermSubset, + fid: u16, +) -> Result { + // TODO Use the roaring::MultiOps trait + + let mut docids = RoaringBitmap::new(); + for word in term.all_single_words_except_prefix_db(ctx)? { + if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word, fid)? { + docids |= CboRoaringBitmapCodec::bytes_decode(word_fid_docids) + .ok_or(heed::Error::Decoding)?; + } + } + + for phrase in term.all_phrases(ctx)? { + for &word in phrase.words(ctx).iter().flatten() { + if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word, fid)? 
{ + docids |= CboRoaringBitmapCodec::bytes_decode(word_fid_docids) + .ok_or(heed::Error::Decoding)?; + } + } + } + + if let Some(word_prefix) = term.use_prefix_db(ctx) { + if let Some(word_fid_docids) = ctx.get_db_word_prefix_fid_docids(word_prefix, fid)? { + docids |= CboRoaringBitmapCodec::bytes_decode(word_fid_docids) + .ok_or(heed::Error::Decoding)?; + } + } + + Ok(docids) +} + pub fn compute_query_graph_docids( ctx: &mut SearchContext, q: &QueryGraph, From df0d9bb878c8660c3fc61e286b3e1950d0693f41 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 12 Apr 2023 12:01:50 +0200 Subject: [PATCH 193/234] Introduce the attribute ranking rule in the list of ranking rules --- milli/src/search/new/graph_based_ranking_rule.rs | 10 ++++++++-- milli/src/search/new/mod.rs | 8 ++++---- milli/src/search/new/ranking_rule_graph/mod.rs | 5 +++-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 41a96dd9e..3ee16ed50 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner}; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph, - RankingRuleGraphTrait, TypoGraph, + AttributeGraph, ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, + RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; @@ -59,6 +59,12 @@ impl GraphBasedRankingRule { Self::new_with_id("proximity".to_owned(), terms_matching_strategy) } } +pub type Attribute = GraphBasedRankingRule; +impl GraphBasedRankingRule { + pub fn new(terms_matching_strategy: Option) -> Self { + Self::new_with_id("attribute".to_owned(), terms_matching_strategy) + } +} pub type Typo = GraphBasedRankingRule; impl GraphBasedRankingRule { pub fn new(terms_matching_strategy: Option) -> Self { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 9f8d8699f..16eccb393 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -28,7 +28,7 @@ use std::collections::HashSet; use bucket_sort::bucket_sort; use charabia::TokenizerBuilder; use db_cache::DatabaseCache; -use graph_based_ranking_rule::{Proximity, Typo}; +use graph_based_ranking_rule::{Attribute, Proximity, Typo}; use heed::RoTxn; use interner::DedupInterner; pub use logger::visual::VisualSearchLogger; @@ -174,7 +174,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut typo = false; let mut proximity = false; let mut sort = false; - let attribute = false; + let mut attribute = false; let mut exactness = false; let mut asc = HashSet::new(); let mut desc = HashSet::new(); @@ -222,8 +222,8 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( if attribute { continue; } - // todo!(); - // attribute = false; + attribute = true; + ranking_rules.push(Box::new(Attribute::new(None))); } crate::Criterion::Sort => { if sort { diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index cccc0643a..fe31029b4 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -10,17 +10,18 @@ mod cheapest_paths; mod condition_docids_cache; mod dead_ends_cache; +/// Implementation 
of the `attribute` ranking rule +mod attribute; /// Implementation of the `exactness` ranking rule mod exactness; /// Implementation of the `proximity` ranking rule mod proximity; /// Implementation of the `typo` ranking rule mod typo; -/// Implementation of the `attribute` ranking rule -mod attribute; use std::hash::Hash; +pub use attribute::{AttributeCondition, AttributeGraph}; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; From 30f7bd03f6b58d9676967b3a75e4fa2ccf7413c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Apr 2023 16:53:11 +0200 Subject: [PATCH 194/234] Fix compiler warning/errors caused by previous merge --- milli/src/search/new/db_cache.rs | 7 +++++-- .../new/ranking_rule_graph/attribute/mod.rs | 8 +++----- milli/src/search/new/resolve_query_graph.rs | 15 +++++++-------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index c32c7ba79..ad843b045 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -9,6 +9,7 @@ use roaring::RoaringBitmap; use super::interner::Interned; use super::Word; +use crate::heed_codec::StrBEU16Codec; use crate::{ CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, }; @@ -292,14 +293,16 @@ impl<'ctx> SearchContext<'ctx> { &mut self, word_prefix: Interned, fid: u16, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (word_prefix, fid), &(self.word_interner.get(word_prefix).as_str(), fid), &mut self.db_cache.word_prefix_fid_docids, self.index.word_prefix_fid_docids.remap_data_type::(), - ) + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } pub fn get_db_word_fids(&mut self, word: Interned) -> Result> { diff --git a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs index 4ca0b7dc4..80c1f4c6a 100644 --- a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs @@ -4,9 +4,7 @@ use roaring::RoaringBitmap; use super::{ComputedCondition, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::LocatedQueryTermSubset; -use crate::search::new::resolve_query_graph::{ - compute_query_term_subset_docids, compute_query_term_subset_docids_within_field_id, -}; +use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id; use crate::search::new::SearchContext; use crate::Result; @@ -53,7 +51,7 @@ impl RankingRuleGraphTrait for AttributeGraph { let mut all_fields = FxHashSet::default(); for word in term.term_subset.all_single_words_except_prefix_db(ctx)? 
{ - let fields = ctx.get_db_word_fids(word)?; + let fields = ctx.get_db_word_fids(word.interned())?; all_fields.extend(fields); } @@ -65,7 +63,7 @@ impl RankingRuleGraphTrait for AttributeGraph { } if let Some(word_prefix) = term.term_subset.use_prefix_db(ctx) { - let fields = ctx.get_db_word_prefix_fids(word_prefix)?; + let fields = ctx.get_db_word_prefix_fids(word_prefix.interned())?; all_fields.extend(fields); } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 711497009..a125caa39 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -63,25 +63,24 @@ pub fn compute_query_term_subset_docids_within_field_id( let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { - if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word, fid)? { - docids |= CboRoaringBitmapCodec::bytes_decode(word_fid_docids) - .ok_or(heed::Error::Decoding)?; + if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? { + docids |= word_fid_docids; } } for phrase in term.all_phrases(ctx)? { for &word in phrase.words(ctx).iter().flatten() { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word, fid)? { - docids |= CboRoaringBitmapCodec::bytes_decode(word_fid_docids) - .ok_or(heed::Error::Decoding)?; + docids |= word_fid_docids; } } } if let Some(word_prefix) = term.use_prefix_db(ctx) { - if let Some(word_fid_docids) = ctx.get_db_word_prefix_fid_docids(word_prefix, fid)? { - docids |= CboRoaringBitmapCodec::bytes_decode(word_fid_docids) - .ok_or(heed::Error::Decoding)?; + if let Some(word_fid_docids) = + ctx.get_db_word_prefix_fid_docids(word_prefix.interned(), fid)? + { + docids |= word_fid_docids; } } From d9cebff61c86d528cf3d82854d7579713da46847 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 12 Apr 2023 18:17:28 +0200 Subject: [PATCH 195/234] Add a simple test to check that attributes are ranking correctly --- .../new/ranking_rule_graph/attribute/mod.rs | 2 +- milli/src/search/new/tests/attribute.rs | 58 +++++++++++++++++++ milli/src/search/new/tests/mod.rs | 1 + 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 milli/src/search/new/tests/attribute.rs diff --git a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs index 80c1f4c6a..a2981c604 100644 --- a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs @@ -25,7 +25,7 @@ impl RankingRuleGraphTrait for AttributeGraph { universe: &RoaringBitmap, ) -> Result { let AttributeCondition { term, .. 
} = condition; - // maybe compute_query_term_subset_docids should accept a universe as argument + // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument let mut docids = compute_query_term_subset_docids_within_field_id( ctx, &term.term_subset, diff --git a/milli/src/search/new/tests/attribute.rs b/milli/src/search/new/tests/attribute.rs new file mode 100644 index 000000000..f9b29881b --- /dev/null +++ b/milli/src/search/new/tests/attribute.rs @@ -0,0 +1,58 @@ +use std::collections::HashMap; + +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec![ + "title".to_owned(), + "description".to_owned(), + "plot".to_owned(), + ]); + s.set_criteria(vec![Criterion::Attribute]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "title": "the quick brown fox jumps over the lazy dog", + "description": "Pack my box with five dozen liquor jugs", + "plot": "How vexingly quick daft zebras jump", + }, + { + "id": 1, + "title": "Pack my box with five dozen liquor jugs", + "description": "the quick brown foxes jump over the lazy dog", + "plot": "How vexingly quick daft zebras jump", + }, + { + "id": 2, + "title": "How vexingly quick daft zebras jump", + "description": "Pack my box with five dozen liquor jugs", + "plot": "the quick brown fox jumps over the lazy dog", + } + ])) + .unwrap(); + index +} + +#[test] +fn test_attributes_are_ranked_correctly() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2]"); +} diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 898276858..9d6d9e159 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -1,3 +1,4 @@ +pub mod attribute; pub mod distinct; #[cfg(feature = "default")] pub mod language; From 8edad8291baf05e461c77c184aa794fae6806b45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 13 Apr 2023 09:14:23 +0200 Subject: [PATCH 196/234] Add logger to attribute rr, fix a bug --- milli/examples/search.rs | 1 - milli/src/search/new/db_cache.rs | 10 +-- milli/src/search/new/logger/visual.rs | 21 ++++++- milli/src/search/new/tests/attribute.rs | 84 ++++++++++++++++++++----- 4 files changed, 92 insertions(+), 24 deletions(-) diff --git a/milli/examples/search.rs b/milli/examples/search.rs index c9a3c1438..030390822 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -50,7 +50,6 @@ fn main() -> Result<(), Box> { let docs = execute_search( &mut ctx, &(!query.trim().is_empty()).then(|| query.trim().to_owned()), - // what a the from which when there is TermsMatchingStrategy::Last, false, &None, diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index ad843b045..cf5332700 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -309,13 +309,14 @@ impl<'ctx> SearchContext<'ctx> { let fids = match self.db_cache.word_fids.entry(word) { Entry::Occupied(fids) => fids.get().clone(), Entry::Vacant(entry) => { - let key = self.word_interner.get(word).as_bytes(); + let mut key = self.word_interner.get(word).as_bytes().to_owned(); + key.push(0); let mut fids = vec![]; let remap_key_type = self .index .word_fid_docids .remap_types::() - .prefix_iter(self.txn, key)? + .prefix_iter(self.txn, &key)? .remap_key_type::(); for result in remap_key_type { let ((_, fid), value) = result?; @@ -334,13 +335,14 @@ impl<'ctx> SearchContext<'ctx> { let fids = match self.db_cache.word_prefix_fids.entry(word_prefix) { Entry::Occupied(fids) => fids.get().clone(), Entry::Vacant(entry) => { - let key = self.word_interner.get(word_prefix).as_bytes(); + let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned(); + key.push(0); let mut fids = vec![]; let remap_key_type = self .index .word_prefix_fid_docids .remap_types::() - .prefix_iter(self.txn, key)? + .prefix_iter(self.txn, &key)? 
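+ // the appended 0 byte marks the end of the word inside the key, so a
+ // prefix iteration for e.g. "dog" no longer also visits entries encoded
+ // for "dogma"; the codec side of this nul separator is added by a later
+ // commit in this series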
.remap_key_type::(); for result in remap_key_type { let ((_, fid), value) = result?; diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index 72e33f339..7834f7e46 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -11,8 +11,8 @@ use crate::search::new::interner::Interned; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::ranking_rule_graph::{ - Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, - TypoCondition, TypoGraph, + AttributeCondition, AttributeGraph, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, + RankingRuleGraphTrait, TypoCondition, TypoGraph, }; use crate::search::new::ranking_rules::BoxRankingRule; use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger}; @@ -29,12 +29,15 @@ pub enum SearchEvents { ProximityPaths { paths: Vec>> }, TypoGraph { graph: RankingRuleGraph }, TypoPaths { paths: Vec>> }, + AttributeGraph { graph: RankingRuleGraph }, + AttributePaths { paths: Vec>> }, } enum Location { Words, Typo, Proximity, + Attribute, Other, } @@ -81,6 +84,7 @@ impl SearchLogger for VisualSearchLogger { "words" => Location::Words, "typo" => Location::Typo, "proximity" => Location::Proximity, + "attribute" => Location::Attribute, _ => Location::Other, }); } @@ -152,6 +156,15 @@ impl SearchLogger for VisualSearchLogger { self.events.push(SearchEvents::ProximityPaths { paths: paths.clone() }); } } + Location::Attribute => { + if let Some(graph) = state.downcast_ref::>() { + self.events.push(SearchEvents::AttributeGraph { graph: graph.clone() }); + } + if let Some(paths) = state.downcast_ref::>>>() + { + self.events.push(SearchEvents::AttributePaths { paths: paths.clone() }); + } + } Location::Other => {} } } @@ -314,6 +327,10 @@ impl<'ctx> DetailedLoggerFinish<'ctx> { SearchEvents::TypoPaths { paths } => { self.write_rr_graph_paths::(paths)?; } + SearchEvents::AttributeGraph { graph } => self.write_rr_graph(&graph)?, + SearchEvents::AttributePaths { paths } => { + self.write_rr_graph_paths::(paths)?; + } } Ok(()) } diff --git a/milli/src/search/new/tests/attribute.rs b/milli/src/search/new/tests/attribute.rs index f9b29881b..b248f7953 100644 --- a/milli/src/search/new/tests/attribute.rs +++ b/milli/src/search/new/tests/attribute.rs @@ -1,9 +1,4 @@ -use std::collections::HashMap; - -use crate::{ - index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, - SearchResult, TermsMatchingStrategy, -}; +use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -24,21 +19,75 @@ fn create_index() -> TempIndex { .add_documents(documents!([ { "id": 0, - "title": "the quick brown fox jumps over the lazy dog", - "description": "Pack my box with five dozen liquor jugs", - "plot": "How vexingly quick daft zebras jump", + "title": "", + "description": "", + "plot": "the quick brown fox jumps over the lazy dog", }, { "id": 1, - "title": "Pack my box with five dozen liquor jugs", + "title": "", "description": "the quick brown foxes jump over the lazy dog", - "plot": "How vexingly quick daft zebras jump", + "plot": "", }, { "id": 2, - "title": "How vexingly quick daft zebras jump", - "description": "Pack my box with five dozen liquor jugs", - "plot": "the quick brown fox jumps over the lazy dog", + "title": "the quick brown fox 
jumps over the lazy dog", + "description": "", + "plot": "", + }, + { + "id": 3, + "title": "the", + "description": "quick brown fox jumps over the lazy dog", + "plot": "", + }, + { + "id": 4, + "title": "the quick", + "description": "brown fox jumps over the lazy dog", + "plot": "", + }, + { + "id": 5, + "title": "the quick brown", + "description": "fox jumps over the lazy dog", + "plot": "", + }, + { + "id": 6, + "title": "the quick brown fox", + "description": "jumps over the lazy dog", + "plot": "", + }, + { + "id": 7, + "title": "the quick", + "description": "brown fox jumps", + "plot": "over the lazy dog", + }, + { + "id": 8, + "title": "the quick brown", + "description": "fox", + "plot": "jumps over the lazy dog", + }, + { + "id": 9, + "title": "the quick brown", + "description": "fox jumps", + "plot": "over the lazy dog", + }, + { + "id": 10, + "title": "", + "description": "the quick brown fox", + "plot": "jumps over the lazy dog", + }, + { + "id": 11, + "title": "the quick", + "description": "", + "plot": "brown fox jumps over the lazy dog", } ])) .unwrap(); @@ -46,13 +95,14 @@ fn create_index() -> TempIndex { } #[test] -fn test_attributes_are_ranked_correctly() { +fn test_attributes_simple() { let index = create_index(); + let txn = index.read_txn().unwrap(); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the quick brown fox"); + s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 6, 5, 4, 3, 9, 7, 8, 11, 10, 0]"); } From bd9aba4d7733af4a1ce2d3ac341b9a69421faf4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 13 Apr 2023 10:46:09 +0200 Subject: [PATCH 197/234] Add "position" part of the attribute ranking rule --- milli/src/search/new/db_cache.rs | 77 +++++++++++++++ .../search/new/graph_based_ranking_rule.rs | 14 ++- milli/src/search/new/logger/visual.rs | 45 ++++++--- milli/src/search/new/mod.rs | 5 +- .../{attribute => fid}/mod.rs | 12 +-- .../src/search/new/ranking_rule_graph/mod.rs | 7 +- .../new/ranking_rule_graph/position/mod.rs | 93 +++++++++++++++++++ milli/src/search/new/resolve_query_graph.rs | 35 +++++++ .../tests/{attribute.rs => attribute_fid.rs} | 2 +- .../search/new/tests/attribute_position.rs | 52 +++++++++++ milli/src/search/new/tests/mod.rs | 3 +- 11 files changed, 314 insertions(+), 31 deletions(-) rename milli/src/search/new/ranking_rule_graph/{attribute => fid}/mod.rs (90%) create mode 100644 milli/src/search/new/ranking_rule_graph/position/mod.rs rename milli/src/search/new/tests/{attribute.rs => attribute_fid.rs} (99%) create mode 100644 milli/src/search/new/tests/attribute_position.rs diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index cf5332700..cf851a313 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -34,6 +34,10 @@ pub struct DatabaseCache<'ctx> { pub words_fst: Option>>, pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_prefix_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_positions: FxHashMap, Vec>, + pub word_prefix_positions: FxHashMap, Vec>, + pub word_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, pub word_prefix_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, pub word_fids: FxHashMap, Vec>, @@ -356,4 
+360,77 @@ impl<'ctx> SearchContext<'ctx> { }; Ok(fids) } + + pub fn get_db_word_prefix_position_docids( + &mut self, + word_prefix: Interned, + position: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word_prefix, position), + &(self.word_interner.get(word_prefix).as_str(), position), + &mut self.db_cache.word_prefix_position_docids, + self.index.word_prefix_position_docids.remap_data_type::(), + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() + } + + pub fn get_db_word_positions(&mut self, word: Interned) -> Result> { + let positions = match self.db_cache.word_positions.entry(word) { + Entry::Occupied(positions) => positions.get().clone(), + Entry::Vacant(entry) => { + let mut key = self.word_interner.get(word).as_bytes().to_owned(); + key.push(0); + let mut positions = vec![]; + let remap_key_type = self + .index + .word_position_docids + .remap_types::() + .prefix_iter(self.txn, &key)? + .remap_key_type::(); + for result in remap_key_type { + let ((_, position), value) = result?; + // filling other caches to avoid searching for them again + self.db_cache.word_position_docids.insert((word, position), Some(value)); + positions.push(position); + } + entry.insert(positions.clone()); + positions + } + }; + Ok(positions) + } + + pub fn get_db_word_prefix_positions( + &mut self, + word_prefix: Interned, + ) -> Result> { + let positions = match self.db_cache.word_prefix_positions.entry(word_prefix) { + Entry::Occupied(positions) => positions.get().clone(), + Entry::Vacant(entry) => { + let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned(); + key.push(0); + let mut positions = vec![]; + let remap_key_type = self + .index + .word_prefix_position_docids + .remap_types::() + .prefix_iter(self.txn, &key)? 
+ .remap_key_type::(); + for result in remap_key_type { + let ((_, position), value) = result?; + // filling other caches to avoid searching for them again + self.db_cache + .word_prefix_position_docids + .insert((word_prefix, position), Some(value)); + positions.push(position); + } + entry.insert(positions.clone()); + positions + } + }; + Ok(positions) + } } diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 3ee16ed50..0d22b5b1e 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -44,7 +44,7 @@ use super::interner::{Interned, MappedInterner}; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - AttributeGraph, ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, + ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; @@ -59,10 +59,16 @@ impl GraphBasedRankingRule { Self::new_with_id("proximity".to_owned(), terms_matching_strategy) } } -pub type Attribute = GraphBasedRankingRule; -impl GraphBasedRankingRule { +pub type Fid = GraphBasedRankingRule; +impl GraphBasedRankingRule { pub fn new(terms_matching_strategy: Option) -> Self { - Self::new_with_id("attribute".to_owned(), terms_matching_strategy) + Self::new_with_id("fid".to_owned(), terms_matching_strategy) + } +} +pub type Position = GraphBasedRankingRule; +impl GraphBasedRankingRule { + pub fn new(terms_matching_strategy: Option) -> Self { + Self::new_with_id("position".to_owned(), terms_matching_strategy) } } pub type Typo = GraphBasedRankingRule; diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index 7834f7e46..1cbe007d3 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -11,8 +11,8 @@ use crate::search::new::interner::Interned; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::ranking_rule_graph::{ - AttributeCondition, AttributeGraph, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, - RankingRuleGraphTrait, TypoCondition, TypoGraph, + Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition, + ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph, }; use crate::search::new::ranking_rules::BoxRankingRule; use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger}; @@ -29,15 +29,18 @@ pub enum SearchEvents { ProximityPaths { paths: Vec>> }, TypoGraph { graph: RankingRuleGraph }, TypoPaths { paths: Vec>> }, - AttributeGraph { graph: RankingRuleGraph }, - AttributePaths { paths: Vec>> }, + FidGraph { graph: RankingRuleGraph }, + FidPaths { paths: Vec>> }, + PositionGraph { graph: RankingRuleGraph }, + PositionPaths { paths: Vec>> }, } enum Location { Words, Typo, Proximity, - Attribute, + Fid, + Position, Other, } @@ -84,7 +87,8 @@ impl SearchLogger for VisualSearchLogger { "words" => Location::Words, "typo" => Location::Typo, "proximity" => Location::Proximity, - "attribute" => Location::Attribute, + "fid" => Location::Fid, + "position" => Location::Position, _ => Location::Other, }); } @@ -156,13 +160,20 @@ impl SearchLogger for VisualSearchLogger { self.events.push(SearchEvents::ProximityPaths { paths: paths.clone() }); } } - 
Location::Attribute => { - if let Some(graph) = state.downcast_ref::>() { - self.events.push(SearchEvents::AttributeGraph { graph: graph.clone() }); + Location::Fid => { + if let Some(graph) = state.downcast_ref::>() { + self.events.push(SearchEvents::FidGraph { graph: graph.clone() }); } - if let Some(paths) = state.downcast_ref::>>>() - { - self.events.push(SearchEvents::AttributePaths { paths: paths.clone() }); + if let Some(paths) = state.downcast_ref::>>>() { + self.events.push(SearchEvents::FidPaths { paths: paths.clone() }); + } + } + Location::Position => { + if let Some(graph) = state.downcast_ref::>() { + self.events.push(SearchEvents::PositionGraph { graph: graph.clone() }); + } + if let Some(paths) = state.downcast_ref::>>>() { + self.events.push(SearchEvents::PositionPaths { paths: paths.clone() }); } } Location::Other => {} @@ -327,9 +338,13 @@ impl<'ctx> DetailedLoggerFinish<'ctx> { SearchEvents::TypoPaths { paths } => { self.write_rr_graph_paths::(paths)?; } - SearchEvents::AttributeGraph { graph } => self.write_rr_graph(&graph)?, - SearchEvents::AttributePaths { paths } => { - self.write_rr_graph_paths::(paths)?; + SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?, + SearchEvents::FidPaths { paths } => { + self.write_rr_graph_paths::(paths)?; + } + SearchEvents::PositionGraph { graph } => self.write_rr_graph(&graph)?, + SearchEvents::PositionPaths { paths } => { + self.write_rr_graph_paths::(paths)?; } } Ok(()) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 16eccb393..b691e00e3 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -28,7 +28,7 @@ use std::collections::HashSet; use bucket_sort::bucket_sort; use charabia::TokenizerBuilder; use db_cache::DatabaseCache; -use graph_based_ranking_rule::{Attribute, Proximity, Typo}; +use graph_based_ranking_rule::{Fid, Position, Proximity, Typo}; use heed::RoTxn; use interner::DedupInterner; pub use logger::visual::VisualSearchLogger; @@ -223,7 +223,8 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( continue; } attribute = true; - ranking_rules.push(Box::new(Attribute::new(None))); + ranking_rules.push(Box::new(Fid::new(None))); + ranking_rules.push(Box::new(Position::new(None))); } crate::Criterion::Sort => { if sort { diff --git a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs similarity index 90% rename from milli/src/search/new/ranking_rule_graph/attribute/mod.rs rename to milli/src/search/new/ranking_rule_graph/fid/mod.rs index a2981c604..0f2cceaec 100644 --- a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -9,22 +9,22 @@ use crate::search::new::SearchContext; use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] -pub struct AttributeCondition { +pub struct FidCondition { term: LocatedQueryTermSubset, fid: u16, } -pub enum AttributeGraph {} +pub enum FidGraph {} -impl RankingRuleGraphTrait for AttributeGraph { - type Condition = AttributeCondition; +impl RankingRuleGraphTrait for FidGraph { + type Condition = FidCondition; fn resolve_condition( ctx: &mut SearchContext, condition: &Self::Condition, universe: &RoaringBitmap, ) -> Result { - let AttributeCondition { term, .. } = condition; + let FidCondition { term, .. 
} = condition; // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument let mut docids = compute_query_term_subset_docids_within_field_id( ctx, @@ -73,7 +73,7 @@ impl RankingRuleGraphTrait for AttributeGraph { // the term subsets associated to each field ids fetched. edges.push(( fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. - conditions_interner.insert(AttributeCondition { + conditions_interner.insert(FidCondition { term: term.clone(), // TODO remove this ugly clone fid, }), diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index fe31029b4..db65afdd7 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -11,9 +11,11 @@ mod condition_docids_cache; mod dead_ends_cache; /// Implementation of the `attribute` ranking rule -mod attribute; +mod fid; /// Implementation of the `exactness` ranking rule mod exactness; +/// Implementation of the `position` ranking rule +mod position; /// Implementation of the `proximity` ranking rule mod proximity; /// Implementation of the `typo` ranking rule @@ -21,11 +23,12 @@ mod typo; use std::hash::Hash; -pub use attribute::{AttributeCondition, AttributeGraph}; +pub use fid::{FidCondition, FidGraph}; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; pub use exactness::{ExactnessCondition, ExactnessGraph}; +pub use position::{PositionCondition, PositionGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs new file mode 100644 index 000000000..81d013141 --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -0,0 +1,93 @@ +use fxhash::FxHashSet; +use roaring::RoaringBitmap; + +use super::{ComputedCondition, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned}; +use crate::search::new::query_term::LocatedQueryTermSubset; +use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_position; +use crate::search::new::SearchContext; +use crate::Result; + +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct PositionCondition { + term: LocatedQueryTermSubset, + position: u16, +} + +pub enum PositionGraph {} + +impl RankingRuleGraphTrait for PositionGraph { + type Condition = PositionCondition; + + fn resolve_condition( + ctx: &mut SearchContext, + condition: &Self::Condition, + universe: &RoaringBitmap, + ) -> Result { + let PositionCondition { term, .. } = condition; + // maybe compute_query_term_subset_docids_within_position_id should accept a universe as argument + let mut docids = compute_query_term_subset_docids_within_position( + ctx, + &term.term_subset, + condition.position, + )?; + docids &= universe; + + Ok(ComputedCondition { + docids, + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: term.clone(), + }) + } + + fn build_edges( + ctx: &mut SearchContext, + conditions_interner: &mut DedupInterner, + _from: Option<&LocatedQueryTermSubset>, + to_term: &LocatedQueryTermSubset, + ) -> Result)>> { + let term = to_term; + + let mut all_positions = FxHashSet::default(); + for word in term.term_subset.all_single_words_except_prefix_db(ctx)? 
{ + let positions = ctx.get_db_word_positions(word.interned())?; + all_positions.extend(positions); + } + + for phrase in term.term_subset.all_phrases(ctx)? { + for &word in phrase.words(ctx).iter().flatten() { + let positions = ctx.get_db_word_positions(word)?; + all_positions.extend(positions); + } + } + + if let Some(word_prefix) = term.term_subset.use_prefix_db(ctx) { + let positions = ctx.get_db_word_prefix_positions(word_prefix.interned())?; + all_positions.extend(positions); + } + + let mut edges = vec![]; + for position in all_positions { + let cost = { + let mut cost = 0; + for i in 0..term.term_ids.len() { + cost += position as u32 + i as u32; + } + cost + }; + + // TODO: We can improve performances and relevancy by storing + // the term subsets associated to each position fetched. + edges.push(( + cost, + conditions_interner.insert(PositionCondition { + term: term.clone(), // TODO remove this ugly clone + position, + }), + )); + } + + Ok(edges) + } +} diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index a125caa39..b8eb623bb 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -87,6 +87,41 @@ pub fn compute_query_term_subset_docids_within_field_id( Ok(docids) } +pub fn compute_query_term_subset_docids_within_position( + ctx: &mut SearchContext, + term: &QueryTermSubset, + position: u16, +) -> Result { + // TODO Use the roaring::MultiOps trait + + let mut docids = RoaringBitmap::new(); + for word in term.all_single_words_except_prefix_db(ctx)? { + if let Some(word_position_docids) = + ctx.get_db_word_position_docids(word.interned(), position)? + { + docids |= word_position_docids; + } + } + + for phrase in term.all_phrases(ctx)? { + for &word in phrase.words(ctx).iter().flatten() { + if let Some(word_position_docids) = ctx.get_db_word_position_docids(word, position)? { + docids |= word_position_docids; + } + } + } + + if let Some(word_prefix) = term.use_prefix_db(ctx) { + if let Some(word_position_docids) = + ctx.get_db_word_prefix_position_docids(word_prefix.interned(), position)? 
+ { + docids |= word_position_docids; + } + } + + Ok(docids) +} + pub fn compute_query_graph_docids( ctx: &mut SearchContext, q: &QueryGraph, diff --git a/milli/src/search/new/tests/attribute.rs b/milli/src/search/new/tests/attribute_fid.rs similarity index 99% rename from milli/src/search/new/tests/attribute.rs rename to milli/src/search/new/tests/attribute_fid.rs index b248f7953..ec7b7a69e 100644 --- a/milli/src/search/new/tests/attribute.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -95,7 +95,7 @@ fn create_index() -> TempIndex { } #[test] -fn test_attributes_simple() { +fn test_attribute_fid_simple() { let index = create_index(); let txn = index.read_txn().unwrap(); diff --git a/milli/src/search/new/tests/attribute_position.rs b/milli/src/search/new/tests/attribute_position.rs new file mode 100644 index 000000000..0eafedb97 --- /dev/null +++ b/milli/src/search/new/tests/attribute_position.rs @@ -0,0 +1,52 @@ +use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Attribute]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "do you know about the quick and talented brown fox", + }, + { + "id": 1, + "text": "do you know about the quick brown fox", + }, + { + "id": 2, + "text": "the quick and talented brown fox", + }, + { + "id": 3, + "text": "fox brown quick the", + }, + { + "id": 4, + "text": "the quick brown fox", + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_attribute_fid_simple() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0]"); +} diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 9d6d9e159..31b37933d 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -1,4 +1,5 @@ -pub mod attribute; +pub mod attribute_fid; +pub mod attribute_position; pub mod distinct; #[cfg(feature = "default")] pub mod language; From 84d9c731f889f76079c30187c646b194b5678ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 24 Apr 2023 09:59:30 +0200 Subject: [PATCH 198/234] Fix bug in encoding of word_position_docids and word_fid_docids --- milli/src/heed_codec/str_beu32_codec.rs | 8 +- milli/src/index.rs | 4 +- milli/src/search/new/db_cache.rs | 32 ++++---- .../search/new/tests/attribute_position.rs | 76 +++++++++++++++++-- .../extract/extract_word_fid_docids.rs | 1 + .../extract/extract_word_position_docids.rs | 1 + 6 files changed, 96 insertions(+), 26 deletions(-) diff --git a/milli/src/heed_codec/str_beu32_codec.rs b/milli/src/heed_codec/str_beu32_codec.rs index 17f3c996f..cce849e37 100644 --- a/milli/src/heed_codec/str_beu32_codec.rs +++ b/milli/src/heed_codec/str_beu32_codec.rs @@ -45,11 +45,12 @@ impl<'a> heed::BytesDecode<'a> for StrBEU16Codec { fn bytes_decode(bytes: &'a [u8]) -> Option { let footer_len = size_of::(); - if bytes.len() < footer_len { + if bytes.len() < footer_len + 1 { return None; } - let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len); + let (_, word) = word_plus_nul_byte.split_last()?; let word = str::from_utf8(word).ok()?; let pos = bytes.try_into().map(u16::from_be_bytes).ok()?; @@ -63,8 +64,9 @@ impl<'a> heed::BytesEncode<'a> for StrBEU16Codec { fn bytes_encode((word, pos): &Self::EItem) -> Option> { let pos = pos.to_be_bytes(); - let mut bytes = Vec::with_capacity(word.len() + pos.len()); + let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len()); bytes.extend_from_slice(word.as_bytes()); + bytes.push(0); bytes.extend_from_slice(&pos[..]); Some(Cow::Owned(bytes)) diff --git a/milli/src/index.rs b/milli/src/index.rs index a36868ef2..bfc75d296 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -126,9 +126,9 @@ pub struct Index { /// Maps the field id and the word count with the docids that corresponds to it. pub field_id_word_count_docids: Database, - /// Maps the position of a word prefix with all the docids where this prefix appears. + /// Maps the word prefix and a position with all the docids where the prefix appears at the position. pub word_prefix_position_docids: Database, - /// Maps the word and the field id with the docids that corresponds to it. + /// Maps the word prefix and a field id with all the docids where the prefix appears inside the field pub word_prefix_fid_docids: Database, /// Maps the script and language with all the docids that corresponds to it. 
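The nul-byte separator fixed above is easiest to see on a small example. The sketch below is a minimal standalone illustration of the fixed `StrBEU16Codec` key layout, not code from the patch: `encode` and `decode` are hypothetical free functions standing in for the real heed `BytesEncode`/`BytesDecode` impls.

// Layout: word bytes, a 0x00 separator, then the big-endian u16 (a fid or
// a position). The separator guarantees that a prefix iteration on b"dog\0"
// matches keys encoded for "dog" but never those encoded for "dogma".
fn encode(word: &str, n: u16) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(word.len() + 1 + 2);
    bytes.extend_from_slice(word.as_bytes());
    bytes.push(0);
    bytes.extend_from_slice(&n.to_be_bytes());
    bytes
}

fn decode(bytes: &[u8]) -> Option<(&str, u16)> {
    let split = bytes.len().checked_sub(2)?;
    let (word_plus_nul, n) = bytes.split_at(split);
    let (_nul, word) = word_plus_nul.split_last()?;
    Some((std::str::from_utf8(word).ok()?, u16::from_be_bytes(n.try_into().ok()?)))
}

// encode("dog", 3) == b"dog\x00\x00\x03", and decoding it yields ("dog", 3).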
diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index cf851a313..90c604d72 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -261,22 +261,6 @@ impl<'ctx> SearchContext<'ctx> { .transpose() } - pub fn get_db_word_position_docids( - &mut self, - word: Interned, - position: u16, - ) -> Result> { - DatabaseCache::get_value( - self.txn, - (word, position), - &(self.word_interner.get(word).as_str(), position), - &mut self.db_cache.word_position_docids, - self.index.word_position_docids.remap_data_type::(), - )? - .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() - } - pub fn get_db_word_fid_docids( &mut self, word: Interned, @@ -361,6 +345,22 @@ impl<'ctx> SearchContext<'ctx> { Ok(fids) } + pub fn get_db_word_position_docids( + &mut self, + word: Interned, + position: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, position), + &(self.word_interner.get(word).as_str(), position), + &mut self.db_cache.word_position_docids, + self.index.word_position_docids.remap_data_type::(), + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() + } + pub fn get_db_word_prefix_position_docids( &mut self, word_prefix: Interned, diff --git a/milli/src/search/new/tests/attribute_position.rs b/milli/src/search/new/tests/attribute_position.rs index 0eafedb97..e4ed8b5ff 100644 --- a/milli/src/search/new/tests/attribute_position.rs +++ b/milli/src/search/new/tests/attribute_position.rs @@ -1,4 +1,6 @@ -use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::{ + db_snap, index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy, +}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -6,7 +8,7 @@ fn create_index() -> TempIndex { index .update_settings(|s| { s.set_primary_key("id".to_owned()); - s.set_searchable_fields(vec!["text".to_owned()]); + s.set_searchable_fields(vec!["text".to_owned(), "other".to_owned()]); s.set_criteria(vec![Criterion::Attribute]); }) .unwrap(); @@ -33,20 +35,84 @@ fn create_index() -> TempIndex { "id": 4, "text": "the quick brown fox", }, + { + "id": 5, + "text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a 
a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + the quick brown fox", + }, + { + "id": 6, + "text": "quick a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + brown", + }, + { + "id": 7, + "text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + quick brown", + }, ])) .unwrap(); index } #[test] -fn test_attribute_fid_simple() { +fn test_attribute_position_simple() { + let index = create_index(); + + db_snap!(index, word_position_docids, @"fe86911166fa4c0903c512fd86ec65e4"); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("quick brown"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0, 6, 7, 5]"); +} +#[test] +fn test_attribute_position_repeated() { let index = create_index(); let txn = index.read_txn().unwrap(); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the quick brown fox"); + s.query("a a a a a"); let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 7, 6]"); } diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs index 72b30cddf..9ee33ea0d 100644 --- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs @@ -36,6 +36,7 @@ pub fn extract_word_fid_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); + key_buffer.push(0); let (fid, _) = relative_from_absolute_position(position); key_buffer.extend_from_slice(&fid.to_be_bytes()); word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 80a36c308..9bb43b004 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -39,6 +39,7 @@ pub fn extract_word_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); + key_buffer.push(0); let (_, position) = relative_from_absolute_position(position); let position = bucketed_position(position); key_buffer.extend_from_slice(&position.to_be_bytes()); From a7a08912103cbf9fe37f4eec0eb0f94ff63ea6a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 24 Apr 2023 10:07:49 +0200 Subject: [PATCH 199/234] Update examples --- milli/examples/index.rs | 18 ++++++++++++++---- milli/examples/settings.rs | 2 +- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/milli/examples/index.rs b/milli/examples/index.rs index 504763664..18f82797b 100644 --- a/milli/examples/index.rs +++ b/milli/examples/index.rs @@ -23,12 +23,14 @@ fn main() -> Result<(), Box> { let dataset_path = args .next() .unwrap_or_else(|| panic!("{}", usage("Missing path to source dataset.", &program_name))); - let primary_key = args.next().unwrap_or_else(|| "id".into()); + // let primary_key = args.next().unwrap_or_else(|| "id".into()); // "title overview" let searchable_fields: Vec = args .next() .map(|arg| arg.split_whitespace().map(ToString::to_string).collect()) .unwrap_or_default(); + + println!("{searchable_fields:?}"); // "release_date genres" let filterable_fields: Vec = args .next() @@ -44,17 +46,25 @@ fn main() -> Result<(), Box> { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(primary_key); + // builder.set_primary_key(primary_key); let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); builder.set_searchable_fields(searchable_fields); let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); builder.set_filterable_fields(filterable_fields); - builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]); + builder.set_criteria(vec![ + Criterion::Words, + Criterion::Typo, + Criterion::Proximity, + Criterion::Attribute, + ]); builder.execute(|_| (), || false).unwrap(); let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); + let mut indexing_config = IndexDocumentsConfig::default(); + + 
indexing_config.autogenerate_docids = true; + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); diff --git a/milli/examples/settings.rs b/milli/examples/settings.rs index a4ac3879f..bb24969cc 100644 --- a/milli/examples/settings.rs +++ b/milli/examples/settings.rs @@ -10,7 +10,7 @@ fn main() { let mut options = EnvOpenOptions::new(); options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - let index = Index::new(options, "data_movies").unwrap(); + let index = Index::new(options, "data_movies.ms").unwrap(); let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); From d1fdbb63daa5a4156db495499a1a5a5614726cb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 24 Apr 2023 12:11:25 +0200 Subject: [PATCH 200/234] Make all search tests pass, fix distinctAttribute bug --- milli/src/search/mod.rs | 104 ----- milli/src/search/new/bucket_sort.rs | 10 +- .../src/search/new/matches/matching_words.rs | 4 +- milli/src/search/new/matches/mod.rs | 4 +- milli/src/search/new/mod.rs | 8 +- .../new/ranking_rule_graph/position/mod.rs | 9 +- .../proximity/compute_docids.rs | 24 +- milli/src/search/new/resolve_query_graph.rs | 22 +- milli/src/update/index_documents/mod.rs | 3 - milli/tests/assets/test_set.ndjson | 401 +++++++++++++++++- milli/tests/search/distinct.rs | 4 +- milli/tests/search/filters.rs | 4 +- milli/tests/search/mod.rs | 16 +- milli/tests/search/phrase_search.rs | 1 - milli/tests/search/query_criteria.rs | 156 +------ milli/tests/search/sort.rs | 2 +- milli/tests/search/typo_tolerance.rs | 20 +- 17 files changed, 465 insertions(+), 327 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index c4dfdd6b3..a0bf272dd 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -28,7 +28,6 @@ pub struct Search<'a> { limit: usize, sort_criteria: Option>, terms_matching_strategy: TermsMatchingStrategy, - authorize_typos: bool, words_limit: usize, exhaustive_number_hits: bool, rtxn: &'a heed::RoTxn<'a>, @@ -44,7 +43,6 @@ impl<'a> Search<'a> { limit: 20, sort_criteria: None, terms_matching_strategy: TermsMatchingStrategy::default(), - authorize_typos: true, exhaustive_number_hits: false, words_limit: 10, rtxn, @@ -77,11 +75,6 @@ impl<'a> Search<'a> { self } - pub fn authorize_typos(&mut self, value: bool) -> &mut Search<'a> { - self.authorize_typos = value; - self - } - pub fn words_limit(&mut self, value: usize) -> &mut Search<'a> { self.words_limit = value; self @@ -99,13 +92,6 @@ impl<'a> Search<'a> { self } - // TODO! - fn _is_typo_authorized(&self) -> Result { - let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; - // only authorize typos if both the index and the query allow it. 
- Ok(self.authorize_typos && index_authorizes_typos) - } - pub fn execute(&self) -> Result { let mut ctx = SearchContext::new(self.index, self.rtxn); let PartialSearchResult { located_query_terms, candidates, documents_ids } = @@ -142,7 +128,6 @@ impl fmt::Debug for Search<'_> { limit, sort_criteria, terms_matching_strategy, - authorize_typos, words_limit, exhaustive_number_hits, rtxn: _, @@ -155,7 +140,6 @@ impl fmt::Debug for Search<'_> { .field("limit", limit) .field("sort_criteria", sort_criteria) .field("terms_matching_strategy", terms_matching_strategy) - .field("authorize_typos", authorize_typos) .field("exhaustive_number_hits", exhaustive_number_hits) .field("words_limit", words_limit) .finish() @@ -231,92 +215,4 @@ mod test { assert_eq!(documents_ids, vec![1]); } - - // #[test] - // fn test_is_authorized_typos() { - // let index = TempIndex::new(); - // let mut txn = index.write_txn().unwrap(); - - // let mut search = Search::new(&txn, &index); - - // // default is authorized - // assert!(search.is_typo_authorized().unwrap()); - - // search.authorize_typos(false); - // assert!(!search.is_typo_authorized().unwrap()); - - // index.put_authorize_typos(&mut txn, false).unwrap(); - // txn.commit().unwrap(); - - // let txn = index.read_txn().unwrap(); - // let mut search = Search::new(&txn, &index); - - // assert!(!search.is_typo_authorized().unwrap()); - - // search.authorize_typos(true); - // assert!(!search.is_typo_authorized().unwrap()); - // } - - // #[test] - // fn test_one_typos_tolerance() { - // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - // let mut cache = HashMap::new(); - // let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap(); - - // assert_eq!(found, &[("zealand".to_string(), 1)]); - // } - - // #[test] - // fn test_one_typos_first_letter() { - // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - // let mut cache = HashMap::new(); - // let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap(); - - // assert_eq!(found, &[]); - // } - - // #[test] - // fn test_two_typos_tolerance() { - // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - // let mut cache = HashMap::new(); - // let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap(); - - // assert_eq!(found, &[("zealand".to_string(), 2)]); - // } - - // #[test] - // fn test_two_typos_first_letter() { - // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - // let mut cache = HashMap::new(); - // let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap(); - - // assert_eq!(found, &[("zealand".to_string(), 2)]); - // } - - // #[test] - // fn test_prefix() { - // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - // let mut cache = HashMap::new(); - // let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap(); - - // assert_eq!(found, &[("zealand".to_string(), 0)]); - // } - - // #[test] - // fn test_bad_prefix() { - // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - // let mut cache = HashMap::new(); - // let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap(); - - // assert_eq!(found, &[]); - // } - - // #[test] - // fn test_prefix_with_typo() { - // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - // let mut cache = 
HashMap::new(); - // let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap(); - - // assert_eq!(found, &[("zealand".to_string(), 1)]); - // } } diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index 6413ff811..ec0116fae 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -88,7 +88,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( }; } - let mut all_candidates = RoaringBitmap::new(); + let mut all_candidates = universe.clone(); let mut valid_docids = vec![]; let mut cur_offset = 0usize; @@ -162,8 +162,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( )?; } - all_candidates |= &ranking_rule_universes[0]; - Ok(BucketSortOutput { docids: valid_docids, all_candidates }) } @@ -193,12 +191,14 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>( apply_distinct_rule(ctx, distinct_fid, &candidates)?; for universe in ranking_rule_universes.iter_mut() { *universe -= &excluded; + *all_candidates -= &excluded; } remaining } else { candidates.clone() }; *all_candidates |= &candidates; + // if the candidates are empty, there is nothing to do; if candidates.is_empty() { return Ok(()); @@ -216,8 +216,8 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>( ); } else { // otherwise, skip some of the documents and add some of the rest, in order of ids - let all_candidates = candidates.iter().collect::>(); - let (skipped_candidates, candidates) = all_candidates.split_at(from - *cur_offset); + let candidates_vec = candidates.iter().collect::>(); + let (skipped_candidates, candidates) = candidates_vec.split_at(from - *cur_offset); logger.skip_bucket_ranking_rule( cur_ranking_rule_index, diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 0da1b3a78..e9a728a01 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -243,7 +243,7 @@ pub(crate) mod tests { let temp_index = TempIndex::new(); temp_index .add_documents(documents!([ - { "id": 1, "name": "split this world westfali westfalia the" }, + { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" }, ])) .unwrap(); temp_index @@ -305,7 +305,7 @@ pub(crate) mod tests { ..Default::default() }) .next(), - Some(MatchType::Full { char_len: 5, ids: &(2..=2) }) + None ); assert_eq!( matching_words diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 84bdea7ab..0db2c3660 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -599,7 +599,7 @@ mod tests { // no crop should return complete text with highlighted matches. insta::assert_snapshot!( matcher.format(format_options), - @"Ŵôřlḑôle" + @"Ŵôřlḑôle" ); // Text containing unicode match. @@ -621,7 +621,7 @@ mod tests { // no crop should return complete text with highlighted matches. 
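// Editorial aside, a sketch rather than code from this patch: the snapshot
// tests here use multi-byte UTF-8 words on purpose. For such words a
// character count and a byte count diverge, which is presumably why the
// matcher tracks `char_len` (cf. `MatchType::Full { char_len: 5, .. }` in
// matching_words.rs above); byte-based highlight boundaries could split a
// code point. A self-contained check of the divergence:
let word = "Ŵôřlḑôle";
assert_eq!(word.chars().count(), 8); // characters
assert_eq!(word.len(), 14); // bytes, since the accented characters take 2 or 3 each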
insta::assert_snapshot!( matcher.format(format_options), - @"Westfália" + @"Westfália" ); } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index b691e00e3..9fe4f3aae 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -184,11 +184,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( for rr in settings_ranking_rules { // Add Words before any of: typo, proximity, attribute, exactness match rr { - crate::Criterion::Typo - | crate::Criterion::Attribute - | crate::Criterion::Proximity - // TODO: no exactness - | crate::Criterion::Exactness => { + crate::Criterion::Typo | crate::Criterion::Attribute | crate::Criterion::Proximity => { if !words { ranking_rules.push(Box::new(Words::new(terms_matching_strategy))); words = true; @@ -339,6 +335,8 @@ pub fn execute_search( check_sort_criteria(ctx, sort_criteria.as_ref())?; + // TODO: if the exactness criterion is the first one, then + // use a different strategy to find the universe (union of any term) universe = resolve_maximally_reduced_query_graph( ctx, &universe, diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index 81d013141..ef4880cfb 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -56,8 +56,13 @@ impl RankingRuleGraphTrait for PositionGraph { } for phrase in term.term_subset.all_phrases(ctx)? { - for &word in phrase.words(ctx).iter().flatten() { - let positions = ctx.get_db_word_positions(word)?; + // Only check the position of the first word in the phrase + // this is not correct, but it is the best we can do, since + // it is difficult/impossible to know the expected position + // of a word in a phrase. + // There is probably a more correct way to do it though. + if let Some(word) = phrase.words(ctx).iter().flatten().next() { + let positions = ctx.get_db_word_positions(*word)?; all_positions.extend(positions); } } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 3e75f948e..057779a22 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -79,11 +79,6 @@ pub fn compute_docids( // // This is an optimisation to avoid checking for an excessive number of // pairs. - // WAIT, NO. - // This should only be done once per node. - // Here, we'll potentially do is.. 16 times? - // Maybe we should do it at edge-build time instead. - // Same for the future attribute ranking rule. 
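// Editorial aside (a distilled sketch, not code from this patch): every
// branch below follows the same shape, namely intersect a candidate
// posting list with the current universe and accumulate the pair only
// when the intersection is non-empty. In isolation, with roaring:
fn accumulate(
    universe: &roaring::RoaringBitmap,
    pair_docids: &roaring::RoaringBitmap,
    acc: &mut roaring::RoaringBitmap,
) {
    let filtered = universe & pair_docids;
    if !filtered.is_empty() {
        *acc |= filtered;
    }
}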
let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?; if right_derivs.len() > 1 { let universe = &universe; @@ -190,11 +185,6 @@ fn compute_non_prefix_edges( docids: &mut RoaringBitmap, universe: &RoaringBitmap, ) -> Result<()> { - let mut used_left_phrases = BTreeSet::new(); - let mut used_right_phrases = BTreeSet::new(); - let mut used_left_words = BTreeSet::new(); - let mut used_right_words = BTreeSet::new(); - let mut universe = universe.clone(); for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() { @@ -204,25 +194,19 @@ fn compute_non_prefix_edges( return Ok(()); } } - if let Some(left_phrase) = left_phrase { - used_left_phrases.insert(left_phrase); - } - if let Some(right_phrase) = right_phrase { - used_right_phrases.insert(right_phrase); - } if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)? { let new_docids = &universe & new_docids; if !new_docids.is_empty() { - used_left_words.insert(word1); - used_right_words.insert(word2); *docids |= new_docids; } } if backward_proximity >= 1 - // no swapping when either term is a phrase + // TODO: for now, we don't do any swapping when either term is a phrase + // but maybe we should. We'd need to look at the first/last word of the phrase + // depending on the context. && left_phrase.is_none() && right_phrase.is_none() { if let Some(new_docids) = @@ -230,8 +214,6 @@ fn compute_non_prefix_edges( { let new_docids = &universe & new_docids; if !new_docids.is_empty() { - used_left_words.insert(word2); - used_right_words.insert(word1); *docids |= new_docids; } } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index b8eb623bb..34ed135d4 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -69,11 +69,16 @@ pub fn compute_query_term_subset_docids_within_field_id( } for phrase in term.all_phrases(ctx)? { - for &word in phrase.words(ctx).iter().flatten() { - if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word, fid)? { - docids |= word_fid_docids; + let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone(); + // There may be false positives when resolving a phrase, so we're not + // guaranteed that all of its words are within a single fid. + // TODO: fix this? + if let Some(word) = phrase.words(ctx).iter().flatten().next() { + if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? { + phrase_docids &= word_fid_docids; } } + docids |= phrase_docids; } if let Some(word_prefix) = term.use_prefix_db(ctx) { @@ -104,11 +109,16 @@ pub fn compute_query_term_subset_docids_within_position( } for phrase in term.all_phrases(ctx)? { - for &word in phrase.words(ctx).iter().flatten() { - if let Some(word_position_docids) = ctx.get_db_word_position_docids(word, position)? { - docids |= word_position_docids; + let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone(); + // It's difficult to know the expected position of the words in the phrase, + // so instead we just check the first one. + // TODO: fix this? + if let Some(word) = phrase.words(ctx).iter().flatten().next() { + if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? 
{ + phrase_docids &= word_position_docids; } } + docids |= phrase_docids; } if let Some(word_prefix) = term.use_prefix_db(ctx) { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 235b35fc8..9d60d59ca 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1229,7 +1229,6 @@ mod tests { // testing the simple query search let mut search = crate::Search::new(&rtxn, &index); search.query("document"); - search.authorize_typos(true); search.terms_matching_strategy(TermsMatchingStrategy::default()); // all documents should be returned let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); @@ -1335,7 +1334,6 @@ mod tests { // testing the simple query search let mut search = crate::Search::new(&rtxn, &index); search.query("document"); - search.authorize_typos(true); search.terms_matching_strategy(TermsMatchingStrategy::default()); // all documents should be returned let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); @@ -1582,7 +1580,6 @@ mod tests { let mut search = crate::Search::new(&rtxn, &index); search.query("化妆包"); - search.authorize_typos(true); search.terms_matching_strategy(TermsMatchingStrategy::default()); // only 1 document should be returned diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index a0ee9646b..60ee48dd2 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,384 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"etiopia","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null], "tag_in": 1} -{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"fehérorosz","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": [], "tag_in": 2} -{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"etiopia","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null, "tag_in": 3} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"fehérorosz","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4, "tag_in": "four"} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"észak-korea","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E", "tag_in": "five"} 
-{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"etiopia","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"], "tag_in": null} -{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"fehérorosz","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"észak-korea","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8], "tag_in": 8} -{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"etiopia","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":"", "tag_in": "nine"} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"észak-korea","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}, "tag_in": 10} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"fehérorosz","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] , "tag_in": "eleven"} -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"etiopia","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}, "tag_in": 12} -{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"fehérorosz","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a 
very good engine using the query hello world america","tag":"észak-korea","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}}} -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"etiopia","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]]} -{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"fehérorosz","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16} -{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"észak-korea","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""} +{ + "id": "A", + "word_rank": 0, + "typo_rank": 2, + "proximity_rank": 16, + "attribute_rank": 224, + "exact_rank": 6, + "asc_desc_rank": 0, + "sort_by_rank": 0, + "geo_rank": 43, + "title": "hell o", + "description": "hell o is the fourteenth episode of the american television series glee performing songs with this word", + "tag": "etiopia", + "_geo": { + "lat": 50.62984446145472, + "lng": 3.085712705162039 + }, + "": "", + "opt1": [ + null + ], + "tag_in": 1 +} +{ + "id": "B", + "word_rank": 2, + "typo_rank": 0, + "proximity_rank": 0, + "attribute_rank": 0, + "exact_rank": 0, + "asc_desc_rank": 1, + "sort_by_rank": 2, + "geo_rank": 191, + "title": "hello", + "description": "hello is a song recorded by english singer songwriter adele", + "tag": "fehérorosz", + "_geo": { + "lat": 50.63047567664291, + "lng": 3.088852230809636 + }, + "": "", + "opt1": [], + "tag_in": 2 +} +{ + "id": "C", + "word_rank": 0, + "typo_rank": 1, + "proximity_rank": 10, + "attribute_rank": 111, + "exact_rank": 6, + "asc_desc_rank": 2, + "sort_by_rank": 0, + "geo_rank": 283, + "title": "hell on earth", + "description": "hell on earth is the third studio album by american hip hop duo mobb deep", + "tag": "etiopia", + "_geo": { + "lat": 50.6321800003937, + "lng": 3.088331882262139 + }, + "": "", + "opt1": null, + "tag_in": 3 +} +{ + "id": "D", + "word_rank": 0, + "typo_rank": 1, + "proximity_rank": 16, + "attribute_rank": 213, + "exact_rank": 5, + "asc_desc_rank": 3, + "sort_by_rank": 2, + "geo_rank": 1381, + "title": "hell on wheels tv series", + "description": "the construction of the first transcontinental railroad across the united states in the world", + "tag": "fehérorosz", + "_geo": { + "lat": 50.63728851135729, + "lng": 3.0703951595971626 + }, + "": "", + "opt1": 4, + "tag_in": "four" +} +{ + "id": "E", + "word_rank": 2, + "typo_rank": 0, + "proximity_rank": 0, + "attribute_rank": 0, + "exact_rank": 1, + "asc_desc_rank": 4, + "sort_by_rank": 1, + "geo_rank": 1979, + "title": "hello kitty", + "description": "also known by her full name kitty white is a fictional character produced by the japanese company sanrio", + "tag": "észak-korea", + "_geo": { + "lat": 50.64264610511925, + "lng": 3.0665099941857634 + }, + "": "", 
+ "opt1": "E", + "tag_in": "five" +} +{ + "id": "F", + "word_rank": 2, + "typo_rank": 1, + "proximity_rank": 0, + "attribute_rank": 116, + "exact_rank": 5, + "asc_desc_rank": 5, + "sort_by_rank": 0, + "geo_rank": 65022, + "title": "laptop orchestra", + "description": "a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra", + "tag": "etiopia", + "_geo": { + "lat": 51.05028653642387, + "lng": 3.7301072771642096 + }, + "": "", + "opt1": [ + "F" + ], + "tag_in": null +} +{ + "id": "G", + "word_rank": 1, + "typo_rank": 0, + "proximity_rank": 0, + "attribute_rank": 1, + "exact_rank": 3, + "asc_desc_rank": 5, + "sort_by_rank": 2, + "geo_rank": 34692, + "title": "hello world film", + "description": "hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica", + "tag": "fehérorosz", + "_geo": { + "lat": 50.78776041427129, + "lng": 2.661201766290338 + }, + "": "", + "opt1": [ + 7 + ] +} +{ + "id": "H", + "word_rank": 1, + "typo_rank": 0, + "proximity_rank": 1, + "attribute_rank": 1, + "exact_rank": 3, + "asc_desc_rank": 4, + "sort_by_rank": 1, + "geo_rank": 202182, + "title": "world hello day", + "description": "holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force", + "tag": "észak-korea", + "_geo": { + "lat": 48.875617484531965, + "lng": 2.346747821504194 + }, + "": "", + "opt1": [ + "H", + 8 + ], + "tag_in": 8 +} +{ + "id": "I", + "word_rank": 0, + "typo_rank": 0, + "proximity_rank": 9, + "attribute_rank": 125, + "exact_rank": 3, + "asc_desc_rank": 3, + "sort_by_rank": 0, + "geo_rank": 740667, + "title": "hello world song", + "description": "hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum", + "tag": "etiopia", + "_geo": { + "lat": 43.973998070351065, + "lng": 3.4661837318345032 + }, + "": "", + "tag_in": "nine" +} +{ + "id": "J", + "word_rank": 1, + "typo_rank": 0, + "proximity_rank": 1, + "attribute_rank": 2, + "exact_rank": 3, + "asc_desc_rank": 2, + "sort_by_rank": 1, + "geo_rank": 739020, + "title": "hello cruel world", + "description": "hello cruel world is an album by new zealand band tall dwarfs", + "tag": "észak-korea", + "_geo": { + "lat": 43.98920130353838, + "lng": 3.480519311627928 + }, + "": "", + "opt1": {}, + "tag_in": 10 +} +{ + "id": "K", + "word_rank": 0, + "typo_rank": 2, + "proximity_rank": 10, + "attribute_rank": 213, + "exact_rank": 6, + "asc_desc_rank": 1, + "sort_by_rank": 2, + "geo_rank": 738830, + "title": "hallo creation system", + "description": "in few word hallo was a construction toy created by the american company mattel to engage girls in construction play", + "tag": "fehérorosz", + "_geo": { + "lat": 43.99155030238669, + "lng": 3.503453528249425 + }, + "": "", + "opt1": [ + { + "opt2": 11 + } + ], + "tag_in": "eleven" +} +{ + "id": "L", + "word_rank": 0, + "typo_rank": 0, + "proximity_rank": 2, + "attribute_rank": 107, + "exact_rank": 5, + "asc_desc_rank": 0, + "sort_by_rank": 0, + "geo_rank": 737861, + "title": "good morning world", + "description": "good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season", + "tag": "etiopia", + "_geo": { + "lat": 44.000507750283695, + "lng": 3.5116812040621572 + }, + "": "", + "opt1": { + "opt2": [ + 12 + ] + }, + "tag_in": 12 +} +{ + "id": "M", + "word_rank": 0, + "typo_rank": 0, + 
"proximity_rank": 0, + "attribute_rank": 3, + "exact_rank": 0, + "asc_desc_rank": 0, + "sort_by_rank": 2, + "geo_rank": 739203, + "title": "hello world america", + "description": "a perfect match for a perfect engine using the query hello world america", + "tag": "fehérorosz", + "_geo": { + "lat": 43.99150729038736, + "lng": 3.606143957295055 + }, + "": "", + "opt1": [ + 13, + [ + { + "opt2": null + } + ] + ] +} +{ + "id": "N", + "word_rank": 0, + "typo_rank": 0, + "proximity_rank": 0, + "attribute_rank": 3, + "exact_rank": 1, + "asc_desc_rank": 4, + "sort_by_rank": 1, + "geo_rank": 9499586, + "title": "hello world america unleashed", + "description": "a very good match for a very good engine using the query hello world america", + "tag": "észak-korea", + "_geo": { + "lat": 35.511540843367115, + "lng": 138.764368875787 + }, + "": "", + "opt1": { + "a": 1, + "opt2": { + "opt3": 14 + } + } +} +{ + "id": "O", + "word_rank": 0, + "typo_rank": 0, + "proximity_rank": 0, + "attribute_rank": 3, + "exact_rank": 0, + "asc_desc_rank": 6, + "sort_by_rank": 0, + "geo_rank": 9425163, + "title": "a perfect match for a perfect engine using the query hello world america", + "description": "hello world america", + "tag": "etiopia", + "_geo": { + "lat": 35.00536702277189, + "lng": 135.76118763940391 + }, + "": "", + "opt1": [ + [ + [ + [] + ] + ] + ] +} +{ + "id": "P", + "word_rank": 0, + "typo_rank": 0, + "proximity_rank": 0, + "attribute_rank": 3, + "exact_rank": 1, + "asc_desc_rank": 3, + "sort_by_rank": 2, + "geo_rank": 9422437, + "title": "a very good match for a very good engine using the query hello world america", + "description": "hello world america unleashed", + "tag": "fehérorosz", + "_geo": { + "lat": 35.06462306367058, + "lng": 135.8338440354251 + }, + "": "", + "opt1.opt2": 16 +} +{ + "id": "Q", + "word_rank": 1, + "typo_rank": 0, + "proximity_rank": 0, + "attribute_rank": 1, + "exact_rank": 2, + "asc_desc_rank": 2, + "sort_by_rank": 1, + "geo_rank": 9339230, + "title": "hello world", + "description": "a hello world program generally is a computer program that outputs or displays the message hello world", + "tag": "észak-korea", + "_geo": { + "lat": 34.39548365683149, + "lng": 132.4535960928883 + }, + "": "" +} \ No newline at end of file diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index 3c6dd8cc0..d8291ee30 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -28,7 +28,7 @@ macro_rules! test_distinct { search.query(search::TEST_QUERY); search.limit($limit); search.exhaustive_number_hits($exhaustive); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let SearchResult { documents_ids, candidates, .. } = search.execute().unwrap(); @@ -37,7 +37,7 @@ macro_rules! test_distinct { let mut distinct_values = HashSet::new(); let expected_external_ids: Vec<_> = - search::expected_order(&criteria, true, TermsMatchingStrategy::default(), &[]) + search::expected_order(&criteria, TermsMatchingStrategy::default(), &[]) .into_iter() .filter_map(|d| { if distinct_values.contains(&d.$distinct) { diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 18de24ac3..192abec5f 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -18,7 +18,7 @@ macro_rules! 
test_filter { let mut search = Search::new(&rtxn, &index); search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); search.filter(filter_conditions); @@ -26,7 +26,7 @@ macro_rules! test_filter { let filtered_ids = search::expected_filtered_ids($filter); let expected_external_ids: Vec<_> = - search::expected_order(&criteria, true, TermsMatchingStrategy::default(), &[]) + search::expected_order(&criteria, TermsMatchingStrategy::default(), &[]) .into_iter() .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None }) .collect(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 18c74e344..ba0fa0558 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,7 +61,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { // index documents let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; - let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let indexing_config = IndexDocumentsConfig::default(); let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); @@ -96,7 +96,6 @@ pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> V pub fn expected_order( criteria: &[Criterion], - authorize_typo: bool, optional_words: TermsMatchingStrategy, sort_by: &[AscDesc], ) -> Vec { @@ -156,14 +155,11 @@ pub fn expected_order( groups = std::mem::take(&mut new_groups); } - if authorize_typo && optional_words == TermsMatchingStrategy::default() { - groups.into_iter().flatten().collect() - } else if optional_words == TermsMatchingStrategy::default() { - groups.into_iter().flatten().filter(|d| d.typo_rank == 0).collect() - } else if authorize_typo { - groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect() - } else { - groups.into_iter().flatten().filter(|d| d.word_rank == 0 && d.typo_rank == 0).collect() + match optional_words { + TermsMatchingStrategy::Last => groups.into_iter().flatten().collect(), + TermsMatchingStrategy::All => { + groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect() + } } } diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs index 5b987ad30..b7f792bfc 100644 --- a/milli/tests/search/phrase_search.rs +++ b/milli/tests/search/phrase_search.rs @@ -26,7 +26,6 @@ fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) { let mut search = Search::new(&txn, &index); search.query("\"the use of force\""); search.limit(10); - search.authorize_typos(false); search.terms_matching_strategy(TermsMatchingStrategy::All); let result = search.execute().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index b26752550..65d403097 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -13,14 +13,12 @@ use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; -const ALLOW_TYPOS: bool = true; -const DISALLOW_TYPOS: bool = false; const ALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::Last; const DISALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::All; const ASC_DESC_CANDIDATES_THRESHOLD: usize = 1000; macro_rules! 
test_criterion { - ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr, $sort_criteria:expr) => { + ($func:ident, $optional_word:ident, $criteria:expr, $sort_criteria:expr) => { #[test] fn $func() { let criteria = $criteria; @@ -30,169 +28,60 @@ macro_rules! test_criterion { let mut search = Search::new(&rtxn, &index); search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.authorize_typos($authorize_typos); search.terms_matching_strategy($optional_word); search.sort_criteria($sort_criteria); let SearchResult { documents_ids, .. } = search.execute().unwrap(); - let expected_external_ids: Vec<_> = search::expected_order( - &criteria, - $authorize_typos, - $optional_word, - &$sort_criteria[..], - ) - .into_iter() - .map(|d| d.id) - .collect(); + let expected_external_ids: Vec<_> = + search::expected_order(&criteria, $optional_word, &$sort_criteria[..]) + .into_iter() + .map(|d| d.id) + .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); assert_eq!(documents_ids, expected_external_ids); } }; } -test_criterion!(none_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![], vec![]); -test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![], vec![]); -test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words], vec![]); +test_criterion!(none, DISALLOW_OPTIONAL_WORDS, vec![], vec![]); +test_criterion!(words, ALLOW_OPTIONAL_WORDS, vec![Words], vec![]); +test_criterion!(attribute, DISALLOW_OPTIONAL_WORDS, vec![Attribute], vec![]); +test_criterion!(typo, DISALLOW_OPTIONAL_WORDS, vec![Typo], vec![]); +test_criterion!(exactness, DISALLOW_OPTIONAL_WORDS, vec![Exactness], vec![]); +test_criterion!(proximity, DISALLOW_OPTIONAL_WORDS, vec![Proximity], vec![]); +test_criterion!(asc, DISALLOW_OPTIONAL_WORDS, vec![Asc(S("asc_desc_rank"))], vec![]); +test_criterion!(desc, DISALLOW_OPTIONAL_WORDS, vec![Desc(S("asc_desc_rank"))], vec![]); test_criterion!( - attribute_allow_typo, + asc_unexisting_field, DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Attribute], - vec![] -); -test_criterion!(typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Typo], vec![]); -test_criterion!( - attribute_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Attribute], - vec![] -); -test_criterion!( - exactness_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Exactness], - vec![] -); -test_criterion!( - exactness_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Exactness], - vec![] -); -test_criterion!( - proximity_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Proximity], - vec![] -); -test_criterion!( - proximity_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Proximity], - vec![] -); -test_criterion!( - asc_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Asc(S("asc_desc_rank"))], - vec![] -); -test_criterion!( - asc_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Asc(S("asc_desc_rank"))], - vec![] -); -test_criterion!( - desc_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Desc(S("asc_desc_rank"))], - vec![] -); -test_criterion!( - desc_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Desc(S("asc_desc_rank"))], - vec![] -); -test_criterion!( - asc_unexisting_field_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, vec![Asc(S("unexisting_field"))], vec![] ); + test_criterion!( - asc_unexisting_field_disallow_typo, + desc_unexisting_field, 
DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Asc(S("unexisting_field"))], - vec![] -); -test_criterion!( - desc_unexisting_field_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, vec![Desc(S("unexisting_field"))], vec![] ); + +test_criterion!(empty_sort_by, DISALLOW_OPTIONAL_WORDS, vec![Sort], vec![]); test_criterion!( - desc_unexisting_field_disallow_typo, + sort_by_asc, DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Desc(S("unexisting_field"))], - vec![] -); -test_criterion!(empty_sort_by_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Sort], vec![]); -test_criterion!( - empty_sort_by_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Sort], - vec![] -); -test_criterion!( - sort_by_asc_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, vec![Sort], vec![AscDesc::Asc(Member::Field(S("tag")))] ); test_criterion!( - sort_by_asc_disallow_typo, + sort_by_desc, DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Sort], - vec![AscDesc::Asc(Member::Field(S("tag")))] -); -test_criterion!( - sort_by_desc_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Sort], - vec![AscDesc::Desc(Member::Field(S("tag")))] -); -test_criterion!( - sort_by_desc_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, vec![Sort], vec![AscDesc::Desc(Member::Field(S("tag")))] ); test_criterion!( default_criteria_order, ALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, vec![Words, Typo, Proximity, Attribute, Exactness], vec![] ); @@ -354,12 +243,11 @@ fn criteria_mixup() { search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.terms_matching_strategy(ALLOW_OPTIONAL_WORDS); - search.authorize_typos(ALLOW_TYPOS); let SearchResult { documents_ids, .. } = search.execute().unwrap(); let expected_external_ids: Vec<_> = - search::expected_order(&criteria, ALLOW_TYPOS, ALLOW_OPTIONAL_WORDS, &[]) + search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, &[]) .into_iter() .map(|d| d.id) .collect(); diff --git a/milli/tests/search/sort.rs b/milli/tests/search/sort.rs index 16d21eac8..34f3eefe9 100644 --- a/milli/tests/search/sort.rs +++ b/milli/tests/search/sort.rs @@ -14,7 +14,7 @@ fn sort_ranking_rule_missing() { let mut search = Search::new(&rtxn, &index); search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); search.sort_criteria(vec![AscDesc::Asc(Member::Field(S("tag")))]); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index c939186e5..863d2758a 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -19,7 +19,7 @@ fn test_typo_tolerance_one_typo() { let mut search = Search::new(&txn, &index); search.query("zeal"); search.limit(10); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); @@ -28,7 +28,7 @@ fn test_typo_tolerance_one_typo() { let mut search = Search::new(&txn, &index); search.query("zean"); search.limit(10); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); @@ -46,7 +46,7 @@ fn test_typo_tolerance_one_typo() { let mut search = Search::new(&txn, &index); search.query("zean"); search.limit(10); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); @@ -65,7 +65,7 @@ fn 
test_typo_tolerance_two_typo() { let mut search = Search::new(&txn, &index); search.query("zealand"); search.limit(10); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); @@ -74,7 +74,7 @@ fn test_typo_tolerance_two_typo() { let mut search = Search::new(&txn, &index); search.query("zealemd"); search.limit(10); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); @@ -92,7 +92,7 @@ fn test_typo_tolerance_two_typo() { let mut search = Search::new(&txn, &index); search.query("zealemd"); search.limit(10); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); @@ -142,7 +142,7 @@ fn test_typo_disabled_on_word() { let mut search = Search::new(&txn, &index); search.query("zealand"); search.limit(10); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); @@ -162,7 +162,7 @@ fn test_typo_disabled_on_word() { let mut search = Search::new(&txn, &index); search.query("zealand"); search.limit(10); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); @@ -182,7 +182,7 @@ fn test_disable_typo_on_attribute() { // typo in `antebel(l)um` search.query("antebelum"); search.limit(10); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); @@ -200,7 +200,7 @@ fn test_disable_typo_on_attribute() { let mut search = Search::new(&txn, &index); search.query("antebelum"); search.limit(10); - search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); From 8f2e9718799901b69f9ed0983c40f53428b73f9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 24 Apr 2023 16:57:12 +0200 Subject: [PATCH 201/234] Add tests for "exactness" rr, make correct universe computation --- milli/src/search/new/mod.rs | 99 +++++- milli/src/search/new/tests/exactness.rs | 442 ++++++++++++++++++++++++ milli/src/search/new/tests/mod.rs | 1 + 3 files changed, 533 insertions(+), 9 deletions(-) create mode 100644 milli/src/search/new/tests/exactness.rs diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 9fe4f3aae..fb96af5e7 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -51,6 +51,7 @@ use resolve_query_graph::compute_query_graph_docids; use sort::Sort; use self::interner::Interned; +use self::query_term::ExactTerm; /// A structure used throughout the execution of a search query. 
pub struct SearchContext<'ctx> { @@ -119,6 +120,75 @@ fn resolve_maximally_reduced_query_graph( Ok(docids) } +fn resolve_docids_containing_any_exact_word( + ctx: &mut SearchContext, + universe: &RoaringBitmap, + query_graph: &QueryGraph, +) -> Result { + let mut docids = RoaringBitmap::new(); + for (_, node) in query_graph.nodes.iter() { + let term = match &node.data { + query_graph::QueryNodeData::Term(term) => term, + query_graph::QueryNodeData::Deleted + | query_graph::QueryNodeData::Start + | query_graph::QueryNodeData::End => { + continue; + } + }; + if term.term_ids.len() != 1 { + continue; + } + let Some(exact_term) = term.term_subset.exact_term(ctx) else { + continue + }; + let exact_term_docids = match exact_term { + ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)? & universe, + ExactTerm::Word(word) => { + if let Some(word_docids) = ctx.word_docids(Word::Original(word))? { + word_docids & universe + } else { + continue; + } + } + }; + docids |= exact_term_docids; + } + Ok(docids) +} + +fn resolve_universe( + ctx: &mut SearchContext, + initial_universe: &RoaringBitmap, + query_graph: &QueryGraph, + method: UniverseResolutionMethod, + matching_strategy: TermsMatchingStrategy, + logger: &mut dyn SearchLogger, +) -> Result { + match method { + UniverseResolutionMethod::TermMatchingStrategyOnly => { + resolve_maximally_reduced_query_graph( + ctx, + initial_universe, + query_graph, + matching_strategy, + logger, + ) + } + UniverseResolutionMethod::TermMatchingStrategyAndExactness => { + let mut resolved_universe = resolve_maximally_reduced_query_graph( + ctx, + initial_universe, + query_graph, + matching_strategy, + logger, + )?; + resolved_universe |= + resolve_docids_containing_any_exact_word(ctx, initial_universe, query_graph)?; + Ok(resolved_universe) + } + } +} + /// Return the list of initialised ranking rules to be used for a placeholder search. fn get_ranking_rules_for_placeholder_search<'ctx>( ctx: &SearchContext<'ctx>, @@ -163,12 +233,17 @@ fn get_ranking_rules_for_placeholder_search<'ctx>( Ok(ranking_rules) } +enum UniverseResolutionMethod { + TermMatchingStrategyOnly, + TermMatchingStrategyAndExactness, +} + /// Return the list of initialised ranking rules to be used for a query graph search. 
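// Editorial aside (sketch under assumptions, not code from this patch): the
// accumulation performed by `resolve_docids_containing_any_exact_word` above
// boils down to the following, with interning and phrase handling stripped
// out; `exact_word_docids`, a precomputed posting list per exact query word,
// is a hypothetical input:
fn docids_with_any_exact_word(
    exact_word_docids: &[roaring::RoaringBitmap],
    universe: &roaring::RoaringBitmap,
) -> roaring::RoaringBitmap {
    let mut docids = roaring::RoaringBitmap::new();
    for word_docids in exact_word_docids {
        docids |= universe & word_docids;
    }
    docids
}
// Unioned with the docids resolved through the term matching strategy, this
// yields the universe used when `exactness` is ranked before `words`, which
// is the candidate-set property spelled out in the exactness test module
// below.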
fn get_ranking_rules_for_query_graph_search<'ctx>( ctx: &SearchContext<'ctx>, sort_criteria: &Option>, terms_matching_strategy: TermsMatchingStrategy, -) -> Result>> { +) -> Result<(Vec>, UniverseResolutionMethod)> { // query graph search let mut words = false; let mut typo = false; @@ -179,10 +254,12 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut asc = HashSet::new(); let mut desc = HashSet::new(); + let mut universe_resolution_method = UniverseResolutionMethod::TermMatchingStrategyOnly; + let mut ranking_rules: Vec> = vec![]; let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; for rr in settings_ranking_rules { - // Add Words before any of: typo, proximity, attribute, exactness + // Add Words before any of: typo, proximity, attribute match rr { crate::Criterion::Typo | crate::Criterion::Attribute | crate::Criterion::Proximity => { if !words { @@ -236,6 +313,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( ranking_rules.push(Box::new(ExactAttribute::new())); ranking_rules.push(Box::new(Exactness::new())); exactness = true; + + if !words { + universe_resolution_method = + UniverseResolutionMethod::TermMatchingStrategyAndExactness; + } } crate::Criterion::Asc(field_name) => { if asc.contains(&field_name) { @@ -253,7 +335,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( } } } - Ok(ranking_rules) + Ok((ranking_rules, universe_resolution_method)) } fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( @@ -335,19 +417,18 @@ pub fn execute_search( check_sort_criteria(ctx, sort_criteria.as_ref())?; - // TODO: if the exactness criterion is the first one, then - // use a different strategy to find the universe (union of any term) - universe = resolve_maximally_reduced_query_graph( + let (ranking_rules, universe_resolution_method) = + get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; + + universe = resolve_universe( ctx, &universe, &graph, + universe_resolution_method, terms_matching_strategy, query_graph_logger, )?; - let ranking_rules = - get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; - bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? } else { let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?; diff --git a/milli/src/search/new/tests/exactness.rs b/milli/src/search/new/tests/exactness.rs new file mode 100644 index 000000000..f1f4fbe40 --- /dev/null +++ b/milli/src/search/new/tests/exactness.rs @@ -0,0 +1,442 @@ +/*! +This module tests the following properties about the exactness ranking rule: + +- it sorts documents as follows: + 1. documents which have an attribute which is equal to the whole query + 2. documents which have an attribute which start with the whole query + 3. documents which contain the most exact words from the query + +- the set of all candidates when `exactness` precedes `word` is the union of: + 1. the same set of candidates that would be returned normally + 2. the set of documents that contain at least one exact word from the query + +- if it is placed after `word`, then it will only sort documents by: + 1. those that have an attribute which is equal to the whole remaining query, if this query does not have any "gap" + 2. those that have an attribute which start with the whole remaining query, if this query does not have any "gap" + 3. 
those that contain the most exact words from the remaining query +*/ + +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; + +fn create_index_exact_words_simple_ordered() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "", + }, + { + "id": 1, + "text": "the", + }, + { + "id": 2, + "text": "the quick", + }, + { + "id": 3, + "text": "the quick brown", + }, + { + "id": 4, + "text": "the quick brown fox", + }, + { + "id": 5, + "text": "the quick brown fox jumps", + }, + + { + "id": 6, + "text": "the quick brown fox jumps over", + }, + { + "id": 7, + "text": "the quick brown fox jumps over the", + }, + { + "id": 8, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 9, + "text": "the quick brown fox jumps over the lazy dog", + }, + ])) + .unwrap(); + index +} + +fn create_index_exact_words_simple_reversed() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "", + }, + { + "id": 1, + "text": "dog", + }, + { + "id": 2, + "text": "lazy dog", + }, + { + "id": 3, + "text": "the lazy dog", + }, + { + "id": 4, + "text": "over the lazy dog", + }, + { + "id": 5, + "text": "jumps over the lazy dog", + }, + { + "id": 6, + "text": "fox jumps over the lazy dog", + }, + { + "id": 7, + "text": "brown fox jumps over the lazy dog", + }, + { + "id": 8, + "text": "quick brown fox jumps over the lazy dog", + }, + { + "id": 9, + "text": "the quick brown fox jumps over the lazy dog", + } + ])) + .unwrap(); + index +} + +fn create_index_exact_words_simple_random() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "", + }, + { + "id": 1, + "text": "over", + }, + { + "id": 2, + "text": "jump dog", + }, + { + "id": 3, + "text": "brown the lazy", + }, + { + "id": 4, + "text": "jump dog quick the", + }, + { + "id": 5, + "text": "fox the lazy dog brown", + }, + { + "id": 6, + "text": "jump fox quick lazy the dog", + }, + { + "id": 7, + "text": "the dog brown over jumps quick lazy", + }, + { + "id": 8, + "text": "the jumps dog quick over brown lazy fox", + } + ])) + .unwrap(); + index +} + +fn create_index_attribute_starts_with() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "what a lovely view from this balcony, I love it", + }, + { + "id": 1, + "text": "this balcony is overlooking the sea", + }, + { + "id": 2, + "text": "this balcony", + }, + { + "id": 3, + "text": "over looking the sea is a beautiful balcony", + }, + { + "id": 4, + "text": "a beautiful balcony is overlooking the sea", + }, + { + "id": 5, + "text": 
"overlooking the sea is a beautiful balcony, I love it", + }, + { + "id": 6, + "text": "overlooking the sea is a beautiful balcony", + }, + { + "id": 7, + "text": "overlooking", + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_exactness_simple_ordered() { + let index = create_index_exact_words_simple_ordered(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 6, 7, 5, 4, 3, 2, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown\"", + "\"the quick\"", + "\"the\"", + ] + "###); +} + +#[test] +fn test_exactness_simple_reversed() { + let index = create_index_exact_words_simple_reversed(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 7, 6, 5, 4, 3, 2, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"quick brown fox jumps over the lazy dog\"", + "\"brown fox jumps over the lazy dog\"", + "\"fox jumps over the lazy dog\"", + "\"jumps over the lazy dog\"", + "\"over the lazy dog\"", + "\"the lazy dog\"", + "\"lazy dog\"", + "\"dog\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 7, 6, 5, 4, 3, 2, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"quick brown fox jumps over the lazy dog\"", + "\"brown fox jumps over the lazy dog\"", + "\"fox jumps over the lazy dog\"", + "\"jumps over the lazy dog\"", + "\"over the lazy dog\"", + "\"the lazy dog\"", + "\"lazy dog\"", + "\"dog\"", + ] + "###); +} + +#[test] +fn test_exactness_simple_random() { + let index = create_index_exact_words_simple_random(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[8, 7, 5, 6, 3, 4, 1, 2]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the jumps dog quick over brown lazy fox\"", + "\"the dog brown over jumps quick lazy\"", + "\"fox the lazy dog brown\"", + "\"jump fox quick lazy the dog\"", + "\"brown the lazy\"", + "\"jump dog quick the\"", + "\"over\"", + "\"jump dog\"", + ] + "###); +} + +#[test] +fn test_exactness_attribute_starts_with_simple() { + let index = create_index_attribute_starts_with(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("this balcony"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 3, 4, 5, 6]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this balcony\"", + "\"this balcony is overlooking the sea\"", + "\"what a lovely view from this balcony, I love it\"", + "\"over looking the sea is a beautiful balcony\"", + "\"a beautiful balcony is overlooking the sea\"", + "\"overlooking the sea is a beautiful balcony, I love it\"", + "\"overlooking the sea is a beautiful balcony\"", + ] + "###); +} + +#[test] +fn test_exactness_attribute_starts_with_phrase() { + let index = create_index_attribute_starts_with(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("\"overlooking the sea\" is a beautiful balcony"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 6, 4, 3, 1, 0, 2]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: this is incorrect, the first document returned here should actually be the second one + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"overlooking the sea is a beautiful balcony, I love it\"", + "\"overlooking the sea is a beautiful balcony\"", + "\"a beautiful balcony is overlooking the sea\"", + "\"over looking the sea is a beautiful balcony\"", + "\"this balcony is overlooking the sea\"", + "\"what a lovely view from this balcony, I love it\"", + "\"this balcony\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("overlooking the sea is a beautiful balcony"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2, 7]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: this is correct, so the exactness ranking rule probably has a bug in the handling of phrases + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"overlooking the sea is a beautiful balcony\"", + "\"overlooking the sea is a beautiful balcony, I love it\"", + "\"a beautiful balcony is overlooking the sea\"", + "\"over looking the sea is a beautiful balcony\"", + "\"this balcony is overlooking the sea\"", + "\"what a lovely view from this balcony, I love it\"", + "\"this balcony\"", + "\"overlooking\"", + ] + "###); +} + +#[test] +fn test_exactness_all_candidates_with_typo() { + let index = create_index_attribute_starts_with(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("overlocking the sea is a beautiful balcony"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 5, 6, 1, 0, 2, 7]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // "overlooking" is returned here because the term matching strategy allows it + // but it has the worst exactness score (0 exact words) + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"over looking the sea is a beautiful balcony\"", + "\"a beautiful balcony is overlooking the sea\"", + "\"overlooking the sea is a beautiful balcony, I love it\"", + "\"overlooking the sea is a beautiful balcony\"", + "\"this balcony is overlooking the sea\"", + "\"what a lovely view from this balcony, I love it\"", + "\"this balcony\"", + "\"overlooking\"", + ] + "###); +} diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 31b37933d..1194d32ac 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -1,6 +1,7 @@ pub mod attribute_fid; pub mod attribute_position; pub mod distinct; +pub mod exactness; #[cfg(feature = "default")] pub mod language; pub mod ngram_split_words; From d3a94e8b25b7d489b5591154bdd1ed6e5530a945 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 25 Apr 2023 16:49:08 +0200 Subject: [PATCH 202/234] Fix bugs and add tests to exactness ranking rule --- milli/src/search/new/exact_attribute.rs | 23 +- milli/src/search/new/query_graph.rs | 8 +- milli/src/search/new/query_term/mod.rs | 37 ++ .../new/ranking_rule_graph/exactness/mod.rs | 15 +- milli/src/search/new/resolve_query_graph.rs | 4 +- milli/src/search/new/tests/exactness.rs | 351 +++++++++++++++++- milli/src/search/new/words.rs | 3 +- 7 files changed, 410 insertions(+), 31 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index bc0195ebc..0b95243bc 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -91,6 +91,12 @@ impl State { universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result { + // An ordered list of the (remaining) query terms, with data extracted from them: + // 0. exact subterm. If it doesn't exist, the term is skipped. + // 1. start position of the term + // 2. 
id of the term + let mut count_all_positions = 0; + let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = Vec::with_capacity(query_graph.nodes.len() as usize); for (_, node) in query_graph.nodes.iter() { @@ -101,6 +107,7 @@ impl State { } else { continue; }; + count_all_positions += term.positions.len(); exact_term_position_ids.push(( exact_term, *term.positions.start(), @@ -195,13 +202,15 @@ impl State { if !intersection.is_empty() { // TODO: although not really worth it in terms of performance, // it would be good to put this in cache for the sake of consistency - let candidates_with_exact_word_count = ctx - .index - .field_id_word_count_docids - .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))? - .unwrap_or_default(); - // TODO: consider if we must store the candidates as arrays, or if there is a way to perform the union - // here. + let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize { + ctx.index + .field_id_word_count_docids + .get(ctx.txn, &(fid, count_all_positions as u8))? + .unwrap_or_default() + } else { + RoaringBitmap::default() + }; + candidates_per_attribute.push(FieldCandidates { start_with_exact: intersection, exact_word_count: candidates_with_exact_word_count, diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 2662ef730..155e6ad75 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -310,11 +310,11 @@ impl QueryGraph { rank as u16 }; let mut nodes_to_remove = BTreeMap::>::new(); - let mut at_least_one_phrase = false; + let mut at_least_one_mandatory_term = false; for (node_id, node) in self.nodes.iter() { let QueryNodeData::Term(t) = &node.data else { continue }; - if t.term_subset.original_phrase(ctx).is_some() { - at_least_one_phrase = true; + if t.term_subset.original_phrase(ctx).is_some() || t.term_subset.is_mandatory() { + at_least_one_mandatory_term = true; continue; } let mut cost = 0; @@ -327,7 +327,7 @@ impl QueryGraph { .insert(node_id); } let mut res: Vec<_> = nodes_to_remove.into_values().collect(); - if !at_least_one_phrase { + if !at_least_one_mandatory_term { res.pop(); } res diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 5ee29615b..5f1a45d83 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -4,6 +4,7 @@ mod parse_query; mod phrase; use std::collections::BTreeSet; +use std::iter::FromIterator; use std::ops::RangeInclusive; use compute_derivations::partially_initialized_term_from_word; @@ -30,6 +31,12 @@ pub struct QueryTermSubset { zero_typo_subset: NTypoTermSubset, one_typo_subset: NTypoTermSubset, two_typo_subset: NTypoTermSubset, + /// `true` if the term cannot be deleted through the term matching strategy + /// + /// Note that there are other reasons for which a term cannot be deleted, such as + /// being a phrase. In that case, this field could be set to `false`, but it + /// still wouldn't be deletable by the term matching strategy.
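+    ///
+    /// (Editor's illustrative sketch, not part of the original patch; it only
+    /// uses the `full`, `is_mandatory` and `make_mandatory` items added in
+    /// this same commit.)
+    ///
+    /// ```ignore
+    /// let mut subset = QueryTermSubset::full(term);
+    /// assert!(!subset.is_mandatory()); // `full` initialises `mandatory: false`
+    /// subset.make_mandatory();
+    /// assert!(subset.is_mandatory()); // now protected from term removal
+    /// ```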
+ mandatory: bool, } #[derive(Clone, PartialEq, Eq, Hash)] @@ -114,6 +121,12 @@ impl ExactTerm { } impl QueryTermSubset { + pub fn is_mandatory(&self) -> bool { + self.mandatory + } + pub fn make_mandatory(&mut self) { + self.mandatory = true; + } pub fn exact_term(&self, ctx: &SearchContext) -> Option { let full_query_term = ctx.term_interner.get(self.original); if full_query_term.ngram_words.is_some() { @@ -135,6 +148,7 @@ impl QueryTermSubset { zero_typo_subset: NTypoTermSubset::Nothing, one_typo_subset: NTypoTermSubset::Nothing, two_typo_subset: NTypoTermSubset::Nothing, + mandatory: false, } } pub fn full(for_term: Interned) -> Self { @@ -143,6 +157,7 @@ impl QueryTermSubset { zero_typo_subset: NTypoTermSubset::All, one_typo_subset: NTypoTermSubset::All, two_typo_subset: NTypoTermSubset::All, + mandatory: false, } } @@ -352,6 +367,28 @@ impl QueryTermSubset { _ => panic!(), } } + pub fn keep_only_exact_term(&mut self, ctx: &SearchContext) { + if let Some(term) = self.exact_term(ctx) { + match term { + ExactTerm::Phrase(p) => { + self.zero_typo_subset = NTypoTermSubset::Subset { + words: BTreeSet::new(), + phrases: BTreeSet::from_iter([p]), + }; + self.clear_one_typo_subset(); + self.clear_two_typo_subset(); + } + ExactTerm::Word(w) => { + self.zero_typo_subset = NTypoTermSubset::Subset { + words: BTreeSet::from_iter([w]), + phrases: BTreeSet::new(), + }; + self.clear_one_typo_subset(); + self.clear_two_typo_subset(); + } + } + } + } pub fn clear_zero_typo_subset(&mut self) { self.zero_typo_subset = NTypoTermSubset::Nothing; } diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 7455a7a17..431eeac30 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -34,7 +34,7 @@ fn compute_docids( } } }; - // TODO: synonyms? 
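+    // (Editor's note: the intersection below restricts the exact-in-attribute
+    // candidates to the documents still alive in the current universe, one of
+    // the bug fixes this commit's message refers to.)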
+ candidates &= universe; Ok(candidates) } @@ -47,18 +47,21 @@ impl RankingRuleGraphTrait for ExactnessGraph { condition: &Self::Condition, universe: &RoaringBitmap, ) -> Result { - let (docids, dest_node) = match condition { + let (docids, end_term_subset) = match condition { ExactnessCondition::ExactInAttribute(dest_node) => { - (compute_docids(ctx, dest_node, universe)?, dest_node) + let mut end_term_subset = dest_node.clone(); + end_term_subset.term_subset.keep_only_exact_term(ctx); + end_term_subset.term_subset.make_mandatory(); + (compute_docids(ctx, dest_node, universe)?, end_term_subset) } - ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node), + ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node.clone()), }; + Ok(ComputedCondition { docids, universe_len: universe.len(), start_term_subset: None, - // TODO/FIXME: modify `end_term_subset` to signal to the next ranking rules that the term cannot be removed - end_term_subset: dest_node.clone(), + end_term_subset, }) } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 34ed135d4..d16162b1b 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -165,8 +165,8 @@ pub fn compute_query_graph_docids( positions: _, term_ids: _, }) => { - let phrase_docids = compute_query_term_subset_docids(ctx, term_subset)?; - predecessors_docids & phrase_docids + let node_docids = compute_query_term_subset_docids(ctx, term_subset)?; + predecessors_docids & node_docids } QueryNodeData::Deleted => { panic!() diff --git a/milli/src/search/new/tests/exactness.rs b/milli/src/search/new/tests/exactness.rs index f1f4fbe40..2077f6c01 100644 --- a/milli/src/search/new/tests/exactness.rs +++ b/milli/src/search/new/tests/exactness.rs @@ -14,6 +14,11 @@ This module tests the following properties about the exactness ranking rule: 1. those that have an attribute which is equal to the whole remaining query, if this query does not have any "gap" 2. those that have an attribute which start with the whole remaining query, if this query does not have any "gap" 3. those that contain the most exact words from the remaining query + +- if it is followed by other ranking rules, then: + 1. `word` will not remove the exact terms matched by `exactness` + 2. 
graph-based ranking rules (`typo`, `proximity`, `attribute`) will only work with + (1) the exact terms selected by `exactness` or (2) the full query term otherwise */ use crate::{ @@ -21,7 +26,7 @@ use crate::{ SearchResult, TermsMatchingStrategy, }; -fn create_index_exact_words_simple_ordered() -> TempIndex { +fn create_index_simple_ordered() -> TempIndex { let index = TempIndex::new(); index @@ -80,7 +85,7 @@ fn create_index_exact_words_simple_ordered() -> TempIndex { index } -fn create_index_exact_words_simple_reversed() -> TempIndex { +fn create_index_simple_reversed() -> TempIndex { let index = TempIndex::new(); index @@ -138,7 +143,7 @@ fn create_index_exact_words_simple_reversed() -> TempIndex { index } -fn create_index_exact_words_simple_random() -> TempIndex { +fn create_index_simple_random() -> TempIndex { let index = TempIndex::new(); index @@ -242,9 +247,192 @@ fn create_index_attribute_starts_with() -> TempIndex { index } +fn create_index_simple_ordered_with_typos() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "", + }, + { + "id": 1, + "text": "the", + }, + { + "id": 2, + "text": "the quack", + }, + { + "id": 3, + "text": "the quack briwn", + }, + { + "id": 4, + "text": "the quack briwn fox", + }, + { + "id": 5, + "text": "the quack briwn fox jlmps", + }, + { + "id": 6, + "text": "the quack briwn fox jlmps over", + }, + { + "id": 7, + "text": "the quack briwn fox jlmps over the", + }, + { + "id": 8, + "text": "the quack briwn fox jlmps over the lazy", + }, + { + "id": 9, + "text": "the quack briwn fox jlmps over the lazy dog", + }, + { + "id": 10, + "text": "", + }, + { + "id": 11, + "text": "the", + }, + { + "id": 12, + "text": "the quick", + }, + { + "id": 13, + "text": "the quick brown", + }, + { + "id": 14, + "text": "the quick brown fox", + }, + { + "id": 15, + "text": "the quick brown fox jumps", + }, + + { + "id": 16, + "text": "the quick brown fox jumps over", + }, + { + "id": 17, + "text": "the quick brown fox jumps over the", + }, + { + "id": 18, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 19, + "text": "the quick brown fox jumps over the lazy dog", + }, + ])) + .unwrap(); + index +} + +fn create_index_with_varying_proximities() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "lazy jumps dog brown quick the over fox the", + }, + { + "id": 1, + "text": "the quick brown fox jumps over the very lazy dog" + }, + { + "id": 2, + "text": "the quick brown fox jumps over the lazy dog", + }, + { + "id": 3, + "text": "dog brown quick the over fox the lazy", + }, + { + "id": 4, + "text": "the quick brown fox over the very lazy dog" + }, + { + "id": 5, + "text": "the quick brown fox over the lazy dog", + }, + { + "id": 6, + "text": "brown quick the over fox", + }, + { + "id": 7, + "text": "the very quick brown fox over" + }, + { + "id": 8, + "text": "the quick brown fox over", + }, + ])) + .unwrap(); + index +} + +fn create_index_all_equal_except_proximity_between_ignored_terms() -> TempIndex { + let index = 
TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "lazy jumps dog brown quick the over fox the" + }, + { + "id": 1, + "text": "lazy jumps dog brown quick the over fox the. quack briwn jlmps", + }, + { + "id": 2, + "text": "lazy jumps dog brown quick the over fox the. quack briwn jlmps overt", + }, + ])) + .unwrap(); + index +} + #[test] fn test_exactness_simple_ordered() { - let index = create_index_exact_words_simple_ordered(); + let index = create_index_simple_ordered(); let txn = index.read_txn().unwrap(); @@ -271,7 +459,7 @@ fn test_exactness_simple_ordered() { #[test] fn test_exactness_simple_reversed() { - let index = create_index_exact_words_simple_reversed(); + let index = create_index_simple_reversed(); let txn = index.read_txn().unwrap(); @@ -318,7 +506,7 @@ fn test_exactness_simple_reversed() { #[test] fn test_exactness_simple_random() { - let index = create_index_exact_words_simple_random(); + let index = create_index_simple_random(); let txn = index.read_txn().unwrap(); @@ -377,13 +565,12 @@ fn test_exactness_attribute_starts_with_phrase() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("\"overlooking the sea\" is a beautiful balcony"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 6, 4, 3, 1, 0, 2]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); - // TODO: this is incorrect, the first document returned here should actually be the second one insta::assert_debug_snapshot!(texts, @r###" [ - "\"overlooking the sea is a beautiful balcony, I love it\"", "\"overlooking the sea is a beautiful balcony\"", + "\"overlooking the sea is a beautiful balcony, I love it\"", "\"a beautiful balcony is overlooking the sea\"", "\"over looking the sea is a beautiful balcony\"", "\"this balcony is overlooking the sea\"", @@ -398,7 +585,6 @@ fn test_exactness_attribute_starts_with_phrase() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2, 7]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); - // TODO: this is correct, so the exactness ranking rule probably has a bug in the handling of phrases insta::assert_debug_snapshot!(texts, @r###" [ "\"overlooking the sea is a beautiful balcony\"", @@ -440,3 +626,148 @@ fn test_exactness_all_candidates_with_typo() { ] "###); } + +#[test] +fn test_exactness_after_words() { + let index = create_index_simple_ordered_with_typos(); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Exactness]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 9, 18, 8, 17, 16, 6, 7, 15, 5, 14, 4, 13, 3, 12, 2, 1, 11]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quack briwn fox jlmps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quack briwn fox jlmps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quack briwn fox jlmps over\"", + "\"the quack briwn fox jlmps over the\"", + "\"the quick brown fox jumps\"", + "\"the quack briwn fox jlmps\"", + "\"the quick brown fox\"", + "\"the quack briwn fox\"", + "\"the quick brown\"", + "\"the quack briwn\"", + "\"the quick\"", + "\"the quack\"", + "\"the\"", + "\"the\"", + ] + "###); +} + +#[test] +fn test_words_after_exactness() { + let index = create_index_simple_ordered_with_typos(); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Exactness, Criterion::Words]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 16, 17, 9, 15, 8, 14, 6, 7, 13, 5, 4, 12, 3, 2, 1, 11]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the\"", + "\"the quack briwn fox jlmps over the lazy dog\"", + "\"the quick brown fox jumps\"", + "\"the quack briwn fox jlmps over the lazy\"", + "\"the quick brown fox\"", + "\"the quack briwn fox jlmps over\"", + "\"the quack briwn fox jlmps over the\"", + "\"the quick brown\"", + "\"the quack briwn fox jlmps\"", + "\"the quack briwn fox\"", + "\"the quick\"", + "\"the quack briwn\"", + "\"the quack\"", + "\"the\"", + "\"the\"", + ] + "###); +} + +#[test] +fn test_proximity_after_exactness() { + let index = create_index_with_varying_proximities(); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 5, 4, 3, 8, 6, 7]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the very lazy dog\"", + "\"lazy jumps dog brown quick the over fox the\"", + "\"the quick brown fox over the lazy dog\"", + "\"the quick brown fox over the very lazy dog\"", + "\"dog brown quick the over fox the lazy\"", + "\"the quick brown fox over\"", + "\"brown quick the over fox\"", + "\"the very quick brown fox over\"", + ] + "###); + + let index = create_index_all_equal_except_proximity_between_ignored_terms(); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"lazy jumps dog brown quick the over fox the\"", + "\"lazy jumps dog brown quick the over fox the. quack briwn jlmps\"", + "\"lazy jumps dog brown quick the over fox the. quack briwn jlmps overt\"", + ] + "###); +} diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 39bbc823d..5c28f017b 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -33,7 +33,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { &mut self, ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, - _parent_candidates: &RoaringBitmap, + _universe: &RoaringBitmap, parent_query_graph: &QueryGraph, ) -> Result<()> { self.exhausted = false; @@ -77,7 +77,6 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { let nodes_to_remove = self.nodes_to_remove.pop().unwrap(); query_graph.remove_nodes_keep_edges(&nodes_to_remove.iter().collect::>()); } - Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket })) } From 3421125a553c933fae6c439f60daf2b430389cf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 25 Apr 2023 17:52:42 +0200 Subject: [PATCH 203/234] Prevent the `exactness` ranking rule from removing random words Make it strictly follow the term matching strategy --- milli/src/search/new/mod.rs | 101 +++--------------- .../new/ranking_rule_graph/exactness/mod.rs | 11 +- milli/src/search/new/tests/exactness.rs | 94 +++++++--------- 3 files changed, 61 insertions(+), 145 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index fb96af5e7..e9518bad5 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -51,7 +51,6 @@ use resolve_query_graph::compute_query_graph_docids; use sort::Sort; use self::interner::Interned; -use self::query_term::ExactTerm; /// A structure used throughout the execution of a search query. 
pub struct SearchContext<'ctx> { @@ -120,73 +119,20 @@ fn resolve_maximally_reduced_query_graph( Ok(docids) } -fn resolve_docids_containing_any_exact_word( - ctx: &mut SearchContext, - universe: &RoaringBitmap, - query_graph: &QueryGraph, -) -> Result { - let mut docids = RoaringBitmap::new(); - for (_, node) in query_graph.nodes.iter() { - let term = match &node.data { - query_graph::QueryNodeData::Term(term) => term, - query_graph::QueryNodeData::Deleted - | query_graph::QueryNodeData::Start - | query_graph::QueryNodeData::End => { - continue; - } - }; - if term.term_ids.len() != 1 { - continue; - } - let Some(exact_term) = term.term_subset.exact_term(ctx) else { - continue - }; - let exact_term_docids = match exact_term { - ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)? & universe, - ExactTerm::Word(word) => { - if let Some(word_docids) = ctx.word_docids(Word::Original(word))? { - word_docids & universe - } else { - continue; - } - } - }; - docids |= exact_term_docids; - } - Ok(docids) -} - fn resolve_universe( ctx: &mut SearchContext, initial_universe: &RoaringBitmap, query_graph: &QueryGraph, - method: UniverseResolutionMethod, matching_strategy: TermsMatchingStrategy, logger: &mut dyn SearchLogger, ) -> Result { - match method { - UniverseResolutionMethod::TermMatchingStrategyOnly => { - resolve_maximally_reduced_query_graph( - ctx, - initial_universe, - query_graph, - matching_strategy, - logger, - ) - } - UniverseResolutionMethod::TermMatchingStrategyAndExactness => { - let mut resolved_universe = resolve_maximally_reduced_query_graph( - ctx, - initial_universe, - query_graph, - matching_strategy, - logger, - )?; - resolved_universe |= - resolve_docids_containing_any_exact_word(ctx, initial_universe, query_graph)?; - Ok(resolved_universe) - } - } + resolve_maximally_reduced_query_graph( + ctx, + initial_universe, + query_graph, + matching_strategy, + logger, + ) } /// Return the list of initialised ranking rules to be used for a placeholder search. @@ -233,17 +179,12 @@ fn get_ranking_rules_for_placeholder_search<'ctx>( Ok(ranking_rules) } -enum UniverseResolutionMethod { - TermMatchingStrategyOnly, - TermMatchingStrategyAndExactness, -} - /// Return the list of initialised ranking rules to be used for a query graph search. 
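/// (Editor's note, illustrative: after this commit `Exactness` is listed with
/// the other graph-based criteria, so a settings list of `[Exactness]` alone
/// now yields `[Words, ExactAttribute, Exactness]`; because `Words` then
/// always runs first, the special universe-resolution method removed elsewhere
/// in this commit is no longer needed.)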
fn get_ranking_rules_for_query_graph_search<'ctx>( ctx: &SearchContext<'ctx>, sort_criteria: &Option>, terms_matching_strategy: TermsMatchingStrategy, -) -> Result<(Vec>, UniverseResolutionMethod)> { +) -> Result>> { // query graph search let mut words = false; let mut typo = false; @@ -254,14 +195,15 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut asc = HashSet::new(); let mut desc = HashSet::new(); - let mut universe_resolution_method = UniverseResolutionMethod::TermMatchingStrategyOnly; - let mut ranking_rules: Vec> = vec![]; let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; for rr in settings_ranking_rules { // Add Words before any of: typo, proximity, attribute match rr { - crate::Criterion::Typo | crate::Criterion::Attribute | crate::Criterion::Proximity => { + crate::Criterion::Typo + | crate::Criterion::Attribute + | crate::Criterion::Proximity + | crate::Criterion::Exactness => { if !words { ranking_rules.push(Box::new(Words::new(terms_matching_strategy))); words = true; @@ -313,11 +255,6 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( ranking_rules.push(Box::new(ExactAttribute::new())); ranking_rules.push(Box::new(Exactness::new())); exactness = true; - - if !words { - universe_resolution_method = - UniverseResolutionMethod::TermMatchingStrategyAndExactness; - } } crate::Criterion::Asc(field_name) => { if asc.contains(&field_name) { @@ -335,7 +272,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( } } } - Ok((ranking_rules, universe_resolution_method)) + Ok(ranking_rules) } fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( @@ -417,17 +354,11 @@ pub fn execute_search( check_sort_criteria(ctx, sort_criteria.as_ref())?; - let (ranking_rules, universe_resolution_method) = + let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; - universe = resolve_universe( - ctx, - &universe, - &graph, - universe_resolution_method, - terms_matching_strategy, - query_graph_logger, - )?; + universe = + resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?; bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? 
} else { diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 431eeac30..0842d6d04 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -3,13 +3,14 @@ use roaring::RoaringBitmap; use super::{ComputedCondition, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; +use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::Word; use crate::{Result, SearchContext}; #[derive(Clone, PartialEq, Eq, Hash)] pub enum ExactnessCondition { ExactInAttribute(LocatedQueryTermSubset), - Skip(LocatedQueryTermSubset), + Any(LocatedQueryTermSubset), } pub enum ExactnessGraph {} @@ -54,7 +55,11 @@ impl RankingRuleGraphTrait for ExactnessGraph { end_term_subset.term_subset.make_mandatory(); (compute_docids(ctx, dest_node, universe)?, end_term_subset) } - ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node.clone()), + ExactnessCondition::Any(dest_node) => { + let docids = + universe & compute_query_term_subset_docids(ctx, &dest_node.term_subset)?; + (docids, dest_node.clone()) + } }; Ok(ComputedCondition { @@ -74,7 +79,7 @@ impl RankingRuleGraphTrait for ExactnessGraph { let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone()); let exact_condition = conditions_interner.insert(exact_condition); - let skip_condition = ExactnessCondition::Skip(dest_node.clone()); + let skip_condition = ExactnessCondition::Any(dest_node.clone()); let skip_condition = conditions_interner.insert(skip_condition); Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)]) diff --git a/milli/src/search/new/tests/exactness.rs b/milli/src/search/new/tests/exactness.rs index 2077f6c01..fd6bcf0af 100644 --- a/milli/src/search/new/tests/exactness.rs +++ b/milli/src/search/new/tests/exactness.rs @@ -6,19 +6,17 @@ This module tests the following properties about the exactness ranking rule: 2. documents which have an attribute which start with the whole query 3. documents which contain the most exact words from the query -- the set of all candidates when `exactness` precedes `word` is the union of: - 1. the same set of candidates that would be returned normally - 2. the set of documents that contain at least one exact word from the query +- the `exactness` ranking rule must be preceded by the `words` ranking rule -- if it is placed after `word`, then it will only sort documents by: +- if `words` has already removed terms from the query, then exactness will sort documents as follows: 1. those that have an attribute which is equal to the whole remaining query, if this query does not have any "gap" 2. those that have an attribute which start with the whole remaining query, if this query does not have any "gap" 3. those that contain the most exact words from the remaining query -- if it is followed by other ranking rules, then: - 1. `word` will not remove the exact terms matched by `exactness` - 2. graph-based ranking rules (`typo`, `proximity`, `attribute`) will only work with - (1) the exact terms selected by `exactness` or (2) the full query term otherwise +- if it is followed by other graph-based ranking rules (`typo`, `proximity`, `attribute`). +Then these rules will only work with + 1. the exact terms selected by `exactness + 2. 
the full query term otherwise */ use crate::{ @@ -440,14 +438,14 @@ fn test_exactness_simple_ordered() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 6, 7, 5, 4, 3, 2, 1]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 7, 6, 5, 4, 3, 2, 1]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"the quick brown fox jumps over the lazy dog\"", "\"the quick brown fox jumps over the lazy\"", - "\"the quick brown fox jumps over\"", "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", "\"the quick brown fox jumps\"", "\"the quick brown fox\"", "\"the quick brown\"", @@ -467,19 +465,17 @@ fn test_exactness_simple_reversed() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 7, 6, 5, 4, 3, 2, 1]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 3, 4, 5, 6, 7]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"the quick brown fox jumps over the lazy dog\"", "\"quick brown fox jumps over the lazy dog\"", - "\"brown fox jumps over the lazy dog\"", - "\"fox jumps over the lazy dog\"", - "\"jumps over the lazy dog\"", - "\"over the lazy dog\"", "\"the lazy dog\"", - "\"lazy dog\"", - "\"dog\"", + "\"over the lazy dog\"", + "\"jumps over the lazy dog\"", + "\"fox jumps over the lazy dog\"", + "\"brown fox jumps over the lazy dog\"", ] "###); @@ -487,19 +483,17 @@ fn test_exactness_simple_reversed() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 7, 6, 5, 4, 3, 2, 1]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 3, 4, 5, 6, 7]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"the quick brown fox jumps over the lazy dog\"", "\"quick brown fox jumps over the lazy dog\"", - "\"brown fox jumps over the lazy dog\"", - "\"fox jumps over the lazy dog\"", - "\"jumps over the lazy dog\"", - "\"over the lazy dog\"", "\"the lazy dog\"", - "\"lazy dog\"", - "\"dog\"", + "\"over the lazy dog\"", + "\"jumps over the lazy dog\"", + "\"fox jumps over the lazy dog\"", + "\"brown fox jumps over the lazy dog\"", ] "###); } @@ -514,18 +508,16 @@ fn test_exactness_simple_random() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[8, 7, 5, 6, 3, 4, 1, 2]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[8, 7, 4, 6, 3, 5]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"the jumps dog quick over brown lazy fox\"", "\"the dog brown over jumps quick lazy\"", - "\"fox the lazy dog brown\"", + "\"jump dog quick the\"", "\"jump fox quick lazy the dog\"", "\"brown the lazy\"", - "\"jump dog quick the\"", - "\"over\"", - "\"jump dog\"", + "\"fox the lazy dog brown\"", ] "###); } @@ -540,17 +532,13 @@ fn test_exactness_attribute_starts_with_simple() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("this balcony"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 3, 4, 5, 6]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"this balcony\"", "\"this balcony is overlooking the sea\"", "\"what a lovely view from this balcony, I love it\"", - "\"over looking the sea is a beautiful balcony\"", - "\"a beautiful balcony is overlooking the sea\"", - "\"overlooking the sea is a beautiful balcony, I love it\"", - "\"overlooking the sea is a beautiful balcony\"", ] "###); } @@ -565,17 +553,14 @@ fn test_exactness_attribute_starts_with_phrase() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("\"overlooking the sea\" is a beautiful balcony"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 1]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"overlooking the sea is a beautiful balcony\"", "\"overlooking the sea is a beautiful balcony, I love it\"", "\"a beautiful balcony is overlooking the sea\"", - "\"over looking the sea is a beautiful balcony\"", "\"this balcony is overlooking the sea\"", - "\"what a lovely view from this balcony, I love it\"", - "\"this balcony\"", ] "###); @@ -583,7 +568,7 @@ fn test_exactness_attribute_starts_with_phrase() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("overlooking the sea is a beautiful balcony"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2, 7]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 7]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ @@ -592,8 +577,6 @@ fn test_exactness_attribute_starts_with_phrase() { "\"a beautiful balcony is overlooking the sea\"", "\"over looking the sea is a beautiful balcony\"", "\"this balcony is overlooking the sea\"", - "\"what a lovely view from this balcony, I love it\"", - "\"this balcony\"", "\"overlooking\"", ] "###); @@ -609,19 +592,16 @@ fn test_exactness_all_candidates_with_typo() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("overlocking the sea is a beautiful balcony"); let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 5, 6, 1, 0, 2, 7]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 6, 1, 7]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); // "overlooking" is returned here because the term matching strategy allows it // but it has the worst exactness score (0 exact words) insta::assert_debug_snapshot!(texts, @r###" [ - "\"over looking the sea is a beautiful balcony\"", "\"a beautiful balcony is overlooking the sea\"", "\"overlooking the sea is a beautiful balcony, I love it\"", "\"overlooking the sea is a beautiful balcony\"", "\"this balcony is overlooking the sea\"", - "\"what a lovely view from this balcony, I love it\"", - "\"this balcony\"", "\"overlooking\"", ] "###); @@ -686,26 +666,26 @@ fn test_words_after_exactness() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 16, 17, 9, 15, 8, 14, 6, 7, 13, 5, 4, 12, 3, 2, 1, 11]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 9, 18, 8, 17, 16, 6, 7, 15, 5, 14, 4, 13, 3, 12, 2, 1, 11]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"the quick brown fox jumps over the lazy dog\"", - "\"the quick brown fox jumps over the lazy\"", - "\"the quick brown fox jumps over\"", - "\"the quick brown fox jumps over the\"", "\"the quack briwn fox jlmps over the lazy dog\"", - "\"the quick brown fox jumps\"", + "\"the quick brown fox jumps over the lazy\"", "\"the quack briwn fox jlmps over the lazy\"", - "\"the quick brown fox\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", "\"the quack briwn fox jlmps over\"", "\"the quack briwn fox jlmps over the\"", - "\"the quick brown\"", + "\"the quick brown fox jumps\"", "\"the quack briwn fox jlmps\"", + "\"the quick brown fox\"", "\"the quack briwn fox\"", - "\"the quick\"", + "\"the quick brown\"", "\"the quack briwn\"", + "\"the quick\"", "\"the quack\"", "\"the\"", "\"the\"", @@ -729,7 +709,7 @@ fn test_proximity_after_exactness() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 5, 4, 3, 8, 6, 7]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 4, 5, 8, 7, 3, 6]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" @@ -737,12 +717,12 @@ fn test_proximity_after_exactness() { "\"the quick brown fox jumps over the lazy dog\"", "\"the quick brown fox jumps over the very lazy dog\"", "\"lazy jumps dog brown quick the over fox the\"", - "\"the quick brown fox over the lazy dog\"", "\"the quick brown fox over the very lazy dog\"", - "\"dog brown quick the over fox the lazy\"", + "\"the quick brown fox over the lazy dog\"", "\"the quick brown fox over\"", - "\"brown quick the over fox\"", "\"the very quick brown fox over\"", + "\"dog brown quick the over fox the lazy\"", + "\"brown quick the over fox\"", ] "###); From 55bad07c165476594dd82c37fc1df9bf6eba54cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 26 Apr 2023 10:40:05 +0200 Subject: [PATCH 204/234] Fix bug in exact_attribute rr implementation --- milli/src/search/new/exact_attribute.rs | 43 +++++++++++++------------ milli/tests/assets/test_set.ndjson | 4 +-- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 0b95243bc..93dd7c3fc 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -91,13 +91,14 @@ impl State { universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result { - // An ordered list of the (remaining) query terms, with data extracted from them: - // 0. exact subterm. If it doesn't exist, the term is skipped. - // 1. start position of the term - // 2. 
id of the term - let mut count_all_positions = 0; + struct ExactTermInfo { + exact_term: ExactTerm, + start_position: u16, + start_term_id: u8, + position_count: usize, + } - let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = + let mut exact_terms: Vec = Vec::with_capacity(query_graph.nodes.len() as usize); for (_, node) in query_graph.nodes.iter() { match &node.data { @@ -107,34 +108,35 @@ impl State { } else { continue; }; - count_all_positions += term.positions.len(); - exact_term_position_ids.push(( + exact_terms.push(ExactTermInfo { exact_term, - *term.positions.start(), - *term.term_ids.start(), - )) + start_position: *term.positions.start(), + start_term_id: *term.term_ids.start(), + position_count: term.positions.len(), + }); } QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, } } - exact_term_position_ids.sort_by_key(|(_, _, id)| *id); - exact_term_position_ids.dedup_by_key(|(_, _, id)| *id); + exact_terms.sort_by_key(|x| x.start_term_id); + exact_terms.dedup_by_key(|x| x.start_term_id); + let count_all_positions = exact_terms.iter().fold(0, |acc, x| acc + x.position_count); // bail if there is a "hole" (missing word) in remaining query graph - if let Some((_, _, first_id)) = exact_term_position_ids.first() { - if *first_id != 0 { + if let Some(e) = exact_terms.first() { + if e.start_term_id != 0 { return Ok(State::Empty(query_graph.clone())); } } else { return Ok(State::Empty(query_graph.clone())); } let mut previous_id = 0; - for (_, _, id) in exact_term_position_ids.iter().copied() { - if id < previous_id || id - previous_id > 1 { + for e in exact_terms.iter() { + if e.start_term_id < previous_id || e.start_term_id - previous_id > 1 { return Ok(State::Empty(query_graph.clone())); } else { - previous_id = id; + previous_id = e.start_term_id; } } @@ -147,10 +149,9 @@ impl State { // first check that for each term, there exists some attribute that has this term at the correct position //"word-position-docids"; let mut candidates = universe.clone(); - let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids + let words_positions: Vec<(Vec<_>, _)> = exact_terms .iter() - .copied() - .map(|(term, position, _)| (term.interned_words(ctx).collect(), position)) + .map(|e| (e.exact_term.interned_words(ctx).collect(), e.start_position)) .collect(); for (words, position) in &words_positions { if candidates.is_empty() { diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 60ee48dd2..4c83cbe14 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -134,7 +134,7 @@ "typo_rank": 0, "proximity_rank": 0, "attribute_rank": 1, - "exact_rank": 3, + "exact_rank": 1, "asc_desc_rank": 5, "sort_by_rank": 2, "geo_rank": 34692, @@ -369,7 +369,7 @@ "typo_rank": 0, "proximity_rank": 0, "attribute_rank": 1, - "exact_rank": 2, + "exact_rank": 0, "asc_desc_rank": 2, "sort_by_rank": 1, "geo_rank": 9339230, From b448aca49c20272e6b1ca9f73d2b6ccfee8d9230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 26 Apr 2023 11:04:18 +0200 Subject: [PATCH 205/234] Add more tests for exactness rr --- milli/src/search/new/tests/exactness.rs | 97 +++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/milli/src/search/new/tests/exactness.rs b/milli/src/search/new/tests/exactness.rs index fd6bcf0af..7543959d3 100644 --- a/milli/src/search/new/tests/exactness.rs +++ b/milli/src/search/new/tests/exactness.rs @@ -398,6 +398,44 @@ fn 
create_index_with_varying_proximities() -> TempIndex { index } +fn create_index_with_typo_and_prefix() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "exPraordinarily quick brown fox", + }, + { + "id": 1, + "text": "extraordinarily quick brown fox", + }, + { + "id": 2, + "text": "extra quick brown fox", + }, + { + "id": 3, + "text": "exPraordinarily quack brown fox", + }, + { + "id": 4, + "text": "exPraordinaPily quick brown fox", + } + ])) + .unwrap(); + index +} + fn create_index_all_equal_except_proximity_between_ignored_terms() -> TempIndex { let index = TempIndex::new(); @@ -751,3 +789,62 @@ fn test_proximity_after_exactness() { ] "###); } + +#[test] +fn test_exactness_followed_by_typo_prefer_no_typo_prefix() { + let index = create_index_with_typo_and_prefix(); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Typo]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("quick brown fox extra"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 4, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"extra quick brown fox\"", + "\"extraordinarily quick brown fox\"", + "\"exPraordinarily quick brown fox\"", + "\"exPraordinaPily quick brown fox\"", + "\"exPraordinarily quack brown fox\"", + ] + "###); +} + +#[test] +fn test_typo_followed_by_exactness() { + let index = create_index_with_typo_and_prefix(); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Exactness]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("extraordinarily quick brown fox"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 4, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"extraordinarily quick brown fox\"", + "\"exPraordinarily quick brown fox\"", + "\"exPraordinaPily quick brown fox\"", + "\"exPraordinarily quack brown fox\"", + ] + "###); +} From c8af57269748d430c7b92e88ef9ee18895b056b3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 26 Apr 2023 16:10:26 +0200 Subject: [PATCH 206/234] Add tests for exact words and exact attributes --- milli/src/search/new/tests/typo.rs | 242 ++++++++++++++++++++--------- 1 file changed, 165 insertions(+), 77 deletions(-) diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs index 4df340e9b..33b165a94 100644 --- a/milli/src/search/new/tests/typo.rs +++ b/milli/src/search/new/tests/typo.rs @@ -20,10 +20,9 @@ if `words` doesn't exist before it. 
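The typo tests in this file rely on the default length thresholds that the assertions further down pin at `min_word_len_one_typo = 5` and `min_word_len_two_typos = 9`. A minimal standalone sketch of the budget they imply, with lengths measured in bytes and every name hypothetical:

fn allowed_typos(word: &str, one_typo: usize, two_typos: usize) -> u8 {
    let len = word.len(); // byte length, matching how the thresholds are applied
    if len < one_typo {
        0
    } else if len < two_typos {
        1
    } else {
        2
    }
}

fn main() {
    assert_eq!(allowed_typos("fox", 5, 9), 0); // too short to tolerate a typo
    assert_eq!(allowed_typos("quick", 5, 9), 1); // which is why "quivk" matches it
    assert_eq!(allowed_typos("languorous", 5, 9), 2); // and "lamguorout" matches this
}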
use std::collections::HashMap; -use crate::{ - index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, - SearchResult, TermsMatchingStrategy, -}; +use crate::index::tests::TempIndex; +use crate::search::new::tests::collect_field_values; +use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -134,6 +133,14 @@ fn create_index() -> TempIndex { "id": 23, "text": "the quivk brown fox jumps over the lazy dog" }, + { + "id": 24, + "tolerant_text": "the quick brown fox jumps over the lazy dog", + }, + { + "id": 25, + "tolerant_text": "the quivk brown fox jumps over the lazy dog", + }, ])) .unwrap(); index @@ -212,79 +219,6 @@ fn test_default_typo() { "\"the quickest brownest fox jumps over the laziest dog\"", ] "###); - - // 1 typo on one word, swapped letters - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the quikc borwn fox jupms over the lazy dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); - let texts = collect_field_values(&index, &txn, "text", &documents_ids); - insta::assert_debug_snapshot!(texts, @r###" - [ - "\"the quick brown fox jumps over the lazy dog\"", - ] - "###); - - // 1 first letter typo on a word <5 bytes, replaced letter - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the nuick brown fox jumps over the lazy dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); - - // 1 first letter typo on a word <5 bytes, missing letter - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the uick brown fox jumps over the lazy dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); - - // 1 typo on all words >=5 bytes, replaced letters - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the quack brawn fox junps over the lazy dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); - let texts = collect_field_values(&index, &txn, "text", &documents_ids); - insta::assert_debug_snapshot!(texts, @r###" - [ - "\"the quick brown fox jumps over the lazy dog\"", - ] - "###); - - // 2 typos on words < 9 bytes - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the quckest brawnert fox jumps over the aziest dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); - - // 2 typos on words >= 9 bytes: missing letters, missing first letter, replaced letters - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the extravant fox kyrocketed over the lamguorout dog"); - let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); - let texts = collect_field_values(&index, &txn, "text", &documents_ids); - insta::assert_debug_snapshot!(texts, @r###" - [ - "\"the extravagant fox skyrocketed over the languorous dog\"", - ] - "###); - - // 2 typos on words >= 9 bytes: 2 extra letters in a single word, swapped letters + extra letter, replaced letters - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the extravaganttt fox sktyrocnketed over the lagnuorrous dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); - let texts = collect_field_values(&index, &txn, "text", &documents_ids); - insta::assert_debug_snapshot!(texts, @r###" - [ - "\"the extravagant fox skyrocketed over the languorous dog\"", - ] - "###); } #[test] @@ -301,6 +235,160 @@ fn test_phrase_no_typo_allowed() { insta::assert_debug_snapshot!(texts, @"[]"); } +#[test] +fn test_typo_exact_word() { + let index = create_index(); + + index + .update_settings(|s| { + s.set_exact_words( + ["quick", "quack", "sunflower"].iter().map(ToString::to_string).collect(), + ) + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let ot = index.min_word_len_one_typo(&txn).unwrap(); + let tt = index.min_word_len_two_typos(&txn).unwrap(); + insta::assert_debug_snapshot!(ot, @"5"); + insta::assert_debug_snapshot!(tt, @"9"); + + // don't match quivk + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); + + // Don't match quick + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quack brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + + // words not in exact_words (quicest, jummps) have normal typo handling + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quicest brownest fox jummps over the laziest dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quickest brownest fox jumps over the laziest dog\"", + ] + "###); + + // exact words do not disable prefix (sunflowering OK, but no sunflowar or sun flower) + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("network interconnection sunflower"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"network interconnection sunflower\"", + "\"network interconnection sunflowering\"", + ] + "###); +} + +#[test] +fn test_typo_exact_attribute() { + let index = create_index(); + + index + .update_settings(|s| { + s.set_exact_attributes(["text"].iter().map(ToString::to_string).collect()); + s.set_searchable_fields( + ["text", "tolerant_text"].iter().map(ToString::to_string).collect(), + ); + s.set_exact_words(["quivk"].iter().map(ToString::to_string).collect()) + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let ot = index.min_word_len_one_typo(&txn).unwrap(); + let tt = index.min_word_len_two_typos(&txn).unwrap(); + insta::assert_debug_snapshot!(ot, @"5"); + insta::assert_debug_snapshot!(tt, @"9"); + + // Exact match returns both exact attributes and tolerant ones. + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 24, 25]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let texts = collect_field_values(&index, &txn, "tolerant_text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "__does_not_exist__", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quivk brown fox jumps over the lazy dog\"", + ] + "###); + + // 1 typo only returns the tolerant attribute + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quidk brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[24, 25]"); + let texts = collect_field_values(&index, &txn, "tolerant_text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quivk brown fox jumps over the lazy dog\"", + ] + "###); + + // combine with exact words + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quivk brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 25]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quivk brown fox jumps over the lazy dog\"", + "__does_not_exist__", + ] + "###); + let texts = collect_field_values(&index, &txn, "tolerant_text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "__does_not_exist__", + "\"the quivk brown fox jumps over the lazy dog\"", + ] + "###); + + // No result in tolerant attribute + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quicest brownest fox jummps over the laziest dog"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} + #[test] fn test_ngram_typos() { let index = create_index(); From b41a6cbd7a30071d842cb0ac019d7ec454148842 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 26 Apr 2023 16:28:17 +0200 Subject: [PATCH 207/234] Check sort criteria also in placeholder search --- milli/src/search/new/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index e9518bad5..375f7c774 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -330,6 +330,8 @@ pub fn execute_search( ctx.index.documents_ids(ctx.txn)? }; + check_sort_criteria(ctx, sort_criteria.as_ref())?; + let mut located_query_terms = None; let bucket_sort_output = if let Some(query) = query { // We make sure that the analyzer is aware of the stop words @@ -352,8 +354,6 @@ pub fn execute_search( let graph = QueryGraph::from_query(ctx, &query_terms)?; located_query_terms = Some(query_terms); - check_sort_criteria(ctx, sort_criteria.as_ref())?; - let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; From 374095d42c8d0f00930cb3449fc1aeda25652445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 27 Apr 2023 13:30:09 +0200 Subject: [PATCH 208/234] Add tests for stop words and fix a couple of bugs --- milli/src/search/new/mod.rs | 13 +- .../src/search/new/query_term/parse_query.rs | 2 +- milli/src/search/new/resolve_query_graph.rs | 21 +-- milli/src/search/new/tests/mod.rs | 1 + milli/src/search/new/tests/stop_words.rs | 135 ++++++++++++++++++ 5 files changed, 155 insertions(+), 17 deletions(-) create mode 100644 milli/src/search/new/tests/stop_words.rs diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 375f7c774..246745678 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -333,7 +333,8 @@ pub fn execute_search( check_sort_criteria(ctx, sort_criteria.as_ref())?; let mut located_query_terms = None; - let bucket_sort_output = if let Some(query) = query { + + let query_terms = if let Some(query) = query { // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. 
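Charabia keeps stop words in the token stream, so word positions stay stable, but tags them so that the query-term builder can drop them or treat them as optional. A rough standalone sketch of that behaviour, assuming the stop words are exposed as an `fst::Set` the way the index stores them:

use charabia::TokenizerBuilder;

fn main() {
    // `fst::Set::from_iter` requires lexicographically sorted input.
    let stop_words = fst::Set::from_iter(["the", "to"]).unwrap();

    let mut builder = TokenizerBuilder::new();
    builder.stop_words(&stop_words);
    let tokenizer = builder.build();

    for token in tokenizer.tokenize("how to train your dragon") {
        // "to" comes out tagged as `TokenKind::StopWord`: it keeps its
        // position but can be skipped outside of phrases.
        println!("{:?} -> {:?}", token.lemma(), token.kind);
    }
}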
let mut tokbuilder = TokenizerBuilder::new(); @@ -351,6 +352,16 @@ pub fn execute_search( let tokens = tokenizer.tokenize(query); let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; + if query_terms.is_empty() { + // Do a placeholder search instead + None + } else { + Some(query_terms) + } + } else { + None + }; + let bucket_sort_output = if let Some(query_terms) = query_terms { let graph = QueryGraph::from_query(ctx, &query_terms)?; located_query_terms = Some(query_terms); diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 91b888dcf..734938551 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -241,7 +241,7 @@ impl PhraseBuilder { } fn is_empty(&self) -> bool { - self.words.is_empty() + self.words.is_empty() || self.words.iter().all(Option::is_none) } // precondition: token has kind Word or StopWord diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index d16162b1b..131dcd856 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -203,20 +203,15 @@ pub fn compute_phrase_docids( if words.is_empty() { return Ok(RoaringBitmap::new()); } - if words.len() == 1 { - if let Some(word) = &words[0] { - if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? { - return Ok(word_docids); - } else { - return Ok(RoaringBitmap::new()); - } + let mut candidates = RoaringBitmap::new(); + for word in words.iter().flatten().copied() { + if let Some(word_docids) = ctx.word_docids(Word::Original(word))? { + candidates |= word_docids; } else { return Ok(RoaringBitmap::new()); } } - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; let winsize = words.len().min(3); for win in words.windows(winsize) { @@ -262,12 +257,8 @@ pub fn compute_phrase_docids( bitmaps.sort_unstable_by_key(|a| a.len()); for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } + candidates &= bitmap; + // There will be no match, return early if candidates.is_empty() { break; diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 1194d32ac..cdcdb5936 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -11,6 +11,7 @@ pub mod sort; pub mod typo; pub mod typo_proximity; pub mod words_tms; +pub mod stop_words; fn collect_field_values( index: &crate::Index, diff --git a/milli/src/search/new/tests/stop_words.rs b/milli/src/search/new/tests/stop_words.rs new file mode 100644 index 000000000..96dd06584 --- /dev/null +++ b/milli/src/search/new/tests/stop_words.rs @@ -0,0 +1,135 @@ +/*! 
+This module tests the following properties about stop words: +- they are not indexed +- they are not searchable +- they are case sensitive +- they are ignored in phrases +- If a query consists only of stop words, a placeholder query is used instead +- A prefix word is never ignored, even if the prefix is a stop word +- Phrases consisting only of stop words are ignored +*/ + +use std::{collections::BTreeSet, iter::FromIterator}; + +use crate::{db_snap, index::tests::TempIndex, Search, SearchResult, TermsMatchingStrategy}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["title".to_owned()]); + s.set_stop_words(BTreeSet::from_iter([ + "to".to_owned(), + "The".to_owned(), + "xyz".to_owned(), + ])); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "title": "Shazam!", + }, + { + "id": 1, + "title": "Captain Marvel", + }, + { + "id": 2, + "title": "Escape Room", + }, + { + "id": 3, + "title": "How to Train Your Dragon: The Hidden World", + }, + { + "id": 4, + "title": "Gläss", + }, + { + "id": 5, + "title": "How to Attempt to Train Your Dragon", + }, + { + "id": 6, + "title": "How to Train Your Dragon: the Hidden World", + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_stop_words_not_indexed() { + let index = create_index(); + db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264"); +} + +#[test] +fn test_ignore_stop_words() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + // `the` is treated as a prefix here, so it's not ignored + let mut s = Search::new(&txn, &index); + s.query("xyz to the"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + // `xyz` is treated as a prefix here, so it's not ignored + let mut s = Search::new(&txn, &index); + s.query("to the xyz"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + // `xyz` is not treated as a prefix anymore because of the trailing space, so it's ignored + let mut s = Search::new(&txn, &index); + s.query("to the xyz "); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + let mut s = Search::new(&txn, &index); + s.query("to the dragon xyz"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); +} + +#[test] +fn test_stop_words_in_phrase() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.query("\"how to train your dragon\""); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 6]"); + + let mut s = Search::new(&txn, &index); + s.query("how \"to\" train \"the"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + let mut s = Search::new(&txn, &index); + s.query("how \"to\" train \"The dragon"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + + let mut s = Search::new(&txn, &index); + s.query("\"to\""); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5, 6]"); +} From 899baa0ea590b8097bf641b12e6492500dc24398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 27 Apr 2023 13:43:04 +0200 Subject: [PATCH 209/234] Update forgotten snapshot from previous commit --- milli/src/search/new/tests/stop_words.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/new/tests/stop_words.rs b/milli/src/search/new/tests/stop_words.rs index 96dd06584..9dd7f2cb0 100644 --- a/milli/src/search/new/tests/stop_words.rs +++ b/milli/src/search/new/tests/stop_words.rs @@ -125,7 +125,7 @@ fn test_stop_words_in_phrase() { s.query("how \"to\" train \"The dragon"); s.terms_matching_strategy(TermsMatchingStrategy::Last); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 6, 5]"); let mut s = Search::new(&txn, &index); s.query("\"to\""); From bc4efca6117ae318d799b85b92384287219b3276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 26 Apr 2023 11:18:15 +0200 Subject: [PATCH 210/234] Add more tests for the attribute ranking rule --- .../new/ranking_rule_graph/position/mod.rs | 4 ++ milli/src/search/new/tests/attribute_fid.rs | 35 ++++++++- .../search/new/tests/attribute_position.rs | 71 +++++++++++++++++-- 3 files changed, 104 insertions(+), 6 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index ef4880cfb..a8e3f3916 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -77,6 +77,10 @@ impl RankingRuleGraphTrait for PositionGraph { let cost = { let mut cost = 0; for i in 0..term.term_ids.len() { + // This is actually not fully correct and slightly penalises ngrams unfairly. + // Because if two words are in the same bucketed position (e.g. 
32) and consecutive, + // then their position cost will be 32+32=64, but an ngram of these two words at the + // same position will have a cost of 32+32+1=65 cost += position as u32 + i as u32; } cost diff --git a/milli/src/search/new/tests/attribute_fid.rs b/milli/src/search/new/tests/attribute_fid.rs index ec7b7a69e..d71c57f2c 100644 --- a/milli/src/search/new/tests/attribute_fid.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -88,7 +88,25 @@ fn create_index() -> TempIndex { "title": "the quick", "description": "", "plot": "brown fox jumps over the lazy dog", - } + }, + { + "id": 12, + "title": "", + "description": "the quickbrownfox", + "plot": "jumps over the lazy dog", + }, + { + "id": 13, + "title": "", + "description": "the quick brown fox", + "plot": "jumps over the lazy dog", + }, + { + "id": 14, + "title": "", + "description": "the quickbrownfox", + "plot": "jumps overthelazy dog", + }, ])) .unwrap(); index @@ -104,5 +122,18 @@ fn test_attribute_fid_simple() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 6, 5, 4, 3, 9, 7, 8, 11, 10, 0]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 6, 5, 4, 3, 9, 7, 8, 11, 10, 12, 13, 14, 0]"); +} + +#[test] +fn test_attribute_fid_ngrams() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 6, 5, 4, 3, 9, 7, 8, 11, 10, 12, 13, 14, 0]"); } diff --git a/milli/src/search/new/tests/attribute_position.rs b/milli/src/search/new/tests/attribute_position.rs index e4ed8b5ff..08b38684b 100644 --- a/milli/src/search/new/tests/attribute_position.rs +++ b/milli/src/search/new/tests/attribute_position.rs @@ -8,7 +8,11 @@ fn create_index() -> TempIndex { index .update_settings(|s| { s.set_primary_key("id".to_owned()); - s.set_searchable_fields(vec!["text".to_owned(), "other".to_owned()]); + s.set_searchable_fields(vec![ + "text".to_owned(), + "text2".to_owned(), + "other".to_owned(), + ]); s.set_criteria(vec![Criterion::Attribute]); }) .unwrap(); @@ -83,8 +87,41 @@ fn create_index() -> TempIndex { a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + quickbrown", + }, + { + "id": 8, + "text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a quick brown", }, + { + "id": 9, + "text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + quickbrown", + }, 
+ { + "id": 10, + "text": "quick brown", + "text2": "brown quick", + }, + { + "id": 11, + "text": "quickbrown", + }, + { + "id": 12, + "text": "quick brown", + }, + { + "id": 13, + "text": "quickbrown", + }, ])) .unwrap(); index @@ -94,7 +131,7 @@ fn create_index() -> TempIndex { fn test_attribute_position_simple() { let index = create_index(); - db_snap!(index, word_position_docids, @"fe86911166fa4c0903c512fd86ec65e4"); + db_snap!(index, word_position_docids, @"1ad58847d772924d8aab5e92be8cf0cc"); let txn = index.read_txn().unwrap(); @@ -102,7 +139,7 @@ fn test_attribute_position_simple() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("quick brown"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0, 6, 7, 5]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]"); } #[test] fn test_attribute_position_repeated() { @@ -114,5 +151,31 @@ fn test_attribute_position_repeated() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("a a a a a"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 7, 6]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 7, 8, 9, 6]"); +} + +#[test] +fn test_attribute_position_different_fields() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("quick brown"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]"); +} + +#[test] +fn test_attribute_position_ngrams() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("quick brown"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]"); } From 93188b3c88d2a761d359a822b11569a6538b8bb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 27 Apr 2023 11:12:46 +0200 Subject: [PATCH 211/234] Fix indexing of word_prefix_fid_docids --- milli/src/update/index_documents/mod.rs | 41 ++++++++--- milli/src/update/mod.rs | 4 +- ...cids.rs => words_prefix_integer_docids.rs} | 72 ++++++++----------- 3 files changed, 65 insertions(+), 52 deletions(-) rename milli/src/update/{words_prefix_position_docids.rs => words_prefix_integer_docids.rs} (71%) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 9d60d59ca..ba1e6b74e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -36,7 +36,7 @@ use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, - WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, + WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; @@ -373,6 +373,7 @@ where let mut final_documents_ids = RoaringBitmap::new(); let mut word_pair_proximity_docids = None; let mut word_position_docids = None; + let mut word_fid_docids = None; let mut word_docids = None; let mut exact_word_docids = None; @@ -406,6 +407,11 @@ where word_position_docids = Some(cloneable_chunk); TypedChunk::WordPositionDocids(chunk) } + TypedChunk::WordFidDocids(chunk) => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + word_fid_docids = Some(cloneable_chunk); + TypedChunk::WordFidDocids(chunk) + } otherwise => otherwise, }; @@ -449,6 +455,7 @@ where exact_word_docids, word_pair_proximity_docids, word_position_docids, + word_fid_docids, )?; Ok(all_documents_ids.len()) @@ -461,6 +468,7 @@ where exact_word_docids: Option>, word_pair_proximity_docids: Option>, word_position_docids: Option>, + word_fid_docids: Option>, ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -595,17 +603,16 @@ where if let Some(word_position_docids) = word_position_docids { // Run the words prefix position docids update operation. - let mut builder = WordPrefixPositionDocids::new(self.wtxn, self.index); + let mut builder = WordPrefixIntegerDocids::new( + self.wtxn, + self.index.word_prefix_position_docids, + self.index.word_position_docids, + ); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; builder.chunk_compression_level = self.indexer_config.chunk_compression_level; builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; - if let Some(value) = self.config.words_positions_level_group_size { - builder.level_group_size(value); - } - if let Some(value) = self.config.words_positions_min_level_size { - builder.min_level_size(value); - } + builder.execute( word_position_docids, &new_prefix_fst_words, @@ -613,6 +620,24 @@ where &del_prefix_fst_words, )?; } + if let Some(word_fid_docids) = word_fid_docids { + // Run the words prefix fid docids update operation. 
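+            // Like `word_position_docids`, the `word_fid_docids` database is
+            // keyed by a word followed by a big-endian u16 (a field id here,
+            // a position bucket there), which is why the same generic
+            // `WordPrefixIntegerDocids` pass can build the prefix flavour of
+            // either database.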
+ let mut builder = WordPrefixIntegerDocids::new( + self.wtxn, + self.index.word_prefix_fid_docids, + self.index.word_fid_docids, + ); + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + builder.max_nb_chunks = self.indexer_config.max_nb_chunks; + builder.max_memory = self.indexer_config.max_memory; + builder.execute( + word_fid_docids, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } if (self.should_abort)() { return Err(Error::InternalError(InternalError::AbortedIndexation)); diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 948811a6b..7a3fd1fd9 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -14,7 +14,7 @@ pub use self::prefix_word_pairs::{ pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; -pub use self::words_prefix_position_docids::WordPrefixPositionDocids; +pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids; pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; @@ -27,5 +27,5 @@ mod prefix_word_pairs; mod settings; mod update_step; mod word_prefix_docids; -mod words_prefix_position_docids; +mod words_prefix_integer_docids; mod words_prefixes_fst; diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_integer_docids.rs similarity index 71% rename from milli/src/update/words_prefix_position_docids.rs rename to milli/src/update/words_prefix_integer_docids.rs index b09555264..63ca178ef 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -1,10 +1,9 @@ use std::collections::{HashMap, HashSet}; -use std::num::NonZeroU32; -use std::{cmp, str}; +use std::str; use grenad::CompressionType; use heed::types::ByteSlice; -use heed::{BytesDecode, BytesEncode}; +use heed::{BytesDecode, BytesEncode, Database}; use log::debug; use crate::error::SerializationError; @@ -14,57 +13,46 @@ use crate::update::index_documents::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, CursorClonableMmap, MergeFn, }; -use crate::{Index, Result}; +use crate::{CboRoaringBitmapCodec, Result}; -pub struct WordPrefixPositionDocids<'t, 'u, 'i> { +pub struct WordPrefixIntegerDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, + prefix_database: Database, + word_database: Database, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, - level_group_size: NonZeroU32, - min_level_size: NonZeroU32, } -impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { +impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordPrefixPositionDocids<'t, 'u, 'i> { - WordPrefixPositionDocids { + prefix_database: Database, + word_database: Database, + ) -> WordPrefixIntegerDocids<'t, 'u, 'i> { + WordPrefixIntegerDocids { wtxn, - index, + prefix_database, + word_database, chunk_compression_type: CompressionType::None, chunk_compression_level: None, max_nb_chunks: None, max_memory: None, - level_group_size: NonZeroU32::new(4).unwrap(), - min_level_size: NonZeroU32::new(5).unwrap(), } } - pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { - self.level_group_size = 
NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); - self - } - - pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { - self.min_level_size = value; - self - } - - #[logging_timer::time("WordPrefixPositionDocids::{}")] + #[logging_timer::time("WordPrefixIntegerDocids::{}")] pub fn execute( self, - new_word_position_docids: grenad::Reader, + new_word_integer_docids: grenad::Reader, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { - debug!("Computing and writing the word levels positions docids into LMDB on disk..."); + debug!("Computing and writing the word levels integers docids into LMDB on disk..."); - let mut prefix_position_docids_sorter = create_sorter( + let mut prefix_integer_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, self.chunk_compression_type, @@ -73,14 +61,14 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { self.max_memory, ); - let mut new_word_position_docids_iter = new_word_position_docids.into_cursor()?; + let mut new_word_integer_docids_iter = new_word_integer_docids.into_cursor()?; if !common_prefix_fst_words.is_empty() { // We fetch all the new common prefixes between the previous and new prefix fst. let mut buffer = Vec::new(); let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { + while let Some((key, data)) = new_word_integer_docids_iter.move_on_next()? { let (word, pos) = StrBEU16Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; current_prefixes = match current_prefixes.take() { @@ -88,7 +76,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { _otherwise => { write_prefixes_in_sorter( &mut prefixes_cache, - &mut prefix_position_docids_sorter, + &mut prefix_integer_docids_sorter, )?; common_prefix_fst_words .iter() @@ -101,6 +89,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { if word.starts_with(prefix) { buffer.clear(); buffer.extend_from_slice(prefix.as_bytes()); + buffer.push(0); buffer.extend_from_slice(&pos.to_be_bytes()); match prefixes_cache.get_mut(&buffer) { Some(value) => value.push(data.to_owned()), @@ -113,11 +102,11 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { } } - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_integer_docids_sorter)?; } // We fetch the docids associated to the newly added word prefix fst only. - let db = self.index.word_position_docids.remap_data_type::(); + let db = self.word_database.remap_data_type::(); for prefix_bytes in new_prefix_fst_words { let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| { SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } @@ -133,19 +122,18 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { if word.starts_with(prefix) { let key = (prefix, pos); let bytes = StrBEU16Codec::bytes_encode(&key).unwrap(); - prefix_position_docids_sorter.insert(bytes, data)?; + prefix_integer_docids_sorter.insert(bytes, data)?; } } } - // We remove all the entries that are no more required in this word prefix position + // We remove all the entries that are no more required in this word prefix integer // docids database. 
- // We also avoid iterating over the whole `word_prefix_position_docids` database if we know in + // We also avoid iterating over the whole `word_prefix_integer_docids` database if we know in // advance that the `if del_prefix_fst_words.contains(prefix.as_bytes()) {` condition below // will always be false (i.e. if `del_prefix_fst_words` is empty). if !del_prefix_fst_words.is_empty() { - let mut iter = - self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + let mut iter = self.prefix_database.iter_mut(self.wtxn)?.lazily_decode_data(); while let Some(((prefix, _), _)) = iter.next().transpose()? { if del_prefix_fst_words.contains(prefix.as_bytes()) { unsafe { iter.del_current()? }; @@ -154,11 +142,11 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { drop(iter); } - // We finally write all the word prefix position docids into the LMDB database. + // We finally write all the word prefix integer docids into the LMDB database. sorter_into_lmdb_database( self.wtxn, - *self.index.word_prefix_position_docids.as_polymorph(), - prefix_position_docids_sorter, + *self.prefix_database.as_polymorph(), + prefix_integer_docids_sorter, merge_cbo_roaring_bitmaps, )?; From 48f5bb1693d0f2437f4ff9c05ee53c12cce12a1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 13 Apr 2023 13:45:34 +0200 Subject: [PATCH 212/234] Implements the geo-sort ranking rule --- milli/examples/search.rs | 7 +- milli/src/lib.rs | 3 +- milli/src/search/mod.rs | 10 + milli/src/search/new/geo_sort.rs | 261 +++++++++++++++++++++++ milli/src/search/new/mod.rs | 121 +++++++---- milli/src/search/new/tests/geo_sort.rs | 273 +++++++++++++++++++++++++ milli/src/search/new/tests/mod.rs | 1 + 7 files changed, 631 insertions(+), 45 deletions(-) create mode 100644 milli/src/search/new/geo_sort.rs create mode 100644 milli/src/search/new/tests/geo_sort.rs diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 030390822..8898e5dac 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -1,10 +1,12 @@ +use std::error::Error; use std::io::stdin; +use std::path::Path; use std::time::Instant; -use std::{error::Error, path::Path}; use heed::EnvOpenOptions; use milli::{ - execute_search, DefaultSearchLogger, Index, SearchContext, SearchLogger, TermsMatchingStrategy, + execute_search, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext, SearchLogger, + TermsMatchingStrategy, }; #[global_allocator] @@ -54,6 +56,7 @@ fn main() -> Result<(), Box> { false, &None, &None, + GeoSortStrategy::default(), 0, 20, None, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 868df74e8..48699e76f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -79,7 +79,8 @@ pub use filter_parser::{Condition, FilterCondition, Span, Token}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; pub use search::new::{ - execute_search, DefaultSearchLogger, SearchContext, SearchLogger, VisualSearchLogger, + execute_search, DefaultSearchLogger, GeoSortStrategy, SearchContext, SearchLogger, + VisualSearchLogger, }; use serde_json::Value; pub use {charabia as tokenizer, heed}; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index a0bf272dd..97725b9bf 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -27,6 +27,7 @@ pub struct Search<'a> { offset: usize, limit: usize, sort_criteria: Option>, + geo_strategy: new::GeoSortStrategy, terms_matching_strategy: TermsMatchingStrategy, words_limit: usize, exhaustive_number_hits: bool, @@ -42,6 
+43,7 @@ impl<'a> Search<'a> { offset: 0, limit: 20, sort_criteria: None, + geo_strategy: new::GeoSortStrategy::default(), terms_matching_strategy: TermsMatchingStrategy::default(), exhaustive_number_hits: false, words_limit: 10, @@ -85,6 +87,12 @@ impl<'a> Search<'a> { self } + #[cfg(test)] + pub fn geo_sort_strategy(&mut self, strategy: new::GeoSortStrategy) -> &mut Search<'a> { + self.geo_strategy = strategy; + self + } + /// Force the search to exhastivelly compute the number of candidates, /// this will increase the search time but allows finite pagination. pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> { @@ -102,6 +110,7 @@ impl<'a> Search<'a> { self.exhaustive_number_hits, &self.filter, &self.sort_criteria, + self.geo_strategy, self.offset, self.limit, Some(self.words_limit), @@ -127,6 +136,7 @@ impl fmt::Debug for Search<'_> { offset, limit, sort_criteria, + geo_strategy: _, terms_matching_strategy, words_limit, exhaustive_number_hits, diff --git a/milli/src/search/new/geo_sort.rs b/milli/src/search/new/geo_sort.rs new file mode 100644 index 000000000..b841dfe9c --- /dev/null +++ b/milli/src/search/new/geo_sort.rs @@ -0,0 +1,261 @@ +use std::collections::VecDeque; +use std::iter::FromIterator; + +use heed::types::{ByteSlice, Unit}; +use heed::{RoPrefix, RoTxn}; +use roaring::RoaringBitmap; +use rstar::RTree; + +use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; +use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec}; +use crate::{ + distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext, + SearchLogger, +}; + +const FID_SIZE: usize = 2; +const DOCID_SIZE: usize = 4; + +#[allow(clippy::drop_non_drop)] +fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] { + concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) +} + +/// Return an iterator over each number value in the given field of the given document. +fn facet_number_values<'a>( + docid: u32, + field_id: u16, + index: &Index, + txn: &'a RoTxn, +) -> Result, Unit>> { + let key = facet_values_prefix_key(field_id, docid); + + let iter = index + .field_id_docid_facet_f64s + .remap_key_type::() + .prefix_iter(txn, &key)? + .remap_key_type(); + + Ok(iter) +} + +/// Define the strategy used by the geo sort. +/// The paramater represents the cache size, and, in the case of the Dynamic strategy, +/// the point where we move from using the iterative strategy to the rtree. 
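+/// For example, with `Strategy::Dynamic(1000)`, `use_rtree(n)` is false below
+/// 1000 candidates and true from 1000 onwards, while `cache_size()` is 1000
+/// either way, so both paths buffer at most 1000 docids at a time.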
+#[derive(Debug, Clone, Copy)] +pub enum Strategy { + AlwaysIterative(usize), + AlwaysRtree(usize), + Dynamic(usize), +} + +impl Default for Strategy { + fn default() -> Self { + Strategy::Dynamic(1000) + } +} + +impl Strategy { + pub fn use_rtree(&self, candidates: usize) -> bool { + match self { + Strategy::AlwaysIterative(_) => false, + Strategy::AlwaysRtree(_) => true, + Strategy::Dynamic(i) => candidates >= *i, + } + } + + pub fn cache_size(&self) -> usize { + match self { + Strategy::AlwaysIterative(i) | Strategy::AlwaysRtree(i) | Strategy::Dynamic(i) => *i, + } + } +} + +pub struct GeoSort { + query: Option, + + strategy: Strategy, + ascending: bool, + point: [f64; 2], + field_ids: Option<[u16; 2]>, + rtree: Option>, + + cached_sorted_docids: VecDeque, + geo_candidates: RoaringBitmap, +} + +impl GeoSort { + pub fn new( + strategy: Strategy, + geo_faceted_docids: RoaringBitmap, + point: [f64; 2], + ascending: bool, + ) -> Result { + Ok(Self { + query: None, + strategy, + ascending, + point, + geo_candidates: geo_faceted_docids, + field_ids: None, + rtree: None, + cached_sorted_docids: VecDeque::new(), + }) + } + + /// Refill the internal buffer of cached docids based on the strategy. + /// Drop the rtree if we don't need it anymore. + fn fill_buffer<'ctx>(&mut self, ctx: &mut SearchContext<'ctx>) -> Result<()> { + debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng"); + debug_assert!(self.cached_sorted_docids.is_empty()); + + // if we had an rtree and the strategy doesn't require one anymore we can drop it + let use_rtree = self.strategy.use_rtree(self.geo_candidates.len() as usize); + if !use_rtree && self.rtree.is_some() { + self.rtree = None; + } + + let cache_size = self.strategy.cache_size(); + if let Some(ref mut rtree) = self.rtree { + let point = lat_lng_to_xyz(&self.point); + + if self.ascending { + for point in rtree.nearest_neighbor_iter(&point) { + if self.geo_candidates.contains(point.data.0) { + self.cached_sorted_docids.push_back(point.data.0); + if self.cached_sorted_docids.len() >= cache_size { + break; + } + } + } + } else { + // in the case of the desc geo sort we have to scan the whole database + // and only keep the latest candidates. + for point in rtree.nearest_neighbor_iter(&point) { + if self.geo_candidates.contains(point.data.0) { + self.cached_sorted_docids.pop_front(); + self.cached_sorted_docids.push_back(point.data.0); + } + } + } + } else { + // the iterative version + let [lat, lng] = self.field_ids.unwrap(); + + let mut documents = self + .geo_candidates + .iter() + .map(|id| -> Result<_> { + Ok(( + id, + [ + facet_number_values(id, lat, ctx.index, ctx.txn)? + .next() + .expect("A geo faceted document doesn't contain any lat")? + .0 + .2, + facet_number_values(id, lng, ctx.index, ctx.txn)? + .next() + .expect("A geo faceted document doesn't contain any lng")? 
+ .0 + .2, + ], + )) + }) + .collect::>>()?; + documents.sort_by_key(|(_, p)| distance_between_two_points(&self.point, &p) as usize); + self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id)); + }; + + if self.cached_sorted_docids.is_empty() && matches!(self.strategy, Strategy::AlwaysRtree(_)) + { + // this shouldn't be possible + self.rtree = None; + } + Ok(()) + } +} + +impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { + fn id(&self) -> String { + "geo_sort".to_owned() + } + + fn start_iteration( + &mut self, + ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + universe: &RoaringBitmap, + query: &Q, + ) -> Result<()> { + assert!(self.query.is_none()); + + self.query = Some(query.clone()); + self.geo_candidates &= universe; + + if self.geo_candidates.len() == 0 { + return Ok(()); + } + + let fid_map = ctx.index.fields_ids_map(ctx.txn)?; + let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat"); + let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng"); + self.field_ids = Some([lat, lng]); + + if self.strategy.use_rtree(self.geo_candidates.len() as usize) { + self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree")); + } + + self.fill_buffer(ctx)?; + Ok(()) + } + + fn next_bucket( + &mut self, + ctx: &mut SearchContext<'ctx>, + logger: &mut dyn SearchLogger, + universe: &RoaringBitmap, + ) -> Result>> { + assert!(universe.len() > 1); + let query = self.query.as_ref().unwrap().clone(); + self.geo_candidates &= universe; + + if self.geo_candidates.is_empty() { + return Ok(Some(RankingRuleOutput { query, candidates: universe.clone() })); + } + + let ascending = self.ascending; + let next = |cache: &mut VecDeque<_>| { + if ascending { + cache.pop_front() + } else { + cache.pop_back() + } + }; + while let Some(id) = next(&mut self.cached_sorted_docids) { + if self.geo_candidates.contains(id) { + return Ok(Some(RankingRuleOutput { + query, + candidates: RoaringBitmap::from_iter([id]), + })); + } + } + + // if we got out of this loop it means we've exhausted our cache. + + if self.rtree.is_none() { + // with no rtree it means all geo candidates have been returned. 
We can return all the non geo-faceted documents + Ok(Some(RankingRuleOutput { query, candidates: universe.clone() })) + } else { + // else, we need to refill our bucket and run the function again + self.fill_buffer(ctx)?; + self.next_bucket(ctx, logger, universe) + } + } + + fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger) { + self.query = None; + self.rtree = None; + self.cached_sorted_docids.clear(); + } +} diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 246745678..eb006fbf3 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,6 +1,7 @@ mod bucket_sort; mod db_cache; mod distinct; +mod geo_sort; mod graph_based_ranking_rule; mod interner; mod limits; @@ -25,32 +26,30 @@ mod tests; use std::collections::HashSet; -use bucket_sort::bucket_sort; +use bucket_sort::{bucket_sort, BucketSortOutput}; use charabia::TokenizerBuilder; use db_cache::DatabaseCache; -use graph_based_ranking_rule::{Fid, Position, Proximity, Typo}; +use exact_attribute::ExactAttribute; +use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo}; use heed::RoTxn; -use interner::DedupInterner; +use interner::{DedupInterner, Interner}; pub use logger::visual::VisualSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm}; -use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; -use resolve_query_graph::PhraseDocIdsCache; +use ranking_rules::{ + BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait, +}; +use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache}; use roaring::RoaringBitmap; +use sort::Sort; use words::Words; +use self::geo_sort::GeoSort; +pub use self::geo_sort::Strategy as GeoSortStrategy; +use self::interner::Interned; use crate::search::new::distinct::apply_distinct_rule; use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; -use bucket_sort::BucketSortOutput; -use exact_attribute::ExactAttribute; -use graph_based_ranking_rule::Exactness; -use interner::Interner; -use ranking_rules::{BoxRankingRule, RankingRule}; -use resolve_query_graph::compute_query_graph_docids; -use sort::Sort; - -use self::interner::Interned; /// A structure used throughout the execution of a search query. 
pub struct SearchContext<'ctx> { @@ -139,10 +138,11 @@ fn resolve_universe( fn get_ranking_rules_for_placeholder_search<'ctx>( ctx: &SearchContext<'ctx>, sort_criteria: &Option>, + geo_strategy: geo_sort::Strategy, ) -> Result>> { let mut sort = false; - let mut asc = HashSet::new(); - let mut desc = HashSet::new(); + let mut sorted_fields = HashSet::new(); + let mut geo_sorted = false; let mut ranking_rules: Vec> = vec![]; let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; for rr in settings_ranking_rules { @@ -157,21 +157,28 @@ fn get_ranking_rules_for_placeholder_search<'ctx>( if sort { continue; } - resolve_sort_criteria(sort_criteria, ctx, &mut ranking_rules, &mut asc, &mut desc)?; + resolve_sort_criteria( + sort_criteria, + ctx, + &mut ranking_rules, + &mut sorted_fields, + &mut geo_sorted, + geo_strategy, + )?; sort = true; } crate::Criterion::Asc(field_name) => { - if asc.contains(&field_name) { + if sorted_fields.contains(&field_name) { continue; } - asc.insert(field_name.clone()); + sorted_fields.insert(field_name.clone()); ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?)); } crate::Criterion::Desc(field_name) => { - if desc.contains(&field_name) { + if sorted_fields.contains(&field_name) { continue; } - desc.insert(field_name.clone()); + sorted_fields.insert(field_name.clone()); ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?)); } } @@ -183,6 +190,7 @@ fn get_ranking_rules_for_placeholder_search<'ctx>( fn get_ranking_rules_for_query_graph_search<'ctx>( ctx: &SearchContext<'ctx>, sort_criteria: &Option>, + geo_strategy: geo_sort::Strategy, terms_matching_strategy: TermsMatchingStrategy, ) -> Result>> { // query graph search @@ -192,8 +200,8 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut sort = false; let mut attribute = false; let mut exactness = false; - let mut asc = HashSet::new(); - let mut desc = HashSet::new(); + let mut sorted_fields = HashSet::new(); + let mut geo_sorted = false; let mut ranking_rules: Vec> = vec![]; let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; @@ -245,7 +253,14 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( if sort { continue; } - resolve_sort_criteria(sort_criteria, ctx, &mut ranking_rules, &mut asc, &mut desc)?; + resolve_sort_criteria( + sort_criteria, + ctx, + &mut ranking_rules, + &mut sorted_fields, + &mut geo_sorted, + geo_strategy, + )?; sort = true; } crate::Criterion::Exactness => { @@ -257,17 +272,17 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( exactness = true; } crate::Criterion::Asc(field_name) => { - if asc.contains(&field_name) { + if sorted_fields.contains(&field_name) { continue; } - asc.insert(field_name.clone()); + sorted_fields.insert(field_name.clone()); ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?)); } crate::Criterion::Desc(field_name) => { - if desc.contains(&field_name) { + if sorted_fields.contains(&field_name) { continue; } - desc.insert(field_name.clone()); + sorted_fields.insert(field_name.clone()); ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?)); } } @@ -279,33 +294,53 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( sort_criteria: &Option>, ctx: &SearchContext<'ctx>, ranking_rules: &mut Vec>, - asc: &mut HashSet, - desc: &mut HashSet, + sorted_fields: &mut HashSet, + geo_sorted: &mut bool, + geo_strategy: geo_sort::Strategy, ) -> Result<()> { let sort_criteria = sort_criteria.clone().unwrap_or_default(); 
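+    // Deduplication: `sorted_fields` prevents pushing two sort rules for the
+    // same field, and `geo_sorted` is intended to do the same for `_geoPoint`
+    // sorts, though the match arms below only ever read it.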
ranking_rules.reserve(sort_criteria.len()); for criterion in sort_criteria { - let sort_ranking_rule = match criterion { + match criterion { AscDesc::Asc(Member::Field(field_name)) => { - if asc.contains(&field_name) { + if sorted_fields.contains(&field_name) { continue; } - asc.insert(field_name.clone()); - Sort::new(ctx.index, ctx.txn, field_name, true)? + sorted_fields.insert(field_name.clone()); + ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?)); } AscDesc::Desc(Member::Field(field_name)) => { - if desc.contains(&field_name) { + if sorted_fields.contains(&field_name) { continue; } - desc.insert(field_name.clone()); - Sort::new(ctx.index, ctx.txn, field_name, false)? + sorted_fields.insert(field_name.clone()); + ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?)); } - // geosearch - _ => { - todo!() + AscDesc::Asc(Member::Geo(point)) => { + if *geo_sorted { + continue; + } + let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?; + ranking_rules.push(Box::new(GeoSort::new( + geo_strategy, + geo_faceted_docids, + point, + true, + )?)); + } + AscDesc::Desc(Member::Geo(point)) => { + if *geo_sorted { + continue; + } + let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?; + ranking_rules.push(Box::new(GeoSort::new( + geo_strategy, + geo_faceted_docids, + point, + false, + )?)); } }; - ranking_rules.push(Box::new(sort_ranking_rule)); } Ok(()) } @@ -318,6 +353,7 @@ pub fn execute_search( exhaustive_number_hits: bool, filters: &Option, sort_criteria: &Option>, + geo_strategy: geo_sort::Strategy, from: usize, length: usize, words_limit: Option, @@ -373,7 +409,8 @@ pub fn execute_search( bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? } else { - let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?; + let ranking_rules = + get_ranking_rules_for_placeholder_search(ctx, sort_criteria, geo_strategy)?; bucket_sort( ctx, ranking_rules, diff --git a/milli/src/search/new/tests/geo_sort.rs b/milli/src/search/new/tests/geo_sort.rs new file mode 100644 index 000000000..e49fd7c99 --- /dev/null +++ b/milli/src/search/new/tests/geo_sort.rs @@ -0,0 +1,273 @@ +/*! +This module tests the `geo_sort` ranking rule: + +1. an error is returned if the sort ranking rule exists but no fields-to-sort were given at search time +2. an error is returned if the fields-to-sort are not sortable +3. it is possible to add multiple fields-to-sort at search time +4. custom sort ranking rules can be added to the settings, they interact with the generic `sort` ranking rule as expected +5. numbers appear before strings +6. documents with either: (1) no value, (2) null, or (3) an object for the field-to-sort appear at the end of the bucket +7. boolean values are translated to strings +8. if a field contains an array, it is sorted by the best value in the array according to the sort rule +*/ + +use big_s::S; +use heed::RoTxn; +use maplit::hashset; + +use crate::index::tests::TempIndex; +use crate::search::new::tests::collect_field_values; +use crate::{AscDesc, Criterion, GeoSortStrategy, Member, Search, SearchResult}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_sortable_fields(hashset! 
{ S("_geo") }); + s.set_criteria(vec![Criterion::Words, Criterion::Sort]); + }) + .unwrap(); + index +} + +#[track_caller] +fn execute_iterative_and_rtree_returns_the_same<'a>( + rtxn: &RoTxn<'a>, + index: &TempIndex, + search: &mut Search<'a>, +) -> Vec { + search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(2)); + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + let iterative_ids_bucketed = collect_field_values(&index, rtxn, "id", &documents_ids); + + search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(1000)); + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + let iterative_ids = collect_field_values(&index, rtxn, "id", &documents_ids); + + assert_eq!(iterative_ids_bucketed, iterative_ids, "iterative bucket"); + + search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(2)); + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + let rtree_ids_bucketed = collect_field_values(&index, rtxn, "id", &documents_ids); + + search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(1000)); + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + let rtree_ids = collect_field_values(&index, rtxn, "id", &documents_ids); + + assert_eq!(rtree_ids_bucketed, rtree_ids, "rtree bucket"); + + assert_eq!(iterative_ids, rtree_ids, "iterative vs rtree"); + + iterative_ids.into_iter().map(|id| id.parse().unwrap()).collect() +} + +#[test] +fn test_geo_sort() { + let index = create_index(); + + index + .add_documents(documents!([ + { "id": 2, "_geo": { "lat": 2, "lng": -1 } }, + { "id": 3, "_geo": { "lat": -2, "lng": -2 } }, + { "id": 5, "_geo": { "lat": 6, "lng": -5 } }, + { "id": 4, "_geo": { "lat": 3, "lng": 5 } }, + { "id": 0, "_geo": { "lat": 0, "lng": 0 } }, + { "id": 1, "_geo": { "lat": 1, "lng": 1 } }, + { "id": 6 }, { "id": 8 }, { "id": 7 }, { "id": 10 }, { "id": 9 }, + ])) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + + // --- asc + s.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 0.]))]); + + s.geo_sort_strategy(GeoSortStrategy::Dynamic(100)); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + + s.geo_sort_strategy(GeoSortStrategy::Dynamic(3)); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + + s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100)); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + + s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3)); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + + s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100)); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + + s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3)); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + + // --- desc + s.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 0.]))]); + + s.geo_sort_strategy(GeoSortStrategy::Dynamic(100)); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + + s.geo_sort_strategy(GeoSortStrategy::Dynamic(3)); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + + s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100)); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + + s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3)); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + + s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100)); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + + s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3)); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + let ids = collect_field_values(&index, &txn, "id", &documents_ids); + insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); +} + +#[test] +fn test_geo_sort_around_the_edge_of_the_flat_earth() { + let index = create_index(); + + index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": 0, "lng": 0 } }, + { "id": 1, "_geo": { "lat": 88, "lng": 0 } }, + { "id": 2, "_geo": { "lat": -89, "lng": 0 } }, + + { "id": 3, "_geo": { "lat": 0, "lng": 178 } }, + { "id": 4, "_geo": { "lat": 0, "lng": -179 } }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut s = Search::new(&rtxn, &index); + + // --- asc + s.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 0.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[0, 1, 2, 3, 4]"); + + // ensuring the lat doesn't wrap around + s.sort_criteria(vec![AscDesc::Asc(Member::Geo([85., 0.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[1, 0, 3, 4, 2]"); + + s.sort_criteria(vec![AscDesc::Asc(Member::Geo([-85., 0.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[2, 0, 3, 4, 1]"); + + // ensuring the lng does wrap around + s.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 175.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[3, 4, 2, 1, 0]"); + + s.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., -175.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[4, 3, 2, 1, 0]"); + + // --- desc + s.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 0.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[4, 3, 2, 1, 0]"); + + // ensuring the lat doesn't wrap around + s.sort_criteria(vec![AscDesc::Desc(Member::Geo([85., 0.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[2, 4, 3, 0, 1]"); + + s.sort_criteria(vec![AscDesc::Desc(Member::Geo([-85., 0.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[1, 4, 3, 0, 2]"); + + // ensuring the lng does wrap around + s.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 175.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[0, 1, 2, 4, 3]"); + + s.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., -175.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[0, 1, 2, 3, 4]"); +} + +#[test] +fn geo_sort_mixed_with_words() { + let index = create_index(); + + index + .add_documents(documents!([ + { "id": 0, "doggo": "jean", "_geo": { "lat": 0, "lng": 0 } }, + { "id": 1, "doggo": "intel", "_geo": { "lat": 88, "lng": 0 } }, + { "id": 2, "doggo": "jean bob", "_geo": { "lat": -89, "lng": 0 } }, + { "id": 3, "doggo": "jean michel", "_geo": { "lat": 0, "lng": 178 } }, + { "id": 4, "doggo": "bob marley", "_geo": { "lat": 0, "lng": -179 } }, + ])) + .unwrap(); + + let rtxn = 
index.read_txn().unwrap(); + + let mut s = Search::new(&rtxn, &index); + s.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 0.]))]); + + s.query("jean"); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[0, 2, 3]"); + + s.query("bob"); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[2, 4]"); + + s.query("intel"); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[1]"); +} + +#[test] +fn geo_sort_without_any_geo_faceted_documents() { + let index = create_index(); + + index + .add_documents(documents!([ + { "id": 0, "doggo": "jean" }, + { "id": 1, "doggo": "intel" }, + { "id": 2, "doggo": "jean bob" }, + { "id": 3, "doggo": "jean michel" }, + { "id": 4, "doggo": "bob marley" }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut s = Search::new(&rtxn, &index); + s.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 0.]))]); + + s.query("jean"); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[0, 2, 3]"); +} diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index cdcdb5936..2ad806a87 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -2,6 +2,7 @@ pub mod attribute_fid; pub mod attribute_position; pub mod distinct; pub mod exactness; +pub mod geo_sort; #[cfg(feature = "default")] pub mod language; pub mod ngram_split_words; From 59b12fca87bf59157d78282c5bd19cb37c41ba35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sat, 29 Apr 2023 11:40:00 +0200 Subject: [PATCH 213/234] Fix errors, clippy warnings, and add review comments --- milli/src/search/new/geo_sort.rs | 22 ++++++-- milli/src/search/new/mod.rs | 8 ++- milli/src/search/new/tests/geo_sort.rs | 78 ++++++++++++-------------- 3 files changed, 59 insertions(+), 49 deletions(-) diff --git a/milli/src/search/new/geo_sort.rs b/milli/src/search/new/geo_sort.rs index b841dfe9c..9e1da4479 100644 --- a/milli/src/search/new/geo_sort.rs +++ b/milli/src/search/new/geo_sort.rs @@ -105,7 +105,7 @@ impl GeoSort { /// Refill the internal buffer of cached docids based on the strategy. /// Drop the rtree if we don't need it anymore. - fn fill_buffer<'ctx>(&mut self, ctx: &mut SearchContext<'ctx>) -> Result<()> { + fn fill_buffer(&mut self, ctx: &mut SearchContext) -> Result<()> { debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng"); debug_assert!(self.cached_sorted_docids.is_empty()); @@ -133,7 +133,13 @@ impl GeoSort { // and only keep the latest candidates. for point in rtree.nearest_neighbor_iter(&point) { if self.geo_candidates.contains(point.data.0) { - self.cached_sorted_docids.pop_front(); + // REVIEW COMMENT: that doesn't look right, because we only keep the furthest point in the cache. + // Then the cache will be exhausted after the first bucket and we'll need to repopulate it again immediately. + // I think it's okay if we keep every document id in the cache instead. It's a high memory usage, + // but we already have the whole rtree in memory, which is bigger than a vector of all document ids. 
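+ // (Rough numbers for that trade-off, assuming `DocumentId` is the `u32` used
+ // by the roaring bitmaps: caching every geo candidate costs about 4 bytes per
+ // document, so even a million geo-faceted documents is only ~4 MB, small next
+ // to the rtree that is already held in memory.)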
+ // + // self.cached_sorted_docids.pop_front(); + // self.cached_sorted_docids.push_back(point.data.0); } } @@ -163,7 +169,10 @@ impl GeoSort { )) }) .collect::>>()?; - documents.sort_by_key(|(_, p)| distance_between_two_points(&self.point, &p) as usize); + // REVIEW COMMENT: the haversine distance function can be quite expensive, I think, so it's probably faster + // to use `sort_by_cached_key` instead of `sort_by_key`. + documents + .sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize); self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id)); }; @@ -193,7 +202,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { self.query = Some(query.clone()); self.geo_candidates &= universe; - if self.geo_candidates.len() == 0 { + if self.geo_candidates.is_empty() { return Ok(()); } @@ -203,6 +212,10 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { self.field_ids = Some([lat, lng]); if self.strategy.use_rtree(self.geo_candidates.len() as usize) { + // REVIEW COMMENT: I would prefer to always keep the rtree in memory so that we don't have to deserialize it + // every time the geosort ranking rule starts iterating. + // So we'd initialize it in `::new` and never drop it. + // self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree")); } @@ -210,6 +223,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { Ok(()) } + #[allow(clippy::only_used_in_recursion)] fn next_bucket( &mut self, ctx: &mut SearchContext<'ctx>, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index eb006fbf3..2faf20a1d 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -401,8 +401,12 @@ pub fn execute_search( let graph = QueryGraph::from_query(ctx, &query_terms)?; located_query_terms = Some(query_terms); - let ranking_rules = - get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; + let ranking_rules = get_ranking_rules_for_query_graph_search( + ctx, + sort_criteria, + geo_strategy, + terms_matching_strategy, + )?; universe = resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?; diff --git a/milli/src/search/new/tests/geo_sort.rs b/milli/src/search/new/tests/geo_sort.rs index e49fd7c99..3072007db 100644 --- a/milli/src/search/new/tests/geo_sort.rs +++ b/milli/src/search/new/tests/geo_sort.rs @@ -1,14 +1,9 @@ /*! This module tests the `geo_sort` ranking rule: -1. an error is returned if the sort ranking rule exists but no fields-to-sort were given at search time -2. an error is returned if the fields-to-sort are not sortable -3. it is possible to add multiple fields-to-sort at search time -4. custom sort ranking rules can be added to the settings, they interact with the generic `sort` ranking rule as expected -5. numbers appear before strings -6. documents with either: (1) no value, (2) null, or (3) an object for the field-to-sort appear at the end of the bucket -7. boolean values are translated to strings -8. if a field contains an array, it is sorted by the best value in the array according to the sort rule +REVIEW COMMENT: + - nice tests :) + - add anything that seems not obvious about the behaviour of the geosort ranking rule here */ use big_s::S; @@ -40,21 +35,21 @@ fn execute_iterative_and_rtree_returns_the_same<'a>( ) -> Vec { search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(2)); let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); - let iterative_ids_bucketed = collect_field_values(&index, rtxn, "id", &documents_ids); + let iterative_ids_bucketed = collect_field_values(index, rtxn, "id", &documents_ids); search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(1000)); let SearchResult { documents_ids, .. } = search.execute().unwrap(); - let iterative_ids = collect_field_values(&index, rtxn, "id", &documents_ids); + let iterative_ids = collect_field_values(index, rtxn, "id", &documents_ids); assert_eq!(iterative_ids_bucketed, iterative_ids, "iterative bucket"); search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(2)); let SearchResult { documents_ids, .. } = search.execute().unwrap(); - let rtree_ids_bucketed = collect_field_values(&index, rtxn, "id", &documents_ids); + let rtree_ids_bucketed = collect_field_values(index, rtxn, "id", &documents_ids); search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(1000)); let SearchResult { documents_ids, .. } = search.execute().unwrap(); - let rtree_ids = collect_field_values(&index, rtxn, "id", &documents_ids); + let rtree_ids = collect_field_values(index, rtxn, "id", &documents_ids); assert_eq!(rtree_ids_bucketed, rtree_ids, "rtree bucket"); @@ -67,15 +62,24 @@ fn execute_iterative_and_rtree_returns_the_same<'a>( fn test_geo_sort() { let index = create_index(); + // REVIEW COMMENT: + // I prefer to make the external ids correspond to the internal ids so that + // we can check whether the ranking rules are actually doing work instead of + // returning documents in order of their internal ids. + // index .add_documents(documents!([ - { "id": 2, "_geo": { "lat": 2, "lng": -1 } }, - { "id": 3, "_geo": { "lat": -2, "lng": -2 } }, - { "id": 5, "_geo": { "lat": 6, "lng": -5 } }, - { "id": 4, "_geo": { "lat": 3, "lng": 5 } }, - { "id": 0, "_geo": { "lat": 0, "lng": 0 } }, - { "id": 1, "_geo": { "lat": 1, "lng": 1 } }, - { "id": 6 }, { "id": 8 }, { "id": 7 }, { "id": 10 }, { "id": 9 }, + { "id": 0, "_geo": { "lat": 2, "lng": -1 } }, + { "id": 1, "_geo": { "lat": -2, "lng": -2 } }, + { "id": 2, "_geo": { "lat": 6, "lng": -5 } }, + { "id": 3, "_geo": { "lat": 3, "lng": 5 } }, + { "id": 4, "_geo": { "lat": 0, "lng": 0 } }, + { "id": 5, "_geo": { "lat": 1, "lng": 1 } }, + { "id": 6 }, + { "id": 7 }, + { "id": 8 }, + { "id": 9 }, + { "id": 10 }, ])) .unwrap(); @@ -88,66 +92,54 @@ fn test_geo_sort() { s.geo_sort_strategy(GeoSortStrategy::Dynamic(100)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::Dynamic(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100)); let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); // --- desc s.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 0.]))]); s.geo_sort_strategy(GeoSortStrategy::Dynamic(100)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::Dynamic(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100)); let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); } #[test] From 79001b9c97861712f47e835ca36efadbaeefc832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sat, 29 Apr 2023 23:26:22 +0200 Subject: [PATCH 214/234] Improve performance of the cheapest path finder algorithm --- milli/examples/index.rs | 12 +-- milli/examples/settings.rs | 2 +- milli/src/search/new/bucket_sort.rs | 6 +- .../search/new/graph_based_ranking_rule.rs | 69 +++++++++++--- .../new/ranking_rule_graph/cheapest_paths.rs | 92 +++++++++++++++---- .../new/ranking_rule_graph/dead_ends_cache.rs | 8 ++ .../src/search/new/ranking_rule_graph/mod.rs | 16 +++- .../new/ranking_rule_graph/position/mod.rs | 27 +++++- 8 files changed, 182 insertions(+), 50 deletions(-) diff --git a/milli/examples/index.rs b/milli/examples/index.rs index 18f82797b..781440b56 100644 --- a/milli/examples/index.rs +++ b/milli/examples/index.rs @@ -6,7 +6,7 @@ use std::path::Path; use heed::EnvOpenOptions; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{Criterion, Index, Object}; +use milli::{Index, Object}; fn usage(error: &str, program_name: &str) -> String { format!( @@ -52,18 +52,10 @@ fn main() -> Result<(), Box> { let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); builder.set_filterable_fields(filterable_fields); - builder.set_criteria(vec![ - Criterion::Words, - Criterion::Typo, - Criterion::Proximity, - Criterion::Attribute, - ]); builder.execute(|_| (), || false).unwrap(); let config = IndexerConfig::default(); - let mut indexing_config = IndexDocumentsConfig::default(); - - indexing_config.autogenerate_docids = true; + let indexing_config = IndexDocumentsConfig::default(); let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); diff --git a/milli/examples/settings.rs b/milli/examples/settings.rs index bb24969cc..c7f4780cb 100644 --- a/milli/examples/settings.rs +++ b/milli/examples/settings.rs @@ -24,8 +24,8 @@ fn main() { Criterion::Typo, Criterion::Proximity, Criterion::Attribute, + Criterion::Sort, Criterion::Exactness, - // Criterion::Asc("release_date".to_owned()), ]); builder.execute(|_| (), || false).unwrap(); diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index ec0116fae..5144a0a28 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -72,7 +72,11 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( /// Update the universes accordingly and inform the logger. macro_rules! 
back { () => { - assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); + assert!( + ranking_rule_universes[cur_ranking_rule_index].is_empty(), + "The ranking rule {} did not sort its bucket exhaustively", + ranking_rules[cur_ranking_rule_index].id() + ); logger.end_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index].as_ref(), diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 0d22b5b1e..f5918517b 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -36,6 +36,7 @@ That is we find the documents where either: - OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by` */ +use std::collections::BTreeSet; use std::ops::ControlFlow; use roaring::RoaringBitmap; @@ -99,6 +100,8 @@ impl GraphBasedRankingRule { } } +static mut COUNT_PATHS: usize = 0; + /// The internal state of a graph-based ranking rule during iteration pub struct GraphBasedRankingRuleState { /// The current graph @@ -110,7 +113,7 @@ pub struct GraphBasedRankingRuleState { /// A structure giving the list of possible costs from each node to the end node all_costs: MappedInterner>, /// An index in the first element of `all_distances`, giving the cost of the next bucket - cur_distance_idx: usize, + cur_cost: u64, } impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule { @@ -160,7 +163,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase conditions_cache: condition_docids_cache, dead_ends_cache, all_costs, - cur_distance_idx: 0, + cur_cost: 0, }; self.state = Some(state); @@ -181,16 +184,16 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // should never happen let mut state = self.state.take().unwrap(); - // If the cur_distance_idx does not point to a valid cost in the `all_distances` - // structure, then we have computed all the buckets and can return. - if state.cur_distance_idx >= state.all_costs.get(state.graph.query_graph.root_node).len() { - self.state = None; - return Ok(None); - } - // Retrieve the cost of the paths to compute - let cost = state.all_costs.get(state.graph.query_graph.root_node)[state.cur_distance_idx]; - state.cur_distance_idx += 1; + let Some(&cost) = state + .all_costs + .get(state.graph.query_graph.root_node) + .iter() + .find(|c| **c >= state.cur_cost) else { + self.state = None; + return Ok(None); + }; + state.cur_cost = cost + 1; let mut bucket = RoaringBitmap::new(); @@ -199,7 +202,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase conditions_cache: condition_docids_cache, dead_ends_cache, all_costs, - cur_distance_idx: _, + cur_cost: _, } = &mut state; let mut universe = universe.clone(); @@ -216,9 +219,34 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // the number of future candidate paths given by that same function. 
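// For example (illustrative conditions): if the previously visited path was
// [c1, c2, c3] and the next one is [c1, c2, c4], the docids intersection cached
// below in `subpaths_docids` for the shared [c1, c2] prefix is reused, and only
// the docids of c4 need to be intersected again.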
let mut subpaths_docids: Vec<(Interned, RoaringBitmap)> = vec![]; + let mut at_least_one = false; + // unsafe { + // if COUNT_PATHS >= 1489 && COUNT_PATHS < 1491 { + // println!("COUNT_PATHS {COUNT_PATHS} COST {cost}, NODES {COUNT_VISITED_NODES}, UNIVERSE {}", universe.len()); + // // let all_costs = all_costs.get(graph.query_graph.root_node); + // // println!("{all_costs:?}"); + // dead_ends_cache.debug_print(0); + // println!("{universe:?}"); + + // println!("=================="); + // } + // } + let mut nodes_with_removed_outgoing_conditions = BTreeSet::new(); let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache); + visitor.visit_paths(&mut |path, graph, dead_ends_cache| { + unsafe { + COUNT_PATHS += 1; + } + // if self.id == "position" { + // at_least_one = true; + // print!("."); + // } + // if self.id == "fid" { + at_least_one = true; + // print!("!"); + // } considered_paths.push(path.to_vec()); // If the universe is empty, stop exploring the graph, since no docids will ever be found anymore. if universe.is_empty() { @@ -243,7 +271,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase }; // Then for the remaining of the path, we continue computing docids. for latest_condition in path[idx_of_first_different_condition..].iter().copied() { - // The visit_path_condition will stop let success = visit_path_condition( ctx, graph, @@ -251,6 +278,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase dead_ends_cache, condition_docids_cache, &mut subpaths_docids, + &mut nodes_with_removed_outgoing_conditions, latest_condition, )?; if !success { @@ -281,7 +309,11 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase Ok(ControlFlow::Continue(())) } })?; - + // if at_least_one { + // unsafe { + // println!("\n===== {id} COST: {cost} ==== PATHS: {COUNT_PATHS} ==== NODES: {COUNT_VISITED_NODES} ===== UNIVERSE: {universe}", id=self.id, universe=universe.len()); + // } + // } logger.log_internal_state(graph); logger.log_internal_state(&good_paths); @@ -305,6 +337,10 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let next_query_graph = QueryGraph::build_from_paths(paths); + if !nodes_with_removed_outgoing_conditions.is_empty() { + graph.update_all_costs_before_nodes(&nodes_with_removed_outgoing_conditions, all_costs); + } + self.state = Some(state); Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket })) @@ -321,6 +357,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase /// Returns false if the intersection between the condition /// docids and the previous path docids is empty. +#[allow(clippy::too_many_arguments)] fn visit_path_condition( ctx: &mut SearchContext, graph: &mut RankingRuleGraph, @@ -328,6 +365,7 @@ fn visit_path_condition( dead_ends_cache: &mut DeadEndsCache, condition_docids_cache: &mut ConditionDocIdsCache, subpath: &mut Vec<(Interned, RoaringBitmap)>, + nodes_with_removed_outgoing_conditions: &mut BTreeSet>, latest_condition: Interned, ) -> Result { let condition_docids = &condition_docids_cache @@ -337,7 +375,8 @@ fn visit_path_condition( // 1. Store in the cache that this edge is empty for this universe dead_ends_cache.forbid_condition(latest_condition); // 2. 
remove all the edges with this condition from the ranking rule graph - graph.remove_edges_with_condition(latest_condition); + let source_nodes = graph.remove_edges_with_condition(latest_condition); + nodes_with_removed_outgoing_conditions.extend(source_nodes); return Ok(false); } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 443ab0ec4..4a104df69 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -100,16 +100,21 @@ impl VisitorState { let ControlFlow::Continue(next_any_valid) = cf else { return Ok(ControlFlow::Break(())); }; + any_valid |= next_any_valid; if next_any_valid { + // backtrack as much as possible if a valid path was found and the dead_ends_cache + // was updated such that the current prefix is now invalid self.forbidden_conditions = ctx .dead_ends_cache .forbidden_conditions_for_all_prefixes_up_to(self.path.iter().copied()); if self.visited_conditions.intersects(&self.forbidden_conditions) { - break; + return Ok(ControlFlow::Continue(true)); } } - any_valid |= next_any_valid; } + // if there wasn't any valid path from this node to the end node, then + // this node is a dead end **for this specific cost**. + // we could encode this in the dead-ends cache Ok(ControlFlow::Continue(any_valid)) } @@ -117,7 +122,7 @@ impl VisitorState { fn visit_no_condition( &mut self, dest_node: Interned, - edge_forbidden_nodes: &SmallBitmap, + edge_new_nodes_to_skip: &SmallBitmap, visit: VisitFn, ctx: &mut VisitorContext, ) -> Result> { @@ -137,7 +142,7 @@ impl VisitorState { } } else { let old_fbct = self.forbidden_conditions_to_nodes.clone(); - self.forbidden_conditions_to_nodes.union(edge_forbidden_nodes); + self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip); let cf = self.visit_node(dest_node, visit, ctx)?; self.forbidden_conditions_to_nodes = old_fbct; Ok(cf) @@ -147,14 +152,14 @@ impl VisitorState { &mut self, condition: Interned, dest_node: Interned, - edge_forbidden_nodes: &SmallBitmap, + edge_new_nodes_to_skip: &SmallBitmap, visit: VisitFn, ctx: &mut VisitorContext, ) -> Result> { assert!(dest_node != ctx.graph.query_graph.end_node); if self.forbidden_conditions_to_nodes.contains(dest_node) - || edge_forbidden_nodes.intersects(&self.visited_nodes) + || edge_new_nodes_to_skip.intersects(&self.visited_nodes) { return Ok(ControlFlow::Continue(false)); } @@ -162,11 +167,13 @@ impl VisitorState { return Ok(ControlFlow::Continue(false)); } - if ctx + // Checking that from the destination node, there is at least + // one cost that we can visit that corresponds to our remaining budget. 
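+ // For example (illustrative costs): with `remaining_cost == 3` and
+ // `all_costs_from_node.get(dest_node) == [1, 2, 4]`, no path suffix from
+ // `dest_node` can cost exactly 3, so this branch is pruned here instead of
+ // being explored all the way to the end node.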
+ if !ctx .all_costs_from_node .get(dest_node) .iter() - .all(|next_cost| *next_cost != self.remaining_cost) + .any(|next_cost| *next_cost == self.remaining_cost) { return Ok(ControlFlow::Continue(false)); } @@ -182,7 +189,7 @@ impl VisitorState { self.forbidden_conditions.union(&next_forbidden); } let old_fctn = self.forbidden_conditions_to_nodes.clone(); - self.forbidden_conditions_to_nodes.union(edge_forbidden_nodes); + self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip); let cf = self.visit_node(dest_node, visit, ctx)?; @@ -212,22 +219,21 @@ impl RankingRuleGraph { } while let Some(cur_node) = node_stack.pop_front() { - let mut self_costs = BTreeSet::::new(); + let mut self_costs = Vec::::new(); let cur_node_edges = &self.edges_of_node.get(cur_node); for edge_idx in cur_node_edges.iter() { let edge = self.edges_store.get(edge_idx).as_ref().unwrap(); let succ_node = edge.dest_node; let succ_costs = costs_to_end.get(succ_node); - for succ_distance in succ_costs { - self_costs.insert(edge.cost as u64 + succ_distance); + for succ_cost in succ_costs { + self_costs.push(edge.cost as u64 + succ_cost); } } - let costs_to_end_cur_node = costs_to_end.get_mut(cur_node); - for cost in self_costs.iter() { - costs_to_end_cur_node.push(*cost); - } - *costs_to_end.get_mut(cur_node) = self_costs.into_iter().collect(); + self_costs.sort_unstable(); + self_costs.dedup(); + + *costs_to_end.get_mut(cur_node) = self_costs; for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { if !enqueued.contains(prev_node) { node_stack.push_back(prev_node); @@ -237,4 +243,56 @@ impl RankingRuleGraph { } costs_to_end } + + pub fn update_all_costs_before_nodes( + &self, + removed_nodes: &BTreeSet>, + costs: &mut MappedInterner>, + ) { + // unsafe { + // FIND_ALL_COSTS_INC_COUNT += 1; + // println!( + // "update_all_costs_after_removing_edge incrementally count: {}", + // FIND_ALL_COSTS_INC_COUNT + // ); + // } + + let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); + let mut node_stack = VecDeque::new(); + + for node in removed_nodes.iter() { + enqueued.insert(*node); + node_stack.push_back(*node); + } + + while let Some(cur_node) = node_stack.pop_front() { + let mut self_costs = BTreeSet::::new(); + + let cur_node_edges = &self.edges_of_node.get(cur_node); + for edge_idx in cur_node_edges.iter() { + let edge = self.edges_store.get(edge_idx).as_ref().unwrap(); + let succ_node = edge.dest_node; + let succ_costs = costs.get(succ_node); + for succ_distance in succ_costs { + self_costs.insert(edge.cost as u64 + succ_distance); + } + } + let costs_to_end_cur_node = costs.get_mut(cur_node); + for cost in self_costs.iter() { + costs_to_end_cur_node.push(*cost); + } + let self_costs = self_costs.into_iter().collect::>(); + if &self_costs == costs.get(cur_node) { + continue; + } + *costs.get_mut(cur_node) = self_costs; + + for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { + if !enqueued.contains(prev_node) { + node_stack.push_back(prev_node); + enqueued.insert(prev_node); + } + } + } + } } diff --git a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs index f3bb25d56..4bbf91fcd 100644 --- a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs @@ -88,4 +88,12 @@ impl DeadEndsCache { } } } + + // pub fn debug_print(&self, indent: usize) { + // println!("{} {:?}", " ".repeat(indent), self.forbidden.iter().collect::>()); + 
// for (condition, next) in self.conditions.iter().zip(self.next.iter()) { + // println!("{} {condition}:", " ".repeat(indent)); + // next.debug_print(indent + 2); + // } + // } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index db65afdd7..f60c481de 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -10,10 +10,10 @@ mod cheapest_paths; mod condition_docids_cache; mod dead_ends_cache; -/// Implementation of the `attribute` ranking rule -mod fid; /// Implementation of the `exactness` ranking rule mod exactness; +/// Implementation of the `attribute` ranking rule +mod fid; /// Implementation of the `position` ranking rule mod position; /// Implementation of the `proximity` ranking rule @@ -21,13 +21,14 @@ mod proximity; /// Implementation of the `typo` ranking rule mod typo; +use std::collections::BTreeSet; use std::hash::Hash; -pub use fid::{FidCondition, FidGraph}; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; pub use exactness::{ExactnessCondition, ExactnessGraph}; +pub use fid::{FidCondition, FidGraph}; pub use position::{PositionCondition, PositionGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; @@ -130,7 +131,12 @@ impl Clone for RankingRuleGraph { } impl RankingRuleGraph { /// Remove all edges with the given condition - pub fn remove_edges_with_condition(&mut self, condition_to_remove: Interned) { + /// Return a set of all the source nodes of the removed edges + pub fn remove_edges_with_condition( + &mut self, + condition_to_remove: Interned, + ) -> BTreeSet> { + let mut source_nodes = BTreeSet::new(); for (edge_id, edge_opt) in self.edges_store.iter_mut() { let Some(edge) = edge_opt.as_mut() else { continue }; let Some(condition) = edge.condition else { continue }; @@ -139,7 +145,9 @@ impl RankingRuleGraph { let (source_node, _dest_node) = (edge.source_node, edge.dest_node); *edge_opt = None; self.edges_of_node.get_mut(source_node).remove(edge_id); + source_nodes.insert(source_node); } } + source_nodes } } diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index a8e3f3916..d3b9ac1d1 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -74,7 +74,7 @@ impl RankingRuleGraphTrait for PositionGraph { let mut edges = vec![]; for position in all_positions { - let cost = { + let sum_positions = { let mut cost = 0; for i in 0..term.term_ids.len() { // This is actually not fully correct and slightly penalises ngrams unfairly. @@ -89,7 +89,7 @@ impl RankingRuleGraphTrait for PositionGraph { // TODO: We can improve performances and relevancy by storing // the term subsets associated to each position fetched. 
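// For example (illustrative sums): position sums of 70 and 100 both fall into
// the `65..=128` arm of `cost_from_sum_positions` below and share cost 13, so
// nearby positions no longer fan out into many nearly identical buckets.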
edges.push(( - cost, + cost_from_sum_positions(sum_positions), conditions_interner.insert(PositionCondition { term: term.clone(), // TODO remove this ugly clone position, @@ -100,3 +100,26 @@ impl RankingRuleGraphTrait for PositionGraph { Ok(edges) } } + +fn cost_from_sum_positions(sum_positions: u32) -> u32 { + match sum_positions { + 0 | 1 | 2 | 3 => sum_positions, + 4 | 5 => 4, + 6 | 7 => 5, + 8 | 9 => 6, + 10 | 11 => 7, + 12 | 13 => 8, + 14 | 15 => 9, + 16 | 17..=24 => 10, + 25..=32 => 11, + 33..=64 => 12, + 65..=128 => 13, + 129..=256 => 14, + 257..=512 => 15, + 513..=1024 => 16, + 1025..=2048 => 17, + 2049..=4096 => 18, + 4097..=8192 => 19, + _ => 20, + } +} From 608ceea44087ab849d2f7fbe44ccf458f0f10e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 1 May 2023 11:30:51 +0200 Subject: [PATCH 215/234] Fix bug in position rr --- .../src/search/new/ranking_rule_graph/position/mod.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index d3b9ac1d1..8b70830df 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -74,22 +74,24 @@ impl RankingRuleGraphTrait for PositionGraph { let mut edges = vec![]; for position in all_positions { - let sum_positions = { + let cost = { let mut cost = 0; for i in 0..term.term_ids.len() { // This is actually not fully correct and slightly penalises ngrams unfairly. // Because if two words are in the same bucketed position (e.g. 32) and consecutive, // then their position cost will be 32+32=64, but an ngram of these two words at the // same position will have a cost of 32+32+1=65 - cost += position as u32 + i as u32; + cost += cost_from_position(position as u32 + i as u32); } cost }; // TODO: We can improve performances and relevancy by storing // the term subsets associated to each position fetched. + // + // TODO: group conditions by their cost edges.push(( - cost_from_sum_positions(sum_positions), + cost, conditions_interner.insert(PositionCondition { term: term.clone(), // TODO remove this ugly clone position, @@ -101,7 +103,7 @@ impl RankingRuleGraphTrait for PositionGraph { } } -fn cost_from_sum_positions(sum_positions: u32) -> u32 { +fn cost_from_position(sum_positions: u32) -> u32 { match sum_positions { 0 | 1 | 2 | 3 => sum_positions, 4 | 5 => 4, From 2a7f9adf7889cb302e64de2515633828780e2567 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 1 May 2023 11:30:51 +0200 Subject: [PATCH 216/234] Build query graph more correctly from paths Update snapshots --- milli/src/search/new/query_graph.rs | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 155e6ad75..faa52d0b9 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -6,8 +6,11 @@ use super::small_bitmap::SmallBitmap; use super::SearchContext; use crate::search::new::interner::Interner; use crate::Result; +use fxhash::{FxHashMap, FxHasher}; use std::cmp::Ordering; +use std::collections::hash_map::Entry; use std::collections::BTreeMap; +use std::hash::{Hash, Hasher}; /// A node of the [`QueryGraph`]. 
/// @@ -400,14 +403,29 @@ impl QueryGraph { paths_with_single_terms.push(processed_path); } - // TODO: make a prefix tree of the processed paths to avoid uselessly duplicating nodes + let mut paths_with_single_terms_and_suffix_hash = vec![]; + for path in paths_with_single_terms { + let mut hasher = FxHasher::default(); + let mut path_with_hash = vec![]; + for term in path.into_iter().rev() { + term.hash(&mut hasher); + path_with_hash.push((term, hasher.finish())); + } + path_with_hash.reverse(); + paths_with_single_terms_and_suffix_hash.push(path_with_hash); + } + + let mut node_data_id_for_term_and_suffix_hash = + FxHashMap::<(LocatedQueryTermSubset, u64), Interned>::default(); let mut paths_with_ids = vec![]; - for path in paths_with_single_terms { + for path in paths_with_single_terms_and_suffix_hash { let mut path_with_ids = vec![]; - for term in path { - let id = node_data.push(QueryNodeData::Term(term)); - path_with_ids.push(Interned::from_raw(id.into_raw())); + for (term, suffix_hash) in path { + let node_data_id = node_data_id_for_term_and_suffix_hash + .entry((term.clone(), suffix_hash)) + .or_insert_with(|| node_data.push(QueryNodeData::Term(term))); + path_with_ids.push(Interned::from_raw(node_data_id.into_raw())); } paths_with_ids.push(path_with_ids); } From 3b2c8b9f251ade03bb4b42676047550167e3fe19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 1 May 2023 12:06:10 +0200 Subject: [PATCH 217/234] Improve performance of position rr --- .../new/ranking_rule_graph/position/mod.rs | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index 8b70830df..ba4743d99 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -1,4 +1,4 @@ -use fxhash::FxHashSet; +use fxhash::{FxHashMap, FxHashSet}; use roaring::RoaringBitmap; use super::{ComputedCondition, RankingRuleGraphTrait}; @@ -11,7 +11,7 @@ use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] pub struct PositionCondition { term: LocatedQueryTermSubset, - position: u16, + positions: Vec, } pub enum PositionGraph {} @@ -24,14 +24,17 @@ impl RankingRuleGraphTrait for PositionGraph { condition: &Self::Condition, universe: &RoaringBitmap, ) -> Result { - let PositionCondition { term, .. 
} = condition; - // maybe compute_query_term_subset_docids_within_position_id should accept a universe as argument - let mut docids = compute_query_term_subset_docids_within_position( - ctx, - &term.term_subset, - condition.position, - )?; - docids &= universe; + let PositionCondition { term, positions } = condition; + let mut docids = RoaringBitmap::new(); + for position in positions { + // maybe compute_query_term_subset_docids_within_position should accept a universe as argument + docids |= universe + & compute_query_term_subset_docids_within_position( + ctx, + &term.term_subset, + *position, + )?; + } Ok(ComputedCondition { docids, @@ -72,7 +75,8 @@ impl RankingRuleGraphTrait for PositionGraph { all_positions.extend(positions); } - let mut edges = vec![]; + let mut positions_for_costs = FxHashMap::>::default(); + for position in all_positions { let cost = { let mut cost = 0; @@ -85,7 +89,11 @@ impl RankingRuleGraphTrait for PositionGraph { } cost }; + positions_for_costs.entry(cost).or_default().push(position); + } + let mut edges = vec![]; + for (cost, positions) in positions_for_costs { // TODO: We can improve performances and relevancy by storing // the term subsets associated to each position fetched. // @@ -94,7 +102,7 @@ impl RankingRuleGraphTrait for PositionGraph { cost, conditions_interner.insert(PositionCondition { term: term.clone(), // TODO remove this ugly clone - position, + positions, }), )); } From 30fb1153cc6b6a66487267687edbce41fc64e856 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 1 May 2023 15:33:28 +0200 Subject: [PATCH 218/234] Speed up graph based ranking rule when a lot of different costs exist --- .../search/new/graph_based_ranking_rule.rs | 15 ++--- milli/src/search/new/query_graph.rs | 3 - .../new/ranking_rule_graph/cheapest_paths.rs | 63 ++++++++----------- 3 files changed, 35 insertions(+), 46 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index f5918517b..379a0b2ab 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -309,11 +309,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase Ok(ControlFlow::Continue(())) } })?; - // if at_least_one { - // unsafe { - // println!("\n===== {id} COST: {cost} ==== PATHS: {COUNT_PATHS} ==== NODES: {COUNT_VISITED_NODES} ===== UNIVERSE: {universe}", id=self.id, universe=universe.len()); - // } - // } logger.log_internal_state(graph); logger.log_internal_state(&good_paths); @@ -337,8 +332,14 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let next_query_graph = QueryGraph::build_from_paths(paths); - if !nodes_with_removed_outgoing_conditions.is_empty() { - graph.update_all_costs_before_nodes(&nodes_with_removed_outgoing_conditions, all_costs); + #[allow(clippy::comparison_chain)] + if nodes_with_removed_outgoing_conditions.len() == 1 { + graph.update_all_costs_before_node( + *nodes_with_removed_outgoing_conditions.first().unwrap(), + all_costs, + ); + } else if nodes_with_removed_outgoing_conditions.len() > 1 { + *all_costs = graph.find_all_costs_to_end(); } self.state = Some(state); diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index faa52d0b9..0c3191390 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -8,7 +8,6 @@ use crate::search::new::interner::Interner; use crate::Result; use 
fxhash::{FxHashMap, FxHasher}; use std::cmp::Ordering; -use std::collections::hash_map::Entry; use std::collections::BTreeMap; use std::hash::{Hash, Hasher}; @@ -364,8 +363,6 @@ impl QueryGraph { └──│ b2 │──│ c2 │───│ d │───│ e2 │ └────┘ └────┘ └────┘ └────┘ ``` - But we accept the first representation as it reduces the size - of the graph and shouldn't cause much problems. */ pub fn build_from_paths( paths: Vec, LocatedQueryTermSubset)>>, diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 4a104df69..c065cc706 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -1,8 +1,11 @@ #![allow(clippy::too_many_arguments)] use std::collections::{BTreeSet, VecDeque}; +use std::iter::FromIterator; use std::ops::ControlFlow; +use fxhash::FxHashSet; + use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNode; @@ -112,9 +115,6 @@ impl VisitorState { } } } - // if there wasn't any valid path from this node to the end node, then - // this node is a dead end **for this specific cost**. - // we could encode this in the dead-ends cache Ok(ControlFlow::Continue(any_valid)) } @@ -126,11 +126,11 @@ impl VisitorState { visit: VisitFn, ctx: &mut VisitorContext, ) -> Result> { - if ctx + if !ctx .all_costs_from_node .get(dest_node) .iter() - .all(|next_cost| *next_cost != self.remaining_cost) + .any(|next_cost| *next_cost == self.remaining_cost) { return Ok(ControlFlow::Continue(false)); } @@ -158,14 +158,12 @@ impl VisitorState { ) -> Result> { assert!(dest_node != ctx.graph.query_graph.end_node); - if self.forbidden_conditions_to_nodes.contains(dest_node) + if self.forbidden_conditions.contains(condition) + || self.forbidden_conditions_to_nodes.contains(dest_node) || edge_new_nodes_to_skip.intersects(&self.visited_nodes) { return Ok(ControlFlow::Continue(false)); } - if self.forbidden_conditions.contains(condition) { - return Ok(ControlFlow::Continue(false)); - } // Checking that from the destination node, there is at least // one cost that we can visit that corresponds to our remaining budget. 
@@ -244,48 +242,41 @@ impl RankingRuleGraph { costs_to_end } - pub fn update_all_costs_before_nodes( + pub fn update_all_costs_before_node( &self, - removed_nodes: &BTreeSet>, + node_with_removed_outgoing_conditions: Interned, costs: &mut MappedInterner>, ) { - // unsafe { - // FIND_ALL_COSTS_INC_COUNT += 1; - // println!( - // "update_all_costs_after_removing_edge incrementally count: {}", - // FIND_ALL_COSTS_INC_COUNT - // ); - // } - let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); let mut node_stack = VecDeque::new(); - for node in removed_nodes.iter() { - enqueued.insert(*node); - node_stack.push_back(*node); - } + enqueued.insert(node_with_removed_outgoing_conditions); + node_stack.push_back(node_with_removed_outgoing_conditions); - while let Some(cur_node) = node_stack.pop_front() { - let mut self_costs = BTreeSet::::new(); + 'main_loop: while let Some(cur_node) = node_stack.pop_front() { + let mut costs_to_remove = FxHashSet::default(); + for c in costs.get(cur_node) { + costs_to_remove.insert(*c); + } let cur_node_edges = &self.edges_of_node.get(cur_node); for edge_idx in cur_node_edges.iter() { let edge = self.edges_store.get(edge_idx).as_ref().unwrap(); - let succ_node = edge.dest_node; - let succ_costs = costs.get(succ_node); - for succ_distance in succ_costs { - self_costs.insert(edge.cost as u64 + succ_distance); + for cost in costs.get(edge.dest_node).iter() { + costs_to_remove.remove(&(*cost + edge.cost as u64)); + if costs_to_remove.is_empty() { + continue 'main_loop; + } } } - let costs_to_end_cur_node = costs.get_mut(cur_node); - for cost in self_costs.iter() { - costs_to_end_cur_node.push(*cost); + if costs_to_remove.is_empty() { + continue 'main_loop; } - let self_costs = self_costs.into_iter().collect::>(); - if &self_costs == costs.get(cur_node) { - continue; + let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied()); + for c in costs_to_remove { + new_costs.remove(&c); } - *costs.get_mut(cur_node) = self_costs; + *costs.get_mut(cur_node) = new_costs.into_iter().collect(); for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { if !enqueued.contains(prev_node) { From 11f814821d02f604ff57a538d2f9d81337a336c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 1 May 2023 15:48:47 +0200 Subject: [PATCH 219/234] Minor cleanup --- milli/src/search/new/ranking_rule_graph/position/mod.rs | 6 ++++-- milli/tests/assets/test_set.ndjson | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index ba4743d99..c3d33174b 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -91,13 +91,15 @@ impl RankingRuleGraphTrait for PositionGraph { }; positions_for_costs.entry(cost).or_default().push(position); } + println!( + "positions for cost {} : {positions_for_costs:?}", + term.term_subset.description(ctx) + ); let mut edges = vec![]; for (cost, positions) in positions_for_costs { // TODO: We can improve performances and relevancy by storing // the term subsets associated to each position fetched. 
- // - // TODO: group conditions by their cost edges.push(( cost, conditions_interner.insert(PositionCondition { diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 4c83cbe14..1a8a23c3e 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -47,7 +47,7 @@ "word_rank": 0, "typo_rank": 1, "proximity_rank": 10, - "attribute_rank": 111, + "attribute_rank": 108, "exact_rank": 6, "asc_desc_rank": 2, "sort_by_rank": 0, From 1b514517f52efb833d4dc98361d70827d760ad6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 1 May 2023 16:26:01 +0200 Subject: [PATCH 220/234] Fix bug in computation of query term at a position --- .../search/new/graph_based_ranking_rule.rs | 25 ------ .../new/ranking_rule_graph/position/mod.rs | 8 +- milli/src/search/new/resolve_query_graph.rs | 10 +-- milli/src/search/new/tests/integration.rs | 78 +++++++++++++++++++ milli/src/search/new/tests/mod.rs | 3 +- 5 files changed, 84 insertions(+), 40 deletions(-) create mode 100644 milli/src/search/new/tests/integration.rs diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 379a0b2ab..d8f6836e7 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -100,8 +100,6 @@ impl GraphBasedRankingRule { } } -static mut COUNT_PATHS: usize = 0; - /// The internal state of a graph-based ranking rule during iteration pub struct GraphBasedRankingRuleState { /// The current graph @@ -219,34 +217,11 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // the number of future candidate paths given by that same function. let mut subpaths_docids: Vec<(Interned, RoaringBitmap)> = vec![]; - let mut at_least_one = false; - // unsafe { - // if COUNT_PATHS >= 1489 && COUNT_PATHS < 1491 { - // println!("COUNT_PATHS {COUNT_PATHS} COST {cost}, NODES {COUNT_VISITED_NODES}, UNIVERSE {}", universe.len()); - // // let all_costs = all_costs.get(graph.query_graph.root_node); - // // println!("{all_costs:?}"); - // dead_ends_cache.debug_print(0); - // println!("{universe:?}"); - - // println!("=================="); - // } - // } let mut nodes_with_removed_outgoing_conditions = BTreeSet::new(); let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache); visitor.visit_paths(&mut |path, graph, dead_ends_cache| { - unsafe { - COUNT_PATHS += 1; - } - // if self.id == "position" { - // at_least_one = true; - // print!("."); - // } - // if self.id == "fid" { - at_least_one = true; - // print!("!"); - // } considered_paths.push(path.to_vec()); // If the universe is empty, stop exploring the graph, since no docids will ever be found anymore. 
if universe.is_empty() { diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index c3d33174b..d4640097e 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -35,7 +35,6 @@ impl RankingRuleGraphTrait for PositionGraph { *position, )?; } - Ok(ComputedCondition { docids, universe_len: universe.len(), @@ -91,15 +90,12 @@ impl RankingRuleGraphTrait for PositionGraph { }; positions_for_costs.entry(cost).or_default().push(position); } - println!( - "positions for cost {} : {positions_for_costs:?}", - term.term_subset.description(ctx) - ); + let mut edges = vec![]; for (cost, positions) in positions_for_costs { // TODO: We can improve performances and relevancy by storing - // the term subsets associated to each position fetched. + // the term subsets associated to each position fetched edges.push(( cost, conditions_interner.insert(PositionCondition { diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 131dcd856..2bcb4d3ac 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -69,16 +69,14 @@ pub fn compute_query_term_subset_docids_within_field_id( } for phrase in term.all_phrases(ctx)? { - let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone(); // There may be false positives when resolving a phrase, so we're not // guaranteed that all of its words are within a single fid. // TODO: fix this? if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? { - phrase_docids &= word_fid_docids; + docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids; } } - docids |= phrase_docids; } if let Some(word_prefix) = term.use_prefix_db(ctx) { @@ -98,7 +96,6 @@ pub fn compute_query_term_subset_docids_within_position( position: u16, ) -> Result { // TODO Use the roaring::MultiOps trait - let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { if let Some(word_position_docids) = @@ -109,16 +106,14 @@ pub fn compute_query_term_subset_docids_within_position( } for phrase in term.all_phrases(ctx)? { - let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone(); // It's difficult to know the expected position of the words in the phrase, // so instead we just check the first one. // TODO: fix this? if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? { - phrase_docids &= word_position_docids; + docids |= ctx.get_phrase_docids(phrase)? 
& word_position_docids } } - docids |= phrase_docids; } if let Some(word_prefix) = term.use_prefix_db(ctx) { @@ -128,7 +123,6 @@ pub fn compute_query_term_subset_docids_within_position( docids |= word_position_docids; } } - Ok(docids) } diff --git a/milli/src/search/new/tests/integration.rs b/milli/src/search/new/tests/integration.rs new file mode 100644 index 000000000..153a80343 --- /dev/null +++ b/milli/src/search/new/tests/integration.rs @@ -0,0 +1,78 @@ +use std::io::Cursor; + +use big_s::S; +use heed::EnvOpenOptions; +use maplit::{hashmap, hashset}; + +use crate::{ + db_snap, + documents::{DocumentsBatchBuilder, DocumentsBatchReader}, + update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}, + Criterion, Index, Object, +}; +pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson"); + +pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_criteria(criteria.to_vec()); + builder.set_filterable_fields(hashset! { + S("tag"), + S("asc_desc_rank"), + S("_geo"), + S("opt1"), + S("opt1.opt2"), + S("tag_in") + }); + builder.set_sortable_fields(hashset! { + S("tag"), + S("asc_desc_rank"), + }); + builder.set_synonyms(hashmap! { + S("hello") => vec![S("good morning")], + S("world") => vec![S("earth")], + S("america") => vec![S("the united states")], + }); + builder.set_searchable_fields(vec![S("title"), S("description")]); + builder.execute(|_| (), || false).unwrap(); + + // index documents + let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; + let indexing_config = IndexDocumentsConfig::default(); + + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); + let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); + let reader = Cursor::new(CONTENT.as_bytes()); + + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); + } + + let vector = documents_builder.into_inner().unwrap(); + + // index documents + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index +} + +#[test] +fn snapshot_integration_dataset() { + let index = setup_search_index_with_criteria(&[Criterion::Attribute]); + db_snap!(index, word_position_docids, @"3c9347a767bceef3beb31465f1e5f3ae"); +} diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 2ad806a87..906aeda83 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -3,16 +3,17 @@ pub mod attribute_position; pub mod distinct; pub mod exactness; pub mod geo_sort; +pub mod integration; #[cfg(feature = "default")] pub mod language; pub mod ngram_split_words; pub mod proximity; pub mod proximity_typo; pub mod sort; +pub mod stop_words; pub mod typo; pub mod typo_proximity; pub mod words_tms; -pub mod stop_words; fn collect_field_values( index: &crate::Index, From 58735d6d8f3afe210e5da79cc43f72bc5089d91e Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 1 May 2023 16:37:35 +0200 Subject: [PATCH 221/234] Fix outdated relevancy test --- milli/tests/assets/test_set.ndjson | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 1a8a23c3e..175d9b3ce 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -68,7 +68,7 @@ "word_rank": 0, "typo_rank": 1, "proximity_rank": 16, - "attribute_rank": 213, + "attribute_rank": 208, "exact_rank": 5, "asc_desc_rank": 3, "sort_by_rank": 2, @@ -220,7 +220,7 @@ "word_rank": 0, "typo_rank": 2, "proximity_rank": 10, - "attribute_rank": 213, + "attribute_rank": 209, "exact_rank": 6, "asc_desc_rank": 1, "sort_by_rank": 2, From aa630917529e775a456e64cd0899f6f74a80baba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 1 May 2023 17:20:10 +0200 Subject: [PATCH 222/234] Fix bug in exact_attribute --- milli/src/search/new/exact_attribute.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 93dd7c3fc..dc9c95d3d 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -34,7 +34,6 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { query: &QueryGraph, ) -> Result<()> { self.state = State::start_iteration(ctx, universe, query)?; - Ok(()) } @@ -169,7 +168,8 @@ impl State { // longer phrases we'll be losing on precision here. let bucketed_position = crate::bucketed_position(position + offset); let word_position_docids = - ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(); + ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default() + & universe; candidates &= word_position_docids; if candidates.is_empty() { return Ok(State::Empty(query_graph.clone())); @@ -183,10 +183,15 @@ impl State { return Ok(State::Empty(query_graph.clone())); } - let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); + let searchable_fields_ids = { + if let Some(fids) = ctx.index.searchable_fields_ids(ctx.txn)? { + fids + } else { + ctx.index.fields_ids_map(ctx.txn)?.ids().collect() + } + }; let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); - // then check that there exists at least one attribute that has all of the terms for fid in searchable_fields_ids { let mut intersection = MultiOps::intersection( @@ -208,10 +213,10 @@ impl State { .field_id_word_count_docids .get(ctx.txn, &(fid, count_all_positions as u8))? 
.unwrap_or_default() + & universe } else { RoaringBitmap::default() }; - candidates_per_attribute.push(FieldCandidates { start_with_exact: intersection, exact_word_count: candidates_with_exact_word_count, From 7b8cc25625878d5129ad0be8e85e0e5cd9ddca77 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 2 May 2023 18:53:01 +0200 Subject: [PATCH 223/234] rename located_query_terms_from_string -> located_query_terms_from_tokens --- milli/src/search/new/matches/matching_words.rs | 4 ++-- milli/src/search/new/matches/mod.rs | 4 ++-- milli/src/search/new/mod.rs | 4 ++-- milli/src/search/new/query_term/mod.rs | 2 +- milli/src/search/new/query_term/parse_query.rs | 3 +-- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index e9a728a01..92248345e 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -235,7 +235,7 @@ pub(crate) mod tests { use charabia::{TokenKind, TokenizerBuilder}; - use super::super::super::located_query_terms_from_string; + use super::super::super::located_query_terms_from_tokens; use super::*; use crate::index::tests::TempIndex; @@ -256,7 +256,7 @@ pub(crate) mod tests { let mut ctx = SearchContext::new(&temp_index, &rtxn); let tokenizer = TokenizerBuilder::new().build(); let tokens = tokenizer.tokenize("split this world"); - let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap(); + let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); let matching_words = MatchingWords::new(ctx, query_terms); assert_eq!( diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 0db2c3660..1974ae431 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -499,7 +499,7 @@ mod tests { use charabia::TokenizerBuilder; use matching_words::tests::temp_index_with_documents; - use super::super::located_query_terms_from_string; + use super::super::located_query_terms_from_tokens; use super::*; use crate::SearchContext; @@ -507,7 +507,7 @@ mod tests { pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self { let tokenizer = TokenizerBuilder::new().build(); let tokens = tokenizer.tokenize(query); - let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap(); + let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); let matching_words = MatchingWords::new(ctx, query_terms); Self::new(matching_words, TokenizerBuilder::new().build()) } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 2faf20a1d..0ba5613b5 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -36,7 +36,7 @@ use interner::{DedupInterner, Interner}; pub use logger::visual::VisualSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; -use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm}; +use query_term::{located_query_terms_from_tokens, LocatedQueryTerm, Phrase, QueryTerm}; use ranking_rules::{ BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait, }; @@ -387,7 +387,7 @@ pub fn execute_search( let tokenizer = tokbuilder.build(); let tokens = tokenizer.tokenize(query); - let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; + let query_terms = located_query_terms_from_tokens(ctx, tokens, 
words_limit)?; if query_terms.is_empty() { // Do a placeholder search instead None diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 5f1a45d83..a8e121094 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -10,7 +10,7 @@ use std::ops::RangeInclusive; use compute_derivations::partially_initialized_term_from_word; use either::Either; pub use ntypo_subset::NTypoTermSubset; -pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed}; +pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed}; pub use phrase::Phrase; use super::interner::{DedupInterner, Interned}; diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 734938551..dc317a0fb 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -5,8 +5,7 @@ use super::*; use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. -// TODO: checking if the positions are correct for phrases, separators, ngrams -pub fn located_query_terms_from_string( +pub fn located_query_terms_from_tokens( ctx: &mut SearchContext, query: NormalizedTokenIter<&[u8]>, words_limit: Option<usize>, From 75819bc94047643478a17f8f6e9a58f4ba363f1f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 2 May 2023 18:53:40 +0200 Subject: [PATCH 224/234] Remove too many arguments on resolve_maximally_reduced_query_graph --- milli/src/search/new/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 0ba5613b5..7e8426bf9 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -92,7 +92,6 @@ impl Word { } /// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it. -#[allow(clippy::too_many_arguments)] fn resolve_maximally_reduced_query_graph( ctx: &mut SearchContext, universe: &RoaringBitmap, From fdc17638381099b8c052652054d0307165fd78c6 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 2 May 2023 18:54:09 +0200 Subject: [PATCH 225/234] Use MultiOps for resolve_query_graph --- milli/src/search/new/resolve_query_graph.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 2bcb4d3ac..797db5875 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -3,7 +3,7 @@ use std::collections::VecDeque; use fxhash::FxHashMap; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use super::interner::Interned; use super::query_graph::QueryNodeData; @@ -126,6 +126,7 @@ pub fn compute_query_term_subset_docids_within_position( Ok(docids) } +/// Returns the subset of the input universe that satisfies the constraints of the input query graph.
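+/// (In outline, as a reading aid: nodes are resolved only once all of their
+/// predecessors are; each node starts from the union of its predecessors'
+/// docids, restricted to the documents matching its own term, so whatever
+/// reaches the end node is exactly the matching subset of the universe.)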
pub fn compute_query_graph_docids( ctx: &mut SearchContext, q: &QueryGraph, @@ -148,10 +149,8 @@ pub fn compute_query_graph_docids( continue; } // Take union of all predecessors - let mut predecessors_docids = RoaringBitmap::new(); - for p in predecessors.iter() { - predecessors_docids |= path_nodes_docids.get(p); - } + let predecessors_docids = + MultiOps::union(predecessors.iter().map(|p| path_nodes_docids.get(p))); let node_docids = match &node.data { QueryNodeData::Term(LocatedQueryTermSubset { From b60840ebffd67b75252b72ca5d3628cd1dcc71f7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 2 May 2023 18:54:23 +0200 Subject: [PATCH 226/234] Remove self.iterating from words --- milli/src/search/new/words.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 5c28f017b..72b7b5916 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -1,15 +1,15 @@ +use roaring::RoaringBitmap; + use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::resolve_query_graph::compute_query_graph_docids; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::{Result, TermsMatchingStrategy}; -use roaring::RoaringBitmap; pub struct Words { exhausted: bool, // TODO: remove query_graph: Option, - iterating: bool, // TODO: remove nodes_to_remove: Vec>, terms_matching_strategy: TermsMatchingStrategy, } @@ -18,7 +18,6 @@ impl Words { Self { exhausted: true, query_graph: None, - iterating: false, nodes_to_remove: vec![], terms_matching_strategy, } @@ -48,7 +47,6 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { vec![] } }; - self.iterating = true; Ok(()) } @@ -58,9 +56,6 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { - assert!(self.iterating); - assert!(universe.len() > 1); - if self.exhausted { return Ok(None); } @@ -85,7 +80,6 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, ) { - self.iterating = false; self.exhausted = true; self.nodes_to_remove = vec![]; self.query_graph = None; From c470b67fa29bd2814237fec58016417b8e38b314 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 May 2023 11:25:10 +0200 Subject: [PATCH 227/234] revamp the test to use execute_iterative_and_rtree_returns_the_same --- milli/src/search/new/tests/geo_sort.rs | 87 +++++--------------------- 1 file changed, 14 insertions(+), 73 deletions(-) diff --git a/milli/src/search/new/tests/geo_sort.rs b/milli/src/search/new/tests/geo_sort.rs index 3072007db..1f0003082 100644 --- a/milli/src/search/new/tests/geo_sort.rs +++ b/milli/src/search/new/tests/geo_sort.rs @@ -1,9 +1,5 @@ /*! -This module tests the `geo_sort` ranking rule: - -REVIEW COMMENT: - - nice tests :) - - add anything that seems not obvious about the behaviour of the geosort ranking rule here +This module tests the `geo_sort` ranking rule */ use big_s::S; @@ -62,84 +58,29 @@ fn execute_iterative_and_rtree_returns_the_same<'a>( fn test_geo_sort() { let index = create_index(); - // REVIEW COMMENT: - // I prefer to make the external ids correspond to the internal ids so that - // we can check whether the ranking rules are actually doing work instead of - // returning documents in order of their internal ids. 
- // index .add_documents(documents!([ - { "id": 0, "_geo": { "lat": 2, "lng": -1 } }, - { "id": 1, "_geo": { "lat": -2, "lng": -2 } }, - { "id": 2, "_geo": { "lat": 6, "lng": -5 } }, - { "id": 3, "_geo": { "lat": 3, "lng": 5 } }, - { "id": 4, "_geo": { "lat": 0, "lng": 0 } }, - { "id": 5, "_geo": { "lat": 1, "lng": 1 } }, - { "id": 6 }, - { "id": 7 }, - { "id": 8 }, - { "id": 9 }, - { "id": 10 }, + { "id": 2, "_geo": { "lat": 2, "lng": -1 } }, + { "id": 3, "_geo": { "lat": -2, "lng": -2 } }, + { "id": 5, "_geo": { "lat": 6, "lng": -5 } }, + { "id": 4, "_geo": { "lat": 3, "lng": 5 } }, + { "id": 0, "_geo": { "lat": 0, "lng": 0 } }, + { "id": 1, "_geo": { "lat": 1, "lng": 1 } }, + { "id": 6 }, { "id": 8 }, { "id": 7 }, { "id": 10 }, { "id": 9 }, ])) .unwrap(); - let txn = index.read_txn().unwrap(); + let rtxn = index.read_txn().unwrap(); - let mut s = Search::new(&txn, &index); + let mut s = Search::new(&rtxn, &index); - // --- asc s.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 0.]))]); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[0, 1, 2, 3, 4, 5, 6, 8, 7, 10, 9]"); - s.geo_sort_strategy(GeoSortStrategy::Dynamic(100)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); - - s.geo_sort_strategy(GeoSortStrategy::Dynamic(3)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); - - s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); - - s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); - - s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); - - s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); - - // --- desc s.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 0.]))]); - - s.geo_sort_strategy(GeoSortStrategy::Dynamic(100)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); - - s.geo_sort_strategy(GeoSortStrategy::Dynamic(3)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); - - s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); - - s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); - - s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100)); - let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); - - s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3)); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); + let ids = execute_iterative_and_rtree_returns_the_same(&rtxn, &index, &mut s); + insta::assert_snapshot!(format!("{ids:?}"), @"[5, 4, 3, 2, 1, 0, 6, 8, 7, 10, 9]"); } #[test] From 8875d24a4858a8445ad80d2f91ce7aa4c7c6077f Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 May 2023 11:39:35 +0200 Subject: [PATCH 228/234] deserialize the rtree only when it's needed, and keep it in memory once it has been deserialized --- milli/src/search/new/geo_sort.rs | 38 ++++++++------------------------ 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/milli/src/search/new/geo_sort.rs b/milli/src/search/new/geo_sort.rs index 9e1da4479..98eacd48a 100644 --- a/milli/src/search/new/geo_sort.rs +++ b/milli/src/search/new/geo_sort.rs @@ -111,12 +111,13 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> { // if we had an rtree and the strategy doesn't require one anymore we can drop it let use_rtree = self.strategy.use_rtree(self.geo_candidates.len() as usize); - if !use_rtree && self.rtree.is_some() { - self.rtree = None; + if use_rtree && self.rtree.is_none() { + self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree")); } let cache_size = self.strategy.cache_size(); - if let Some(ref mut rtree) = self.rtree { + if use_rtree { + let rtree = self.rtree.as_ref().unwrap(); let point = lat_lng_to_xyz(&self.point); if self.ascending { @@ -169,18 +170,12 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> { )) }) .collect::<Result<Vec<(u32, [f64; 2])>>>()?; - // REVIEW COMMENT: the haversine distance function can be quite expensive, I think, so it's probably faster - // to use `sort_by_cached_key` instead of `sort_by_key`. + // computing the distance between two points is expensive, thus we cache the result documents .sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize); self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id)); }; - if self.cached_sorted_docids.is_empty() && matches!(self.strategy, Strategy::AlwaysRtree(_)) - { - // this shouldn't be possible - self.rtree = None; - } Ok(()) } } @@ -210,15 +205,6 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> { let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat"); let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng"); self.field_ids = Some([lat, lng]); - - if self.strategy.use_rtree(self.geo_candidates.len() as usize) { - // REVIEW COMMENT: I would prefer to always keep the rtree in memory so that we don't have to deserialize it - // every time the geosort ranking rule starts iterating. - // So we'd initialize it in `::new` and never drop it. - // - self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree")); - } - self.fill_buffer(ctx)?; Ok(()) } @@ -256,20 +242,14 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> { } // if we got out of this loop it means we've exhausted our cache. - - if self.rtree.is_none() { - // with no rtree it means all geo candidates have been returned.
We can return all the non geo-faceted documents - Ok(Some(RankingRuleOutput { query, candidates: universe.clone() })) - } else { - // else, we need to refill our bucket and run the function again - self.fill_buffer(ctx)?; - self.next_bucket(ctx, logger, universe) - } + // we need to refill it and run the function again. + self.fill_buffer(ctx)?; + self.next_bucket(ctx, logger, universe) } fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) { + // we do not reset the rtree here, it could be used in a next iteration self.query = None; - self.rtree = None; self.cached_sorted_docids.clear(); } } From c85392ce40826c49c64438228fd60204ed5a7420 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 May 2023 12:04:08 +0200 Subject: [PATCH 229/234] make the descending geosort fast --- milli/src/search/new/geo_sort.rs | 32 +++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/milli/src/search/new/geo_sort.rs b/milli/src/search/new/geo_sort.rs index 98eacd48a..ad3a1783a 100644 --- a/milli/src/search/new/geo_sort.rs +++ b/milli/src/search/new/geo_sort.rs @@ -118,9 +118,9 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> { let cache_size = self.strategy.cache_size(); if use_rtree { let rtree = self.rtree.as_ref().unwrap(); - let point = lat_lng_to_xyz(&self.point); if self.ascending { + let point = lat_lng_to_xyz(&self.point); for point in rtree.nearest_neighbor_iter(&point) { if self.geo_candidates.contains(point.data.0) { self.cached_sorted_docids.push_back(point.data.0); @@ -130,18 +130,15 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> { } } } else { - // in the case of the desc geo sort we have to scan the whole database - // and only keep the latest candidates. + // in the case of the desc geo sort we look for the closest point to the opposite of the queried point + // and we insert the points in reverse order; they get reversed when emptying the cache later on + let point = lat_lng_to_xyz(&opposite_of(self.point)); for point in rtree.nearest_neighbor_iter(&point) { if self.geo_candidates.contains(point.data.0) { - // REVIEW COMMENT: that doesn't look right, because we only keep the furthest point in the cache. - // Then the cache will be exhausted after the first bucket and we'll need to repopulate it again immediately. - // I think it's okay if we keep every document id in the cache instead. It's a high memory usage, - // but we already have the whole rtree in memory, which is bigger than a vector of all document ids. - // - // self.cached_sorted_docids.pop_front(); - // - self.cached_sorted_docids.push_back(point.data.0); + self.cached_sorted_docids.push_front(point.data.0); + if self.cached_sorted_docids.len() >= cache_size { + break; + } } } } @@ -253,3 +250,16 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> { self.cached_sorted_docids.clear(); } } + +/// Compute the antipodal coordinate of `coord` +fn opposite_of(mut coord: [f64; 2]) -> [f64; 2] { + coord[0] *= -1.; + // in the case of x,0 we want to return x,180 + if coord[1] > 0.
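+ // (worked example with illustrative values: `opposite_of([48.86, 2.35])`
+ // returns `[-48.86, -177.65]`; longitudes in (0, 180] shift down by 180
+ // and the rest shift up, so the antipode always stays within [-180, 180],
+ // and the documents nearest to it are exactly those farthest from `coord`)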
{ + coord[1] -= 180.; + } else { + coord[1] += 180.; + } + + coord +} From 342c4ff85d56730c734a6dc35815b4c9efba468b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 May 2023 09:52:16 +0200 Subject: [PATCH 230/234] geosort: Remove rtree unwrap --- milli/src/search/new/geo_sort.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/milli/src/search/new/geo_sort.rs b/milli/src/search/new/geo_sort.rs index ad3a1783a..e94ed33d1 100644 --- a/milli/src/search/new/geo_sort.rs +++ b/milli/src/search/new/geo_sort.rs @@ -109,16 +109,23 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> { debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng"); debug_assert!(self.cached_sorted_docids.is_empty()); - // if we had an rtree and the strategy doesn't require one anymore we can drop it - let use_rtree = self.strategy.use_rtree(self.geo_candidates.len() as usize); - if use_rtree && self.rtree.is_none() { - self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree")); - } + // lazily initialize the rtree if needed by the strategy, and cache it in `self.rtree` + let rtree = if self.strategy.use_rtree(self.geo_candidates.len() as usize) { + if let Some(rtree) = self.rtree.as_ref() { + // get rtree from cache + Some(rtree) + } else { + let rtree = ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"); + // insert the rtree in the cache and return it. + // Can't use `get_or_insert_with` because getting the rtree from the DB is a fallible operation. + Some(&*self.rtree.insert(rtree)) + } + } else { + None + }; let cache_size = self.strategy.cache_size(); - if use_rtree { - let rtree = self.rtree.as_ref().unwrap(); - + if let Some(rtree) = rtree { if self.ascending { let point = lat_lng_to_xyz(&self.point); for point in rtree.nearest_neighbor_iter(&point) { From 1aaf24ccbf26590ee0979e18adb09dc3a7a6c8b9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 May 2023 12:21:58 +0200 Subject: [PATCH 231/234] Cargo fmt --- milli/src/heed_codec/mod.rs | 2 +- milli/src/search/new/query_graph.rs | 10 ++++++---- milli/src/search/new/query_term/ntypo_subset.rs | 3 +-- milli/src/search/new/query_term/phrase.rs | 3 ++- milli/src/search/new/ranking_rule_graph/build.rs | 3 ++- milli/src/search/new/tests/attribute_fid.rs | 3 ++- milli/src/search/new/tests/attribute_position.rs | 5 ++--- milli/src/search/new/tests/distinct.rs | 7 ++----- milli/src/search/new/tests/exactness.rs | 7 +++---- milli/src/search/new/tests/integration.rs | 9 +++------ milli/src/search/new/tests/language.rs | 3 ++- milli/src/search/new/tests/ngram_split_words.rs | 7 +++---- milli/src/search/new/tests/proximity.rs | 7 +++---- milli/src/search/new/tests/proximity_typo.rs | 7 +++---- milli/src/search/new/tests/sort.rs | 7 +++---- milli/src/search/new/tests/stop_words.rs | 6 ++++-- milli/src/search/new/tests/typo_proximity.rs | 7 +++---- milli/src/search/new/tests/words_tms.rs | 7 +++---- 18 files changed, 48 insertions(+), 55 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index b7a8c3c88..de2644e11 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -21,5 +21,5 @@ pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; pub use self::script_language_codec::ScriptLanguageCodec; -pub use self::str_beu32_codec::{StrBEU32Codec, StrBEU16Codec}; +pub use self::str_beu32_codec::{StrBEU16Codec, StrBEU32Codec}; pub use self::str_str_u8_codec::{U8StrStrCodec,
UncheckedU8StrStrCodec}; diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 0c3191390..0e7d5a7f3 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,3 +1,9 @@ +use std::cmp::Ordering; +use std::collections::BTreeMap; +use std::hash::{Hash, Hasher}; + +use fxhash::{FxHashMap, FxHasher}; + use super::interner::{FixedSizeInterner, Interned}; use super::query_term::{ self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset, @@ -6,10 +12,6 @@ use super::small_bitmap::SmallBitmap; use super::SearchContext; use crate::search::new::interner::Interner; use crate::Result; -use fxhash::{FxHashMap, FxHasher}; -use std::cmp::Ordering; -use std::collections::BTreeMap; -use std::hash::{Hash, Hasher}; /// A node of the [`QueryGraph`]. /// diff --git a/milli/src/search/new/query_term/ntypo_subset.rs b/milli/src/search/new/query_term/ntypo_subset.rs index ad25d73c7..bbc2236b0 100644 --- a/milli/src/search/new/query_term/ntypo_subset.rs +++ b/milli/src/search/new/query_term/ntypo_subset.rs @@ -1,8 +1,7 @@ use std::collections::BTreeSet; -use crate::search::new::interner::Interned; - use super::Phrase; +use crate::search::new::interner::Interned; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum NTypoTermSubset { diff --git a/milli/src/search/new/query_term/phrase.rs b/milli/src/search/new/query_term/phrase.rs index 033c5cf12..1d0b24196 100644 --- a/milli/src/search/new/query_term/phrase.rs +++ b/milli/src/search/new/query_term/phrase.rs @@ -1,6 +1,7 @@ use itertools::Itertools; -use crate::{search::new::interner::Interned, SearchContext}; +use crate::search::new::interner::Interned; +use crate::SearchContext; /// A phrase in the user's search query, consisting of several words /// that must appear side-by-side in the search results. 
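Every import reshuffle in this `Cargo fmt` patch follows the same grouping: standard library first, then external crates, then `crate`/`super` paths, with each group separated by a blank line. This is consistent with rustfmt's `group_imports = "StdExternalCrate"` behaviour, which is an assumption about the project's rustfmt configuration rather than something stated in the patch. In sketch form:

use std::collections::BTreeMap; // 1. standard library

use fxhash::FxHashMap; // 2. external crates

use crate::Result; // 3. crate-local paths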
diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index c92eac526..015cd9845 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,10 +1,11 @@ +use std::collections::HashSet; + use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, MappedInterner}; use crate::search::new::query_graph::{QueryNode, QueryNodeData}; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, SearchContext}; use crate::Result; -use std::collections::HashSet; impl RankingRuleGraph { /// Build the ranking rule graph from the given query graph diff --git a/milli/src/search/new/tests/attribute_fid.rs b/milli/src/search/new/tests/attribute_fid.rs index d71c57f2c..177dc393a 100644 --- a/milli/src/search/new/tests/attribute_fid.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -1,4 +1,5 @@ -use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::index::tests::TempIndex; +use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); diff --git a/milli/src/search/new/tests/attribute_position.rs b/milli/src/search/new/tests/attribute_position.rs index 08b38684b..5e16cd023 100644 --- a/milli/src/search/new/tests/attribute_position.rs +++ b/milli/src/search/new/tests/attribute_position.rs @@ -1,6 +1,5 @@ -use crate::{ - db_snap, index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy, -}; +use crate::index::tests::TempIndex; +use crate::{db_snap, Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); diff --git a/milli/src/search/new/tests/distinct.rs b/milli/src/search/new/tests/distinct.rs index 74e0cdca0..2c147d514 100644 --- a/milli/src/search/new/tests/distinct.rs +++ b/milli/src/search/new/tests/distinct.rs @@ -17,12 +17,9 @@ use big_s::S; use heed::RoTxn; use maplit::hashset; -use crate::{ - index::tests::TempIndex, AscDesc, Criterion, Index, Member, Search, SearchResult, - TermsMatchingStrategy, -}; - use super::collect_field_values; +use crate::index::tests::TempIndex; +use crate::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); diff --git a/milli/src/search/new/tests/exactness.rs b/milli/src/search/new/tests/exactness.rs index 7543959d3..b45c0529c 100644 --- a/milli/src/search/new/tests/exactness.rs +++ b/milli/src/search/new/tests/exactness.rs @@ -19,10 +19,9 @@ Then these rules will only work with 2. 
the full query term otherwise */ -use crate::{ - index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, - SearchResult, TermsMatchingStrategy, -}; +use crate::index::tests::TempIndex; +use crate::search::new::tests::collect_field_values; +use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index_simple_ordered() -> TempIndex { let index = TempIndex::new(); diff --git a/milli/src/search/new/tests/integration.rs b/milli/src/search/new/tests/integration.rs index 153a80343..3abb1878f 100644 --- a/milli/src/search/new/tests/integration.rs +++ b/milli/src/search/new/tests/integration.rs @@ -4,12 +4,9 @@ use big_s::S; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use crate::{ - db_snap, - documents::{DocumentsBatchBuilder, DocumentsBatchReader}, - update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}, - Criterion, Index, Object, -}; +use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; +use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; +use crate::{db_snap, Criterion, Index, Object}; pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson"); pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { diff --git a/milli/src/search/new/tests/language.rs b/milli/src/search/new/tests/language.rs index e16544fdb..0724b32a4 100644 --- a/milli/src/search/new/tests/language.rs +++ b/milli/src/search/new/tests/language.rs @@ -1,4 +1,5 @@ -use crate::{index::tests::TempIndex, Search, SearchResult}; +use crate::index::tests::TempIndex; +use crate::{Search, SearchResult}; #[test] fn test_kanji_language_detection() { diff --git a/milli/src/search/new/tests/ngram_split_words.rs b/milli/src/search/new/tests/ngram_split_words.rs index b78bbe763..2a0365bac 100644 --- a/milli/src/search/new/tests/ngram_split_words.rs +++ b/milli/src/search/new/tests/ngram_split_words.rs @@ -16,10 +16,9 @@ This module tests the following properties: 13. Ngrams cannot be formed by combining a phrase and a word or two phrases */ -use crate::{ - index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, - SearchResult, TermsMatchingStrategy, -}; +use crate::index::tests::TempIndex; +use crate::search::new::tests::collect_field_values; +use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index 880f933f0..401508866 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -17,10 +17,9 @@ they store fewer sprximities than the regular word sprximity DB. use std::collections::HashMap; -use crate::{ - index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, - SearchResult, TermsMatchingStrategy, -}; +use crate::index::tests::TempIndex; +use crate::search::new::tests::collect_field_values; +use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_simple_index() -> TempIndex { let index = TempIndex::new(); diff --git a/milli/src/search/new/tests/proximity_typo.rs b/milli/src/search/new/tests/proximity_typo.rs index 9f9601e3f..ab98f99c0 100644 --- a/milli/src/search/new/tests/proximity_typo.rs +++ b/milli/src/search/new/tests/proximity_typo.rs @@ -7,10 +7,9 @@ only contains the word pairs that it used to compute its bucket. TODO: This is not currently implemented. 
*/ -use crate::{ - index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, - SearchResult, TermsMatchingStrategy, -}; +use crate::index::tests::TempIndex; +use crate::search::new::tests::collect_field_values; +use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs index d2201f55b..52acc646c 100644 --- a/milli/src/search/new/tests/sort.rs +++ b/milli/src/search/new/tests/sort.rs @@ -14,10 +14,9 @@ This module tests the `sort` ranking rule: use big_s::S; use maplit::hashset; -use crate::{ - index::tests::TempIndex, search::new::tests::collect_field_values, AscDesc, Criterion, Member, - Search, SearchResult, TermsMatchingStrategy, -}; +use crate::index::tests::TempIndex; +use crate::search::new::tests::collect_field_values; +use crate::{AscDesc, Criterion, Member, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); diff --git a/milli/src/search/new/tests/stop_words.rs b/milli/src/search/new/tests/stop_words.rs index 9dd7f2cb0..92168f6d6 100644 --- a/milli/src/search/new/tests/stop_words.rs +++ b/milli/src/search/new/tests/stop_words.rs @@ -9,9 +9,11 @@ This module tests the following properties about stop words: - Phrases consisting only of stop words are ignored */ -use std::{collections::BTreeSet, iter::FromIterator}; +use std::collections::BTreeSet; +use std::iter::FromIterator; -use crate::{db_snap, index::tests::TempIndex, Search, SearchResult, TermsMatchingStrategy}; +use crate::index::tests::TempIndex; +use crate::{db_snap, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); diff --git a/milli/src/search/new/tests/typo_proximity.rs b/milli/src/search/new/tests/typo_proximity.rs index 220fc69e1..103cc4717 100644 --- a/milli/src/search/new/tests/typo_proximity.rs +++ b/milli/src/search/new/tests/typo_proximity.rs @@ -15,10 +15,9 @@ The proximity ranking rule is not allowed to look for the proximity between `bea because the typo ranking rule before it only used the derivation `beautiful`. */ -use crate::{ - index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, - SearchResult, TermsMatchingStrategy, -}; +use crate::index::tests::TempIndex; +use crate::search::new::tests::collect_field_values; +use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); diff --git a/milli/src/search/new/tests/words_tms.rs b/milli/src/search/new/tests/words_tms.rs index 74748ea5a..826f9c47d 100644 --- a/milli/src/search/new/tests/words_tms.rs +++ b/milli/src/search/new/tests/words_tms.rs @@ -12,10 +12,9 @@ account by the proximity ranking rule. 7. 
The search is capable of returning no results if no documents match the query */ -use crate::{ - index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, - SearchResult, TermsMatchingStrategy, -}; +use crate::index::tests::TempIndex; +use crate::search::new::tests::collect_field_values; +use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); From d3e5b10e23b715faaf431d8731c55747abf679f8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 May 2023 14:11:20 +0200 Subject: [PATCH 232/234] fix nb of dbs --- milli/src/index.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 777d1007e..bc027b1fe 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -166,7 +166,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(21); + options.max_dbs(23); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -1461,11 +1461,11 @@ pub(crate) mod tests { db_snap!(index, field_distribution); db_snap!(index, field_distribution, - @" - age 1 - id 2 - name 2 - " + @r###" + age 1 + id 2 + name 2 + "### ); // snapshot_index!(&index, "1", include: "^field_distribution$"); @@ -1482,10 +1482,10 @@ pub(crate) mod tests { db_snap!(index, field_distribution, @r###" - age 1 - id 2 - name 2 - "### + age 1 + id 2 + name 2 + "### ); // then we update a document by removing one field and another by adding one field @@ -1498,10 +1498,10 @@ pub(crate) mod tests { db_snap!(index, field_distribution, @r###" - has_dog 1 - id 2 - name 2 - "### + has_dog 1 + id 2 + name 2 + "### ); } From 3a408e8287d47790b4af9f8651ecbc4e383f7746 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 May 2023 14:44:48 +0200 Subject: [PATCH 233/234] Increase map size for tests following charabia camelCase tokenization --- milli/src/index.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index bc027b1fe..f02719f92 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1329,10 +1329,10 @@ pub(crate) mod tests { let index_documents_config = IndexDocumentsConfig::default(); Self { inner, indexer_config, index_documents_config, _tempdir } } - /// Creates a temporary index, with a default `4096 * 1000` size. This should be enough for + /// Creates a temporary index, with a default `4096 * 2000` size. This should be enough for /// most tests. 
pub fn new() -> Self { - Self::new_with_map_size(4096 * 1000) + Self::new_with_map_size(4096 * 2000) } pub fn add_documents_using_wtxn<'t, R>( &'t self, From f8f190cd403b496cd1906d895db71e47de19ad44 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 May 2023 14:45:09 +0200 Subject: [PATCH 234/234] Update exactness tests following charabia camelCase tokenization --- milli/src/search/new/tests/exactness.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/milli/src/search/new/tests/exactness.rs b/milli/src/search/new/tests/exactness.rs index b45c0529c..c5c963ede 100644 --- a/milli/src/search/new/tests/exactness.rs +++ b/milli/src/search/new/tests/exactness.rs @@ -412,7 +412,7 @@ fn create_index_with_typo_and_prefix() -> TempIndex { .add_documents(documents!([ { "id": 0, - "text": "exPraordinarily quick brown fox", + "text": "expraordinarily quick brown fox", }, { "id": 1, @@ -424,11 +424,11 @@ fn create_index_with_typo_and_prefix() -> TempIndex { }, { "id": 3, - "text": "exPraordinarily quack brown fox", + "text": "expraordinarily quack brown fox", }, { "id": 4, - "text": "exPraordinaPily quick brown fox", + "text": "expraordinapily quick brown fox", } ])) .unwrap(); @@ -812,9 +812,9 @@ fn test_exactness_followed_by_typo_prefer_no_typo_prefix() { [ "\"extra quick brown fox\"", "\"extraordinarily quick brown fox\"", - "\"exPraordinarily quick brown fox\"", - "\"exPraordinaPily quick brown fox\"", - "\"exPraordinarily quack brown fox\"", + "\"expraordinarily quick brown fox\"", + "\"expraordinapily quick brown fox\"", + "\"expraordinarily quack brown fox\"", ] "###); } @@ -841,9 +841,9 @@ fn test_typo_followed_by_exactness() { insta::assert_debug_snapshot!(texts, @r###" [ "\"extraordinarily quick brown fox\"", - "\"exPraordinarily quick brown fox\"", - "\"exPraordinaPily quick brown fox\"", - "\"exPraordinarily quack brown fox\"", + "\"expraordinarily quick brown fox\"", + "\"expraordinapily quick brown fox\"", + "\"expraordinarily quack brown fox\"", ] "###); }
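For context on the last two patches: newer charabia versions segment camelCase words, so a fixture like `exPraordinarily` no longer tokenizes as a single word. That produces more words to index (hence the larger default map size) and breaks the single-typo distance to `extraordinarily` that these exactness tests rely on, hence the lowercased fixtures. A rough way to observe this, reusing the tokenizer setup already shown in the tests above (the exact token output is an assumption about charabia's camelCase segmenter, not taken from its documentation):

use charabia::TokenizerBuilder;

fn main() {
    let tokenizer = TokenizerBuilder::new().build();
    // presumably yields something like ["ex", "praordinarily", "quick"],
    // while the lowercased "expraordinarily quick" keeps the first word whole
    for token in tokenizer.tokenize("exPraordinarily quick") {
        println!("{:?}", token.lemma());
    }
}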