From 1db152046e374e0ee532872ccc294cb376cad3d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 2 Mar 2023 21:27:57 +0100 Subject: [PATCH] WIP on split words and synonyms support --- milli/src/search/new/mod.rs | 37 ++--- milli/src/search/new/query_graph.rs | 90 ++---------- milli/src/search/new/query_term.rs | 77 ++++++++-- milli/src/search/new/resolve_query_graph.rs | 152 ++++++++++++++++---- milli/src/search/new/words.rs | 19 +-- 5 files changed, 233 insertions(+), 142 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 7b82fc6e9..e09fe2300 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -31,22 +31,27 @@ pub fn make_query_graph<'transaction>( query: &str, ) -> Result { assert!(!query.is_empty()); - let fst = index.words_fst(txn).unwrap(); - let query = LocatedQueryTerm::from_query(query.tokenize(), None, |word, is_prefix| { - word_derivations( - index, - txn, - word, - if word.len() < 4 { - 0 - } else if word.len() < 100 { - 1 - } else { - 2 - }, - is_prefix, - &fst, - ) + let authorize_typos = index.authorize_typos(txn)?; + let min_len_one_typo = index.min_word_len_one_typo(txn)?; + let min_len_two_typos = index.min_word_len_two_typos(txn)?; + + let exact_words = index.exact_words(txn)?; + let fst = index.words_fst(txn)?; + + // TODO: get rid of this closure + // also, ngrams can have one typo? + let query = LocatedQueryTerm::from_query(query.tokenize(), None, move |word, is_prefix| { + let typos = if !authorize_typos + || word.len() < min_len_one_typo as usize + || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) + { + 0 + } else if word.len() < min_len_two_typos as usize { + 1 + } else { + 2 + }; + word_derivations(index, txn, word, typos, is_prefix, &fst) }) .unwrap(); let graph = QueryGraph::from_query(index, txn, db_cache, query)?; diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 422896068..8178f8ded 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -7,7 +7,7 @@ use super::db_cache::DatabaseCache; use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::{Index, Result}; -#[derive(Clone)] +#[derive(Debug, Clone)] pub enum QueryNode { Term(LocatedQueryTerm), Deleted, @@ -31,7 +31,7 @@ pub struct QueryGraph { } fn _assert_sizes() { - let _: [u8; 112] = [0; std::mem::size_of::()]; + let _: [u8; 184] = [0; std::mem::size_of::()]; let _: [u8; 48] = [0; std::mem::size_of::()]; } @@ -116,6 +116,8 @@ impl QueryGraph { one_typo: vec![], two_typos: vec![], use_prefix_db: false, + synonyms: vec![], // TODO: ngram synonyms + split_words: None, // TODO: maybe ngram split words? }, }, positions: ngram2_pos, @@ -141,6 +143,8 @@ impl QueryGraph { one_typo: vec![], two_typos: vec![], use_prefix_db: false, + synonyms: vec![], // TODO: ngram synonyms + split_words: None, // TODO: maybe ngram split words? }, }, positions: ngram3_pos, @@ -188,19 +192,20 @@ impl QueryGraph { Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }; } } - pub fn remove_words_at_position(&mut self, position: i8) { + pub fn remove_words_at_position(&mut self, position: i8) -> bool { let mut nodes_to_remove_keeping_edges = vec![]; for (node_idx, node) in self.nodes.iter().enumerate() { let node_idx = node_idx as u32; let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue }; if positions.start() == &position { - nodes_to_remove_keeping_edges.push(node_idx) + nodes_to_remove_keeping_edges.push(node_idx); } } self.remove_nodes_keep_edges(&nodes_to_remove_keeping_edges); self.simplify(); + !nodes_to_remove_keeping_edges.is_empty() } fn simplify(&mut self) { @@ -223,80 +228,3 @@ impl QueryGraph { } } } -impl Debug for QueryNode { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - QueryNode::Term(term @ LocatedQueryTerm { value, positions: _ }) => match value { - QueryTerm::Word { - derivations: - WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db }, - } => { - if term.is_empty() { - write!(f, "\"{original} (∅)\"") - } else { - let derivations = std::iter::once(original.clone()) - .chain(zero_typo.iter().map(|s| format!("T0 .. {s}"))) - .chain(one_typo.iter().map(|s| format!("T1 .. {s}"))) - .chain(two_typos.iter().map(|s| format!("T2 .. {s}"))) - .collect::>() - .join(" | "); - - write!(f, "\"{derivations}")?; - if *use_prefix_db { - write!(f, " | +prefix_db")?; - } - write!(f, " | pos:{}..={}", term.positions.start(), term.positions.end())?; - write!(f, "\"")?; - /* - "beautiful" [label = " beautiful | beauiful | beautifol"] - */ - Ok(()) - } - } - QueryTerm::Phrase(ws) => { - let joined = - ws.iter().filter_map(|x| x.clone()).collect::>().join(" "); - let in_quotes = format!("\"{joined}\""); - let escaped = in_quotes.escape_default().collect::(); - write!(f, "\"{escaped}\"") - } - }, - QueryNode::Start => write!(f, "\"START\""), - QueryNode::End => write!(f, "\"END\""), - QueryNode::Deleted => write!(f, "\"_deleted_\""), - } - } -} - -impl QueryGraph { - pub fn graphviz(&self) -> String { - let mut desc = String::new(); - desc.push_str( - r#" -digraph G { -rankdir = LR; -node [shape = "record"] -"#, - ); - - for node in 0..self.nodes.len() { - if matches!(self.nodes[node], QueryNode::Deleted) { - continue; - } - desc.push_str(&format!("{node} [label = {:?}]", &self.nodes[node],)); - if node == self.root_node as usize { - desc.push_str("[color = blue]"); - } else if node == self.end_node as usize { - desc.push_str("[color = red]"); - } - desc.push_str(";\n"); - - for edge in self.edges[node].successors.iter() { - desc.push_str(&format!("{node} -> {edge};\n")); - } - } - - desc.push('}'); - desc - } -} diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 537857bf2..9ea72aa3a 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -10,14 +10,28 @@ use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use heed::types::DecodeIgnore; use heed::RoTxn; +use itertools::Itertools; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; -use crate::{Index, Result}; +use crate::{CboRoaringBitmapLenCodec, Index, Result}; + +#[derive(Debug, Default, Clone)] +pub struct Phrase { + pub words: Vec>, +} +impl Phrase { + pub fn description(&self) -> String { + self.words.iter().flatten().join(" ") + } +} #[derive(Debug, Clone)] pub struct WordDerivations { pub original: String, + // TODO: pub prefix_of: Vec, + pub synonyms: Vec, + pub split_words: Option<(String, String)>, pub zero_typo: Vec, pub one_typo: Vec, pub two_typos: Vec, @@ -114,19 +128,63 @@ pub fn word_derivations( } } } + let split_words = split_best_frequency(index, txn, word)?; - Ok(WordDerivations { original: word.to_owned(), zero_typo, one_typo, two_typos, use_prefix_db }) + let synonyms = index.synonyms(txn)?; + let synonyms = synonyms + .get(&vec![word.to_owned()]) + .cloned() + .unwrap_or_default() + .into_iter() + .map(|words| Phrase { words: words.into_iter().map(Some).collect() }) + .collect(); + + Ok(WordDerivations { + original: word.to_owned(), + synonyms, + split_words, + zero_typo, + one_typo, + two_typos, + use_prefix_db, + }) +} + +fn split_best_frequency( + index: &Index, + txn: &RoTxn, + original: &str, +) -> Result> { + let chars = original.char_indices().skip(1); + let mut best = None; + + for (i, _) in chars { + let (left, right) = original.split_at(i); + + let key = (1, left, right); + let frequency = index + .word_pair_proximity_docids + .remap_data_type::() + .get(txn, &key)? + .unwrap_or(0); + + if frequency != 0 && best.map_or(true, |(old, _, _)| frequency > old) { + best = Some((frequency, left, right)); + } + } + + Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned()))) } #[derive(Debug, Clone)] pub enum QueryTerm { - Phrase(Vec>), + Phrase { phrase: Phrase }, Word { derivations: WordDerivations }, } impl QueryTerm { pub fn original_single_word(&self) -> Option<&str> { match self { - QueryTerm::Phrase(_) => None, + QueryTerm::Phrase { phrase: _ } => None, QueryTerm::Word { derivations } => { if derivations.is_empty() { None @@ -140,14 +198,14 @@ impl QueryTerm { #[derive(Debug, Clone)] pub struct LocatedQueryTerm { - pub value: QueryTerm, // value should be able to contain the word derivations as well + pub value: QueryTerm, pub positions: RangeInclusive, } impl LocatedQueryTerm { pub fn is_empty(&self) -> bool { match &self.value { - QueryTerm::Phrase(_) => false, + QueryTerm::Phrase { phrase: _ } => false, QueryTerm::Word { derivations, .. } => derivations.is_empty(), } } @@ -156,6 +214,7 @@ impl LocatedQueryTerm { pub fn from_query( query: NormalizedTokenIter>, words_limit: Option, + // TODO:` use index + txn + ? instead of closure derivations: impl Fn(&str, bool) -> Result, ) -> Result> { let mut primitive_query = Vec::new(); @@ -232,7 +291,9 @@ impl LocatedQueryTerm { && (quote_count > 0 || separator_kind == SeparatorKind::Hard) { let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase(mem::take(&mut phrase)), + value: QueryTerm::Phrase { + phrase: Phrase { words: mem::take(&mut phrase) }, + }, positions: phrase_start..=phrase_end, }; primitive_query.push(located_query_term); @@ -245,7 +306,7 @@ impl LocatedQueryTerm { // If a quote is never closed, we consider all of the end of the query as a phrase. if !phrase.is_empty() { let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase(mem::take(&mut phrase)), + value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } }, positions: phrase_start..=phrase_end, }; primitive_query.push(located_query_term); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 4da853e7c..93ebcf989 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -5,9 +5,10 @@ use heed::{BytesDecode, RoTxn}; use roaring::{MultiOps, RoaringBitmap}; use super::db_cache::DatabaseCache; -use super::query_term::{QueryTerm, WordDerivations}; -use super::QueryGraph; -use crate::{Index, Result, RoaringBitmapCodec}; +use super::query_term::{Phrase, QueryTerm, WordDerivations}; +use super::{QueryGraph, QueryNode}; + +use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. #[derive(Default)] @@ -27,33 +28,49 @@ impl NodeDocIdsCache { return Ok(&self.cache[&node_idx]); }; let docids = match term { - QueryTerm::Phrase(_) => { - todo!("resolve phrase") - } + QueryTerm::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase)?, QueryTerm::Word { derivations: - WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db }, + WordDerivations { + original, + zero_typo, + one_typo, + two_typos, + use_prefix_db, + synonyms, + split_words, + }, } => { - let derivations_docids = { - let mut or_docids = vec![]; - for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) { - if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? { - or_docids.push(word_docids); - } + let mut or_docids = vec![]; + for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) { + if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? { + or_docids.push(word_docids); } - if *use_prefix_db { - if let Some(prefix_docids) = - db_cache.get_prefix_docids(index, txn, original.as_str())? - { - or_docids.push(prefix_docids); - } + } + if *use_prefix_db { + if let Some(prefix_docids) = + db_cache.get_prefix_docids(index, txn, original.as_str())? + { + or_docids.push(prefix_docids); } - or_docids - }; - let derivations_iter = derivations_docids + } + let mut docids = or_docids .into_iter() - .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()); - MultiOps::union(derivations_iter) + .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()) + .collect::>(); + for synonym in synonyms { + // TODO: cache resolve_phrase? + docids.push(resolve_phrase(index, txn, db_cache, synonym)?); + } + if let Some((left, right)) = split_words { + if let Some(split_word_docids) = + db_cache.get_word_pair_proximity_docids(index, txn, left, right, 1)? + { + docids.push(CboRoaringBitmapCodec::deserialize_from(split_word_docids)?); + } + } + + MultiOps::union(docids) } }; let _ = self.cache.insert(node_idx, docids); @@ -90,19 +107,19 @@ pub fn resolve_query_graph<'transaction>( let predecessors_docids = MultiOps::union(predecessors_iter); let n = &q.nodes[node as usize]; - // println!("resolving {node} {n:?}, predecessors: {predecessors:?}, their docids: {predecessors_docids:?}"); + let node_docids = match n { - super::QueryNode::Term(located_term) => { + QueryNode::Term(located_term) => { let term = &located_term.value; let derivations_docids = node_docids_cache.get_docids(index, txn, db_cache, term, node)?; predecessors_docids & derivations_docids } - super::QueryNode::Deleted => { + QueryNode::Deleted => { panic!() } - super::QueryNode::Start => universe.clone(), - super::QueryNode::End => { + QueryNode::Start => universe.clone(), + QueryNode::End => { return Ok(predecessors_docids); } }; @@ -125,3 +142,80 @@ pub fn resolve_query_graph<'transaction>( panic!() } + +pub fn resolve_phrase<'transaction>( + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + phrase: &Phrase, +) -> Result { + let Phrase { words } = phrase; + let mut candidates = RoaringBitmap::new(); + let mut first_iter = true; + let winsize = words.len().min(3); + + if words.is_empty() { + return Ok(candidates); + } + + for win in words.windows(winsize) { + // Get all the documents with the matching distance for each word pairs. + let mut bitmaps = Vec::with_capacity(winsize.pow(2)); + for (offset, s1) in win + .iter() + .enumerate() + .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) + { + for (dist, s2) in win + .iter() + .skip(offset + 1) + .enumerate() + .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) + { + if dist == 0 { + match db_cache.get_word_pair_proximity_docids(index, txn, s1, s2, 1)? { + Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?), + // If there are no documents for this pair, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } + } else { + let mut bitmap = RoaringBitmap::new(); + for dist in 0..=dist { + if let Some(m) = db_cache.get_word_pair_proximity_docids( + index, + txn, + s1, + s2, + dist as u8 + 1, + )? { + bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?; + } + } + if bitmap.is_empty() { + return Ok(bitmap); + } else { + bitmaps.push(bitmap); + } + } + } + } + + // We sort the bitmaps so that we perform the small intersections first, which is faster. + bitmaps.sort_unstable_by_key(|a| a.len()); + + for bitmap in bitmaps { + if first_iter { + candidates = bitmap; + first_iter = false; + } else { + candidates &= bitmap; + } + // There will be no match, return early + if candidates.is_empty() { + break; + } + } + } + Ok(candidates) +} diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index e4513eea0..da4599ec5 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -99,14 +99,17 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { )?; let child_query_graph = query_graph.clone(); - // TODO: Check whether a position exists in the graph before removing it and - // returning the next bucket. - // while graph.does_not_contain(positions_to_remove.last()) { positions_to_remove.pop() } - if self.positions_to_remove.is_empty() { - self.exhausted = true; - } else { - let position_to_remove = self.positions_to_remove.pop().unwrap(); - query_graph.remove_words_at_position(position_to_remove); + loop { + if self.positions_to_remove.is_empty() { + self.exhausted = true; + break; + } else { + let position_to_remove = self.positions_to_remove.pop().unwrap(); + let did_delete_any_node = query_graph.remove_words_at_position(position_to_remove); + if did_delete_any_node { + break; + } + } } Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket }))