From 73286dc8bf915d368eae75fe56082246ae9505dc Mon Sep 17 00:00:00 2001 From: many Date: Wed, 10 Feb 2021 11:35:17 +0100 Subject: [PATCH 01/50] Introduce the query tree data structure --- milli/src/search/query_tree.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 00905db2e..9b253350e 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,5 +1,3 @@ -#![allow(unused)] - use std::borrow::Cow; use std::collections::BTreeMap; use std::{fmt, cmp, mem}; From f0ddea821cd214254ce201d29410c25b61940f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Feb 2021 15:14:00 +0100 Subject: [PATCH 02/50] Introduce the Typo criterion --- milli/src/search/criteria/mod.rs | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 milli/src/search/criteria/mod.rs diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs new file mode 100644 index 000000000..4cc4512d7 --- /dev/null +++ b/milli/src/search/criteria/mod.rs @@ -0,0 +1,34 @@ +use crate::Index; + +use roaring::RoaringBitmap; + +use super::query_tree::Operation; + +pub mod typo; + +pub trait Criterion { + fn next(&mut self) -> anyhow::Result, RoaringBitmap)>>; +} + +/// Either a set of candidates that defines the candidates +/// that are allowed to be returned, +/// or the candidates that must never be returned. +enum Candidates { + Allowed(RoaringBitmap), + Forbidden(RoaringBitmap) +} + +impl Candidates { + fn into_inner(self) -> RoaringBitmap { + match self { + Self::Allowed(inner) => inner, + Self::Forbidden(inner) => inner, + } + } +} + +impl Default for Candidates { + fn default() -> Self { + Self::Forbidden(RoaringBitmap::new()) + } +} From ad20d72a39641681a1b591bef0c8d8ebc7332a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Feb 2021 15:20:07 +0100 Subject: [PATCH 03/50] Introduce the Typo criterion --- milli/src/search/criteria/typo.rs | 253 ++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 milli/src/search/criteria/typo.rs diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs new file mode 100644 index 000000000..196da4a49 --- /dev/null +++ b/milli/src/search/criteria/typo.rs @@ -0,0 +1,253 @@ +use std::{borrow::Cow, mem::take}; + +use anyhow::bail; +use roaring::RoaringBitmap; + +use crate::Index; +use crate::search::query_tree::{Operation, Query, QueryKind}; +use crate::search::word_typos; +use super::{Candidates, Criterion}; + +// FIXME we must stop when the number of typos is equal to +// the maximum number of typos for this query tree. +const MAX_NUM_TYPOS: u8 = 8; + +pub struct Typo<'t> { + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + words_fst: fst::Set>, + query_tree: Option, + number_typos: u8, + candidates: Candidates, + parent: Option>, +} + +impl<'t> Typo<'t> { + pub fn initial( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + query_tree: Option, + candidates: Option, + ) -> anyhow::Result where Self: Sized + { + Ok(Typo { + index, + rtxn, + words_fst: index.words_fst(rtxn)?, + query_tree, + number_typos: 0, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + parent: None, + }) + } + + pub fn new( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box, + ) -> anyhow::Result where Self: Sized + { + Ok(Typo { + index, + rtxn, + words_fst: index.words_fst(rtxn)?, + query_tree: None, + number_typos: 0, + candidates: Candidates::default(), + parent: Some(parent), + }) + } +} + +impl<'t> Criterion for Typo<'t> { + fn next(&mut self) -> anyhow::Result, RoaringBitmap)>> { + use Candidates::{Allowed, Forbidden}; + while self.number_typos < MAX_NUM_TYPOS { + match (&mut self.query_tree, &mut self.candidates) { + (_, Allowed(candidates)) if candidates.is_empty() => { + self.query_tree = None; + self.candidates = Candidates::default(); + }, + (Some(query_tree), Allowed(candidates)) => { + let new_query_tree = alterate_query_tree(&self.words_fst, query_tree.clone(), self.number_typos)?; + let mut new_candidates = resolve_candidates(&self.index, &self.rtxn, &new_query_tree, self.number_typos)?; + new_candidates.intersect_with(&candidates); + candidates.difference_with(&new_candidates); + self.number_typos += 1; + + return Ok(Some((Some(new_query_tree), new_candidates))); + }, + (Some(query_tree), Forbidden(candidates)) => { + let new_query_tree = alterate_query_tree(&self.words_fst, query_tree.clone(), self.number_typos)?; + let mut new_candidates = resolve_candidates(&self.index, &self.rtxn, &new_query_tree, self.number_typos)?; + new_candidates.difference_with(&candidates); + candidates.union_with(&new_candidates); + self.number_typos += 1; + + return Ok(Some((Some(new_query_tree), new_candidates))); + }, + (None, Allowed(_)) => { + return Ok(Some((None, take(&mut self.candidates).into_inner()))); + }, + (None, Forbidden(_)) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some((query_tree, candidates)) => { + self.query_tree = query_tree; + self.candidates = Candidates::Allowed(candidates); + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } + + Ok(None) + } +} + +/// Modify the query tree by replacing every tolerant query by an Or operation +/// containing all of the corresponding exact words in the words FST. Each tolerant +/// query will only be replaced by exact query with up to `number_typos` maximum typos. +fn alterate_query_tree( + words_fst: &fst::Set>, + mut query_tree: Operation, + number_typos: u8, +) -> anyhow::Result +{ + fn recurse( + words_fst: &fst::Set>, + operation: &mut Operation, + number_typos: u8, + ) -> anyhow::Result<()> + { + use Operation::{And, Consecutive, Or}; + + match operation { + And(ops) | Consecutive(ops) | Or(_, ops) => { + ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos)) + }, + Operation::Query(q) => { + if let QueryKind::Tolerant { typo, word } = &q.kind { + let typo = *typo.min(&number_typos); + let words = word_typos(word, q.prefix, typo, words_fst)?; + + let queries = words.into_iter().map(|(word, _typo)| { + Operation::Query(Query { + prefix: false, + kind: QueryKind::Exact { original_typo: typo, word }, + }) + }).collect(); + + *operation = Operation::or(false, queries); + } + + Ok(()) + }, + } + } + + recurse(words_fst, &mut query_tree, number_typos)?; + Ok(query_tree) +} + +fn resolve_candidates( + index: &Index, + rtxn: &heed::RoTxn, + query_tree: &Operation, + number_typos: u8, +) -> anyhow::Result +{ + // FIXME add a cache + // FIXME keep the cache between typos iterations + // cache: HashMap<(&Operation, u8), RoaringBitmap>, + + fn resolve_operation( + index: &Index, + rtxn: &heed::RoTxn, + query_tree: &Operation, + number_typos: u8, + ) -> anyhow::Result + { + use Operation::{And, Consecutive, Or, Query}; + + match query_tree { + And(ops) => { + mdfs(index, rtxn, ops, number_typos) + }, + Consecutive(ops) => { + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for slice in ops.windows(2) { + match (&slice[0], &slice[1]) { + (Operation::Query(left), Operation::Query(right)) => { + let key_pair = &(left.kind.word(), right.kind.word(), 1); + match index.word_pair_proximity_docids.get(rtxn, key_pair)? { + Some(pair_docids) => { + if first_loop { + candidates = pair_docids; + first_loop = false; + } else { + candidates.intersect_with(&pair_docids) + } + }, + None => return Ok(RoaringBitmap::new()), + } + + }, + _ => bail!("invalid consecutive query type"), + } + } + Ok(candidates) + }, + Or(_, ops) => { + let mut candidates = RoaringBitmap::new(); + for op in ops { + let docids = resolve_operation(index, rtxn, op, number_typos)?; + candidates.union_with(&docids); + } + Ok(candidates) + }, + Query(q) => if q.kind.typo() == number_typos { + let word = q.kind.word(); + Ok(index.word_docids.get(rtxn, word)?.unwrap_or_default()) + } else { + Ok(RoaringBitmap::new()) + }, + } + } + + /// FIXME Make this function generic and mutualize it between Typo and proximity criterion + fn mdfs( + index: &Index, + rtxn: &heed::RoTxn, + branches: &[Operation], + mana: u8, + ) -> anyhow::Result + { + match branches.split_first() { + Some((head, [])) => resolve_operation(index, rtxn, head, mana), + Some((head, tail)) => { + let mut candidates = RoaringBitmap::new(); + + for m in 0..=mana { + let mut head_candidates = resolve_operation(index, rtxn, head, m)?; + if !head_candidates.is_empty() { + let tail_candidates = mdfs(index, rtxn, tail, mana - m)?; + head_candidates.intersect_with(&tail_candidates); + candidates.union_with(&head_candidates); + } + } + + Ok(candidates) + }, + None => Ok(RoaringBitmap::new()), + } + } + + resolve_operation(index, rtxn, query_tree, number_typos) +} From f091f370d055a723b2011a27170d3572d9b2dd2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 10:29:28 +0100 Subject: [PATCH 04/50] Use the Typo criteria in the search module --- milli/src/search/mod.rs | 232 +++++++++++++++++++++++----------------- 1 file changed, 136 insertions(+), 96 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index e5672982e..267f3556a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -18,10 +18,13 @@ use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec} use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::mdfs::Mdfs; use crate::query_tokens::{query_tokens, QueryToken}; -use crate::{Index, FieldId, DocumentId, Criterion}; +use crate::search::criteria::Criterion; +use crate::search::criteria::typo::Typo; +use crate::{Index, FieldId, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetIter}; +use self::query_tree::QueryTreeBuilder; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -30,6 +33,7 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); mod facet; mod query_tree; +mod criteria; pub struct Search<'a> { query: Option, @@ -258,15 +262,22 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> anyhow::Result { - let limit = self.limit; - let fst = self.index.words_fst(self.rtxn)?; - - // Construct the DFAs related to the query words. - let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) { - Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?), - _otherwise => None, + // We create the query tree by spliting the query into tokens. + let before = Instant::now(); + let query_tree = match self.query.as_ref() { + Some(query) => { + let builder = QueryTreeBuilder::new(self.rtxn, self.index); + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + builder.build(false, true, tokens) + }, + None => None, }; + debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed()); + // We create the original candidates with the facet conditions results. let before = Instant::now(); let facet_candidates = match &self.facet_condition { @@ -276,100 +287,129 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); - let order_by_facet = { - let criteria = self.index.criteria(self.rtxn)?; - let result = criteria.into_iter().flat_map(|criterion| { - match criterion { - Criterion::Asc(fid) => Some((fid, true)), - Criterion::Desc(fid) => Some((fid, false)), - _ => None - } - }).next(); - match result { - Some((attr_name, is_ascending)) => { - let field_id_map = self.index.fields_ids_map(self.rtxn)?; - let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?; - let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?; - let ftype = *faceted_fields.get(&fid) - .with_context(|| format!("{:?} not found in the faceted fields.", attr_name)) - .expect("corrupted data: "); - Some((fid, ftype, is_ascending)) - }, - None => None, + // We aretesting the typo criteria but there will be more of them soon. + let mut criteria = Typo::initial(self.index, self.rtxn, query_tree, facet_candidates)?; + + let mut offset = self.offset; + let mut limit = self.limit; + let mut documents_ids = Vec::new(); + while let Some((_qt, docids)) = criteria.next()? { + + let mut len = docids.len() as usize; + let mut docids = docids.into_iter(); + + if offset != 0 { + docids.by_ref().skip(offset).for_each(drop); + offset = offset.saturating_sub(len.min(offset)); + len = len.saturating_sub(len.min(offset)); } - }; - let before = Instant::now(); - let (candidates, derived_words) = match (facet_candidates, derived_words) { - (Some(mut facet_candidates), Some(derived_words)) => { - let words_candidates = Self::compute_candidates(&derived_words); - facet_candidates.intersect_with(&words_candidates); - (facet_candidates, derived_words) - }, - (None, Some(derived_words)) => { - (Self::compute_candidates(&derived_words), derived_words) - }, - (Some(facet_candidates), None) => { - // If the query is not set or results in no DFAs but - // there is some facet conditions we return a placeholder. - let documents_ids = match order_by_facet { - Some((fid, ftype, is_ascending)) => { - self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)? - }, - None => facet_candidates.iter().take(limit).collect(), - }; - return Ok(SearchResult { - documents_ids, - candidates: facet_candidates, - ..Default::default() - }) - }, - (None, None) => { - // If the query is not set or results in no DFAs we return a placeholder. - let all_docids = self.index.documents_ids(self.rtxn)?; - let documents_ids = match order_by_facet { - Some((fid, ftype, is_ascending)) => { - self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)? - }, - None => all_docids.iter().take(limit).collect(), - }; - return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() }) - }, - }; - - debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed()); - - // The mana depth first search is a revised DFS that explore - // solutions in the order of their proximities. - let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone()); - let mut documents = Vec::new(); - - // We execute the Mdfs iterator until we find enough documents. - while documents.iter().map(RoaringBitmap::len).sum::() < limit as u64 { - match mdfs.next().transpose()? { - Some((proximity, answer)) => { - debug!("answer with a proximity of {}: {:?}", proximity, answer); - documents.push(answer); - }, - None => break, + if len != 0 { + documents_ids.extend(docids.take(limit)); + limit = limit.saturating_sub(len.min(limit)); } + + if limit == 0 { break } } - let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); - let documents_ids = match order_by_facet { - Some((fid, ftype, order)) => { - let mut ordered_documents = Vec::new(); - for documents_ids in documents { - let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?; - ordered_documents.push(docids); - if ordered_documents.iter().map(Vec::len).sum::() >= limit { break } - } - ordered_documents.into_iter().flatten().take(limit).collect() - }, - None => documents.into_iter().flatten().take(limit).collect(), - }; - + let found_words = HashSet::new(); + let candidates = RoaringBitmap::new(); Ok(SearchResult { found_words, candidates, documents_ids }) + + // let order_by_facet = { + // let criteria = self.index.criteria(self.rtxn)?; + // let result = criteria.into_iter().flat_map(|criterion| { + // match criterion { + // Criterion::Asc(fid) => Some((fid, true)), + // Criterion::Desc(fid) => Some((fid, false)), + // _ => None + // } + // }).next(); + // match result { + // Some((attr_name, is_ascending)) => { + // let field_id_map = self.index.fields_ids_map(self.rtxn)?; + // let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?; + // let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?; + // let ftype = *faceted_fields.get(&fid) + // .with_context(|| format!("{:?} not found in the faceted fields.", attr_name)) + // .expect("corrupted data: "); + // Some((fid, ftype, is_ascending)) + // }, + // None => None, + // } + // }; + + // let before = Instant::now(); + // let (candidates, derived_words) = match (facet_candidates, derived_words) { + // (Some(mut facet_candidates), Some(derived_words)) => { + // let words_candidates = Self::compute_candidates(&derived_words); + // facet_candidates.intersect_with(&words_candidates); + // (facet_candidates, derived_words) + // }, + // (None, Some(derived_words)) => { + // (Self::compute_candidates(&derived_words), derived_words) + // }, + // (Some(facet_candidates), None) => { + // // If the query is not set or results in no DFAs but + // // there is some facet conditions we return a placeholder. + // let documents_ids = match order_by_facet { + // Some((fid, ftype, is_ascending)) => { + // self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)? + // }, + // None => facet_candidates.iter().take(limit).collect(), + // }; + // return Ok(SearchResult { + // documents_ids, + // candidates: facet_candidates, + // ..Default::default() + // }) + // }, + // (None, None) => { + // // If the query is not set or results in no DFAs we return a placeholder. + // let all_docids = self.index.documents_ids(self.rtxn)?; + // let documents_ids = match order_by_facet { + // Some((fid, ftype, is_ascending)) => { + // self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)? + // }, + // None => all_docids.iter().take(limit).collect(), + // }; + // return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() }) + // }, + // }; + + // debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed()); + + // // The mana depth first search is a revised DFS that explore + // // solutions in the order of their proximities. + // let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone()); + // let mut documents = Vec::new(); + + // // We execute the Mdfs iterator until we find enough documents. + // while documents.iter().map(RoaringBitmap::len).sum::() < limit as u64 { + // match mdfs.next().transpose()? { + // Some((proximity, answer)) => { + // debug!("answer with a proximity of {}: {:?}", proximity, answer); + // documents.push(answer); + // }, + // None => break, + // } + // } + + // let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); + // let documents_ids = match order_by_facet { + // Some((fid, ftype, order)) => { + // let mut ordered_documents = Vec::new(); + // for documents_ids in documents { + // let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?; + // ordered_documents.push(docids); + // if ordered_documents.iter().map(Vec::len).sum::() >= limit { break } + // } + // ordered_documents.into_iter().flatten().take(limit).collect() + // }, + // None => documents.into_iter().flatten().take(limit).collect(), + // }; + + // Ok(SearchResult { found_words, candidates, documents_ids }) } } From 98e69e63d2ed67563d9b1cbe456db145039cadab Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 15:27:35 +0100 Subject: [PATCH 05/50] implement Context trait for criteria --- milli/src/search/criteria/mod.rs | 122 +++++++++++++++++++++++++++++- milli/src/search/criteria/typo.rs | 67 +++++++--------- milli/src/search/mod.rs | 3 +- 3 files changed, 152 insertions(+), 40 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 4cc4512d7..36569906e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,8 +1,11 @@ +use std::borrow::Cow; + use crate::Index; +use crate::search::word_typos; use roaring::RoaringBitmap; -use super::query_tree::Operation; +use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; @@ -32,3 +35,120 @@ impl Default for Candidates { Self::Forbidden(RoaringBitmap::new()) } } +pub trait Context { + fn query_docids(&self, query: &Query) -> anyhow::Result>; + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) ->anyhow::Result>; + fn words_fst<'t>(&self) -> &'t fst::Set>; +} +pub struct HeedContext<'t> { + rtxn: &'t heed::RoTxn<'t>, + index: &'t Index, + words_fst: fst::Set>, +} + +impl<'a> Context for HeedContext<'a> { + fn query_docids(&self, query: &Query) -> anyhow::Result> { + match (&query.kind, query.prefix) { + // TODO de-comment when ~ready + // (QueryKind::Exact { word, .. }, true) if in_prefix_cache(&word) => { + // Ok(self.index.prefix_docids.get(self.rtxn, &word)?) + // }, + (QueryKind::Exact { word, .. }, true) => { + let words = word_typos(&word, true, 0, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + (QueryKind::Exact { word, .. }, false) => { + Ok(self.index.word_docids.get(self.rtxn, &word)?) + }, + (QueryKind::Tolerant { typo, word }, prefix) => { + let words = word_typos(&word, prefix, *typo, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + } + } + + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result> { + // TODO add prefix cache for Tolerant-Exact-true and Exact-Exact-true + match (&left.kind, &right.kind, right.prefix) { + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) => { + let words = word_typos(&right, true, 0, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let key = (left.as_str(), word.as_str(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) => { + let l_words = word_typos(&left, false, *typo, &self.words_fst)?; + let r_words = word_typos(&right, true, 0, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (left, _typo) in l_words { + for (right, _typo) in r_words.iter() { + let key = (left.as_str(), right.as_str(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + } + Ok(Some(docids)) + }, + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, false) => { + let words = word_typos(&left, false, *typo, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let key = (word.as_str(), right.as_str(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }, prefix) => { + let words = word_typos(&right, prefix, *typo, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let key = (left.as_str(), word.as_str(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }, prefix) => { + let l_words = word_typos(&left, false, *l_typo, &self.words_fst)?; + let r_words = word_typos(&right, prefix, *r_typo, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (left, _typo) in l_words { + for (right, _typo) in r_words.iter() { + let key = (left.as_str(), right.as_str(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + } + Ok(Some(docids)) + }, + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, false) => { + let key = (left.as_str(), right.as_str(), distance); + Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?) + }, + } + } + + fn words_fst<'t>(&self) -> &'t fst::Set> { + &self.words_fst + } +} + +impl<'t> HeedContext<'t> { + pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result { + let words_fst = index.words_fst(rtxn)?; + + Ok(Self { + rtxn, + index, + words_fst, + }) + } +} diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 196da4a49..b3608c397 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -3,19 +3,16 @@ use std::{borrow::Cow, mem::take}; use anyhow::bail; use roaring::RoaringBitmap; -use crate::Index; use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::search::word_typos; -use super::{Candidates, Criterion}; +use super::{Candidates, Criterion, Context}; // FIXME we must stop when the number of typos is equal to // the maximum number of typos for this query tree. const MAX_NUM_TYPOS: u8 = 8; pub struct Typo<'t> { - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, - words_fst: fst::Set>, + ctx: &'t dyn Context, query_tree: Option, number_typos: u8, candidates: Candidates, @@ -24,16 +21,13 @@ pub struct Typo<'t> { impl<'t> Typo<'t> { pub fn initial( - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, + ctx: &'t dyn Context, query_tree: Option, candidates: Option, ) -> anyhow::Result where Self: Sized { Ok(Typo { - index, - rtxn, - words_fst: index.words_fst(rtxn)?, + ctx, query_tree, number_typos: 0, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), @@ -42,15 +36,12 @@ impl<'t> Typo<'t> { } pub fn new( - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, + ctx: &'t dyn Context, parent: Box, ) -> anyhow::Result where Self: Sized { Ok(Typo { - index, - rtxn, - words_fst: index.words_fst(rtxn)?, + ctx, query_tree: None, number_typos: 0, candidates: Candidates::default(), @@ -69,8 +60,10 @@ impl<'t> Criterion for Typo<'t> { self.candidates = Candidates::default(); }, (Some(query_tree), Allowed(candidates)) => { - let new_query_tree = alterate_query_tree(&self.words_fst, query_tree.clone(), self.number_typos)?; - let mut new_candidates = resolve_candidates(&self.index, &self.rtxn, &new_query_tree, self.number_typos)?; + // TODO if number_typos >= 2 the generated query_tree will allways be the same, + // generate a new one on each iteration is a waste of time. + let new_query_tree = alterate_query_tree(&self.ctx.words_fst(), query_tree.clone(), self.number_typos)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); self.number_typos += 1; @@ -78,8 +71,10 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some((Some(new_query_tree), new_candidates))); }, (Some(query_tree), Forbidden(candidates)) => { - let new_query_tree = alterate_query_tree(&self.words_fst, query_tree.clone(), self.number_typos)?; - let mut new_candidates = resolve_candidates(&self.index, &self.rtxn, &new_query_tree, self.number_typos)?; + // TODO if number_typos >= 2 the generated query_tree will allways be the same, + // generate a new one on each iteration is a waste of time. + let new_query_tree = alterate_query_tree(&self.ctx.words_fst(), query_tree.clone(), self.number_typos)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); self.number_typos += 1; @@ -132,6 +127,7 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos)) }, Operation::Query(q) => { + // TODO may be optimized when number_typos == 0 if let QueryKind::Tolerant { typo, word } = &q.kind { let typo = *typo.min(&number_typos); let words = word_typos(word, q.prefix, typo, words_fst)?; @@ -155,9 +151,8 @@ fn alterate_query_tree( Ok(query_tree) } -fn resolve_candidates( - index: &Index, - rtxn: &heed::RoTxn, +fn resolve_candidates<'t>( + ctx: &'t dyn Context, query_tree: &Operation, number_typos: u8, ) -> anyhow::Result @@ -166,9 +161,8 @@ fn resolve_candidates( // FIXME keep the cache between typos iterations // cache: HashMap<(&Operation, u8), RoaringBitmap>, - fn resolve_operation( - index: &Index, - rtxn: &heed::RoTxn, + fn resolve_operation<'t>( + ctx: &'t dyn Context, query_tree: &Operation, number_typos: u8, ) -> anyhow::Result @@ -177,7 +171,7 @@ fn resolve_candidates( match query_tree { And(ops) => { - mdfs(index, rtxn, ops, number_typos) + mdfs(ctx, ops, number_typos) }, Consecutive(ops) => { let mut candidates = RoaringBitmap::new(); @@ -185,8 +179,7 @@ fn resolve_candidates( for slice in ops.windows(2) { match (&slice[0], &slice[1]) { (Operation::Query(left), Operation::Query(right)) => { - let key_pair = &(left.kind.word(), right.kind.word(), 1); - match index.word_pair_proximity_docids.get(rtxn, key_pair)? { + match ctx.query_pair_proximity_docids(left, right, 1)? { Some(pair_docids) => { if first_loop { candidates = pair_docids; @@ -207,14 +200,13 @@ fn resolve_candidates( Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { - let docids = resolve_operation(index, rtxn, op, number_typos)?; + let docids = resolve_operation(ctx, op, number_typos)?; candidates.union_with(&docids); } Ok(candidates) }, Query(q) => if q.kind.typo() == number_typos { - let word = q.kind.word(); - Ok(index.word_docids.get(rtxn, word)?.unwrap_or_default()) + Ok(ctx.query_docids(q)?.unwrap_or_default()) } else { Ok(RoaringBitmap::new()) }, @@ -222,22 +214,21 @@ fn resolve_candidates( } /// FIXME Make this function generic and mutualize it between Typo and proximity criterion - fn mdfs( - index: &Index, - rtxn: &heed::RoTxn, + fn mdfs<'t>( + ctx: &'t dyn Context, branches: &[Operation], mana: u8, ) -> anyhow::Result { match branches.split_first() { - Some((head, [])) => resolve_operation(index, rtxn, head, mana), + Some((head, [])) => resolve_operation(ctx, head, mana), Some((head, tail)) => { let mut candidates = RoaringBitmap::new(); for m in 0..=mana { - let mut head_candidates = resolve_operation(index, rtxn, head, m)?; + let mut head_candidates = resolve_operation(ctx, head, m)?; if !head_candidates.is_empty() { - let tail_candidates = mdfs(index, rtxn, tail, mana - m)?; + let tail_candidates = mdfs(ctx, tail, mana - m)?; head_candidates.intersect_with(&tail_candidates); candidates.union_with(&head_candidates); } @@ -249,5 +240,5 @@ fn resolve_candidates( } } - resolve_operation(index, rtxn, query_tree, number_typos) + resolve_operation(ctx, query_tree, number_typos) } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 267f3556a..d061df2b9 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -288,7 +288,8 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); // We aretesting the typo criteria but there will be more of them soon. - let mut criteria = Typo::initial(self.index, self.rtxn, query_tree, facet_candidates)?; + let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; + let mut criteria = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; let mut offset = self.offset; let mut limit = self.limit; From 774a255f2e26272770b04c377d06780c375db611 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 15:49:44 +0100 Subject: [PATCH 06/50] use prefix cache in criteria --- milli/src/search/criteria/mod.rs | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 36569906e..5c6561b62 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -44,15 +44,15 @@ pub struct HeedContext<'t> { rtxn: &'t heed::RoTxn<'t>, index: &'t Index, words_fst: fst::Set>, + words_prefixes_fst: fst::Set>, } impl<'a> Context for HeedContext<'a> { fn query_docids(&self, query: &Query) -> anyhow::Result> { match (&query.kind, query.prefix) { - // TODO de-comment when ~ready - // (QueryKind::Exact { word, .. }, true) if in_prefix_cache(&word) => { - // Ok(self.index.prefix_docids.get(self.rtxn, &word)?) - // }, + (QueryKind::Exact { word, .. }, true) if self.in_prefix_cache(&word) => { + Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?) + }, (QueryKind::Exact { word, .. }, true) => { let words = word_typos(&word, true, 0, &self.words_fst)?; let mut docids = RoaringBitmap::new(); @@ -78,6 +78,19 @@ impl<'a> Context for HeedContext<'a> { fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result> { // TODO add prefix cache for Tolerant-Exact-true and Exact-Exact-true match (&left.kind, &right.kind, right.prefix) { + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { + let key = (left.as_str(), right.as_str(), distance); + Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?) + }, + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { + let words = word_typos(&left, false, *typo, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let key = (word.as_str(), right.as_str(), distance); + docids.union_with(&self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) => { let words = word_typos(&right, true, 0, &self.words_fst)?; let mut docids = RoaringBitmap::new(); @@ -144,11 +157,17 @@ impl<'a> Context for HeedContext<'a> { impl<'t> HeedContext<'t> { pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result { let words_fst = index.words_fst(rtxn)?; + let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; Ok(Self { rtxn, index, words_fst, + words_prefixes_fst, }) } + + fn in_prefix_cache(&self, word: &str) -> bool { + self.words_prefixes_fst.contains(word) + } } From 907482c8acac6236cf3f5b28271b248f3d511843 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 16:21:21 +0100 Subject: [PATCH 07/50] clean docids fetchers --- milli/src/search/criteria/mod.rs | 86 +++++++++++-------------------- milli/src/search/criteria/typo.rs | 20 +++---- 2 files changed, 41 insertions(+), 65 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 5c6561b62..9cb6547c9 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -36,8 +36,8 @@ impl Default for Candidates { } } pub trait Context { - fn query_docids(&self, query: &Query) -> anyhow::Result>; - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) ->anyhow::Result>; + fn query_docids(&self, query: &Query) -> anyhow::Result; + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) ->anyhow::Result; fn words_fst<'t>(&self) -> &'t fst::Set>; } pub struct HeedContext<'t> { @@ -48,10 +48,10 @@ pub struct HeedContext<'t> { } impl<'a> Context for HeedContext<'a> { - fn query_docids(&self, query: &Query) -> anyhow::Result> { + fn query_docids(&self, query: &Query) -> anyhow::Result { match (&query.kind, query.prefix) { (QueryKind::Exact { word, .. }, true) if self.in_prefix_cache(&word) => { - Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?) + Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?.unwrap_or_default()) }, (QueryKind::Exact { word, .. }, true) => { let words = word_typos(&word, true, 0, &self.words_fst)?; @@ -59,10 +59,10 @@ impl<'a> Context for HeedContext<'a> { for (word, _typo) in words { docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); } - Ok(Some(docids)) + Ok(docids) }, (QueryKind::Exact { word, .. }, false) => { - Ok(self.index.word_docids.get(self.rtxn, &word)?) + Ok(self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()) }, (QueryKind::Tolerant { typo, word }, prefix) => { let words = word_typos(&word, prefix, *typo, &self.words_fst)?; @@ -70,81 +70,46 @@ impl<'a> Context for HeedContext<'a> { for (word, _typo) in words { docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); } - Ok(Some(docids)) + Ok(docids) }, } } - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result> { - // TODO add prefix cache for Tolerant-Exact-true and Exact-Exact-true + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result { match (&left.kind, &right.kind, right.prefix) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { let key = (left.as_str(), right.as_str(), distance); - Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?) + Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) }, (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { - let words = word_typos(&left, false, *typo, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let key = (word.as_str(), right.as_str(), distance); - docids.union_with(&self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - Ok(Some(docids)) + let l_words = word_typos(&left, false, *typo, &self.words_fst)?; + self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) }, (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) => { - let words = word_typos(&right, true, 0, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let key = (left.as_str(), word.as_str(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - Ok(Some(docids)) + let r_words = word_typos(&right, true, 0, &self.words_fst)?; + self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) }, (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) => { let l_words = word_typos(&left, false, *typo, &self.words_fst)?; let r_words = word_typos(&right, true, 0, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (left, _typo) in l_words { - for (right, _typo) in r_words.iter() { - let key = (left.as_str(), right.as_str(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - } - Ok(Some(docids)) + self.all_word_pair_proximity_docids(&l_words, &r_words, distance) }, (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, false) => { - let words = word_typos(&left, false, *typo, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let key = (word.as_str(), right.as_str(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - Ok(Some(docids)) + let l_words = word_typos(&left, false, *typo, &self.words_fst)?; + self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) }, (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }, prefix) => { - let words = word_typos(&right, prefix, *typo, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let key = (left.as_str(), word.as_str(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - Ok(Some(docids)) + let r_words = word_typos(&right, prefix, *typo, &self.words_fst)?; + self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) }, (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }, prefix) => { let l_words = word_typos(&left, false, *l_typo, &self.words_fst)?; let r_words = word_typos(&right, prefix, *r_typo, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (left, _typo) in l_words { - for (right, _typo) in r_words.iter() { - let key = (left.as_str(), right.as_str(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - } - Ok(Some(docids)) + self.all_word_pair_proximity_docids(&l_words, &r_words, distance) }, (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, false) => { let key = (left.as_str(), right.as_str(), distance); - Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?) + Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) }, } } @@ -170,4 +135,15 @@ impl<'t> HeedContext<'t> { fn in_prefix_cache(&self, word: &str) -> bool { self.words_prefixes_fst.contains(word) } + + fn all_word_pair_proximity_docids, U: AsRef>(&self, left_words: &[(T, u8)], right_words: &[(U, u8)], distance: u8) -> anyhow::Result { + let mut docids = RoaringBitmap::new(); + for (left, _l_typo) in left_words { + for (right, _r_typo) in right_words { + let key = (left.as_ref(), right.as_ref(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + } + Ok(docids) + } } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index b3608c397..9834bbc21 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -180,17 +180,17 @@ fn resolve_candidates<'t>( match (&slice[0], &slice[1]) { (Operation::Query(left), Operation::Query(right)) => { match ctx.query_pair_proximity_docids(left, right, 1)? { - Some(pair_docids) => { - if first_loop { - candidates = pair_docids; - first_loop = false; - } else { - candidates.intersect_with(&pair_docids) - } + pair_docids if pair_docids.is_empty() => { + return Ok(RoaringBitmap::new()) + }, + pair_docids if first_loop => { + candidates = pair_docids; + first_loop = false; + }, + pair_docids => { + candidates.intersect_with(&pair_docids); }, - None => return Ok(RoaringBitmap::new()), } - }, _ => bail!("invalid consecutive query type"), } @@ -206,7 +206,7 @@ fn resolve_candidates<'t>( Ok(candidates) }, Query(q) => if q.kind.typo() == number_typos { - Ok(ctx.query_docids(q)?.unwrap_or_default()) + Ok(ctx.query_docids(q)?) } else { Ok(RoaringBitmap::new()) }, From 4128bdc859d394429e9261c457c2c743f79c3849 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 16:50:38 +0100 Subject: [PATCH 08/50] reduce match possibilities in docids fetchers --- milli/src/search/criteria/mod.rs | 93 +++++++++++++++++--------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 9cb6547c9..d8bd21c01 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -49,26 +49,28 @@ pub struct HeedContext<'t> { impl<'a> Context for HeedContext<'a> { fn query_docids(&self, query: &Query) -> anyhow::Result { - match (&query.kind, query.prefix) { - (QueryKind::Exact { word, .. }, true) if self.in_prefix_cache(&word) => { - Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?.unwrap_or_default()) - }, - (QueryKind::Exact { word, .. }, true) => { - let words = word_typos(&word, true, 0, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); + match &query.kind { + QueryKind::Exact { word, .. } => { + if query.prefix && self.in_prefix_cache(&word) { + Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?.unwrap_or_default()) + } else if query.prefix { + let words = word_typos(&word, true, 0, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let current_docids = self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + } else { + Ok(self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()) } - Ok(docids) }, - (QueryKind::Exact { word, .. }, false) => { - Ok(self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()) - }, - (QueryKind::Tolerant { typo, word }, prefix) => { - let words = word_typos(&word, prefix, *typo, &self.words_fst)?; + QueryKind::Tolerant { typo, word } => { + let words = word_typos(&word, query.prefix, *typo, &self.words_fst)?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { - docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); + let current_docids = self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default(); + docids.union_with(¤t_docids); } Ok(docids) }, @@ -76,41 +78,47 @@ impl<'a> Context for HeedContext<'a> { } fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result { - match (&left.kind, &right.kind, right.prefix) { - (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { - let key = (left.as_str(), right.as_str(), distance); - Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) + let prefix = right.prefix; + + match (&left.kind, &right.kind) { + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { + if prefix && self.in_prefix_cache(&right) { + let key = (left.as_str(), right.as_str(), distance); + Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) + } else if prefix { + let r_words = word_typos(&right, true, 0, &self.words_fst)?; + self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) + } else { + let key = (left.as_str(), right.as_str(), distance); + Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) + } }, - (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { let l_words = word_typos(&left, false, *typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) + if prefix && self.in_prefix_cache(&right) { + let mut docids = RoaringBitmap::new(); + for (left, _) in l_words { + let key = (left.as_ref(), right.as_ref(), distance); + let current_docids = self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + } else if prefix { + let r_words = word_typos(&right, true, 0, &self.words_fst)?; + self.all_word_pair_proximity_docids(&l_words, &r_words, distance) + } else { + self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) + } }, - (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) => { - let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) - }, - (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) => { - let l_words = word_typos(&left, false, *typo, &self.words_fst)?; - let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &r_words, distance) - }, - (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, false) => { - let l_words = word_typos(&left, false, *typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) - }, - (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }, prefix) => { + (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_typos(&right, prefix, *typo, &self.words_fst)?; self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) }, - (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }, prefix) => { + (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { let l_words = word_typos(&left, false, *l_typo, &self.words_fst)?; let r_words = word_typos(&right, prefix, *r_typo, &self.words_fst)?; self.all_word_pair_proximity_docids(&l_words, &r_words, distance) }, - (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, false) => { - let key = (left.as_str(), right.as_str(), distance); - Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) - }, } } @@ -141,7 +149,8 @@ impl<'t> HeedContext<'t> { for (left, _l_typo) in left_words { for (right, _r_typo) in right_words { let key = (left.as_ref(), right.as_ref(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + let current_docids = self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); + docids.union_with(¤t_docids); } } Ok(docids) From 86bcecf840a978d3ecf775af27714c4c0362ad33 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 17:02:26 +0100 Subject: [PATCH 09/50] change variable's name from distance to proximity --- milli/src/search/criteria/mod.rs | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d8bd21c01..d21102e64 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -37,7 +37,7 @@ impl Default for Candidates { } pub trait Context { fn query_docids(&self, query: &Query) -> anyhow::Result; - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) ->anyhow::Result; + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, proximity: u8) ->anyhow::Result; fn words_fst<'t>(&self) -> &'t fst::Set>; } pub struct HeedContext<'t> { @@ -77,19 +77,19 @@ impl<'a> Context for HeedContext<'a> { } } - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result { + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, proximity: u8) -> anyhow::Result { let prefix = right.prefix; match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix && self.in_prefix_cache(&right) { - let key = (left.as_str(), right.as_str(), distance); + let key = (left.as_str(), right.as_str(), proximity); Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) } else if prefix { let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) + self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, proximity) } else { - let key = (left.as_str(), right.as_str(), distance); + let key = (left.as_str(), right.as_str(), proximity); Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) } }, @@ -98,26 +98,26 @@ impl<'a> Context for HeedContext<'a> { if prefix && self.in_prefix_cache(&right) { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { - let key = (left.as_ref(), right.as_ref(), distance); + let key = (left.as_ref(), right.as_ref(), proximity); let current_docids = self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); docids.union_with(¤t_docids); } Ok(docids) } else if prefix { let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &r_words, distance) + self.all_word_pair_proximity_docids(&l_words, &r_words, proximity) } else { - self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) + self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], proximity) } }, (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_typos(&right, prefix, *typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) + self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, proximity) }, (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { let l_words = word_typos(&left, false, *l_typo, &self.words_fst)?; let r_words = word_typos(&right, prefix, *r_typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &r_words, distance) + self.all_word_pair_proximity_docids(&l_words, &r_words, proximity) }, } } @@ -144,11 +144,16 @@ impl<'t> HeedContext<'t> { self.words_prefixes_fst.contains(word) } - fn all_word_pair_proximity_docids, U: AsRef>(&self, left_words: &[(T, u8)], right_words: &[(U, u8)], distance: u8) -> anyhow::Result { + fn all_word_pair_proximity_docids, U: AsRef>( + &self, + left_words: &[(T, u8)], + right_words: &[(U, u8)], + proximity: u8 + ) -> anyhow::Result { let mut docids = RoaringBitmap::new(); for (left, _l_typo) in left_words { for (right, _r_typo) in right_words { - let key = (left.as_ref(), right.as_ref(), distance); + let key = (left.as_ref(), right.as_ref(), proximity); let current_docids = self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); docids.union_with(¤t_docids); } From 5344abc0082233906006a652723f6d03a34fb739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 17:34:22 +0100 Subject: [PATCH 10/50] Introduce the CriterionResult return type --- milli/src/search/criteria/mod.rs | 12 +++++++++++- milli/src/search/criteria/typo.rs | 24 ++++++++++++++++++------ milli/src/search/mod.rs | 4 ++-- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d21102e64..45689bbe5 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -10,7 +10,17 @@ use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; pub trait Criterion { - fn next(&mut self) -> anyhow::Result, RoaringBitmap)>>; + fn next(&mut self) -> anyhow::Result>; +} + +/// The result of a call to the parent criterion. +pub struct CriterionResult { + /// The query tree that must be used by the children criterion to fetch candidates. + pub query_tree: Option, + /// The candidates that this criterion is allowed to return subsets of. + pub candidates: RoaringBitmap, + /// Candidates that comes from the current bucket of the initial criterion. + pub bucket_candidates: Option, } /// Either a set of candidates that defines the candidates diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 9834bbc21..781ea1ec8 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::search::word_typos; -use super::{Candidates, Criterion, Context}; +use super::{Candidates, Criterion, CriterionResult, Context}; // FIXME we must stop when the number of typos is equal to // the maximum number of typos for this query tree. @@ -51,7 +51,7 @@ impl<'t> Typo<'t> { } impl<'t> Criterion for Typo<'t> { - fn next(&mut self) -> anyhow::Result, RoaringBitmap)>> { + fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; while self.number_typos < MAX_NUM_TYPOS { match (&mut self.query_tree, &mut self.candidates) { @@ -68,7 +68,11 @@ impl<'t> Criterion for Typo<'t> { candidates.difference_with(&new_candidates); self.number_typos += 1; - return Ok(Some((Some(new_query_tree), new_candidates))); + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: new_candidates, + bucket_candidates: None, + })); }, (Some(query_tree), Forbidden(candidates)) => { // TODO if number_typos >= 2 the generated query_tree will allways be the same, @@ -79,16 +83,24 @@ impl<'t> Criterion for Typo<'t> { candidates.union_with(&new_candidates); self.number_typos += 1; - return Ok(Some((Some(new_query_tree), new_candidates))); + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: new_candidates, + bucket_candidates: None, + })); }, (None, Allowed(_)) => { - return Ok(Some((None, take(&mut self.candidates).into_inner()))); + return Ok(Some(CriterionResult { + query_tree: None, + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: None, + })); }, (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { match parent.next()? { - Some((query_tree, candidates)) => { + Some(CriterionResult { query_tree, candidates, .. }) => { self.query_tree = query_tree; self.candidates = Candidates::Allowed(candidates); }, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index d061df2b9..89abb01b4 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -18,7 +18,7 @@ use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec} use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::mdfs::Mdfs; use crate::query_tokens::{query_tokens, QueryToken}; -use crate::search::criteria::Criterion; +use crate::search::criteria::{Criterion, CriterionResult}; use crate::search::criteria::typo::Typo; use crate::{Index, FieldId, DocumentId}; @@ -294,7 +294,7 @@ impl<'a> Search<'a> { let mut offset = self.offset; let mut limit = self.limit; let mut documents_ids = Vec::new(); - while let Some((_qt, docids)) = criteria.next()? { + while let Some(CriterionResult { candidates: docids, .. }) = criteria.next()? { let mut len = docids.len() as usize; let mut docids = docids.into_iter(); From 229130ed259606d03ecdc22309d102173c040e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 17:45:56 +0100 Subject: [PATCH 11/50] Correctly compute the bucket candidates for the Typo criterion --- milli/src/search/criteria/typo.rs | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 781ea1ec8..31b56d700 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -16,6 +16,7 @@ pub struct Typo<'t> { query_tree: Option, number_typos: u8, candidates: Candidates, + bucket_candidates: Option, parent: Option>, } @@ -31,6 +32,7 @@ impl<'t> Typo<'t> { query_tree, number_typos: 0, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + bucket_candidates: None, parent: None, }) } @@ -45,6 +47,7 @@ impl<'t> Typo<'t> { query_tree: None, number_typos: 0, candidates: Candidates::default(), + bucket_candidates: None, parent: Some(parent), }) } @@ -68,10 +71,15 @@ impl<'t> Criterion for Typo<'t> { candidates.difference_with(&new_candidates); self.number_typos += 1; + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; + return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: new_candidates, - bucket_candidates: None, + bucket_candidates, })); }, (Some(query_tree), Forbidden(candidates)) => { @@ -83,26 +91,33 @@ impl<'t> Criterion for Typo<'t> { candidates.union_with(&new_candidates); self.number_typos += 1; + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; + return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: new_candidates, - bucket_candidates: None, + bucket_candidates, })); }, (None, Allowed(_)) => { + let candidates = take(&mut self.candidates).into_inner(); return Ok(Some(CriterionResult { query_tree: None, - candidates: take(&mut self.candidates).into_inner(), - bucket_candidates: None, + candidates: candidates.clone(), + bucket_candidates: Some(candidates), })); }, (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { match parent.next()? { - Some(CriterionResult { query_tree, candidates, .. }) => { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; self.candidates = Candidates::Allowed(candidates); + self.bucket_candidates = bucket_candidates; }, None => return Ok(None), } From fea9ffc46a92a0f9c7f295dd2379317a2719c94a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 17:50:46 +0100 Subject: [PATCH 12/50] Use the bucket candidates in the search module --- milli/src/search/mod.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 89abb01b4..d94bc8831 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -294,19 +294,24 @@ impl<'a> Search<'a> { let mut offset = self.offset; let mut limit = self.limit; let mut documents_ids = Vec::new(); - while let Some(CriterionResult { candidates: docids, .. }) = criteria.next()? { + let mut initial_candidates = RoaringBitmap::new(); + while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next()? { - let mut len = docids.len() as usize; - let mut docids = docids.into_iter(); + let mut len = candidates.len() as usize; + let mut candidates = candidates.into_iter(); + + if let Some(docids) = bucket_candidates { + initial_candidates.union_with(&docids); + } if offset != 0 { - docids.by_ref().skip(offset).for_each(drop); + candidates.by_ref().skip(offset).for_each(drop); offset = offset.saturating_sub(len.min(offset)); len = len.saturating_sub(len.min(offset)); } if len != 0 { - documents_ids.extend(docids.take(limit)); + documents_ids.extend(candidates.take(limit)); limit = limit.saturating_sub(len.min(limit)); } @@ -314,8 +319,7 @@ impl<'a> Search<'a> { } let found_words = HashSet::new(); - let candidates = RoaringBitmap::new(); - Ok(SearchResult { found_words, candidates, documents_ids }) + Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids }) // let order_by_facet = { // let criteria = self.index.criteria(self.rtxn)?; From 9ccaea2afc7b147bd367fd283f1d87bf2c1710e1 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 18:16:21 +0100 Subject: [PATCH 13/50] simplify criterion context --- milli/src/search/criteria/mod.rs | 189 ++++++++++++++++-------------- milli/src/search/criteria/typo.rs | 6 +- 2 files changed, 106 insertions(+), 89 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 45689bbe5..d033f5707 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -46,9 +46,12 @@ impl Default for Candidates { } } pub trait Context { - fn query_docids(&self, query: &Query) -> anyhow::Result; - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, proximity: u8) ->anyhow::Result; + fn word_docids(&self, word: &str) -> heed::Result>; + fn word_prefix_docids(&self, word: &str) -> heed::Result>; + fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; + fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; fn words_fst<'t>(&self) -> &'t fst::Set>; + fn in_prefix_cache(&self, word: &str) -> bool; } pub struct HeedContext<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -58,83 +61,31 @@ pub struct HeedContext<'t> { } impl<'a> Context for HeedContext<'a> { - fn query_docids(&self, query: &Query) -> anyhow::Result { - match &query.kind { - QueryKind::Exact { word, .. } => { - if query.prefix && self.in_prefix_cache(&word) { - Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?.unwrap_or_default()) - } else if query.prefix { - let words = word_typos(&word, true, 0, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let current_docids = self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default(); - docids.union_with(¤t_docids); - } - Ok(docids) - } else { - Ok(self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()) - } - }, - QueryKind::Tolerant { typo, word } => { - let words = word_typos(&word, query.prefix, *typo, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let current_docids = self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default(); - docids.union_with(¤t_docids); - } - Ok(docids) - }, - } + fn word_docids(&self, word: &str) -> heed::Result> { + self.index.word_docids.get(self.rtxn, &word) } - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, proximity: u8) -> anyhow::Result { - let prefix = right.prefix; + fn word_prefix_docids(&self, word: &str) -> heed::Result> { + self.index.word_prefix_docids.get(self.rtxn, &word) + } - match (&left.kind, &right.kind) { - (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { - if prefix && self.in_prefix_cache(&right) { - let key = (left.as_str(), right.as_str(), proximity); - Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) - } else if prefix { - let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, proximity) - } else { - let key = (left.as_str(), right.as_str(), proximity); - Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) - } - }, - (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { - let l_words = word_typos(&left, false, *typo, &self.words_fst)?; - if prefix && self.in_prefix_cache(&right) { - let mut docids = RoaringBitmap::new(); - for (left, _) in l_words { - let key = (left.as_ref(), right.as_ref(), proximity); - let current_docids = self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); - docids.union_with(¤t_docids); - } - Ok(docids) - } else if prefix { - let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &r_words, proximity) - } else { - self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], proximity) - } - }, - (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { - let r_words = word_typos(&right, prefix, *typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, proximity) - }, - (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { - let l_words = word_typos(&left, false, *l_typo, &self.words_fst)?; - let r_words = word_typos(&right, prefix, *r_typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &r_words, proximity) - }, - } + fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left, right, proximity); + self.index.word_pair_proximity_docids.get(self.rtxn, &key) + } + + fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left, right, proximity); + self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) } fn words_fst<'t>(&self) -> &'t fst::Set> { &self.words_fst } + + fn in_prefix_cache(&self, word: &str) -> bool { + self.words_prefixes_fst.contains(word) + } } impl<'t> HeedContext<'t> { @@ -149,25 +100,91 @@ impl<'t> HeedContext<'t> { words_prefixes_fst, }) } +} - fn in_prefix_cache(&self, word: &str) -> bool { - self.words_prefixes_fst.contains(word) +fn all_word_pair_proximity_docids, U: AsRef>( + ctx: &dyn Context, + left_words: &[(T, u8)], + right_words: &[(U, u8)], + proximity: u8 +) -> anyhow::Result { + let mut docids = RoaringBitmap::new(); + for (left, _l_typo) in left_words { + for (right, _r_typo) in right_words { + let current_docids = ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } } + Ok(docids) +} - fn all_word_pair_proximity_docids, U: AsRef>( - &self, - left_words: &[(T, u8)], - right_words: &[(U, u8)], - proximity: u8 - ) -> anyhow::Result { - let mut docids = RoaringBitmap::new(); - for (left, _l_typo) in left_words { - for (right, _r_typo) in right_words { - let key = (left.as_ref(), right.as_ref(), proximity); - let current_docids = self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); +fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result { + match &query.kind { + QueryKind::Exact { word, .. } => { + if query.prefix && ctx.in_prefix_cache(&word) { + Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default()) + } else if query.prefix { + let words = word_typos(&word, true, 0, ctx.words_fst())?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + } else { + Ok(ctx.word_docids(&word)?.unwrap_or_default()) + } + }, + QueryKind::Tolerant { typo, word } => { + let words = word_typos(&word, query.prefix, *typo, ctx.words_fst())?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); docids.union_with(¤t_docids); } - } - Ok(docids) + Ok(docids) + }, + } +} + +fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, proximity: u8) -> anyhow::Result { + let prefix = right.prefix; + + match (&left.kind, &right.kind) { + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { + if prefix && ctx.in_prefix_cache(&right) { + Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) + } else if prefix { + let r_words = word_typos(&right, true, 0, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + } else { + Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) + } + }, + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { + let l_words = word_typos(&left, false, *typo, ctx.words_fst())?; + if prefix && ctx.in_prefix_cache(&right) { + let mut docids = RoaringBitmap::new(); + for (left, _) in l_words { + let current_docids = ctx.word_prefix_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + } else if prefix { + let r_words = word_typos(&right, true, 0, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) + } else { + all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) + } + }, + (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { + let r_words = word_typos(&right, prefix, *typo, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + }, + (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { + let l_words = word_typos(&left, false, *l_typo, ctx.words_fst())?; + let r_words = word_typos(&right, prefix, *r_typo, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) + }, } } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 31b56d700..a1b8e1f16 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::search::word_typos; -use super::{Candidates, Criterion, CriterionResult, Context}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; // FIXME we must stop when the number of typos is equal to // the maximum number of typos for this query tree. @@ -206,7 +206,7 @@ fn resolve_candidates<'t>( for slice in ops.windows(2) { match (&slice[0], &slice[1]) { (Operation::Query(left), Operation::Query(right)) => { - match ctx.query_pair_proximity_docids(left, right, 1)? { + match query_pair_proximity_docids(ctx, left, right, 1)? { pair_docids if pair_docids.is_empty() => { return Ok(RoaringBitmap::new()) }, @@ -233,7 +233,7 @@ fn resolve_candidates<'t>( Ok(candidates) }, Query(q) => if q.kind.typo() == number_typos { - Ok(ctx.query_docids(q)?) + Ok(query_docids(ctx, q)?) } else { Ok(RoaringBitmap::new()) }, From 67c71130dfcfbd7ce78089e49b14ce5b8ce2d13f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Feb 2021 11:00:31 +0100 Subject: [PATCH 14/50] Reduce the number of calls to alterate_query_tree --- milli/src/search/criteria/typo.rs | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a1b8e1f16..57db6e855 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -63,9 +63,16 @@ impl<'t> Criterion for Typo<'t> { self.candidates = Candidates::default(); }, (Some(query_tree), Allowed(candidates)) => { - // TODO if number_typos >= 2 the generated query_tree will allways be the same, - // generate a new one on each iteration is a waste of time. - let new_query_tree = alterate_query_tree(&self.ctx.words_fst(), query_tree.clone(), self.number_typos)?; + let fst = self.ctx.words_fst(); + let new_query_tree = if self.number_typos < 2 { + alterate_query_tree(&fst, query_tree.clone(), self.number_typos)? + } else if self.number_typos == 2 { + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos)?; + query_tree.clone() + } else { + query_tree.clone() + }; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); @@ -83,9 +90,16 @@ impl<'t> Criterion for Typo<'t> { })); }, (Some(query_tree), Forbidden(candidates)) => { - // TODO if number_typos >= 2 the generated query_tree will allways be the same, - // generate a new one on each iteration is a waste of time. - let new_query_tree = alterate_query_tree(&self.ctx.words_fst(), query_tree.clone(), self.number_typos)?; + let fst = self.ctx.words_fst(); + let new_query_tree = if self.number_typos < 2 { + alterate_query_tree(&fst, query_tree.clone(), self.number_typos)? + } else if self.number_typos == 2 { + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos)?; + query_tree.clone() + } else { + query_tree.clone() + }; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); From 4da6e1ea9c31ad10df47af0d15d9a67b248b09a8 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Feb 2021 14:40:59 +0100 Subject: [PATCH 15/50] add cache in typo criterion --- milli/src/search/criteria/typo.rs | 40 ++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 57db6e855..11c96a4d4 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -1,4 +1,4 @@ -use std::{borrow::Cow, mem::take}; +use std::{borrow::Cow, collections::HashMap, mem::take}; use anyhow::bail; use roaring::RoaringBitmap; @@ -18,6 +18,7 @@ pub struct Typo<'t> { candidates: Candidates, bucket_candidates: Option, parent: Option>, + cache: HashMap<(Operation, u8), RoaringBitmap>, } impl<'t> Typo<'t> { @@ -34,6 +35,7 @@ impl<'t> Typo<'t> { candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), bucket_candidates: None, parent: None, + cache: HashMap::new(), }) } @@ -49,6 +51,7 @@ impl<'t> Typo<'t> { candidates: Candidates::default(), bucket_candidates: None, parent: Some(parent), + cache: HashMap::new(), }) } } @@ -73,7 +76,7 @@ impl<'t> Criterion for Typo<'t> { query_tree.clone() }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.cache)?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); self.number_typos += 1; @@ -100,7 +103,7 @@ impl<'t> Criterion for Typo<'t> { query_tree.clone() }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.cache)?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); self.number_typos += 1; @@ -196,6 +199,7 @@ fn resolve_candidates<'t>( ctx: &'t dyn Context, query_tree: &Operation, number_typos: u8, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, ) -> anyhow::Result { // FIXME add a cache @@ -206,13 +210,14 @@ fn resolve_candidates<'t>( ctx: &'t dyn Context, query_tree: &Operation, number_typos: u8, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, ) -> anyhow::Result { use Operation::{And, Consecutive, Or, Query}; match query_tree { And(ops) => { - mdfs(ctx, ops, number_typos) + mdfs(ctx, ops, number_typos, cache) }, Consecutive(ops) => { let mut candidates = RoaringBitmap::new(); @@ -241,7 +246,7 @@ fn resolve_candidates<'t>( Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { - let docids = resolve_operation(ctx, op, number_typos)?; + let docids = resolve_operation(ctx, op, number_typos, cache)?; candidates.union_with(&docids); } Ok(candidates) @@ -259,17 +264,34 @@ fn resolve_candidates<'t>( ctx: &'t dyn Context, branches: &[Operation], mana: u8, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, ) -> anyhow::Result { match branches.split_first() { - Some((head, [])) => resolve_operation(ctx, head, mana), + Some((head, [])) => { + if let Some(candidates) = cache.get(&(head.clone(), mana)) { + Ok(candidates.clone()) + } else { + let candidates = resolve_operation(ctx, head, mana, cache)?; + cache.insert((head.clone(), mana), candidates.clone()); + Ok(candidates) + } + }, Some((head, tail)) => { let mut candidates = RoaringBitmap::new(); for m in 0..=mana { - let mut head_candidates = resolve_operation(ctx, head, m)?; + let mut head_candidates = { + if let Some(candidates) = cache.get(&(head.clone(), m)) { + candidates.clone() + } else { + let candidates = resolve_operation(ctx, head, m, cache)?; + cache.insert((head.clone(), m), candidates.clone()); + candidates + } + }; if !head_candidates.is_empty() { - let tail_candidates = mdfs(ctx, tail, mana - m)?; + let tail_candidates = mdfs(ctx, tail, mana - m, cache)?; head_candidates.intersect_with(&tail_candidates); candidates.union_with(&head_candidates); } @@ -281,5 +303,5 @@ fn resolve_candidates<'t>( } } - resolve_operation(ctx, query_tree, number_typos) + resolve_operation(ctx, query_tree, number_typos, cache) } From 41fc51ebcf8d6f0d029cc4fdc8286b2f862718fc Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Feb 2021 15:08:56 +0100 Subject: [PATCH 16/50] optimize alterate_query_tree when number_typos is zero --- milli/src/search/criteria/typo.rs | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 11c96a4d4..6fd234d0b 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -171,19 +171,27 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos)) }, Operation::Query(q) => { - // TODO may be optimized when number_typos == 0 if let QueryKind::Tolerant { typo, word } = &q.kind { - let typo = *typo.min(&number_typos); - let words = word_typos(word, q.prefix, typo, words_fst)?; + // if no typo is allowed we don't call word_typos(..), + // and directly create an Exact query + if number_typos == 0 { + *operation = Operation::Query(Query { + prefix: q.prefix, + kind: QueryKind::Exact { original_typo: 0, word: word.clone() }, + }); + } else { + let typo = *typo.min(&number_typos); + let words = word_typos(word, q.prefix, typo, words_fst)?; - let queries = words.into_iter().map(|(word, _typo)| { - Operation::Query(Query { - prefix: false, - kind: QueryKind::Exact { original_typo: typo, word }, - }) - }).collect(); + let queries = words.into_iter().map(|(word, _typo)| { + Operation::Query(Query { + prefix: false, + kind: QueryKind::Exact { original_typo: typo, word }, + }) + }).collect(); - *operation = Operation::or(false, queries); + *operation = Operation::or(false, queries); + } } Ok(()) From 9e093d5ff31acd93cf98f037f4650dd92aece60c Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Feb 2021 15:45:47 +0100 Subject: [PATCH 17/50] add cache on alterate_query_tree function --- milli/src/search/criteria/typo.rs | 48 ++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 6fd234d0b..0284d448d 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -18,7 +18,8 @@ pub struct Typo<'t> { candidates: Candidates, bucket_candidates: Option, parent: Option>, - cache: HashMap<(Operation, u8), RoaringBitmap>, + candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, + typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>, } impl<'t> Typo<'t> { @@ -35,7 +36,8 @@ impl<'t> Typo<'t> { candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), bucket_candidates: None, parent: None, - cache: HashMap::new(), + candidates_cache: HashMap::new(), + typo_cache: HashMap::new(), }) } @@ -51,7 +53,8 @@ impl<'t> Typo<'t> { candidates: Candidates::default(), bucket_candidates: None, parent: Some(parent), - cache: HashMap::new(), + candidates_cache: HashMap::new(), + typo_cache: HashMap::new(), }) } } @@ -68,15 +71,15 @@ impl<'t> Criterion for Typo<'t> { (Some(query_tree), Allowed(candidates)) => { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; query_tree.clone() } else { query_tree.clone() }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.cache)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); self.number_typos += 1; @@ -95,15 +98,15 @@ impl<'t> Criterion for Typo<'t> { (Some(query_tree), Forbidden(candidates)) => { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; query_tree.clone() } else { query_tree.clone() }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.cache)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); self.number_typos += 1; @@ -156,19 +159,21 @@ fn alterate_query_tree( words_fst: &fst::Set>, mut query_tree: Operation, number_typos: u8, + typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>, ) -> anyhow::Result { fn recurse( words_fst: &fst::Set>, operation: &mut Operation, number_typos: u8, + typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>, ) -> anyhow::Result<()> { use Operation::{And, Consecutive, Or}; match operation { And(ops) | Consecutive(ops) | Or(_, ops) => { - ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos)) + ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, typo_cache)) }, Operation::Query(q) => { if let QueryKind::Tolerant { typo, word } = &q.kind { @@ -181,9 +186,16 @@ fn alterate_query_tree( }); } else { let typo = *typo.min(&number_typos); - let words = word_typos(word, q.prefix, typo, words_fst)?; + let cache_key = (word.clone(), q.prefix, typo); + let words = if let Some(derivations) = typo_cache.get(&cache_key) { + derivations.clone() + } else { + let derivations = word_typos(word, q.prefix, typo, words_fst)?; + typo_cache.insert(cache_key, derivations.clone()); + derivations + }; - let queries = words.into_iter().map(|(word, _typo)| { + let queries = words.into_iter().map(|(word, typo)| { Operation::Query(Query { prefix: false, kind: QueryKind::Exact { original_typo: typo, word }, @@ -199,7 +211,7 @@ fn alterate_query_tree( } } - recurse(words_fst, &mut query_tree, number_typos)?; + recurse(words_fst, &mut query_tree, number_typos, typo_cache)?; Ok(query_tree) } @@ -277,11 +289,12 @@ fn resolve_candidates<'t>( { match branches.split_first() { Some((head, [])) => { - if let Some(candidates) = cache.get(&(head.clone(), mana)) { + let cache_key = (head.clone(), mana); + if let Some(candidates) = cache.get(&cache_key) { Ok(candidates.clone()) } else { let candidates = resolve_operation(ctx, head, mana, cache)?; - cache.insert((head.clone(), mana), candidates.clone()); + cache.insert(cache_key, candidates.clone()); Ok(candidates) } }, @@ -290,11 +303,12 @@ fn resolve_candidates<'t>( for m in 0..=mana { let mut head_candidates = { - if let Some(candidates) = cache.get(&(head.clone(), m)) { + let cache_key = (head.clone(), m); + if let Some(candidates) = cache.get(&cache_key) { candidates.clone() } else { let candidates = resolve_operation(ctx, head, m, cache)?; - cache.insert((head.clone(), m), candidates.clone()); + cache.insert(cache_key, candidates.clone()); candidates } }; From a273c46559fc699a157213a216f25d1c3aef34a3 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Feb 2021 16:31:10 +0100 Subject: [PATCH 18/50] clean warnings --- milli/src/search/criteria/mod.rs | 18 +- milli/src/search/criteria/typo.rs | 10 +- milli/src/search/mod.rs | 321 ++---------------------------- milli/src/search/query_tree.rs | 2 +- 4 files changed, 25 insertions(+), 326 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d033f5707..3673aef78 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use crate::Index; -use crate::search::word_typos; +use crate::search::word_derivations; use roaring::RoaringBitmap; @@ -124,7 +124,7 @@ fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result anyhow::Result { - let words = word_typos(&word, query.prefix, *typo, ctx.words_fst())?; + let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst())?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); @@ -155,14 +155,14 @@ fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, p if prefix && ctx.in_prefix_cache(&right) { Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) } else if prefix { - let r_words = word_typos(&right, true, 0, ctx.words_fst())?; + let r_words = word_derivations(&right, true, 0, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) } else { Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) } }, (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { - let l_words = word_typos(&left, false, *typo, ctx.words_fst())?; + let l_words = word_derivations(&left, false, *typo, ctx.words_fst())?; if prefix && ctx.in_prefix_cache(&right) { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { @@ -171,19 +171,19 @@ fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, p } Ok(docids) } else if prefix { - let r_words = word_typos(&right, true, 0, ctx.words_fst())?; + let r_words = word_derivations(&right, true, 0, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } }, (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { - let r_words = word_typos(&right, prefix, *typo, ctx.words_fst())?; + let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) }, (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { - let l_words = word_typos(&left, false, *l_typo, ctx.words_fst())?; - let r_words = word_typos(&right, prefix, *r_typo, ctx.words_fst())?; + let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst())?; + let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) }, } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 0284d448d..a6f500bd5 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -4,7 +4,7 @@ use anyhow::bail; use roaring::RoaringBitmap; use crate::search::query_tree::{Operation, Query, QueryKind}; -use crate::search::word_typos; +use crate::search::word_derivations; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; // FIXME we must stop when the number of typos is equal to @@ -177,7 +177,7 @@ fn alterate_query_tree( }, Operation::Query(q) => { if let QueryKind::Tolerant { typo, word } = &q.kind { - // if no typo is allowed we don't call word_typos(..), + // if no typo is allowed we don't call word_derivations function, // and directly create an Exact query if number_typos == 0 { *operation = Operation::Query(Query { @@ -190,7 +190,7 @@ fn alterate_query_tree( let words = if let Some(derivations) = typo_cache.get(&cache_key) { derivations.clone() } else { - let derivations = word_typos(word, q.prefix, typo, words_fst)?; + let derivations = word_derivations(word, q.prefix, typo, words_fst)?; typo_cache.insert(cache_key, derivations.clone()); derivations }; @@ -222,10 +222,6 @@ fn resolve_candidates<'t>( cache: &mut HashMap<(Operation, u8), RoaringBitmap>, ) -> anyhow::Result { - // FIXME add a cache - // FIXME keep the cache between typos iterations - // cache: HashMap<(&Operation, u8), RoaringBitmap>, - fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index d94bc8831..6046cc8d2 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,26 +1,18 @@ use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt; use std::time::Instant; -use anyhow::{bail, Context}; use fst::{IntoStreamer, Streamer, Set}; -use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use log::debug; use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; -use ordered_float::OrderedFloat; use roaring::bitmap::RoaringBitmap; -use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; -use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; -use crate::mdfs::Mdfs; -use crate::query_tokens::{query_tokens, QueryToken}; use crate::search::criteria::{Criterion, CriterionResult}; use crate::search::criteria::typo::Typo; -use crate::{Index, FieldId, DocumentId}; +use crate::{Index, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetIter}; @@ -69,198 +61,6 @@ impl<'a> Search<'a> { self } - /// Extracts the query words from the query string and returns the DFAs accordingly. - /// TODO introduce settings for the number of typos regarding the words lengths. - fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { - let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2); - - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let words: Vec<_> = query_tokens(tokens).collect(); - - let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let number_of_words = words.len(); - - words.into_iter().enumerate().map(|(i, word)| { - let (word, quoted) = match word { - QueryToken::Free(token) => (token.text().to_string(), token.text().len() <= 3), - QueryToken::Quoted(token) => (token.text().to_string(), true), - }; - let is_last = i + 1 == number_of_words; - let is_prefix = is_last && !ends_with_whitespace && !quoted; - let lev = match word.len() { - 0..=4 => if quoted { lev0 } else { lev0 }, - 5..=8 => if quoted { lev0 } else { lev1 }, - _ => if quoted { lev0 } else { lev2 }, - }; - - let dfa = if is_prefix { - lev.build_prefix_dfa(&word) - } else { - lev.build_dfa(&word) - }; - - (word, is_prefix, dfa) - }) - .collect() - } - - /// Fetch the words from the given FST related to the given DFAs along with - /// the associated documents ids. - fn fetch_words_docids( - &self, - fst: &fst::Set>, - dfas: Vec<(String, bool, DFA)>, - ) -> anyhow::Result, RoaringBitmap)>> - { - // A Vec storing all the derived words from the original query words, associated - // with the distance from the original word and the docids where the words appears. - let mut derived_words = Vec::<(HashMap::, RoaringBitmap)>::with_capacity(dfas.len()); - - for (_word, _is_prefix, dfa) in dfas { - - let mut acc_derived_words = HashMap::new(); - let mut unions_docids = RoaringBitmap::new(); - let mut stream = fst.search_with_state(&dfa).into_stream(); - while let Some((word, state)) = stream.next() { - - let word = std::str::from_utf8(word)?; - let docids = self.index.word_docids.get(self.rtxn, word)?.unwrap(); - let distance = dfa.distance(state); - unions_docids.union_with(&docids); - acc_derived_words.insert(word.to_string(), (distance.to_u8(), docids)); - } - derived_words.push((acc_derived_words, unions_docids)); - } - - Ok(derived_words) - } - - /// Returns the set of docids that contains all of the query words. - fn compute_candidates( - derived_words: &[(HashMap, RoaringBitmap)], - ) -> RoaringBitmap - { - // We sort the derived words by inverse popularity, this way intersections are faster. - let mut derived_words: Vec<_> = derived_words.iter().collect(); - derived_words.sort_unstable_by_key(|(_, docids)| docids.len()); - - // we do a union between all the docids of each of the derived words, - // we got N unions (the number of original query words), we then intersect them. - let mut candidates = RoaringBitmap::new(); - - for (i, (_, union_docids)) in derived_words.iter().enumerate() { - if i == 0 { - candidates = union_docids.clone(); - } else { - candidates.intersect_with(&union_docids); - } - } - - candidates - } - - fn facet_ordered( - &self, - field_id: FieldId, - facet_type: FacetType, - ascending: bool, - mut documents_ids: RoaringBitmap, - limit: usize, - ) -> anyhow::Result> - { - let mut output: Vec<_> = match facet_type { - FacetType::Float => { - if documents_ids.len() <= 1000 { - let db = self.index.field_id_docid_facet_values.remap_key_type::(); - let mut docids_values = Vec::with_capacity(documents_ids.len() as usize); - for docid in documents_ids.iter() { - let left = (field_id, docid, f64::MIN); - let right = (field_id, docid, f64::MAX); - let mut iter = db.range(self.rtxn, &(left..=right))?; - let entry = if ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), ())) = entry.transpose()? { - docids_values.push((docid, OrderedFloat(value))); - } - } - docids_values.sort_unstable_by_key(|(_, value)| *value); - let iter = docids_values.into_iter().map(|(id, _)| id); - if ascending { - iter.take(limit).collect() - } else { - iter.rev().take(limit).collect() - } - } else { - let facet_fn = if ascending { - FacetIter::::new_reducing - } else { - FacetIter::::new_reverse_reducing - }; - let mut limit_tmp = limit; - let mut output = Vec::new(); - for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? { - let (_val, docids) = result?; - limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); - output.push(docids); - if limit_tmp == 0 { break } - } - output.into_iter().flatten().take(limit).collect() - } - }, - FacetType::Integer => { - if documents_ids.len() <= 1000 { - let db = self.index.field_id_docid_facet_values.remap_key_type::(); - let mut docids_values = Vec::with_capacity(documents_ids.len() as usize); - for docid in documents_ids.iter() { - let left = (field_id, docid, i64::MIN); - let right = (field_id, docid, i64::MAX); - let mut iter = db.range(self.rtxn, &(left..=right))?; - let entry = if ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), ())) = entry.transpose()? { - docids_values.push((docid, value)); - } - } - docids_values.sort_unstable_by_key(|(_, value)| *value); - let iter = docids_values.into_iter().map(|(id, _)| id); - if ascending { - iter.take(limit).collect() - } else { - iter.rev().take(limit).collect() - } - } else { - let facet_fn = if ascending { - FacetIter::::new_reducing - } else { - FacetIter::::new_reverse_reducing - }; - let mut limit_tmp = limit; - let mut output = Vec::new(); - for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? { - let (_val, docids) = result?; - limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); - output.push(docids); - if limit_tmp == 0 { break } - } - output.into_iter().flatten().take(limit).collect() - } - }, - FacetType::String => bail!("criteria facet type must be a number"), - }; - - // if there isn't enough documents to return we try to complete that list - // with documents that are maybe not faceted under this field and therefore - // not returned by the previous facet iteration. - if output.len() < limit { - output.iter().for_each(|n| { documents_ids.remove(*n); }); - let remaining = documents_ids.iter().take(limit - output.len()); - output.extend(remaining); - } - - Ok(output) - } - pub fn execute(&self) -> anyhow::Result { // We create the query tree by spliting the query into tokens. let before = Instant::now(); @@ -320,101 +120,6 @@ impl<'a> Search<'a> { let found_words = HashSet::new(); Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids }) - - // let order_by_facet = { - // let criteria = self.index.criteria(self.rtxn)?; - // let result = criteria.into_iter().flat_map(|criterion| { - // match criterion { - // Criterion::Asc(fid) => Some((fid, true)), - // Criterion::Desc(fid) => Some((fid, false)), - // _ => None - // } - // }).next(); - // match result { - // Some((attr_name, is_ascending)) => { - // let field_id_map = self.index.fields_ids_map(self.rtxn)?; - // let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?; - // let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?; - // let ftype = *faceted_fields.get(&fid) - // .with_context(|| format!("{:?} not found in the faceted fields.", attr_name)) - // .expect("corrupted data: "); - // Some((fid, ftype, is_ascending)) - // }, - // None => None, - // } - // }; - - // let before = Instant::now(); - // let (candidates, derived_words) = match (facet_candidates, derived_words) { - // (Some(mut facet_candidates), Some(derived_words)) => { - // let words_candidates = Self::compute_candidates(&derived_words); - // facet_candidates.intersect_with(&words_candidates); - // (facet_candidates, derived_words) - // }, - // (None, Some(derived_words)) => { - // (Self::compute_candidates(&derived_words), derived_words) - // }, - // (Some(facet_candidates), None) => { - // // If the query is not set or results in no DFAs but - // // there is some facet conditions we return a placeholder. - // let documents_ids = match order_by_facet { - // Some((fid, ftype, is_ascending)) => { - // self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)? - // }, - // None => facet_candidates.iter().take(limit).collect(), - // }; - // return Ok(SearchResult { - // documents_ids, - // candidates: facet_candidates, - // ..Default::default() - // }) - // }, - // (None, None) => { - // // If the query is not set or results in no DFAs we return a placeholder. - // let all_docids = self.index.documents_ids(self.rtxn)?; - // let documents_ids = match order_by_facet { - // Some((fid, ftype, is_ascending)) => { - // self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)? - // }, - // None => all_docids.iter().take(limit).collect(), - // }; - // return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() }) - // }, - // }; - - // debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed()); - - // // The mana depth first search is a revised DFS that explore - // // solutions in the order of their proximities. - // let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone()); - // let mut documents = Vec::new(); - - // // We execute the Mdfs iterator until we find enough documents. - // while documents.iter().map(RoaringBitmap::len).sum::() < limit as u64 { - // match mdfs.next().transpose()? { - // Some((proximity, answer)) => { - // debug!("answer with a proximity of {}: {:?}", proximity, answer); - // documents.push(answer); - // }, - // None => break, - // } - // } - - // let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); - // let documents_ids = match order_by_facet { - // Some((fid, ftype, order)) => { - // let mut ordered_documents = Vec::new(); - // for documents_ids in documents { - // let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?; - // ordered_documents.push(docids); - // if ordered_documents.iter().map(Vec::len).sum::() >= limit { break } - // } - // ordered_documents.into_iter().flatten().take(limit).collect() - // }, - // None => documents.into_iter().flatten().take(limit).collect(), - // }; - - // Ok(SearchResult { found_words, candidates, documents_ids }) } } @@ -438,19 +143,17 @@ pub struct SearchResult { pub documents_ids: Vec, } -pub fn word_typos(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>) -> anyhow::Result> { - let dfa = { - let lev = match max_typo { - 0 => &LEVDIST0, - 1 => &LEVDIST1, - _ => &LEVDIST2, - }; +pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>) -> anyhow::Result> { + let lev = match max_typo { + 0 => &LEVDIST0, + 1 => &LEVDIST1, + _ => &LEVDIST2, + }; - if is_prefix { - lev.build_prefix_dfa(&word) - } else { - lev.build_dfa(&word) - } + let dfa = if is_prefix { + lev.build_prefix_dfa(&word) + } else { + lev.build_dfa(&word) }; let mut derived_words = Vec::new(); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 9b253350e..02f6dc0c8 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -303,7 +303,7 @@ fn fetch_words(tree: &Operation, fst: &fst::Set>) -> FetchedWords { match query.kind.clone() { QueryKind::Exact { word, .. } => vec![(word, query.prefix)], QueryKind::Tolerant { typo, word } => { - if let Ok(words) = super::word_typos(&word, query.prefix, typo, fst) { + if let Ok(words) = super::word_derivations(&word, query.prefix, typo, fst) { words.into_iter().map(|(w, _)| (w, query.prefix)).collect() } else { vec![(word, query.prefix)] From c5a32fd4fa90c2914b1296937af517d7cc2beffd Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 23 Feb 2021 17:33:20 +0100 Subject: [PATCH 19/50] Fix the typo criterion --- milli/src/search/criteria/typo.rs | 115 ++++++++++++++++-------------- 1 file changed, 60 insertions(+), 55 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a6f500bd5..75f3f5666 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -3,17 +3,13 @@ use std::{borrow::Cow, collections::HashMap, mem::take}; use anyhow::bail; use roaring::RoaringBitmap; -use crate::search::query_tree::{Operation, Query, QueryKind}; +use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::word_derivations; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; -// FIXME we must stop when the number of typos is equal to -// the maximum number of typos for this query tree. -const MAX_NUM_TYPOS: u8 = 8; - pub struct Typo<'t> { ctx: &'t dyn Context, - query_tree: Option, + query_tree: Option<(usize, Operation)>, number_typos: u8, candidates: Candidates, bucket_candidates: Option, @@ -31,7 +27,7 @@ impl<'t> Typo<'t> { { Ok(Typo { ctx, - query_tree, + query_tree: query_tree.map(|op| (maximum_typo(&op), op)), number_typos: 0, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), bucket_candidates: None, @@ -62,65 +58,75 @@ impl<'t> Typo<'t> { impl<'t> Criterion for Typo<'t> { fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; - while self.number_typos < MAX_NUM_TYPOS { + loop { match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { self.query_tree = None; self.candidates = Candidates::default(); }, - (Some(query_tree), Allowed(candidates)) => { - let fst = self.ctx.words_fst(); - let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? - } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; - query_tree.clone() + (Some((max_typos, query_tree)), Allowed(candidates)) => { + if self.number_typos as usize > *max_typos { + self.query_tree = None; + self.candidates = Candidates::default(); } else { - query_tree.clone() - }; + let fst = self.ctx.words_fst(); + let new_query_tree = if self.number_typos < 2 { + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? + } else if self.number_typos == 2 { + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; + query_tree.clone() + } else { + query_tree.clone() + }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; - new_candidates.intersect_with(&candidates); - candidates.difference_with(&new_candidates); - self.number_typos += 1; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; + new_candidates.intersect_with(&candidates); + candidates.difference_with(&new_candidates); + self.number_typos += 1; - let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), - }; + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: new_candidates, - bucket_candidates, - })); + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: new_candidates, + bucket_candidates, + })); + } }, - (Some(query_tree), Forbidden(candidates)) => { - let fst = self.ctx.words_fst(); - let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? - } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; - query_tree.clone() + (Some((max_typos, query_tree)), Forbidden(candidates)) => { + if self.number_typos as usize > *max_typos { + self.query_tree = None; + self.candidates = Candidates::default(); } else { - query_tree.clone() - }; + let fst = self.ctx.words_fst(); + let new_query_tree = if self.number_typos < 2 { + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? + } else if self.number_typos == 2 { + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; + query_tree.clone() + } else { + query_tree.clone() + }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; - new_candidates.difference_with(&candidates); - candidates.union_with(&new_candidates); - self.number_typos += 1; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; + new_candidates.difference_with(&candidates); + candidates.union_with(&new_candidates); + self.number_typos += 1; - let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), - }; + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: new_candidates, - bucket_candidates, - })); + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: new_candidates, + bucket_candidates, + })); + } }, (None, Allowed(_)) => { let candidates = take(&mut self.candidates).into_inner(); @@ -135,7 +141,8 @@ impl<'t> Criterion for Typo<'t> { Some(parent) => { match parent.next()? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; + self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); + self.number_typos = 0; self.candidates = Candidates::Allowed(candidates); self.bucket_candidates = bucket_candidates; }, @@ -147,8 +154,6 @@ impl<'t> Criterion for Typo<'t> { }, } } - - Ok(None) } } From fb7e6df790d840e276a24dbc0d1e855f007692e2 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 24 Feb 2021 10:25:22 +0100 Subject: [PATCH 20/50] add tests on typo criterion --- milli/src/search/criteria/mod.rs | 137 +++++++++++++++++++++++++++ milli/src/search/criteria/typo.rs | 150 ++++++++++++++++++++++++++++++ milli/src/search/query_tree.rs | 8 +- 3 files changed, 293 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 3673aef78..77d92f6ea 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -14,6 +14,7 @@ pub trait Criterion { } /// The result of a call to the parent criterion. +#[derive(Debug, Clone, PartialEq)] pub struct CriterionResult { /// The query tree that must be used by the children criterion to fetch candidates. pub query_tree: Option, @@ -188,3 +189,139 @@ fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, p }, } } + +#[cfg(test)] +pub mod test { + use maplit::hashmap; + use rand::{Rng, SeedableRng, rngs::StdRng}; + + use super::*; + use std::collections::HashMap; + + fn s(s: &str) -> String { s.to_string() } + pub struct TestContext<'t> { + words_fst: fst::Set>, + word_docids: HashMap, + word_prefix_docids: HashMap, + word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + } + + impl<'a> Context for TestContext<'a> { + fn word_docids(&self, word: &str) -> heed::Result> { + Ok(self.word_docids.get(&word.to_string()).cloned()) + } + + fn word_prefix_docids(&self, word: &str) -> heed::Result> { + Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) + } + + fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left.to_string(), right.to_string(), proximity.into()); + Ok(self.word_pair_proximity_docids.get(&key).cloned()) + } + + fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left.to_string(), right.to_string(), proximity.into()); + Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) + } + + fn words_fst<'t>(&self) -> &'t fst::Set> { + &self.words_fst + } + + fn in_prefix_cache(&self, word: &str) -> bool { + self.word_prefix_docids.contains_key(&word.to_string()) + } + } + + impl<'a> Default for TestContext<'a> { + fn default() -> TestContext<'a> { + let mut rng = StdRng::seed_from_u64(102); + let rng = &mut rng; + + fn random_postings(rng: &mut R, len: usize) -> RoaringBitmap { + let mut values = Vec::::with_capacity(len); + while values.len() != len { + values.push(rng.gen()); + } + values.sort_unstable(); + + RoaringBitmap::from_sorted_iter(values.into_iter()) + } + + let word_docids = hashmap!{ + s("hello") => random_postings(rng, 1500), + s("hi") => random_postings(rng, 4000), + s("word") => random_postings(rng, 2500), + s("split") => random_postings(rng, 400), + s("ngrams") => random_postings(rng, 1400), + s("world") => random_postings(rng, 15_000), + s("earth") => random_postings(rng, 8000), + s("2021") => random_postings(rng, 100), + s("2020") => random_postings(rng, 500), + s("is") => random_postings(rng, 50_000), + s("this") => random_postings(rng, 50_000), + s("good") => random_postings(rng, 1250), + s("morning") => random_postings(rng, 125), + }; + + let word_prefix_docids = hashmap!{ + s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], + s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], + s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], + }; + + let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")]; + let hello_world_split = (hello_world.len() / 2) as usize; + let hello_world_1 = hello_world.iter().take(hello_world_split).collect(); + let hello_world_2 = hello_world.iter().skip(hello_world_split).collect(); + + let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")]; + let hello_word_split = (hello_word.len() / 2) as usize; + let hello_word_4 = hello_word.iter().take(hello_word_split).collect(); + let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect(); + let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect(); + let word_pair_proximity_docids = hashmap!{ + (s("good"), s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")], + (s("hello"), s("world"), 1) => hello_world_1, + (s("hello"), s("world"), 4) => hello_world_2, + (s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")], + (s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], + (s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), + (s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], + (s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), + (s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")], + (s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")], + (s("hello"), s("word"), 4) => hello_word_4, + (s("hello"), s("word"), 6) => hello_word_6, + (s("hello"), s("word"), 7) => hello_word_7, + (s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")], + (s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], + (s("this"), s("ngrams"), 1) => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")], + (s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], + }; + + let word_prefix_pair_proximity_docids = hashmap!{ + (s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 1)).unwrap().clone(), + (s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(), + (s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(), + (s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(), + (s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(), + (s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(), + }; + + let mut keys = word_docids.keys().collect::>(); + keys.sort_unstable(); + let words_fst = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap(); + + TestContext { + words_fst, + word_docids, + word_prefix_docids, + word_pair_proximity_docids, + word_prefix_pair_proximity_docids, + } + } + } +} diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 75f3f5666..d9a5f8aa6 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -328,3 +328,153 @@ fn resolve_candidates<'t>( resolve_operation(ctx, query_tree, number_typos, cache) } + +#[cfg(test)] +mod test { + + use super::*; + use super::super::test::TestContext; + + #[test] + fn initial_placeholder_no_facets() { + let context = TestContext::default(); + let query_tree = None; + let facet_candidates = None; + + let mut criteria = Typo::initial(&context, query_tree, facet_candidates).unwrap(); + + assert!(criteria.next().unwrap().is_none()); + } + + #[test] + fn initial_query_tree_no_facets() { + let context = TestContext::default(); + let query_tree = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), + ]) + ]); + + let facet_candidates = None; + + let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates).unwrap(); + + let candidates_1 = context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("world").unwrap().unwrap(); + let expected_1 = CriterionResult { + query_tree: Some(Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), + ]), + ])), + candidates: candidates_1.clone(), + bucket_candidates: Some(candidates_1), + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_1)); + + let candidates_2 = ( + context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("word").unwrap().unwrap() + ) - context.word_docids("world").unwrap().unwrap(); + let expected_2 = CriterionResult { + query_tree: Some(Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), + ]), + ]), + ])), + candidates: candidates_2.clone(), + bucket_candidates: Some(candidates_2), + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_2)); + } + + #[test] + fn initial_placeholder_with_facets() { + let context = TestContext::default(); + let query_tree = None; + let facet_candidates = context.word_docids("earth").unwrap(); + + let mut criteria = Typo::initial(&context, query_tree, facet_candidates.clone()).unwrap(); + + let expected = CriterionResult { + query_tree: None, + candidates: facet_candidates.clone().unwrap(), + bucket_candidates: facet_candidates, + }; + + // first iteration, returns the facet candidates + assert_eq!(criteria.next().unwrap(), Some(expected)); + + // second iteration, returns None because there is no more things to do + assert!(criteria.next().unwrap().is_none()); + } + + #[test] + fn initial_query_tree_with_facets() { + let context = TestContext::default(); + let query_tree = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), + ]) + ]); + + let facet_candidates = context.word_docids("earth").unwrap().unwrap(); + + let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())).unwrap(); + + let candidates_1 = context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("world").unwrap().unwrap(); + let expected_1 = CriterionResult { + query_tree: Some(Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), + ]), + ])), + candidates: &candidates_1 & &facet_candidates, + bucket_candidates: Some(candidates_1 & &facet_candidates), + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_1)); + + let candidates_2 = ( + context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("word").unwrap().unwrap() + ) - context.word_docids("world").unwrap().unwrap(); + let expected_2 = CriterionResult { + query_tree: Some(Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), + ]), + ]), + ])), + candidates: &candidates_2 & &facet_candidates, + bucket_candidates: Some(candidates_2 & &facet_candidates), + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_2)); + } + +} diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 02f6dc0c8..715a4864e 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -94,11 +94,15 @@ pub enum QueryKind { } impl QueryKind { - fn exact(word: String) -> Self { + pub fn exact(word: String) -> Self { QueryKind::Exact { original_typo: 0, word } } - fn tolerant(typo: u8, word: String) -> Self { + pub fn exact_with_typo(original_typo: u8, word: String) -> Self { + QueryKind::Exact { original_typo, word } + } + + pub fn tolerant(typo: u8, word: String) -> Self { QueryKind::Tolerant { typo, word } } From 64688b3786561804ec25c0a69a402d64e5456863 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 14:59:40 +0100 Subject: [PATCH 21/50] fix query tree builder --- milli/src/search/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 6046cc8d2..1e3e8eefe 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -66,12 +66,12 @@ impl<'a> Search<'a> { let before = Instant::now(); let query_tree = match self.query.as_ref() { Some(query) => { - let builder = QueryTreeBuilder::new(self.rtxn, self.index); + let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); let stop_words = &Set::default(); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.build(false, true, tokens) + builder.optional_words(false).build(tokens) }, None => None, }; From d92ad5640ae188ace90517687fedea5041c48bc1 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 16:14:38 +0100 Subject: [PATCH 22/50] remove option on bucket_candidates --- milli/src/search/criteria/mod.rs | 2 +- milli/src/search/criteria/typo.rs | 30 +++++++++++++++--------------- milli/src/search/mod.rs | 4 +--- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 77d92f6ea..e7549cae1 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -21,7 +21,7 @@ pub struct CriterionResult { /// The candidates that this criterion is allowed to return subsets of. pub candidates: RoaringBitmap, /// Candidates that comes from the current bucket of the initial criterion. - pub bucket_candidates: Option, + pub bucket_candidates: RoaringBitmap, } /// Either a set of candidates that defines the candidates diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index d9a5f8aa6..a9d3b5d96 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -12,7 +12,7 @@ pub struct Typo<'t> { query_tree: Option<(usize, Operation)>, number_typos: u8, candidates: Candidates, - bucket_candidates: Option, + bucket_candidates: RoaringBitmap, parent: Option>, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>, @@ -30,7 +30,7 @@ impl<'t> Typo<'t> { query_tree: query_tree.map(|op| (maximum_typo(&op), op)), number_typos: 0, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::new(), typo_cache: HashMap::new(), @@ -47,7 +47,7 @@ impl<'t> Typo<'t> { query_tree: None, number_typos: 0, candidates: Candidates::default(), - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::new(), typo_cache: HashMap::new(), @@ -85,8 +85,8 @@ impl<'t> Criterion for Typo<'t> { self.number_typos += 1; let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), }; return Ok(Some(CriterionResult { @@ -117,8 +117,8 @@ impl<'t> Criterion for Typo<'t> { self.number_typos += 1; let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), }; return Ok(Some(CriterionResult { @@ -133,7 +133,7 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some(CriterionResult { query_tree: None, candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, (None, Forbidden(_)) => { @@ -373,7 +373,7 @@ mod test { ]), ])), candidates: candidates_1.clone(), - bucket_candidates: Some(candidates_1), + bucket_candidates: candidates_1, }; assert_eq!(criteria.next().unwrap(), Some(expected_1)); @@ -395,7 +395,7 @@ mod test { ]), ])), candidates: candidates_2.clone(), - bucket_candidates: Some(candidates_2), + bucket_candidates: candidates_2, }; assert_eq!(criteria.next().unwrap(), Some(expected_2)); @@ -405,13 +405,13 @@ mod test { fn initial_placeholder_with_facets() { let context = TestContext::default(); let query_tree = None; - let facet_candidates = context.word_docids("earth").unwrap(); + let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criteria = Typo::initial(&context, query_tree, facet_candidates.clone()).unwrap(); + let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())).unwrap(); let expected = CriterionResult { query_tree: None, - candidates: facet_candidates.clone().unwrap(), + candidates: facet_candidates.clone(), bucket_candidates: facet_candidates, }; @@ -449,7 +449,7 @@ mod test { ]), ])), candidates: &candidates_1 & &facet_candidates, - bucket_candidates: Some(candidates_1 & &facet_candidates), + bucket_candidates: candidates_1 & &facet_candidates, }; assert_eq!(criteria.next().unwrap(), Some(expected_1)); @@ -471,7 +471,7 @@ mod test { ]), ])), candidates: &candidates_2 & &facet_candidates, - bucket_candidates: Some(candidates_2 & &facet_candidates), + bucket_candidates: candidates_2 & &facet_candidates, }; assert_eq!(criteria.next().unwrap(), Some(expected_2)); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 1e3e8eefe..7a8bbdc09 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -100,9 +100,7 @@ impl<'a> Search<'a> { let mut len = candidates.len() as usize; let mut candidates = candidates.into_iter(); - if let Some(docids) = bucket_candidates { - initial_candidates.union_with(&docids); - } + initial_candidates.union_with(&bucket_candidates); if offset != 0 { candidates.by_ref().skip(offset).for_each(drop); From 2d068bd45b61da527c1bf12a866b58ba5d895222 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 15:27:35 +0100 Subject: [PATCH 23/50] implement Context trait for criteria --- milli/src/search/criteria/typo.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a9d3b5d96..659292619 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -181,6 +181,7 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, typo_cache)) }, Operation::Query(q) => { + // TODO may be optimized when number_typos == 0 if let QueryKind::Tolerant { typo, word } = &q.kind { // if no typo is allowed we don't call word_derivations function, // and directly create an Exact query From 1e47f9b3ffa7f55071274d95d542bb183c7862cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Feb 2021 15:32:14 +0100 Subject: [PATCH 24/50] Introduce the Words criterion --- milli/src/search/criteria/mod.rs | 1 + milli/src/search/criteria/typo.rs | 4 +- milli/src/search/criteria/words.rs | 109 +++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 milli/src/search/criteria/words.rs diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index e7549cae1..5cc803dee 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -8,6 +8,7 @@ use roaring::RoaringBitmap; use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; +pub mod words; pub trait Criterion { fn next(&mut self) -> anyhow::Result>; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 659292619..a62616f08 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -13,7 +13,7 @@ pub struct Typo<'t> { number_typos: u8, candidates: Candidates, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Option>, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>, } @@ -39,7 +39,7 @@ impl<'t> Typo<'t> { pub fn new( ctx: &'t dyn Context, - parent: Box, + parent: Box, ) -> anyhow::Result where Self: Sized { Ok(Typo { diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs new file mode 100644 index 000000000..ade370fda --- /dev/null +++ b/milli/src/search/criteria/words.rs @@ -0,0 +1,109 @@ +use std::collections::HashMap; +use std::mem::take; + +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use super::{Candidates, Criterion, CriterionResult, Context}; + +pub struct Words<'t> { + ctx: &'t dyn Context, + query_trees: Vec, + candidates: Candidates, + bucket_candidates: RoaringBitmap, + parent: Option>, + candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, +} + +impl<'t> Words<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> anyhow::Result where Self: Sized + { + Ok(Words { + ctx, + query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + bucket_candidates: RoaringBitmap::new(), + parent: None, + candidates_cache: HashMap::default(), + }) + } + + pub fn new( + ctx: &'t dyn Context, + parent: Box, + ) -> anyhow::Result where Self: Sized + { + Ok(Words { + ctx, + query_trees: Vec::default(), + candidates: Candidates::default(), + bucket_candidates: RoaringBitmap::new(), + parent: Some(parent), + candidates_cache: HashMap::default(), + }) + } +} + +impl<'t> Criterion for Words<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + + loop { + match (self.query_trees.pop(), &mut self.candidates) { + (_, Allowed(candidates)) if candidates.is_empty() => { + self.query_trees = Vec::new(); + self.candidates = Candidates::default(); + }, + (Some(qt), Allowed(candidates)) => { + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(qt), + candidates: candidates.clone(), + bucket_candidates, + })); + }, + (Some(_qt), Forbidden(candidates)) => { + todo!() + }, + (None, Allowed(_)) => { + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: None, + candidates: candidates.clone(), + bucket_candidates: candidates, + })); + }, + (None, Forbidden(_)) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); + self.candidates = Candidates::Allowed(candidates); + self.bucket_candidates = bucket_candidates; + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } + } +} + +fn explode_query_tree(query_tree: Operation) -> Vec { + match query_tree { + Operation::Or(true, ops) => ops, + otherwise => vec![otherwise], + } +} From e174ccbd8e20a6b357430cd876b56ad7fd67420c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Feb 2021 17:22:43 +0100 Subject: [PATCH 25/50] Use the words criterion in the search module --- milli/src/search/criteria/words.rs | 2 +- milli/src/search/mod.rs | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index ade370fda..bf3aa8b12 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -70,7 +70,7 @@ impl<'t> Criterion for Words<'t> { bucket_candidates, })); }, - (Some(_qt), Forbidden(candidates)) => { + (Some(_qt), Forbidden(_candidates)) => { todo!() }, (None, Allowed(_)) => { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7a8bbdc09..2a726f635 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,8 +11,8 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::typo::Typo; -use crate::{Index, DocumentId}; +use crate::search::criteria::{typo::Typo, words::Words}; +use crate::{Index, FieldId, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetIter}; @@ -71,7 +71,7 @@ impl<'a> Search<'a> { let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.optional_words(false).build(tokens) + builder.build(tokens) }, None => None, }; @@ -89,7 +89,8 @@ impl<'a> Search<'a> { // We aretesting the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; - let mut criteria = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; + let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; + let mut criteria = Words::new(&criteria_ctx, Box::new(typo_criterion))?; let mut offset = self.offset; let mut limit = self.limit; From ef381e17bbd4647ae9894f6d56f5db87fce2f861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Feb 2021 11:20:42 +0100 Subject: [PATCH 26/50] Compute the candidates for each sub query tree --- milli/src/search/criteria/words.rs | 77 +++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index bf3aa8b12..93298b64e 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -1,10 +1,11 @@ use std::collections::HashMap; use std::mem::take; +use anyhow::bail; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use super::{Candidates, Criterion, CriterionResult, Context}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; pub struct Words<'t> { ctx: &'t dyn Context, @@ -64,9 +65,13 @@ impl<'t> Criterion for Words<'t> { None => candidates.clone(), }; + let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; + found_candidates.intersect_with(&candidates); + candidates.difference_with(&found_candidates); + return Ok(Some(CriterionResult { query_tree: Some(qt), - candidates: candidates.clone(), + candidates: found_candidates, bucket_candidates, })); }, @@ -107,3 +112,71 @@ fn explode_query_tree(query_tree: Operation) -> Vec { otherwise => vec![otherwise], } } + +fn resolve_candidates<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, +) -> anyhow::Result +{ + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + ) -> anyhow::Result + { + use Operation::{And, Consecutive, Or, Query}; + + match query_tree { + And(ops) => { + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for op in ops { + let docids = resolve_operation(ctx, op, cache)?; + if first_loop { + candidates = docids; + first_loop = false; + } else { + candidates.intersect_with(&docids); + } + } + Ok(candidates) + }, + Consecutive(ops) => { + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for slice in ops.windows(2) { + match (&slice[0], &slice[1]) { + (Operation::Query(left), Operation::Query(right)) => { + match query_pair_proximity_docids(ctx, left, right, 1)? { + pair_docids if pair_docids.is_empty() => { + return Ok(RoaringBitmap::new()) + }, + pair_docids if first_loop => { + candidates = pair_docids; + first_loop = false; + }, + pair_docids => { + candidates.intersect_with(&pair_docids); + }, + } + }, + _ => bail!("invalid consecutive query type"), + } + } + Ok(candidates) + }, + Or(_, ops) => { + let mut candidates = RoaringBitmap::new(); + for op in ops { + let docids = resolve_operation(ctx, op, cache)?; + candidates.union_with(&docids); + } + Ok(candidates) + }, + Query(q) => Ok(query_docids(ctx, q)?), + } + } + + resolve_operation(ctx, query_tree, cache) +} From 3415812b06291f5ee7504513f76dbcf411823c26 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 24 Feb 2021 14:48:12 +0100 Subject: [PATCH 27/50] Imrpove the intersection speed in the words criterion --- milli/src/search/criteria/words.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 93298b64e..cf1668055 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -129,10 +129,15 @@ fn resolve_candidates<'t>( match query_tree { And(ops) => { + let mut ops = ops.iter().map(|op| { + resolve_operation(ctx, op, cache) + }).collect::>>()?; + + ops.sort_unstable_by_key(|cds| cds.len()); + let mut candidates = RoaringBitmap::new(); let mut first_loop = true; - for op in ops { - let docids = resolve_operation(ctx, op, cache)?; + for docids in ops { if first_loop { candidates = docids; first_loop = false; From b5b7ec0162bd5c3613f3281cdb29becb1c792758 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 24 Feb 2021 15:59:19 +0100 Subject: [PATCH 28/50] implement initial state for words criterion --- milli/src/search/criteria/words.rs | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index cf1668055..bd03ecf97 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -60,23 +60,36 @@ impl<'t> Criterion for Words<'t> { self.candidates = Candidates::default(); }, (Some(qt), Allowed(candidates)) => { - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => candidates.clone(), - }; - let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => found_candidates.clone(), + }; + return Ok(Some(CriterionResult { query_tree: Some(qt), candidates: found_candidates, bucket_candidates, })); }, - (Some(_qt), Forbidden(_candidates)) => { - todo!() + (Some(qt), Forbidden(candidates)) => { + let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; + found_candidates.difference_with(&candidates); + candidates.union_with(&found_candidates); + + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => found_candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(qt), + candidates: found_candidates, + bucket_candidates, + })); }, (None, Allowed(_)) => { let candidates = take(&mut self.candidates).into_inner(); From 14f9f85c4b2d30ff79ab31562ddade10c571d849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Feb 2021 15:45:15 +0100 Subject: [PATCH 29/50] Introduce the AscDesc criterion --- milli/src/search/criteria/asc_desc.rs | 272 ++++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 2 + milli/src/search/mod.rs | 15 +- 3 files changed, 285 insertions(+), 4 deletions(-) create mode 100644 milli/src/search/criteria/asc_desc.rs diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs new file mode 100644 index 000000000..bf75ada7e --- /dev/null +++ b/milli/src/search/criteria/asc_desc.rs @@ -0,0 +1,272 @@ +use std::mem::take; + +use anyhow::bail; +use itertools::Itertools; +use ordered_float::OrderedFloat; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; +use crate::heed_codec::facet::{FieldDocIdFacetI64Codec, FieldDocIdFacetF64Codec}; +use crate::search::facet::FacetIter; +use crate::search::query_tree::Operation; +use crate::{FieldId, Index}; +use super::{Candidates, Criterion, CriterionResult}; + +pub struct AscDesc<'t> { + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + query_tree: Option, + candidates: Candidates, + bucket_candidates: Option, + faceted_candidates: RoaringBitmap, + parent: Option>, +} + +impl<'t> AscDesc<'t> { + pub fn initial_asc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + query_tree: Option, + candidates: Option, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result where Self: Sized + { + Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, true) + } + + pub fn initial_desc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + query_tree: Option, + candidates: Option, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result where Self: Sized + { + Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, false) + } + + pub fn asc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result where Self: Sized + { + Self::new(index, rtxn, parent, field_id, facet_type, true) + } + + pub fn desc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result where Self: Sized + { + Self::new(index, rtxn, parent, field_id, facet_type, false) + } + + fn initial( + index: &'t Index, + rtxn: &'t heed::RoTxn, + query_tree: Option, + candidates: Option, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + ) -> anyhow::Result where Self: Sized + { + Ok(AscDesc { + index, + rtxn, + field_id, + facet_type, + ascending, + query_tree, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, + bucket_candidates: None, + parent: None, + }) + } + + fn new( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + ) -> anyhow::Result where Self: Sized + { + Ok(AscDesc { + index, + rtxn, + field_id, + facet_type, + ascending, + query_tree: None, + candidates: Candidates::default(), + faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, + bucket_candidates: None, + parent: Some(parent), + }) + } +} + +impl<'t> Criterion for AscDesc<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + + loop { + match (&mut self.query_tree, &mut self.candidates) { + (_, Allowed(candidates)) if candidates.is_empty() => { + self.query_tree = None; + self.candidates = Candidates::default(); + }, + (Some(qt), Allowed(candidates)) => { + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(candidates.clone()), + }; + + let mut found_candidates = facet_ordered( + self.index, + self.rtxn, + self.field_id, + self.facet_type, + self.ascending, + candidates.clone(), + )?; + + found_candidates.intersect_with(&candidates); + candidates.difference_with(&found_candidates); + + return Ok(Some(CriterionResult { + query_tree: Some(qt.clone()), + candidates: found_candidates, + bucket_candidates, + })); + }, + (Some(_qt), Forbidden(_candidates)) => { + todo!() + }, + (None, Allowed(_)) => { + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: None, + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + (None, Forbidden(_)) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { + self.query_tree = query_tree; + candidates.intersect_with(&self.faceted_candidates); + self.candidates = Candidates::Allowed(candidates); + self.bucket_candidates = bucket_candidates; + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } + } +} + +fn facet_ordered( + index: &Index, + rtxn: &heed::RoTxn, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + candidates: RoaringBitmap, +) -> anyhow::Result +{ + match facet_type { + FacetType::Float => { + if candidates.len() <= 1000 { + let db = index.field_id_docid_facet_values.remap_key_type::(); + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, f64::MIN); + let right = (field_id, docid, f64::MAX); + let mut iter = db.range(rtxn, &(left..=right))?; + let entry = if ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), ())) = entry.transpose()? { + docids_values.push((docid, OrderedFloat(value))); + } + } + docids_values.sort_unstable_by_key(|(_, value)| *value); + let iter = docids_values.into_iter(); + let iter = if ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + match iter.group_by(|(_, v)| *v).into_iter().next() { + Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()), + None => Ok(RoaringBitmap::new()) + } + } else { + let facet_fn = if ascending { + FacetIter::::new_reducing + } else { + FacetIter::::new_reverse_reducing + }; + + let mut iter = facet_fn(rtxn, index, field_id, candidates)?; + Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default()) + } + }, + FacetType::Integer => { + if candidates.len() <= 1000 { + let db = index.field_id_docid_facet_values.remap_key_type::(); + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, i64::MIN); + let right = (field_id, docid, i64::MAX); + let mut iter = db.range(rtxn, &(left..=right))?; + let entry = if ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), ())) = entry.transpose()? { + docids_values.push((docid, value)); + } + } + docids_values.sort_unstable_by_key(|(_, value)| *value); + let iter = docids_values.into_iter(); + let iter = if ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + match iter.group_by(|(_, v)| *v).into_iter().next() { + Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()), + None => Ok(RoaringBitmap::new()) + } + } else { + let facet_fn = if ascending { + FacetIter::::new_reducing + } else { + FacetIter::::new_reverse_reducing + }; + + let mut iter = facet_fn(rtxn, index, field_id, candidates)?; + Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default()) + } + }, + FacetType::String => bail!("criteria facet type must be a number"), + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 5cc803dee..34d06dce3 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -9,6 +9,7 @@ use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; pub mod words; +pub mod asc_desc; pub trait Criterion { fn next(&mut self) -> anyhow::Result>; @@ -28,6 +29,7 @@ pub struct CriterionResult { /// Either a set of candidates that defines the candidates /// that are allowed to be returned, /// or the candidates that must never be returned. +#[derive(Debug)] enum Candidates { Allowed(RoaringBitmap), Forbidden(RoaringBitmap) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 2a726f635..93cac34b6 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,11 +11,11 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::{typo::Typo, words::Words}; -use crate::{Index, FieldId, DocumentId}; +use crate::search::criteria::{typo::Typo, words::Words, asc_desc::AscDesc}; +use crate::{Index, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; -pub use self::facet::{FacetIter}; +pub use self::facet::FacetIter; use self::query_tree::QueryTreeBuilder; // Building these factories is not free. @@ -90,7 +90,14 @@ impl<'a> Search<'a> { // We aretesting the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; - let mut criteria = Words::new(&criteria_ctx, Box::new(typo_criterion))?; + let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?; + + // We sort in descending order on a specific field *by hand*, don't do that at home. + let attr_name = "released-timestamp"; + let fid = self.index.fields_ids_map(self.rtxn)?.id(attr_name).unwrap(); + let ftype = *self.index.faceted_fields(self.rtxn)?.get(attr_name).unwrap(); + let desc_criterion = AscDesc::desc(self.index, self.rtxn, Box::new(words_criterion), fid, ftype)?; + let mut criteria = desc_criterion; let mut offset = self.offset; let mut limit = self.limit; From 3d731cc861a1799e0b14288443a2e7668d7bb83b Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 16:47:34 +0100 Subject: [PATCH 30/50] remove option on bucket_candidates --- milli/src/search/criteria/asc_desc.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index bf75ada7e..3d32bd845 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -21,7 +21,7 @@ pub struct AscDesc<'t> { ascending: bool, query_tree: Option, candidates: Candidates, - bucket_candidates: Option, + bucket_candidates: RoaringBitmap, faceted_candidates: RoaringBitmap, parent: Option>, } @@ -92,7 +92,7 @@ impl<'t> AscDesc<'t> { query_tree, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: None, }) } @@ -115,7 +115,7 @@ impl<'t> AscDesc<'t> { query_tree: None, candidates: Candidates::default(), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: Some(parent), }) } @@ -133,8 +133,8 @@ impl<'t> Criterion for AscDesc<'t> { }, (Some(qt), Allowed(candidates)) => { let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(candidates.clone()), + Some(_) => take(&mut self.bucket_candidates), + None => candidates.clone(), }; let mut found_candidates = facet_ordered( @@ -163,7 +163,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree: None, candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, (None, Forbidden(_)) => { From 22b84fe543c5f0d34d80a39db22e4c564f410378 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 12:12:35 +0100 Subject: [PATCH 31/50] Use the words criterion in the search module --- milli/src/search/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 93cac34b6..51f81f540 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -66,12 +66,12 @@ impl<'a> Search<'a> { let before = Instant::now(); let query_tree = match self.query.as_ref() { Some(query) => { - let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); + let builder = QueryTreeBuilder::new(self.rtxn, self.index); let stop_words = &Set::default(); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.build(tokens) + builder.build(tokens)? }, None => None, }; From 9bc9b366450b76861156b082205a950f58cc7f66 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 22 Feb 2021 17:17:01 +0100 Subject: [PATCH 32/50] Introduce the Proximity criterion --- milli/src/search/criteria/mod.rs | 1 + milli/src/search/criteria/proximity.rs | 283 +++++++++++++++++++++++++ milli/src/search/mod.rs | 18 +- milli/src/search/query_tree.rs | 7 + 4 files changed, 301 insertions(+), 8 deletions(-) create mode 100644 milli/src/search/criteria/proximity.rs diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 34d06dce3..41cd6722c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -10,6 +10,7 @@ use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; pub mod words; pub mod asc_desc; +pub mod proximity; pub trait Criterion { fn next(&mut self) -> anyhow::Result>; diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs new file mode 100644 index 000000000..2a46cf5d0 --- /dev/null +++ b/milli/src/search/criteria/proximity.rs @@ -0,0 +1,283 @@ +use std::collections::HashMap; +use std::mem::take; + +use roaring::RoaringBitmap; + +use crate::search::query_tree::{maximum_proximity, Operation, Query}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; + +pub struct Proximity<'t> { + ctx: &'t dyn Context, + query_tree: Option<(usize, Operation)>, + proximity: u8, + candidates: Candidates, + bucket_candidates: Option, + parent: Option>, + candidates_cache: HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, +} + +impl<'t> Proximity<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> anyhow::Result where Self: Sized + { + Ok(Proximity { + ctx, + query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), + proximity: 0, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + bucket_candidates: None, + parent: None, + candidates_cache: HashMap::new(), + }) + } + + pub fn new( + ctx: &'t dyn Context, + parent: Box, + ) -> anyhow::Result where Self: Sized + { + Ok(Proximity { + ctx, + query_tree: None, + proximity: 0, + candidates: Candidates::default(), + bucket_candidates: None, + parent: Some(parent), + candidates_cache: HashMap::new(), + }) + } +} + +impl<'t> Criterion for Proximity<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + loop { + match (&mut self.query_tree, &mut self.candidates) { + (_, Allowed(candidates)) if candidates.is_empty() => { + self.query_tree = None; + self.candidates = Candidates::default(); + }, + (Some((max_prox, query_tree)), Allowed(candidates)) => { + if self.proximity as usize > *max_prox { + self.query_tree = None; + self.candidates = Candidates::default(); + } else { + let mut new_candidates = resolve_candidates( + self.ctx, + &query_tree, + self.proximity, + &mut self.candidates_cache, + )?; + + new_candidates.intersect_with(&candidates); + candidates.difference_with(&new_candidates); + self.proximity += 1; + + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(query_tree.clone()), + candidates: new_candidates, + bucket_candidates, + })); + } + }, + (Some((max_prox, query_tree)), Forbidden(candidates)) => { + if self.proximity as usize > *max_prox { + self.query_tree = None; + self.candidates = Candidates::default(); + } else { + let mut new_candidates = resolve_candidates( + self.ctx, + &query_tree, + self.proximity, + &mut self.candidates_cache, + )?; + + new_candidates.difference_with(&candidates); + candidates.union_with(&new_candidates); + self.proximity += 1; + + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(query_tree.clone()), + candidates: new_candidates, + bucket_candidates, + })); + } + }, + (None, Allowed(_)) => { + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: None, + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + (None, Forbidden(_)) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); + self.proximity = 0; + self.candidates = Candidates::Allowed(candidates); + self.bucket_candidates = bucket_candidates; + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } + } +} + +fn resolve_candidates<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, +) -> anyhow::Result +{ + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + ) -> anyhow::Result> + { + use Operation::{And, Consecutive, Or, Query}; + + let result = match query_tree { + And(ops) => mdfs(ctx, ops, proximity, cache)?, + Consecutive(ops) => if proximity == 0 { + mdfs(ctx, ops, 0, cache)? + } else { + Default::default() + }, + Or(_, ops) => { + let mut output = Vec::new(); + for op in ops { + let result = resolve_operation(ctx, op, proximity, cache)?; + output.extend(result); + } + output + }, + Query(q) => if proximity == 0 { + let candidates = query_docids(ctx, q)?; + vec![(q.clone(), q.clone(), candidates)] + } else { + Default::default() + }, + }; + + Ok(result) + } + + fn mdfs_pair<'t>( + ctx: &'t dyn Context, + left: &Operation, + right: &Operation, + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + ) -> anyhow::Result> + { + fn pair_combinations(mana: u8) -> impl Iterator { + (0..=mana).map(move |m| (mana - m, m)) + } + + let mut output = Vec::new(); + + for (pair_p, left_right_p) in pair_combinations(proximity) { + for (left_p, right_p) in pair_combinations(left_right_p) { + let left_key = (left.clone(), left_p); + if !cache.contains_key(&left_key) { + let candidates = resolve_operation(ctx, left, left_p, cache)?; + cache.insert(left_key.clone(), candidates); + } + + let right_key = (right.clone(), right_p); + if !cache.contains_key(&right_key) { + let candidates = resolve_operation(ctx, right, right_p, cache)?; + cache.insert(right_key.clone(), candidates); + } + + let lefts = cache.get(&left_key).unwrap(); + let rights = cache.get(&right_key).unwrap(); + + for (ll, lr, lcandidates) in lefts { + for (rl, rr, rcandidates) in rights { + let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1)?; + if lcandidates.len() < rcandidates.len() { + candidates.intersect_with(lcandidates); + candidates.intersect_with(rcandidates); + } else { + candidates.intersect_with(rcandidates); + candidates.intersect_with(lcandidates); + } + if !candidates.is_empty() { + output.push((ll.clone(), rr.clone(), candidates)); + } + } + } + } + } + + Ok(output) + } + + fn mdfs<'t>( + ctx: &'t dyn Context, + branches: &[Operation], + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + ) -> anyhow::Result> + { + // Extract the first two elements but gives the tail + // that is just after the first element. + let next = branches.split_first().map(|(h1, t)| { + (h1, t.split_first().map(|(h2, _)| (h2, t))) + }); + + match next { + Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache), + Some((head1, Some((head2, tail)))) => { + let mut output = Vec::new(); + for p in 0..=proximity { + for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache)? { + if !head_candidates.is_empty() { + for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache)? { + candidates.intersect_with(&head_candidates); + if !candidates.is_empty() { + output.push((lhead.clone(), rtail, candidates)); + } + } + } + } + } + Ok(output) + }, + Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache), + None => return Ok(Default::default()), + } + } + + let mut candidates = RoaringBitmap::new(); + for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache)? { + candidates.union_with(&cds); + } + Ok(candidates) +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 51f81f540..aced8bbd1 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,7 +11,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::{typo::Typo, words::Words, asc_desc::AscDesc}; +use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity}; use crate::{Index, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; @@ -87,17 +87,19 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); - // We aretesting the typo criteria but there will be more of them soon. + // We are testing the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?; + let proximity_criterion = Proximity::new(&criteria_ctx, Box::new(words_criterion))?; + // let proximity_criterion = Proximity::initial(&criteria_ctx, query_tree, facet_candidates)?; + let mut criteria = proximity_criterion; - // We sort in descending order on a specific field *by hand*, don't do that at home. - let attr_name = "released-timestamp"; - let fid = self.index.fields_ids_map(self.rtxn)?.id(attr_name).unwrap(); - let ftype = *self.index.faceted_fields(self.rtxn)?.get(attr_name).unwrap(); - let desc_criterion = AscDesc::desc(self.index, self.rtxn, Box::new(words_criterion), fid, ftype)?; - let mut criteria = desc_criterion; + // // We sort in descending order on a specific field *by hand*, don't do that at home. + // let attr_name = "released-timestamp"; + // let fid = self.index.fields_ids_map(self.rtxn)?.id(attr_name).unwrap(); + // let ftype = *self.index.faceted_fields(self.rtxn)?.get(attr_name).unwrap(); + // let desc_criterion = AscDesc::desc(self.index, self.rtxn, Box::new(words_criterion), fid, ftype)?; let mut offset = self.offset; let mut limit = self.limit; diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 715a4864e..59f7802f3 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -79,6 +79,13 @@ impl Operation { Self::Consecutive(ops) } } + + pub fn query(&self) -> Option<&Query> { + match self { + Operation::Query(query) => Some(query), + _ => None, + } + } } #[derive(Clone, Eq, PartialEq, Hash)] From ae4a237e580a38b930c8422297b275cb08819aac Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 24 Feb 2021 15:36:57 +0100 Subject: [PATCH 33/50] Fix the maximum_proximity function --- milli/src/search/criteria/mod.rs | 8 +++++++- milli/src/search/query_tree.rs | 5 ++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 41cd6722c..52367ac5f 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -153,8 +153,14 @@ fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result anyhow::Result { - let prefix = right.prefix; + if proximity >= 8 { + let mut candidates = query_docids(ctx, left)?; + let right_candidates = query_docids(ctx, right)?; + candidates.intersect_with(&right_candidates); + return Ok(candidates); + } + let prefix = right.prefix; match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix && ctx.in_prefix_cache(&right) { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 59f7802f3..47057ad10 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -546,7 +546,10 @@ pub fn maximum_proximity(operation: &Operation) -> usize { use Operation::{Or, And, Query, Consecutive}; match operation { Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), - And(ops) => ops.len().saturating_sub(1) * 8, + And(ops) => { + ops.iter().map(maximum_proximity).sum::() + + ops.len().saturating_sub(1) * 7 + }, Query(_) | Consecutive(_) => 0, } } From 4510bbcccac375409e18dc5170cc0238118bffb8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 24 Feb 2021 15:37:37 +0100 Subject: [PATCH 34/50] Add a lot of debug --- milli/src/search/criteria/proximity.rs | 7 +++++++ milli/src/search/criteria/typo.rs | 3 +++ milli/src/search/criteria/words.rs | 4 +++- milli/src/search/mod.rs | 2 ++ 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 2a46cf5d0..aab46a6a2 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use std::mem::take; use roaring::RoaringBitmap; +use log::debug; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; @@ -55,6 +56,12 @@ impl<'t> Criterion for Proximity<'t> { fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { + debug!("Proximity at iteration {} (max {:?}) ({:?})", + self.proximity, + self.query_tree.as_ref().map(|(mp, _)| mp), + self.candidates, + ); + match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { self.query_tree = None; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a62616f08..a48b074cc 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -2,6 +2,7 @@ use std::{borrow::Cow, collections::HashMap, mem::take}; use anyhow::bail; use roaring::RoaringBitmap; +use log::debug; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::word_derivations; @@ -59,6 +60,8 @@ impl<'t> Criterion for Typo<'t> { fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { + debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); + match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { self.query_tree = None; diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index bd03ecf97..3b0ecd54a 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use std::mem::take; use anyhow::bail; +use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; @@ -52,8 +53,9 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; - loop { + debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); + match (self.query_trees.pop(), &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { self.query_trees = Vec::new(); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index aced8bbd1..f3d5af2da 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -107,6 +107,8 @@ impl<'a> Search<'a> { let mut initial_candidates = RoaringBitmap::new(); while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next()? { + debug!("Number of candidates found {}", candidates.len()); + let mut len = candidates.len() as usize; let mut candidates = candidates.into_iter(); From 5af63c74e04251461dd022836a3e4f38ca3df52d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 24 Feb 2021 17:44:35 +0100 Subject: [PATCH 35/50] Speed-up the MatchingWords highlighting struct --- http-ui/src/main.rs | 18 ++--- milli/src/lib.rs | 2 +- milli/src/search/criteria/typo.rs | 2 +- milli/src/search/mod.rs | 51 +++++++++----- milli/src/search/query_tree.rs | 111 +++++++++++++----------------- 5 files changed, 91 insertions(+), 93 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 2ce7f8bd1..86f965368 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -32,7 +32,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use milli::facet::FacetValue; use milli::update::UpdateIndexingStep::*; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; -use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; +use milli::{obkv_to_json, Index, UpdateStore, SearchResult, MatchingWords, FacetCondition}; static GLOBAL_THREAD_POOL: OnceCell = OnceCell::new(); @@ -132,7 +132,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { Self { analyzer } } - fn highlight_value(&self, value: Value, words_to_highlight: &HashSet) -> Value { + fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value { match value { Value::Null => Value::Null, Value::Bool(boolean) => Value::Bool(boolean), @@ -142,7 +142,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed = self.analyzer.analyze(&old_string); for (word, token) in analyzed.reconstruct() { if token.is_word() { - let to_highlight = words_to_highlight.contains(token.text()); + let to_highlight = matching_words.matches(token.text()); if to_highlight { string.push_str("") } string.push_str(word); if to_highlight { string.push_str("") } @@ -154,12 +154,12 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { }, Value::Array(values) => { Value::Array(values.into_iter() - .map(|v| self.highlight_value(v, words_to_highlight)) + .map(|v| self.highlight_value(v, matching_words)) .collect()) }, Value::Object(object) => { Value::Object(object.into_iter() - .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight))) + .map(|(k, v)| (k, self.highlight_value(v, matching_words))) .collect()) }, } @@ -168,14 +168,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { fn highlight_record( &self, object: &mut Map, - words_to_highlight: &HashSet, + matching_words: &MatchingWords, attributes_to_highlight: &HashSet, ) { // TODO do we need to create a string for element that are not and needs to be highlight? for (key, value) in object.iter_mut() { if attributes_to_highlight.contains(key) { let old_value = mem::take(value); - *value = self.highlight_value(old_value, words_to_highlight); + *value = self.highlight_value(old_value, matching_words); } } } @@ -722,7 +722,7 @@ async fn main() -> anyhow::Result<()> { search.facet_condition(condition); } - let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap(); + let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); let number_of_candidates = candidates.len(); let facets = if query.facet_distribution == Some(true) { @@ -748,7 +748,7 @@ async fn main() -> anyhow::Result<()> { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); if !disable_highlighting { - highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight); + highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight); } documents.push(object); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 0fa966ee8..75d6f9fb3 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -28,7 +28,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; -pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult}; +pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords}; pub use self::update_store::UpdateStore; pub type FastMap4 = HashMap>; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a48b074cc..0b8111997 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -1,8 +1,8 @@ use std::{borrow::Cow, collections::HashMap, mem::take}; use anyhow::bail; -use roaring::RoaringBitmap; use log::debug; +use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::word_derivations; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f3d5af2da..dbb504368 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,10 +1,9 @@ use std::borrow::Cow; -use std::collections::HashSet; use std::fmt; use std::time::Instant; use fst::{IntoStreamer, Streamer, Set}; -use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; +use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder}; use log::debug; use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; @@ -14,8 +13,9 @@ use crate::search::criteria::{Criterion, CriterionResult}; use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity}; use crate::{Index, DocumentId}; -pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::FacetIter; +pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; +pub use self::query_tree::MatchingWords; use self::query_tree::QueryTreeBuilder; // Building these factories is not free. @@ -87,6 +87,11 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); + let matching_words = match query_tree.as_ref() { + Some(query_tree) => MatchingWords::from_query_tree(&query_tree), + None => MatchingWords::default(), + }; + // We are testing the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; @@ -128,8 +133,7 @@ impl<'a> Search<'a> { if limit == 0 { break } } - let found_words = HashSet::new(); - Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids }) + Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids }) } } @@ -147,26 +151,21 @@ impl fmt::Debug for Search<'_> { #[derive(Default)] pub struct SearchResult { - pub found_words: HashSet, + pub matching_words: MatchingWords, pub candidates: RoaringBitmap, // TODO those documents ids should be associated with their criteria scores. pub documents_ids: Vec, } -pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>) -> anyhow::Result> { - let lev = match max_typo { - 0 => &LEVDIST0, - 1 => &LEVDIST1, - _ => &LEVDIST2, - }; - - let dfa = if is_prefix { - lev.build_prefix_dfa(&word) - } else { - lev.build_dfa(&word) - }; - +pub fn word_derivations( + word: &str, + is_prefix: bool, + max_typo: u8, + fst: &fst::Set>, +) -> anyhow::Result> +{ let mut derived_words = Vec::new(); + let dfa = build_dfa(word, max_typo, is_prefix); let mut stream = fst.search_with_state(&dfa).into_stream(); while let Some((word, state)) = stream.next() { @@ -177,3 +176,17 @@ pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Se Ok(derived_words) } + +pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA { + let lev = match typos { + 0 => &LEVDIST0, + 1 => &LEVDIST1, + _ => &LEVDIST2, + }; + + if is_prefix { + lev.build_prefix_dfa(word) + } else { + lev.build_dfa(word) + } +} diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 47057ad10..114032eb8 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,12 +1,13 @@ -use std::borrow::Cow; -use std::collections::BTreeMap; +use std::collections::HashSet; use std::{fmt, cmp, mem}; +use levenshtein_automata::{DFA, Distance}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use roaring::RoaringBitmap; use slice_group_by::GroupBy; use crate::Index; +use super::build_dfa; type IsOptionalWord = bool; type IsPrefix = bool; @@ -113,6 +114,14 @@ impl QueryKind { QueryKind::Tolerant { typo, word } } + pub fn is_tolerant(&self) -> bool { + matches!(self, QueryKind::Tolerant { .. }) + } + + pub fn is_exact(&self) -> bool { + matches!(self, QueryKind::Exact { .. }) + } + pub fn typo(&self) -> u8 { match self { QueryKind::Tolerant { typo, .. } => *typo, @@ -275,69 +284,45 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result + dfas: Vec<(DFA, u8)>, } impl MatchingWords { /// List all words which can be considered as a match for the query tree. - pub fn from_query_tree(tree: &Operation, fst: &fst::Set>) -> Self { - Self { inner: fetch_words(tree, fst).into_iter().collect() } + pub fn from_query_tree(tree: &Operation) -> Self { + Self { + dfas: fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t)).collect() + } } /// Return true if the word match. - pub fn is_match(&self, word: &str) -> bool { - fn first_char(s: &str) -> Option<&str> { - s.chars().next().map(|c| &s[..c.len_utf8()]) - } - - match first_char(word) { - Some(first) => { - let left = first.to_owned(); - let right = word.to_owned(); - self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word) - }, - None => false - } + pub fn matches(&self, word: &str) -> bool { + self.dfas.iter().any(|(dfa, typo)| match dfa.eval(word) { + Distance::Exact(t) => t <= *typo, + Distance::AtLeast(_) => false, + }) } } -type FetchedWords = Vec<(String, IsPrefix)>; - /// Lists all words which can be considered as a match for the query tree. -fn fetch_words(tree: &Operation, fst: &fst::Set>) -> FetchedWords { - fn resolve_branch(tree: &[Operation], fst: &fst::Set>) -> FetchedWords { - tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect() - } - - fn resolve_query(query: &Query, fst: &fst::Set>) -> FetchedWords { - match query.kind.clone() { - QueryKind::Exact { word, .. } => vec![(word, query.prefix)], - QueryKind::Tolerant { typo, word } => { - if let Ok(words) = super::word_derivations(&word, query.prefix, typo, fst) { - words.into_iter().map(|(w, _)| (w, query.prefix)).collect() - } else { - vec![(word, query.prefix)] - } - } - } - } - - fn resolve_ops(tree: &Operation, fst: &fst::Set>) -> FetchedWords { +fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { + fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { match tree { Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { - resolve_branch(ops.as_slice(), fst) + ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); }, - Operation::Query(ops) => { - resolve_query(ops, fst) + Operation::Query(Query { prefix, kind }) => { + let typo = if kind.is_exact() { 0 } else { kind.typo() }; + out.insert((kind.word(), typo, *prefix)); }, } } - let mut words = resolve_ops(tree, fst); - words.sort_unstable(); - words.dedup(); - words + let mut queries = HashSet::new(); + resolve_ops(tree, &mut queries); + queries } /// Main function that creates the final query tree from the primitive query. @@ -559,7 +544,7 @@ mod test { use std::collections::HashMap; use fst::Set; - use maplit::hashmap; + use maplit::{hashmap, hashset}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use rand::{Rng, SeedableRng, rngs::StdRng}; @@ -970,26 +955,26 @@ mod test { let context = TestContext::default(); let query_tree = context.build(false, true, tokens).unwrap().unwrap(); - let expected = vec![ - ("city".to_string(), false), - ("earth".to_string(), false), - ("nature".to_string(), false), - ("new".to_string(), false), - ("nyc".to_string(), false), - ("split".to_string(), false), - ("word".to_string(), false), - ("word".to_string(), true), - ("world".to_string(), true), - ("york".to_string(), false), - - ]; + let expected = hashset!{ + ("word", 0, false), + ("nyc", 0, false), + ("wordsplit", 2, false), + ("wordsplitnycworld", 2, true), + ("nature", 0, false), + ("new", 0, false), + ("city", 0, false), + ("world", 1, true), + ("york", 0, false), + ("split", 0, false), + ("nycworld", 1, true), + ("earth", 0, false), + ("wordsplitnyc", 2, false), + }; let mut keys = context.postings.keys().collect::>(); keys.sort_unstable(); - let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap(); - - let words = fetch_words(&query_tree, &set); + let words = fetch_queries(&query_tree); assert_eq!(expected, words); } } From 7ac09d7b7c37e5cf5f43ab7a31bbd359755229ef Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 16:54:41 +0100 Subject: [PATCH 36/50] remove option of bucket_candidates --- milli/src/search/criteria/proximity.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index aab46a6a2..57c7007fc 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -12,7 +12,7 @@ pub struct Proximity<'t> { query_tree: Option<(usize, Operation)>, proximity: u8, candidates: Candidates, - bucket_candidates: Option, + bucket_candidates: RoaringBitmap, parent: Option>, candidates_cache: HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, } @@ -29,7 +29,7 @@ impl<'t> Proximity<'t> { query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), proximity: 0, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::new(), }) @@ -45,7 +45,7 @@ impl<'t> Proximity<'t> { query_tree: None, proximity: 0, candidates: Candidates::default(), - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::new(), }) @@ -84,8 +84,8 @@ impl<'t> Criterion for Proximity<'t> { self.proximity += 1; let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), }; return Ok(Some(CriterionResult { @@ -112,8 +112,8 @@ impl<'t> Criterion for Proximity<'t> { self.proximity += 1; let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), }; return Ok(Some(CriterionResult { @@ -128,7 +128,7 @@ impl<'t> Criterion for Proximity<'t> { return Ok(Some(CriterionResult { query_tree: None, candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, (None, Forbidden(_)) => { From daf126a63838a9382e73f106f553e546b625a979 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Feb 2021 16:34:29 +0100 Subject: [PATCH 37/50] Introduce the final Fetcher criterion --- milli/src/search/criteria/fetcher.rs | 107 +++++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 90 +++++++++++++++++++++- milli/src/search/criteria/words.rs | 80 +------------------- milli/src/search/mod.rs | 7 +- 4 files changed, 201 insertions(+), 83 deletions(-) create mode 100644 milli/src/search/criteria/fetcher.rs diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs new file mode 100644 index 000000000..7706ee280 --- /dev/null +++ b/milli/src/search/criteria/fetcher.rs @@ -0,0 +1,107 @@ +use std::collections::HashMap; +use std::mem::take; + +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; + +pub struct Fetcher<'t> { + ctx: &'t dyn Context, + query_tree: Option, + candidates: Candidates, + parent: Option>, + should_get_documents_ids: bool, +} + +impl<'t> Fetcher<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> Self + { + Fetcher { + ctx, + query_tree, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + parent: None, + should_get_documents_ids: true, + } + } + + pub fn new( + ctx: &'t dyn Context, + parent: Box, + ) -> Self + { + Fetcher { + ctx, + query_tree: None, + candidates: Candidates::default(), + parent: Some(parent), + should_get_documents_ids: true, + } + } +} + +impl<'t> Criterion for Fetcher<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + loop { + debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", + self.should_get_documents_ids, self.candidates, + ); + + match &mut self.candidates { + Allowed(candidates) => if candidates.is_empty() { + self.candidates = Candidates::default(); + } else { + self.should_get_documents_ids = false; + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + Forbidden(_) => { + let should_get_documents_ids = take(&mut self.should_get_documents_ids); + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(result) => return Ok(Some(result)), + None => if should_get_documents_ids { + let candidates = match &self.query_tree { + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?, + None => self.ctx.documents_ids()?, + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + } + }, + None => if should_get_documents_ids { + let candidates = match &self.query_tree { + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?, + None => self.ctx.documents_ids()?, + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + } + return Ok(None); + }, + } + } + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 52367ac5f..1845e607a 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,16 +1,19 @@ +use std::collections::HashMap; use std::borrow::Cow; +use anyhow::bail; +use roaring::RoaringBitmap; + use crate::Index; use crate::search::word_derivations; -use roaring::RoaringBitmap; - use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; pub mod words; pub mod asc_desc; pub mod proximity; +pub mod fetcher; pub trait Criterion { fn next(&mut self) -> anyhow::Result>; @@ -51,6 +54,7 @@ impl Default for Candidates { } } pub trait Context { + fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; @@ -66,6 +70,10 @@ pub struct HeedContext<'t> { } impl<'a> Context for HeedContext<'a> { + fn documents_ids(&self) -> heed::Result { + self.index.documents_ids(self.rtxn) + } + fn word_docids(&self, word: &str) -> heed::Result> { self.index.word_docids.get(self.rtxn, &word) } @@ -107,6 +115,80 @@ impl<'t> HeedContext<'t> { } } +pub fn resolve_query_tree<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, +) -> anyhow::Result +{ + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + ) -> anyhow::Result + { + use Operation::{And, Consecutive, Or, Query}; + + match query_tree { + And(ops) => { + let mut ops = ops.iter().map(|op| { + resolve_operation(ctx, op, cache) + }).collect::>>()?; + + ops.sort_unstable_by_key(|cds| cds.len()); + + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for docids in ops { + if first_loop { + candidates = docids; + first_loop = false; + } else { + candidates.intersect_with(&docids); + } + } + Ok(candidates) + }, + Consecutive(ops) => { + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for slice in ops.windows(2) { + match (&slice[0], &slice[1]) { + (Operation::Query(left), Operation::Query(right)) => { + match query_pair_proximity_docids(ctx, left, right, 1)? { + pair_docids if pair_docids.is_empty() => { + return Ok(RoaringBitmap::new()) + }, + pair_docids if first_loop => { + candidates = pair_docids; + first_loop = false; + }, + pair_docids => { + candidates.intersect_with(&pair_docids); + }, + } + }, + _ => bail!("invalid consecutive query type"), + } + } + Ok(candidates) + }, + Or(_, ops) => { + let mut candidates = RoaringBitmap::new(); + for op in ops { + let docids = resolve_operation(ctx, op, cache)?; + candidates.union_with(&docids); + } + Ok(candidates) + }, + Query(q) => Ok(query_docids(ctx, q)?), + } + } + + resolve_operation(ctx, query_tree, cache) +} + + fn all_word_pair_proximity_docids, U: AsRef>( ctx: &dyn Context, left_words: &[(T, u8)], @@ -218,6 +300,10 @@ pub mod test { } impl<'a> Context for TestContext<'a> { + fn documents_ids(&self) -> heed::Result { + Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) + } + fn word_docids(&self, word: &str) -> heed::Result> { Ok(self.word_docids.get(&word.to_string()).cloned()) } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 3b0ecd54a..0913d429d 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -1,12 +1,11 @@ use std::collections::HashMap; use std::mem::take; -use anyhow::bail; use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; pub struct Words<'t> { ctx: &'t dyn Context, @@ -62,7 +61,7 @@ impl<'t> Criterion for Words<'t> { self.candidates = Candidates::default(); }, (Some(qt), Allowed(candidates)) => { - let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -78,7 +77,7 @@ impl<'t> Criterion for Words<'t> { })); }, (Some(qt), Forbidden(candidates)) => { - let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?; found_candidates.difference_with(&candidates); candidates.union_with(&found_candidates); @@ -127,76 +126,3 @@ fn explode_query_tree(query_tree: Operation) -> Vec { otherwise => vec![otherwise], } } - -fn resolve_candidates<'t>( - ctx: &'t dyn Context, - query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, -) -> anyhow::Result -{ - fn resolve_operation<'t>( - ctx: &'t dyn Context, - query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, - ) -> anyhow::Result - { - use Operation::{And, Consecutive, Or, Query}; - - match query_tree { - And(ops) => { - let mut ops = ops.iter().map(|op| { - resolve_operation(ctx, op, cache) - }).collect::>>()?; - - ops.sort_unstable_by_key(|cds| cds.len()); - - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for docids in ops { - if first_loop { - candidates = docids; - first_loop = false; - } else { - candidates.intersect_with(&docids); - } - } - Ok(candidates) - }, - Consecutive(ops) => { - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for slice in ops.windows(2) { - match (&slice[0], &slice[1]) { - (Operation::Query(left), Operation::Query(right)) => { - match query_pair_proximity_docids(ctx, left, right, 1)? { - pair_docids if pair_docids.is_empty() => { - return Ok(RoaringBitmap::new()) - }, - pair_docids if first_loop => { - candidates = pair_docids; - first_loop = false; - }, - pair_docids => { - candidates.intersect_with(&pair_docids); - }, - } - }, - _ => bail!("invalid consecutive query type"), - } - } - Ok(candidates) - }, - Or(_, ops) => { - let mut candidates = RoaringBitmap::new(); - for op in ops { - let docids = resolve_operation(ctx, op, cache)?; - candidates.union_with(&docids); - } - Ok(candidates) - }, - Query(q) => Ok(query_docids(ctx, q)?), - } - } - - resolve_operation(ctx, query_tree, cache) -} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index dbb504368..84c6acf3e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -10,7 +10,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity}; +use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity, fetcher::Fetcher}; use crate::{Index, DocumentId}; pub use self::facet::FacetIter; @@ -92,13 +92,12 @@ impl<'a> Search<'a> { None => MatchingWords::default(), }; - // We are testing the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?; let proximity_criterion = Proximity::new(&criteria_ctx, Box::new(words_criterion))?; - // let proximity_criterion = Proximity::initial(&criteria_ctx, query_tree, facet_candidates)?; - let mut criteria = proximity_criterion; + let fetcher_criterion = Fetcher::new(&criteria_ctx, Box::new(proximity_criterion)); + let mut criteria = fetcher_criterion; // // We sort in descending order on a specific field *by hand*, don't do that at home. // let attr_name = "released-timestamp"; From b0e0c5eba0c83d57d7262036c076b0a829065de7 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 16:59:09 +0100 Subject: [PATCH 38/50] remove option of bucket_candidates --- milli/src/search/criteria/fetcher.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index 7706ee280..e21548e3f 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -63,7 +63,7 @@ impl<'t> Criterion for Fetcher<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, Forbidden(_) => { @@ -81,7 +81,7 @@ impl<'t> Criterion for Fetcher<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, } @@ -95,7 +95,7 @@ impl<'t> Criterion for Fetcher<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, } From 36c1f93ceb969c69bb37b252e8369169670c1767 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Feb 2021 17:28:20 +0100 Subject: [PATCH 39/50] Do an union of the bucket candidates --- milli/src/search/criteria/asc_desc.rs | 2 +- milli/src/search/criteria/proximity.rs | 2 +- milli/src/search/criteria/typo.rs | 2 +- milli/src/search/criteria/words.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 3d32bd845..df80f3bb4 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -174,7 +174,7 @@ impl<'t> Criterion for AscDesc<'t> { self.query_tree = query_tree; candidates.intersect_with(&self.faceted_candidates); self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates = bucket_candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 57c7007fc..352567d1a 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -139,7 +139,7 @@ impl<'t> Criterion for Proximity<'t> { self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); self.proximity = 0; self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates = bucket_candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 0b8111997..b82ebbf5b 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -147,7 +147,7 @@ impl<'t> Criterion for Typo<'t> { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates = bucket_candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 0913d429d..c8bb0abc4 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -107,7 +107,7 @@ impl<'t> Criterion for Words<'t> { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates = bucket_candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } From 025835c5b2af5c2622c41504c19ea3f94aab6680 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 1 Mar 2021 14:03:12 +0100 Subject: [PATCH 40/50] Fix the criteria to avoid always returning a placeholder --- milli/src/search/criteria/asc_desc.rs | 7 +++++-- milli/src/search/criteria/fetcher.rs | 18 ++++++++++++------ milli/src/search/criteria/proximity.rs | 7 +++++-- milli/src/search/criteria/typo.rs | 7 +++++-- milli/src/search/criteria/words.rs | 8 ++++++-- 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index df80f3bb4..151b0a6a0 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -128,8 +128,11 @@ impl<'t> Criterion for AscDesc<'t> { loop { match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { - self.query_tree = None; - self.candidates = Candidates::default(); + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: take(&mut self.bucket_candidates), + })); }, (Some(qt), Allowed(candidates)) => { let bucket_candidates = match self.parent { diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index e21548e3f..f0cf16b90 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -54,20 +54,26 @@ impl<'t> Criterion for Fetcher<'t> { self.should_get_documents_ids, self.candidates, ); + let should_get_documents_ids = take(&mut self.should_get_documents_ids); match &mut self.candidates { - Allowed(candidates) => if candidates.is_empty() { - self.candidates = Candidates::default(); - } else { - self.should_get_documents_ids = false; + Allowed(candidates) => { let candidates = take(&mut self.candidates).into_inner(); + let candidates = match &self.query_tree { + Some(qt) if should_get_documents_ids => { + let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?; + docids.intersect_with(&candidates); + docids + }, + _ => candidates, + }; + return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), + query_tree: self.query_tree.take(), candidates: candidates.clone(), bucket_candidates: candidates, })); }, Forbidden(_) => { - let should_get_documents_ids = take(&mut self.should_get_documents_ids); match self.parent.as_mut() { Some(parent) => { match parent.next()? { diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 352567d1a..553a191ec 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -64,8 +64,11 @@ impl<'t> Criterion for Proximity<'t> { match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { - self.query_tree = None; - self.candidates = Candidates::default(); + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take().map(|(_, qt)| qt), + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: take(&mut self.bucket_candidates), + })); }, (Some((max_prox, query_tree)), Allowed(candidates)) => { if self.proximity as usize > *max_prox { diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index b82ebbf5b..5c8592c5e 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -64,8 +64,11 @@ impl<'t> Criterion for Typo<'t> { match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { - self.query_tree = None; - self.candidates = Candidates::default(); + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take().map(|(_, qt)| qt), + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: take(&mut self.bucket_candidates), + })); }, (Some((max_typos, query_tree)), Allowed(candidates)) => { if self.number_typos as usize > *max_typos { diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index c8bb0abc4..bfb85579a 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -56,9 +56,13 @@ impl<'t> Criterion for Words<'t> { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); match (self.query_trees.pop(), &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { + (query_tree, Allowed(candidates)) if candidates.is_empty() => { self.query_trees = Vec::new(); - self.candidates = Candidates::default(); + return Ok(Some(CriterionResult { + query_tree, + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: take(&mut self.bucket_candidates), + })); }, (Some(qt), Allowed(candidates)) => { let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?; From f118d7e067cb6550feaa87c38e3a259d96dd1ae0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 2 Mar 2021 11:58:32 +0100 Subject: [PATCH 41/50] build criteria from settings --- milli/src/search/criteria/fetcher.rs | 2 +- milli/src/search/criteria/mod.rs | 82 ++++++++++++++++++++++++---- milli/src/search/mod.rs | 9 +-- 3 files changed, 74 insertions(+), 19 deletions(-) diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index f0cf16b90..38fee20d3 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -56,7 +56,7 @@ impl<'t> Criterion for Fetcher<'t> { let should_get_documents_ids = take(&mut self.should_get_documents_ids); match &mut self.candidates { - Allowed(candidates) => { + Allowed(_) => { let candidates = take(&mut self.candidates).into_inner(); let candidates = match &self.query_tree { Some(qt) if should_get_documents_ids => { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1845e607a..49bacf209 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,13 +1,19 @@ use std::collections::HashMap; use std::borrow::Cow; -use anyhow::bail; +use anyhow::{bail, Context as _}; use roaring::RoaringBitmap; -use crate::Index; +use crate::facet::FacetType; use crate::search::word_derivations; +use crate::{Index, FieldId}; use super::query_tree::{Operation, Query, QueryKind}; +use self::typo::Typo; +use self::words::Words; +use self::asc_desc::AscDesc; +use self::proximity::Proximity; +use self::fetcher::Fetcher; pub mod typo; pub mod words; @@ -62,14 +68,14 @@ pub trait Context { fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; } -pub struct HeedContext<'t> { +pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, index: &'t Index, words_fst: fst::Set>, words_prefixes_fst: fst::Set>, } -impl<'a> Context for HeedContext<'a> { +impl<'a> Context for CriteriaBuilder<'a> { fn documents_ids(&self) -> heed::Result { self.index.documents_ids(self.rtxn) } @@ -101,17 +107,71 @@ impl<'a> Context for HeedContext<'a> { } } -impl<'t> HeedContext<'t> { +impl<'t> CriteriaBuilder<'t> { pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result { let words_fst = index.words_fst(rtxn)?; let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; + Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) + } - Ok(Self { - rtxn, - index, - words_fst, - words_prefixes_fst, - }) + pub fn build( + &'t self, + mut query_tree: Option, + mut facet_candidates: Option, + ) -> anyhow::Result> + { + use crate::criterion::Criterion as Name; + + let fields_ids_map = self.index.fields_ids_map(&self.rtxn)?; + let faceted_fields = self.index.faceted_fields(&self.rtxn)?; + let field_id_facet_type = |field: &str| -> anyhow::Result<(FieldId, FacetType)> { + let id = fields_ids_map.id(field).with_context(|| { + format!("field {:?} isn't registered", field) + })?; + let facet_type = faceted_fields.get(field).with_context(|| { + format!("field {:?} isn't faceted", field) + })?; + Ok((id, *facet_type)) + }; + + let mut criterion = None as Option>; + for name in self.index.criteria(&self.rtxn)? { + criterion = Some(match criterion.take() { + Some(father) => match name { + Name::Typo => Box::new(Typo::new(self, father)?), + Name::Words => Box::new(Words::new(self, father)?), + Name::Proximity => Box::new(Proximity::new(self, father)?), + Name::Asc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::asc(&self.index, &self.rtxn, father, id, facet_type)?) + }, + Name::Desc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::desc(&self.index, &self.rtxn, father, id, facet_type)?) + }, + _otherwise => father, + }, + None => match name { + Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())?), + Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())?), + Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())?), + Name::Asc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?) + }, + Name::Desc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?) + }, + _otherwise => continue, + }, + }); + } + + match criterion { + Some(criterion) => Ok(Fetcher::new(self, criterion)), + None => Ok(Fetcher::initial(self, query_tree, facet_candidates)), + } } } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 84c6acf3e..48b0f71da 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -10,7 +10,6 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity, fetcher::Fetcher}; use crate::{Index, DocumentId}; pub use self::facet::FacetIter; @@ -92,12 +91,8 @@ impl<'a> Search<'a> { None => MatchingWords::default(), }; - let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; - let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; - let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?; - let proximity_criterion = Proximity::new(&criteria_ctx, Box::new(words_criterion))?; - let fetcher_criterion = Fetcher::new(&criteria_ctx, Box::new(proximity_criterion)); - let mut criteria = fetcher_criterion; + let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; + let mut criteria = criteria_builder.build(query_tree, facet_candidates)?; // // We sort in descending order on a specific field *by hand*, don't do that at home. // let attr_name = "released-timestamp"; From 6bf6b404955c9087a905051e002244010289cb26 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 2 Mar 2021 11:02:09 +0100 Subject: [PATCH 42/50] Remove unused files --- milli/src/lib.rs | 2 - milli/src/mdfs.rs | 163 ---------------------------- milli/src/query_tokens.rs | 217 -------------------------------------- 3 files changed, 382 deletions(-) delete mode 100644 milli/src/mdfs.rs delete mode 100644 milli/src/query_tokens.rs diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 75d6f9fb3..d6a078a1f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -3,8 +3,6 @@ mod criterion; mod external_documents_ids; mod fields_ids_map; -mod mdfs; -mod query_tokens; mod search; mod update_store; pub mod facet; diff --git a/milli/src/mdfs.rs b/milli/src/mdfs.rs deleted file mode 100644 index 6beba3c69..000000000 --- a/milli/src/mdfs.rs +++ /dev/null @@ -1,163 +0,0 @@ -use std::collections::hash_map::Entry::{Occupied, Vacant}; -use std::collections::HashMap; -use std::mem; - -use roaring::RoaringBitmap; -use crate::Index; - -/// A mana depth first search implementation. -pub struct Mdfs<'a> { - index: &'a Index, - rtxn: &'a heed::RoTxn<'a>, - words: &'a [(HashMap, RoaringBitmap)], - union_cache: HashMap<(usize, u8), RoaringBitmap>, - candidates: RoaringBitmap, - mana: u32, - max_mana: u32, -} - -impl<'a> Mdfs<'a> { - pub fn new( - index: &'a Index, - rtxn: &'a heed::RoTxn, - words: &'a [(HashMap, RoaringBitmap)], - candidates: RoaringBitmap, - ) -> Mdfs<'a> - { - // Compute the number of pairs (windows) we have for this list of words. - let mana = words.len().saturating_sub(1) as u32; - let max_mana = mana * 8; - Mdfs { index, rtxn, words, union_cache: HashMap::new(), candidates, mana, max_mana } - } -} - -impl<'a> Iterator for Mdfs<'a> { - type Item = anyhow::Result<(u32, RoaringBitmap)>; - - fn next(&mut self) -> Option { - // If there is less or only one word therefore the only - // possible documents that we can return are the candidates. - if self.words.len() <= 1 { - if self.candidates.is_empty() { return None } - return Some(Ok((0, mem::take(&mut self.candidates)))); - } - - while self.mana <= self.max_mana { - let mut answer = RoaringBitmap::new(); - let result = mdfs_step( - &self.index, - &self.rtxn, - self.mana, - self.words, - &self.candidates, - &self.candidates, - &mut self.union_cache, - &mut answer, - ); - - match result { - Ok(()) => { - // We always increase the mana for the next loop. - let proximity = self.mana; - self.mana += 1; - - // If no documents were found we must not return and continue - // the search with more mana. - if !answer.is_empty() { - - // We remove the answered documents from the list of - // candidates to be sure we don't search for them again. - self.candidates.difference_with(&answer); - - // We return the answer. - return Some(Ok((proximity, answer))); - } - }, - Err(e) => return Some(Err(e)), - } - } - - None - } -} - -fn mdfs_step( - index: &Index, - rtxn: &heed::RoTxn, - mana: u32, - words: &[(HashMap, RoaringBitmap)], - candidates: &RoaringBitmap, - parent_docids: &RoaringBitmap, - union_cache: &mut HashMap<(usize, u8), RoaringBitmap>, - answer: &mut RoaringBitmap, -) -> anyhow::Result<()> -{ - use std::cmp::{min, max}; - - let (words1, words2) = (&words[0].0, &words[1].0); - let pairs = words_pair_combinations(words1, words2); - let tail = &words[1..]; - let nb_children = tail.len() as u32 - 1; - - // The minimum amount of mana that you must consume is at least 1 and the - // amount of mana that your children can consume. Because the last child must - // consume the remaining mana, it is mandatory that there not too much at the end. - let min_proximity = max(1, mana.saturating_sub(nb_children * 8)) as u8; - - // The maximum amount of mana that you can use is 8 or the remaining amount of - // mana minus your children, as you can't just consume all the mana, - // your children must have at least 1 mana. - let max_proximity = min(8, mana - nb_children) as u8; - - for proximity in min_proximity..=max_proximity { - let mut docids = match union_cache.entry((words.len(), proximity)) { - Occupied(entry) => entry.get().clone(), - Vacant(entry) => { - let mut docids = RoaringBitmap::new(); - if proximity == 8 { - docids = candidates.clone(); - } else { - for (w1, w2) in pairs.iter().cloned() { - let key = (w1, w2, proximity); - if let Some(di) = index.word_pair_proximity_docids.get(rtxn, &key)? { - docids.union_with(&di); - } - } - } - entry.insert(docids).clone() - } - }; - - // We must be sure that we only return docids that are present in the candidates. - docids.intersect_with(parent_docids); - - if !docids.is_empty() { - let mana = mana.checked_sub(proximity as u32).unwrap(); - if tail.len() < 2 { - // We are the last pair, we return without recuring as we don't have any child. - answer.union_with(&docids); - return Ok(()); - } else { - return mdfs_step(index, rtxn, mana, tail, candidates, &docids, union_cache, answer); - } - } - } - - Ok(()) -} - -fn words_pair_combinations<'h>( - w1: &'h HashMap, - w2: &'h HashMap, -) -> Vec<(&'h str, &'h str)> -{ - let mut pairs = Vec::new(); - for (w1, (_typos, docids1)) in w1 { - for (w2, (_typos, docids2)) in w2 { - if !docids1.is_disjoint(&docids2) { - pairs.push((w1.as_str(), w2.as_str())); - } - } - } - pairs -} diff --git a/milli/src/query_tokens.rs b/milli/src/query_tokens.rs deleted file mode 100644 index 258c90765..000000000 --- a/milli/src/query_tokens.rs +++ /dev/null @@ -1,217 +0,0 @@ -use meilisearch_tokenizer::{Token, TokenKind}; - -#[derive(Debug)] -enum State { - Free, - Quoted, -} - -impl State { - fn swap(&mut self) { - match self { - State::Quoted => *self = State::Free, - State::Free => *self = State::Quoted, - } - } -} - -#[derive(Debug, PartialEq, Eq)] -pub enum QueryToken<'a> { - Free(Token<'a>), - Quoted(Token<'a>), -} - -pub fn query_tokens<'a>(mut tokens: impl Iterator>) -> impl Iterator> { - let mut state = State::Free; - let f = move || { - loop { - let token = tokens.next()?; - match token.kind() { - _ if token.text().trim() == "\"" => state.swap(), - TokenKind::Word => { - let token = match state { - State::Quoted => QueryToken::Quoted(token), - State::Free => QueryToken::Free(token), - }; - return Some(token); - }, - _ => (), - } - } - }; - std::iter::from_fn(f) -} - -#[cfg(test)] -mod tests { - use super::*; - use QueryToken::{Quoted, Free}; - use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; - use fst::Set; - - macro_rules! assert_eq_query_token { - ($test:expr, Quoted($val:literal)) => { - match $test { - Quoted(val) => assert_eq!(val.text(), $val), - Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()), - } - }; - - ($test:expr, Free($val:literal)) => { - match $test { - Quoted(val) => panic!("expected Free(\"{}\"), found Quoted(\"{}\")", $val, val.text()), - Free(val) => assert_eq!(val.text(), $val), - } - }; - } - - #[test] - fn empty() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = ""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert!(iter.next().is_none()); - - let query = " "; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert!(iter.next().is_none()); - } - - #[test] - fn one_quoted_string() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert!(iter.next().is_none()); - } - - #[test] - fn one_pending_quoted_string() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert!(iter.next().is_none()); - } - - #[test] - fn one_non_quoted_string() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert!(iter.next().is_none()); - } - - #[test] - fn quoted_directly_followed_by_free_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello\"world"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert_eq_query_token!(iter.next().unwrap(), Free("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn free_directly_followed_by_quoted_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello\"world\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn free_followed_by_quoted_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello \"world\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn multiple_spaces_separated_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello world "; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Free("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn multi_interleaved_quoted_free_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello \"world\" coucou \"monde\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert_eq_query_token!(iter.next().unwrap(), Free("coucou")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("monde")); - assert!(iter.next().is_none()); - } - - #[test] - fn multi_quoted_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello world\" coucou \"monde est beau\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert_eq_query_token!(iter.next().unwrap(), Free("coucou")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("monde")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("est")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("beau")); - assert!(iter.next().is_none()); - } - - #[test] - fn chinese() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "汽车男生"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("汽车")); - assert_eq_query_token!(iter.next().unwrap(), Free("男生")); - assert!(iter.next().is_none()); - } -} From 246286f0ebede97517bc262fc9cf67448e221194 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 2 Mar 2021 11:14:10 +0100 Subject: [PATCH 43/50] take hard separator into account --- milli/src/update/index_documents/store.rs | 41 +++++++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 96d1098f9..05974d55e 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -13,7 +13,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use heed::BytesEncode; use linked_hash_map::LinkedHashMap; use log::{debug, info}; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::Value; @@ -471,14 +471,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { }; let analyzed = self.analyzer.analyze(&content); - let tokens = analyzed - .tokens() - .filter(|t| t.is_word()) - .map(|t| t.text().to_string()); + let tokens = process_tokens(analyzed.tokens()); - for (pos, word) in tokens.enumerate().take(MAX_POSITION) { + for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position); } } } @@ -609,6 +606,36 @@ enum FacetValue { Integer(i64), } +/// take an iterator on tokens and compute their relative position depending on separator kinds +/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, +/// else we keep the standart proximity of 1 between words. +fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator)> { + tokens + .skip_while(|token| token.is_separator().is_some()) + .scan((0, None), |(offset, prev_kind), token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { + *offset += match *prev_kind { + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, + None => 0, + }; + *prev_kind = Some(token.kind) + } + TokenKind::Separator(SeparatorKind::Hard) => { + *prev_kind = Some(token.kind); + } + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => { + *prev_kind = Some(token.kind); + } + _ => (), + } + Some((*offset, token)) + }) + .filter(|(_, t)| t.is_word()) +} + fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result> { use FacetValue::*; From cdaa96df6329670cf959ebb67c5e86618e1bfaea Mon Sep 17 00:00:00 2001 From: many Date: Tue, 2 Mar 2021 14:46:50 +0100 Subject: [PATCH 44/50] optimize proximity criterion --- milli/src/search/criteria/proximity.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 553a191ec..dc05787dd 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -205,14 +205,16 @@ fn resolve_candidates<'t>( cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, ) -> anyhow::Result> { - fn pair_combinations(mana: u8) -> impl Iterator { - (0..=mana).map(move |m| (mana - m, m)) + fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator { + (0..=mana.min(left_max)).map(move |m| (m, mana - m)) } + let pair_max_proximity = 7; + let mut output = Vec::new(); - for (pair_p, left_right_p) in pair_combinations(proximity) { - for (left_p, right_p) in pair_combinations(left_right_p) { + for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) { + for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) { let left_key = (left.clone(), left_p); if !cache.contains_key(&left_key) { let candidates = resolve_operation(ctx, left, left_p, cache)?; From 5c5e51095cdbfc4c37592b90a08e5433e67c2ac1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 11:43:42 +0100 Subject: [PATCH 45/50] Fix the Asc/Desc criteria to alsways return the QueryTree when available --- milli/src/search/criteria/asc_desc.rs | 97 ++++++++++++++------------- 1 file changed, 52 insertions(+), 45 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 151b0a6a0..9af9d53e6 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -1,17 +1,20 @@ +use std::collections::HashMap; use std::mem::take; use anyhow::bail; use itertools::Itertools; +use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetI64Codec, FieldDocIdFacetF64Codec}; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index}; -use super::{Candidates, Criterion, CriterionResult}; +use super::{Criterion, CriterionResult}; pub struct AscDesc<'t> { index: &'t Index, @@ -20,7 +23,7 @@ pub struct AscDesc<'t> { facet_type: FacetType, ascending: bool, query_tree: Option, - candidates: Candidates, + candidates: RoaringBitmap, bucket_candidates: RoaringBitmap, faceted_candidates: RoaringBitmap, parent: Option>, @@ -83,6 +86,19 @@ impl<'t> AscDesc<'t> { ascending: bool, ) -> anyhow::Result where Self: Sized { + let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; + let candidates = match &query_tree { + Some(qt) => { + let context = CriteriaBuilder::new(rtxn, index)?; + let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new())?; + if let Some(candidates) = candidates { + qt_candidates.intersect_with(&candidates); + } + qt_candidates + }, + None => candidates.unwrap_or(faceted_candidates.clone()), + }; + Ok(AscDesc { index, rtxn, @@ -90,8 +106,8 @@ impl<'t> AscDesc<'t> { facet_type, ascending, query_tree, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, + candidates, + faceted_candidates, bucket_candidates: RoaringBitmap::new(), parent: None, }) @@ -113,7 +129,7 @@ impl<'t> AscDesc<'t> { facet_type, ascending, query_tree: None, - candidates: Candidates::default(), + candidates: RoaringBitmap::new(), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, bucket_candidates: RoaringBitmap::new(), parent: Some(parent), @@ -123,24 +139,43 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { fn next(&mut self) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; - loop { - match (&mut self.query_tree, &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: take(&mut self.candidates).into_inner(), - bucket_candidates: take(&mut self.bucket_candidates), - })); + debug!("Facet {} iteration ({:?})", + if self.ascending { "Asc" } else { "Desc" }, self.candidates, + ); + + match &mut self.candidates { + candidates if candidates.is_empty() => { + let query_tree = self.query_tree.take(); + let candidates = take(&mut self.candidates); + let bucket_candidates = take(&mut self.bucket_candidates); + + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { + self.query_tree = query_tree; + candidates.intersect_with(&self.faceted_candidates); + self.candidates = candidates; + self.bucket_candidates = bucket_candidates; + }, + None => return Ok(None), + } + }, + None => if query_tree.is_none() && bucket_candidates.is_empty() { + return Ok(None) + }, + } + + return Ok(Some(CriterionResult { query_tree, candidates, bucket_candidates })); }, - (Some(qt), Allowed(candidates)) => { + candidates => { let bucket_candidates = match self.parent { Some(_) => take(&mut self.bucket_candidates), None => candidates.clone(), }; - let mut found_candidates = facet_ordered( + let found_candidates = facet_ordered( self.index, self.rtxn, self.field_id, @@ -149,42 +184,14 @@ impl<'t> Criterion for AscDesc<'t> { candidates.clone(), )?; - found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); return Ok(Some(CriterionResult { - query_tree: Some(qt.clone()), + query_tree: self.query_tree.clone(), candidates: found_candidates, bucket_candidates, })); }, - (Some(_qt), Forbidden(_candidates)) => { - todo!() - }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); - return Ok(Some(CriterionResult { - query_tree: None, - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - (None, Forbidden(_)) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next()? { - Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { - self.query_tree = query_tree; - candidates.intersect_with(&self.faceted_candidates); - self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates.union_with(&bucket_candidates); - }, - None => return Ok(None), - } - }, - None => return Ok(None), - } - }, } } } From f376c6a7287a952599cfe203e08f397edc672163 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 10:48:16 +0100 Subject: [PATCH 46/50] Make sure we retrieve the docid word positions --- milli/src/update/index_documents/store.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 05974d55e..05767080a 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -274,13 +274,15 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; // We store document_id associated with all the words the record contains. - for (word, _) in words_positions.drain() { - self.insert_word_docid(&word, document_id)?; + for (word, _) in words_positions.iter() { + self.insert_word_docid(word, document_id)?; } self.documents_writer.insert(document_id.to_be_bytes(), record)?; Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; + words_positions.clear(); + // We store document_id associated with all the field id and values. for (field, values) in facet_values.drain() { for value in values { From 07784c899043d24318f13edac84e805708946696 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 11:25:36 +0100 Subject: [PATCH 47/50] Tune the words prefixes threshold to compute for 1/1000 instead --- Cargo.lock | 1 + infos/src/main.rs | 2 +- milli/src/update/words_prefixes.rs | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 883d836b7..b7f479d2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -866,6 +866,7 @@ dependencies = [ "anyhow", "byte-unit", "heed", + "jemallocator", "milli", "stderrlog", "structopt", diff --git a/infos/src/main.rs b/infos/src/main.rs index 91157aaad..0d2b7abb5 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -598,7 +598,7 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) - let fields_ids_map = index.fields_ids_map(rtxn)?; let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); - let iter: Box> = if internal_ids.is_empty() { + let iter: Box> = if internal_ids.is_empty() { Box::new(index.documents.iter(rtxn)?.map(|result| { result.map(|(_id, obkv)| obkv) })) diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs index f7c898c89..70b82b217 100644 --- a/milli/src/update/words_prefixes.rs +++ b/milli/src/update/words_prefixes.rs @@ -41,7 +41,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { chunk_fusing_shrink_size: None, max_nb_chunks: None, max_memory: None, - threshold: 0.01, // 1% + threshold: 0.1 / 100.0, // .01% max_prefix_length: 4, _update_id: update_id, } From 1fc25148da6971b901f0502de255f3daafb8674c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 18:09:19 +0100 Subject: [PATCH 48/50] Remove useless where clauses for the criteria --- milli/src/search/criteria/asc_desc.rs | 12 ++++++------ milli/src/search/criteria/proximity.rs | 4 ++-- milli/src/search/criteria/typo.rs | 4 ++-- milli/src/search/criteria/words.rs | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 9af9d53e6..193e9c942 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -37,7 +37,7 @@ impl<'t> AscDesc<'t> { candidates: Option, field_id: FieldId, facet_type: FacetType, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, true) } @@ -49,7 +49,7 @@ impl<'t> AscDesc<'t> { candidates: Option, field_id: FieldId, facet_type: FacetType, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, false) } @@ -60,7 +60,7 @@ impl<'t> AscDesc<'t> { parent: Box, field_id: FieldId, facet_type: FacetType, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Self::new(index, rtxn, parent, field_id, facet_type, true) } @@ -71,7 +71,7 @@ impl<'t> AscDesc<'t> { parent: Box, field_id: FieldId, facet_type: FacetType, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Self::new(index, rtxn, parent, field_id, facet_type, false) } @@ -84,7 +84,7 @@ impl<'t> AscDesc<'t> { field_id: FieldId, facet_type: FacetType, ascending: bool, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; let candidates = match &query_tree { @@ -120,7 +120,7 @@ impl<'t> AscDesc<'t> { field_id: FieldId, facet_type: FacetType, ascending: bool, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(AscDesc { index, diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index dc05787dd..fe82523ca 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -22,7 +22,7 @@ impl<'t> Proximity<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Proximity { ctx, @@ -38,7 +38,7 @@ impl<'t> Proximity<'t> { pub fn new( ctx: &'t dyn Context, parent: Box, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Proximity { ctx, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 5c8592c5e..76c2fbc46 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -24,7 +24,7 @@ impl<'t> Typo<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Typo { ctx, @@ -41,7 +41,7 @@ impl<'t> Typo<'t> { pub fn new( ctx: &'t dyn Context, parent: Box, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Typo { ctx, diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index bfb85579a..08cbeaab3 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -21,7 +21,7 @@ impl<'t> Words<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Words { ctx, @@ -36,7 +36,7 @@ impl<'t> Words<'t> { pub fn new( ctx: &'t dyn Context, parent: Box, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Words { ctx, From 2cc4a467a6db6012225c090ad1a8350d2f72fba4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 18:16:13 +0100 Subject: [PATCH 49/50] Change the criterion output that cannot fail --- milli/src/search/criteria/mod.rs | 12 ++++++------ milli/src/search/criteria/proximity.rs | 16 ++++++---------- milli/src/search/criteria/typo.rs | 24 ++++++++++-------------- milli/src/search/criteria/words.rs | 16 ++++++---------- 4 files changed, 28 insertions(+), 40 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 49bacf209..0dcaa5a69 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -138,9 +138,9 @@ impl<'t> CriteriaBuilder<'t> { for name in self.index.criteria(&self.rtxn)? { criterion = Some(match criterion.take() { Some(father) => match name { - Name::Typo => Box::new(Typo::new(self, father)?), - Name::Words => Box::new(Words::new(self, father)?), - Name::Proximity => Box::new(Proximity::new(self, father)?), + Name::Typo => Box::new(Typo::new(self, father)), + Name::Words => Box::new(Words::new(self, father)), + Name::Proximity => Box::new(Proximity::new(self, father)), Name::Asc(field) => { let (id, facet_type) = field_id_facet_type(&field)?; Box::new(AscDesc::asc(&self.index, &self.rtxn, father, id, facet_type)?) @@ -152,9 +152,9 @@ impl<'t> CriteriaBuilder<'t> { _otherwise => father, }, None => match name { - Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())?), - Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())?), - Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())?), + Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), + Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), + Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), Name::Asc(field) => { let (id, facet_type) = field_id_facet_type(&field)?; Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index fe82523ca..b192902c1 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -22,9 +22,9 @@ impl<'t> Proximity<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result + ) -> Self { - Ok(Proximity { + Proximity { ctx, query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), proximity: 0, @@ -32,15 +32,11 @@ impl<'t> Proximity<'t> { bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::new(), - }) + } } - pub fn new( - ctx: &'t dyn Context, - parent: Box, - ) -> anyhow::Result - { - Ok(Proximity { + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Proximity { ctx, query_tree: None, proximity: 0, @@ -48,7 +44,7 @@ impl<'t> Proximity<'t> { bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::new(), - }) + } } } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 76c2fbc46..e952bda55 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -24,9 +24,9 @@ impl<'t> Typo<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result + ) -> Self { - Ok(Typo { + Typo { ctx, query_tree: query_tree.map(|op| (maximum_typo(&op), op)), number_typos: 0, @@ -35,15 +35,11 @@ impl<'t> Typo<'t> { parent: None, candidates_cache: HashMap::new(), typo_cache: HashMap::new(), - }) + } } - pub fn new( - ctx: &'t dyn Context, - parent: Box, - ) -> anyhow::Result - { - Ok(Typo { + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Typo { ctx, query_tree: None, number_typos: 0, @@ -52,7 +48,7 @@ impl<'t> Typo<'t> { parent: Some(parent), candidates_cache: HashMap::new(), typo_cache: HashMap::new(), - }) + } } } @@ -348,7 +344,7 @@ mod test { let query_tree = None; let facet_candidates = None; - let mut criteria = Typo::initial(&context, query_tree, facet_candidates).unwrap(); + let mut criteria = Typo::initial(&context, query_tree, facet_candidates); assert!(criteria.next().unwrap().is_none()); } @@ -366,7 +362,7 @@ mod test { let facet_candidates = None; - let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates).unwrap(); + let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() @@ -414,7 +410,7 @@ mod test { let query_tree = None; let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())).unwrap(); + let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())); let expected = CriterionResult { query_tree: None, @@ -442,7 +438,7 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())).unwrap(); + let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 08cbeaab3..1827cd1ed 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -21,31 +21,27 @@ impl<'t> Words<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result + ) -> Self { - Ok(Words { + Words { ctx, query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::default(), - }) + } } - pub fn new( - ctx: &'t dyn Context, - parent: Box, - ) -> anyhow::Result - { - Ok(Words { + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Words { ctx, query_trees: Vec::default(), candidates: Candidates::default(), bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::default(), - }) + } } } From 9b6b35d9b7b18539c8b11adc30a12863858a84df Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 18:19:10 +0100 Subject: [PATCH 50/50] Clean up some comments --- milli/src/search/criteria/typo.rs | 1 - milli/src/search/mod.rs | 6 ------ 2 files changed, 7 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index e952bda55..a78ac3339 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -283,7 +283,6 @@ fn resolve_candidates<'t>( } } - /// FIXME Make this function generic and mutualize it between Typo and proximity criterion fn mdfs<'t>( ctx: &'t dyn Context, branches: &[Operation], diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 48b0f71da..8570cefaa 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -94,12 +94,6 @@ impl<'a> Search<'a> { let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; let mut criteria = criteria_builder.build(query_tree, facet_candidates)?; - // // We sort in descending order on a specific field *by hand*, don't do that at home. - // let attr_name = "released-timestamp"; - // let fid = self.index.fields_ids_map(self.rtxn)?.id(attr_name).unwrap(); - // let ftype = *self.index.faceted_fields(self.rtxn)?.get(attr_name).unwrap(); - // let desc_criterion = AscDesc::desc(self.index, self.rtxn, Box::new(words_criterion), fid, ftype)?; - let mut offset = self.offset; let mut limit = self.limit; let mut documents_ids = Vec::new();