diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d033f5707..3673aef78 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use crate::Index; -use crate::search::word_typos; +use crate::search::word_derivations; use roaring::RoaringBitmap; @@ -124,7 +124,7 @@ fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result anyhow::Result { - let words = word_typos(&word, query.prefix, *typo, ctx.words_fst())?; + let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst())?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); @@ -155,14 +155,14 @@ fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, p if prefix && ctx.in_prefix_cache(&right) { Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) } else if prefix { - let r_words = word_typos(&right, true, 0, ctx.words_fst())?; + let r_words = word_derivations(&right, true, 0, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) } else { Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) } }, (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { - let l_words = word_typos(&left, false, *typo, ctx.words_fst())?; + let l_words = word_derivations(&left, false, *typo, ctx.words_fst())?; if prefix && ctx.in_prefix_cache(&right) { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { @@ -171,19 +171,19 @@ fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, p } Ok(docids) } else if prefix { - let r_words = word_typos(&right, true, 0, ctx.words_fst())?; + let r_words = word_derivations(&right, true, 0, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } }, (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { - let r_words = word_typos(&right, prefix, *typo, ctx.words_fst())?; + let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) }, (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { - let l_words = word_typos(&left, false, *l_typo, ctx.words_fst())?; - let r_words = word_typos(&right, prefix, *r_typo, ctx.words_fst())?; + let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst())?; + let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) }, } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 0284d448d..a6f500bd5 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -4,7 +4,7 @@ use anyhow::bail; use roaring::RoaringBitmap; use crate::search::query_tree::{Operation, Query, QueryKind}; -use crate::search::word_typos; +use crate::search::word_derivations; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; // FIXME we must stop when the number of typos is equal to @@ -177,7 +177,7 @@ fn alterate_query_tree( }, Operation::Query(q) => { if let QueryKind::Tolerant { typo, word } = &q.kind { - // if no typo is allowed we don't call word_typos(..), + // if no typo is allowed we don't call word_derivations function, // and directly create an Exact query if number_typos == 0 { *operation = Operation::Query(Query { @@ -190,7 +190,7 @@ fn alterate_query_tree( let words = if let Some(derivations) = typo_cache.get(&cache_key) { derivations.clone() } else { - let derivations = word_typos(word, q.prefix, typo, words_fst)?; + let derivations = word_derivations(word, q.prefix, typo, words_fst)?; typo_cache.insert(cache_key, derivations.clone()); derivations }; @@ -222,10 +222,6 @@ fn resolve_candidates<'t>( cache: &mut HashMap<(Operation, u8), RoaringBitmap>, ) -> anyhow::Result { - // FIXME add a cache - // FIXME keep the cache between typos iterations - // cache: HashMap<(&Operation, u8), RoaringBitmap>, - fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index d94bc8831..6046cc8d2 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,26 +1,18 @@ use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt; use std::time::Instant; -use anyhow::{bail, Context}; use fst::{IntoStreamer, Streamer, Set}; -use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use log::debug; use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; -use ordered_float::OrderedFloat; use roaring::bitmap::RoaringBitmap; -use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; -use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; -use crate::mdfs::Mdfs; -use crate::query_tokens::{query_tokens, QueryToken}; use crate::search::criteria::{Criterion, CriterionResult}; use crate::search::criteria::typo::Typo; -use crate::{Index, FieldId, DocumentId}; +use crate::{Index, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetIter}; @@ -69,198 +61,6 @@ impl<'a> Search<'a> { self } - /// Extracts the query words from the query string and returns the DFAs accordingly. - /// TODO introduce settings for the number of typos regarding the words lengths. - fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { - let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2); - - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let words: Vec<_> = query_tokens(tokens).collect(); - - let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let number_of_words = words.len(); - - words.into_iter().enumerate().map(|(i, word)| { - let (word, quoted) = match word { - QueryToken::Free(token) => (token.text().to_string(), token.text().len() <= 3), - QueryToken::Quoted(token) => (token.text().to_string(), true), - }; - let is_last = i + 1 == number_of_words; - let is_prefix = is_last && !ends_with_whitespace && !quoted; - let lev = match word.len() { - 0..=4 => if quoted { lev0 } else { lev0 }, - 5..=8 => if quoted { lev0 } else { lev1 }, - _ => if quoted { lev0 } else { lev2 }, - }; - - let dfa = if is_prefix { - lev.build_prefix_dfa(&word) - } else { - lev.build_dfa(&word) - }; - - (word, is_prefix, dfa) - }) - .collect() - } - - /// Fetch the words from the given FST related to the given DFAs along with - /// the associated documents ids. - fn fetch_words_docids( - &self, - fst: &fst::Set>, - dfas: Vec<(String, bool, DFA)>, - ) -> anyhow::Result, RoaringBitmap)>> - { - // A Vec storing all the derived words from the original query words, associated - // with the distance from the original word and the docids where the words appears. - let mut derived_words = Vec::<(HashMap::, RoaringBitmap)>::with_capacity(dfas.len()); - - for (_word, _is_prefix, dfa) in dfas { - - let mut acc_derived_words = HashMap::new(); - let mut unions_docids = RoaringBitmap::new(); - let mut stream = fst.search_with_state(&dfa).into_stream(); - while let Some((word, state)) = stream.next() { - - let word = std::str::from_utf8(word)?; - let docids = self.index.word_docids.get(self.rtxn, word)?.unwrap(); - let distance = dfa.distance(state); - unions_docids.union_with(&docids); - acc_derived_words.insert(word.to_string(), (distance.to_u8(), docids)); - } - derived_words.push((acc_derived_words, unions_docids)); - } - - Ok(derived_words) - } - - /// Returns the set of docids that contains all of the query words. - fn compute_candidates( - derived_words: &[(HashMap, RoaringBitmap)], - ) -> RoaringBitmap - { - // We sort the derived words by inverse popularity, this way intersections are faster. - let mut derived_words: Vec<_> = derived_words.iter().collect(); - derived_words.sort_unstable_by_key(|(_, docids)| docids.len()); - - // we do a union between all the docids of each of the derived words, - // we got N unions (the number of original query words), we then intersect them. - let mut candidates = RoaringBitmap::new(); - - for (i, (_, union_docids)) in derived_words.iter().enumerate() { - if i == 0 { - candidates = union_docids.clone(); - } else { - candidates.intersect_with(&union_docids); - } - } - - candidates - } - - fn facet_ordered( - &self, - field_id: FieldId, - facet_type: FacetType, - ascending: bool, - mut documents_ids: RoaringBitmap, - limit: usize, - ) -> anyhow::Result> - { - let mut output: Vec<_> = match facet_type { - FacetType::Float => { - if documents_ids.len() <= 1000 { - let db = self.index.field_id_docid_facet_values.remap_key_type::(); - let mut docids_values = Vec::with_capacity(documents_ids.len() as usize); - for docid in documents_ids.iter() { - let left = (field_id, docid, f64::MIN); - let right = (field_id, docid, f64::MAX); - let mut iter = db.range(self.rtxn, &(left..=right))?; - let entry = if ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), ())) = entry.transpose()? { - docids_values.push((docid, OrderedFloat(value))); - } - } - docids_values.sort_unstable_by_key(|(_, value)| *value); - let iter = docids_values.into_iter().map(|(id, _)| id); - if ascending { - iter.take(limit).collect() - } else { - iter.rev().take(limit).collect() - } - } else { - let facet_fn = if ascending { - FacetIter::::new_reducing - } else { - FacetIter::::new_reverse_reducing - }; - let mut limit_tmp = limit; - let mut output = Vec::new(); - for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? { - let (_val, docids) = result?; - limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); - output.push(docids); - if limit_tmp == 0 { break } - } - output.into_iter().flatten().take(limit).collect() - } - }, - FacetType::Integer => { - if documents_ids.len() <= 1000 { - let db = self.index.field_id_docid_facet_values.remap_key_type::(); - let mut docids_values = Vec::with_capacity(documents_ids.len() as usize); - for docid in documents_ids.iter() { - let left = (field_id, docid, i64::MIN); - let right = (field_id, docid, i64::MAX); - let mut iter = db.range(self.rtxn, &(left..=right))?; - let entry = if ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), ())) = entry.transpose()? { - docids_values.push((docid, value)); - } - } - docids_values.sort_unstable_by_key(|(_, value)| *value); - let iter = docids_values.into_iter().map(|(id, _)| id); - if ascending { - iter.take(limit).collect() - } else { - iter.rev().take(limit).collect() - } - } else { - let facet_fn = if ascending { - FacetIter::::new_reducing - } else { - FacetIter::::new_reverse_reducing - }; - let mut limit_tmp = limit; - let mut output = Vec::new(); - for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? { - let (_val, docids) = result?; - limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); - output.push(docids); - if limit_tmp == 0 { break } - } - output.into_iter().flatten().take(limit).collect() - } - }, - FacetType::String => bail!("criteria facet type must be a number"), - }; - - // if there isn't enough documents to return we try to complete that list - // with documents that are maybe not faceted under this field and therefore - // not returned by the previous facet iteration. - if output.len() < limit { - output.iter().for_each(|n| { documents_ids.remove(*n); }); - let remaining = documents_ids.iter().take(limit - output.len()); - output.extend(remaining); - } - - Ok(output) - } - pub fn execute(&self) -> anyhow::Result { // We create the query tree by spliting the query into tokens. let before = Instant::now(); @@ -320,101 +120,6 @@ impl<'a> Search<'a> { let found_words = HashSet::new(); Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids }) - - // let order_by_facet = { - // let criteria = self.index.criteria(self.rtxn)?; - // let result = criteria.into_iter().flat_map(|criterion| { - // match criterion { - // Criterion::Asc(fid) => Some((fid, true)), - // Criterion::Desc(fid) => Some((fid, false)), - // _ => None - // } - // }).next(); - // match result { - // Some((attr_name, is_ascending)) => { - // let field_id_map = self.index.fields_ids_map(self.rtxn)?; - // let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?; - // let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?; - // let ftype = *faceted_fields.get(&fid) - // .with_context(|| format!("{:?} not found in the faceted fields.", attr_name)) - // .expect("corrupted data: "); - // Some((fid, ftype, is_ascending)) - // }, - // None => None, - // } - // }; - - // let before = Instant::now(); - // let (candidates, derived_words) = match (facet_candidates, derived_words) { - // (Some(mut facet_candidates), Some(derived_words)) => { - // let words_candidates = Self::compute_candidates(&derived_words); - // facet_candidates.intersect_with(&words_candidates); - // (facet_candidates, derived_words) - // }, - // (None, Some(derived_words)) => { - // (Self::compute_candidates(&derived_words), derived_words) - // }, - // (Some(facet_candidates), None) => { - // // If the query is not set or results in no DFAs but - // // there is some facet conditions we return a placeholder. - // let documents_ids = match order_by_facet { - // Some((fid, ftype, is_ascending)) => { - // self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)? - // }, - // None => facet_candidates.iter().take(limit).collect(), - // }; - // return Ok(SearchResult { - // documents_ids, - // candidates: facet_candidates, - // ..Default::default() - // }) - // }, - // (None, None) => { - // // If the query is not set or results in no DFAs we return a placeholder. - // let all_docids = self.index.documents_ids(self.rtxn)?; - // let documents_ids = match order_by_facet { - // Some((fid, ftype, is_ascending)) => { - // self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)? - // }, - // None => all_docids.iter().take(limit).collect(), - // }; - // return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() }) - // }, - // }; - - // debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed()); - - // // The mana depth first search is a revised DFS that explore - // // solutions in the order of their proximities. - // let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone()); - // let mut documents = Vec::new(); - - // // We execute the Mdfs iterator until we find enough documents. - // while documents.iter().map(RoaringBitmap::len).sum::() < limit as u64 { - // match mdfs.next().transpose()? { - // Some((proximity, answer)) => { - // debug!("answer with a proximity of {}: {:?}", proximity, answer); - // documents.push(answer); - // }, - // None => break, - // } - // } - - // let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); - // let documents_ids = match order_by_facet { - // Some((fid, ftype, order)) => { - // let mut ordered_documents = Vec::new(); - // for documents_ids in documents { - // let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?; - // ordered_documents.push(docids); - // if ordered_documents.iter().map(Vec::len).sum::() >= limit { break } - // } - // ordered_documents.into_iter().flatten().take(limit).collect() - // }, - // None => documents.into_iter().flatten().take(limit).collect(), - // }; - - // Ok(SearchResult { found_words, candidates, documents_ids }) } } @@ -438,19 +143,17 @@ pub struct SearchResult { pub documents_ids: Vec, } -pub fn word_typos(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>) -> anyhow::Result> { - let dfa = { - let lev = match max_typo { - 0 => &LEVDIST0, - 1 => &LEVDIST1, - _ => &LEVDIST2, - }; +pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>) -> anyhow::Result> { + let lev = match max_typo { + 0 => &LEVDIST0, + 1 => &LEVDIST1, + _ => &LEVDIST2, + }; - if is_prefix { - lev.build_prefix_dfa(&word) - } else { - lev.build_dfa(&word) - } + let dfa = if is_prefix { + lev.build_prefix_dfa(&word) + } else { + lev.build_dfa(&word) }; let mut derived_words = Vec::new(); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 9b253350e..02f6dc0c8 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -303,7 +303,7 @@ fn fetch_words(tree: &Operation, fst: &fst::Set>) -> FetchedWords { match query.kind.clone() { QueryKind::Exact { word, .. } => vec![(word, query.prefix)], QueryKind::Tolerant { typo, word } => { - if let Ok(words) = super::word_typos(&word, query.prefix, typo, fst) { + if let Ok(words) = super::word_derivations(&word, query.prefix, typo, fst) { words.into_iter().map(|(w, _)| (w, query.prefix)).collect() } else { vec![(word, query.prefix)]