diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs
index 470983017..889e811ad 100644
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@@ -68,8 +68,8 @@ pub trait SearchLogger<Q> {
         paths: &[Vec<Interned<ProximityCondition>>],
         dead_ends_cache: &DeadEndsCache<ProximityCondition>,
         universe: &RoaringBitmap,
-        distances: &MappedInterner<QueryNode, Vec<u16>>,
-        cost: u16,
+        distances: &MappedInterner<QueryNode, Vec<u64>>,
+        cost: u64,
     );

     /// Logs the internal state of the typo ranking rule
@@ -79,8 +79,8 @@ pub trait SearchLogger<Q> {
         paths: &[Vec<Interned<TypoCondition>>],
         dead_ends_cache: &DeadEndsCache<TypoCondition>,
         universe: &RoaringBitmap,
-        distances: &MappedInterner<QueryNode, Vec<u16>>,
-        cost: u16,
+        distances: &MappedInterner<QueryNode, Vec<u64>>,
+        cost: u64,
     );
 }

@@ -139,8 +139,8 @@ impl<Q> SearchLogger<Q> for DefaultSearchLogger {
         _paths_map: &[Vec<Interned<ProximityCondition>>],
         _dead_ends_cache: &DeadEndsCache<ProximityCondition>,
         _universe: &RoaringBitmap,
-        _distances: &MappedInterner<QueryNode, Vec<u16>>,
-        _cost: u16,
+        _distances: &MappedInterner<QueryNode, Vec<u64>>,
+        _cost: u64,
     ) {
     }

@@ -150,8 +150,8 @@ impl<Q> SearchLogger<Q> for DefaultSearchLogger {
         _paths: &[Vec<Interned<TypoCondition>>],
         _dead_ends_cache: &DeadEndsCache<TypoCondition>,
         _universe: &RoaringBitmap,
-        _distances: &MappedInterner<QueryNode, Vec<u16>>,
-        _cost: u16,
+        _distances: &MappedInterner<QueryNode, Vec<u64>>,
+        _cost: u64,
     ) {
     }
 }
diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
index c27051de0..de02b67a4 100644
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -1,21 +1,17 @@
-// use std::collections::HashSet;
-use std::fmt::Write;
-use std::iter::FromIterator;
-
-use fxhash::FxHashSet;
 use roaring::RoaringBitmap;

-use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
+use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
 use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
 use crate::search::new::logger::SearchLogger;
-use crate::search::new::query_graph::QueryNodeData;
-use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm};
+use crate::search::new::query_term::{LocatedQueryTermSubset, NTypoTermSubset};
+use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;

 #[derive(Clone, PartialEq, Eq, Hash)]
 pub struct TypoCondition {
-    term: Interned<QueryTerm>,
+    term: LocatedQueryTermSubset,
+    nbr_typos: u8,
 }

 pub enum TypoGraph {}
@@ -23,121 +19,63 @@ pub enum TypoGraph {}
 impl RankingRuleGraphTrait for TypoGraph {
     type Condition = TypoCondition;

-    fn resolve_condition<'db_cache>(
+    fn resolve_condition(
         ctx: &mut SearchContext,
         condition: &Self::Condition,
         universe: &RoaringBitmap,
-    ) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
-        let SearchContext {
-            index,
-            txn,
-            db_cache,
-            word_interner,
-            phrase_interner,
-            term_interner,
-            term_docids: query_term_docids,
-        } = ctx;
+    ) -> Result<ComputedCondition> {
+        let TypoCondition { term, .. } = condition;
+        // maybe compute_query_term_subset_docids should accept a universe as argument
+        let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
+        docids &= universe;

-        let docids = universe
-            & query_term_docids.get_query_term_docids(
-                index,
-                txn,
-                db_cache,
-                word_interner,
-                term_interner,
-                phrase_interner,
-                condition.term,
-            )?;
-
-        let term = term_interner.get(condition.term);
-        Ok((
+        Ok(ComputedCondition {
             docids,
-            FxHashSet::from_iter(term.all_single_words_except_prefix_db()),
-            FxHashSet::from_iter(term.all_phrases()),
-        ))
+            universe_len: universe.len(),
+            start_term_subset: None,
+            end_term_subset: term.clone(),
+        })
     }

     fn build_edges(
         ctx: &mut SearchContext,
         conditions_interner: &mut DedupInterner<Self::Condition>,
-        _from_node: &QueryNode,
-        to_node: &QueryNode,
-    ) -> Result<Vec<(u8, Option<Interned<Self::Condition>>)>> {
-        let SearchContext { term_interner, .. } = ctx;
-        match &to_node.data {
-            QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
-                let mut edges = vec![];
-                // Ngrams have a base typo cost
-                // 2-gram -> equivalent to 1 typo
-                // 3-gram -> equivalent to 2 typos
-                let base_cost = positions.len().min(2) as u8;
+        _from: Option<&LocatedQueryTermSubset>,
+        to_term: &LocatedQueryTermSubset,
+    ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
+        let term = to_term; // LocatedQueryTermSubset { term_subset, positions: _, term_ids } = to_term;
+        let original_full_term = ctx.term_interner.get(term.term_subset.original);

-                for nbr_typos in 0..=2 {
-                    let term = term_interner.get(*value).clone();
-                    let new_term = match nbr_typos {
-                        0 => QueryTerm {
-                            original: term.original,
-                            is_prefix: term.is_prefix,
-                            zero_typo: term.zero_typo,
-                            prefix_of: term.prefix_of,
-                            // TOOD: debatable
-                            synonyms: term.synonyms,
-                            split_words: None,
-                            one_typo: Box::new([]),
-                            two_typos: Box::new([]),
-                            use_prefix_db: term.use_prefix_db,
-                            is_ngram: term.is_ngram,
-                            phrase: term.phrase,
-                        },
-                        1 => {
-                            // What about split words and synonyms here?
-                            QueryTerm {
-                                original: term.original,
-                                is_prefix: false,
-                                zero_typo: None,
-                                prefix_of: Box::new([]),
-                                synonyms: Box::new([]),
-                                split_words: term.split_words,
-                                one_typo: term.one_typo,
-                                two_typos: Box::new([]),
-                                use_prefix_db: None, // false because all items from use_prefix_db have 0 typos
-                                is_ngram: term.is_ngram,
-                                phrase: None,
-                            }
-                        }
-                        2 => {
-                            // What about split words and synonyms here?
-                            QueryTerm {
-                                original: term.original,
-                                zero_typo: None,
-                                is_prefix: false,
-                                prefix_of: Box::new([]),
-                                synonyms: Box::new([]),
-                                split_words: None,
-                                one_typo: Box::new([]),
-                                two_typos: term.two_typos,
-                                use_prefix_db: None, // false because all items from use_prefix_db have 0 typos
-                                is_ngram: term.is_ngram,
-                                phrase: None,
-                            }
-                        }
-                        _ => panic!(),
-                    };
-                    if !new_term.is_empty() {
-                        edges.push((
-                            nbr_typos as u8 + base_cost,
-                            Some(
-                                conditions_interner
-                                    .insert(TypoCondition { term: term_interner.push(new_term) }),
-                            ),
-                        ))
-                    }
+        let mut edges = vec![];
+        // Ngrams have a base typo cost
+        // 2-gram -> equivalent to 1 typo
+        // 3-gram -> equivalent to 2 typos
+        let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
+
+        for nbr_typos in 0..=original_full_term.max_nbr_typos {
+            let mut term = term.clone();
+            match nbr_typos {
+                0 => {
+                    term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
                 }
-                Ok(edges)
-            }
-            QueryNodeData::End => Ok(vec![(0, None)]),
-            QueryNodeData::Deleted | QueryNodeData::Start => panic!(),
+                1 => {
+                    term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
+                }
+                2 => {
+                    term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
+                }
+                _ => panic!(),
+            };
+
+            edges.push((
+                nbr_typos as u32 + base_cost,
+                conditions_interner.insert(TypoCondition { term, nbr_typos }),
+            ));
         }
+        Ok(edges)
     }

     fn log_state(
@@ -145,81 +83,18 @@ impl RankingRuleGraphTrait for TypoGraph {
         paths: &[Vec<Interned<TypoCondition>>],
         dead_ends_cache: &DeadEndsCache<TypoCondition>,
         universe: &RoaringBitmap,
-        distances: &MappedInterner<QueryNode, Vec<u16>>,
-        cost: u16,
+        distances: &MappedInterner<QueryNode, Vec<u64>>,
+        cost: u64,
         logger: &mut dyn SearchLogger<QueryGraph>,
     ) {
         logger.log_typo_state(graph, paths, dead_ends_cache, universe, distances, cost);
     }

     fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
-        let TypoCondition { term } = condition;
-        let term = ctx.term_interner.get(*term);
-        let QueryTerm {
-            original: _,
-            is_ngram: _,
-            is_prefix: _,
-            phrase,
-            zero_typo,
-            prefix_of,
-            synonyms,
-            split_words,
-            one_typo,
-            two_typos,
-            use_prefix_db,
-        } = term;
-        let mut s = String::new();
-        if let Some(phrase) = phrase {
-            let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
-            writeln!(&mut s, "\"{phrase}\" : phrase").unwrap();
-        }
-        if let Some(w) = zero_typo {
-            let w = ctx.word_interner.get(*w);
-            writeln!(&mut s, "\"{w}\" : 0 typo").unwrap();
-        }
-        for w in prefix_of.iter() {
-            let w = ctx.word_interner.get(*w);
-            writeln!(&mut s, "\"{w}\" : prefix").unwrap();
-        }
-        for w in one_typo.iter() {
-            let w = ctx.word_interner.get(*w);
-            writeln!(&mut s, "\"{w}\" : 1 typo").unwrap();
-        }
-        for w in two_typos.iter() {
-            let w = ctx.word_interner.get(*w);
-            writeln!(&mut s, "\"{w}\" : 2 typos").unwrap();
-        }
-        if let Some(phrase) = split_words {
-            let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
-            writeln!(&mut s, "\"{phrase}\" : split words").unwrap();
-        }
-        for phrase in synonyms.iter() {
-            let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
-            writeln!(&mut s, "\"{phrase}\" : synonym").unwrap();
-        }
-        if let Some(w) = use_prefix_db {
-            let w = ctx.word_interner.get(*w);
-            writeln!(&mut s, "\"{w}\" : use prefix db").unwrap();
-        }
+        let TypoCondition { term, nbr_typos } = condition;
+        let original_term = ctx.term_interner.get(term.term_subset.original);
+        let original = ctx.word_interner.get(original_term.original);

-        Ok(s)
+        Ok(format!("{original}: {nbr_typos}"))
     }
-
-    // fn words_used_by_condition<'ctx>(
-    //     ctx: &mut SearchContext<'ctx>,
-    //     condition: &Self::Condition,
-    // ) -> Result<HashSet<Interned<String>>> {
-    //     let TypoCondition { term, .. } = condition;
-    //     let term = ctx.term_interner.get(*term);
-    //     Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
-    // }
-
-    // fn phrases_used_by_condition<'ctx>(
-    //     ctx: &mut SearchContext<'ctx>,
-    //     condition: &Self::Condition,
-    // ) -> Result<HashSet<Interned<Phrase>>> {
-    //     let TypoCondition { term, .. } = condition;
-    //     let term = ctx.term_interner.get(*term);
-    //     Ok(HashSet::from_iter(term.all_phrases()))
-    // }
 }