Simplify query_term module a bit

This commit is contained in:
Loïc Lecrenier 2023-04-04 15:01:42 +02:00
parent 3f13608002
commit 4129d657e2
2 changed files with 129 additions and 158 deletions

View File

@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::collections::hash_map::Entry; use std::collections::hash_map::Entry;
use std::hash::Hash; use std::hash::Hash;
@ -24,6 +25,8 @@ pub struct DatabaseCache<'ctx> {
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>, pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>, pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>, pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
} }
impl<'ctx> DatabaseCache<'ctx> { impl<'ctx> DatabaseCache<'ctx> {
fn get_value<'v, K1, KC>( fn get_value<'v, K1, KC>(
@ -49,6 +52,16 @@ impl<'ctx> DatabaseCache<'ctx> {
} }
} }
impl<'ctx> SearchContext<'ctx> { impl<'ctx> SearchContext<'ctx> {
pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
if let Some(fst) = self.db_cache.words_fst.clone() {
Ok(fst)
} else {
let fst = self.index.words_fst(self.txn)?;
self.db_cache.words_fst = Some(fst.clone());
Ok(fst)
}
}
/// Retrieve or insert the given value in the `word_docids` database. /// Retrieve or insert the given value in the `word_docids` database.
pub fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'ctx [u8]>> { pub fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'ctx [u8]>> {
DatabaseCache::get_value( DatabaseCache::get_value(

View File

@ -7,14 +7,14 @@ use charabia::{SeparatorKind, TokenKind};
use fst::automaton::Str; use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer}; use fst::{Automaton, IntoStreamer, Streamer};
use heed::types::DecodeIgnore; use heed::types::DecodeIgnore;
use heed::RoTxn; use heed::BytesDecode;
use itertools::Itertools; use itertools::Itertools;
use super::interner::{DedupInterner, Interned}; use super::interner::{DedupInterner, Interned};
use super::{limits, SearchContext}; use super::{limits, SearchContext};
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
use crate::search::{build_dfa, get_first}; use crate::search::{build_dfa, get_first};
use crate::{CboRoaringBitmapLenCodec, Index, Result, MAX_WORD_LENGTH}; use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH};
/// A phrase in the user's search query, consisting of several words /// A phrase in the user's search query, consisting of several words
/// that must appear side-by-side in the search results. /// that must appear side-by-side in the search results.
@ -191,18 +191,13 @@ impl QueryTermSubset {
&self, &self,
ctx: &mut SearchContext, ctx: &mut SearchContext,
) -> Result<BTreeSet<Interned<String>>> { ) -> Result<BTreeSet<Interned<String>>> {
let original = ctx.term_interner.get_mut(self.original);
let mut result = BTreeSet::default(); let mut result = BTreeSet::default();
// TODO: a compute_partially funtion // TODO: a compute_partially funtion
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
original.compute_fully_if_needed( self.original.compute_fully_if_needed(ctx)?;
ctx.index,
ctx.txn,
&mut ctx.word_interner,
&mut ctx.phrase_interner,
)?;
} }
let original = ctx.term_interner.get_mut(self.original);
if !self.zero_typo_subset.is_empty() { if !self.zero_typo_subset.is_empty() {
let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
&original.zero_typo; &original.zero_typo;
@ -245,18 +240,13 @@ impl QueryTermSubset {
Ok(result) Ok(result)
} }
pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> { pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
let original = ctx.term_interner.get_mut(self.original);
let mut result = BTreeSet::default(); let mut result = BTreeSet::default();
if !self.one_typo_subset.is_empty() { if !self.one_typo_subset.is_empty() {
// TODO: compute less than fully if possible // TODO: compute less than fully if possible
original.compute_fully_if_needed( self.original.compute_fully_if_needed(ctx)?;
ctx.index,
ctx.txn,
&mut ctx.word_interner,
&mut ctx.phrase_interner,
)?;
} }
let original = ctx.term_interner.get_mut(self.original);
let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
&original.zero_typo; &original.zero_typo;
@ -274,26 +264,23 @@ impl QueryTermSubset {
} }
} }
impl QueryTerm { impl Interned<QueryTerm> {
pub fn compute_fully_if_needed( pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
&mut self, let s = ctx.term_interner.get_mut(self);
index: &Index, if s.max_nbr_typos == 0 {
txn: &RoTxn, s.one_typo = Lazy::Init(OneTypoTerm::default());
word_interner: &mut DedupInterner<String>, s.two_typo = Lazy::Init(TwoTypoTerm::default());
phrase_interner: &mut DedupInterner<Phrase>, } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() {
) -> Result<()> { assert!(s.two_typo.is_uninit());
if self.max_nbr_typos == 0 { self.initialize_one_typo_subterm(ctx)?;
self.one_typo = Lazy::Init(OneTypoTerm::default()); let s = ctx.term_interner.get_mut(self);
self.two_typo = Lazy::Init(TwoTypoTerm::default()); assert!(s.one_typo.is_init());
} else if self.max_nbr_typos == 1 && self.one_typo.is_uninit() { s.two_typo = Lazy::Init(TwoTypoTerm::default());
assert!(self.two_typo.is_uninit()); } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
self.initialize_one_typo_subterm(index, txn, word_interner, phrase_interner)?; assert!(s.two_typo.is_uninit());
assert!(self.one_typo.is_init()); self.initialize_one_and_two_typo_subterm(ctx)?;
self.two_typo = Lazy::Init(TwoTypoTerm::default()); let s = ctx.term_interner.get_mut(self);
} else if self.max_nbr_typos > 1 && self.two_typo.is_uninit() { assert!(s.one_typo.is_init() && s.two_typo.is_init());
assert!(self.two_typo.is_uninit());
self.initialize_one_and_two_typo_subterm(index, txn, word_interner, phrase_interner)?;
assert!(self.one_typo.is_init() && self.two_typo.is_init());
} }
Ok(()) Ok(())
} }
@ -302,7 +289,7 @@ impl QueryTerm {
#[derive(Clone, PartialEq, Eq, Hash)] #[derive(Clone, PartialEq, Eq, Hash)]
pub struct QueryTerm { pub struct QueryTerm {
pub original: Interned<String>, pub original: Interned<String>,
pub is_multiple_words: bool, pub ngram_words: Option<Vec<Interned<String>>>,
pub max_nbr_typos: u8, pub max_nbr_typos: u8,
pub is_prefix: bool, pub is_prefix: bool,
pub zero_typo: ZeroTypoTerm, pub zero_typo: ZeroTypoTerm,
@ -363,39 +350,6 @@ impl TwoTypoTerm {
} }
impl QueryTerm { impl QueryTerm {
pub fn phrase(
word_interner: &mut DedupInterner<String>,
phrase_interner: &mut DedupInterner<Phrase>,
phrase: Phrase,
) -> Self {
Self {
original: word_interner.insert(phrase.description(word_interner)),
is_multiple_words: false,
max_nbr_typos: 0,
is_prefix: false,
zero_typo: ZeroTypoTerm {
phrase: Some(phrase_interner.insert(phrase)),
zero_typo: None,
prefix_of: BTreeSet::default(),
synonyms: BTreeSet::default(),
use_prefix_db: None,
},
one_typo: Lazy::Uninit,
two_typo: Lazy::Uninit,
}
}
pub fn empty(word_interner: &mut DedupInterner<String>, original: &str) -> Self {
Self {
original: word_interner.insert(original.to_owned()),
is_multiple_words: false,
is_prefix: false,
max_nbr_typos: 0,
zero_typo: <_>::default(),
one_typo: Lazy::Init(<_>::default()),
two_typo: Lazy::Init(<_>::default()),
}
}
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
let Lazy::Init(one_typo) = &self.one_typo else { let Lazy::Init(one_typo) = &self.one_typo else {
return false; return false;
@ -438,13 +392,13 @@ fn find_zero_typo_prefix_derivations(
} }
fn find_zero_one_typo_derivations( fn find_zero_one_typo_derivations(
ctx: &mut SearchContext,
word_interned: Interned<String>, word_interned: Interned<String>,
is_prefix: bool, is_prefix: bool,
fst: fst::Set<Cow<[u8]>>,
word_interner: &mut DedupInterner<String>,
mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>, mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
) -> Result<()> { ) -> Result<()> {
let word = word_interner.get(word_interned).to_owned(); let fst = ctx.get_words_fst()?;
let word = ctx.word_interner.get(word_interned).to_owned();
let word = word.as_str(); let word = word.as_str();
let dfa = build_dfa(word, 1, is_prefix); let dfa = build_dfa(word, 1, is_prefix);
@ -453,7 +407,7 @@ fn find_zero_one_typo_derivations(
while let Some((derived_word, state)) = stream.next() { while let Some((derived_word, state)) = stream.next() {
let derived_word = std::str::from_utf8(derived_word)?; let derived_word = std::str::from_utf8(derived_word)?;
let derived_word = word_interner.insert(derived_word.to_owned()); let derived_word = ctx.word_interner.insert(derived_word.to_owned());
let d = dfa.distance(state.1); let d = dfa.distance(state.1);
match d.to_u8() { match d.to_u8() {
0 => { 0 => {
@ -553,7 +507,17 @@ fn partially_initialized_term_from_word(
let word_interned = ctx.word_interner.insert(word.to_owned()); let word_interned = ctx.word_interner.insert(word.to_owned());
if word.len() > MAX_WORD_LENGTH { if word.len() > MAX_WORD_LENGTH {
return Ok(QueryTerm::empty(&mut ctx.word_interner, word)); return Ok({
QueryTerm {
original: ctx.word_interner.insert(word.to_owned()),
ngram_words: None,
is_prefix: false,
max_nbr_typos: 0,
zero_typo: <_>::default(),
one_typo: Lazy::Init(<_>::default()),
two_typo: Lazy::Init(<_>::default()),
}
});
} }
let fst = ctx.index.words_fst(ctx.txn)?; let fst = ctx.index.words_fst(ctx.txn)?;
@ -610,7 +574,7 @@ fn partially_initialized_term_from_word(
Ok(QueryTerm { Ok(QueryTerm {
original: word_interned, original: word_interned,
is_multiple_words: false, ngram_words: None,
max_nbr_typos: max_typo, max_nbr_typos: max_typo,
is_prefix, is_prefix,
zero_typo, zero_typo,
@ -619,42 +583,27 @@ fn partially_initialized_term_from_word(
}) })
} }
fn find_split_words( fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> {
index: &Index, if let Some((l, r)) = split_best_frequency(ctx, word)? {
txn: &RoTxn, Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
word_interner: &mut DedupInterner<String>, } else {
phrase_interner: &mut DedupInterner<Phrase>, Ok(None)
word: &str, }
) -> Result<Option<Interned<Phrase>>> {
let split_words = split_best_frequency(index, txn, word)?.map(|(l, r)| {
phrase_interner.insert(Phrase {
words: vec![Some(word_interner.insert(l)), Some(word_interner.insert(r))],
})
});
Ok(split_words)
} }
impl QueryTerm { impl Interned<QueryTerm> {
fn initialize_one_typo_subterm( fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
&mut self, let self_mut = ctx.term_interner.get_mut(self);
index: &Index, let QueryTerm { original, is_prefix, one_typo, .. } = self_mut;
txn: &RoTxn, let original = *original;
word_interner: &mut DedupInterner<String>, let is_prefix = *is_prefix;
phrase_interner: &mut DedupInterner<Phrase>, // let original_str = ctx.word_interner.get(*original).to_owned();
) -> Result<()> {
let QueryTerm { original, is_prefix, one_typo, .. } = self;
let original_str = word_interner.get(*original).to_owned();
if one_typo.is_init() { if one_typo.is_init() {
return Ok(()); return Ok(());
} }
let mut one_typo_words = BTreeSet::new(); let mut one_typo_words = BTreeSet::new();
find_zero_one_typo_derivations( find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
*original,
*is_prefix,
index.words_fst(txn)?,
word_interner,
|derived_word, nbr_typos| {
match nbr_typos { match nbr_typos {
ZeroOrOneTypo::Zero => {} ZeroOrOneTypo::Zero => {}
ZeroOrOneTypo::One => { ZeroOrOneTypo::One => {
@ -666,25 +615,20 @@ impl QueryTerm {
} }
} }
Ok(ControlFlow::Continue(())) Ok(ControlFlow::Continue(()))
}, })?;
)?; let original_str = ctx.word_interner.get(original).to_owned();
let split_words = let split_words = find_split_words(ctx, original_str.as_str())?;
find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?;
let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
self.one_typo = Lazy::Init(one_typo); let self_mut = ctx.term_interner.get_mut(self);
self_mut.one_typo = Lazy::Init(one_typo);
Ok(()) Ok(())
} }
fn initialize_one_and_two_typo_subterm( fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
&mut self, let self_mut = ctx.term_interner.get_mut(self);
index: &Index, let QueryTerm { original, is_prefix, two_typo, .. } = self_mut;
txn: &RoTxn, let original_str = ctx.word_interner.get(*original).to_owned();
word_interner: &mut DedupInterner<String>,
phrase_interner: &mut DedupInterner<Phrase>,
) -> Result<()> {
let QueryTerm { original, is_prefix, two_typo, .. } = self;
let original_str = word_interner.get(*original).to_owned();
if two_typo.is_init() { if two_typo.is_init() {
return Ok(()); return Ok(());
} }
@ -694,8 +638,8 @@ impl QueryTerm {
find_zero_one_two_typo_derivations( find_zero_one_two_typo_derivations(
*original, *original,
*is_prefix, *is_prefix,
index.words_fst(txn)?, ctx.index.words_fst(ctx.txn)?,
word_interner, &mut ctx.word_interner,
|derived_word, nbr_typos| { |derived_word, nbr_typos| {
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
@ -719,14 +663,15 @@ impl QueryTerm {
Ok(ControlFlow::Continue(())) Ok(ControlFlow::Continue(()))
}, },
)?; )?;
let split_words = let split_words = find_split_words(ctx, original_str.as_str())?;
find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?; let self_mut = ctx.term_interner.get_mut(self);
let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words }; let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
let two_typo = TwoTypoTerm { two_typos: two_typo_words }; let two_typo = TwoTypoTerm { two_typos: two_typo_words };
self.one_typo = Lazy::Init(one_typo); self_mut.one_typo = Lazy::Init(one_typo);
self.two_typo = Lazy::Init(two_typo); self_mut.two_typo = Lazy::Init(two_typo);
Ok(()) Ok(())
} }
@ -737,38 +682,37 @@ impl QueryTerm {
/// ///
/// Return `None` if the original word cannot be split. /// Return `None` if the original word cannot be split.
fn split_best_frequency( fn split_best_frequency(
index: &Index, ctx: &mut SearchContext,
txn: &RoTxn,
original: &str, original: &str,
) -> Result<Option<(String, String)>> { ) -> Result<Option<(Interned<String>, Interned<String>)>> {
let chars = original.char_indices().skip(1); let chars = original.char_indices().skip(1);
let mut best = None; let mut best = None;
for (i, _) in chars { for (i, _) in chars {
let (left, right) = original.split_at(i); let (left, right) = original.split_at(i);
let left = ctx.word_interner.insert(left.to_owned());
let right = ctx.word_interner.insert(right.to_owned());
let key = (1, left, right); if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? {
let frequency = index let frequency =
.word_pair_proximity_docids CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?;
.remap_data_type::<CboRoaringBitmapLenCodec>() if best.map_or(true, |(old, _, _)| frequency > old) {
.get(txn, &key)?
.unwrap_or(0);
if frequency != 0 && best.map_or(true, |(old, _, _)| frequency > old) {
best = Some((frequency, left, right)); best = Some((frequency, left, right));
} }
} }
Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned())))
} }
impl QueryTerm { Ok(best.map(|(_, left, right)| (left, right)))
}
impl Interned<QueryTerm> {
/// Return the original word from the given query term /// Return the original word from the given query term
pub fn original_single_word(&self) -> Option<Interned<String>> { pub fn original_single_word(self, ctx: &SearchContext) -> Option<Interned<String>> {
if self.is_multiple_words { let self_ = ctx.term_interner.get(self);
if self_.ngram_words.is_some() {
None None
} else { } else {
Some(self.original) Some(self_.original)
} }
} }
} }
@ -824,11 +768,25 @@ impl PhraseBuilder {
return None; return None;
} }
Some(LocatedQueryTerm { Some(LocatedQueryTerm {
value: ctx.term_interner.push(QueryTerm::phrase( value: ctx.term_interner.push({
&mut ctx.word_interner, let phrase = Phrase { words: self.words };
&mut ctx.phrase_interner, let phrase_desc = phrase.description(&ctx.word_interner);
Phrase { words: self.words }, QueryTerm {
)), original: ctx.word_interner.insert(phrase_desc),
ngram_words: None,
max_nbr_typos: 0,
is_prefix: false,
zero_typo: ZeroTypoTerm {
phrase: Some(ctx.phrase_interner.insert(phrase)),
zero_typo: None,
prefix_of: BTreeSet::default(),
synonyms: BTreeSet::default(),
use_prefix_db: None,
},
one_typo: Lazy::Uninit,
two_typo: Lazy::Uninit,
}
}),
positions: self.start..=self.end, positions: self.start..=self.end,
}) })
} }
@ -1001,7 +959,7 @@ pub fn make_ngram(
} }
let mut words_interned = vec![]; let mut words_interned = vec![];
for term in terms { for term in terms {
if let Some(original_term_word) = ctx.term_interner.get(term.value).original_single_word() { if let Some(original_term_word) = term.value.original_single_word(ctx) {
words_interned.push(original_term_word); words_interned.push(original_term_word);
} else { } else {
return Ok(None); return Ok(None);
@ -1036,7 +994,7 @@ pub fn make_ngram(
let term = QueryTerm { let term = QueryTerm {
original: ngram_str_interned, original: ngram_str_interned,
is_multiple_words: true, ngram_words: Some(words_interned),
is_prefix, is_prefix,
max_nbr_typos, max_nbr_typos,
zero_typo: term.zero_typo, zero_typo: term.zero_typo,