mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Adjust some algorithms using DBs of word pair proximities
This commit is contained in:
parent
072b576514
commit
18d578dfc4
@ -226,6 +226,7 @@ fn resolve_state(
|
|||||||
}
|
}
|
||||||
// compute intersection on pair of words with a proximity of 0.
|
// compute intersection on pair of words with a proximity of 0.
|
||||||
Phrase(phrase) => {
|
Phrase(phrase) => {
|
||||||
|
// TODO: use resolve_phrase here
|
||||||
let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1));
|
let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1));
|
||||||
for words in phrase.windows(2) {
|
for words in phrase.windows(2) {
|
||||||
if let [left, right] = words {
|
if let [left, right] = words {
|
||||||
|
@ -71,6 +71,7 @@ pub trait Context<'c> {
|
|||||||
fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||||
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||||
fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||||
|
|
||||||
fn word_pair_proximity_docids(
|
fn word_pair_proximity_docids(
|
||||||
&self,
|
&self,
|
||||||
left: &str,
|
left: &str,
|
||||||
@ -83,6 +84,12 @@ pub trait Context<'c> {
|
|||||||
right: &str,
|
right: &str,
|
||||||
proximity: u8,
|
proximity: u8,
|
||||||
) -> heed::Result<Option<RoaringBitmap>>;
|
) -> heed::Result<Option<RoaringBitmap>>;
|
||||||
|
fn prefix_word_pair_proximity_docids(
|
||||||
|
&self,
|
||||||
|
prefix: &str,
|
||||||
|
right: &str,
|
||||||
|
proximity: u8,
|
||||||
|
) -> heed::Result<Option<RoaringBitmap>>;
|
||||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
|
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
|
||||||
fn in_prefix_cache(&self, word: &str) -> bool;
|
fn in_prefix_cache(&self, word: &str) -> bool;
|
||||||
fn docid_words_positions(
|
fn docid_words_positions(
|
||||||
@ -111,6 +118,68 @@ pub struct CriteriaBuilder<'t> {
|
|||||||
words_prefixes_fst: fst::Set<Cow<'t, [u8]>>,
|
words_prefixes_fst: fst::Set<Cow<'t, [u8]>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return the docids for the following word pairs and proximities using [`Context::word_pair_proximity_docids`].
|
||||||
|
/// * `left, right, prox` (rightward proximity)
|
||||||
|
/// * `right, left, prox-1` (leftward proximity)
|
||||||
|
///
|
||||||
|
/// ## Example
|
||||||
|
/// For a document with the text `the good fox eats the apple`, we have:
|
||||||
|
/// * `rightward_proximity(the, eats) = 3`
|
||||||
|
/// * `leftward_proximity(eats, the) = 1`
|
||||||
|
///
|
||||||
|
/// So both the expressions `word_pair_overall_proximity_docids(ctx, the, eats, 3)`
|
||||||
|
/// and `word_pair_overall_proximity_docids(ctx, the, eats, 2)` would return a bitmap containing
|
||||||
|
/// the id of this document.
|
||||||
|
fn word_pair_overall_proximity_docids(
|
||||||
|
ctx: &dyn Context,
|
||||||
|
left: &str,
|
||||||
|
right: &str,
|
||||||
|
prox: u8,
|
||||||
|
) -> heed::Result<Option<RoaringBitmap>> {
|
||||||
|
let rightward = ctx.word_pair_proximity_docids(left, right, prox)?;
|
||||||
|
let leftward =
|
||||||
|
if prox > 1 { ctx.word_pair_proximity_docids(right, left, prox - 1)? } else { None };
|
||||||
|
if let Some(mut all) = rightward {
|
||||||
|
if let Some(leftward) = leftward {
|
||||||
|
all |= leftward;
|
||||||
|
}
|
||||||
|
Ok(Some(all))
|
||||||
|
} else {
|
||||||
|
Ok(leftward)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This function works identically to [`word_pair_overall_proximity_docids`] except that the
|
||||||
|
/// right word is replaced by a prefix string.
|
||||||
|
///
|
||||||
|
/// It will return None if no documents were found or if the prefix does not exist in the
|
||||||
|
/// `word_prefix_pair_proximity_docids` database.
|
||||||
|
fn word_prefix_pair_overall_proximity_docids(
|
||||||
|
ctx: &dyn Context,
|
||||||
|
left: &str,
|
||||||
|
prefix: &str,
|
||||||
|
proximity: u8,
|
||||||
|
) -> heed::Result<Option<RoaringBitmap>> {
|
||||||
|
// We retrieve the docids for the original and swapped word pairs:
|
||||||
|
// A: word1 prefix2 proximity
|
||||||
|
// B: prefix2 word1 proximity-1
|
||||||
|
let rightward = ctx.word_prefix_pair_proximity_docids(left, prefix, proximity)?;
|
||||||
|
|
||||||
|
let leftward = if proximity > 1 {
|
||||||
|
ctx.prefix_word_pair_proximity_docids(prefix, left, proximity - 1)?
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
if let Some(mut all) = rightward {
|
||||||
|
if let Some(leftward) = leftward {
|
||||||
|
all |= leftward;
|
||||||
|
}
|
||||||
|
Ok(Some(all))
|
||||||
|
} else {
|
||||||
|
Ok(leftward)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
||||||
fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
|
fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
|
||||||
self.index.documents_ids(self.rtxn)
|
self.index.documents_ids(self.rtxn)
|
||||||
@ -138,18 +207,24 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
|||||||
right: &str,
|
right: &str,
|
||||||
proximity: u8,
|
proximity: u8,
|
||||||
) -> heed::Result<Option<RoaringBitmap>> {
|
) -> heed::Result<Option<RoaringBitmap>> {
|
||||||
let key = (proximity, left, right);
|
self.index.word_pair_proximity_docids.get(self.rtxn, &(proximity, left, right))
|
||||||
self.index.word_pair_proximity_docids.get(self.rtxn, &key)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn word_prefix_pair_proximity_docids(
|
fn word_prefix_pair_proximity_docids(
|
||||||
&self,
|
&self,
|
||||||
left: &str,
|
left: &str,
|
||||||
|
prefix: &str,
|
||||||
|
proximity: u8,
|
||||||
|
) -> heed::Result<Option<RoaringBitmap>> {
|
||||||
|
self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &(proximity, left, prefix))
|
||||||
|
}
|
||||||
|
fn prefix_word_pair_proximity_docids(
|
||||||
|
&self,
|
||||||
|
prefix: &str,
|
||||||
right: &str,
|
right: &str,
|
||||||
proximity: u8,
|
proximity: u8,
|
||||||
) -> heed::Result<Option<RoaringBitmap>> {
|
) -> heed::Result<Option<RoaringBitmap>> {
|
||||||
let key = (proximity, left, right);
|
self.index.prefix_word_pair_proximity_docids.get(self.rtxn, &(proximity, prefix, right))
|
||||||
self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
|
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
|
||||||
@ -353,18 +428,35 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBit
|
|||||||
let mut candidates = RoaringBitmap::new();
|
let mut candidates = RoaringBitmap::new();
|
||||||
let mut first_iter = true;
|
let mut first_iter = true;
|
||||||
let winsize = phrase.len().min(7);
|
let winsize = phrase.len().min(7);
|
||||||
|
|
||||||
for win in phrase.windows(winsize) {
|
for win in phrase.windows(winsize) {
|
||||||
// Get all the documents with the matching distance for each word pair.
|
// Get all the documents with the matching distance for each word pair.
|
||||||
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
|
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
|
||||||
for (offset, s1) in win.iter().enumerate() {
|
for (offset, s1) in win.iter().enumerate() {
|
||||||
for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
|
for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
|
||||||
match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
|
if s1 == s2 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if dist == 0 {
|
||||||
|
match ctx.word_pair_proximity_docids(s1, s2, 1)? {
|
||||||
Some(m) => bitmaps.push(m),
|
Some(m) => bitmaps.push(m),
|
||||||
// If there is no document for this distance, there will be no
|
// If there is no document for this pair, there will be no
|
||||||
// results for the phrase query.
|
// results for the phrase query.
|
||||||
None => return Ok(RoaringBitmap::new()),
|
None => return Ok(RoaringBitmap::new()),
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
let mut bitmap = RoaringBitmap::new();
|
||||||
|
for dist in 0..=dist {
|
||||||
|
match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
|
||||||
|
Some(m) => bitmap |= m,
|
||||||
|
None => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if bitmap.is_empty() {
|
||||||
|
return Ok(bitmap);
|
||||||
|
} else {
|
||||||
|
bitmaps.push(bitmap);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -387,7 +479,7 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBit
|
|||||||
Ok(candidates)
|
Ok(candidates)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
|
fn all_word_pair_overall_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
|
||||||
ctx: &dyn Context,
|
ctx: &dyn Context,
|
||||||
left_words: &[(T, u8)],
|
left_words: &[(T, u8)],
|
||||||
right_words: &[(U, u8)],
|
right_words: &[(U, u8)],
|
||||||
@ -396,8 +488,8 @@ fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
|
|||||||
let mut docids = RoaringBitmap::new();
|
let mut docids = RoaringBitmap::new();
|
||||||
for (left, _l_typo) in left_words {
|
for (left, _l_typo) in left_words {
|
||||||
for (right, _r_typo) in right_words {
|
for (right, _r_typo) in right_words {
|
||||||
let current_docids = ctx
|
let current_docids =
|
||||||
.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?
|
word_pair_overall_proximity_docids(ctx, left.as_ref(), right.as_ref(), proximity)?
|
||||||
.unwrap_or_default();
|
.unwrap_or_default();
|
||||||
docids |= current_docids;
|
docids |= current_docids;
|
||||||
}
|
}
|
||||||
@ -472,7 +564,8 @@ fn query_pair_proximity_docids(
|
|||||||
match (&left.kind, &right.kind) {
|
match (&left.kind, &right.kind) {
|
||||||
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
|
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
|
||||||
if prefix {
|
if prefix {
|
||||||
match ctx.word_prefix_pair_proximity_docids(
|
match word_prefix_pair_overall_proximity_docids(
|
||||||
|
ctx,
|
||||||
left.as_str(),
|
left.as_str(),
|
||||||
right.as_str(),
|
right.as_str(),
|
||||||
proximity,
|
proximity,
|
||||||
@ -480,7 +573,12 @@ fn query_pair_proximity_docids(
|
|||||||
Some(docids) => Ok(docids),
|
Some(docids) => Ok(docids),
|
||||||
None => {
|
None => {
|
||||||
let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
|
let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
|
||||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
all_word_pair_overall_proximity_docids(
|
||||||
|
ctx,
|
||||||
|
&[(left, 0)],
|
||||||
|
&r_words,
|
||||||
|
proximity,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -495,7 +593,8 @@ fn query_pair_proximity_docids(
|
|||||||
if prefix {
|
if prefix {
|
||||||
let mut docids = RoaringBitmap::new();
|
let mut docids = RoaringBitmap::new();
|
||||||
for (left, _) in l_words {
|
for (left, _) in l_words {
|
||||||
let current_docids = match ctx.word_prefix_pair_proximity_docids(
|
let current_docids = match word_prefix_pair_overall_proximity_docids(
|
||||||
|
ctx,
|
||||||
left.as_str(),
|
left.as_str(),
|
||||||
right.as_str(),
|
right.as_str(),
|
||||||
proximity,
|
proximity,
|
||||||
@ -504,19 +603,24 @@ fn query_pair_proximity_docids(
|
|||||||
None => {
|
None => {
|
||||||
let r_words =
|
let r_words =
|
||||||
word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
|
word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
|
||||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
all_word_pair_overall_proximity_docids(
|
||||||
|
ctx,
|
||||||
|
&[(left, 0)],
|
||||||
|
&r_words,
|
||||||
|
proximity,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}?;
|
}?;
|
||||||
docids |= current_docids;
|
docids |= current_docids;
|
||||||
}
|
}
|
||||||
Ok(docids)
|
Ok(docids)
|
||||||
} else {
|
} else {
|
||||||
all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
|
all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
(QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => {
|
(QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => {
|
||||||
let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?;
|
let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?;
|
||||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||||
}
|
}
|
||||||
(
|
(
|
||||||
QueryKind::Tolerant { typo: l_typo, word: left },
|
QueryKind::Tolerant { typo: l_typo, word: left },
|
||||||
@ -525,7 +629,7 @@ fn query_pair_proximity_docids(
|
|||||||
let l_words =
|
let l_words =
|
||||||
word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned();
|
word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned();
|
||||||
let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?;
|
let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?;
|
||||||
all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity)
|
all_word_pair_overall_proximity_docids(ctx, &l_words, &r_words, proximity)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -552,6 +656,7 @@ pub mod test {
|
|||||||
exact_word_prefix_docids: HashMap<String, RoaringBitmap>,
|
exact_word_prefix_docids: HashMap<String, RoaringBitmap>,
|
||||||
word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
|
word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
|
||||||
word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
|
word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
|
||||||
|
prefix_word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
|
||||||
docid_words: HashMap<u32, Vec<String>>,
|
docid_words: HashMap<u32, Vec<String>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -588,13 +693,22 @@ pub mod test {
|
|||||||
|
|
||||||
fn word_prefix_pair_proximity_docids(
|
fn word_prefix_pair_proximity_docids(
|
||||||
&self,
|
&self,
|
||||||
left: &str,
|
word: &str,
|
||||||
right: &str,
|
prefix: &str,
|
||||||
proximity: u8,
|
proximity: u8,
|
||||||
) -> heed::Result<Option<RoaringBitmap>> {
|
) -> heed::Result<Option<RoaringBitmap>> {
|
||||||
let key = (left.to_string(), right.to_string(), proximity.into());
|
let key = (word.to_string(), prefix.to_string(), proximity.into());
|
||||||
Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned())
|
Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned())
|
||||||
}
|
}
|
||||||
|
fn prefix_word_pair_proximity_docids(
|
||||||
|
&self,
|
||||||
|
prefix: &str,
|
||||||
|
word: &str,
|
||||||
|
proximity: u8,
|
||||||
|
) -> heed::Result<Option<RoaringBitmap>> {
|
||||||
|
let key = (prefix.to_string(), word.to_string(), proximity.into());
|
||||||
|
Ok(self.prefix_word_pair_proximity_docids.get(&key).cloned())
|
||||||
|
}
|
||||||
|
|
||||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
|
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
|
||||||
&self.words_fst
|
&self.words_fst
|
||||||
@ -708,6 +822,8 @@ pub mod test {
|
|||||||
|
|
||||||
let mut word_pair_proximity_docids = HashMap::new();
|
let mut word_pair_proximity_docids = HashMap::new();
|
||||||
let mut word_prefix_pair_proximity_docids = HashMap::new();
|
let mut word_prefix_pair_proximity_docids = HashMap::new();
|
||||||
|
let mut prefix_word_pair_proximity_docids = HashMap::new();
|
||||||
|
|
||||||
for (lword, lcandidates) in &word_docids {
|
for (lword, lcandidates) in &word_docids {
|
||||||
for (rword, rcandidates) in &word_docids {
|
for (rword, rcandidates) in &word_docids {
|
||||||
if lword == rword {
|
if lword == rword {
|
||||||
@ -740,15 +856,19 @@ pub mod test {
|
|||||||
let lposition = docid_words.iter().position(|w| w == lword).unwrap();
|
let lposition = docid_words.iter().position(|w| w == lword).unwrap();
|
||||||
let rposition =
|
let rposition =
|
||||||
docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
|
docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
|
||||||
let key = if lposition < rposition {
|
if lposition < rposition {
|
||||||
(s(lword), s(pword), (rposition - lposition) as i32)
|
let key = (s(lword), s(pword), (rposition - lposition) as i32);
|
||||||
} else {
|
|
||||||
(s(lword), s(pword), (lposition - rposition + 1) as i32)
|
|
||||||
};
|
|
||||||
let docids = word_prefix_pair_proximity_docids
|
let docids = word_prefix_pair_proximity_docids
|
||||||
.entry(key)
|
.entry(key)
|
||||||
.or_insert(RoaringBitmap::new());
|
.or_insert(RoaringBitmap::new());
|
||||||
docids.push(candidate);
|
docids.push(candidate);
|
||||||
|
} else {
|
||||||
|
let key = (s(lword), s(pword), (lposition - rposition) as i32);
|
||||||
|
let docids = prefix_word_pair_proximity_docids
|
||||||
|
.entry(key)
|
||||||
|
.or_insert(RoaringBitmap::new());
|
||||||
|
docids.push(candidate);
|
||||||
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -766,6 +886,7 @@ pub mod test {
|
|||||||
exact_word_prefix_docids,
|
exact_word_prefix_docids,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
|
prefix_word_pair_proximity_docids,
|
||||||
docid_words,
|
docid_words,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user