Simplify word level position DB into a word position DB

This commit is contained in:
many 2021-10-05 11:18:42 +02:00
parent 75d341d928
commit 3296bb243c
No known key found for this signature in database
GPG key ID: 2CEF23B75189EACA
18 changed files with 220 additions and 545 deletions

View file

@ -10,7 +10,7 @@ use super::{resolve_query_tree, Context, Criterion, CriterionParameters, Criteri
use crate::search::criteria::Query;
use crate::search::query_tree::{Operation, QueryKind};
use crate::search::{build_dfa, word_derivations, WordDerivationsCache};
use crate::{Result, TreeLevel};
use crate::Result;
/// To be able to divide integers by the number of words in the query,
/// we want to find a multiplier that allows us to divide by any number between 1 and 10.
@ -176,20 +176,14 @@ impl<'t> Criterion for Attribute<'t> {
}
}
/// QueryLevelIterator is an pseudo-Iterator for a Query,
/// It contains WordLevelIterators and is chainned with other QueryLevelIterator.
struct QueryLevelIterator<'t> {
inner: Vec<
Peekable<
Box<
dyn Iterator<Item = heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>>
+ 't,
>,
>,
>,
/// `QueryPositionIterator` is an iterator over the positions of a `Query`.
/// It wraps the per-word position iterators and yields `(position, docids)` pairs.
struct QueryPositionIterator<'t> {
// One peekable iterator per word pushed by `new`; each item is a
// `((word, position), docids)` entry obtained from `Context::word_position_iterator`.
inner:
Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u32), RoaringBitmap)>> + 't>>>,
}
impl<'t> QueryLevelIterator<'t> {
impl<'t> QueryPositionIterator<'t> {
fn new(
ctx: &'t dyn Context<'t>,
queries: &[Query],
@ -201,25 +195,14 @@ impl<'t> QueryLevelIterator<'t> {
match &query.kind {
QueryKind::Exact { word, .. } => {
if !query.prefix || in_prefix_cache {
let iter = ctx.word_position_iterator(
query.kind.word(),
TreeLevel::min_value(),
in_prefix_cache,
None,
None,
)?;
let iter =
ctx.word_position_iterator(query.kind.word(), in_prefix_cache)?;
inner.push(iter.peekable());
} else {
for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?
{
let iter = ctx.word_position_iterator(
&word,
TreeLevel::min_value(),
in_prefix_cache,
None,
None,
)?;
let iter = ctx.word_position_iterator(&word, in_prefix_cache)?;
inner.push(iter.peekable());
}
@ -229,13 +212,7 @@ impl<'t> QueryLevelIterator<'t> {
for (word, _) in
word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?
{
let iter = ctx.word_position_iterator(
&word,
TreeLevel::min_value(),
in_prefix_cache,
None,
None,
)?;
let iter = ctx.word_position_iterator(&word, in_prefix_cache)?;
inner.push(iter.peekable());
}
@ -247,7 +224,7 @@ impl<'t> QueryLevelIterator<'t> {
}
}
impl<'t> Iterator for QueryLevelIterator<'t> {
impl<'t> Iterator for QueryPositionIterator<'t> {
type Item = heed::Result<(u32, RoaringBitmap)>;
fn next(&mut self) -> Option<Self::Item> {
@ -256,14 +233,14 @@ impl<'t> Iterator for QueryLevelIterator<'t> {
.inner
.iter_mut()
.filter_map(|wli| match wli.peek() {
Some(Ok(((_, _, pos, _), _))) => Some(*pos),
Some(Ok(((_, pos), _))) => Some(*pos),
_ => None,
})
.min()?;
let mut candidates = None;
for wli in self.inner.iter_mut() {
if let Some(Ok(((_, _, pos, _), _))) = wli.peek() {
if let Some(Ok(((_, pos), _))) = wli.peek() {
if *pos > expected_pos {
continue;
}
@ -286,9 +263,9 @@ impl<'t> Iterator for QueryLevelIterator<'t> {
}
/// A Branch represents a possible alternative of the original query and is built with the Query Tree,
/// This branch allows us to iterate over meta-interval of position and to dig in it if it contains interesting candidates.
/// This branch allows us to iterate over meta-intervals of positions.
struct Branch<'t> {
query_level_iterator: Vec<(u32, RoaringBitmap, Peekable<QueryLevelIterator<'t>>)>,
query_level_iterator: Vec<(u32, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
last_result: (u32, RoaringBitmap),
branch_size: u32,
}
@ -302,7 +279,7 @@ impl<'t> Branch<'t> {
) -> Result<Self> {
let mut query_level_iterator = Vec::new();
for queries in flatten_branch {
let mut qli = QueryLevelIterator::new(ctx, queries, wdcache)?.peekable();
let mut qli = QueryPositionIterator::new(ctx, queries, wdcache)?.peekable();
let (pos, docids) = qli.next().transpose()?.unwrap_or((0, RoaringBitmap::new()));
query_level_iterator.push((pos, docids & allowed_candidates, qli));
}

View file

@ -10,7 +10,7 @@ use crate::search::criteria::{
resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
};
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
use crate::{Result, TreeLevel};
use crate::Result;
pub struct Exactness<'t> {
ctx: &'t dyn Context<'t>,
@ -293,7 +293,6 @@ fn attribute_start_with_docids(
attribute_id: u32,
query: &[ExactQueryPart],
) -> heed::Result<Vec<RoaringBitmap>> {
let lowest_level = TreeLevel::min_value();
let mut attribute_candidates_array = Vec::new();
// start from attribute first position
let mut pos = attribute_id * 1000;
@ -303,7 +302,7 @@ fn attribute_start_with_docids(
Synonyms(synonyms) => {
let mut synonyms_candidates = RoaringBitmap::new();
for word in synonyms {
let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?;
let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc {
synonyms_candidates |= word_candidates;
}
@ -313,7 +312,7 @@ fn attribute_start_with_docids(
}
Phrase(phrase) => {
for word in phrase {
let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?;
let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc {
attribute_candidates_array.push(word_candidates);
}

View file

@ -14,7 +14,7 @@ use self::words::Words;
use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
use crate::search::criteria::geo::Geo;
use crate::search::{word_derivations, WordDerivationsCache};
use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result, TreeLevel};
use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result};
mod asc_desc;
mod attribute;
@ -90,20 +90,8 @@ pub trait Context<'c> {
fn word_position_iterator(
&self,
word: &str,
level: TreeLevel,
in_prefix_cache: bool,
left: Option<u32>,
right: Option<u32>,
) -> heed::Result<
Box<
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c,
>,
>;
fn word_position_last_level(
&self,
word: &str,
in_prefix_cache: bool,
) -> heed::Result<Option<TreeLevel>>;
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>;
fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>;
fn field_id_word_count_docids(
@ -111,13 +99,7 @@ pub trait Context<'c> {
field_id: FieldId,
word_count: u8,
) -> heed::Result<Option<RoaringBitmap>>;
fn word_level_position_docids(
&self,
word: &str,
level: TreeLevel,
left: u32,
right: u32,
) -> heed::Result<Option<RoaringBitmap>>;
/// Returns the document ids associated with `word` at the exact position `pos`,
/// or `None` when the implementation has nothing stored for that pair.
fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result<Option<RoaringBitmap>>;
}
pub struct CriteriaBuilder<'t> {
@ -183,54 +165,24 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
fn word_position_iterator(
&self,
word: &str,
level: TreeLevel,
in_prefix_cache: bool,
left: Option<u32>,
right: Option<u32>,
) -> heed::Result<
Box<
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c,
>,
> {
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>
{
let range = {
let left = left.unwrap_or(u32::min_value());
let right = right.unwrap_or(u32::max_value());
let left = (word, level, left, left);
let right = (word, level, right, right);
let left = u32::min_value();
let right = u32::max_value();
let left = (word, left);
let right = (word, right);
left..=right
};
let db = match in_prefix_cache {
true => self.index.word_prefix_level_position_docids,
false => self.index.word_level_position_docids,
true => self.index.word_prefix_position_docids,
false => self.index.word_position_docids,
};
Ok(Box::new(db.range(self.rtxn, &range)?))
}
fn word_position_last_level(
&self,
word: &str,
in_prefix_cache: bool,
) -> heed::Result<Option<TreeLevel>> {
let range = {
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
left..=right
};
let db = match in_prefix_cache {
true => self.index.word_prefix_level_position_docids,
false => self.index.word_level_position_docids,
};
let last_level = db
.remap_data_type::<heed::types::DecodeIgnore>()
.range(self.rtxn, &range)?
.last()
.transpose()?
.map(|((_, level, _, _), _)| level);
Ok(last_level)
}
/// Returns the synonyms registered for `word`, if any.
///
/// Delegates to `self.index.words_synonyms`, passing `word` as a one-element slice.
fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>> {
self.index.words_synonyms(self.rtxn, &[word])
}
@ -251,15 +203,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
self.index.field_id_word_count_docids.get(self.rtxn, &key)
}
fn word_level_position_docids(
&self,
word: &str,
level: TreeLevel,
left: u32,
right: u32,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (word, level, left, right);
self.index.word_level_position_docids.get(self.rtxn, &key)
/// Looks up the exact `(word, pos)` key in the `word_position_docids` database
/// through `self.rtxn` and returns the stored bitmap of document ids, if any.
fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result<Option<RoaringBitmap>> {
// The database is keyed by the `(word, position)` pair.
let key = (word, pos);
self.index.word_position_docids.get(self.rtxn, &key)
}
}
@ -616,27 +562,13 @@ pub mod test {
fn word_position_iterator(
&self,
_word: &str,
_level: TreeLevel,
_in_prefix_cache: bool,
_left: Option<u32>,
_right: Option<u32>,
) -> heed::Result<
Box<
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>>
+ 'c,
>,
Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>,
> {
todo!()
}
fn word_position_last_level(
&self,
_word: &str,
_in_prefix_cache: bool,
) -> heed::Result<Option<TreeLevel>> {
todo!()
}
fn synonyms(&self, _word: &str) -> heed::Result<Option<Vec<Vec<String>>>> {
todo!()
}
@ -645,12 +577,10 @@ pub mod test {
todo!()
}
fn word_level_position_docids(
fn word_position_docids(
&self,
_word: &str,
_level: TreeLevel,
_left: u32,
_right: u32,
_pos: u32,
) -> heed::Result<Option<RoaringBitmap>> {
todo!()
}