mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 21:50:07 +01:00
Exact attribute with state
This commit is contained in:
parent
4b4ffb8ec9
commit
337e75b0e4
@ -1,5 +1,5 @@
|
|||||||
use heed::BytesDecode;
|
use heed::BytesDecode;
|
||||||
use roaring::MultiOps;
|
use roaring::{MultiOps, RoaringBitmap};
|
||||||
|
|
||||||
use super::query_graph::QueryGraph;
|
use super::query_graph::QueryGraph;
|
||||||
use super::ranking_rules::{RankingRule, RankingRuleOutput};
|
use super::ranking_rules::{RankingRule, RankingRuleOutput};
|
||||||
@ -7,19 +7,18 @@ use crate::search::new::query_graph::QueryNodeData;
|
|||||||
use crate::search::new::query_term::ExactTerm;
|
use crate::search::new::query_term::ExactTerm;
|
||||||
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
|
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
|
||||||
|
|
||||||
/// FIXME:
|
/// A ranking rule that produces 3 disjoint buckets:
|
||||||
///
|
///
|
||||||
/// - A lot of work done in next_bucket that start_iteration could do.
|
/// 1. Documents from the universe whose value is exactly the query.
|
||||||
/// - Consider calling the graph based rule directly from this one.
|
/// 2. Documents from the universe not in (1) whose value starts with the query.
|
||||||
/// - currently we did exact term, don't forget about prefix
|
/// 3. Documents from the universe not in (1) or (2).
|
||||||
/// - some tests
|
|
||||||
pub struct ExactAttribute {
|
pub struct ExactAttribute {
|
||||||
query_graph: Option<QueryGraph>,
|
state: State,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ExactAttribute {
|
impl ExactAttribute {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self { query_graph: None }
|
Self { state: Default::default() }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -30,23 +29,69 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
|||||||
|
|
||||||
fn start_iteration(
|
fn start_iteration(
|
||||||
&mut self,
|
&mut self,
|
||||||
_ctx: &mut SearchContext<'ctx>,
|
ctx: &mut SearchContext<'ctx>,
|
||||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
_universe: &roaring::RoaringBitmap,
|
universe: &roaring::RoaringBitmap,
|
||||||
query: &QueryGraph,
|
query: &QueryGraph,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
self.query_graph = Some(query.clone());
|
self.state = State::start_iteration(ctx, universe, query)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn next_bucket(
|
fn next_bucket(
|
||||||
&mut self,
|
&mut self,
|
||||||
ctx: &mut SearchContext<'ctx>,
|
_ctx: &mut SearchContext<'ctx>,
|
||||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
universe: &roaring::RoaringBitmap,
|
universe: &roaring::RoaringBitmap,
|
||||||
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
||||||
// iterate on the nodes of the graph, retain LocatedQueryTermSubset
|
let state = std::mem::take(&mut self.state);
|
||||||
let query_graph = self.query_graph.as_ref().unwrap();
|
let (state, output) = State::next(state, universe);
|
||||||
|
self.state = state;
|
||||||
|
|
||||||
|
Ok(output)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn end_iteration(
|
||||||
|
&mut self,
|
||||||
|
_ctx: &mut SearchContext<'ctx>,
|
||||||
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
|
) {
|
||||||
|
self.state = Default::default();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inner state of the ranking rule.
|
||||||
|
#[derive(Default)]
|
||||||
|
enum State {
|
||||||
|
/// State between two iterations
|
||||||
|
#[default]
|
||||||
|
Uninitialized,
|
||||||
|
/// The next call to `next` will output the documents in the universe that have an attribute that is the exact query
|
||||||
|
ExactAttribute(QueryGraph, Vec<FieldCandidates>),
|
||||||
|
/// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query,
|
||||||
|
/// but isn't the exact query.
|
||||||
|
AttributeStarts(QueryGraph, Vec<FieldCandidates>),
|
||||||
|
/// The next calls to `next` will output the input universe.
|
||||||
|
Empty(QueryGraph),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The candidates sorted by attributes
|
||||||
|
///
|
||||||
|
/// Each of the bitmaps in a single `FieldCandidates` struct applies to the same field.
|
||||||
|
struct FieldCandidates {
|
||||||
|
/// The candidates that start with all the words of the query in the field
|
||||||
|
start_with_exact: RoaringBitmap,
|
||||||
|
/// The candidates that have the same number of words as the query in the field
|
||||||
|
exact_word_count: RoaringBitmap,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl State {
|
||||||
|
fn start_iteration(
|
||||||
|
ctx: &mut SearchContext<'_>,
|
||||||
|
universe: &RoaringBitmap,
|
||||||
|
query_graph: &QueryGraph,
|
||||||
|
) -> Result<Self> {
|
||||||
let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
|
let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
|
||||||
Vec::with_capacity(query_graph.nodes.len() as usize);
|
Vec::with_capacity(query_graph.nodes.len() as usize);
|
||||||
for (_, node) in query_graph.nodes.iter() {
|
for (_, node) in query_graph.nodes.iter() {
|
||||||
@ -55,11 +100,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
|||||||
let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
|
let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
|
||||||
exact_term
|
exact_term
|
||||||
} else {
|
} else {
|
||||||
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
|
continue;
|
||||||
return Ok(Some(RankingRuleOutput {
|
|
||||||
query: query_graph.clone(),
|
|
||||||
candidates: universe.clone(),
|
|
||||||
}));
|
|
||||||
};
|
};
|
||||||
exact_term_position_ids.push((
|
exact_term_position_ids.push((
|
||||||
exact_term,
|
exact_term,
|
||||||
@ -73,14 +114,17 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
|||||||
|
|
||||||
exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
|
exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
|
||||||
// bail if there is a "hole" (missing word) in remaining query graph
|
// bail if there is a "hole" (missing word) in remaining query graph
|
||||||
|
if let Some((_, _, first_id)) = exact_term_position_ids.first() {
|
||||||
|
if *first_id != 0 {
|
||||||
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
|
}
|
||||||
let mut previous_id = 0;
|
let mut previous_id = 0;
|
||||||
for (_, _, id) in exact_term_position_ids.iter().copied() {
|
for (_, _, id) in exact_term_position_ids.iter().copied() {
|
||||||
if id < previous_id || id - previous_id > 1 {
|
if id < previous_id || id - previous_id > 1 {
|
||||||
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
return Ok(Some(RankingRuleOutput {
|
|
||||||
query: query_graph.clone(),
|
|
||||||
candidates: universe.clone(),
|
|
||||||
}));
|
|
||||||
} else {
|
} else {
|
||||||
previous_id = id;
|
previous_id = id;
|
||||||
}
|
}
|
||||||
@ -102,11 +146,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
|||||||
.collect();
|
.collect();
|
||||||
for (words, position) in &words_positions {
|
for (words, position) in &words_positions {
|
||||||
if candidates.is_empty() {
|
if candidates.is_empty() {
|
||||||
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
return Ok(Some(RankingRuleOutput {
|
|
||||||
query: query_graph.clone(),
|
|
||||||
candidates: universe.clone(),
|
|
||||||
}));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
'words: for (offset, word) in words.iter().enumerate() {
|
'words: for (offset, word) in words.iter().enumerate() {
|
||||||
@ -116,8 +156,11 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
|||||||
} else {
|
} else {
|
||||||
continue 'words;
|
continue 'words;
|
||||||
};
|
};
|
||||||
|
// Note: Since the position is stored bucketed in word_position_docids, for queries with many
|
||||||

// long phrases we lose some precision here.
|
||||||
|
let bucketed_position = crate::bucketed_position(position + offset);
|
||||||
let word_position_docids = CboRoaringBitmapCodec::bytes_decode(
|
let word_position_docids = CboRoaringBitmapCodec::bytes_decode(
|
||||||
ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(),
|
ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(),
|
||||||
)
|
)
|
||||||
.unwrap_or_default();
|
.unwrap_or_default();
|
||||||
candidates &= word_position_docids;
|
candidates &= word_position_docids;
|
||||||
@ -127,16 +170,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
|||||||
let candidates = candidates;
|
let candidates = candidates;
|
||||||
|
|
||||||
if candidates.is_empty() {
|
if candidates.is_empty() {
|
||||||
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
return Ok(Some(RankingRuleOutput {
|
|
||||||
query: query_graph.clone(),
|
|
||||||
candidates: universe.clone(),
|
|
||||||
}));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default();
|
let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default();
|
||||||
|
|
||||||
let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len());
|
let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
|
||||||
|
|
||||||
// then check that there exists at least one attribute that has all of the terms
|
// then check that there exists at least one attribute that has all of the terms
|
||||||
for fid in searchable_fields_ids {
|
for fid in searchable_fields_ids {
|
||||||
@ -156,20 +195,59 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
|||||||
)?;
|
)?;
|
||||||
intersection &= &candidates;
|
intersection &= &candidates;
|
||||||
if !intersection.is_empty() {
|
if !intersection.is_empty() {
|
||||||
candidates_per_attributes.push(intersection);
|
let candidates_with_exact_word_count = ctx
|
||||||
|
.index
|
||||||
|
.field_id_word_count_docids
|
||||||
|
.get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))?
|
||||||
|
.unwrap_or_default();
|
||||||
|
candidates_per_attribute.push(FieldCandidates {
|
||||||
|
start_with_exact: intersection,
|
||||||
|
exact_word_count: candidates_with_exact_word_count,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// note: we could have "false positives" when there exist both different attributes that collectively
|
// note: we could have "false positives" when there exist both different attributes that collectively
|
||||||
// have the terms in the correct order and a single attribute that has all the terms, but in the incorrect order.
|
// have the terms in the correct order and a single attribute that has all the terms, but in the incorrect order.
|
||||||
|
|
||||||
let candidates = MultiOps::union(candidates_per_attributes.into_iter());
|
Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute))
|
||||||
Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates }))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn end_iteration(
|
fn next(
|
||||||
&mut self,
|
state: State,
|
||||||
_ctx: &mut SearchContext<'ctx>,
|
universe: &RoaringBitmap,
|
||||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
) -> (State, Option<RankingRuleOutput<QueryGraph>>) {
|
||||||
) {
|
let (state, output) = match state {
|
||||||
|
State::Uninitialized => (state, None),
|
||||||
|
State::ExactAttribute(query_graph, candidates_per_attribute) => {
|
||||||
|
let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
|
||||||
|
|FieldCandidates { start_with_exact, exact_word_count }| {
|
||||||
|
start_with_exact & exact_word_count
|
||||||
|
},
|
||||||
|
));
|
||||||
|
candidates &= universe;
|
||||||
|
(
|
||||||
|
State::AttributeStarts(query_graph.clone(), candidates_per_attribute),
|
||||||
|
Some(RankingRuleOutput { query: query_graph, candidates }),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
State::AttributeStarts(query_graph, candidates_per_attribute) => {
|
||||||
|
let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
|
||||||
|
|FieldCandidates { mut start_with_exact, exact_word_count }| {
|
||||||
|
start_with_exact -= exact_word_count;
|
||||||
|
start_with_exact
|
||||||
|
},
|
||||||
|
));
|
||||||
|
candidates &= universe;
|
||||||
|
(
|
||||||
|
State::Empty(query_graph.clone()),
|
||||||
|
Some(RankingRuleOutput { query: query_graph, candidates }),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
State::Empty(query_graph) => (
|
||||||
|
State::Empty(query_graph.clone()),
|
||||||
|
Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }),
|
||||||
|
),
|
||||||
|
};
|
||||||
|
(state, output)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user