mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
Merge branch 'search-refactor-exactness' into search-refactor-tests-doc
This commit is contained in:
commit
7ca91ebb71
@ -27,6 +27,8 @@ pub struct DatabaseCache<'ctx> {
|
|||||||
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
|
|
||||||
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
|
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
|
||||||
|
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||||
|
pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||||
}
|
}
|
||||||
impl<'ctx> DatabaseCache<'ctx> {
|
impl<'ctx> DatabaseCache<'ctx> {
|
||||||
fn get_value<'v, K1, KC>(
|
fn get_value<'v, K1, KC>(
|
||||||
@ -141,4 +143,32 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_db_word_position_docids(
|
||||||
|
&mut self,
|
||||||
|
word: Interned<String>,
|
||||||
|
position: u16,
|
||||||
|
) -> Result<Option<&'ctx [u8]>> {
|
||||||
|
DatabaseCache::get_value(
|
||||||
|
self.txn,
|
||||||
|
(word, position),
|
||||||
|
&(self.word_interner.get(word).as_str(), position),
|
||||||
|
&mut self.db_cache.word_position_docids,
|
||||||
|
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_db_word_fid_docids(
|
||||||
|
&mut self,
|
||||||
|
word: Interned<String>,
|
||||||
|
fid: u16,
|
||||||
|
) -> Result<Option<&'ctx [u8]>> {
|
||||||
|
DatabaseCache::get_value(
|
||||||
|
self.txn,
|
||||||
|
(word, fid),
|
||||||
|
&(self.word_interner.get(word).as_str(), fid),
|
||||||
|
&mut self.db_cache.word_fid_docids,
|
||||||
|
self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
253
milli/src/search/new/exact_attribute.rs
Normal file
253
milli/src/search/new/exact_attribute.rs
Normal file
@ -0,0 +1,253 @@
|
|||||||
|
use heed::BytesDecode;
|
||||||
|
use roaring::{MultiOps, RoaringBitmap};
|
||||||
|
|
||||||
|
use super::query_graph::QueryGraph;
|
||||||
|
use super::ranking_rules::{RankingRule, RankingRuleOutput};
|
||||||
|
use crate::search::new::query_graph::QueryNodeData;
|
||||||
|
use crate::search::new::query_term::ExactTerm;
|
||||||
|
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
|
||||||
|
|
||||||
|
/// A ranking rule that produces 3 disjoint buckets:
|
||||||
|
///
|
||||||
|
/// 1. Documents from the universe whose value is exactly the query.
|
||||||
|
/// 2. Documents from the universe not in (1) whose value starts with the query.
|
||||||
|
/// 3. Documents from the universe not in (1) or (2).
|
||||||
|
pub struct ExactAttribute {
|
||||||
|
state: State,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExactAttribute {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self { state: Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
||||||
|
fn id(&self) -> String {
|
||||||
|
"exact_attribute".to_owned()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn start_iteration(
|
||||||
|
&mut self,
|
||||||
|
ctx: &mut SearchContext<'ctx>,
|
||||||
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
|
universe: &roaring::RoaringBitmap,
|
||||||
|
query: &QueryGraph,
|
||||||
|
) -> Result<()> {
|
||||||
|
self.state = State::start_iteration(ctx, universe, query)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn next_bucket(
|
||||||
|
&mut self,
|
||||||
|
_ctx: &mut SearchContext<'ctx>,
|
||||||
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
|
universe: &roaring::RoaringBitmap,
|
||||||
|
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
||||||
|
let state = std::mem::take(&mut self.state);
|
||||||
|
let (state, output) = State::next(state, universe);
|
||||||
|
self.state = state;
|
||||||
|
|
||||||
|
Ok(output)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn end_iteration(
|
||||||
|
&mut self,
|
||||||
|
_ctx: &mut SearchContext<'ctx>,
|
||||||
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
|
) {
|
||||||
|
self.state = Default::default();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inner state of the ranking rule.
|
||||||
|
#[derive(Default)]
|
||||||
|
enum State {
|
||||||
|
/// State between two iterations
|
||||||
|
#[default]
|
||||||
|
Uninitialized,
|
||||||
|
/// The next call to `next` will output the documents in the universe that have an attribute that is the exact query
|
||||||
|
ExactAttribute(QueryGraph, Vec<FieldCandidates>),
|
||||||
|
/// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query,
|
||||||
|
/// but isn't the exact query.
|
||||||
|
AttributeStarts(QueryGraph, Vec<FieldCandidates>),
|
||||||
|
/// The next calls to `next` will output the input universe.
|
||||||
|
Empty(QueryGraph),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The candidates sorted by attributes
|
||||||
|
///
|
||||||
|
/// Each of the bitmap in a single `FieldCandidates` struct applies to the same field.
|
||||||
|
struct FieldCandidates {
|
||||||
|
/// The candidates that start with all the words of the query in the field
|
||||||
|
start_with_exact: RoaringBitmap,
|
||||||
|
/// The candidates that have the same number of words as the query in the field
|
||||||
|
exact_word_count: RoaringBitmap,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl State {
|
||||||
|
fn start_iteration(
|
||||||
|
ctx: &mut SearchContext<'_>,
|
||||||
|
universe: &RoaringBitmap,
|
||||||
|
query_graph: &QueryGraph,
|
||||||
|
) -> Result<Self> {
|
||||||
|
let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
|
||||||
|
Vec::with_capacity(query_graph.nodes.len() as usize);
|
||||||
|
for (_, node) in query_graph.nodes.iter() {
|
||||||
|
match &node.data {
|
||||||
|
QueryNodeData::Term(term) => {
|
||||||
|
let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
|
||||||
|
exact_term
|
||||||
|
} else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
exact_term_position_ids.push((
|
||||||
|
exact_term,
|
||||||
|
*term.positions.start(),
|
||||||
|
*term.term_ids.start(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
|
||||||
|
// bail if there is a "hole" (missing word) in remaining query graph
|
||||||
|
if let Some((_, _, first_id)) = exact_term_position_ids.first() {
|
||||||
|
if *first_id != 0 {
|
||||||
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
|
}
|
||||||
|
let mut previous_id = 0;
|
||||||
|
for (_, _, id) in exact_term_position_ids.iter().copied() {
|
||||||
|
if id < previous_id || id - previous_id > 1 {
|
||||||
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
|
} else {
|
||||||
|
previous_id = id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sample query: "sunflower are pretty"
|
||||||
|
// sunflower at pos 0 in attr A
|
||||||
|
// are at pos 1 in attr B
|
||||||
|
// pretty at pos 2 in attr C
|
||||||
|
// We want to eliminate such document
|
||||||
|
|
||||||
|
// first check that for each term, there exists some attribute that has this term at the correct position
|
||||||
|
//"word-position-docids";
|
||||||
|
let mut candidates = universe.clone();
|
||||||
|
let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids
|
||||||
|
.iter()
|
||||||
|
.copied()
|
||||||
|
.map(|(term, position, _)| (term.interned_words(ctx).collect(), position))
|
||||||
|
.collect();
|
||||||
|
for (words, position) in &words_positions {
|
||||||
|
if candidates.is_empty() {
|
||||||
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
|
}
|
||||||
|
|
||||||
|
'words: for (offset, word) in words.iter().enumerate() {
|
||||||
|
let offset = offset as u16;
|
||||||
|
let word = if let Some(word) = word {
|
||||||
|
word
|
||||||
|
} else {
|
||||||
|
continue 'words;
|
||||||
|
};
|
||||||
|
// Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
|
||||||
|
// longer phrases we'll be losing on precision here.
|
||||||
|
let bucketed_position = crate::bucketed_position(position + offset);
|
||||||
|
let word_position_docids = CboRoaringBitmapCodec::bytes_decode(
|
||||||
|
ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(),
|
||||||
|
)
|
||||||
|
.unwrap_or_default();
|
||||||
|
candidates &= word_position_docids;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let candidates = candidates;
|
||||||
|
|
||||||
|
if candidates.is_empty() {
|
||||||
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default();
|
||||||
|
|
||||||
|
let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
|
||||||
|
|
||||||
|
// then check that there exists at least one attribute that has all of the terms
|
||||||
|
for fid in searchable_fields_ids {
|
||||||
|
let mut intersection = MultiOps::intersection(
|
||||||
|
words_positions
|
||||||
|
.iter()
|
||||||
|
.flat_map(|(words, ..)| words.iter())
|
||||||
|
// ignore stop words words in phrases
|
||||||
|
.flatten()
|
||||||
|
.map(|word| -> Result<_> {
|
||||||
|
Ok(ctx
|
||||||
|
.get_db_word_fid_docids(*word, fid)?
|
||||||
|
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||||
|
.unwrap_or_default()
|
||||||
|
.unwrap_or_default())
|
||||||
|
}),
|
||||||
|
)?;
|
||||||
|
intersection &= &candidates;
|
||||||
|
if !intersection.is_empty() {
|
||||||
|
let candidates_with_exact_word_count = ctx
|
||||||
|
.index
|
||||||
|
.field_id_word_count_docids
|
||||||
|
.get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))?
|
||||||
|
.unwrap_or_default();
|
||||||
|
candidates_per_attribute.push(FieldCandidates {
|
||||||
|
start_with_exact: intersection,
|
||||||
|
exact_word_count: candidates_with_exact_word_count,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// note we could have "false positives" where there both exist different attributes that collectively
|
||||||
|
// have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order.
|
||||||
|
|
||||||
|
Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn next(
|
||||||
|
state: State,
|
||||||
|
universe: &RoaringBitmap,
|
||||||
|
) -> (State, Option<RankingRuleOutput<QueryGraph>>) {
|
||||||
|
let (state, output) = match state {
|
||||||
|
State::Uninitialized => (state, None),
|
||||||
|
State::ExactAttribute(query_graph, candidates_per_attribute) => {
|
||||||
|
let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
|
||||||
|
|FieldCandidates { start_with_exact, exact_word_count }| {
|
||||||
|
start_with_exact & exact_word_count
|
||||||
|
},
|
||||||
|
));
|
||||||
|
candidates &= universe;
|
||||||
|
(
|
||||||
|
State::AttributeStarts(query_graph.clone(), candidates_per_attribute),
|
||||||
|
Some(RankingRuleOutput { query: query_graph, candidates }),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
State::AttributeStarts(query_graph, candidates_per_attribute) => {
|
||||||
|
let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
|
||||||
|
|FieldCandidates { mut start_with_exact, exact_word_count }| {
|
||||||
|
start_with_exact -= exact_word_count;
|
||||||
|
start_with_exact
|
||||||
|
},
|
||||||
|
));
|
||||||
|
candidates &= universe;
|
||||||
|
(
|
||||||
|
State::Empty(query_graph.clone()),
|
||||||
|
Some(RankingRuleOutput { query: query_graph, candidates }),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
State::Empty(query_graph) => (
|
||||||
|
State::Empty(query_graph.clone()),
|
||||||
|
Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }),
|
||||||
|
),
|
||||||
|
};
|
||||||
|
(state, output)
|
||||||
|
}
|
||||||
|
}
|
@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner};
|
|||||||
use super::logger::SearchLogger;
|
use super::logger::SearchLogger;
|
||||||
use super::query_graph::QueryNode;
|
use super::query_graph::QueryNode;
|
||||||
use super::ranking_rule_graph::{
|
use super::ranking_rule_graph::{
|
||||||
ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait,
|
ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph,
|
||||||
TypoGraph,
|
RankingRuleGraphTrait, TypoGraph,
|
||||||
};
|
};
|
||||||
use super::small_bitmap::SmallBitmap;
|
use super::small_bitmap::SmallBitmap;
|
||||||
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
|
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
|
||||||
@ -65,6 +65,12 @@ impl GraphBasedRankingRule<TypoGraph> {
|
|||||||
Self::new_with_id("typo".to_owned(), terms_matching_strategy)
|
Self::new_with_id("typo".to_owned(), terms_matching_strategy)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
pub type Exactness = GraphBasedRankingRule<ExactnessGraph>;
|
||||||
|
impl GraphBasedRankingRule<ExactnessGraph> {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self::new_with_id("exactness".to_owned(), None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// A generic graph-based ranking rule
|
/// A generic graph-based ranking rule
|
||||||
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
|
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
|
||||||
|
@ -10,8 +10,9 @@ mod query_term;
|
|||||||
mod ranking_rule_graph;
|
mod ranking_rule_graph;
|
||||||
mod ranking_rules;
|
mod ranking_rules;
|
||||||
mod resolve_query_graph;
|
mod resolve_query_graph;
|
||||||
// TODO: documentation + comments
|
|
||||||
mod small_bitmap;
|
mod small_bitmap;
|
||||||
|
|
||||||
|
mod exact_attribute;
|
||||||
// TODO: documentation + comments
|
// TODO: documentation + comments
|
||||||
// implementation is currently an adaptation of the previous implementation to fit with the new model
|
// implementation is currently an adaptation of the previous implementation to fit with the new model
|
||||||
mod sort;
|
mod sort;
|
||||||
@ -38,6 +39,8 @@ use resolve_query_graph::PhraseDocIdsCache;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use words::Words;
|
use words::Words;
|
||||||
|
|
||||||
|
use self::exact_attribute::ExactAttribute;
|
||||||
|
use self::graph_based_ranking_rule::Exactness;
|
||||||
use self::interner::Interner;
|
use self::interner::Interner;
|
||||||
use self::ranking_rules::{BoxRankingRule, RankingRule};
|
use self::ranking_rules::{BoxRankingRule, RankingRule};
|
||||||
use self::resolve_query_graph::compute_query_graph_docids;
|
use self::resolve_query_graph::compute_query_graph_docids;
|
||||||
@ -155,7 +158,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
|
|||||||
let mut proximity = false;
|
let mut proximity = false;
|
||||||
let mut sort = false;
|
let mut sort = false;
|
||||||
let attribute = false;
|
let attribute = false;
|
||||||
let exactness = false;
|
let mut exactness = false;
|
||||||
let mut asc = HashSet::new();
|
let mut asc = HashSet::new();
|
||||||
let mut desc = HashSet::new();
|
let mut desc = HashSet::new();
|
||||||
|
|
||||||
@ -216,8 +219,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
|
|||||||
if exactness {
|
if exactness {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// todo!();
|
ranking_rules.push(Box::new(ExactAttribute::new()));
|
||||||
// exactness = false;
|
ranking_rules.push(Box::new(Exactness::new()));
|
||||||
|
exactness = true;
|
||||||
}
|
}
|
||||||
crate::Criterion::Asc(field_name) => {
|
crate::Criterion::Asc(field_name) => {
|
||||||
if asc.contains(&field_name) {
|
if asc.contains(&field_name) {
|
||||||
|
@ -244,7 +244,8 @@ pub fn partially_initialized_term_from_word(
|
|||||||
Some(ctx.phrase_interner.insert(Phrase { words }))
|
Some(ctx.phrase_interner.insert(Phrase { words }))
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db };
|
let zero_typo =
|
||||||
|
ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db };
|
||||||
|
|
||||||
Ok(QueryTerm {
|
Ok(QueryTerm {
|
||||||
original: word_interned,
|
original: word_interned,
|
||||||
|
@ -9,16 +9,14 @@ use crate::Result;
|
|||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::ops::RangeInclusive;
|
use std::ops::RangeInclusive;
|
||||||
|
|
||||||
|
use either::Either;
|
||||||
pub use ntypo_subset::NTypoTermSubset;
|
pub use ntypo_subset::NTypoTermSubset;
|
||||||
pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
|
pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
|
||||||
pub use phrase::Phrase;
|
pub use phrase::Phrase;
|
||||||
|
|
||||||
use compute_derivations::partially_initialized_term_from_word;
|
use compute_derivations::partially_initialized_term_from_word;
|
||||||
|
|
||||||
/**
|
/// A set of word derivations attached to a location in the search query.
|
||||||
A set of word derivations attached to a location in the search query.
|
|
||||||
|
|
||||||
*/
|
|
||||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
pub struct LocatedQueryTermSubset {
|
pub struct LocatedQueryTermSubset {
|
||||||
pub term_subset: QueryTermSubset,
|
pub term_subset: QueryTermSubset,
|
||||||
@ -53,7 +51,7 @@ struct ZeroTypoTerm {
|
|||||||
/// The original phrase, if any
|
/// The original phrase, if any
|
||||||
phrase: Option<Interned<Phrase>>,
|
phrase: Option<Interned<Phrase>>,
|
||||||
/// A single word equivalent to the original term, with zero typos
|
/// A single word equivalent to the original term, with zero typos
|
||||||
zero_typo: Option<Interned<String>>,
|
exact: Option<Interned<String>>,
|
||||||
/// All the words that contain the original word as prefix
|
/// All the words that contain the original word as prefix
|
||||||
prefix_of: BTreeSet<Interned<String>>,
|
prefix_of: BTreeSet<Interned<String>>,
|
||||||
/// All the synonyms of the original word or phrase
|
/// All the synonyms of the original word or phrase
|
||||||
@ -94,7 +92,43 @@ impl<T> Lazy<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub enum ExactTerm {
|
||||||
|
Phrase(Interned<Phrase>),
|
||||||
|
Word(Interned<String>),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExactTerm {
|
||||||
|
pub fn interned_words<'ctx>(
|
||||||
|
&self,
|
||||||
|
ctx: &'ctx SearchContext<'ctx>,
|
||||||
|
) -> impl Iterator<Item = Option<Interned<String>>> + 'ctx {
|
||||||
|
match *self {
|
||||||
|
ExactTerm::Phrase(phrase) => {
|
||||||
|
let phrase = ctx.phrase_interner.get(phrase);
|
||||||
|
Either::Left(phrase.words.iter().copied())
|
||||||
|
}
|
||||||
|
ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl QueryTermSubset {
|
impl QueryTermSubset {
|
||||||
|
pub fn exact_term(&self, ctx: &SearchContext) -> Option<ExactTerm> {
|
||||||
|
let full_query_term = ctx.term_interner.get(self.original);
|
||||||
|
if full_query_term.ngram_words.is_some() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// TODO: included in subset
|
||||||
|
if let Some(phrase) = full_query_term.zero_typo.phrase {
|
||||||
|
self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
|
||||||
|
} else if let Some(word) = full_query_term.zero_typo.exact {
|
||||||
|
self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn empty(for_term: Interned<QueryTerm>) -> Self {
|
pub fn empty(for_term: Interned<QueryTerm>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
original: for_term,
|
original: for_term,
|
||||||
@ -155,8 +189,13 @@ impl QueryTermSubset {
|
|||||||
|
|
||||||
let original = ctx.term_interner.get_mut(self.original);
|
let original = ctx.term_interner.get_mut(self.original);
|
||||||
if !self.zero_typo_subset.is_empty() {
|
if !self.zero_typo_subset.is_empty() {
|
||||||
let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
|
let ZeroTypoTerm {
|
||||||
&original.zero_typo;
|
phrase: _,
|
||||||
|
exact: zero_typo,
|
||||||
|
prefix_of,
|
||||||
|
synonyms: _,
|
||||||
|
use_prefix_db: _,
|
||||||
|
} = &original.zero_typo;
|
||||||
result.extend(zero_typo.iter().copied());
|
result.extend(zero_typo.iter().copied());
|
||||||
result.extend(prefix_of.iter().copied());
|
result.extend(prefix_of.iter().copied());
|
||||||
};
|
};
|
||||||
@ -204,7 +243,7 @@ impl QueryTermSubset {
|
|||||||
}
|
}
|
||||||
let original = ctx.term_interner.get_mut(self.original);
|
let original = ctx.term_interner.get_mut(self.original);
|
||||||
|
|
||||||
let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
|
let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } =
|
||||||
&original.zero_typo;
|
&original.zero_typo;
|
||||||
result.extend(phrase.iter().copied());
|
result.extend(phrase.iter().copied());
|
||||||
result.extend(synonyms.iter().copied());
|
result.extend(synonyms.iter().copied());
|
||||||
@ -270,7 +309,7 @@ impl QueryTermSubset {
|
|||||||
|
|
||||||
impl ZeroTypoTerm {
|
impl ZeroTypoTerm {
|
||||||
fn is_empty(&self) -> bool {
|
fn is_empty(&self) -> bool {
|
||||||
let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self;
|
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self;
|
||||||
phrase.is_none()
|
phrase.is_none()
|
||||||
&& zero_typo.is_none()
|
&& zero_typo.is_none()
|
||||||
&& prefix_of.is_empty()
|
&& prefix_of.is_empty()
|
||||||
|
@ -266,7 +266,7 @@ impl PhraseBuilder {
|
|||||||
is_prefix: false,
|
is_prefix: false,
|
||||||
zero_typo: ZeroTypoTerm {
|
zero_typo: ZeroTypoTerm {
|
||||||
phrase: Some(phrase),
|
phrase: Some(phrase),
|
||||||
zero_typo: None,
|
exact: None,
|
||||||
prefix_of: BTreeSet::default(),
|
prefix_of: BTreeSet::default(),
|
||||||
synonyms: BTreeSet::default(),
|
synonyms: BTreeSet::default(),
|
||||||
use_prefix_db: None,
|
use_prefix_db: None,
|
||||||
|
101
milli/src/search/new/ranking_rule_graph/exactness/mod.rs
Normal file
101
milli/src/search/new/ranking_rule_graph/exactness/mod.rs
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
use heed::BytesDecode;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
||||||
|
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
|
||||||
|
use crate::search::new::query_graph::{QueryGraph, QueryNode};
|
||||||
|
use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
|
||||||
|
use crate::{Result, RoaringBitmapCodec, SearchContext, SearchLogger};
|
||||||
|
|
||||||
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub enum ExactnessCondition {
|
||||||
|
ExactInAttribute(LocatedQueryTermSubset),
|
||||||
|
Skip(LocatedQueryTermSubset),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Type-level marker selecting the exactness flavour of the ranking rule graph.
/// It has no variants, so no value of this type can be created.
pub enum ExactnessGraph {}
|
||||||
|
|
||||||
|
fn compute_docids(
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
dest_node: &LocatedQueryTermSubset,
|
||||||
|
universe: &RoaringBitmap,
|
||||||
|
) -> Result<RoaringBitmap> {
|
||||||
|
let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) {
|
||||||
|
exact_term
|
||||||
|
} else {
|
||||||
|
return Ok(Default::default());
|
||||||
|
};
|
||||||
|
let mut candidates = match exact_term {
|
||||||
|
ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
|
||||||
|
ExactTerm::Word(word) => {
|
||||||
|
if let Some(word_candidates) = ctx.get_db_word_docids(word)? {
|
||||||
|
RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)?
|
||||||
|
} else {
|
||||||
|
return Ok(Default::default());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// TODO: synonyms?
|
||||||
|
candidates &= universe;
|
||||||
|
Ok(candidates)
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RankingRuleGraphTrait for ExactnessGraph {
|
||||||
|
type Condition = ExactnessCondition;
|
||||||
|
|
||||||
|
fn resolve_condition(
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
condition: &Self::Condition,
|
||||||
|
universe: &RoaringBitmap,
|
||||||
|
) -> Result<ComputedCondition> {
|
||||||
|
let (docids, dest_node) = match condition {
|
||||||
|
ExactnessCondition::ExactInAttribute(dest_node) => {
|
||||||
|
(compute_docids(ctx, dest_node, universe)?, dest_node)
|
||||||
|
}
|
||||||
|
ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node),
|
||||||
|
};
|
||||||
|
Ok(ComputedCondition {
|
||||||
|
docids,
|
||||||
|
universe_len: universe.len(),
|
||||||
|
start_term_subset: None,
|
||||||
|
end_term_subset: dest_node.clone(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_edges(
|
||||||
|
_ctx: &mut SearchContext,
|
||||||
|
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||||
|
_source_node: Option<&LocatedQueryTermSubset>,
|
||||||
|
dest_node: &LocatedQueryTermSubset,
|
||||||
|
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||||
|
let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone());
|
||||||
|
let exact_condition = conditions_interner.insert(exact_condition);
|
||||||
|
|
||||||
|
let skip_condition = ExactnessCondition::Skip(dest_node.clone());
|
||||||
|
let skip_condition = conditions_interner.insert(skip_condition);
|
||||||
|
|
||||||
|
Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)])
|
||||||
|
}
|
||||||
|
|
||||||
|
fn log_state(
|
||||||
|
_graph: &RankingRuleGraph<Self>,
|
||||||
|
_paths: &[Vec<Interned<Self::Condition>>],
|
||||||
|
_dead_ends_cache: &DeadEndsCache<Self::Condition>,
|
||||||
|
_niverse: &RoaringBitmap,
|
||||||
|
_costs: &MappedInterner<QueryNode, Vec<u64>>,
|
||||||
|
_cost: u64,
|
||||||
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
|
) {
|
||||||
|
}
|
||||||
|
|
||||||
|
fn label_for_condition(
|
||||||
|
_ctx: &mut SearchContext,
|
||||||
|
condition: &Self::Condition,
|
||||||
|
) -> Result<String> {
|
||||||
|
Ok(match condition {
|
||||||
|
ExactnessCondition::ExactInAttribute(_) => "exact",
|
||||||
|
ExactnessCondition::Skip(_) => "skip",
|
||||||
|
}
|
||||||
|
.to_owned())
|
||||||
|
}
|
||||||
|
}
|
@ -10,6 +10,8 @@ mod cheapest_paths;
|
|||||||
mod condition_docids_cache;
|
mod condition_docids_cache;
|
||||||
mod dead_ends_cache;
|
mod dead_ends_cache;
|
||||||
|
|
||||||
|
/// Implementation of the `exactness` ranking rule
|
||||||
|
mod exactness;
|
||||||
/// Implementation of the `proximity` ranking rule
|
/// Implementation of the `proximity` ranking rule
|
||||||
mod proximity;
|
mod proximity;
|
||||||
/// Implementation of the `typo` ranking rule
|
/// Implementation of the `typo` ranking rule
|
||||||
@ -20,6 +22,7 @@ use std::hash::Hash;
|
|||||||
pub use cheapest_paths::PathVisitor;
|
pub use cheapest_paths::PathVisitor;
|
||||||
pub use condition_docids_cache::ConditionDocIdsCache;
|
pub use condition_docids_cache::ConditionDocIdsCache;
|
||||||
pub use dead_ends_cache::DeadEndsCache;
|
pub use dead_ends_cache::DeadEndsCache;
|
||||||
|
pub use exactness::{ExactnessCondition, ExactnessGraph};
|
||||||
pub use proximity::{ProximityCondition, ProximityGraph};
|
pub use proximity::{ProximityCondition, ProximityGraph};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
pub use typo::{TypoCondition, TypoGraph};
|
pub use typo::{TypoCondition, TypoGraph};
|
||||||
|
@ -1,14 +1,17 @@
|
|||||||
#![allow(clippy::too_many_arguments)]
|
#![allow(clippy::too_many_arguments)]
|
||||||
|
|
||||||
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
|
use heed::BytesDecode;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::ProximityCondition;
|
use super::ProximityCondition;
|
||||||
use crate::search::new::interner::Interned;
|
use crate::search::new::interner::Interned;
|
||||||
use crate::search::new::query_term::{Phrase, QueryTermSubset};
|
use crate::search::new::query_term::{Phrase, QueryTermSubset};
|
||||||
use crate::search::new::ranking_rule_graph::ComputedCondition;
|
use crate::search::new::ranking_rule_graph::ComputedCondition;
|
||||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||||
use crate::search::new::SearchContext;
|
use crate::search::new::SearchContext;
|
||||||
use crate::{CboRoaringBitmapCodec, Result};
|
use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
use std::collections::BTreeSet;
|
|
||||||
|
|
||||||
pub fn compute_docids(
|
pub fn compute_docids(
|
||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
@ -90,7 +93,8 @@ pub fn compute_docids(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? {
|
} else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? {
|
||||||
let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?;
|
let left_word_docids =
|
||||||
|
RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?;
|
||||||
if universe.is_disjoint(&left_word_docids) {
|
if universe.is_disjoint(&left_word_docids) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String {
|
|||||||
&format!("{word:<16} {position:<6} {}", display_bitmap(&b))
|
&format!("{word:<16} {position:<6} {}", display_bitmap(&b))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
pub fn snap_word_fid_docids(index: &Index) -> String {
|
||||||
|
make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| {
|
||||||
|
&format!("{word:<16} {fid:<3} {}", display_bitmap(&b))
|
||||||
|
})
|
||||||
|
}
|
||||||
pub fn snap_field_id_word_count_docids(index: &Index) -> String {
|
pub fn snap_field_id_word_count_docids(index: &Index) -> String {
|
||||||
make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| {
|
make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| {
|
||||||
&format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b))
|
&format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b))
|
||||||
@ -477,6 +482,9 @@ macro_rules! full_snap_of_db {
|
|||||||
($index:ident, word_position_docids) => {{
|
($index:ident, word_position_docids) => {{
|
||||||
$crate::snapshot_tests::snap_word_position_docids(&$index)
|
$crate::snapshot_tests::snap_word_position_docids(&$index)
|
||||||
}};
|
}};
|
||||||
|
($index:ident, word_fid_docids) => {{
|
||||||
|
$crate::snapshot_tests::snap_word_fid_docids(&$index)
|
||||||
|
}};
|
||||||
($index:ident, field_id_word_count_docids) => {{
|
($index:ident, field_id_word_count_docids) => {{
|
||||||
$crate::snapshot_tests::snap_field_id_word_count_docids(&$index)
|
$crate::snapshot_tests::snap_field_id_word_count_docids(&$index)
|
||||||
}};
|
}};
|
||||||
|
@ -153,7 +153,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
|
|||||||
|
|
||||||
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
||||||
/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
|
/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
|
||||||
/// else we keep the standart proximity of 1 between words.
|
/// else we keep the standard proximity of 1 between words.
|
||||||
fn process_tokens<'a>(
|
fn process_tokens<'a>(
|
||||||
tokens: impl Iterator<Item = Token<'a>>,
|
tokens: impl Iterator<Item = Token<'a>>,
|
||||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||||
|
@ -0,0 +1,48 @@
|
|||||||
|
use std::fs::File;
|
||||||
|
use std::io;
|
||||||
|
|
||||||
|
use super::helpers::{
|
||||||
|
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||||
|
try_split_array_at, GrenadParameters,
|
||||||
|
};
|
||||||
|
use crate::error::SerializationError;
|
||||||
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
|
use crate::{relative_from_absolute_position, DocumentId, Result};
|
||||||
|
|
||||||
|
/// Extracts the word, field id, and the documents ids where this word appears at this field id.
|
||||||
|
#[logging_timer::time]
|
||||||
|
pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
|
||||||
|
docid_word_positions: grenad::Reader<R>,
|
||||||
|
indexer: GrenadParameters,
|
||||||
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
|
let mut word_fid_docids_sorter = create_sorter(
|
||||||
|
grenad::SortAlgorithm::Unstable,
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut key_buffer = Vec::new();
|
||||||
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||||
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
|
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
|
for position in read_u32_ne_bytes(value) {
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
|
let (fid, _) = relative_from_absolute_position(position);
|
||||||
|
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||||
|
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;
|
||||||
|
|
||||||
|
Ok(word_fid_docids_reader)
|
||||||
|
}
|
@ -7,17 +7,14 @@ use super::helpers::{
|
|||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::{
|
use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};
|
||||||
absolute_from_relative_position, bucketed_position, relative_from_absolute_position,
|
|
||||||
DocumentId, Result,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Extracts the word positions and the documents ids where this word appears.
|
/// Extracts the word positions and the documents ids where this word appears.
|
||||||
///
|
///
|
||||||
/// Returns a grenad reader with the list of extracted words at positions and
|
/// Returns a grenad reader with the list of extracted words at positions and
|
||||||
/// documents ids from the given chunk of docid word positions.
|
/// documents ids from the given chunk of docid word positions.
|
||||||
#[logging_timer::time]
|
#[logging_timer::time]
|
||||||
pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
|
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
@ -42,9 +39,8 @@ pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
|
|||||||
for position in read_u32_ne_bytes(value) {
|
for position in read_u32_ne_bytes(value) {
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.extend_from_slice(word_bytes);
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
let (fid, position) = relative_from_absolute_position(position);
|
let (_, position) = relative_from_absolute_position(position);
|
||||||
let position = bucketed_position(position);
|
let position = bucketed_position(position);
|
||||||
let position = absolute_from_relative_position(fid, position);
|
|
||||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values;
|
|||||||
mod extract_fid_word_count_docids;
|
mod extract_fid_word_count_docids;
|
||||||
mod extract_geo_points;
|
mod extract_geo_points;
|
||||||
mod extract_word_docids;
|
mod extract_word_docids;
|
||||||
|
mod extract_word_fid_docids;
|
||||||
mod extract_word_pair_proximity_docids;
|
mod extract_word_pair_proximity_docids;
|
||||||
mod extract_word_position_docids;
|
mod extract_word_position_docids;
|
||||||
|
|
||||||
@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
|
|||||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||||
use self::extract_geo_points::extract_geo_points;
|
use self::extract_geo_points::extract_geo_points;
|
||||||
use self::extract_word_docids::extract_word_docids;
|
use self::extract_word_docids::extract_word_docids;
|
||||||
|
use self::extract_word_fid_docids::extract_word_fid_docids;
|
||||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||||
use self::extract_word_position_docids::extract_word_fid_and_position_docids;
|
use self::extract_word_position_docids::extract_word_position_docids;
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
||||||
GrenadParameters, MergeFn, MergeableReader,
|
GrenadParameters, MergeFn, MergeableReader,
|
||||||
@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_word_positions_chunks,
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_word_fid_and_position_docids,
|
extract_word_position_docids,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
TypedChunk::WordPositionDocids,
|
TypedChunk::WordPositionDocids,
|
||||||
"word-position-docids",
|
"word-position-docids",
|
||||||
);
|
);
|
||||||
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
|
docid_word_positions_chunks,
|
||||||
|
indexer,
|
||||||
|
lmdb_writer_sx.clone(),
|
||||||
|
extract_word_fid_docids,
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
|
TypedChunk::WordFidDocids,
|
||||||
|
"word-fid-docids",
|
||||||
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_fid_facet_strings_chunks,
|
docid_fid_facet_strings_chunks,
|
||||||
|
@ -2255,4 +2255,61 @@ mod tests {
|
|||||||
{"id":1,"catto":"jorts"}
|
{"id":1,"catto":"jorts"}
|
||||||
"###);
|
"###);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_word_fid_position() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{"id": 0, "text": "sun flowers are looking at the sun" },
|
||||||
|
{"id": 1, "text": "sun flowers are looking at the sun" },
|
||||||
|
{"id": 2, "text": "the sun is shining today" },
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"text": "a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a "
|
||||||
|
}
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9");
|
||||||
|
db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f");
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{"id": 4, "text": "sun flowers are looking at the sun" },
|
||||||
|
{"id": 5, "text2": "sun flowers are looking at the sun" },
|
||||||
|
{"id": 6, "text": "b b b" },
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"text2": "a a a a"
|
||||||
|
}
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
|
||||||
|
db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
// Delete only some of the documents, not all of them.
|
||||||
|
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||||
|
builder.strategy(DeletionStrategy::AlwaysHard);
|
||||||
|
builder.delete_external_id("0");
|
||||||
|
builder.delete_external_id("3");
|
||||||
|
let result = builder.execute().unwrap();
|
||||||
|
println!("{result:?}");
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
|
||||||
|
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
|
||||||
|
db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -35,6 +35,7 @@ pub(crate) enum TypedChunk {
|
|||||||
exact_word_docids_reader: grenad::Reader<File>,
|
exact_word_docids_reader: grenad::Reader<File>,
|
||||||
},
|
},
|
||||||
WordPositionDocids(grenad::Reader<File>),
|
WordPositionDocids(grenad::Reader<File>),
|
||||||
|
WordFidDocids(grenad::Reader<File>),
|
||||||
WordPairProximityDocids(grenad::Reader<File>),
|
WordPairProximityDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
FieldIdFacetStringDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
||||||
@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
|
TypedChunk::WordFidDocids(word_fid_docids_iter) => {
|
||||||
|
append_entries_into_database(
|
||||||
|
word_fid_docids_iter,
|
||||||
|
&index.word_fid_docids,
|
||||||
|
wtxn,
|
||||||
|
index_is_empty,
|
||||||
|
|value, _buffer| Ok(value),
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
|
)?;
|
||||||
|
is_merged_database = true;
|
||||||
|
}
|
||||||
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
|
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
|
||||||
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
|
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
|
||||||
indexer.execute(wtxn)?;
|
indexer.execute(wtxn)?;
|
||||||
|
Loading…
Reference in New Issue
Block a user