diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index b780af39f..a0dde4686 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -27,6 +27,8 @@ pub struct DatabaseCache<'ctx> { pub word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, pub words_fst: Option>>, + pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, } impl<'ctx> DatabaseCache<'ctx> { fn get_value<'v, K1, KC>( @@ -141,4 +143,32 @@ impl<'ctx> SearchContext<'ctx> { self.index.prefix_word_pair_proximity_docids.remap_data_type::(), ) } + + pub fn get_db_word_position_docids( + &mut self, + word: Interned, + position: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, position), + &(self.word_interner.get(word).as_str(), position), + &mut self.db_cache.word_position_docids, + self.index.word_position_docids.remap_data_type::(), + ) + } + + pub fn get_db_word_fid_docids( + &mut self, + word: Interned, + fid: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, fid), + &(self.word_interner.get(word).as_str(), fid), + &mut self.db_cache.word_fid_docids, + self.index.word_fid_docids.remap_data_type::(), + ) + } } diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs new file mode 100644 index 000000000..fa837272b --- /dev/null +++ b/milli/src/search/new/exact_attribute.rs @@ -0,0 +1,253 @@ +use heed::BytesDecode; +use roaring::{MultiOps, RoaringBitmap}; + +use super::query_graph::QueryGraph; +use super::ranking_rules::{RankingRule, RankingRuleOutput}; +use crate::search::new::query_graph::QueryNodeData; +use crate::search::new::query_term::ExactTerm; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; + +/// A ranking rule that produces 3 disjoint buckets: +/// +/// 1. Documents from the universe whose value is exactly the query. +/// 2. Documents from the universe not in (1) whose value starts with the query. +/// 3. Documents from the universe not in (1) or (2). +pub struct ExactAttribute { + state: State, +} + +impl ExactAttribute { + pub fn new() -> Self { + Self { state: Default::default() } + } +} + +impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { + fn id(&self) -> String { + "exact_attribute".to_owned() + } + + fn start_iteration( + &mut self, + ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + universe: &roaring::RoaringBitmap, + query: &QueryGraph, + ) -> Result<()> { + self.state = State::start_iteration(ctx, universe, query)?; + + Ok(()) + } + + fn next_bucket( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + universe: &roaring::RoaringBitmap, + ) -> Result>> { + let state = std::mem::take(&mut self.state); + let (state, output) = State::next(state, universe); + self.state = state; + + Ok(output) + } + + fn end_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + ) { + self.state = Default::default(); + } +} + +/// Inner state of the ranking rule. +#[derive(Default)] +enum State { + /// State between two iterations + #[default] + Uninitialized, + /// The next call to `next` will output the documents in the universe that have an attribute that is the exact query + ExactAttribute(QueryGraph, Vec), + /// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query, + /// but isn't the exact query. + AttributeStarts(QueryGraph, Vec), + /// The next calls to `next` will output the input universe. + Empty(QueryGraph), +} + +/// The candidates sorted by attributes +/// +/// Each of the bitmap in a single `FieldCandidates` struct applies to the same field. +struct FieldCandidates { + /// The candidates that start with all the words of the query in the field + start_with_exact: RoaringBitmap, + /// The candidates that have the same number of words as the query in the field + exact_word_count: RoaringBitmap, +} + +impl State { + fn start_iteration( + ctx: &mut SearchContext<'_>, + universe: &RoaringBitmap, + query_graph: &QueryGraph, + ) -> Result { + let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = + Vec::with_capacity(query_graph.nodes.len() as usize); + for (_, node) in query_graph.nodes.iter() { + match &node.data { + QueryNodeData::Term(term) => { + let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { + exact_term + } else { + continue; + }; + exact_term_position_ids.push(( + exact_term, + *term.positions.start(), + *term.term_ids.start(), + )) + } + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, + } + } + + exact_term_position_ids.sort_by_key(|(_, _, id)| *id); + // bail if there is a "hole" (missing word) in remaining query graph + if let Some((_, _, first_id)) = exact_term_position_ids.first() { + if *first_id != 0 { + return Ok(State::Empty(query_graph.clone())); + } + } else { + return Ok(State::Empty(query_graph.clone())); + } + let mut previous_id = 0; + for (_, _, id) in exact_term_position_ids.iter().copied() { + if id < previous_id || id - previous_id > 1 { + return Ok(State::Empty(query_graph.clone())); + } else { + previous_id = id; + } + } + + // sample query: "sunflower are pretty" + // sunflower at pos 0 in attr A + // are at pos 1 in attr B + // pretty at pos 2 in attr C + // We want to eliminate such document + + // first check that for each term, there exists some attribute that has this term at the correct position + //"word-position-docids"; + let mut candidates = universe.clone(); + let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids + .iter() + .copied() + .map(|(term, position, _)| (term.interned_words(ctx).collect(), position)) + .collect(); + for (words, position) in &words_positions { + if candidates.is_empty() { + return Ok(State::Empty(query_graph.clone())); + } + + 'words: for (offset, word) in words.iter().enumerate() { + let offset = offset as u16; + let word = if let Some(word) = word { + word + } else { + continue 'words; + }; + // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of + // longer phrases we'll be losing on precision here. + let bucketed_position = crate::bucketed_position(position + offset); + let word_position_docids = CboRoaringBitmapCodec::bytes_decode( + ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(), + ) + .unwrap_or_default(); + candidates &= word_position_docids; + } + } + + let candidates = candidates; + + if candidates.is_empty() { + return Ok(State::Empty(query_graph.clone())); + } + + let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); + + let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); + + // then check that there exists at least one attribute that has all of the terms + for fid in searchable_fields_ids { + let mut intersection = MultiOps::intersection( + words_positions + .iter() + .flat_map(|(words, ..)| words.iter()) + // ignore stop words words in phrases + .flatten() + .map(|word| -> Result<_> { + Ok(ctx + .get_db_word_fid_docids(*word, fid)? + .map(CboRoaringBitmapCodec::bytes_decode) + .unwrap_or_default() + .unwrap_or_default()) + }), + )?; + intersection &= &candidates; + if !intersection.is_empty() { + let candidates_with_exact_word_count = ctx + .index + .field_id_word_count_docids + .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))? + .unwrap_or_default(); + candidates_per_attribute.push(FieldCandidates { + start_with_exact: intersection, + exact_word_count: candidates_with_exact_word_count, + }); + } + } + // note we could have "false positives" where there both exist different attributes that collectively + // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. + + Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute)) + } + + fn next( + state: State, + universe: &RoaringBitmap, + ) -> (State, Option>) { + let (state, output) = match state { + State::Uninitialized => (state, None), + State::ExactAttribute(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.iter().map( + |FieldCandidates { start_with_exact, exact_word_count }| { + start_with_exact & exact_word_count + }, + )); + candidates &= universe; + ( + State::AttributeStarts(query_graph.clone(), candidates_per_attribute), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::AttributeStarts(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map( + |FieldCandidates { mut start_with_exact, exact_word_count }| { + start_with_exact -= exact_word_count; + start_with_exact + }, + )); + candidates &= universe; + ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::Empty(query_graph) => ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }), + ), + }; + (state, output) + } +} diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index b8c58c726..28b4ed1f4 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner}; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, - TypoGraph, + ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph, + RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; @@ -65,6 +65,12 @@ impl GraphBasedRankingRule { Self::new_with_id("typo".to_owned(), terms_matching_strategy) } } +pub type Exactness = GraphBasedRankingRule; +impl GraphBasedRankingRule { + pub fn new() -> Self { + Self::new_with_id("exactness".to_owned(), None) + } +} /// A generic graph-based ranking rule pub struct GraphBasedRankingRule { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index e7e38fe89..b307b2434 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -10,8 +10,9 @@ mod query_term; mod ranking_rule_graph; mod ranking_rules; mod resolve_query_graph; -// TODO: documentation + comments mod small_bitmap; + +mod exact_attribute; // TODO: documentation + comments // implementation is currently an adaptation of the previous implementation to fit with the new model mod sort; @@ -38,6 +39,8 @@ use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; +use self::exact_attribute::ExactAttribute; +use self::graph_based_ranking_rule::Exactness; use self::interner::Interner; use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::resolve_query_graph::compute_query_graph_docids; @@ -155,7 +158,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut proximity = false; let mut sort = false; let attribute = false; - let exactness = false; + let mut exactness = false; let mut asc = HashSet::new(); let mut desc = HashSet::new(); @@ -216,8 +219,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( if exactness { continue; } - // todo!(); - // exactness = false; + ranking_rules.push(Box::new(ExactAttribute::new())); + ranking_rules.push(Box::new(Exactness::new())); + exactness = true; } crate::Criterion::Asc(field_name) => { if asc.contains(&field_name) { diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index f95956fbf..695c57f08 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -244,7 +244,8 @@ pub fn partially_initialized_term_from_word( Some(ctx.phrase_interner.insert(Phrase { words })) }) .collect(); - let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db }; + let zero_typo = + ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db }; Ok(QueryTerm { original: word_interned, diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 50977395b..896c70e1b 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -9,16 +9,14 @@ use crate::Result; use std::collections::BTreeSet; use std::ops::RangeInclusive; +use either::Either; pub use ntypo_subset::NTypoTermSubset; pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed}; pub use phrase::Phrase; use compute_derivations::partially_initialized_term_from_word; -/** -A set of word derivations attached to a location in the search query. - -*/ +/// A set of word derivations attached to a location in the search query. #[derive(Clone, PartialEq, Eq, Hash)] pub struct LocatedQueryTermSubset { pub term_subset: QueryTermSubset, @@ -53,7 +51,7 @@ struct ZeroTypoTerm { /// The original phrase, if any phrase: Option>, /// A single word equivalent to the original term, with zero typos - zero_typo: Option>, + exact: Option>, /// All the words that contain the original word as prefix prefix_of: BTreeSet>, /// All the synonyms of the original word or phrase @@ -94,7 +92,43 @@ impl Lazy { } } +#[derive(Clone, Copy)] +pub enum ExactTerm { + Phrase(Interned), + Word(Interned), +} + +impl ExactTerm { + pub fn interned_words<'ctx>( + &self, + ctx: &'ctx SearchContext<'ctx>, + ) -> impl Iterator>> + 'ctx { + match *self { + ExactTerm::Phrase(phrase) => { + let phrase = ctx.phrase_interner.get(phrase); + Either::Left(phrase.words.iter().copied()) + } + ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))), + } + } +} + impl QueryTermSubset { + pub fn exact_term(&self, ctx: &SearchContext) -> Option { + let full_query_term = ctx.term_interner.get(self.original); + if full_query_term.ngram_words.is_some() { + return None; + } + // TODO: included in subset + if let Some(phrase) = full_query_term.zero_typo.phrase { + self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase)) + } else if let Some(word) = full_query_term.zero_typo.exact { + self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word)) + } else { + None + } + } + pub fn empty(for_term: Interned) -> Self { Self { original: for_term, @@ -155,8 +189,13 @@ impl QueryTermSubset { let original = ctx.term_interner.get_mut(self.original); if !self.zero_typo_subset.is_empty() { - let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = - &original.zero_typo; + let ZeroTypoTerm { + phrase: _, + exact: zero_typo, + prefix_of, + synonyms: _, + use_prefix_db: _, + } = &original.zero_typo; result.extend(zero_typo.iter().copied()); result.extend(prefix_of.iter().copied()); }; @@ -204,7 +243,7 @@ impl QueryTermSubset { } let original = ctx.term_interner.get_mut(self.original); - let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = + let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } = &original.zero_typo; result.extend(phrase.iter().copied()); result.extend(synonyms.iter().copied()); @@ -270,7 +309,7 @@ impl QueryTermSubset { impl ZeroTypoTerm { fn is_empty(&self) -> bool { - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self; + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self; phrase.is_none() && zero_typo.is_none() && prefix_of.is_empty() diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index e0f6d971b..7bcbeeff4 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -266,7 +266,7 @@ impl PhraseBuilder { is_prefix: false, zero_typo: ZeroTypoTerm { phrase: Some(phrase), - zero_typo: None, + exact: None, prefix_of: BTreeSet::default(), synonyms: BTreeSet::default(), use_prefix_db: None, diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs new file mode 100644 index 000000000..3d558e87b --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -0,0 +1,101 @@ +use heed::BytesDecode; +use roaring::RoaringBitmap; + +use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; +use crate::search::new::query_graph::{QueryGraph, QueryNode}; +use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; +use crate::{Result, RoaringBitmapCodec, SearchContext, SearchLogger}; + +#[derive(Clone, PartialEq, Eq, Hash)] +pub enum ExactnessCondition { + ExactInAttribute(LocatedQueryTermSubset), + Skip(LocatedQueryTermSubset), +} + +pub enum ExactnessGraph {} + +fn compute_docids( + ctx: &mut SearchContext, + dest_node: &LocatedQueryTermSubset, + universe: &RoaringBitmap, +) -> Result { + let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) { + exact_term + } else { + return Ok(Default::default()); + }; + let mut candidates = match exact_term { + ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), + ExactTerm::Word(word) => { + if let Some(word_candidates) = ctx.get_db_word_docids(word)? { + RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)? + } else { + return Ok(Default::default()); + } + } + }; + // TODO: synonyms? + candidates &= universe; + Ok(candidates) +} + +impl RankingRuleGraphTrait for ExactnessGraph { + type Condition = ExactnessCondition; + + fn resolve_condition( + ctx: &mut SearchContext, + condition: &Self::Condition, + universe: &RoaringBitmap, + ) -> Result { + let (docids, dest_node) = match condition { + ExactnessCondition::ExactInAttribute(dest_node) => { + (compute_docids(ctx, dest_node, universe)?, dest_node) + } + ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node), + }; + Ok(ComputedCondition { + docids, + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: dest_node.clone(), + }) + } + + fn build_edges( + _ctx: &mut SearchContext, + conditions_interner: &mut DedupInterner, + _source_node: Option<&LocatedQueryTermSubset>, + dest_node: &LocatedQueryTermSubset, + ) -> Result)>> { + let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone()); + let exact_condition = conditions_interner.insert(exact_condition); + + let skip_condition = ExactnessCondition::Skip(dest_node.clone()); + let skip_condition = conditions_interner.insert(skip_condition); + + Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)]) + } + + fn log_state( + _graph: &RankingRuleGraph, + _paths: &[Vec>], + _dead_ends_cache: &DeadEndsCache, + _niverse: &RoaringBitmap, + _costs: &MappedInterner>, + _cost: u64, + _logger: &mut dyn SearchLogger, + ) { + } + + fn label_for_condition( + _ctx: &mut SearchContext, + condition: &Self::Condition, + ) -> Result { + Ok(match condition { + ExactnessCondition::ExactInAttribute(_) => "exact", + ExactnessCondition::Skip(_) => "skip", + } + .to_owned()) + } +} diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 7c40008c8..936c3e942 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -10,6 +10,8 @@ mod cheapest_paths; mod condition_docids_cache; mod dead_ends_cache; +/// Implementation of the `exactness` ranking rule +mod exactness; /// Implementation of the `proximity` ranking rule mod proximity; /// Implementation of the `typo` ranking rule @@ -20,6 +22,7 @@ use std::hash::Hash; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; +pub use exactness::{ExactnessCondition, ExactnessGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 8496054b7..07bd102ca 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,14 +1,17 @@ #![allow(clippy::too_many_arguments)] +use std::collections::BTreeSet; + +use heed::BytesDecode; +use roaring::RoaringBitmap; + use super::ProximityCondition; use crate::search::new::interner::Interned; use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Result}; -use roaring::RoaringBitmap; -use std::collections::BTreeSet; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; pub fn compute_docids( ctx: &mut SearchContext, @@ -90,7 +93,8 @@ pub fn compute_docids( continue; } } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? { - let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?; + let left_word_docids = + RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?; if universe.is_disjoint(&left_word_docids) { continue; } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f7f1a97e6..eb94c4be9 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) }) } +pub fn snap_word_fid_docids(index: &Index) -> String { + make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| { + &format!("{word:<16} {fid:<3} {}", display_bitmap(&b)) + }) +} pub fn snap_field_id_word_count_docids(index: &Index) -> String { make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| { &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) @@ -477,6 +482,9 @@ macro_rules! full_snap_of_db { ($index:ident, word_position_docids) => {{ $crate::snapshot_tests::snap_word_position_docids(&$index) }}; + ($index:ident, word_fid_docids) => {{ + $crate::snapshot_tests::snap_word_fid_docids(&$index) + }}; ($index:ident, field_id_word_count_docids) => {{ $crate::snapshot_tests::snap_field_id_word_count_docids(&$index) }}; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 2d51fcc1a..c362f8f1b 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -153,7 +153,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st /// take an iterator on tokens and compute their relative position depending on separator kinds /// if it's an `Hard` separator we add an additional relative proximity of 8 between words, -/// else we keep the standart proximity of 1 between words. +/// else we keep the standard proximity of 1 between words. fn process_tokens<'a>( tokens: impl Iterator>, ) -> impl Iterator)> { diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs new file mode 100644 index 000000000..72b30cddf --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs @@ -0,0 +1,48 @@ +use std::fs::File; +use std::io; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::{relative_from_absolute_position, DocumentId, Result}; + +/// Extracts the word, field id, and the documents ids where this word appear at this field id. +#[logging_timer::time] +pub fn extract_word_fid_docids( + docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_fid_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = DocumentId::from_be_bytes(document_id_bytes); + + for position in read_u32_ne_bytes(value) { + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + let (fid, _) = relative_from_absolute_position(position); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + } + + let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; + + Ok(word_fid_docids_reader) +} diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index cd3ec691b..80a36c308 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -7,17 +7,14 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{ - absolute_from_relative_position, bucketed_position, relative_from_absolute_position, - DocumentId, Result, -}; +use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result}; /// Extracts the word positions and the documents ids where this word appear. /// /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_word_fid_and_position_docids( +pub fn extract_word_position_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { @@ -42,9 +39,8 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (fid, position) = relative_from_absolute_position(position); + let (_, position) = relative_from_absolute_position(position); let position = bucketed_position(position); - let position = absolute_from_relative_position(fid, position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 844efed36..db041de6f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_word_docids; +mod extract_word_fid_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; @@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; +use self::extract_word_fid_docids::extract_word_fid_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; -use self::extract_word_position_docids::extract_word_fid_and_position_docids; +use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, @@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>( - docid_word_positions_chunks, + docid_word_positions_chunks.clone(), indexer, lmdb_writer_sx.clone(), - extract_word_fid_and_position_docids, + extract_word_position_docids, merge_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_word_positions_chunks, + indexer, + lmdb_writer_sx.clone(), + extract_word_fid_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordFidDocids, + "word-fid-docids", + ); spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ade217beb..235b35fc8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2255,4 +2255,61 @@ mod tests { {"id":1,"catto":"jorts"} "###); } + + #[test] + fn test_word_fid_position() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + {"id": 0, "text": "sun flowers are looking at the sun" }, + {"id": 1, "text": "sun flowers are looking at the sun" }, + {"id": 2, "text": "the sun is shining today" }, + { + "id": 3, + "text": "a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a " + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9"); + db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f"); + + index + .add_documents(documents!([ + {"id": 4, "text": "sun flowers are looking at the sun" }, + {"id": 5, "text2": "sun flowers are looking at the sun" }, + {"id": 6, "text": "b b b" }, + { + "id": 7, + "text2": "a a a a" + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4"); + db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete not all of the documents but some of them. + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.strategy(DeletionStrategy::AlwaysHard); + builder.delete_external_id("0"); + builder.delete_external_id("3"); + let result = builder.execute().unwrap(); + println!("{result:?}"); + + wtxn.commit().unwrap(); + + db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); + db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); + db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1"); + } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b9b11cfa8..14ba021bd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,6 +35,7 @@ pub(crate) enum TypedChunk { exact_word_docids_reader: grenad::Reader, }, WordPositionDocids(grenad::Reader), + WordFidDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), @@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::WordFidDocids(word_fid_docids_iter) => { + append_entries_into_database( + word_fid_docids_iter, + &index.word_fid_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + is_merged_database = true; + } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?;