Merge branch 'search-refactor-exactness' into search-refactor-tests-doc

2025-05-25 00:53:59 +02:00 · 2023-04-06 13:07:00 +02:00 · 2023-04-06 13:07:00 +02:00 · 7ca91ebb71
commit 7ca91ebb71
parent b5691802a3 5440f43fd3
17 changed files with 605 additions and 32 deletions
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@ -27,6 +27,8 @@ pub struct DatabaseCache<'ctx> {
    pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
    pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
    pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
    pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
 }
 impl<'ctx> DatabaseCache<'ctx> {
    fn get_value<'v, K1, KC>(
@ -141,4 +143,32 @@ impl<'ctx> SearchContext<'ctx> {
            self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
        )
    }
    /// Looks up the raw bytes stored in the `word_position_docids` database for `word`
    /// at `position`.
    ///
    /// The lookup is delegated to `DatabaseCache::get_value`, with
    /// `db_cache.word_position_docids` as the cache map and `(word, position)` as the cache key.
    pub fn get_db_word_position_docids(
        &mut self,
        word: Interned<String>,
        position: u16,
    ) -> Result<Option<&'ctx [u8]>> {
        let cache_key = (word, position);
        // the database is keyed on the word itself, not on its interned id
        let db_key = (self.word_interner.get(word).as_str(), position);
        let database = self.index.word_position_docids.remap_data_type::<ByteSlice>();
        DatabaseCache::get_value(
            self.txn,
            cache_key,
            &db_key,
            &mut self.db_cache.word_position_docids,
            database,
        )
    }
    /// Looks up the raw bytes stored in the `word_fid_docids` database for `word`
    /// in the field `fid`.
    ///
    /// The lookup is delegated to `DatabaseCache::get_value`, with
    /// `db_cache.word_fid_docids` as the cache map and `(word, fid)` as the cache key.
    pub fn get_db_word_fid_docids(
        &mut self,
        word: Interned<String>,
        fid: u16,
    ) -> Result<Option<&'ctx [u8]>> {
        let cache_key = (word, fid);
        // the database is keyed on the word itself, not on its interned id
        let db_key = (self.word_interner.get(word).as_str(), fid);
        let database = self.index.word_fid_docids.remap_data_type::<ByteSlice>();
        DatabaseCache::get_value(
            self.txn,
            cache_key,
            &db_key,
            &mut self.db_cache.word_fid_docids,
            database,
        )
    }
 }
--- a/milli/src/search/new/exact_attribute.rs
+++ b/milli/src/search/new/exact_attribute.rs
@ -0,0 +1,253 @@
 use heed::BytesDecode;
 use roaring::{MultiOps, RoaringBitmap};
 use super::query_graph::QueryGraph;
 use super::ranking_rules::{RankingRule, RankingRuleOutput};
 use crate::search::new::query_graph::QueryNodeData;
 use crate::search::new::query_term::ExactTerm;
 use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
 /// A ranking rule that produces 3 disjoint buckets:
 ///
 /// 1. Documents from the universe whose value is exactly the query.
 /// 2. Documents from the universe not in (1) whose value starts with the query.
 /// 3. Documents from the universe not in (1) or (2).
 ///
 /// When the rule cannot be applied to the query, it outputs the universe it is given unchanged.
 pub struct ExactAttribute {
    /// Progress of the current iteration; holds the default (`Uninitialized`) state
    /// between two iterations.
    state: State,
 }
 impl ExactAttribute {
    pub fn new() -> Self {
        Self { state: Default::default() }
    }
 }
 impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
    /// Identifier of the rule: `"exact_attribute"`.
    fn id(&self) -> String {
        String::from("exact_attribute")
    }
    /// Computes the state of a new iteration from the universe and the query graph.
    ///
    /// The stored state is only replaced when the computation succeeds.
    fn start_iteration(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
        universe: &roaring::RoaringBitmap,
        query: &QueryGraph,
    ) -> Result<()> {
        let initial_state = State::start_iteration(ctx, universe, query)?;
        self.state = initial_state;
        Ok(())
    }
    /// Outputs the next bucket, restricted to `universe`, and advances the inner state.
    fn next_bucket(
        &mut self,
        _ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
        universe: &roaring::RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
        // `State::next` consumes the state by value: move it out, leaving the default behind
        let current = std::mem::take(&mut self.state);
        let (following, bucket) = State::next(current, universe);
        self.state = following;
        Ok(bucket)
    }
    /// Resets the inner state to its default value.
    fn end_iteration(
        &mut self,
        _ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
    ) {
        self.state = State::default();
    }
 }
 /// Inner state of the ranking rule.
 ///
 /// `State::next` consumes a state and returns the following one, in the order
 /// `ExactAttribute` → `AttributeStarts` → `Empty`; `Empty` then repeats itself.
 /// `State::start_iteration` starts either at `ExactAttribute` or directly at `Empty`.
 #[derive(Default)]
 enum State {
    /// State between two iterations. Calling `next` in this state outputs no bucket.
    #[default]
    Uninitialized,
    /// The next call to `next` will output the documents in the universe that have an attribute that is the exact query
    ExactAttribute(QueryGraph, Vec<FieldCandidates>),
    /// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query,
    /// but isn't the exact query.
    AttributeStarts(QueryGraph, Vec<FieldCandidates>),
    /// The next calls to `next` will output the input universe.
    Empty(QueryGraph),
 }
 /// The candidates sorted by attributes
 ///
 /// Each of the bitmap in a single `FieldCandidates` struct applies to the same field.
 /// `State::next` combines them per field: their intersection feeds the "exact" bucket,
 /// and `start_with_exact` minus `exact_word_count` feeds the "starts with" bucket.
 struct FieldCandidates {
    /// The candidates that start with all the words of the query in the field
    start_with_exact: RoaringBitmap,
    /// The candidates that have the same number of words as the query in the field,
    /// as read from the `field_id_word_count_docids` database
    exact_word_count: RoaringBitmap,
 }
 impl State {
    fn start_iteration(
        ctx: &mut SearchContext<'_>,
        universe: &RoaringBitmap,
        query_graph: &QueryGraph,
    ) -> Result<Self> {
        let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
            Vec::with_capacity(query_graph.nodes.len() as usize);
        for (_, node) in query_graph.nodes.iter() {
            match &node.data {
                QueryNodeData::Term(term) => {
                    let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
                        exact_term
                    } else {
                        continue;
                    };
                    exact_term_position_ids.push((
                        exact_term,
                        *term.positions.start(),
                        *term.term_ids.start(),
                    ))
                }
                QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
            }
        }
        exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
        // bail if there is a "hole" (missing word) in remaining query graph
        if let Some((_, _, first_id)) = exact_term_position_ids.first() {
            if *first_id != 0 {
                return Ok(State::Empty(query_graph.clone()));
            }
        } else {
            return Ok(State::Empty(query_graph.clone()));
        }
        let mut previous_id = 0;
        for (_, _, id) in exact_term_position_ids.iter().copied() {
            if id < previous_id || id - previous_id > 1 {
                return Ok(State::Empty(query_graph.clone()));
            } else {
                previous_id = id;
            }
        }
        // sample query: "sunflower are pretty"
        // sunflower at pos 0 in attr A
        // are at pos 1 in attr B
        // pretty at pos 2 in attr C
        // We want to eliminate such document
        // first check that for each term, there exists some attribute that has this term at the correct position
        //"word-position-docids";
        let mut candidates = universe.clone();
        let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids
            .iter()
            .copied()
            .map(|(term, position, _)| (term.interned_words(ctx).collect(), position))
            .collect();
        for (words, position) in &words_positions {
            if candidates.is_empty() {
                return Ok(State::Empty(query_graph.clone()));
            }
            'words: for (offset, word) in words.iter().enumerate() {
                let offset = offset as u16;
                let word = if let Some(word) = word {
                    word
                } else {
                    continue 'words;
                };
                // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
                // longer phrases we'll be losing on precision here.
                let bucketed_position = crate::bucketed_position(position + offset);
                let word_position_docids = CboRoaringBitmapCodec::bytes_decode(
                    ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(),
                )
                .unwrap_or_default();
                candidates &= word_position_docids;
            }
        }
        let candidates = candidates;
        if candidates.is_empty() {
            return Ok(State::Empty(query_graph.clone()));
        }
        let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default();
        let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
        // then check that there exists at least one attribute that has all of the terms
        for fid in searchable_fields_ids {
            let mut intersection = MultiOps::intersection(
                words_positions
                    .iter()
                    .flat_map(|(words, ..)| words.iter())
                    // ignore stop words words in phrases
                    .flatten()
                    .map(|word| -> Result<_> {
                        Ok(ctx
                            .get_db_word_fid_docids(*word, fid)?
                            .map(CboRoaringBitmapCodec::bytes_decode)
                            .unwrap_or_default()
                            .unwrap_or_default())
                    }),
            )?;
            intersection &= &candidates;
            if !intersection.is_empty() {
                let candidates_with_exact_word_count = ctx
                    .index
                    .field_id_word_count_docids
                    .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))?
                    .unwrap_or_default();
                candidates_per_attribute.push(FieldCandidates {
                    start_with_exact: intersection,
                    exact_word_count: candidates_with_exact_word_count,
                });
            }
        }
        // note we could have "false positives" where there both exist different attributes that collectively
        // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order.
        Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute))
    }
    /// Consumes `state` and returns the state to store for the following call, together with
    /// the bucket to output.
    ///
    /// `Uninitialized` outputs nothing. `ExactAttribute` and `AttributeStarts` output the union
    /// of their per-field candidates restricted to `universe`. `Empty` outputs `universe` itself
    /// and stays in place.
    fn next(
        state: State,
        universe: &RoaringBitmap,
    ) -> (State, Option<RankingRuleOutput<QueryGraph>>) {
        match state {
            State::Uninitialized => (State::Uninitialized, None),
            State::ExactAttribute(query_graph, candidates_per_attribute) => {
                // per field: has all the query words AND has the expected word count
                let mut bucket: RoaringBitmap = MultiOps::union(
                    candidates_per_attribute
                        .iter()
                        .map(|field| &field.start_with_exact & &field.exact_word_count),
                );
                bucket &= universe;
                let following =
                    State::AttributeStarts(query_graph.clone(), candidates_per_attribute);
                (following, Some(RankingRuleOutput { query: query_graph, candidates: bucket }))
            }
            State::AttributeStarts(query_graph, candidates_per_attribute) => {
                // per field: has all the query words but NOT the expected word count
                let mut bucket: RoaringBitmap = MultiOps::union(
                    candidates_per_attribute
                        .into_iter()
                        .map(|field| field.start_with_exact - field.exact_word_count),
                );
                bucket &= universe;
                let following = State::Empty(query_graph.clone());
                (following, Some(RankingRuleOutput { query: query_graph, candidates: bucket }))
            }
            State::Empty(query_graph) => {
                let following = State::Empty(query_graph.clone());
                let output = RankingRuleOutput { query: query_graph, candidates: universe.clone() };
                (following, Some(output))
            }
        }
    }
 }
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner};
 use super::logger::SearchLogger;
 use super::query_graph::QueryNode;
 use super::ranking_rule_graph::{
-    ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait,
+    ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph,
-    TypoGraph,
+    RankingRuleGraphTrait, TypoGraph,
 };
 use super::small_bitmap::SmallBitmap;
 use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
@ -65,6 +65,12 @@ impl GraphBasedRankingRule<TypoGraph> {
        Self::new_with_id("typo".to_owned(), terms_matching_strategy)
    }
 }
 /// The `exactness` ranking rule: a graph-based ranking rule built on `ExactnessGraph`.
 pub type Exactness = GraphBasedRankingRule<ExactnessGraph>;
 impl GraphBasedRankingRule<ExactnessGraph> {
    /// Creates the rule with the id `"exactness"`, passing `None` as the second argument of
    /// `new_with_id`.
    pub fn new() -> Self {
        let id = String::from("exactness");
        Self::new_with_id(id, None)
    }
 }
 /// A generic graph-based ranking rule
 pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -10,8 +10,9 @@ mod query_term;
 mod ranking_rule_graph;
 mod ranking_rules;
 mod resolve_query_graph;
 // TODO: documentation + comments
 mod small_bitmap;
 mod exact_attribute;
 // TODO: documentation + comments
 // implementation is currently an adaptation of the previous implementation to fit with the new model
 mod sort;
@ -38,6 +39,8 @@ use resolve_query_graph::PhraseDocIdsCache;
 use roaring::RoaringBitmap;
 use words::Words;
 use self::exact_attribute::ExactAttribute;
 use self::graph_based_ranking_rule::Exactness;
 use self::interner::Interner;
 use self::ranking_rules::{BoxRankingRule, RankingRule};
 use self::resolve_query_graph::compute_query_graph_docids;
@ -155,7 +158,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
    let mut proximity = false;
    let mut sort = false;
    let attribute = false;
-    let exactness = false;
+    let mut exactness = false;
    let mut asc = HashSet::new();
    let mut desc = HashSet::new();
@ -216,8 +219,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
                if exactness {
                    continue;
                }
-                // todo!();
+                ranking_rules.push(Box::new(ExactAttribute::new()));
-                // exactness = false;
+                ranking_rules.push(Box::new(Exactness::new()));
                exactness = true;
            }
            crate::Criterion::Asc(field_name) => {
                if asc.contains(&field_name) {
--- a/milli/src/search/new/query_term/compute_derivations.rs
+++ b/milli/src/search/new/query_term/compute_derivations.rs
@ -244,7 +244,8 @@ pub fn partially_initialized_term_from_word(
            Some(ctx.phrase_interner.insert(Phrase { words }))
        })
        .collect();
-    let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db };
+    let zero_typo =
        ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db };
    Ok(QueryTerm {
        original: word_interned,
--- a/milli/src/search/new/query_term/mod.rs
+++ b/milli/src/search/new/query_term/mod.rs
@ -9,16 +9,14 @@ use crate::Result;
 use std::collections::BTreeSet;
 use std::ops::RangeInclusive;
 use either::Either;
 pub use ntypo_subset::NTypoTermSubset;
 pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
 pub use phrase::Phrase;
 use compute_derivations::partially_initialized_term_from_word;
-/**
+/// A set of word derivations attached to a location in the search query.
 A set of word derivations attached to a location in the search query.
 */
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub struct LocatedQueryTermSubset {
    pub term_subset: QueryTermSubset,
@ -53,7 +51,7 @@ struct ZeroTypoTerm {
    /// The original phrase, if any
    phrase: Option<Interned<Phrase>>,
    /// A single word equivalent to the original term, with zero typos
-    zero_typo: Option<Interned<String>>,
+    exact: Option<Interned<String>>,
    /// All the words that contain the original word as prefix
    prefix_of: BTreeSet<Interned<String>>,
    /// All the synonyms of the original word or phrase
@ -94,7 +92,43 @@ impl<T> Lazy<T> {
    }
 }
 /// The exact form of a query term, as returned by `QueryTermSubset::exact_term`:
 /// either the phrase or the zero-typo word of the original term.
 #[derive(Clone, Copy)]
 pub enum ExactTerm {
    /// The interned phrase of the term
    Phrase(Interned<Phrase>),
    /// The interned zero-typo word of the term
    Word(Interned<String>),
 }
 impl ExactTerm {
    /// Iterates over the interned words making up this term.
    ///
    /// A phrase yields the entries of its `words` list as they are stored (an entry may be
    /// `None`); a single word yields exactly one `Some(word)`.
    pub fn interned_words<'ctx>(
        &self,
        ctx: &'ctx SearchContext<'ctx>,
    ) -> impl Iterator<Item = Option<Interned<String>>> + 'ctx {
        match self {
            ExactTerm::Word(word) => Either::Right(std::iter::once(Some(*word))),
            ExactTerm::Phrase(phrase) => {
                let words = &ctx.phrase_interner.get(*phrase).words;
                Either::Left(words.iter().copied())
            }
        }
    }
 }
 impl QueryTermSubset {
    /// Returns the exact form of the term this subset was taken from, if the subset still
    /// contains it.
    ///
    /// Ngram terms never have an exact form. The phrase of the term takes precedence over its
    /// zero-typo word: when the term has a phrase that the subset does not contain, `None` is
    /// returned without looking at the word.
    pub fn exact_term(&self, ctx: &SearchContext) -> Option<ExactTerm> {
        let original_term = ctx.term_interner.get(self.original);
        if original_term.ngram_words.is_some() {
            return None;
        }
        // TODO: included in subset
        let zero_typo = &original_term.zero_typo;
        match (zero_typo.phrase, zero_typo.exact) {
            (Some(phrase), _) => {
                self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
            }
            (None, Some(word)) => {
                self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word))
            }
            (None, None) => None,
        }
    }
    pub fn empty(for_term: Interned<QueryTerm>) -> Self {
        Self {
            original: for_term,
@ -155,8 +189,13 @@ impl QueryTermSubset {
        let original = ctx.term_interner.get_mut(self.original);
        if !self.zero_typo_subset.is_empty() {
-            let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
+            let ZeroTypoTerm {
-                &original.zero_typo;
+                phrase: _,
                exact: zero_typo,
                prefix_of,
                synonyms: _,
                use_prefix_db: _,
            } = &original.zero_typo;
            result.extend(zero_typo.iter().copied());
            result.extend(prefix_of.iter().copied());
        };
@ -204,7 +243,7 @@ impl QueryTermSubset {
        }
        let original = ctx.term_interner.get_mut(self.original);
-        let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
+        let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } =
            &original.zero_typo;
        result.extend(phrase.iter().copied());
        result.extend(synonyms.iter().copied());
@ -270,7 +309,7 @@ impl QueryTermSubset {
 impl ZeroTypoTerm {
    fn is_empty(&self) -> bool {
-        let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self;
+        let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self;
        phrase.is_none()
            && zero_typo.is_none()
            && prefix_of.is_empty()
--- a/milli/src/search/new/query_term/parse_query.rs
+++ b/milli/src/search/new/query_term/parse_query.rs
@ -266,7 +266,7 @@ impl PhraseBuilder {
                    is_prefix: false,
                    zero_typo: ZeroTypoTerm {
                        phrase: Some(phrase),
-                        zero_typo: None,
+                        exact: None,
                        prefix_of: BTreeSet::default(),
                        synonyms: BTreeSet::default(),
                        use_prefix_db: None,
--- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
@ -0,0 +1,101 @@
 use heed::BytesDecode;
 use roaring::RoaringBitmap;
 use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
 use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
 use crate::search::new::query_graph::{QueryGraph, QueryNode};
 use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
 use crate::{Result, RoaringBitmapCodec, SearchContext, SearchLogger};
 /// The condition attached to an edge of the exactness graph.
 ///
 /// Both variants carry the term subset of the edge's destination node.
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum ExactnessCondition {
    /// Resolved to the documents of the universe that contain the exact form of the
    /// destination term; `build_edges` gives this edge a cost of 0.
    ExactInAttribute(LocatedQueryTermSubset),
    /// Resolved to the whole universe; `build_edges` gives this edge a cost equal to the
    /// number of term ids covered by the destination term.
    Skip(LocatedQueryTermSubset),
 }
 /// Type-level marker for the exactness ranking rule graph.
 ///
 /// The enum has no variant and therefore no value: it only exists to carry the
 /// `RankingRuleGraphTrait` implementation.
 pub enum ExactnessGraph {}
 fn compute_docids(
    ctx: &mut SearchContext,
    dest_node: &LocatedQueryTermSubset,
    universe: &RoaringBitmap,
 ) -> Result<RoaringBitmap> {
    let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) {
        exact_term
    } else {
        return Ok(Default::default());
    };
    let mut candidates = match exact_term {
        ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
        ExactTerm::Word(word) => {
            if let Some(word_candidates) = ctx.get_db_word_docids(word)? {
                RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)?
            } else {
                return Ok(Default::default());
            }
        }
    };
    // TODO: synonyms?
    candidates &= universe;
    Ok(candidates)
 }
 impl RankingRuleGraphTrait for ExactnessGraph {
    type Condition = ExactnessCondition;
    /// Resolves `condition` into the documents of `universe` that satisfy it.
    ///
    /// `ExactInAttribute` keeps the documents returned by `compute_docids`; `Skip` keeps the
    /// whole universe. In both cases the destination term subset is reported unchanged and no
    /// start term subset is set.
    fn resolve_condition(
        ctx: &mut SearchContext,
        condition: &Self::Condition,
        universe: &RoaringBitmap,
    ) -> Result<ComputedCondition> {
        let (dest_node, docids) = match condition {
            ExactnessCondition::Skip(dest_node) => (dest_node, universe.clone()),
            ExactnessCondition::ExactInAttribute(dest_node) => {
                (dest_node, compute_docids(ctx, dest_node, universe)?)
            }
        };
        let computed = ComputedCondition {
            docids,
            universe_len: universe.len(),
            start_term_subset: None,
            end_term_subset: dest_node.clone(),
        };
        Ok(computed)
    }
    /// Builds the two edges leading to `dest_node`: an "exact" edge of cost 0 and a "skip"
    /// edge whose cost is the number of term ids covered by `dest_node`.
    fn build_edges(
        _ctx: &mut SearchContext,
        conditions_interner: &mut DedupInterner<Self::Condition>,
        _source_node: Option<&LocatedQueryTermSubset>,
        dest_node: &LocatedQueryTermSubset,
    ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
        // the exact condition is interned before the skip condition, as in the edge list
        let exact_edge =
            (0, conditions_interner.insert(ExactnessCondition::ExactInAttribute(dest_node.clone())));
        let skip_cost = dest_node.term_ids.len() as u32;
        let skip_edge =
            (skip_cost, conditions_interner.insert(ExactnessCondition::Skip(dest_node.clone())));
        Ok(vec![exact_edge, skip_edge])
    }
    /// Logging hook of the graph; intentionally does nothing for this rule.
    fn log_state(
        _graph: &RankingRuleGraph<Self>,
        _paths: &[Vec<Interned<Self::Condition>>],
        _dead_ends_cache: &DeadEndsCache<Self::Condition>,
        _universe: &RoaringBitmap,
        _costs: &MappedInterner<QueryNode, Vec<u64>>,
        _cost: u64,
        _logger: &mut dyn SearchLogger<QueryGraph>,
    ) {
    }
    /// Short label describing `condition`: `"exact"` or `"skip"`.
    fn label_for_condition(
        _ctx: &mut SearchContext,
        condition: &Self::Condition,
    ) -> Result<String> {
        let label = match condition {
            ExactnessCondition::ExactInAttribute(_) => "exact",
            ExactnessCondition::Skip(_) => "skip",
        };
        Ok(label.to_owned())
    }
 }
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@ -10,6 +10,8 @@ mod cheapest_paths;
 mod condition_docids_cache;
 mod dead_ends_cache;
 /// Implementation of the `exactness` ranking rule
 mod exactness;
 /// Implementation of the `proximity` ranking rule
 mod proximity;
 /// Implementation of the `typo` ranking rule
@ -20,6 +22,7 @@ use std::hash::Hash;
 pub use cheapest_paths::PathVisitor;
 pub use condition_docids_cache::ConditionDocIdsCache;
 pub use dead_ends_cache::DeadEndsCache;
 pub use exactness::{ExactnessCondition, ExactnessGraph};
 pub use proximity::{ProximityCondition, ProximityGraph};
 use roaring::RoaringBitmap;
 pub use typo::{TypoCondition, TypoGraph};
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@ -1,14 +1,17 @@
 #![allow(clippy::too_many_arguments)]
 use std::collections::BTreeSet;
 use heed::BytesDecode;
 use roaring::RoaringBitmap;
 use super::ProximityCondition;
 use crate::search::new::interner::Interned;
 use crate::search::new::query_term::{Phrase, QueryTermSubset};
 use crate::search::new::ranking_rule_graph::ComputedCondition;
 use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
 use crate::search::new::SearchContext;
-use crate::{CboRoaringBitmapCodec, Result};
+use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
 use roaring::RoaringBitmap;
 use std::collections::BTreeSet;
 pub fn compute_docids(
    ctx: &mut SearchContext,
@ -90,7 +93,8 @@ pub fn compute_docids(
                    continue;
                }
            } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? {
-                let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?;
+                let left_word_docids =
                    RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?;
                if universe.is_disjoint(&left_word_docids) {
                    continue;
                }
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String {
        &format!("{word:<16} {position:<6} {}", display_bitmap(&b))
    })
 }
 /// Builds the snapshot string of the `word_fid_docids` database of `index`.
 ///
 /// Each `((word, fid), bitmap)` entry is formatted as the word left-aligned on 16 columns,
 /// the field id left-aligned on 3 columns, and the bitmap as rendered by `display_bitmap`.
 pub fn snap_word_fid_docids(index: &Index) -> String {
    make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| {
        &format!("{word:<16} {fid:<3} {}", display_bitmap(&b))
    })
 }
 pub fn snap_field_id_word_count_docids(index: &Index) -> String {
    make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| {
        &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b))
@ -477,6 +482,9 @@ macro_rules! full_snap_of_db {
    ($index:ident, word_position_docids) => {{
        $crate::snapshot_tests::snap_word_position_docids(&$index)
    }};
    ($index:ident, word_fid_docids) => {{
        $crate::snapshot_tests::snap_word_fid_docids(&$index)
    }};
    ($index:ident, field_id_word_count_docids) => {{
        $crate::snapshot_tests::snap_field_id_word_count_docids(&$index)
    }};
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@ -153,7 +153,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
 /// take an iterator on tokens and compute their relative position depending on separator kinds
 /// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
-/// else we keep the standart proximity of 1 between words.
+/// else we keep the standard proximity of 1 between words.
 fn process_tokens<'a>(
    tokens: impl Iterator<Item = Token<'a>>,
 ) -> impl Iterator<Item = (usize, Token<'a>)> {
--- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
@ -0,0 +1,48 @@
 use std::fs::File;
 use std::io;
 use super::helpers::{
    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
    try_split_array_at, GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::{relative_from_absolute_position, DocumentId, Result};
 /// Extracts, for every word and field id, the ids of the documents in which the word
 /// appears in that field.
 ///
 /// Reads a chunk of docid word positions whose keys are a document id followed by the word
 /// bytes, and whose values are lists of absolute positions. Returns a grenad reader whose
 /// keys are the word bytes followed by the big-endian field id.
 ///
 /// # Errors
 ///
 /// Fails with a `SerializationError::Decoding` on `DOCID_WORD_POSITIONS` when a key cannot be
 /// split into a document id and a word, and propagates the errors of the cursor and sorter.
 #[logging_timer::time]
 pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
 ) -> Result<grenad::Reader<File>> {
    let max_memory = indexer.max_memory_by_thread();
    let mut sorter = create_sorter(
        grenad::SortAlgorithm::Unstable,
        merge_cbo_roaring_bitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory,
    );
    // reused for every inserted entry to avoid allocating a key per position
    let mut key_buffer = Vec::new();
    let mut cursor = docid_word_positions.into_cursor()?;
    while let Some((key, value)) = cursor.move_on_next()? {
        let (document_id_bytes, word_bytes) = try_split_array_at(key)
            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
        let document_id = DocumentId::from_be_bytes(document_id_bytes);
        for absolute_position in read_u32_ne_bytes(value) {
            // only the field id part of the position is kept
            let (fid, _) = relative_from_absolute_position(absolute_position);
            key_buffer.clear();
            key_buffer.extend_from_slice(word_bytes);
            key_buffer.extend_from_slice(&fid.to_be_bytes());
            sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
        }
    }
    let reader = sorter_into_reader(sorter, indexer)?;
    Ok(reader)
 }
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@ -7,17 +7,14 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{
+use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};
    absolute_from_relative_position, bucketed_position, relative_from_absolute_position,
    DocumentId, Result,
 };
 /// Extracts the word positions and the documents ids where this word appear.
 ///
 /// Returns a grenad reader with the list of extracted words at positions and
 /// documents ids from the given chunk of docid word positions.
 #[logging_timer::time]
-pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
+pub fn extract_word_position_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
 ) -> Result<grenad::Reader<File>> {
@ -42,9 +39,8 @@ pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
        for position in read_u32_ne_bytes(value) {
            key_buffer.clear();
            key_buffer.extend_from_slice(word_bytes);
-            let (fid, position) = relative_from_absolute_position(position);
+            let (_, position) = relative_from_absolute_position(position);
            let position = bucketed_position(position);
            let position = absolute_from_relative_position(fid, position);
            key_buffer.extend_from_slice(&position.to_be_bytes());
            word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
        }
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values;
 mod extract_fid_word_count_docids;
 mod extract_geo_points;
 mod extract_word_docids;
 mod extract_word_fid_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;
@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
 use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
 use self::extract_geo_points::extract_geo_points;
 use self::extract_word_docids::extract_word_docids;
 use self::extract_word_fid_docids::extract_word_fid_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
-use self::extract_word_position_docids::extract_word_fid_and_position_docids;
+use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{
    as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
    GrenadParameters, MergeFn, MergeableReader,
@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents(
    );
    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
-        docid_word_positions_chunks,
+        docid_word_positions_chunks.clone(),
        indexer,
        lmdb_writer_sx.clone(),
-        extract_word_fid_and_position_docids,
+        extract_word_position_docids,
        merge_cbo_roaring_bitmaps,
        TypedChunk::WordPositionDocids,
        "word-position-docids",
    );
    // Spawn the "word-fid-docids" extraction task: `docid_word_positions_chunks` is
    // handed to `extract_word_fid_docids`, and the output is tagged as
    // `TypedChunk::WordFidDocids`. The chunks are moved here (no `.clone()`), unlike
    // the word-position task, which receives a clone of the same chunks.
    // NOTE(review): `merge_cbo_roaring_bitmaps` is presumably the merge function
    // applied to the extracted readers — confirm against `spawn_extraction_task`.
    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
        docid_word_positions_chunks,
        indexer,
        lmdb_writer_sx.clone(),
        extract_word_fid_docids,
        merge_cbo_roaring_bitmaps,
        TypedChunk::WordFidDocids,
        "word-fid-docids",
    );
    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
        docid_fid_facet_strings_chunks,
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -2255,4 +2255,61 @@ mod tests {
        {"id":1,"catto":"jorts"}
        "###);
    }
    // Snapshot test for the `word_fid_docids` and `word_position_docids` databases.
    // It indexes two batches of documents, then deletes two of them, and compares a
    // snapshot of the databases against a recorded hash after each of the three steps.
    #[test]
    fn test_word_fid_position() {
        let index = TempIndex::new();
        // Step 1: every document in this batch uses the single `text` field.
        // Document 3 holds a long run of the same word.
        // NOTE(review): presumably there to exercise position bucketing — confirm.
        index
            .add_documents(documents!([
              {"id": 0, "text": "sun flowers are looking at the sun" },
              {"id": 1, "text": "sun flowers are looking at the sun" },
              {"id": 2, "text": "the sun is shining today" },
              {
                "id": 3,
                "text": "a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a 
                a a a a a a a a a a a a a a a a a a a a a a a a a a 
                a a a a a a a a a a a a a a a a a a a a a a a a a a 
                a a a a a a a a a a a a a a a a a a a a a a a a a a 
                a a a a a a a a a a a a a a a a a a a a a a a a a a 
                a a a a a a a a a a a a a a a a a a a a a "
             }
            ]))
            .unwrap();
        db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9");
        db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f");
        // Step 2: add more documents to the existing index. Documents 5 and 7 use a
        // second field, `text2`, while documents 4 and 6 keep using `text`.
        index
            .add_documents(documents!([
              {"id": 4, "text": "sun flowers are looking at the sun" },
              {"id": 5, "text2": "sun flowers are looking at the sun" },
              {"id": 6, "text": "b b b" },
              {
                "id": 7,
                "text2": "a a a a"
             }
            ]))
            .unwrap();
        db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
        db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");
        // Step 3: delete a subset of the documents inside a dedicated write transaction.
        let mut wtxn = index.write_txn().unwrap();
        // Delete only some of the documents (external ids "0" and "3"), not all of them,
        // using the `AlwaysHard` deletion strategy.
        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        builder.strategy(DeletionStrategy::AlwaysHard);
        builder.delete_external_id("0");
        builder.delete_external_id("3");
        let result = builder.execute().unwrap();
        // Debug output of the deletion result; it is not asserted on.
        println!("{result:?}");
        wtxn.commit().unwrap();
        // After the deletion, `docid_word_positions` is snapshotted as well.
        db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
        db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
    }
 }
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@ -35,6 +35,7 @@ pub(crate) enum TypedChunk {
        exact_word_docids_reader: grenad::Reader<File>,
    },
    WordPositionDocids(grenad::Reader<File>),
    WordFidDocids(grenad::Reader<File>),
    WordPairProximityDocids(grenad::Reader<File>),
    FieldIdFacetStringDocids(grenad::Reader<File>),
    FieldIdFacetNumberDocids(grenad::Reader<File>),
@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index(
            )?;
            is_merged_database = true;
        }
        // Write the word-fid-docids chunk into `index.word_fid_docids`.
        // The value closure returns each value unchanged (the buffer is unused), and
        // `merge_cbo_roaring_bitmaps` is passed as the merge function.
        // NOTE(review): presumably the merge is only needed when the database already
        // holds entries (`index_is_empty == false`) — confirm in
        // `append_entries_into_database`.
        TypedChunk::WordFidDocids(word_fid_docids_iter) => {
            append_entries_into_database(
                word_fid_docids_iter,
                &index.word_fid_docids,
                wtxn,
                index_is_empty,
                |value, _buffer| Ok(value),
                merge_cbo_roaring_bitmaps,
            )?;
            is_merged_database = true;
        }
        TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
            let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
            indexer.execute(wtxn)?;