Fix several warnings in extractors and remove unreachable macros

2025-07-04 12:27:13 +02:00 · 2024-09-09 14:52:50 +02:00 · 2024-09-09 14:52:50 +02:00 · f69688e8f7
commit f69688e8f7
parent 8fd0afaaaa
5 changed files with 100 additions and 87 deletions
--- a/milli/src/update/new/extract/faceted/mod.rs
+++ b/milli/src/update/new/extract/faceted/mod.rs
@ -87,11 +87,11 @@ pub trait FacetedExtractor {
    where
        MF: MergeFunction,
        MF::Error: Debug,
        grenad::Error<MF::Error>: Into<crate::Error>,
    {
        buffer.clear();
        match Self::build_key(fid, value, buffer) {
-            // TODO manage errors
+            Some(key) => cache_fn(cached_sorter, &key, docid).map_err(Into::into),
            Some(key) => Ok(cache_fn(cached_sorter, &key, docid).unwrap()),
            None => Ok(()),
        }
    }
--- a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs
+++ b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs
@ -1,4 +1,3 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
 use heed::RoTxn;
@ -25,12 +24,6 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
        Ok(vec![])
    }
    /// This case is unreachable because extract_document_change has been reimplemented to not call this function.
    fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> {
        /// TODO remove this
        unreachable!()
    }
    // This method is reimplemented to count the number of words in the document in each field
    // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS.
    fn extract_document_change(
@ -59,8 +52,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
                for (fid, count) in fid_word_count.iter() {
                    if *count <= MAX_COUNTED_WORDS {
                        let key = build_key(*fid, *count as u8, &mut key_buffer);
-                        /// TODO manage the error
+                        cached_sorter.insert_del_u32(key, inner.docid())?;
                        cached_sorter.insert_del_u32(key, inner.docid()).unwrap();
                    }
                }
            }
@ -93,13 +85,11 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
                    if *current_count != *new_count {
                        if *current_count <= MAX_COUNTED_WORDS {
                            let key = build_key(*fid, *current_count as u8, &mut key_buffer);
-                            /// TODO manage the error
+                            cached_sorter.insert_del_u32(key, inner.docid())?;
                            cached_sorter.insert_del_u32(key, inner.docid()).unwrap();
                        }
                        if *new_count <= MAX_COUNTED_WORDS {
                            let key = build_key(*fid, *new_count as u8, &mut key_buffer);
-                            /// TODO manage the error
+                            cached_sorter.insert_add_u32(key, inner.docid())?;
                            cached_sorter.insert_add_u32(key, inner.docid()).unwrap();
                        }
                    }
                }
@ -116,8 +106,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
                for (fid, count) in fid_word_count.iter() {
                    if *count <= MAX_COUNTED_WORDS {
                        let key = build_key(*fid, *count as u8, &mut key_buffer);
-                        /// TODO manage the error
+                        cached_sorter.insert_add_u32(key, inner.docid())?;
                        cached_sorter.insert_add_u32(key, inner.docid()).unwrap();
                    }
                }
            }
--- a/milli/src/update/new/extract/searchable/extract_word_docids.rs
+++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs
@ -2,11 +2,93 @@ use std::borrow::Cow;
 use heed::RoTxn;
-use super::SearchableExtractor;
+use super::{tokenize_document::DocumentTokenizer, SearchableExtractor};
-use crate::{bucketed_position, FieldId, Index, Result};
+use crate::{
    bucketed_position,
    update::{
        new::{extract::cache::CboCachedSorter, DocumentChange},
        MergeDeladdCboRoaringBitmaps,
    },
    FieldId, GlobalFieldsIdsMap, Index, Result,
 };
 trait ProtoWordDocidsExtractor {
    fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>;
    fn attributes_to_extract<'a>(
        _rtxn: &'a RoTxn,
        _index: &'a Index,
    ) -> Result<Option<Vec<&'a str>>>;
    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
 }
 impl<T> SearchableExtractor for T
 where
    T: ProtoWordDocidsExtractor,
 {
    fn extract_document_change(
        rtxn: &RoTxn,
        index: &Index,
        document_tokenizer: &DocumentTokenizer,
        fields_ids_map: &mut GlobalFieldsIdsMap,
        cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
        document_change: DocumentChange,
    ) -> Result<()> {
        match document_change {
            DocumentChange::Deletion(inner) => {
                let mut token_fn = |fid, pos: u16, word: &str| {
                    let key = Self::build_key(fid, pos, word);
                    cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from)
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index)?.unwrap(),
                    fields_ids_map,
                    &mut token_fn,
                )?;
            }
            DocumentChange::Update(inner) => {
                let mut token_fn = |fid, pos, word: &str| {
                    let key = Self::build_key(fid, pos, word);
                    cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from)
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index)?.unwrap(),
                    fields_ids_map,
                    &mut token_fn,
                )?;
                let mut token_fn = |fid, pos, word: &str| {
                    let key = Self::build_key(fid, pos, word);
                    cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from)
                };
                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
            }
            DocumentChange::Insertion(inner) => {
                let mut token_fn = |fid, pos, word: &str| {
                    let key = Self::build_key(fid, pos, word);
                    cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from)
                };
                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
            }
        }
        Ok(())
    }
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
    ) -> Result<Option<Vec<&'a str>>> {
        Self::attributes_to_extract(rtxn, index)
    }
    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
        Self::attributes_to_skip(rtxn, index)
    }
 }
 pub struct WordDocidsExtractor;
-impl SearchableExtractor for WordDocidsExtractor {
+impl ProtoWordDocidsExtractor for WordDocidsExtractor {
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
@ -26,7 +108,7 @@ impl SearchableExtractor for WordDocidsExtractor {
 }
 pub struct ExactWordDocidsExtractor;
-impl SearchableExtractor for ExactWordDocidsExtractor {
+impl ProtoWordDocidsExtractor for ExactWordDocidsExtractor {
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
@ -55,7 +137,7 @@ impl SearchableExtractor for ExactWordDocidsExtractor {
 }
 pub struct WordFidDocidsExtractor;
-impl SearchableExtractor for WordFidDocidsExtractor {
+impl ProtoWordDocidsExtractor for WordFidDocidsExtractor {
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
@ -77,7 +159,7 @@ impl SearchableExtractor for WordFidDocidsExtractor {
 }
 pub struct WordPositionDocidsExtractor;
-impl SearchableExtractor for WordPositionDocidsExtractor {
+impl ProtoWordDocidsExtractor for WordPositionDocidsExtractor {
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
--- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
@ -1,4 +1,3 @@
 use std::borrow::Cow;
 use std::collections::{BTreeMap, VecDeque};
 use heed::RoTxn;
@ -26,12 +25,6 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
        Ok(vec![])
    }
    /// This case is unreachable because extract_document_change has been reimplemented to not call this function.
    fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> {
        /// TODO remove this
        unreachable!()
    }
    // This method is reimplemented to count the number of words in the document in each field
    // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS.
    fn extract_document_change(
@ -100,18 +93,18 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
            match eob {
                Left(((w1, w2), prox)) => {
                    let key = build_key(*prox, w1, w2, &mut key_buffer);
-                    cached_sorter.insert_del_u32(key, docid).unwrap();
+                    cached_sorter.insert_del_u32(key, docid)?;
                }
                Right(((w1, w2), prox)) => {
                    let key = build_key(*prox, w1, w2, &mut key_buffer);
-                    cached_sorter.insert_add_u32(key, docid).unwrap();
+                    cached_sorter.insert_add_u32(key, docid)?;
                }
                Both(((w1, w2), del_prox), (_, add_prox)) => {
                    if del_prox != add_prox {
                        let key = build_key(*del_prox, w1, w2, &mut key_buffer);
-                        cached_sorter.insert_del_u32(key, docid).unwrap();
+                        cached_sorter.insert_del_u32(key, docid)?;
                        let key = build_key(*add_prox, w1, w2, &mut key_buffer);
-                        cached_sorter.insert_add_u32(key, docid).unwrap();
+                        cached_sorter.insert_add_u32(key, docid)?;
                    }
                }
            };
--- a/milli/src/update/new/extract/searchable/mod.rs
+++ b/milli/src/update/new/extract/searchable/mod.rs
@ -3,7 +3,6 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod tokenize_document;
 use std::borrow::Cow;
 use std::fs::File;
 pub use extract_fid_word_count_docids::FidWordCountDocidsExtractor;
@ -20,7 +19,7 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer};
 use super::cache::CboCachedSorter;
 use crate::update::new::{DocumentChange, ItemsPool};
 use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
-use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
 pub trait SearchableExtractor {
    fn run_extraction(
@ -109,60 +108,10 @@ pub trait SearchableExtractor {
        fields_ids_map: &mut GlobalFieldsIdsMap,
        cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
        document_change: DocumentChange,
-    ) -> Result<()> {
+    ) -> Result<()>;
        match document_change {
            DocumentChange::Deletion(inner) => {
                let mut token_fn = |fid, pos: u16, word: &str| {
                    let key = Self::build_key(fid, pos, word);
                    /// TODO manage the error
                    cached_sorter.insert_del_u32(&key, inner.docid()).unwrap();
                    Ok(())
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index)?.unwrap(),
                    fields_ids_map,
                    &mut token_fn,
                )?;
            }
            DocumentChange::Update(inner) => {
                let mut token_fn = |fid, pos, word: &str| {
                    let key = Self::build_key(fid, pos, word);
                    /// TODO manage the error
                    cached_sorter.insert_del_u32(&key, inner.docid()).unwrap();
                    Ok(())
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index)?.unwrap(),
                    fields_ids_map,
                    &mut token_fn,
                )?;
                let mut token_fn = |fid, pos, word: &str| {
                    let key = Self::build_key(fid, pos, word);
                    /// TODO manage the error
                    cached_sorter.insert_add_u32(&key, inner.docid()).unwrap();
                    Ok(())
                };
                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
            }
            DocumentChange::Insertion(inner) => {
                let mut token_fn = |fid, pos, word: &str| {
                    let key = Self::build_key(fid, pos, word);
                    /// TODO manage the error
                    cached_sorter.insert_add_u32(&key, inner.docid()).unwrap();
                    Ok(())
                };
                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
            }
        }
        Ok(())
    }
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
        -> Result<Option<Vec<&'a str>>>;
    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
    fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>;
 }