feat: Store all document words by document rather than by attribute

2025-07-04 20:37:15 +02:00 · 2019-05-13 16:22:36 +02:00 · 2019-05-13 16:22:36 +02:00 · 169bd4cb39
commit 169bd4cb39
parent aa90f22865
2 changed files with 59 additions and 83 deletions
--- a/meilidb-data/src/indexer.rs
+++ b/meilidb-data/src/indexer.rs
@@ -13,12 +13,12 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
 pub struct Indexer {
    word_limit: usize, // the maximum number of indexed words
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
-    docs_attrs_words: HashMap<(DocumentId, SchemaAttr), Vec<Word>>,
+    docs_words: HashMap<DocumentId, Vec<Word>>,
 }

 pub struct Indexed {
    pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
-    pub docs_attrs_words: HashMap<(DocumentId, SchemaAttr), fst::Set>,
+    pub docs_words: HashMap<DocumentId, fst::Set>,
 }

 impl Indexer {
@@ -30,7 +30,7 @@ impl Indexer {
        Indexer {
            word_limit: limit,
            words_doc_indexes: BTreeMap::new(),
-            docs_attrs_words: HashMap::new(),
+            docs_words: HashMap::new(),
        }
    }

@@ -42,7 +42,7 @@ impl Indexer {
                attr,
                self.word_limit,
                &mut self.words_doc_indexes,
-                &mut self.docs_attrs_words,
+                &mut self.docs_words,
            );

            if !must_continue { break }
@@ -60,7 +60,7 @@ impl Indexer {
                attr,
                self.word_limit,
                &mut self.words_doc_indexes,
-                &mut self.docs_attrs_words,
+                &mut self.docs_words,
            );

            if !must_continue { break }
@@ -76,16 +76,16 @@ impl Indexer {
                (word, SetBuf::new_unchecked(indexes))
            }).collect();

-        let docs_attrs_words = self.docs_attrs_words
+        let docs_words = self.docs_words
            .into_iter()
-            .map(|((id, attr), mut words)| {
+            .map(|(id, mut words)| {
                words.sort_unstable();
                words.dedup();
-                ((id, attr), fst::Set::from_iter(words).unwrap())
+                (id, fst::Set::from_iter(words).unwrap())
            })
            .collect();

-        Indexed { words_doc_indexes, docs_attrs_words }
+        Indexed { words_doc_indexes, docs_words }
    }
 }

@@ -95,7 +95,7 @@ fn index_token(
    attr: SchemaAttr,
    word_limit: usize,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
-    docs_attrs_words: &mut HashMap<(DocumentId, SchemaAttr), Vec<Word>>,
+    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool
 {
    if token.word_index >= word_limit { return false }
@@ -106,7 +106,7 @@ fn index_token(
        Some(docindex) => {
            let word = Vec::from(token.word);
            words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
-            docs_attrs_words.entry((id, attr)).or_insert_with(Vec::new).push(word);
+            docs_words.entry(id).or_insert_with(Vec::new).push(word);
        },
        None => return false,
    }
@@ -119,7 +119,7 @@ fn index_token(
                Some(docindex) => {
                    let word = Vec::from(token.word);
                    words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
-                    docs_attrs_words.entry((id, attr)).or_insert_with(Vec::new).push(word);
+                    docs_words.entry(id).or_insert_with(Vec::new).push(word);
                },
                None => return false,
            }