Use the documents_fields_counts store in the QueryBuilder

2025-05-25 09:03:59 +02:00 · 2019-10-14 18:48:32 +02:00 · 2019-10-14 18:48:32 +02:00 · e629f51af4
commit e629f51af4
parent b377003192
4 changed files with 85 additions and 18 deletions
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@ -22,6 +22,7 @@ pub struct QueryBuilder<'c, FI = fn(DocumentId) -> bool> {
    timeout: Option<Duration>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
 }
@ -130,6 +131,7 @@ fn fetch_raw_documents(
    searchables: Option<&ReorderedAttrs>,
    main_store: &store::Main,
    postings_lists_store: &store::PostingsLists,
    documents_fields_counts_store: &store::DocumentsFieldsCounts,
 ) -> MResult<Vec<RawDocument>>
 {
    let mut matches = Vec::new();
@ -187,22 +189,42 @@ fn fetch_raw_documents(
        SetBuf::new_unchecked(highlights)
    };
-    Ok(raw_documents_from(matches, highlights))
+    let fields_counts = {
        let mut fields_counts = Vec::new();
        for group in matches.linear_group_by_key(|(id, ..)| *id) {
            let id = group[0].0;
            for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
                let (attr, count) = result?;
                fields_counts.push((id, attr, count));
            }
        }
        SetBuf::new(fields_counts).unwrap()
    };
    Ok(raw_documents_from(matches, highlights, fields_counts))
 }
 impl<'c> QueryBuilder<'c> {
    /// Creates a `QueryBuilder` over the given stores, ranked with the
    /// default criteria.
    ///
    /// This is a thin convenience wrapper: it forwards every store unchanged
    /// to `QueryBuilder::with_criteria` and supplies `Criteria::default()`
    /// as the criteria argument.
    pub fn new(
        main: store::Main,
        postings_lists: store::PostingsLists,
        documents_fields_counts: store::DocumentsFieldsCounts,
        synonyms: store::Synonyms,
    ) -> QueryBuilder<'c>
    {
-        QueryBuilder::with_criteria(main, postings_lists, synonyms, Criteria::default())
+        QueryBuilder::with_criteria(
            main,
            postings_lists,
            documents_fields_counts,
            synonyms,
            Criteria::default(),
        )
    }
    pub fn with_criteria(
        main: store::Main,
        postings_lists: store::PostingsLists,
        documents_fields_counts: store::DocumentsFieldsCounts,
        synonyms: store::Synonyms,
        criteria: Criteria<'c>,
    ) -> QueryBuilder<'c>
@ -214,6 +236,7 @@ impl<'c> QueryBuilder<'c> {
            timeout: None,
            main_store: main,
            postings_lists_store: postings_lists,
            documents_fields_counts_store: documents_fields_counts,
            synonyms_store: synonyms,
        }
    }
@ -230,6 +253,7 @@ impl<'c, FI> QueryBuilder<'c, FI> {
            timeout: self.timeout,
            main_store: self.main_store,
            postings_lists_store: self.postings_lists_store,
            documents_fields_counts_store: self.documents_fields_counts_store,
            synonyms_store: self.synonyms_store,
        }
    }
@ -292,6 +316,7 @@ impl<FI> QueryBuilder<'_, FI> where FI: Fn(DocumentId) -> bool {
                self.searchable_attrs.as_ref(),
                &self.main_store,
                &self.postings_lists_store,
                &self.documents_fields_counts_store,
            )?;
            // stop processing when time is running out
@ -420,6 +445,7 @@ where FI: Fn(DocumentId) -> bool,
                self.inner.searchable_attrs.as_ref(),
                &self.inner.main_store,
                &self.inner.postings_lists_store,
                &self.inner.documents_fields_counts_store,
            )?;
            // stop processing when time is running out
@ -549,6 +575,7 @@ mod tests {
    use fst::{Set, IntoStreamer};
    use sdset::SetBuf;
    use tempfile::TempDir;
    use meilidb_schema::SchemaAttr;
    use crate::automaton::normalize_str;
    use crate::database::Database;
@ -653,11 +680,15 @@ mod tests {
            let mut words_fst = BTreeSet::new();
            let mut postings_lists = HashMap::new();
            let mut fields_counts = HashMap::<_, u64>::new();
            for (word, indexes) in iter {
                let word = word.to_lowercase().into_bytes();
                words_fst.insert(word.clone());
                postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
                for idx in indexes {
                    fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1);
                }
            }
            let words_fst = Set::from_iter(words_fst).unwrap();
@ -669,6 +700,25 @@ mod tests {
                index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
            }
            for ((docid, attr, _), count) in fields_counts {
                let prev = index.documents_fields_counts
                    .document_attribute_count(
                        &mut writer,
                        docid,
                        SchemaAttr(attr),
                    ).unwrap();
                let prev = prev.unwrap_or(0);
                index.documents_fields_counts
                    .put_document_field_count(
                        &mut writer,
                        docid,
                        SchemaAttr(attr),
                        prev + count,
                    ).unwrap();
            }
            writer.commit().unwrap();
            drop(rkv);
@ -1470,8 +1520,8 @@ mod tests {
    #[test]
    fn deunicoded_synonyms() {
        let mut store = TempDatabase::from_iter(vec![
-            ("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded
+            ("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded
-            ("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex
+            ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex
            ("iphone",    &[doc_index(1, 0)][..]),
        ]);
--- a/meilidb-core/src/raw_document.rs
+++ b/meilidb-core/src/raw_document.rs
@ -1,7 +1,10 @@
 use std::sync::Arc;
 use std::fmt;
 use meilidb_schema::SchemaAttr;
 use sdset::SetBuf;
 use slice_group_by::GroupBy;
 use crate::{TmpMatch, DocumentId, Highlight};
 #[derive(Clone)]
@ -9,13 +12,10 @@ pub struct RawDocument {
    pub id: DocumentId,
    pub matches: SharedMatches,
    pub highlights: Vec<Highlight>,
    pub fields_counts: SetBuf<(SchemaAttr, u64)>,
 }
 impl RawDocument {
    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
        RawDocument { id, matches, highlights }
    }
    pub fn query_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
@ -69,31 +69,34 @@ impl fmt::Debug for RawDocument {
 /// Builds one `RawDocument` per document id out of three flat sets that are
 /// all keyed by `DocumentId`.
 ///
 /// Each input set is split into runs of consecutive entries sharing the same
 /// document id, and the three run iterators are walked in lockstep. For every
 /// document the function records:
 ///
 /// - the range its matches occupy inside a single shared `Matches` buffer
 ///   (the buffer is wrapped in an `Arc` and cloned into every document),
 /// - its highlights, with the document id stripped,
 /// - its `(SchemaAttr, u64)` fields counts, with the document id stripped.
 ///
 /// # Panics
 ///
 /// Panics if `SetBuf::new` rejects the per-document fields counts (the
 /// `unwrap` below).
 ///
 /// NOTE(review): the three group iterators are paired by position (`zip`),
 /// not by document id, and `zip` stops at the shortest one. The id agreement
 /// is only checked with `debug_assert_eq!`, so if a matched document had no
 /// entry in `highlights` or `fields_counts` the following groups would be
 /// attributed to the wrong document in release builds. Confirm that callers
 /// always provide at least one entry per matched document in every set.
 pub fn raw_documents_from(
    matches: SetBuf<(DocumentId, TmpMatch)>,
    highlights: SetBuf<(DocumentId, Highlight)>,
    fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
 ) -> Vec<RawDocument>
 {
-    let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
+    let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
    let mut matches2 = Matches::with_capacity(matches.len());
    // Group every set by document id (first tuple field).
    let matches = matches.linear_group_by_key(|(id, _)| *id);
    let highlights = highlights.linear_group_by_key(|(id, _)| *id);
    let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
-    for (mgroup, hgroup) in matches.zip(highlights) {
+    for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
        // Sanity check (debug builds only): the three groups must describe
        // the same document.
        debug_assert_eq!(mgroup[0].0, hgroup[0].0);
        debug_assert_eq!(mgroup[0].0, fgroup[0].0);
        let document_id = mgroup[0].0;
        // This document's matches start where the previous document's range
        // ended (or at 0 for the first document).
-        let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
+        let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
        let end = start + mgroup.len();
        let highlights = hgroup.iter().map(|(_, h)| *h).collect();
-        docs_ranges.push((document_id, Range { start, end }, highlights));
+        let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
        docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
        matches2.extend_from_slice(mgroup);
    }
    // Every document shares the same underlying matches buffer and only
    // carries its own range into it.
    let matches = Arc::new(matches2);
-    docs_ranges.into_iter().map(|(id, range, highlights)| {
+    docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| {
        let matches = SharedMatches { range, matches: matches.clone() };
-        RawDocument::new(id, matches, highlights)
+        RawDocument { id, matches, highlights, fields_counts }
    }).collect()
 }
--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@ -47,6 +47,9 @@ impl RawIndexer {
        let iter = Some(lowercase_text).into_iter().chain(next);
        for text in iter {
            // we must not count the same words twice
            number_of_words = 0;
            for token in Tokenizer::new(&text) {
                let must_continue = index_token(
                    token,
--- a/meilidb-core/src/store/mod.rs
+++ b/meilidb-core/src/store/mod.rs
@ -201,11 +201,22 @@ impl Index {
    }
    /// Returns a `QueryBuilder` wired to this index's `main`,
    /// `postings_lists`, `documents_fields_counts` and `synonyms` stores,
    /// built through `QueryBuilder::new`.
    pub fn query_builder(&self) -> QueryBuilder {
-        QueryBuilder::new(self.main, self.postings_lists, self.synonyms)
+        QueryBuilder::new(
            self.main,
            self.postings_lists,
            self.documents_fields_counts,
            self.synonyms,
        )
    }
    /// Returns a `QueryBuilder` wired to this index's `main`,
    /// `postings_lists`, `documents_fields_counts` and `synonyms` stores,
    /// using the caller-supplied `criteria` instead of the defaults.
    pub fn query_builder_with_criteria<'c>(&self, criteria: Criteria<'c>) -> QueryBuilder<'c> {
-        QueryBuilder::with_criteria(self.main, self.postings_lists, self.synonyms, criteria)
+        QueryBuilder::with_criteria(
            self.main,
            self.postings_lists,
            self.documents_fields_counts,
            self.synonyms,
            criteria,
        )
    }
 }