From e629f51af4813d109ced1a180bd5472a041d8089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 14 Oct 2019 18:48:32 +0200 Subject: [PATCH] Use the documents_fields_counts store in the QueryBuilder --- meilidb-core/src/query_builder.rs | 58 ++++++++++++++++++++++++++++--- meilidb-core/src/raw_document.rs | 27 +++++++------- meilidb-core/src/raw_indexer.rs | 3 ++ meilidb-core/src/store/mod.rs | 15 ++++++-- 4 files changed, 85 insertions(+), 18 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index c46093ce9..78557c9a4 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -22,6 +22,7 @@ pub struct QueryBuilder<'c, FI = fn(DocumentId) -> bool> { timeout: Option, main_store: store::Main, postings_lists_store: store::PostingsLists, + documents_fields_counts_store: store::DocumentsFieldsCounts, synonyms_store: store::Synonyms, } @@ -130,6 +131,7 @@ fn fetch_raw_documents( searchables: Option<&ReorderedAttrs>, main_store: &store::Main, postings_lists_store: &store::PostingsLists, + documents_fields_counts_store: &store::DocumentsFieldsCounts, ) -> MResult> { let mut matches = Vec::new(); @@ -187,22 +189,42 @@ fn fetch_raw_documents( SetBuf::new_unchecked(highlights) }; - Ok(raw_documents_from(matches, highlights)) + let fields_counts = { + let mut fields_counts = Vec::new(); + for group in matches.linear_group_by_key(|(id, ..)| *id) { + let id = group[0].0; + for result in documents_fields_counts_store.document_fields_counts(reader, id)? 
{ + let (attr, count) = result?; + fields_counts.push((id, attr, count)); + } + } + SetBuf::new(fields_counts).unwrap() + }; + + Ok(raw_documents_from(matches, highlights, fields_counts)) } impl<'c> QueryBuilder<'c> { pub fn new( main: store::Main, postings_lists: store::PostingsLists, + documents_fields_counts: store::DocumentsFieldsCounts, synonyms: store::Synonyms, ) -> QueryBuilder<'c> { - QueryBuilder::with_criteria(main, postings_lists, synonyms, Criteria::default()) + QueryBuilder::with_criteria( + main, + postings_lists, + documents_fields_counts, + synonyms, + Criteria::default(), + ) } pub fn with_criteria( main: store::Main, postings_lists: store::PostingsLists, + documents_fields_counts: store::DocumentsFieldsCounts, synonyms: store::Synonyms, criteria: Criteria<'c>, ) -> QueryBuilder<'c> @@ -214,6 +236,7 @@ impl<'c> QueryBuilder<'c> { timeout: None, main_store: main, postings_lists_store: postings_lists, + documents_fields_counts_store: documents_fields_counts, synonyms_store: synonyms, } } @@ -230,6 +253,7 @@ impl<'c, FI> QueryBuilder<'c, FI> { timeout: self.timeout, main_store: self.main_store, postings_lists_store: self.postings_lists_store, + documents_fields_counts_store: self.documents_fields_counts_store, synonyms_store: self.synonyms_store, } } @@ -292,6 +316,7 @@ impl QueryBuilder<'_, FI> where FI: Fn(DocumentId) -> bool { self.searchable_attrs.as_ref(), &self.main_store, &self.postings_lists_store, + &self.documents_fields_counts_store, )?; // stop processing when time is running out @@ -420,6 +445,7 @@ where FI: Fn(DocumentId) -> bool, self.inner.searchable_attrs.as_ref(), &self.inner.main_store, &self.inner.postings_lists_store, + &self.inner.documents_fields_counts_store, )?; // stop processing when time is running out @@ -549,6 +575,7 @@ mod tests { use fst::{Set, IntoStreamer}; use sdset::SetBuf; use tempfile::TempDir; + use meilidb_schema::SchemaAttr; use crate::automaton::normalize_str; use crate::database::Database; @@ -653,11 +680,15 
@@ mod tests { let mut words_fst = BTreeSet::new(); let mut postings_lists = HashMap::new(); + let mut fields_counts = HashMap::<_, u64>::new(); for (word, indexes) in iter { let word = word.to_lowercase().into_bytes(); words_fst.insert(word.clone()); postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes); + for idx in indexes { + fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1); + } } let words_fst = Set::from_iter(words_fst).unwrap(); @@ -669,6 +700,25 @@ mod tests { index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap(); } + for ((docid, attr, _), count) in fields_counts { + let prev = index.documents_fields_counts + .document_attribute_count( + &mut writer, + docid, + SchemaAttr(attr), + ).unwrap(); + + let prev = prev.unwrap_or(0); + + index.documents_fields_counts + .put_document_field_count( + &mut writer, + docid, + SchemaAttr(attr), + prev + count, + ).unwrap(); + } + writer.commit().unwrap(); drop(rkv); @@ -1470,8 +1520,8 @@ mod tests { #[test] fn deunicoded_synonyms() { let mut store = TempDatabase::from_iter(vec![ - ("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded - ("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex + ("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded + ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex ("iphone", &[doc_index(1, 0)][..]), ]); diff --git a/meilidb-core/src/raw_document.rs b/meilidb-core/src/raw_document.rs index 3567c3fd1..16cc9edda 100644 --- a/meilidb-core/src/raw_document.rs +++ b/meilidb-core/src/raw_document.rs @@ -1,7 +1,10 @@ use std::sync::Arc; use std::fmt; + +use meilidb_schema::SchemaAttr; use sdset::SetBuf; use slice_group_by::GroupBy; + use crate::{TmpMatch, DocumentId, Highlight}; #[derive(Clone)] @@ -9,13 +12,10 @@ pub struct RawDocument { pub id: DocumentId, pub matches: SharedMatches, pub highlights: Vec, + 
pub fields_counts: SetBuf<(SchemaAttr, u64)>, } impl RawDocument { - fn new(id: DocumentId, matches: SharedMatches, highlights: Vec) -> RawDocument { - RawDocument { id, matches, highlights } - } - pub fn query_index(&self) -> &[u32] { let r = self.matches.range; // it is safe because construction/modifications @@ -60,7 +60,7 @@ impl fmt::Debug for RawDocument { f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?; f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?; f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?; - f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?; f.write_str("}")?; Ok(()) } @@ -69,31 +69,34 @@ impl fmt::Debug for RawDocument { pub fn raw_documents_from( matches: SetBuf<(DocumentId, TmpMatch)>, highlights: SetBuf<(DocumentId, Highlight)>, + fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>, ) -> Vec { - let mut docs_ranges: Vec<(_, Range, _)> = Vec::new(); + let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); let mut matches2 = Matches::with_capacity(matches.len()); let matches = matches.linear_group_by_key(|(id, _)| *id); let highlights = highlights.linear_group_by_key(|(id, _)| *id); + let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id); - for (mgroup, hgroup) in matches.zip(highlights) { + for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) { debug_assert_eq!(mgroup[0].0, hgroup[0].0); + debug_assert_eq!(mgroup[0].0, fgroup[0].0); let document_id = mgroup[0].0; - let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); + let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0); let end = start + mgroup.len(); - let highlights = hgroup.iter().map(|(_, h)| *h).collect(); - docs_ranges.push((document_id, Range { start, end }, highlights)); + let fields_counts = 
SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap(); + docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); matches2.extend_from_slice(mgroup); } let matches = Arc::new(matches2); - docs_ranges.into_iter().map(|(id, range, highlights)| { + docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| { let matches = SharedMatches { range, matches: matches.clone() }; - RawDocument::new(id, matches, highlights) + RawDocument { id, matches, highlights, fields_counts } }).collect() } diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 06d82ed32..980b622f7 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -47,6 +47,9 @@ impl RawIndexer { let iter = Some(lowercase_text).into_iter().chain(next); for text in iter { + // we must not count the same words twice + number_of_words = 0; + for token in Tokenizer::new(&text) { let must_continue = index_token( token, diff --git a/meilidb-core/src/store/mod.rs b/meilidb-core/src/store/mod.rs index 136b10bab..3bdd4ca18 100644 --- a/meilidb-core/src/store/mod.rs +++ b/meilidb-core/src/store/mod.rs @@ -201,11 +201,22 @@ impl Index { } pub fn query_builder(&self) -> QueryBuilder { - QueryBuilder::new(self.main, self.postings_lists, self.synonyms) + QueryBuilder::new( + self.main, + self.postings_lists, + self.documents_fields_counts, + self.synonyms, + ) } pub fn query_builder_with_criteria<'c>(&self, criteria: Criteria<'c>) -> QueryBuilder<'c> { - QueryBuilder::with_criteria(self.main, self.postings_lists, self.synonyms, criteria) + QueryBuilder::with_criteria( + self.main, + self.postings_lists, + self.documents_fields_counts, + self.synonyms, + criteria, + ) } }