Use the documents_fields_counts store in the QueryBuilder

Clément Renault 2019-10-14 18:48:32 +02:00
parent b377003192
commit e629f51af4
4 changed files with 85 additions and 18 deletions


@@ -22,6 +22,7 @@ pub struct QueryBuilder<'c, FI = fn(DocumentId) -> bool> {
     timeout: Option<Duration>,
     main_store: store::Main,
     postings_lists_store: store::PostingsLists,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
     synonyms_store: store::Synonyms,
 }
@@ -130,6 +131,7 @@ fn fetch_raw_documents(
     searchables: Option<&ReorderedAttrs>,
     main_store: &store::Main,
     postings_lists_store: &store::PostingsLists,
+    documents_fields_counts_store: &store::DocumentsFieldsCounts,
 ) -> MResult<Vec<RawDocument>>
 {
     let mut matches = Vec::new();
@ -187,22 +189,42 @@ fn fetch_raw_documents(
SetBuf::new_unchecked(highlights) SetBuf::new_unchecked(highlights)
}; };
Ok(raw_documents_from(matches, highlights)) let fields_counts = {
let mut fields_counts = Vec::new();
for group in matches.linear_group_by_key(|(id, ..)| *id) {
let id = group[0].0;
for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
let (attr, count) = result?;
fields_counts.push((id, attr, count));
}
}
SetBuf::new(fields_counts).unwrap()
};
Ok(raw_documents_from(matches, highlights, fields_counts))
} }
 impl<'c> QueryBuilder<'c> {
     pub fn new(
         main: store::Main,
         postings_lists: store::PostingsLists,
+        documents_fields_counts: store::DocumentsFieldsCounts,
         synonyms: store::Synonyms,
     ) -> QueryBuilder<'c>
     {
-        QueryBuilder::with_criteria(main, postings_lists, synonyms, Criteria::default())
+        QueryBuilder::with_criteria(
+            main,
+            postings_lists,
+            documents_fields_counts,
+            synonyms,
+            Criteria::default(),
+        )
     }

     pub fn with_criteria(
         main: store::Main,
         postings_lists: store::PostingsLists,
+        documents_fields_counts: store::DocumentsFieldsCounts,
         synonyms: store::Synonyms,
         criteria: Criteria<'c>,
     ) -> QueryBuilder<'c>
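The new fields_counts block above leans on two crates the surrounding code already uses: slice-group-by, whose linear_group_by_key yields consecutive runs of elements sharing a key, and sdset, whose SetBuf::new checks that the buffer it receives is sorted and deduplicated. A minimal sketch of that grouping pattern, with plain tuples standing in for (DocumentId, TmpMatch) and a fabricated per-document entry in place of the DocumentsFieldsCounts store lookup (crate versions as pinned in the repo's Cargo.toml are assumed):

    use sdset::SetBuf;
    use slice_group_by::GroupBy;

    fn main() {
        // matches arrive sorted by document id, so equal ids form consecutive runs
        let matches = [(0u64, "quick"), (0, "brown"), (1, "fox")];

        let mut fields_counts = Vec::new();
        for group in matches.linear_group_by_key(|(id, _)| *id) {
            let id = group[0].0;
            // the real code queries documents_fields_counts_store here;
            // we fabricate one (id, attr, count) entry per document instead
            fields_counts.push((id, 0u16, group.len() as u64));
        }

        // SetBuf::new verifies the buffer is sorted and free of duplicates
        let fields_counts = SetBuf::new(fields_counts).unwrap();
        assert_eq!(&fields_counts[..], &[(0, 0, 2), (1, 0, 1)][..]);
    }

Because the matches are already sorted by document id, one linear pass is enough; no extra sorting or hashing is needed before handing the buffer to SetBuf::new.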
@@ -214,6 +236,7 @@ impl<'c> QueryBuilder<'c> {
             timeout: None,
             main_store: main,
             postings_lists_store: postings_lists,
+            documents_fields_counts_store: documents_fields_counts,
             synonyms_store: synonyms,
         }
     }
@@ -230,6 +253,7 @@ impl<'c, FI> QueryBuilder<'c, FI> {
             timeout: self.timeout,
             main_store: self.main_store,
             postings_lists_store: self.postings_lists_store,
+            documents_fields_counts_store: self.documents_fields_counts_store,
             synonyms_store: self.synonyms_store,
         }
     }
@@ -292,6 +316,7 @@ impl<FI> QueryBuilder<'_, FI> where FI: Fn(DocumentId) -> bool {
             self.searchable_attrs.as_ref(),
             &self.main_store,
             &self.postings_lists_store,
+            &self.documents_fields_counts_store,
         )?;

         // stop processing when time is running out
@@ -420,6 +445,7 @@ where FI: Fn(DocumentId) -> bool,
             self.inner.searchable_attrs.as_ref(),
             &self.inner.main_store,
             &self.inner.postings_lists_store,
+            &self.inner.documents_fields_counts_store,
         )?;

         // stop processing when time is running out
@@ -549,6 +575,7 @@ mod tests {
     use fst::{Set, IntoStreamer};
     use sdset::SetBuf;
     use tempfile::TempDir;
+    use meilidb_schema::SchemaAttr;

     use crate::automaton::normalize_str;
     use crate::database::Database;
@@ -653,11 +680,15 @@ mod tests {
         let mut words_fst = BTreeSet::new();
         let mut postings_lists = HashMap::new();
+        let mut fields_counts = HashMap::<_, u64>::new();

         for (word, indexes) in iter {
             let word = word.to_lowercase().into_bytes();
             words_fst.insert(word.clone());
             postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
+            for idx in indexes {
+                fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1);
+            }
         }

         let words_fst = Set::from_iter(words_fst).unwrap();
@ -669,6 +700,25 @@ mod tests {
index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap(); index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
} }
for ((docid, attr, _), count) in fields_counts {
let prev = index.documents_fields_counts
.document_attribute_count(
&mut writer,
docid,
SchemaAttr(attr),
).unwrap();
let prev = prev.unwrap_or(0);
index.documents_fields_counts
.put_document_field_count(
&mut writer,
docid,
SchemaAttr(attr),
prev + count,
).unwrap();
}
writer.commit().unwrap(); writer.commit().unwrap();
drop(rkv); drop(rkv);
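The test scaffolding above approximates per-attribute word counts from the raw doc indexes: each distinct (document, attribute, word_index) triple contributes one word, and the read-modify-write loop folds those ones into a running total per (document, attribute). The same bookkeeping in miniature, with plain tuples instead of DocIndex and the store (the function and field names here are illustrative, not from the crate):

    use std::collections::HashMap;

    // (document_id, attribute, word_index), mirroring the DocIndex fields used above
    fn field_counts(indexes: &[(u64, u16, u16)]) -> HashMap<(u64, u16), u64> {
        // deduplicate on the full triple first, as the HashMap::insert above does
        let mut seen = HashMap::new();
        for &(doc, attr, word_index) in indexes {
            seen.insert((doc, attr, word_index), 1u64);
        }

        // then fold the per-position ones into a per-(document, attribute) total,
        // which is what the store ends up holding per document field
        let mut counts = HashMap::new();
        for ((doc, attr, _), count) in seen {
            *counts.entry((doc, attr)).or_insert(0) += count;
        }
        counts
    }

    fn main() {
        // two word positions in attribute 0 of document 0, one in document 1
        let counts = field_counts(&[(0, 0, 0), (0, 0, 1), (1, 0, 0)]);
        assert_eq!(counts[&(0, 0)], 2);
        assert_eq!(counts[&(1, 0)], 1);
    }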
@ -1470,8 +1520,8 @@ mod tests {
#[test] #[test]
fn deunicoded_synonyms() { fn deunicoded_synonyms() {
let mut store = TempDatabase::from_iter(vec![ let mut store = TempDatabase::from_iter(vec![
("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded ("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded
("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex
("iphone", &[doc_index(1, 0)][..]), ("iphone", &[doc_index(1, 0)][..]),
]); ]);


@@ -1,7 +1,10 @@
 use std::sync::Arc;
 use std::fmt;
+use meilidb_schema::SchemaAttr;
 use sdset::SetBuf;
 use slice_group_by::GroupBy;
 use crate::{TmpMatch, DocumentId, Highlight};

 #[derive(Clone)]
@@ -9,13 +12,10 @@ pub struct RawDocument {
     pub id: DocumentId,
     pub matches: SharedMatches,
     pub highlights: Vec<Highlight>,
+    pub fields_counts: SetBuf<(SchemaAttr, u64)>,
 }

 impl RawDocument {
-    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
-        RawDocument { id, matches, highlights }
-    }
-
     pub fn query_index(&self) -> &[u32] {
         let r = self.matches.range;
         // it is safe because construction/modifications
@@ -60,7 +60,7 @@ impl fmt::Debug for RawDocument {
         f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
         f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
         f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
         f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
         f.write_str("}")?;
         Ok(())
     }
@@ -69,31 +69,34 @@ impl fmt::Debug for RawDocument {
 pub fn raw_documents_from(
     matches: SetBuf<(DocumentId, TmpMatch)>,
     highlights: SetBuf<(DocumentId, Highlight)>,
+    fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
 ) -> Vec<RawDocument>
 {
-    let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
+    let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
     let mut matches2 = Matches::with_capacity(matches.len());

     let matches = matches.linear_group_by_key(|(id, _)| *id);
     let highlights = highlights.linear_group_by_key(|(id, _)| *id);
+    let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);

-    for (mgroup, hgroup) in matches.zip(highlights) {
+    for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
         debug_assert_eq!(mgroup[0].0, hgroup[0].0);
+        debug_assert_eq!(mgroup[0].0, fgroup[0].0);

         let document_id = mgroup[0].0;
-        let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
+        let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
         let end = start + mgroup.len();
         let highlights = hgroup.iter().map(|(_, h)| *h).collect();
-        docs_ranges.push((document_id, Range { start, end }, highlights));
+        let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
+        docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));

         matches2.extend_from_slice(mgroup);
     }

     let matches = Arc::new(matches2);
-    docs_ranges.into_iter().map(|(id, range, highlights)| {
+    docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| {
         let matches = SharedMatches { range, matches: matches.clone() };
-        RawDocument::new(id, matches, highlights)
+        RawDocument { id, matches, highlights, fields_counts }
     }).collect()
 }
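One subtlety worth noting: the three-way zip assumes matches, highlights, and fields_counts were built over exactly the same document ids in the same order. The debug_assert_eq! calls catch a mismatch in debug builds only; in release builds, Iterator::zip simply stops at the shortest side and the groups silently shift. A toy illustration of that failure mode, with plain integers standing in for the grouped slices:

    fn main() {
        let matches = [1u64, 2, 3];
        let highlights = [1u64, 2, 3];
        let fields_counts = [1u64, 3]; // document 2 has no field counts

        for ((m, h), f) in matches.into_iter().zip(highlights).zip(fields_counts) {
            // nothing checks this in release builds: the groups just drift apart
            if m != h || m != f {
                println!("misaligned: matches={m} highlights={h} fields_counts={f}");
            }
        }
        // document 3 is dropped entirely because zip ends at the shortest iterator
    }

The store lookup in fetch_raw_documents iterates over the same grouped matches, which is what keeps the three sets aligned in practice.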


@@ -47,6 +47,9 @@ impl RawIndexer {
         let iter = Some(lowercase_text).into_iter().chain(next);

         for text in iter {
+            // we must not count the same words twice
+            number_of_words = 0;
+
             for token in Tokenizer::new(&text) {
                 let must_continue = index_token(
                     token,
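The raw_indexer change resets number_of_words at the top of each pass because, as the hunk suggests, the indexer walks more than one variant of the same text (for instance a lowercased and a deunicoded form, as in the deunicoded_synonyms test above). Without the reset the counter would accumulate across variants and the stored field count would come out roughly doubled. A minimal sketch of the pattern, assuming number_of_words is the value that later feeds the counts store:

    fn main() {
        // the same field seen through two normalizations of "téléphone portable"
        let variants = ["téléphone portable", "telephone portable"];

        let mut number_of_words;
        for text in variants {
            // without this reset the second pass would count every word twice
            number_of_words = 0;
            for _token in text.split_whitespace() {
                number_of_words += 1;
            }
            println!("{text}: {number_of_words} word(s)");
        }
    }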


@@ -201,11 +201,22 @@ impl Index {
     }

     pub fn query_builder(&self) -> QueryBuilder {
-        QueryBuilder::new(self.main, self.postings_lists, self.synonyms)
+        QueryBuilder::new(
+            self.main,
+            self.postings_lists,
+            self.documents_fields_counts,
+            self.synonyms,
+        )
     }

     pub fn query_builder_with_criteria<'c>(&self, criteria: Criteria<'c>) -> QueryBuilder<'c> {
-        QueryBuilder::with_criteria(self.main, self.postings_lists, self.synonyms, criteria)
+        QueryBuilder::with_criteria(
+            self.main,
+            self.postings_lists,
+            self.documents_fields_counts,
+            self.synonyms,
+            criteria,
+        )
     }
 }
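Nothing in the diff shows what consumes RawDocument::fields_counts, so the motivation is a guess: per-attribute word totals are the natural denominator when a ranking criterion wants to compare how many words matched in an attribute against how many words that attribute contains. A hypothetical sketch of such a consumer; none of these names come from the commit:

    // hypothetical: ratio of matched words to the attribute's total word count
    fn words_ratio(matched: u64, fields_counts: &[(u16, u64)], attr: u16) -> f64 {
        let total = fields_counts
            .iter()
            .find(|(a, _)| *a == attr)
            .map(|(_, count)| *count)
            .unwrap_or(0);
        if total == 0 { 0.0 } else { matched as f64 / total as f64 }
    }

    fn main() {
        // (attribute, word count) pairs, like RawDocument::fields_counts holds
        let fields_counts = [(0u16, 4u64), (1, 12)];
        assert_eq!(words_ratio(2, &fields_counts, 0), 0.5);
    }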