mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Use the documents_fileds_count store in the QueryBuilder
This commit is contained in:
parent
b377003192
commit
e629f51af4
@ -22,6 +22,7 @@ pub struct QueryBuilder<'c, FI = fn(DocumentId) -> bool> {
|
|||||||
timeout: Option<Duration>,
|
timeout: Option<Duration>,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
postings_lists_store: store::PostingsLists,
|
postings_lists_store: store::PostingsLists,
|
||||||
|
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
synonyms_store: store::Synonyms,
|
synonyms_store: store::Synonyms,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -130,6 +131,7 @@ fn fetch_raw_documents(
|
|||||||
searchables: Option<&ReorderedAttrs>,
|
searchables: Option<&ReorderedAttrs>,
|
||||||
main_store: &store::Main,
|
main_store: &store::Main,
|
||||||
postings_lists_store: &store::PostingsLists,
|
postings_lists_store: &store::PostingsLists,
|
||||||
|
documents_fields_counts_store: &store::DocumentsFieldsCounts,
|
||||||
) -> MResult<Vec<RawDocument>>
|
) -> MResult<Vec<RawDocument>>
|
||||||
{
|
{
|
||||||
let mut matches = Vec::new();
|
let mut matches = Vec::new();
|
||||||
@ -187,22 +189,42 @@ fn fetch_raw_documents(
|
|||||||
SetBuf::new_unchecked(highlights)
|
SetBuf::new_unchecked(highlights)
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(raw_documents_from(matches, highlights))
|
let fields_counts = {
|
||||||
|
let mut fields_counts = Vec::new();
|
||||||
|
for group in matches.linear_group_by_key(|(id, ..)| *id) {
|
||||||
|
let id = group[0].0;
|
||||||
|
for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
|
||||||
|
let (attr, count) = result?;
|
||||||
|
fields_counts.push((id, attr, count));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
SetBuf::new(fields_counts).unwrap()
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(raw_documents_from(matches, highlights, fields_counts))
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'c> QueryBuilder<'c> {
|
impl<'c> QueryBuilder<'c> {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
main: store::Main,
|
main: store::Main,
|
||||||
postings_lists: store::PostingsLists,
|
postings_lists: store::PostingsLists,
|
||||||
|
documents_fields_counts: store::DocumentsFieldsCounts,
|
||||||
synonyms: store::Synonyms,
|
synonyms: store::Synonyms,
|
||||||
) -> QueryBuilder<'c>
|
) -> QueryBuilder<'c>
|
||||||
{
|
{
|
||||||
QueryBuilder::with_criteria(main, postings_lists, synonyms, Criteria::default())
|
QueryBuilder::with_criteria(
|
||||||
|
main,
|
||||||
|
postings_lists,
|
||||||
|
documents_fields_counts,
|
||||||
|
synonyms,
|
||||||
|
Criteria::default(),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn with_criteria(
|
pub fn with_criteria(
|
||||||
main: store::Main,
|
main: store::Main,
|
||||||
postings_lists: store::PostingsLists,
|
postings_lists: store::PostingsLists,
|
||||||
|
documents_fields_counts: store::DocumentsFieldsCounts,
|
||||||
synonyms: store::Synonyms,
|
synonyms: store::Synonyms,
|
||||||
criteria: Criteria<'c>,
|
criteria: Criteria<'c>,
|
||||||
) -> QueryBuilder<'c>
|
) -> QueryBuilder<'c>
|
||||||
@ -214,6 +236,7 @@ impl<'c> QueryBuilder<'c> {
|
|||||||
timeout: None,
|
timeout: None,
|
||||||
main_store: main,
|
main_store: main,
|
||||||
postings_lists_store: postings_lists,
|
postings_lists_store: postings_lists,
|
||||||
|
documents_fields_counts_store: documents_fields_counts,
|
||||||
synonyms_store: synonyms,
|
synonyms_store: synonyms,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -230,6 +253,7 @@ impl<'c, FI> QueryBuilder<'c, FI> {
|
|||||||
timeout: self.timeout,
|
timeout: self.timeout,
|
||||||
main_store: self.main_store,
|
main_store: self.main_store,
|
||||||
postings_lists_store: self.postings_lists_store,
|
postings_lists_store: self.postings_lists_store,
|
||||||
|
documents_fields_counts_store: self.documents_fields_counts_store,
|
||||||
synonyms_store: self.synonyms_store,
|
synonyms_store: self.synonyms_store,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -292,6 +316,7 @@ impl<FI> QueryBuilder<'_, FI> where FI: Fn(DocumentId) -> bool {
|
|||||||
self.searchable_attrs.as_ref(),
|
self.searchable_attrs.as_ref(),
|
||||||
&self.main_store,
|
&self.main_store,
|
||||||
&self.postings_lists_store,
|
&self.postings_lists_store,
|
||||||
|
&self.documents_fields_counts_store,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// stop processing when time is running out
|
// stop processing when time is running out
|
||||||
@ -420,6 +445,7 @@ where FI: Fn(DocumentId) -> bool,
|
|||||||
self.inner.searchable_attrs.as_ref(),
|
self.inner.searchable_attrs.as_ref(),
|
||||||
&self.inner.main_store,
|
&self.inner.main_store,
|
||||||
&self.inner.postings_lists_store,
|
&self.inner.postings_lists_store,
|
||||||
|
&self.inner.documents_fields_counts_store,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// stop processing when time is running out
|
// stop processing when time is running out
|
||||||
@ -549,6 +575,7 @@ mod tests {
|
|||||||
use fst::{Set, IntoStreamer};
|
use fst::{Set, IntoStreamer};
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use tempfile::TempDir;
|
use tempfile::TempDir;
|
||||||
|
use meilidb_schema::SchemaAttr;
|
||||||
|
|
||||||
use crate::automaton::normalize_str;
|
use crate::automaton::normalize_str;
|
||||||
use crate::database::Database;
|
use crate::database::Database;
|
||||||
@ -653,11 +680,15 @@ mod tests {
|
|||||||
|
|
||||||
let mut words_fst = BTreeSet::new();
|
let mut words_fst = BTreeSet::new();
|
||||||
let mut postings_lists = HashMap::new();
|
let mut postings_lists = HashMap::new();
|
||||||
|
let mut fields_counts = HashMap::<_, u64>::new();
|
||||||
|
|
||||||
for (word, indexes) in iter {
|
for (word, indexes) in iter {
|
||||||
let word = word.to_lowercase().into_bytes();
|
let word = word.to_lowercase().into_bytes();
|
||||||
words_fst.insert(word.clone());
|
words_fst.insert(word.clone());
|
||||||
postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
|
postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
|
||||||
|
for idx in indexes {
|
||||||
|
fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let words_fst = Set::from_iter(words_fst).unwrap();
|
let words_fst = Set::from_iter(words_fst).unwrap();
|
||||||
@ -669,6 +700,25 @@ mod tests {
|
|||||||
index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
|
index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for ((docid, attr, _), count) in fields_counts {
|
||||||
|
let prev = index.documents_fields_counts
|
||||||
|
.document_attribute_count(
|
||||||
|
&mut writer,
|
||||||
|
docid,
|
||||||
|
SchemaAttr(attr),
|
||||||
|
).unwrap();
|
||||||
|
|
||||||
|
let prev = prev.unwrap_or(0);
|
||||||
|
|
||||||
|
index.documents_fields_counts
|
||||||
|
.put_document_field_count(
|
||||||
|
&mut writer,
|
||||||
|
docid,
|
||||||
|
SchemaAttr(attr),
|
||||||
|
prev + count,
|
||||||
|
).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
writer.commit().unwrap();
|
writer.commit().unwrap();
|
||||||
drop(rkv);
|
drop(rkv);
|
||||||
|
|
||||||
@ -1470,8 +1520,8 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn deunicoded_synonyms() {
|
fn deunicoded_synonyms() {
|
||||||
let mut store = TempDatabase::from_iter(vec![
|
let mut store = TempDatabase::from_iter(vec![
|
||||||
("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded
|
("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded
|
||||||
("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex
|
("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex
|
||||||
|
|
||||||
("iphone", &[doc_index(1, 0)][..]),
|
("iphone", &[doc_index(1, 0)][..]),
|
||||||
]);
|
]);
|
||||||
|
@ -1,7 +1,10 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|
||||||
|
use meilidb_schema::SchemaAttr;
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::{TmpMatch, DocumentId, Highlight};
|
use crate::{TmpMatch, DocumentId, Highlight};
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
@ -9,13 +12,10 @@ pub struct RawDocument {
|
|||||||
pub id: DocumentId,
|
pub id: DocumentId,
|
||||||
pub matches: SharedMatches,
|
pub matches: SharedMatches,
|
||||||
pub highlights: Vec<Highlight>,
|
pub highlights: Vec<Highlight>,
|
||||||
|
pub fields_counts: SetBuf<(SchemaAttr, u64)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RawDocument {
|
impl RawDocument {
|
||||||
fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
|
|
||||||
RawDocument { id, matches, highlights }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn query_index(&self) -> &[u32] {
|
pub fn query_index(&self) -> &[u32] {
|
||||||
let r = self.matches.range;
|
let r = self.matches.range;
|
||||||
// it is safe because construction/modifications
|
// it is safe because construction/modifications
|
||||||
@ -69,31 +69,34 @@ impl fmt::Debug for RawDocument {
|
|||||||
pub fn raw_documents_from(
|
pub fn raw_documents_from(
|
||||||
matches: SetBuf<(DocumentId, TmpMatch)>,
|
matches: SetBuf<(DocumentId, TmpMatch)>,
|
||||||
highlights: SetBuf<(DocumentId, Highlight)>,
|
highlights: SetBuf<(DocumentId, Highlight)>,
|
||||||
|
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
|
||||||
) -> Vec<RawDocument>
|
) -> Vec<RawDocument>
|
||||||
{
|
{
|
||||||
let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
|
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
|
||||||
let mut matches2 = Matches::with_capacity(matches.len());
|
let mut matches2 = Matches::with_capacity(matches.len());
|
||||||
|
|
||||||
let matches = matches.linear_group_by_key(|(id, _)| *id);
|
let matches = matches.linear_group_by_key(|(id, _)| *id);
|
||||||
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
|
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
|
||||||
|
let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
|
||||||
|
|
||||||
for (mgroup, hgroup) in matches.zip(highlights) {
|
for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
|
||||||
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
|
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
|
||||||
|
debug_assert_eq!(mgroup[0].0, fgroup[0].0);
|
||||||
|
|
||||||
let document_id = mgroup[0].0;
|
let document_id = mgroup[0].0;
|
||||||
let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
|
let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
|
||||||
let end = start + mgroup.len();
|
let end = start + mgroup.len();
|
||||||
|
|
||||||
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
|
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
|
||||||
docs_ranges.push((document_id, Range { start, end }, highlights));
|
let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
|
||||||
|
|
||||||
|
docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
|
||||||
matches2.extend_from_slice(mgroup);
|
matches2.extend_from_slice(mgroup);
|
||||||
}
|
}
|
||||||
|
|
||||||
let matches = Arc::new(matches2);
|
let matches = Arc::new(matches2);
|
||||||
docs_ranges.into_iter().map(|(id, range, highlights)| {
|
docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| {
|
||||||
let matches = SharedMatches { range, matches: matches.clone() };
|
let matches = SharedMatches { range, matches: matches.clone() };
|
||||||
RawDocument::new(id, matches, highlights)
|
RawDocument { id, matches, highlights, fields_counts }
|
||||||
}).collect()
|
}).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -47,6 +47,9 @@ impl RawIndexer {
|
|||||||
let iter = Some(lowercase_text).into_iter().chain(next);
|
let iter = Some(lowercase_text).into_iter().chain(next);
|
||||||
|
|
||||||
for text in iter {
|
for text in iter {
|
||||||
|
// we must not count 2 times the same words
|
||||||
|
number_of_words = 0;
|
||||||
|
|
||||||
for token in Tokenizer::new(&text) {
|
for token in Tokenizer::new(&text) {
|
||||||
let must_continue = index_token(
|
let must_continue = index_token(
|
||||||
token,
|
token,
|
||||||
|
@ -201,11 +201,22 @@ impl Index {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn query_builder(&self) -> QueryBuilder {
|
pub fn query_builder(&self) -> QueryBuilder {
|
||||||
QueryBuilder::new(self.main, self.postings_lists, self.synonyms)
|
QueryBuilder::new(
|
||||||
|
self.main,
|
||||||
|
self.postings_lists,
|
||||||
|
self.documents_fields_counts,
|
||||||
|
self.synonyms,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn query_builder_with_criteria<'c>(&self, criteria: Criteria<'c>) -> QueryBuilder<'c> {
|
pub fn query_builder_with_criteria<'c>(&self, criteria: Criteria<'c>) -> QueryBuilder<'c> {
|
||||||
QueryBuilder::with_criteria(self.main, self.postings_lists, self.synonyms, criteria)
|
QueryBuilder::with_criteria(
|
||||||
|
self.main,
|
||||||
|
self.postings_lists,
|
||||||
|
self.documents_fields_counts,
|
||||||
|
self.synonyms,
|
||||||
|
criteria,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user