mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Merge pull request #217 from meilisearch/improve-exactness-criterion
Improve the exactness criterion
This commit is contained in:
commit
fdc98f9ef3
@ -1,16 +1,38 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
|
use sdset::Set;
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
use meilidb_schema::SchemaAttr;
|
||||||
|
|
||||||
use crate::criterion::Criterion;
|
use crate::criterion::Criterion;
|
||||||
use crate::RawDocument;
|
use crate::RawDocument;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
|
fn number_exact_matches(
|
||||||
|
query_index: &[u32],
|
||||||
|
attribute: &[u16],
|
||||||
|
is_exact: &[bool],
|
||||||
|
fields_counts: &Set<(SchemaAttr, u64)>,
|
||||||
|
) -> usize
|
||||||
|
{
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
let mut index = 0;
|
let mut index = 0;
|
||||||
|
|
||||||
for group in query_index.linear_group() {
|
for group in query_index.linear_group() {
|
||||||
let len = group.len();
|
let len = group.len();
|
||||||
count += is_exact[index..index + len].contains(&true) as usize;
|
|
||||||
|
let mut found_exact = false;
|
||||||
|
for (pos, _) in is_exact[index..index + len].iter().filter(|x| **x).enumerate() {
|
||||||
|
found_exact = true;
|
||||||
|
if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
|
||||||
|
let (_, count) = fields_counts[pos];
|
||||||
|
if count == 1 {
|
||||||
|
return usize::max_value()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
count += found_exact as usize;
|
||||||
index += len;
|
index += len;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -25,13 +47,19 @@ impl Criterion for Exact {
|
|||||||
let lhs = {
|
let lhs = {
|
||||||
let query_index = lhs.query_index();
|
let query_index = lhs.query_index();
|
||||||
let is_exact = lhs.is_exact();
|
let is_exact = lhs.is_exact();
|
||||||
number_exact_matches(query_index, is_exact)
|
let attribute = lhs.attribute();
|
||||||
|
let fields_counts = &lhs.fields_counts;
|
||||||
|
|
||||||
|
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||||
};
|
};
|
||||||
|
|
||||||
let rhs = {
|
let rhs = {
|
||||||
let query_index = rhs.query_index();
|
let query_index = rhs.query_index();
|
||||||
let is_exact = rhs.is_exact();
|
let is_exact = rhs.is_exact();
|
||||||
number_exact_matches(query_index, is_exact)
|
let attribute = rhs.attribute();
|
||||||
|
let fields_counts = &rhs.fields_counts;
|
||||||
|
|
||||||
|
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||||
};
|
};
|
||||||
|
|
||||||
lhs.cmp(&rhs).reverse()
|
lhs.cmp(&rhs).reverse()
|
||||||
@ -52,14 +80,51 @@ mod tests {
|
|||||||
// doc1: "souliereres rouge"
|
// doc1: "souliereres rouge"
|
||||||
#[test]
|
#[test]
|
||||||
fn easy_case() {
|
fn easy_case() {
|
||||||
let query_index0 = &[0];
|
let doc0 = {
|
||||||
let is_exact0 = &[true];
|
let query_index = &[0];
|
||||||
|
let attribute = &[0];
|
||||||
|
let is_exact = &[true];
|
||||||
|
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
|
||||||
|
|
||||||
let query_index1 = &[0];
|
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||||
let is_exact1 = &[false];
|
};
|
||||||
|
|
||||||
|
let doc1 = {
|
||||||
|
let query_index = &[0];
|
||||||
|
let attribute = &[0];
|
||||||
|
let is_exact = &[false];
|
||||||
|
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
|
||||||
|
|
||||||
|
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||||
|
};
|
||||||
|
|
||||||
|
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||||
|
}
|
||||||
|
|
||||||
|
// typing: "soulier"
|
||||||
|
//
|
||||||
|
// doc0: { 0. "soulier" }
|
||||||
|
// doc1: { 0. "soulier bleu et blanc" }
|
||||||
|
#[test]
|
||||||
|
fn basic() {
|
||||||
|
let doc0 = {
|
||||||
|
let query_index = &[0];
|
||||||
|
let attribute = &[0];
|
||||||
|
let is_exact = &[true];
|
||||||
|
let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
|
||||||
|
|
||||||
|
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||||
|
};
|
||||||
|
|
||||||
|
let doc1 = {
|
||||||
|
let query_index = &[0];
|
||||||
|
let attribute = &[0];
|
||||||
|
let is_exact = &[true];
|
||||||
|
let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
|
||||||
|
|
||||||
|
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||||
|
};
|
||||||
|
|
||||||
let doc0 = number_exact_matches(query_index0, is_exact0);
|
|
||||||
let doc1 = number_exact_matches(query_index1, is_exact1);
|
|
||||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -22,6 +22,7 @@ pub struct QueryBuilder<'c, FI = fn(DocumentId) -> bool> {
|
|||||||
timeout: Option<Duration>,
|
timeout: Option<Duration>,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
postings_lists_store: store::PostingsLists,
|
postings_lists_store: store::PostingsLists,
|
||||||
|
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
synonyms_store: store::Synonyms,
|
synonyms_store: store::Synonyms,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -130,6 +131,7 @@ fn fetch_raw_documents(
|
|||||||
searchables: Option<&ReorderedAttrs>,
|
searchables: Option<&ReorderedAttrs>,
|
||||||
main_store: &store::Main,
|
main_store: &store::Main,
|
||||||
postings_lists_store: &store::PostingsLists,
|
postings_lists_store: &store::PostingsLists,
|
||||||
|
documents_fields_counts_store: &store::DocumentsFieldsCounts,
|
||||||
) -> MResult<Vec<RawDocument>>
|
) -> MResult<Vec<RawDocument>>
|
||||||
{
|
{
|
||||||
let mut matches = Vec::new();
|
let mut matches = Vec::new();
|
||||||
@ -187,22 +189,42 @@ fn fetch_raw_documents(
|
|||||||
SetBuf::new_unchecked(highlights)
|
SetBuf::new_unchecked(highlights)
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(raw_documents_from(matches, highlights))
|
let fields_counts = {
|
||||||
|
let mut fields_counts = Vec::new();
|
||||||
|
for group in matches.linear_group_by_key(|(id, ..)| *id) {
|
||||||
|
let id = group[0].0;
|
||||||
|
for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
|
||||||
|
let (attr, count) = result?;
|
||||||
|
fields_counts.push((id, attr, count));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
SetBuf::new(fields_counts).unwrap()
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(raw_documents_from(matches, highlights, fields_counts))
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'c> QueryBuilder<'c> {
|
impl<'c> QueryBuilder<'c> {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
main: store::Main,
|
main: store::Main,
|
||||||
postings_lists: store::PostingsLists,
|
postings_lists: store::PostingsLists,
|
||||||
|
documents_fields_counts: store::DocumentsFieldsCounts,
|
||||||
synonyms: store::Synonyms,
|
synonyms: store::Synonyms,
|
||||||
) -> QueryBuilder<'c>
|
) -> QueryBuilder<'c>
|
||||||
{
|
{
|
||||||
QueryBuilder::with_criteria(main, postings_lists, synonyms, Criteria::default())
|
QueryBuilder::with_criteria(
|
||||||
|
main,
|
||||||
|
postings_lists,
|
||||||
|
documents_fields_counts,
|
||||||
|
synonyms,
|
||||||
|
Criteria::default(),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn with_criteria(
|
pub fn with_criteria(
|
||||||
main: store::Main,
|
main: store::Main,
|
||||||
postings_lists: store::PostingsLists,
|
postings_lists: store::PostingsLists,
|
||||||
|
documents_fields_counts: store::DocumentsFieldsCounts,
|
||||||
synonyms: store::Synonyms,
|
synonyms: store::Synonyms,
|
||||||
criteria: Criteria<'c>,
|
criteria: Criteria<'c>,
|
||||||
) -> QueryBuilder<'c>
|
) -> QueryBuilder<'c>
|
||||||
@ -214,6 +236,7 @@ impl<'c> QueryBuilder<'c> {
|
|||||||
timeout: None,
|
timeout: None,
|
||||||
main_store: main,
|
main_store: main,
|
||||||
postings_lists_store: postings_lists,
|
postings_lists_store: postings_lists,
|
||||||
|
documents_fields_counts_store: documents_fields_counts,
|
||||||
synonyms_store: synonyms,
|
synonyms_store: synonyms,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -230,6 +253,7 @@ impl<'c, FI> QueryBuilder<'c, FI> {
|
|||||||
timeout: self.timeout,
|
timeout: self.timeout,
|
||||||
main_store: self.main_store,
|
main_store: self.main_store,
|
||||||
postings_lists_store: self.postings_lists_store,
|
postings_lists_store: self.postings_lists_store,
|
||||||
|
documents_fields_counts_store: self.documents_fields_counts_store,
|
||||||
synonyms_store: self.synonyms_store,
|
synonyms_store: self.synonyms_store,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -292,6 +316,7 @@ impl<FI> QueryBuilder<'_, FI> where FI: Fn(DocumentId) -> bool {
|
|||||||
self.searchable_attrs.as_ref(),
|
self.searchable_attrs.as_ref(),
|
||||||
&self.main_store,
|
&self.main_store,
|
||||||
&self.postings_lists_store,
|
&self.postings_lists_store,
|
||||||
|
&self.documents_fields_counts_store,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// stop processing when time is running out
|
// stop processing when time is running out
|
||||||
@ -420,6 +445,7 @@ where FI: Fn(DocumentId) -> bool,
|
|||||||
self.inner.searchable_attrs.as_ref(),
|
self.inner.searchable_attrs.as_ref(),
|
||||||
&self.inner.main_store,
|
&self.inner.main_store,
|
||||||
&self.inner.postings_lists_store,
|
&self.inner.postings_lists_store,
|
||||||
|
&self.inner.documents_fields_counts_store,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// stop processing when time is running out
|
// stop processing when time is running out
|
||||||
@ -549,6 +575,7 @@ mod tests {
|
|||||||
use fst::{Set, IntoStreamer};
|
use fst::{Set, IntoStreamer};
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use tempfile::TempDir;
|
use tempfile::TempDir;
|
||||||
|
use meilidb_schema::SchemaAttr;
|
||||||
|
|
||||||
use crate::automaton::normalize_str;
|
use crate::automaton::normalize_str;
|
||||||
use crate::database::Database;
|
use crate::database::Database;
|
||||||
@ -653,11 +680,15 @@ mod tests {
|
|||||||
|
|
||||||
let mut words_fst = BTreeSet::new();
|
let mut words_fst = BTreeSet::new();
|
||||||
let mut postings_lists = HashMap::new();
|
let mut postings_lists = HashMap::new();
|
||||||
|
let mut fields_counts = HashMap::<_, u64>::new();
|
||||||
|
|
||||||
for (word, indexes) in iter {
|
for (word, indexes) in iter {
|
||||||
let word = word.to_lowercase().into_bytes();
|
let word = word.to_lowercase().into_bytes();
|
||||||
words_fst.insert(word.clone());
|
words_fst.insert(word.clone());
|
||||||
postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
|
postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
|
||||||
|
for idx in indexes {
|
||||||
|
fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let words_fst = Set::from_iter(words_fst).unwrap();
|
let words_fst = Set::from_iter(words_fst).unwrap();
|
||||||
@ -669,6 +700,25 @@ mod tests {
|
|||||||
index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
|
index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for ((docid, attr, _), count) in fields_counts {
|
||||||
|
let prev = index.documents_fields_counts
|
||||||
|
.document_field_count(
|
||||||
|
&mut writer,
|
||||||
|
docid,
|
||||||
|
SchemaAttr(attr),
|
||||||
|
).unwrap();
|
||||||
|
|
||||||
|
let prev = prev.unwrap_or(0);
|
||||||
|
|
||||||
|
index.documents_fields_counts
|
||||||
|
.put_document_field_count(
|
||||||
|
&mut writer,
|
||||||
|
docid,
|
||||||
|
SchemaAttr(attr),
|
||||||
|
prev + count,
|
||||||
|
).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
writer.commit().unwrap();
|
writer.commit().unwrap();
|
||||||
drop(rkv);
|
drop(rkv);
|
||||||
|
|
||||||
@ -1470,8 +1520,8 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn deunicoded_synonyms() {
|
fn deunicoded_synonyms() {
|
||||||
let mut store = TempDatabase::from_iter(vec![
|
let mut store = TempDatabase::from_iter(vec![
|
||||||
("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded
|
("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded
|
||||||
("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex
|
("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex
|
||||||
|
|
||||||
("iphone", &[doc_index(1, 0)][..]),
|
("iphone", &[doc_index(1, 0)][..]),
|
||||||
]);
|
]);
|
||||||
|
@ -1,7 +1,10 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|
||||||
|
use meilidb_schema::SchemaAttr;
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::{TmpMatch, DocumentId, Highlight};
|
use crate::{TmpMatch, DocumentId, Highlight};
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
@ -9,13 +12,10 @@ pub struct RawDocument {
|
|||||||
pub id: DocumentId,
|
pub id: DocumentId,
|
||||||
pub matches: SharedMatches,
|
pub matches: SharedMatches,
|
||||||
pub highlights: Vec<Highlight>,
|
pub highlights: Vec<Highlight>,
|
||||||
|
pub fields_counts: SetBuf<(SchemaAttr, u64)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RawDocument {
|
impl RawDocument {
|
||||||
fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
|
|
||||||
RawDocument { id, matches, highlights }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn query_index(&self) -> &[u32] {
|
pub fn query_index(&self) -> &[u32] {
|
||||||
let r = self.matches.range;
|
let r = self.matches.range;
|
||||||
// it is safe because construction/modifications
|
// it is safe because construction/modifications
|
||||||
@ -60,7 +60,7 @@ impl fmt::Debug for RawDocument {
|
|||||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
|
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
|
||||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
|
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
|
||||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
|
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
|
||||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
|
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
|
||||||
f.write_str("}")?;
|
f.write_str("}")?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -69,31 +69,34 @@ impl fmt::Debug for RawDocument {
|
|||||||
pub fn raw_documents_from(
|
pub fn raw_documents_from(
|
||||||
matches: SetBuf<(DocumentId, TmpMatch)>,
|
matches: SetBuf<(DocumentId, TmpMatch)>,
|
||||||
highlights: SetBuf<(DocumentId, Highlight)>,
|
highlights: SetBuf<(DocumentId, Highlight)>,
|
||||||
|
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
|
||||||
) -> Vec<RawDocument>
|
) -> Vec<RawDocument>
|
||||||
{
|
{
|
||||||
let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
|
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
|
||||||
let mut matches2 = Matches::with_capacity(matches.len());
|
let mut matches2 = Matches::with_capacity(matches.len());
|
||||||
|
|
||||||
let matches = matches.linear_group_by_key(|(id, _)| *id);
|
let matches = matches.linear_group_by_key(|(id, _)| *id);
|
||||||
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
|
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
|
||||||
|
let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
|
||||||
|
|
||||||
for (mgroup, hgroup) in matches.zip(highlights) {
|
for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
|
||||||
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
|
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
|
||||||
|
debug_assert_eq!(mgroup[0].0, fgroup[0].0);
|
||||||
|
|
||||||
let document_id = mgroup[0].0;
|
let document_id = mgroup[0].0;
|
||||||
let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
|
let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
|
||||||
let end = start + mgroup.len();
|
let end = start + mgroup.len();
|
||||||
|
|
||||||
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
|
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
|
||||||
docs_ranges.push((document_id, Range { start, end }, highlights));
|
let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
|
||||||
|
|
||||||
|
docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
|
||||||
matches2.extend_from_slice(mgroup);
|
matches2.extend_from_slice(mgroup);
|
||||||
}
|
}
|
||||||
|
|
||||||
let matches = Arc::new(matches2);
|
let matches = Arc::new(matches2);
|
||||||
docs_ranges.into_iter().map(|(id, range, highlights)| {
|
docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| {
|
||||||
let matches = SharedMatches { range, matches: matches.clone() };
|
let matches = SharedMatches { range, matches: matches.clone() };
|
||||||
RawDocument::new(id, matches, highlights)
|
RawDocument { id, matches, highlights, fields_counts }
|
||||||
}).collect()
|
}).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -33,7 +33,8 @@ impl RawIndexer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
|
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize {
|
||||||
|
let mut number_of_words = 0;
|
||||||
let lowercase_text = text.to_lowercase();
|
let lowercase_text = text.to_lowercase();
|
||||||
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
|
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
|
||||||
|
|
||||||
@ -46,6 +47,9 @@ impl RawIndexer {
|
|||||||
let iter = Some(lowercase_text).into_iter().chain(next);
|
let iter = Some(lowercase_text).into_iter().chain(next);
|
||||||
|
|
||||||
for text in iter {
|
for text in iter {
|
||||||
|
// we must not count 2 times the same words
|
||||||
|
number_of_words = 0;
|
||||||
|
|
||||||
for token in Tokenizer::new(&text) {
|
for token in Tokenizer::new(&text) {
|
||||||
let must_continue = index_token(
|
let must_continue = index_token(
|
||||||
token,
|
token,
|
||||||
@ -57,8 +61,12 @@ impl RawIndexer {
|
|||||||
);
|
);
|
||||||
|
|
||||||
if !must_continue { break }
|
if !must_continue { break }
|
||||||
|
|
||||||
|
number_of_words += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
number_of_words
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
|
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
|
||||||
|
@ -13,7 +13,7 @@ pub struct Indexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::Serializer for Indexer<'a> {
|
impl<'a> ser::Serializer for Indexer<'a> {
|
||||||
type Ok = ();
|
type Ok = Option<usize>;
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
type SerializeSeq = SeqIndexer<'a>;
|
type SerializeSeq = SeqIndexer<'a>;
|
||||||
type SerializeTuple = TupleIndexer<'a>;
|
type SerializeTuple = TupleIndexer<'a>;
|
||||||
@ -83,8 +83,8 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
|
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
|
||||||
self.indexer.index_text(self.document_id, self.attribute, text);
|
let number_of_words = self.indexer.index_text(self.document_id, self.attribute, text);
|
||||||
Ok(())
|
Ok(Some(number_of_words))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||||
@ -99,8 +99,8 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
|||||||
where T: ser::Serialize,
|
where T: ser::Serialize,
|
||||||
{
|
{
|
||||||
let text = value.serialize(ConvertToString)?;
|
let text = value.serialize(ConvertToString)?;
|
||||||
self.indexer.index_text(self.document_id, self.attribute, &text);
|
let number_of_words = self.indexer.index_text(self.document_id, self.attribute, &text);
|
||||||
Ok(())
|
Ok(Some(number_of_words))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||||
@ -225,7 +225,7 @@ pub struct SeqIndexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
|
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
|
||||||
type Ok = ();
|
type Ok = Option<usize>;
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||||
@ -239,7 +239,7 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
|
|||||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||||
let texts = self.texts.iter().map(String::as_str);
|
let texts = self.texts.iter().map(String::as_str);
|
||||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||||
Ok(())
|
Ok(None)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -251,7 +251,7 @@ pub struct MapIndexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
||||||
type Ok = ();
|
type Ok = Option<usize>;
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||||
@ -273,7 +273,7 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
|||||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||||
let texts = self.texts.iter().map(String::as_str);
|
let texts = self.texts.iter().map(String::as_str);
|
||||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||||
Ok(())
|
Ok(None)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -285,7 +285,7 @@ pub struct StructSerializer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||||
type Ok = ();
|
type Ok = Option<usize>;
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
fn serialize_field<T: ?Sized>(
|
fn serialize_field<T: ?Sized>(
|
||||||
@ -305,7 +305,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
|||||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||||
let texts = self.texts.iter().map(String::as_str);
|
let texts = self.texts.iter().map(String::as_str);
|
||||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||||
Ok(())
|
Ok(None)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -317,7 +317,7 @@ pub struct TupleIndexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
|
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
|
||||||
type Ok = ();
|
type Ok = Option<usize>;
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||||
@ -331,6 +331,6 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
|
|||||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||||
let texts = self.texts.iter().map(String::as_str);
|
let texts = self.texts.iter().map(String::as_str);
|
||||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||||
Ok(())
|
Ok(None)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use meilidb_schema::Schema;
|
use std::collections::HashMap;
|
||||||
|
use meilidb_schema::{Schema, SchemaAttr};
|
||||||
use serde::ser;
|
use serde::ser;
|
||||||
|
|
||||||
use crate::{DocumentId, RankedMap};
|
use crate::{DocumentId, RankedMap};
|
||||||
@ -10,6 +11,7 @@ use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer};
|
|||||||
pub struct Serializer<'a> {
|
pub struct Serializer<'a> {
|
||||||
pub schema: &'a Schema,
|
pub schema: &'a Schema,
|
||||||
pub document_store: &'a mut RamDocumentStore,
|
pub document_store: &'a mut RamDocumentStore,
|
||||||
|
pub document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||||
pub indexer: &'a mut RawIndexer,
|
pub indexer: &'a mut RawIndexer,
|
||||||
pub ranked_map: &'a mut RankedMap,
|
pub ranked_map: &'a mut RankedMap,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
@ -135,6 +137,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
|||||||
schema: self.schema,
|
schema: self.schema,
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
document_store: self.document_store,
|
document_store: self.document_store,
|
||||||
|
document_fields_counts: self.document_fields_counts,
|
||||||
indexer: self.indexer,
|
indexer: self.indexer,
|
||||||
ranked_map: self.ranked_map,
|
ranked_map: self.ranked_map,
|
||||||
current_key_name: None,
|
current_key_name: None,
|
||||||
@ -151,6 +154,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
|||||||
schema: self.schema,
|
schema: self.schema,
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
document_store: self.document_store,
|
document_store: self.document_store,
|
||||||
|
document_fields_counts: self.document_fields_counts,
|
||||||
indexer: self.indexer,
|
indexer: self.indexer,
|
||||||
ranked_map: self.ranked_map,
|
ranked_map: self.ranked_map,
|
||||||
})
|
})
|
||||||
@ -172,6 +176,7 @@ pub struct MapSerializer<'a> {
|
|||||||
schema: &'a Schema,
|
schema: &'a Schema,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
document_store: &'a mut RamDocumentStore,
|
document_store: &'a mut RamDocumentStore,
|
||||||
|
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||||
indexer: &'a mut RawIndexer,
|
indexer: &'a mut RawIndexer,
|
||||||
ranked_map: &'a mut RankedMap,
|
ranked_map: &'a mut RankedMap,
|
||||||
current_key_name: Option<String>,
|
current_key_name: Option<String>,
|
||||||
@ -209,6 +214,7 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
|
|||||||
self.schema,
|
self.schema,
|
||||||
self.document_id,
|
self.document_id,
|
||||||
self.document_store,
|
self.document_store,
|
||||||
|
self.document_fields_counts,
|
||||||
self.indexer,
|
self.indexer,
|
||||||
self.ranked_map,
|
self.ranked_map,
|
||||||
&key,
|
&key,
|
||||||
@ -225,6 +231,7 @@ pub struct StructSerializer<'a> {
|
|||||||
schema: &'a Schema,
|
schema: &'a Schema,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
document_store: &'a mut RamDocumentStore,
|
document_store: &'a mut RamDocumentStore,
|
||||||
|
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||||
indexer: &'a mut RawIndexer,
|
indexer: &'a mut RawIndexer,
|
||||||
ranked_map: &'a mut RankedMap,
|
ranked_map: &'a mut RankedMap,
|
||||||
}
|
}
|
||||||
@ -244,6 +251,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
|||||||
self.schema,
|
self.schema,
|
||||||
self.document_id,
|
self.document_id,
|
||||||
self.document_store,
|
self.document_store,
|
||||||
|
self.document_fields_counts,
|
||||||
self.indexer,
|
self.indexer,
|
||||||
self.ranked_map,
|
self.ranked_map,
|
||||||
key,
|
key,
|
||||||
@ -260,6 +268,7 @@ fn serialize_value<T: ?Sized>(
|
|||||||
schema: &Schema,
|
schema: &Schema,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
document_store: &mut RamDocumentStore,
|
document_store: &mut RamDocumentStore,
|
||||||
|
documents_fields_counts: &mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||||
indexer: &mut RawIndexer,
|
indexer: &mut RawIndexer,
|
||||||
ranked_map: &mut RankedMap,
|
ranked_map: &mut RankedMap,
|
||||||
key: &str,
|
key: &str,
|
||||||
@ -275,7 +284,9 @@ where T: ser::Serialize,
|
|||||||
|
|
||||||
if props.is_indexed() {
|
if props.is_indexed() {
|
||||||
let indexer = Indexer { attribute, indexer, document_id };
|
let indexer = Indexer { attribute, indexer, document_id };
|
||||||
value.serialize(indexer)?;
|
if let Some(number_of_words) = value.serialize(indexer)? {
|
||||||
|
documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if props.is_ranked() {
|
if props.is_ranked() {
|
||||||
|
@ -1,37 +1,13 @@
|
|||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
use meilidb_schema::SchemaAttr;
|
use meilidb_schema::SchemaAttr;
|
||||||
use crate::DocumentId;
|
use crate::DocumentId;
|
||||||
|
use super::{document_attribute_into_key, document_attribute_from_key};
|
||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone)]
|
||||||
pub struct DocumentsFields {
|
pub struct DocumentsFields {
|
||||||
pub(crate) documents_fields: rkv::SingleStore,
|
pub(crate) documents_fields: rkv::SingleStore,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn document_attribute_into_key(document_id: DocumentId, attribute: SchemaAttr) -> [u8; 10] {
|
|
||||||
let document_id_bytes = document_id.0.to_be_bytes();
|
|
||||||
let attr_bytes = attribute.0.to_be_bytes();
|
|
||||||
|
|
||||||
let mut key = [0u8; 10];
|
|
||||||
key[0..8].copy_from_slice(&document_id_bytes);
|
|
||||||
key[8..10].copy_from_slice(&attr_bytes);
|
|
||||||
|
|
||||||
key
|
|
||||||
}
|
|
||||||
|
|
||||||
fn document_attribute_from_key(key: [u8; 10]) -> (DocumentId, SchemaAttr) {
|
|
||||||
let document_id = {
|
|
||||||
let array = TryFrom::try_from(&key[0..8]).unwrap();
|
|
||||||
DocumentId(u64::from_be_bytes(array))
|
|
||||||
};
|
|
||||||
|
|
||||||
let schema_attr = {
|
|
||||||
let array = TryFrom::try_from(&key[8..8+2]).unwrap();
|
|
||||||
SchemaAttr(u16::from_be_bytes(array))
|
|
||||||
};
|
|
||||||
|
|
||||||
(document_id, schema_attr)
|
|
||||||
}
|
|
||||||
|
|
||||||
impl DocumentsFields {
|
impl DocumentsFields {
|
||||||
pub fn put_document_field(
|
pub fn put_document_field(
|
||||||
&self,
|
&self,
|
||||||
@ -100,15 +76,6 @@ impl DocumentsFields {
|
|||||||
let iter = self.documents_fields.iter_from(reader, document_id_bytes)?;
|
let iter = self.documents_fields.iter_from(reader, document_id_bytes)?;
|
||||||
Ok(DocumentFieldsIter { document_id, iter })
|
Ok(DocumentFieldsIter { document_id, iter })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn documents_ids<'r, T: rkv::Readable>(
|
|
||||||
&self,
|
|
||||||
reader: &'r T,
|
|
||||||
) -> Result<DocumentsIdsIter<'r>, rkv::StoreError>
|
|
||||||
{
|
|
||||||
let iter = self.documents_fields.iter_start(reader)?;
|
|
||||||
Ok(DocumentsIdsIter { last_seen_id: None, iter })
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct DocumentFieldsIter<'r> {
|
pub struct DocumentFieldsIter<'r> {
|
||||||
@ -134,30 +101,3 @@ impl<'r> Iterator for DocumentFieldsIter<'r> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct DocumentsIdsIter<'r> {
|
|
||||||
last_seen_id: Option<DocumentId>,
|
|
||||||
iter: rkv::store::single::Iter<'r>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'r> Iterator for DocumentsIdsIter<'r> {
|
|
||||||
type Item = Result<DocumentId, rkv::StoreError>;
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
for result in &mut self.iter {
|
|
||||||
match result {
|
|
||||||
Ok((key, _)) => {
|
|
||||||
let array = TryFrom::try_from(key).unwrap();
|
|
||||||
let (document_id, _) = document_attribute_from_key(array);
|
|
||||||
if Some(document_id) != self.last_seen_id {
|
|
||||||
self.last_seen_id = Some(document_id);
|
|
||||||
return Some(Ok(document_id))
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Err(e) => return Some(Err(e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
134
meilidb-core/src/store/documents_fields_counts.rs
Normal file
134
meilidb-core/src/store/documents_fields_counts.rs
Normal file
@ -0,0 +1,134 @@
|
|||||||
|
use std::convert::TryFrom;
|
||||||
|
use meilidb_schema::SchemaAttr;
|
||||||
|
use crate::DocumentId;
|
||||||
|
use super::{document_attribute_into_key, document_attribute_from_key};
|
||||||
|
|
||||||
|
#[derive(Copy, Clone)]
|
||||||
|
pub struct DocumentsFieldsCounts {
|
||||||
|
pub(crate) documents_fields_counts: rkv::SingleStore,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DocumentsFieldsCounts {
|
||||||
|
pub fn put_document_field_count(
|
||||||
|
&self,
|
||||||
|
writer: &mut rkv::Writer,
|
||||||
|
document_id: DocumentId,
|
||||||
|
attribute: SchemaAttr,
|
||||||
|
value: u64,
|
||||||
|
) -> Result<(), rkv::StoreError>
|
||||||
|
{
|
||||||
|
let key = document_attribute_into_key(document_id, attribute);
|
||||||
|
self.documents_fields_counts.put(writer, key, &rkv::Value::U64(value))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn del_all_document_fields_counts(
|
||||||
|
&self,
|
||||||
|
writer: &mut rkv::Writer,
|
||||||
|
document_id: DocumentId,
|
||||||
|
) -> Result<usize, rkv::StoreError>
|
||||||
|
{
|
||||||
|
let mut keys_to_delete = Vec::new();
|
||||||
|
|
||||||
|
// WARN we can not delete the keys using the iterator
|
||||||
|
// so we store them and delete them just after
|
||||||
|
for result in self.document_fields_counts(writer, document_id)? {
|
||||||
|
let (attribute, _) = result?;
|
||||||
|
let key = document_attribute_into_key(document_id, attribute);
|
||||||
|
keys_to_delete.push(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
let count = keys_to_delete.len();
|
||||||
|
for key in keys_to_delete {
|
||||||
|
self.documents_fields_counts.delete(writer, key)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(count)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn document_field_count(
|
||||||
|
&self,
|
||||||
|
reader: &impl rkv::Readable,
|
||||||
|
document_id: DocumentId,
|
||||||
|
attribute: SchemaAttr,
|
||||||
|
) -> Result<Option<u64>, rkv::StoreError>
|
||||||
|
{
|
||||||
|
let key = document_attribute_into_key(document_id, attribute);
|
||||||
|
|
||||||
|
match self.documents_fields_counts.get(reader, key)? {
|
||||||
|
Some(rkv::Value::U64(count)) => Ok(Some(count)),
|
||||||
|
Some(value) => panic!("invalid type {:?}", value),
|
||||||
|
None => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn document_fields_counts<'r, T: rkv::Readable>(
|
||||||
|
&self,
|
||||||
|
reader: &'r T,
|
||||||
|
document_id: DocumentId,
|
||||||
|
) -> Result<DocumentFieldsCountsIter<'r>, rkv::StoreError>
|
||||||
|
{
|
||||||
|
let document_id_bytes = document_id.0.to_be_bytes();
|
||||||
|
let iter = self.documents_fields_counts.iter_from(reader, document_id_bytes)?;
|
||||||
|
Ok(DocumentFieldsCountsIter { document_id, iter })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn documents_ids<'r, T: rkv::Readable>(
|
||||||
|
&self,
|
||||||
|
reader: &'r T,
|
||||||
|
) -> Result<DocumentsIdsIter<'r>, rkv::StoreError>
|
||||||
|
{
|
||||||
|
let iter = self.documents_fields_counts.iter_start(reader)?;
|
||||||
|
Ok(DocumentsIdsIter { last_seen_id: None, iter })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DocumentFieldsCountsIter<'r> {
|
||||||
|
document_id: DocumentId,
|
||||||
|
iter: rkv::store::single::Iter<'r>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for DocumentFieldsCountsIter<'_> {
|
||||||
|
type Item = Result<(SchemaAttr, u64), rkv::StoreError>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
match self.iter.next() {
|
||||||
|
Some(Ok((key, Some(rkv::Value::U64(count))))) => {
|
||||||
|
let array = TryFrom::try_from(key).unwrap();
|
||||||
|
let (current_document_id, attr) = document_attribute_from_key(array);
|
||||||
|
if current_document_id != self.document_id { return None; }
|
||||||
|
|
||||||
|
Some(Ok((attr, count)))
|
||||||
|
},
|
||||||
|
Some(Ok((key, data))) => panic!("{:?}, {:?}", key, data),
|
||||||
|
Some(Err(e)) => Some(Err(e)),
|
||||||
|
None => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DocumentsIdsIter<'r> {
|
||||||
|
last_seen_id: Option<DocumentId>,
|
||||||
|
iter: rkv::store::single::Iter<'r>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for DocumentsIdsIter<'_> {
|
||||||
|
type Item = Result<DocumentId, rkv::StoreError>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
for result in &mut self.iter {
|
||||||
|
match result {
|
||||||
|
Ok((key, _)) => {
|
||||||
|
let array = TryFrom::try_from(key).unwrap();
|
||||||
|
let (document_id, _) = document_attribute_from_key(array);
|
||||||
|
if Some(document_id) != self.last_seen_id {
|
||||||
|
self.last_seen_id = Some(document_id);
|
||||||
|
return Some(Ok(document_id))
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(e) => return Some(Err(e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
@ -3,7 +3,6 @@ use std::convert::TryInto;
|
|||||||
|
|
||||||
use meilidb_schema::Schema;
|
use meilidb_schema::Schema;
|
||||||
use rkv::Value;
|
use rkv::Value;
|
||||||
use serde::de;
|
|
||||||
use crate::{RankedMap, MResult};
|
use crate::{RankedMap, MResult};
|
||||||
|
|
||||||
const CUSTOMS_KEY: &str = "customs-key";
|
const CUSTOMS_KEY: &str = "customs-key";
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
mod docs_words;
|
mod docs_words;
|
||||||
mod documents_fields;
|
mod documents_fields;
|
||||||
|
mod documents_fields_counts;
|
||||||
mod main;
|
mod main;
|
||||||
mod postings_lists;
|
mod postings_lists;
|
||||||
mod synonyms;
|
mod synonyms;
|
||||||
@ -8,6 +9,7 @@ mod updates_results;
|
|||||||
|
|
||||||
pub use self::docs_words::DocsWords;
|
pub use self::docs_words::DocsWords;
|
||||||
pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter};
|
pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter};
|
||||||
|
pub use self::documents_fields_counts::{DocumentsFieldsCounts, DocumentFieldsCountsIter, DocumentsIdsIter};
|
||||||
pub use self::main::Main;
|
pub use self::main::Main;
|
||||||
pub use self::postings_lists::PostingsLists;
|
pub use self::postings_lists::PostingsLists;
|
||||||
pub use self::synonyms::Synonyms;
|
pub use self::synonyms::Synonyms;
|
||||||
@ -15,8 +17,11 @@ pub use self::updates::Updates;
|
|||||||
pub use self::updates_results::UpdatesResults;
|
pub use self::updates_results::UpdatesResults;
|
||||||
|
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
use std::convert::TryFrom;
|
||||||
|
|
||||||
use meilidb_schema::{Schema, SchemaAttr};
|
use meilidb_schema::{Schema, SchemaAttr};
|
||||||
use serde::{ser, de};
|
use serde::de;
|
||||||
|
|
||||||
use crate::criterion::Criteria;
|
use crate::criterion::Criteria;
|
||||||
use crate::serde::Deserializer;
|
use crate::serde::Deserializer;
|
||||||
use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error};
|
use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error};
|
||||||
@ -25,6 +30,31 @@ fn aligned_to(bytes: &[u8], align: usize) -> bool {
|
|||||||
(bytes as *const _ as *const () as usize) % align == 0
|
(bytes as *const _ as *const () as usize) % align == 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds the 10-byte store key for a `(document, attribute)` pair.
///
/// Bytes `0..8` hold the document id and bytes `8..10` hold the attribute id,
/// both encoded big-endian.
fn document_attribute_into_key(document_id: DocumentId, attribute: SchemaAttr) -> [u8; 10] {
    let mut key = [0u8; 10];

    let (id_part, attr_part) = key.split_at_mut(8);
    id_part.copy_from_slice(&document_id.0.to_be_bytes());
    attr_part.copy_from_slice(&attribute.0.to_be_bytes());

    key
}
|
||||||
|
|
||||||
|
fn document_attribute_from_key(key: [u8; 10]) -> (DocumentId, SchemaAttr) {
|
||||||
|
let document_id = {
|
||||||
|
let array = TryFrom::try_from(&key[0..8]).unwrap();
|
||||||
|
DocumentId(u64::from_be_bytes(array))
|
||||||
|
};
|
||||||
|
|
||||||
|
let schema_attr = {
|
||||||
|
let array = TryFrom::try_from(&key[8..8+2]).unwrap();
|
||||||
|
SchemaAttr(u16::from_be_bytes(array))
|
||||||
|
};
|
||||||
|
|
||||||
|
(document_id, schema_attr)
|
||||||
|
}
|
||||||
|
|
||||||
fn main_name(name: &str) -> String {
|
fn main_name(name: &str) -> String {
|
||||||
format!("store-{}", name)
|
format!("store-{}", name)
|
||||||
}
|
}
|
||||||
@ -37,6 +67,10 @@ fn documents_fields_name(name: &str) -> String {
|
|||||||
format!("store-{}-documents-fields", name)
|
format!("store-{}-documents-fields", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the name of the documents-fields-counts store of the index `name`.
fn documents_fields_counts_name(name: &str) -> String {
    let store_name = format!("store-{}-documents-fields-counts", name);
    store_name
}
|
||||||
|
|
||||||
fn synonyms_name(name: &str) -> String {
|
fn synonyms_name(name: &str) -> String {
|
||||||
format!("store-{}-synonyms", name)
|
format!("store-{}-synonyms", name)
|
||||||
}
|
}
|
||||||
@ -58,6 +92,7 @@ pub struct Index {
|
|||||||
pub main: Main,
|
pub main: Main,
|
||||||
pub postings_lists: PostingsLists,
|
pub postings_lists: PostingsLists,
|
||||||
pub documents_fields: DocumentsFields,
|
pub documents_fields: DocumentsFields,
|
||||||
|
pub documents_fields_counts: DocumentsFieldsCounts,
|
||||||
pub synonyms: Synonyms,
|
pub synonyms: Synonyms,
|
||||||
pub docs_words: DocsWords,
|
pub docs_words: DocsWords,
|
||||||
|
|
||||||
@ -166,11 +201,22 @@ impl Index {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn query_builder(&self) -> QueryBuilder {
|
pub fn query_builder(&self) -> QueryBuilder {
|
||||||
QueryBuilder::new(self.main, self.postings_lists, self.synonyms)
|
QueryBuilder::new(
|
||||||
|
self.main,
|
||||||
|
self.postings_lists,
|
||||||
|
self.documents_fields_counts,
|
||||||
|
self.synonyms,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn query_builder_with_criteria<'c>(&self, criteria: Criteria<'c>) -> QueryBuilder<'c> {
|
pub fn query_builder_with_criteria<'c>(&self, criteria: Criteria<'c>) -> QueryBuilder<'c> {
|
||||||
QueryBuilder::with_criteria(self.main, self.postings_lists, self.synonyms, criteria)
|
QueryBuilder::with_criteria(
|
||||||
|
self.main,
|
||||||
|
self.postings_lists,
|
||||||
|
self.documents_fields_counts,
|
||||||
|
self.synonyms,
|
||||||
|
criteria,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -205,6 +251,7 @@ fn open_options(
|
|||||||
let main_name = main_name(name);
|
let main_name = main_name(name);
|
||||||
let postings_lists_name = postings_lists_name(name);
|
let postings_lists_name = postings_lists_name(name);
|
||||||
let documents_fields_name = documents_fields_name(name);
|
let documents_fields_name = documents_fields_name(name);
|
||||||
|
let documents_fields_counts_name = documents_fields_counts_name(name);
|
||||||
let synonyms_name = synonyms_name(name);
|
let synonyms_name = synonyms_name(name);
|
||||||
let docs_words_name = docs_words_name(name);
|
let docs_words_name = docs_words_name(name);
|
||||||
let updates_name = updates_name(name);
|
let updates_name = updates_name(name);
|
||||||
@ -214,6 +261,7 @@ fn open_options(
|
|||||||
let main = env.open_single(main_name.as_str(), options)?;
|
let main = env.open_single(main_name.as_str(), options)?;
|
||||||
let postings_lists = env.open_single(postings_lists_name.as_str(), options)?;
|
let postings_lists = env.open_single(postings_lists_name.as_str(), options)?;
|
||||||
let documents_fields = env.open_single(documents_fields_name.as_str(), options)?;
|
let documents_fields = env.open_single(documents_fields_name.as_str(), options)?;
|
||||||
|
let documents_fields_counts = env.open_single(documents_fields_counts_name.as_str(), options)?;
|
||||||
let synonyms = env.open_single(synonyms_name.as_str(), options)?;
|
let synonyms = env.open_single(synonyms_name.as_str(), options)?;
|
||||||
let docs_words = env.open_single(docs_words_name.as_str(), options)?;
|
let docs_words = env.open_single(docs_words_name.as_str(), options)?;
|
||||||
let updates = env.open_single(updates_name.as_str(), options)?;
|
let updates = env.open_single(updates_name.as_str(), options)?;
|
||||||
@ -223,6 +271,7 @@ fn open_options(
|
|||||||
main: Main { main },
|
main: Main { main },
|
||||||
postings_lists: PostingsLists { postings_lists },
|
postings_lists: PostingsLists { postings_lists },
|
||||||
documents_fields: DocumentsFields { documents_fields },
|
documents_fields: DocumentsFields { documents_fields },
|
||||||
|
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
|
||||||
synonyms: Synonyms { synonyms },
|
synonyms: Synonyms { synonyms },
|
||||||
docs_words: DocsWords { docs_words },
|
docs_words: DocsWords { docs_words },
|
||||||
updates: Updates { updates },
|
updates: Updates { updates },
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use crate::{store, error::UnsupportedOperation, MResult};
|
|
||||||
use crate::update::{Update, next_update_id};
|
use crate::update::{Update, next_update_id};
|
||||||
|
use crate::{store, MResult};
|
||||||
|
|
||||||
pub fn apply_customs_update(
|
pub fn apply_customs_update(
|
||||||
writer: &mut rkv::Writer,
|
writer: &mut rkv::Writer,
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::{HashMap, HashSet};
|
||||||
|
|
||||||
use fst::{SetBuilder, set::OpBuilder};
|
use fst::{SetBuilder, set::OpBuilder};
|
||||||
use sdset::{SetOperation, duo::Union};
|
use sdset::{SetOperation, duo::Union};
|
||||||
@ -82,6 +82,7 @@ pub fn apply_documents_addition(
|
|||||||
writer: &mut rkv::Writer,
|
writer: &mut rkv::Writer,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
documents_fields_store: store::DocumentsFields,
|
documents_fields_store: store::DocumentsFields,
|
||||||
|
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
postings_lists_store: store::PostingsLists,
|
postings_lists_store: store::PostingsLists,
|
||||||
docs_words_store: store::DocsWords,
|
docs_words_store: store::DocsWords,
|
||||||
mut ranked_map: RankedMap,
|
mut ranked_map: RankedMap,
|
||||||
@ -90,6 +91,7 @@ pub fn apply_documents_addition(
|
|||||||
{
|
{
|
||||||
let mut document_ids = HashSet::new();
|
let mut document_ids = HashSet::new();
|
||||||
let mut document_store = RamDocumentStore::new();
|
let mut document_store = RamDocumentStore::new();
|
||||||
|
let mut document_fields_counts = HashMap::new();
|
||||||
let mut indexer = RawIndexer::new();
|
let mut indexer = RawIndexer::new();
|
||||||
|
|
||||||
let schema = match main_store.schema(writer)? {
|
let schema = match main_store.schema(writer)? {
|
||||||
@ -112,6 +114,7 @@ pub fn apply_documents_addition(
|
|||||||
let serializer = Serializer {
|
let serializer = Serializer {
|
||||||
schema: &schema,
|
schema: &schema,
|
||||||
document_store: &mut document_store,
|
document_store: &mut document_store,
|
||||||
|
document_fields_counts: &mut document_fields_counts,
|
||||||
indexer: &mut indexer,
|
indexer: &mut indexer,
|
||||||
ranked_map: &mut ranked_map,
|
ranked_map: &mut ranked_map,
|
||||||
document_id,
|
document_id,
|
||||||
@ -126,6 +129,7 @@ pub fn apply_documents_addition(
|
|||||||
writer,
|
writer,
|
||||||
main_store,
|
main_store,
|
||||||
documents_fields_store,
|
documents_fields_store,
|
||||||
|
documents_fields_counts_store,
|
||||||
postings_lists_store,
|
postings_lists_store,
|
||||||
docs_words_store,
|
docs_words_store,
|
||||||
ranked_map.clone(),
|
ranked_map.clone(),
|
||||||
@ -137,6 +141,11 @@ pub fn apply_documents_addition(
|
|||||||
documents_fields_store.put_document_field(writer, id, attr, &value)?;
|
documents_fields_store.put_document_field(writer, id, attr, &value)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 3. insert new document attributes counts
|
||||||
|
for ((id, attr), count) in document_fields_counts {
|
||||||
|
documents_fields_counts_store.put_document_field_count(writer, id, attr, count)?;
|
||||||
|
}
|
||||||
|
|
||||||
let indexed = indexer.build();
|
let indexed = indexer.build();
|
||||||
let mut delta_words_builder = SetBuilder::memory();
|
let mut delta_words_builder = SetBuilder::memory();
|
||||||
|
|
||||||
|
@ -86,6 +86,7 @@ pub fn apply_documents_deletion(
|
|||||||
writer: &mut rkv::Writer,
|
writer: &mut rkv::Writer,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
documents_fields_store: store::DocumentsFields,
|
documents_fields_store: store::DocumentsFields,
|
||||||
|
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
postings_lists_store: store::PostingsLists,
|
postings_lists_store: store::PostingsLists,
|
||||||
docs_words_store: store::DocsWords,
|
docs_words_store: store::DocsWords,
|
||||||
mut ranked_map: RankedMap,
|
mut ranked_map: RankedMap,
|
||||||
@ -140,6 +141,7 @@ pub fn apply_documents_deletion(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for id in document_ids {
|
for id in document_ids {
|
||||||
|
documents_fields_counts_store.del_all_document_fields_counts(writer, id)?;
|
||||||
if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
|
if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
|
||||||
deleted_documents.insert(id);
|
deleted_documents.insert(id);
|
||||||
}
|
}
|
||||||
|
@ -138,6 +138,7 @@ pub fn update_task(writer: &mut rkv::Writer, index: store::Index) -> MResult<Opt
|
|||||||
writer,
|
writer,
|
||||||
index.main,
|
index.main,
|
||||||
index.documents_fields,
|
index.documents_fields,
|
||||||
|
index.documents_fields_counts,
|
||||||
index.postings_lists,
|
index.postings_lists,
|
||||||
index.docs_words,
|
index.docs_words,
|
||||||
ranked_map,
|
ranked_map,
|
||||||
@ -160,6 +161,7 @@ pub fn update_task(writer: &mut rkv::Writer, index: store::Index) -> MResult<Opt
|
|||||||
writer,
|
writer,
|
||||||
index.main,
|
index.main,
|
||||||
index.documents_fields,
|
index.documents_fields,
|
||||||
|
index.documents_fields_counts,
|
||||||
index.postings_lists,
|
index.postings_lists,
|
||||||
index.docs_words,
|
index.docs_words,
|
||||||
ranked_map,
|
ranked_map,
|
||||||
|
Loading…
Reference in New Issue
Block a user