From 169bd4cb399703508c562881396b6800eb3c45e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 13 May 2019 16:22:36 +0200 Subject: [PATCH] feat: Store all documents words by document rather than by attribute --- meilidb-data/src/database.rs | 118 ++++++++++++++--------------------- meilidb-data/src/indexer.rs | 24 +++---- 2 files changed, 59 insertions(+), 83 deletions(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 01273e186..badbc2bf8 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -15,7 +15,7 @@ use zerocopy::{AsBytes, LayoutVerified}; use fst::{SetBuilder, set::OpBuilder, Streamer}; use crate::document_attr_key::DocumentAttrKey; -use crate::indexer::{Indexer, Indexed}; +use crate::indexer::Indexer; use crate::serde::extract_document_id; use crate::serde::{Serializer, RamDocumentStore, Deserializer, SerializerError}; use crate::{Schema, SchemaAttr, RankedMap}; @@ -131,10 +131,10 @@ impl Database { WordsIndex(tree) }; - let attrs_words = { - let tree_name = format!("{}-attrs-words", name); + let docs_words = { + let tree_name = format!("{}-docs-words", name); let tree = self.inner.open_tree(tree_name)?; - AttrsWords(tree) + DocsWords(tree) }; let documents = { @@ -143,7 +143,7 @@ impl Database { DocumentsIndex(tree) }; - let raw_index = RawIndex { main, words, attrs_words, documents }; + let raw_index = RawIndex { main, words, docs_words, documents }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() @@ -180,10 +180,10 @@ impl Database { WordsIndex(tree) }; - let attrs_words = { - let tree_name = format!("{}-attrs-words", name); + let docs_words = { + let tree_name = format!("{}-docs-words", name); let tree = self.inner.open_tree(tree_name)?; - AttrsWords(tree) + DocsWords(tree) }; let documents = { @@ -196,7 +196,7 @@ impl Database { indexes.insert(name.to_string()); self.set_indexes(&indexes)?; - let raw_index = RawIndex { main, words, attrs_words, documents }; + let raw_index = RawIndex { main, words, docs_words, documents }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() @@ -211,7 +211,7 @@ impl Database { pub struct RawIndex { pub main: MainIndex, pub words: WordsIndex, - pub attrs_words: AttrsWords, + pub docs_words: DocsWords, pub documents: DocumentsIndex, } @@ -299,11 +299,11 @@ impl WordsIndex { } #[derive(Clone)] -pub struct AttrsWords(Arc); +pub struct DocsWords(Arc); -impl AttrsWords { - pub fn attr_words(&self, id: DocumentId, attr: SchemaAttr) -> Result, Error> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); +impl DocsWords { + pub fn doc_words(&self, id: DocumentId) -> Result, Error> { + let key = id.0.to_be_bytes(); match self.0.get(key)? { Some(bytes) => { let len = bytes.len(); @@ -315,54 +315,19 @@ impl AttrsWords { } } - pub fn attrs_words(&self, id: DocumentId) -> DocumentAttrsWordsIter { - let start = DocumentAttrKey::new(id, SchemaAttr::min()); - let start = start.to_be_bytes(); - - let end = DocumentAttrKey::new(id, SchemaAttr::max()); - let end = end.to_be_bytes(); - - DocumentAttrsWordsIter(self.0.range(start..=end)) - } - - pub fn set_attr_words(&self, id: DocumentId, attr: SchemaAttr, words: &fst::Set) -> Result<(), Error> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + pub fn set_doc_words(&self, id: DocumentId, words: &fst::Set) -> Result<(), Error> { + let key = id.0.to_be_bytes(); self.0.set(key, words.as_fst().as_bytes())?; Ok(()) } - pub fn del_attr_words(&self, id: DocumentId, attr: SchemaAttr) -> Result<(), Error> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); + pub fn del_doc_words(&self, id: DocumentId) -> Result<(), Error> { + let key = id.0.to_be_bytes(); self.0.del(key)?; Ok(()) } } -pub struct DocumentAttrsWordsIter<'a>(sled::Iter<'a>); - -impl<'a> Iterator for DocumentAttrsWordsIter<'a> { - type Item = sled::Result<(SchemaAttr, fst::Set)>; - - fn next(&mut self) -> Option { - match self.0.next() { - Some(Ok((key, bytes))) => { - let slice: &[u8] = key.as_ref(); - let array = slice.try_into().unwrap(); - let key = DocumentAttrKey::from_be_bytes(array); - - let len = bytes.len(); - let value = bytes.into(); - let fst = fst::raw::Fst::from_shared_bytes(value, 0, len).unwrap(); - let set = fst::Set::from(fst); - - Some(Ok((key.attribute, set))) - }, - Some(Err(e)) => Some(Err(e.into())), - None => None, - } - } -} - #[derive(Clone)] pub struct DocumentsIndex(Arc); @@ -384,6 +349,18 @@ impl DocumentsIndex { Ok(()) } + pub fn del_all_document_fields(&self, id: DocumentId) -> sled::Result<()> { + let start = DocumentAttrKey::new(id, SchemaAttr::min()).to_be_bytes(); + let end = DocumentAttrKey::new(id, SchemaAttr::max()).to_be_bytes(); + let document_attrs = self.0.range(start..=end).keys(); + + for key in document_attrs { + self.0.del(key?)?; + } + + Ok(()) + } + pub fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter { let start = DocumentAttrKey::new(id, SchemaAttr::min()); let start = start.to_be_bytes(); @@ -571,7 +548,7 @@ impl<'a> DocumentsAddition<'a> { let lease_inner = self.inner.lease_inner(); let main = &lease_inner.raw.main; let words = &lease_inner.raw.words; - let attrs_words = &lease_inner.raw.attrs_words; + let docs_words = &lease_inner.raw.docs_words; let documents = &lease_inner.raw.documents; // 1. remove the previous documents match indexes @@ -584,10 +561,10 @@ impl<'a> DocumentsAddition<'a> { documents.set_document_field(id, attr, value)?; } - let Indexed { words_doc_indexes, docs_attrs_words } = self.indexer.build(); + let indexed = self.indexer.build(); let mut delta_words_builder = SetBuilder::memory(); - for (word, delta_set) in words_doc_indexes { + for (word, delta_set) in indexed.words_doc_indexes { delta_words_builder.insert(&word).unwrap(); let set = match words.doc_indexes(&word)? { @@ -598,8 +575,8 @@ impl<'a> DocumentsAddition<'a> { words.set_doc_indexes(&word, &set)?; } - for ((id, attr), words) in docs_attrs_words { - attrs_words.set_attr_words(id, attr, &words)?; + for (id, words) in indexed.docs_words { + docs_words.set_doc_words(id, &words)?; } let delta_words = delta_words_builder @@ -656,7 +633,7 @@ impl<'a> DocumentsDeletion<'a> { pub fn finalize(mut self) -> Result<(), Error> { let lease_inner = self.inner.lease_inner(); let main = &lease_inner.raw.main; - let attrs_words = &lease_inner.raw.attrs_words; + let docs_words = &lease_inner.raw.docs_words; let words = &lease_inner.raw.words; let documents = &lease_inner.raw.documents; @@ -666,26 +643,25 @@ impl<'a> DocumentsDeletion<'a> { SetBuf::new_unchecked(self.documents) }; - let mut words_attrs = HashMap::new(); + let mut words_document_ids = HashMap::new(); for id in idset.into_vec() { - for result in attrs_words.attrs_words(id) { - let (attr, words) = result?; + if let Some(words) = docs_words.doc_words(id)? { let mut stream = words.stream(); while let Some(word) = stream.next() { let word = word.to_vec(); - words_attrs.entry(word).or_insert_with(Vec::new).push((id, attr)); + words_document_ids.entry(word).or_insert_with(Vec::new).push(id); } } } let mut removed_words = BTreeSet::new(); - for (word, mut attrs) in words_attrs { - attrs.sort_unstable(); - attrs.dedup(); - let attrs = SetBuf::new_unchecked(attrs); + for (word, mut document_ids) in words_document_ids { + document_ids.sort_unstable(); + document_ids.dedup(); + let document_ids = SetBuf::new_unchecked(document_ids); if let Some(doc_indexes) = words.doc_indexes(&word)? { - let op = DifferenceByKey::new(&doc_indexes, &attrs, |d| d.document_id, |(id, _)| *id); + let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id); let doc_indexes = op.into_set_buf(); if !doc_indexes.is_empty() { @@ -696,9 +672,9 @@ impl<'a> DocumentsDeletion<'a> { } } - for (id, attr) in attrs.into_vec() { - documents.del_document_field(id, attr)?; - attrs_words.del_attr_words(id, attr)?; + for id in document_ids.into_vec() { + documents.del_all_document_fields(id)?; + docs_words.del_doc_words(id)?; } } diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs index 1aa3095c8..708aac14d 100644 --- a/meilidb-data/src/indexer.rs +++ b/meilidb-data/src/indexer.rs @@ -13,12 +13,12 @@ type Word = Vec; // TODO make it be a SmallVec pub struct Indexer { word_limit: usize, // the maximum number of indexed words words_doc_indexes: BTreeMap>, - docs_attrs_words: HashMap<(DocumentId, SchemaAttr), Vec>, + docs_words: HashMap>, } pub struct Indexed { pub words_doc_indexes: BTreeMap>, - pub docs_attrs_words: HashMap<(DocumentId, SchemaAttr), fst::Set>, + pub docs_words: HashMap, } impl Indexer { @@ -30,7 +30,7 @@ impl Indexer { Indexer { word_limit: limit, words_doc_indexes: BTreeMap::new(), - docs_attrs_words: HashMap::new(), + docs_words: HashMap::new(), } } @@ -42,7 +42,7 @@ impl Indexer { attr, self.word_limit, &mut self.words_doc_indexes, - &mut self.docs_attrs_words, + &mut self.docs_words, ); if !must_continue { break } @@ -60,7 +60,7 @@ impl Indexer { attr, self.word_limit, &mut self.words_doc_indexes, - &mut self.docs_attrs_words, + &mut self.docs_words, ); if !must_continue { break } @@ -76,16 +76,16 @@ impl Indexer { (word, SetBuf::new_unchecked(indexes)) }).collect(); - let docs_attrs_words = self.docs_attrs_words + let docs_words = self.docs_words .into_iter() - .map(|((id, attr), mut words)| { + .map(|(id, mut words)| { words.sort_unstable(); words.dedup(); - ((id, attr), fst::Set::from_iter(words).unwrap()) + (id, fst::Set::from_iter(words).unwrap()) }) .collect(); - Indexed { words_doc_indexes, docs_attrs_words } + Indexed { words_doc_indexes, docs_words } } } @@ -95,7 +95,7 @@ fn index_token( attr: SchemaAttr, word_limit: usize, words_doc_indexes: &mut BTreeMap>, - docs_attrs_words: &mut HashMap<(DocumentId, SchemaAttr), Vec>, + docs_words: &mut HashMap>, ) -> bool { if token.word_index >= word_limit { return false } @@ -106,7 +106,7 @@ fn index_token( Some(docindex) => { let word = Vec::from(token.word); words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex); - docs_attrs_words.entry((id, attr)).or_insert_with(Vec::new).push(word); + docs_words.entry(id).or_insert_with(Vec::new).push(word); }, None => return false, } @@ -119,7 +119,7 @@ fn index_token( Some(docindex) => { let word = Vec::from(token.word); words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex); - docs_attrs_words.entry((id, attr)).or_insert_with(Vec::new).push(word); + docs_words.entry(id).or_insert_with(Vec::new).push(word); }, None => return false, }