mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 13:40:31 +01:00
feat: Introduce the DocumentsDeletion type
This commit is contained in:
parent
e67ada8823
commit
1f2abce7c3
@ -107,7 +107,10 @@ where S: Store,
|
|||||||
let is_exact = distance == 0 && input.len() == automaton.query_len();
|
let is_exact = distance == 0 && input.len() == automaton.query_len();
|
||||||
|
|
||||||
let doc_indexes = self.store.word_indexes(input)?;
|
let doc_indexes = self.store.word_indexes(input)?;
|
||||||
let doc_indexes = doc_indexes.expect("word doc-indexes not found");
|
let doc_indexes = match doc_indexes {
|
||||||
|
Some(doc_indexes) => doc_indexes,
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
for di in doc_indexes.as_slice() {
|
for di in doc_indexes.as_slice() {
|
||||||
if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
|
if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
|
||||||
|
@ -7,7 +7,6 @@ edition = "2018"
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
arc-swap = "0.3.11"
|
arc-swap = "0.3.11"
|
||||||
bincode = "1.1.2"
|
bincode = "1.1.2"
|
||||||
byteorder = "1.3.1"
|
|
||||||
deunicode = "1.0.0"
|
deunicode = "1.0.0"
|
||||||
hashbrown = { version = "0.2.2", features = ["serde"] }
|
hashbrown = { version = "0.2.2", features = ["serde"] }
|
||||||
linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
|
linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
|
||||||
|
@ -1,25 +1,22 @@
|
|||||||
use std::collections::{HashSet, HashMap};
|
use std::collections::{BTreeSet, HashSet, HashMap};
|
||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::io::{self, Cursor, BufRead};
|
|
||||||
use std::iter::FromIterator;
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
use std::{error, fmt};
|
use std::{error, fmt};
|
||||||
|
|
||||||
use arc_swap::{ArcSwap, Lease};
|
use arc_swap::{ArcSwap, Lease};
|
||||||
use byteorder::{ReadBytesExt, BigEndian};
|
|
||||||
use meilidb_core::{criterion::Criteria, QueryBuilder, Store, DocumentId, DocIndex};
|
use meilidb_core::{criterion::Criteria, QueryBuilder, Store, DocumentId, DocIndex};
|
||||||
use rmp_serde::decode::{Error as RmpError};
|
use rmp_serde::decode::{Error as RmpError};
|
||||||
use sdset::{Set, SetBuf, SetOperation, duo::Union};
|
use sdset::{Set, SetBuf, SetOperation, duo::{Union, DifferenceByKey}};
|
||||||
use serde::de;
|
use serde::de;
|
||||||
use sled::IVec;
|
use sled::IVec;
|
||||||
use zerocopy::{AsBytes, LayoutVerified};
|
use zerocopy::{AsBytes, LayoutVerified};
|
||||||
use fst::{SetBuilder, set::OpBuilder};
|
use fst::{SetBuilder, set::OpBuilder, Streamer};
|
||||||
|
|
||||||
use crate::{Schema, SchemaAttr, RankedMap};
|
use crate::{Schema, SchemaAttr, RankedMap};
|
||||||
use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError};
|
use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError};
|
||||||
use crate::indexer::Indexer;
|
use crate::indexer::{Indexer, Indexed};
|
||||||
use crate::document_attr_key::DocumentAttrKey;
|
use crate::document_attr_key::DocumentAttrKey;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@ -88,6 +85,22 @@ impl Database {
|
|||||||
Ok(Database { cache, inner })
|
Ok(Database { cache, inner })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn indexes(&self) -> Result<Option<HashSet<String>>, Error> {
|
||||||
|
let bytes = match self.inner.get("indexes")? {
|
||||||
|
Some(bytes) => bytes,
|
||||||
|
None => return Ok(None),
|
||||||
|
};
|
||||||
|
|
||||||
|
let indexes = bincode::deserialize(&bytes)?;
|
||||||
|
Ok(Some(indexes))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_indexes(&self, value: &HashSet<String>) -> Result<(), Error> {
|
||||||
|
let bytes = bincode::serialize(value)?;
|
||||||
|
self.inner.set("indexes", bytes)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub fn open_index(&self, name: &str) -> Result<Option<Arc<Index>>, Error> {
|
pub fn open_index(&self, name: &str) -> Result<Option<Arc<Index>>, Error> {
|
||||||
{
|
{
|
||||||
let cache = self.cache.read().unwrap();
|
let cache = self.cache.read().unwrap();
|
||||||
@ -102,14 +115,8 @@ impl Database {
|
|||||||
occupied.get().clone()
|
occupied.get().clone()
|
||||||
},
|
},
|
||||||
Entry::Vacant(vacant) => {
|
Entry::Vacant(vacant) => {
|
||||||
let bytes = match self.inner.get("indexes")? {
|
if !self.indexes()?.map_or(false, |x| !x.contains(name)) {
|
||||||
Some(bytes) => bytes,
|
return Ok(None)
|
||||||
None => return Ok(None),
|
|
||||||
};
|
|
||||||
|
|
||||||
let indexes: HashSet<&str> = bincode::deserialize(&bytes)?;
|
|
||||||
if indexes.get(name).is_none() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let main = {
|
let main = {
|
||||||
@ -123,13 +130,19 @@ impl Database {
|
|||||||
WordsIndex(tree)
|
WordsIndex(tree)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let attrs_words = {
|
||||||
|
let tree_name = format!("{}-attrs-words", name);
|
||||||
|
let tree = self.inner.open_tree(tree_name)?;
|
||||||
|
AttrsWords(tree)
|
||||||
|
};
|
||||||
|
|
||||||
let documents = {
|
let documents = {
|
||||||
let tree_name = format!("{}-documents", name);
|
let tree_name = format!("{}-documents", name);
|
||||||
let tree = self.inner.open_tree(tree_name)?;
|
let tree = self.inner.open_tree(tree_name)?;
|
||||||
DocumentsIndex(tree)
|
DocumentsIndex(tree)
|
||||||
};
|
};
|
||||||
|
|
||||||
let raw_index = RawIndex { main, words, documents };
|
let raw_index = RawIndex { main, words, attrs_words, documents };
|
||||||
let index = Index::from_raw(raw_index)?;
|
let index = Index::from_raw(raw_index)?;
|
||||||
|
|
||||||
vacant.insert(Arc::new(index)).clone()
|
vacant.insert(Arc::new(index)).clone()
|
||||||
@ -147,16 +160,6 @@ impl Database {
|
|||||||
occupied.get().clone()
|
occupied.get().clone()
|
||||||
},
|
},
|
||||||
Entry::Vacant(vacant) => {
|
Entry::Vacant(vacant) => {
|
||||||
let bytes = self.inner.get("indexes")?;
|
|
||||||
let bytes = bytes.as_ref();
|
|
||||||
|
|
||||||
let mut indexes: HashSet<&str> = match bytes {
|
|
||||||
Some(bytes) => bincode::deserialize(bytes)?,
|
|
||||||
None => HashSet::new(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let new_insertion = indexes.insert(name);
|
|
||||||
|
|
||||||
let main = {
|
let main = {
|
||||||
let tree = self.inner.open_tree(name)?;
|
let tree = self.inner.open_tree(name)?;
|
||||||
MainIndex(tree)
|
MainIndex(tree)
|
||||||
@ -168,19 +171,31 @@ impl Database {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
main.set_schema(&schema)?;
|
||||||
|
|
||||||
let words = {
|
let words = {
|
||||||
let tree_name = format!("{}-words", name);
|
let tree_name = format!("{}-words", name);
|
||||||
let tree = self.inner.open_tree(tree_name)?;
|
let tree = self.inner.open_tree(tree_name)?;
|
||||||
WordsIndex(tree)
|
WordsIndex(tree)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let attrs_words = {
|
||||||
|
let tree_name = format!("{}-attrs-words", name);
|
||||||
|
let tree = self.inner.open_tree(tree_name)?;
|
||||||
|
AttrsWords(tree)
|
||||||
|
};
|
||||||
|
|
||||||
let documents = {
|
let documents = {
|
||||||
let tree_name = format!("{}-documents", name);
|
let tree_name = format!("{}-documents", name);
|
||||||
let tree = self.inner.open_tree(tree_name)?;
|
let tree = self.inner.open_tree(tree_name)?;
|
||||||
DocumentsIndex(tree)
|
DocumentsIndex(tree)
|
||||||
};
|
};
|
||||||
|
|
||||||
let raw_index = RawIndex { main, words, documents };
|
let mut indexes = self.indexes()?.unwrap_or_else(HashSet::new);
|
||||||
|
indexes.insert(name.to_string());
|
||||||
|
self.set_indexes(&indexes)?;
|
||||||
|
|
||||||
|
let raw_index = RawIndex { main, words, attrs_words, documents };
|
||||||
let index = Index::from_raw(raw_index)?;
|
let index = Index::from_raw(raw_index)?;
|
||||||
|
|
||||||
vacant.insert(Arc::new(index)).clone()
|
vacant.insert(Arc::new(index)).clone()
|
||||||
@ -195,6 +210,7 @@ impl Database {
|
|||||||
pub struct RawIndex {
|
pub struct RawIndex {
|
||||||
pub main: MainIndex,
|
pub main: MainIndex,
|
||||||
pub words: WordsIndex,
|
pub words: WordsIndex,
|
||||||
|
pub attrs_words: AttrsWords,
|
||||||
pub documents: DocumentsIndex,
|
pub documents: DocumentsIndex,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -212,6 +228,13 @@ impl MainIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn set_schema(&self, schema: &Schema) -> Result<(), Error> {
|
||||||
|
let mut bytes = Vec::new();
|
||||||
|
schema.write_to_bin(&mut bytes)?;
|
||||||
|
self.0.set("schema", bytes)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub fn words_set(&self) -> Result<Option<fst::Set>, Error> {
|
pub fn words_set(&self) -> Result<Option<fst::Set>, Error> {
|
||||||
match self.0.get("words")? {
|
match self.0.get("words")? {
|
||||||
Some(bytes) => {
|
Some(bytes) => {
|
||||||
@ -263,16 +286,87 @@ impl WordsIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_doc_indexes(&self, word: &[u8], set: Option<&Set<DocIndex>>) -> sled::Result<()> {
|
pub fn set_doc_indexes(&self, word: &[u8], set: &Set<DocIndex>) -> sled::Result<()> {
|
||||||
match set {
|
self.0.set(word, set.as_bytes())?;
|
||||||
Some(set) => self.0.set(word, set.as_bytes())?,
|
Ok(())
|
||||||
None => self.0.del(word)?,
|
}
|
||||||
|
|
||||||
|
pub fn del_doc_indexes(&self, word: &[u8]) -> sled::Result<()> {
|
||||||
|
self.0.del(word)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct AttrsWords(Arc<sled::Tree>);
|
||||||
|
|
||||||
|
impl AttrsWords {
|
||||||
|
pub fn attr_words(&self, id: DocumentId, attr: SchemaAttr) -> Result<Option<fst::Set>, Error> {
|
||||||
|
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
|
||||||
|
match self.0.get(key)? {
|
||||||
|
Some(bytes) => {
|
||||||
|
let len = bytes.len();
|
||||||
|
let value = bytes.into();
|
||||||
|
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
|
||||||
|
Ok(Some(fst::Set::from(fst)))
|
||||||
|
},
|
||||||
|
None => Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn attrs_words(&self, id: DocumentId) -> DocumentAttrsWordsIter {
|
||||||
|
let start = DocumentAttrKey::new(id, SchemaAttr::min());
|
||||||
|
let start = start.to_be_bytes();
|
||||||
|
|
||||||
|
let end = DocumentAttrKey::new(id, SchemaAttr::max());
|
||||||
|
let end = end.to_be_bytes();
|
||||||
|
|
||||||
|
DocumentAttrsWordsIter(self.0.range(start..=end))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_attr_words(
|
||||||
|
&self,
|
||||||
|
id: DocumentId,
|
||||||
|
attr: SchemaAttr,
|
||||||
|
words: Option<&fst::Set>,
|
||||||
|
) -> Result<(), Error>
|
||||||
|
{
|
||||||
|
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
|
||||||
|
|
||||||
|
match words {
|
||||||
|
Some(words) => self.0.set(key, words.as_fst().as_bytes())?,
|
||||||
|
None => self.0.del(key)?,
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct DocumentAttrsWordsIter<'a>(sled::Iter<'a>);
|
||||||
|
|
||||||
|
impl<'a> Iterator for DocumentAttrsWordsIter<'a> {
|
||||||
|
type Item = sled::Result<(SchemaAttr, fst::Set)>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
match self.0.next() {
|
||||||
|
Some(Ok((key, bytes))) => {
|
||||||
|
let slice: &[u8] = key.as_ref();
|
||||||
|
let array = slice.try_into().unwrap();
|
||||||
|
let key = DocumentAttrKey::from_be_bytes(array);
|
||||||
|
|
||||||
|
let len = bytes.len();
|
||||||
|
let value = bytes.into();
|
||||||
|
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len).unwrap();
|
||||||
|
let set = fst::Set::from(fst);
|
||||||
|
|
||||||
|
Some(Ok((key.attribute, set)))
|
||||||
|
},
|
||||||
|
Some(Err(e)) => Some(Err(e.into())),
|
||||||
|
None => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct DocumentsIndex(Arc<sled::Tree>);
|
pub struct DocumentsIndex(Arc<sled::Tree>);
|
||||||
|
|
||||||
@ -288,6 +382,12 @@ impl DocumentsIndex {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn del_document_field(&self, id: DocumentId, attr: SchemaAttr) -> sled::Result<()> {
|
||||||
|
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
|
||||||
|
self.0.del(key)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter {
|
pub fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter {
|
||||||
let start = DocumentAttrKey::new(id, SchemaAttr::min());
|
let start = DocumentAttrKey::new(id, SchemaAttr::min());
|
||||||
let start = start.to_be_bytes();
|
let start = start.to_be_bytes();
|
||||||
@ -375,9 +475,7 @@ impl Index {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn documents_deletion(&self) -> DocumentsDeletion {
|
pub fn documents_deletion(&self) -> DocumentsDeletion {
|
||||||
// let index = self.0.clone();
|
DocumentsDeletion::new(self)
|
||||||
// DocumentsDeletion::from_raw(index)
|
|
||||||
unimplemented!()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn document<T>(
|
pub fn document<T>(
|
||||||
@ -467,11 +565,12 @@ impl<'a> DocumentsAddition<'a> {
|
|||||||
let lease_inner = self.inner.lease_inner();
|
let lease_inner = self.inner.lease_inner();
|
||||||
let main = &lease_inner.raw.main;
|
let main = &lease_inner.raw.main;
|
||||||
let words = &lease_inner.raw.words;
|
let words = &lease_inner.raw.words;
|
||||||
|
let attrs_words = &lease_inner.raw.attrs_words;
|
||||||
|
|
||||||
let delta_index = self.indexer.build();
|
let Indexed { words_doc_indexes, docs_attrs_words } = self.indexer.build();
|
||||||
let mut delta_words_builder = SetBuilder::memory();
|
let mut delta_words_builder = SetBuilder::memory();
|
||||||
|
|
||||||
for (word, delta_set) in delta_index {
|
for (word, delta_set) in words_doc_indexes {
|
||||||
delta_words_builder.insert(&word).unwrap();
|
delta_words_builder.insert(&word).unwrap();
|
||||||
|
|
||||||
let set = match words.doc_indexes(&word)? {
|
let set = match words.doc_indexes(&word)? {
|
||||||
@ -479,7 +578,11 @@ impl<'a> DocumentsAddition<'a> {
|
|||||||
None => delta_set,
|
None => delta_set,
|
||||||
};
|
};
|
||||||
|
|
||||||
words.set_doc_indexes(&word, Some(&set))?;
|
words.set_doc_indexes(&word, &set)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
for ((id, attr), words) in docs_attrs_words {
|
||||||
|
attrs_words.set_attr_words(id, attr, Some(&words))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let delta_words = delta_words_builder
|
let delta_words = delta_words_builder
|
||||||
@ -534,20 +637,83 @@ impl<'a> DocumentsDeletion<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn finalize(mut self) -> Result<(), Error> {
|
pub fn finalize(mut self) -> Result<(), Error> {
|
||||||
self.documents.sort_unstable();
|
let lease_inner = self.inner.lease_inner();
|
||||||
self.documents.dedup();
|
let main = &lease_inner.raw.main;
|
||||||
|
let attrs_words = &lease_inner.raw.attrs_words;
|
||||||
|
let words = &lease_inner.raw.words;
|
||||||
|
let documents = &lease_inner.raw.documents;
|
||||||
|
|
||||||
let idset = SetBuf::new_unchecked(self.documents);
|
let idset = {
|
||||||
|
self.documents.sort_unstable();
|
||||||
|
self.documents.dedup();
|
||||||
|
SetBuf::new_unchecked(self.documents)
|
||||||
|
};
|
||||||
|
|
||||||
// let index = self.inner.word_index();
|
let mut words_attrs = HashMap::new();
|
||||||
|
for id in idset.into_vec() {
|
||||||
|
for result in attrs_words.attrs_words(id) {
|
||||||
|
let (attr, words) = result?;
|
||||||
|
let mut stream = words.stream();
|
||||||
|
while let Some(word) = stream.next() {
|
||||||
|
let word = word.to_vec();
|
||||||
|
words_attrs.entry(word).or_insert_with(Vec::new).push((id, attr));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// let new_index = index.remove_documents(&idset)?;
|
let mut removed_words = BTreeSet::new();
|
||||||
// let new_index = Arc::from(new_index);
|
for (word, mut attrs) in words_attrs {
|
||||||
|
attrs.sort_unstable();
|
||||||
|
attrs.dedup();
|
||||||
|
let attrs = SetBuf::new_unchecked(attrs);
|
||||||
|
|
||||||
// self.inner.update_word_index(new_index);
|
if let Some(doc_indexes) = words.doc_indexes(&word)? {
|
||||||
|
let op = DifferenceByKey::new(&doc_indexes, &attrs, |d| d.document_id, |(id, _)| *id);
|
||||||
|
let doc_indexes = op.into_set_buf();
|
||||||
|
|
||||||
// Ok(())
|
if !doc_indexes.is_empty() {
|
||||||
|
words.set_doc_indexes(&word, &doc_indexes)?;
|
||||||
|
} else {
|
||||||
|
words.del_doc_indexes(&word)?;
|
||||||
|
removed_words.insert(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
unimplemented!("documents deletion finalize")
|
for (id, attr) in attrs.into_vec() {
|
||||||
|
documents.del_document_field(id, attr)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let removed_words = fst::Set::from_iter(removed_words).unwrap();
|
||||||
|
let words = match main.words_set()? {
|
||||||
|
Some(words_set) => {
|
||||||
|
let op = fst::set::OpBuilder::new()
|
||||||
|
.add(words_set.stream())
|
||||||
|
.add(removed_words.stream())
|
||||||
|
.difference();
|
||||||
|
|
||||||
|
let mut words_builder = SetBuilder::memory();
|
||||||
|
words_builder.extend_stream(op).unwrap();
|
||||||
|
words_builder
|
||||||
|
.into_inner()
|
||||||
|
.and_then(fst::Set::from_bytes)
|
||||||
|
.unwrap()
|
||||||
|
},
|
||||||
|
None => fst::Set::default(),
|
||||||
|
};
|
||||||
|
|
||||||
|
main.set_words_set(&words)?;
|
||||||
|
|
||||||
|
// TODO must update the ranked_map too!
|
||||||
|
|
||||||
|
// update the "consistent" view of the Index
|
||||||
|
let ranked_map = lease_inner.ranked_map.clone();
|
||||||
|
let schema = lease_inner.schema.clone();
|
||||||
|
let raw = lease_inner.raw.clone();
|
||||||
|
|
||||||
|
let inner = InnerIndex { words, schema, ranked_map, raw };
|
||||||
|
self.inner.0.store(Arc::new(inner));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::{BTreeMap, HashMap};
|
||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
|
|
||||||
use deunicode::deunicode_with_tofu;
|
use deunicode::deunicode_with_tofu;
|
||||||
use meilidb_core::{DocumentId, DocIndex, Store};
|
use meilidb_core::{DocumentId, DocIndex};
|
||||||
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
|
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
|
|
||||||
@ -12,27 +12,39 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
|
|||||||
|
|
||||||
pub struct Indexer {
|
pub struct Indexer {
|
||||||
word_limit: usize, // the maximum number of indexed words
|
word_limit: usize, // the maximum number of indexed words
|
||||||
indexed: BTreeMap<Word, Vec<DocIndex>>,
|
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
|
||||||
|
docs_attrs_words: HashMap<(DocumentId, SchemaAttr), Vec<Word>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct Indexed {
|
||||||
|
pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
|
||||||
|
pub docs_attrs_words: HashMap<(DocumentId, SchemaAttr), fst::Set>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Indexer {
|
impl Indexer {
|
||||||
pub fn new() -> Indexer {
|
pub fn new() -> Indexer {
|
||||||
Indexer {
|
Indexer::with_word_limit(1000)
|
||||||
word_limit: 1000,
|
|
||||||
indexed: BTreeMap::new(),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn with_word_limit(limit: usize) -> Indexer {
|
pub fn with_word_limit(limit: usize) -> Indexer {
|
||||||
Indexer {
|
Indexer {
|
||||||
word_limit: limit,
|
word_limit: limit,
|
||||||
indexed: BTreeMap::new(),
|
words_doc_indexes: BTreeMap::new(),
|
||||||
|
docs_attrs_words: HashMap::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
|
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
|
||||||
for token in Tokenizer::new(text) {
|
for token in Tokenizer::new(text) {
|
||||||
let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed);
|
let must_continue = index_token(
|
||||||
|
token,
|
||||||
|
id,
|
||||||
|
attr,
|
||||||
|
self.word_limit,
|
||||||
|
&mut self.words_doc_indexes,
|
||||||
|
&mut self.docs_attrs_words,
|
||||||
|
);
|
||||||
|
|
||||||
if !must_continue { break }
|
if !must_continue { break }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -42,17 +54,38 @@ impl Indexer {
|
|||||||
{
|
{
|
||||||
let iter = iter.into_iter();
|
let iter = iter.into_iter();
|
||||||
for token in SeqTokenizer::new(iter) {
|
for token in SeqTokenizer::new(iter) {
|
||||||
let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed);
|
let must_continue = index_token(
|
||||||
|
token,
|
||||||
|
id,
|
||||||
|
attr,
|
||||||
|
self.word_limit,
|
||||||
|
&mut self.words_doc_indexes,
|
||||||
|
&mut self.docs_attrs_words,
|
||||||
|
);
|
||||||
|
|
||||||
if !must_continue { break }
|
if !must_continue { break }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn build(self) -> BTreeMap<Word, SetBuf<DocIndex>> {
|
pub fn build(self) -> Indexed {
|
||||||
self.indexed.into_iter().map(|(word, mut indexes)| {
|
let words_doc_indexes = self.words_doc_indexes
|
||||||
indexes.sort_unstable();
|
.into_iter()
|
||||||
indexes.dedup();
|
.map(|(word, mut indexes)| {
|
||||||
(word, SetBuf::new_unchecked(indexes))
|
indexes.sort_unstable();
|
||||||
}).collect()
|
indexes.dedup();
|
||||||
|
(word, SetBuf::new_unchecked(indexes))
|
||||||
|
}).collect();
|
||||||
|
|
||||||
|
let docs_attrs_words = self.docs_attrs_words
|
||||||
|
.into_iter()
|
||||||
|
.map(|((id, attr), mut words)| {
|
||||||
|
words.sort_unstable();
|
||||||
|
words.dedup();
|
||||||
|
((id, attr), fst::Set::from_iter(words).unwrap())
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Indexed { words_doc_indexes, docs_attrs_words }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -61,7 +94,8 @@ fn index_token(
|
|||||||
id: DocumentId,
|
id: DocumentId,
|
||||||
attr: SchemaAttr,
|
attr: SchemaAttr,
|
||||||
word_limit: usize,
|
word_limit: usize,
|
||||||
indexed: &mut BTreeMap<Word, Vec<DocIndex>>,
|
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
|
||||||
|
docs_attrs_words: &mut HashMap<(DocumentId, SchemaAttr), Vec<Word>>,
|
||||||
) -> bool
|
) -> bool
|
||||||
{
|
{
|
||||||
if token.word_index >= word_limit { return false }
|
if token.word_index >= word_limit { return false }
|
||||||
@ -71,7 +105,8 @@ fn index_token(
|
|||||||
match token_to_docindex(id, attr, token) {
|
match token_to_docindex(id, attr, token) {
|
||||||
Some(docindex) => {
|
Some(docindex) => {
|
||||||
let word = Vec::from(token.word);
|
let word = Vec::from(token.word);
|
||||||
indexed.entry(word).or_insert_with(Vec::new).push(docindex);
|
words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
|
||||||
|
docs_attrs_words.entry((id, attr)).or_insert_with(Vec::new).push(word);
|
||||||
},
|
},
|
||||||
None => return false,
|
None => return false,
|
||||||
}
|
}
|
||||||
@ -83,7 +118,8 @@ fn index_token(
|
|||||||
match token_to_docindex(id, attr, token) {
|
match token_to_docindex(id, attr, token) {
|
||||||
Some(docindex) => {
|
Some(docindex) => {
|
||||||
let word = Vec::from(token.word);
|
let word = Vec::from(token.word);
|
||||||
indexed.entry(word).or_insert_with(Vec::new).push(docindex);
|
words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
|
||||||
|
docs_attrs_words.entry((id, attr)).or_insert_with(Vec::new).push(word);
|
||||||
},
|
},
|
||||||
None => return false,
|
None => return false,
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user