From 25a4961453c1057da2575bae71db591ee43b7b10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <renault.cle@gmail.com>
Date: Mon, 15 Apr 2019 15:16:53 +0200
Subject: [PATCH] feat: Introduce the Indexer struct

---
 meilidb-data/Cargo.toml     |  2 +
 meilidb-data/src/indexer.rs | 84 +++++++++++++++++++++++++++++++++++++
 meilidb-data/src/lib.rs     | 10 +++--
 3 files changed, 92 insertions(+), 4 deletions(-)
 create mode 100644 meilidb-data/src/indexer.rs

diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml
index d98b9c491..7c13e9f72 100644
--- a/meilidb-data/Cargo.toml
+++ b/meilidb-data/Cargo.toml
@@ -9,7 +9,9 @@ bincode = "1.1.2"
 hashbrown = { version = "0.1.8", features = ["serde"] }
 linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
 meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
 ordered-float = { version = "1.0.2", features = ["serde"] }
+sdset = "0.3.1"
 serde = { version = "1.0.88", features = ["derive"] }
 serde_json = { version = "1.0.39", features = ["preserve_order"] }
 sled = "0.22.1"
diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs
new file mode 100644
index 000000000..82a4ae156
--- /dev/null
+++ b/meilidb-data/src/indexer.rs
@@ -0,0 +1,84 @@
+use std::collections::BTreeMap;
+use std::convert::TryFrom;
+
+use meilidb_core::{DocumentId, DocIndex};
+use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder};
+use meilidb_tokenizer::{Tokenizer, SeqTokenizer, Token};
+use crate::SchemaAttr;
+
+use sdset::Set;
+
+type Word = Vec<u8>; // TODO make it be a SmallVec
+
+pub struct Indexer {
+    word_limit: usize, // the maximum number of indexed words
+    indexed: BTreeMap<Word, Vec<DocIndex>>,
+}
+
+impl Indexer {
+    pub fn new() -> Indexer {
+        Indexer {
+            word_limit: 1000,
+            indexed: BTreeMap::new(),
+        }
+    }
+
+    pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
+        for token in Tokenizer::new(text) {
+            if token.word_index >= self.word_limit { break }
+            let docindex = match token_to_docindex(id, attr, token) {
+                Some(docindex) => docindex,
+                None => break,
+            };
+
+            let word = Vec::from(token.word);
+            self.indexed.entry(word).or_insert_with(Vec::new).push(docindex);
+        }
+    }
+
+    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
+    where I: IntoIterator<Item=&'a str>,
+    {
+        let iter = iter.into_iter();
+        for token in SeqTokenizer::new(iter) {
+            if token.word_index >= self.word_limit { break }
+            let docindex = match token_to_docindex(id, attr, token) {
+                Some(docindex) => docindex,
+                None => break,
+            };
+
+            let word = Vec::from(token.word);
+            self.indexed.entry(word).or_insert_with(Vec::new).push(docindex);
+        }
+    }
+
+    pub fn build(self) -> WordIndex {
+        let mut builder = WordIndexBuilder::new();
+
+        for (key, mut indexes) in self.indexed {
+            indexes.sort_unstable();
+            indexes.dedup();
+
+            let indexes = Set::new_unchecked(&indexes);
+            builder.insert(key, indexes).unwrap();
+        }
+
+        builder.build()
+    }
+}
+
+fn token_to_docindex<'a>(id: DocumentId, attr: SchemaAttr, token: Token<'a>) -> Option<DocIndex> {
+    let word_index = u16::try_from(token.word_index).ok()?;
+    let char_index = u16::try_from(token.char_index).ok()?;
+    let char_length = u16::try_from(token.word.chars().count()).ok()?;
+
+    let docindex = DocIndex {
+        document_id: id,
+        attribute: attr.0,
+        word_index: word_index,
+        char_index: char_index,
+        char_length: char_length,
+    };
+
+    Some(docindex)
+}
diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs
index 96d6bdf6e..c601105ed 100644
--- a/meilidb-data/src/lib.rs
+++ b/meilidb-data/src/lib.rs
@@ -1,9 +1,11 @@
 mod database;
-pub mod schema;
-mod ranked_map;
+mod indexer;
 mod number;
+mod ranked_map;
+pub mod schema;
 
 pub use self::database::{Database, Index};
-pub use self::schema::{Schema, SchemaAttr};
-pub use self::ranked_map::RankedMap;
 pub use self::number::Number;
+pub use self::ranked_map::RankedMap;
+pub use self::schema::{Schema, SchemaAttr};
+pub use self::indexer::Indexer;