mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 22:14:32 +01:00
feat: Introduce the Indexer struct
This commit is contained in:
parent
7338e522bd
commit
25a4961453
@ -9,7 +9,9 @@ bincode = "1.1.2"
|
|||||||
hashbrown = { version = "0.1.8", features = ["serde"] }
|
hashbrown = { version = "0.1.8", features = ["serde"] }
|
||||||
linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
|
linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
|
||||||
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
|
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
|
||||||
|
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
|
||||||
ordered-float = { version = "1.0.2", features = ["serde"] }
|
ordered-float = { version = "1.0.2", features = ["serde"] }
|
||||||
|
sdset = "0.3.1"
|
||||||
serde = { version = "1.0.88", features = ["derive"] }
|
serde = { version = "1.0.88", features = ["derive"] }
|
||||||
serde_json = { version = "1.0.39", features = ["preserve_order"] }
|
serde_json = { version = "1.0.39", features = ["preserve_order"] }
|
||||||
sled = "0.22.1"
|
sled = "0.22.1"
|
||||||
|
84
meilidb-data/src/indexer.rs
Normal file
84
meilidb-data/src/indexer.rs
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::convert::TryFrom;
|
||||||
|
|
||||||
|
use meilidb_core::{DocumentId, DocIndex};
|
||||||
|
use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder};
|
||||||
|
use meilidb_tokenizer::{Tokenizer, SeqTokenizer, Token};
|
||||||
|
use crate::SchemaAttr;
|
||||||
|
|
||||||
|
use sdset::Set;
|
||||||
|
|
||||||
|
type Word = Vec<u8>; // TODO make it be a SmallVec
|
||||||
|
|
||||||
|
pub struct Indexer {
|
||||||
|
word_limit: usize, // the maximum number of indexed words
|
||||||
|
indexed: BTreeMap<Word, Vec<DocIndex>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Indexer {
|
||||||
|
pub fn new() -> Indexer {
|
||||||
|
Indexer {
|
||||||
|
word_limit: 1000,
|
||||||
|
indexed: BTreeMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
|
||||||
|
for token in Tokenizer::new(text) {
|
||||||
|
if token.word_index >= self.word_limit { break }
|
||||||
|
let docindex = match token_to_docindex(id, attr, token) {
|
||||||
|
Some(docindex) => docindex,
|
||||||
|
None => break,
|
||||||
|
};
|
||||||
|
|
||||||
|
let word = Vec::from(token.word);
|
||||||
|
self.indexed.entry(word).or_insert_with(Vec::new).push(docindex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
|
||||||
|
where I: IntoIterator<Item=&'a str>,
|
||||||
|
{
|
||||||
|
let iter = iter.into_iter();
|
||||||
|
for token in SeqTokenizer::new(iter) {
|
||||||
|
if token.word_index >= self.word_limit { break }
|
||||||
|
let docindex = match token_to_docindex(id, attr, token) {
|
||||||
|
Some(docindex) => docindex,
|
||||||
|
None => break,
|
||||||
|
};
|
||||||
|
|
||||||
|
let word = Vec::from(token.word);
|
||||||
|
self.indexed.entry(word).or_insert_with(Vec::new).push(docindex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn build(self) -> WordIndex {
|
||||||
|
let mut builder = WordIndexBuilder::new();
|
||||||
|
|
||||||
|
for (key, mut indexes) in self.indexed {
|
||||||
|
indexes.sort_unstable();
|
||||||
|
indexes.dedup();
|
||||||
|
|
||||||
|
let indexes = Set::new_unchecked(&indexes);
|
||||||
|
builder.insert(key, indexes).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.build()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn token_to_docindex<'a>(id: DocumentId, attr: SchemaAttr, token: Token<'a>) -> Option<DocIndex> {
|
||||||
|
let word_index = u16::try_from(token.word_index).ok()?;
|
||||||
|
let char_index = u16::try_from(token.char_index).ok()?;
|
||||||
|
let char_length = u16::try_from(token.word.chars().count()).ok()?;
|
||||||
|
|
||||||
|
let docindex = DocIndex {
|
||||||
|
document_id: id,
|
||||||
|
attribute: attr.0,
|
||||||
|
word_index: word_index,
|
||||||
|
char_index: char_index,
|
||||||
|
char_length: char_length,
|
||||||
|
};
|
||||||
|
|
||||||
|
Some(docindex)
|
||||||
|
}
|
@ -1,9 +1,11 @@
|
|||||||
mod database;
|
mod database;
|
||||||
pub mod schema;
|
mod indexer;
|
||||||
mod ranked_map;
|
|
||||||
mod number;
|
mod number;
|
||||||
|
mod ranked_map;
|
||||||
|
pub mod schema;
|
||||||
|
|
||||||
pub use self::database::{Database, Index};
|
pub use self::database::{Database, Index};
|
||||||
pub use self::schema::{Schema, SchemaAttr};
|
|
||||||
pub use self::ranked_map::RankedMap;
|
|
||||||
pub use self::number::Number;
|
pub use self::number::Number;
|
||||||
|
pub use self::ranked_map::RankedMap;
|
||||||
|
pub use self::schema::{Schema, SchemaAttr};
|
||||||
|
pub use self::indexer::Indexer;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user