// MeiliSearch/meilidb-core/src/raw_indexer.rs

use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;

use crate::{DocIndex, DocumentId};
use deunicode::deunicode_with_tofu;
use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
use sdset::SetBuf;

type Word = Vec<u8>; // TODO make it be a SmallVec
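
/// Accumulates the raw inverted index for a batch of documents: for each word,
/// the `DocIndex` entries recording where it appears, and for each document,
/// the list of words it contains. Indexing of an attribute stops at
/// `word_limit` words or at the first stop word. Finalized with `build`.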
pub struct RawIndexer {
    word_limit: usize, // the maximum number of indexed words
    stop_words: fst::Set,
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
    docs_words: HashMap<DocumentId, Vec<Word>>,
}
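
/// The finalized index produced by `RawIndexer::build`: sorted, deduplicated
/// `DocIndex` sets per word, and an `fst::Set` of the words of each document.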
pub struct Indexed {
    pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
    pub docs_words: HashMap<DocumentId, fst::Set>,
}

impl RawIndexer {
    pub fn new(stop_words: fst::Set) -> RawIndexer {
        RawIndexer::with_word_limit(stop_words, 1000)
    }

    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
        RawIndexer {
            word_limit: limit,
            stop_words,
            words_doc_indexes: BTreeMap::new(),
            docs_words: HashMap::new(),
        }
    }
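
    /// Indexes one text attribute of a document. The text is lowercased and,
    /// when it contains no CJK characters and differs from its deunicoded form,
    /// it is indexed a second time with non-ASCII characters folded by
    /// `deunicode_with_tofu`. Returns the number of words indexed in the last pass.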
    pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize {
        let mut number_of_words = 0;
        let lowercase_text = text.to_lowercase();
        let deunicoded = deunicode_with_tofu(&lowercase_text, "");

        // TODO compute the deunicoded version after the cjk check
        let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
            Some(deunicoded)
        } else {
            None
        };

        let iter = Some(lowercase_text).into_iter().chain(next);

        for text in iter {
            // we must not count the same words twice
            number_of_words = 0;

            for token in Tokenizer::new(&text) {
                let must_continue = index_token(
                    token,
                    id,
                    attr,
                    self.word_limit,
                    &self.stop_words,
                    &mut self.words_doc_indexes,
                    &mut self.docs_words,
                );

                if !must_continue {
                    break;
                }

                number_of_words += 1;
            }
        }

        number_of_words
    }
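
    /// Indexes an attribute given as a sequence of strings. The whole sequence is
    /// tokenized with `SeqTokenizer`, once on the lowercased strings and once on
    /// their deunicoded forms (strings containing CJK characters are left as is).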
    pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
    where
        I: IntoIterator<Item = &'a str, IntoIter = IT>,
        IT: Iterator<Item = &'a str> + Clone,
    {
        // TODO serialize this to one call to the SeqTokenizer loop
        let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
        let iter = lowercased.iter().map(|t| t.as_str());

        for token in SeqTokenizer::new(iter) {
            let must_continue = index_token(
                token,
                id,
                attr,
                self.word_limit,
                &self.stop_words,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            if !must_continue {
                break;
            }
        }

        let deunicoded: Vec<_> = lowercased
            .into_iter()
            .map(|lowercase_text| {
                if lowercase_text.contains(is_cjk) {
                    return lowercase_text;
                }
                let deunicoded = deunicode_with_tofu(&lowercase_text, "");
                if lowercase_text != deunicoded {
                    deunicoded
                } else {
                    lowercase_text
                }
            })
            .collect();
        let iter = deunicoded.iter().map(|t| t.as_str());

        for token in SeqTokenizer::new(iter) {
            let must_continue = index_token(
                token,
                id,
                attr,
                self.word_limit,
                &self.stop_words,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            if !must_continue {
                break;
            }
        }
    }
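
    /// Consumes the indexer and builds the final `Indexed`: the per-word doc
    /// indexes are sorted and deduplicated into `SetBuf`s, and each document's
    /// word list is sorted, deduplicated and stored as an `fst::Set`.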
    pub fn build(self) -> Indexed {
        let words_doc_indexes = self
            .words_doc_indexes
            .into_iter()
            .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
            .collect();

        let docs_words = self
            .docs_words
            .into_iter()
            .map(|(id, mut words)| {
                words.sort_unstable();
                words.dedup();
                (id, fst::Set::from_iter(words).unwrap())
            })
            .collect();

        Indexed {
            words_doc_indexes,
            docs_words,
        }
    }
}
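
/// Indexes a single token for the given document and attribute. Returns `false`
/// when indexing of the current attribute must stop: the token exceeds the word
/// limit, it is a stop word, or its positions do not fit in a `DocIndex`.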
fn index_token(
    token: Token,
    id: DocumentId,
    attr: SchemaAttr,
    word_limit: usize,
    stop_words: &fst::Set,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool {
    if token.word_index >= word_limit {
        return false;
    }

    if stop_words.contains(&token.word) {
        return false;
    }

    match token_to_docindex(id, attr, token) {
        Some(docindex) => {
            let word = Vec::from(token.word);
            words_doc_indexes
                .entry(word.clone())
                .or_insert_with(Vec::new)
                .push(docindex);
            docs_words.entry(id).or_insert_with(Vec::new).push(word);
        }
        None => return false,
    }

    true
}
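
/// Builds the `DocIndex` for a token, or returns `None` when the word index,
/// character index, or word length does not fit into a `u16`.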
fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
    let word_index = u16::try_from(token.word_index).ok()?;
    let char_index = u16::try_from(token.char_index).ok()?;
    let char_length = u16::try_from(token.word.chars().count()).ok()?;

    let docindex = DocIndex {
        document_id: id,
        attribute: attr.0,
        word_index,
        char_index,
        char_length,
    };

    Some(docindex)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn strange_apostrophe() {
        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let attr = SchemaAttr(0);
        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
        indexer.index_text(docid, attr, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());

        // with the ugly apostrophe...
        assert!(words_doc_indexes
            .get(&"l’éteindre".to_owned().into_bytes())
            .is_some());
    }

    #[test]
    fn strange_apostrophe_in_sequence() {
        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let attr = SchemaAttr(0);
        let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
        indexer.index_text_seq(docid, attr, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());

        // with the ugly apostrophe...
        assert!(words_doc_indexes
            .get(&"l’éteindre".to_owned().into_bytes())
            .is_some());
    }
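
    // Added illustration, not part of the original file: a minimal sketch of how
    // stop words interact with `index_token`. It assumes the tokenizer splits this
    // sentence on whitespace; note that hitting a stop word makes `index_token`
    // return `false`, so indexing of the attribute stops at that point.
    #[test]
    fn stop_word_is_not_indexed() {
        // A single-element set trivially satisfies fst's lexicographic-order requirement.
        let stop_words = fst::Set::from_iter(vec!["de"]).unwrap();
        let mut indexer = RawIndexer::new(stop_words);

        let docid = DocumentId(0);
        let attr = SchemaAttr(0);
        indexer.index_text(docid, attr, "la machine de la maison");

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        // words seen before the stop word are indexed...
        assert!(words_doc_indexes.get(&b"la"[..]).is_some());
        assert!(words_doc_indexes.get(&b"machine"[..]).is_some());
        // ...the stop word itself is not
        assert!(words_doc_indexes.get(&b"de"[..]).is_none());
    }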
}