fix indexer tests

mpostma 2020-11-24 21:43:21 +01:00 committed by many
parent 5e00842087
commit 8843062604
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
7 changed files with 1155 additions and 1171 deletions

View File

@@ -26,6 +26,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 log = "0.4.11"
 meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" }
 meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" }
+meilisearch-tokenizer = { path = "../../Tokenizer" }
 meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" }
 once_cell = "1.5.2"
 ordered-float = { version = "2.0.1", features = ["serde"] }

View File

@@ -1,4 +1,4 @@
 mod dfa;
 pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};

File diff suppressed because it is too large

View File

@@ -9,7 +9,7 @@ use fst::{IntoStreamer, Streamer};
 use itertools::{EitherOrBoth, merge_join_by};
 use log::debug;
 use meilisearch_tokenizer::Token;
-use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
+use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
 use sdset::{Set, SetBuf, SetOperation};

 use crate::database::MainT;

View File

@@ -1,9 +1,10 @@
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;
+use std::println;

 use meilisearch_schema::IndexedPos;
-use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
+use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
 use meilisearch_tokenizer::Token;
 use sdset::SetBuf;
@@ -14,9 +15,8 @@ const WORD_LENGTH_LIMIT: usize = 80;

 type Word = Vec<u8>; // TODO make it be a SmallVec

-pub struct RawIndexer<A> {
+pub struct RawIndexer {
     word_limit: usize, // the maximum number of indexed words
-    stop_words: fst::Set<A>,
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
     analyzer: Analyzer,
@@ -27,28 +27,26 @@ pub struct Indexed<'a> {
     pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
 }

-impl<A> RawIndexer<A> {
-    pub fn new(stop_words: fst::Set<A>) -> RawIndexer<A> {
+impl RawIndexer {
+    pub fn new<A: AsRef<[u8]>>(stop_words: fst::Set<A>) -> RawIndexer {
         RawIndexer::with_word_limit(stop_words, 1000)
     }

-    pub fn with_word_limit(stop_words: fst::Set<A>, limit: usize) -> RawIndexer<A> {
+    pub fn with_word_limit<A: AsRef<[u8]>>(stop_words: fst::Set<A>, limit: usize) -> RawIndexer {
         RawIndexer {
             word_limit: limit,
-            stop_words,
             words_doc_indexes: BTreeMap::new(),
             docs_words: HashMap::new(),
-            analyzer: Analyzer::new(AnalyzerConfig::default()),
+            analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words.stream().into_strs().unwrap().into_iter().collect()))
         }
     }
-}

-impl<A: AsRef<[u8]>> RawIndexer<A> {
     pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
         let mut number_of_words = 0;

         let analyzed_text = self.analyzer.analyze(text);
-        for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| !t.is_separator()).enumerate() {
+        for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| t.is_word()).enumerate() {
+            print!("token: {}", token.word);
             let must_continue = index_token(
                 token,
                 token_index,
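The hunk above moves stop-word handling out of RawIndexer itself: the constructor now feeds the stop words to the tokenizer configuration instead of storing the fst::Set on the struct. A minimal sketch of that construction path, using only the calls that appear in this diff (fst's stream()/into_strs() and the tokenizer's AnalyzerConfig::default_with_stopwords); the helper name analyzer_with_stop_words is illustrative, and the concrete collection type expected by default_with_stopwords is left to type inference, exactly as in the patch:

    use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};

    // Turn the stop-word fst::Set into owned strings and hand them to the
    // analyzer configuration, so the analyzer can flag stop words itself.
    fn analyzer_with_stop_words<A: AsRef<[u8]>>(stop_words: &fst::Set<A>) -> Analyzer {
        let stop_words = stop_words
            .stream()
            .into_strs()   // stop words are UTF-8 keys, so this conversion should succeed
            .unwrap()
            .into_iter()
            .collect();
        Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words))
    }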
@@ -56,7 +54,6 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
                 id,
                 indexed_pos,
                 self.word_limit,
-                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -88,6 +85,7 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
             let tokens = analyzed_text
                 .tokens()
                 .enumerate()
+                .filter(|(_, t)| t.is_word())
                 .map(|(i, mut t)| {
                     t.byte_start = t.byte_start + current_byte_offset;
                     t.byte_end = t.byte_end + current_byte_offset;
@@ -103,12 +101,11 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
             let must_continue = index_token(
                 token,
-                token_index,
                 word_pos,
+                token_index,
                 id,
                 indexed_pos,
                 self.word_limit,
-                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -145,24 +142,23 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
     }
 }

-fn index_token<A>(
+fn index_token(
     token: Token,
     position: usize,
     word_pos: usize,
     id: DocumentId,
     indexed_pos: IndexedPos,
     word_limit: usize,
-    stop_words: &fst::Set<A>,
     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool
-where A: AsRef<[u8]>,
 {
-    if position >= word_limit {
+    println!(" position {}, limit: {}", position, word_limit);
+    if word_pos >= word_limit {
         return false;
     }

-    if !stop_words.contains(&token.word.as_ref()) {
+    if !token.is_stopword() {
         match token_to_docindex(id, indexed_pos, &token, word_pos) {
             Some(docindex) => {
                 let word = Vec::from(token.word.as_ref());
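Together, the filter change in index_text and the new index_token body shift both decisions onto the analyzer: whether a token is a word at all (is_word) and whether it is a stop word (is_stopword), so no fst::Set lookup remains in the indexer. A minimal sketch of the resulting filtering flow, restricted to calls shown in this diff (analyze, tokens, is_word, is_stopword, token.word); the function name indexable_words is illustrative, not part of the crate:

    // Collect the byte form of every token that should be indexed:
    // non-word tokens (separators) and stop words are dropped up front.
    fn indexable_words(analyzer: &Analyzer, text: &str) -> Vec<Vec<u8>> {
        let analyzed = analyzer.analyze(text);
        analyzed
            .tokens()
            .filter(|t| t.is_word())       // previously `!t.is_separator()`
            .filter(|t| !t.is_stopword())  // replaces the `stop_words.contains(...)` check
            .map(|t| Vec::from(t.word.as_ref()))
            .collect()
    }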
@@ -220,9 +216,6 @@ mod tests {
         assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
         assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-        assert!(words_doc_indexes
-            .get(&"éteindre".to_owned().into_bytes())
-            .is_some());
     }

     #[test]
@@ -242,9 +235,6 @@ mod tests {
         assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
         assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-        assert!(words_doc_indexes
-            .get(&"éteindre".to_owned().into_bytes())
-            .is_some());
     }

     #[test]
@@ -269,9 +259,6 @@ mod tests {
         assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
         assert!(words_doc_indexes.get(&b"de"[..]).is_none());
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-        assert!(words_doc_indexes
-            .get(&"éteindre".to_owned().into_bytes())
-            .is_some());
     }

     #[test]
@@ -303,7 +290,7 @@ mod tests {
         let Indexed {
             words_doc_indexes, ..
         } = indexer.build();
-        assert!(words_doc_indexes.get(&"buffering".to_owned().into_bytes()).is_some());
+        assert!(words_doc_indexes.get(&"request_buffering".to_owned().into_bytes()).is_some());
     }

     #[test]

View File

@@ -110,18 +110,17 @@ pub fn push_documents_addition<D: serde::Serialize>(
 }

 #[allow(clippy::too_many_arguments)]
-fn index_document<A>(
+fn index_document(
     writer: &mut heed::RwTxn<MainT>,
     documents_fields: DocumentsFields,
     documents_fields_counts: DocumentsFieldsCounts,
     ranked_map: &mut RankedMap,
-    indexer: &mut RawIndexer<A>,
+    indexer: &mut RawIndexer,
     schema: &Schema,
     field_id: FieldId,
     document_id: DocumentId,
     value: &Value,
 ) -> MResult<()>
-where A: AsRef<[u8]>,
 {
     let serialized = serde_json::to_vec(value)?;
     documents_fields.put_document_field(writer, document_id, field_id, &serialized)?;
@@ -373,14 +372,13 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
     Ok(())
 }

-pub fn write_documents_addition_index<A>(
+pub fn write_documents_addition_index(
     writer: &mut heed::RwTxn<MainT>,
     index: &store::Index,
     ranked_map: &RankedMap,
     number_of_inserted_documents: usize,
-    indexer: RawIndexer<A>,
+    indexer: RawIndexer,
 ) -> MResult<()>
-where A: AsRef<[u8]>,
 {
     let indexed = indexer.build();
     let mut delta_words_builder = SetBuilder::memory();

View File

@@ -12,13 +12,12 @@ use crate::serde::SerializerError;
 use crate::store::DiscoverIds;

 /// Returns the number of words indexed or `None` if the type is unindexable.
-pub fn index_value<A>(
-    indexer: &mut RawIndexer<A>,
+pub fn index_value(
+    indexer: &mut RawIndexer,
     document_id: DocumentId,
     indexed_pos: IndexedPos,
     value: &Value,
 ) -> Option<usize>
-where A: AsRef<[u8]>,
 {
     match value {
         Value::Null => None,
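The doc comment above states the contract of index_value: the number of words indexed, or None for unindexable types. As a hedged sketch of what such a dispatch can look like with the non-generic RawIndexer: only the signature and the Value::Null arm are taken from this diff, the remaining arms are illustrative assumptions rather than the crate's actual code, and the sketch assumes the types already imported by this file (serde_json::Value, DocumentId, IndexedPos, RawIndexer):

    pub fn index_value_sketch(
        indexer: &mut RawIndexer,
        document_id: DocumentId,
        indexed_pos: IndexedPos,
        value: &Value,
    ) -> Option<usize> {
        match value {
            // Null carries nothing to index (this arm is shown in the hunk above).
            Value::Null => None,
            // Strings are tokenized and indexed directly.
            Value::String(text) => Some(indexer.index_text(document_id, indexed_pos, text)),
            // Numbers and booleans could be indexed through their textual form.
            Value::Number(n) => Some(indexer.index_text(document_id, indexed_pos, &n.to_string())),
            Value::Bool(b) => Some(indexer.index_text(document_id, indexed_pos, &b.to_string())),
            // Arrays and objects would recurse over their elements; omitted here.
            _ => None,
        }
    }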