fix indexer tests

This commit is contained in:
mpostma 2020-11-24 21:43:21 +01:00 committed by many
parent 5e00842087
commit 8843062604
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
7 changed files with 1155 additions and 1171 deletions

View File

@ -26,6 +26,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
log = "0.4.11"
meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" }
meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" }
meilisearch-tokenizer = { path = "../../Tokenizer" }
meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" }
once_cell = "1.5.2"
ordered-float = { version = "2.0.1", features = ["serde"] }

View File

@ -1,4 +1,4 @@
mod dfa;
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};

File diff suppressed because it is too large Load Diff

View File

@ -9,7 +9,7 @@ use fst::{IntoStreamer, Streamer};
use itertools::{EitherOrBoth, merge_join_by};
use log::debug;
use meilisearch_tokenizer::Token;
use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use sdset::{Set, SetBuf, SetOperation};
use crate::database::MainT;

View File

@ -1,9 +1,10 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;
use std::println;
use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use meilisearch_tokenizer::Token;
use sdset::SetBuf;
@ -14,9 +15,8 @@ const WORD_LENGTH_LIMIT: usize = 80;
type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct RawIndexer<A> {
pub struct RawIndexer {
word_limit: usize, // the maximum number of indexed words
stop_words: fst::Set<A>,
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
analyzer: Analyzer,
@ -27,28 +27,26 @@ pub struct Indexed<'a> {
pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
}
impl<A> RawIndexer<A> {
pub fn new(stop_words: fst::Set<A>) -> RawIndexer<A> {
impl RawIndexer {
pub fn new<A: AsRef<[u8]>>(stop_words: fst::Set<A>) -> RawIndexer {
RawIndexer::with_word_limit(stop_words, 1000)
}
pub fn with_word_limit(stop_words: fst::Set<A>, limit: usize) -> RawIndexer<A> {
pub fn with_word_limit<A: AsRef<[u8]>>(stop_words: fst::Set<A>, limit: usize) -> RawIndexer {
RawIndexer {
word_limit: limit,
stop_words,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
analyzer: Analyzer::new(AnalyzerConfig::default()),
}
analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words.stream().into_strs().unwrap().into_iter().collect()))
}
}
impl<A: AsRef<[u8]>> RawIndexer<A> {
pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
let mut number_of_words = 0;
let analyzed_text = self.analyzer.analyze(text);
for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| !t.is_separator()).enumerate() {
for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| t.is_word()).enumerate() {
print!("token: {}", token.word);
let must_continue = index_token(
token,
token_index,
@ -56,7 +54,6 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
id,
indexed_pos,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
@ -88,6 +85,7 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
let tokens = analyzed_text
.tokens()
.enumerate()
.filter(|(_, t)| t.is_word())
.map(|(i, mut t)| {
t.byte_start = t.byte_start + current_byte_offset;
t.byte_end = t.byte_end + current_byte_offset;
@ -103,12 +101,11 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
let must_continue = index_token(
token,
token_index,
word_pos,
token_index,
id,
indexed_pos,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
@ -145,24 +142,23 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
}
}
fn index_token<A>(
fn index_token(
token: Token,
position: usize,
word_pos: usize,
id: DocumentId,
indexed_pos: IndexedPos,
word_limit: usize,
stop_words: &fst::Set<A>,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool
where A: AsRef<[u8]>,
{
if position >= word_limit {
println!(" position {}, limit: {}", position, word_limit);
if word_pos >= word_limit {
return false;
}
if !stop_words.contains(&token.word.as_ref()) {
if !token.is_stopword() {
match token_to_docindex(id, indexed_pos, &token, word_pos) {
Some(docindex) => {
let word = Vec::from(token.word.as_ref());
@ -220,9 +216,6 @@ mod tests {
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
@ -242,9 +235,6 @@ mod tests {
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
@ -269,9 +259,6 @@ mod tests {
assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
assert!(words_doc_indexes.get(&b"de"[..]).is_none());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
@ -303,7 +290,7 @@ mod tests {
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&"buffering".to_owned().into_bytes()).is_some());
assert!(words_doc_indexes.get(&"request_buffering".to_owned().into_bytes()).is_some());
}
#[test]

View File

@ -110,18 +110,17 @@ pub fn push_documents_addition<D: serde::Serialize>(
}
#[allow(clippy::too_many_arguments)]
fn index_document<A>(
fn index_document(
writer: &mut heed::RwTxn<MainT>,
documents_fields: DocumentsFields,
documents_fields_counts: DocumentsFieldsCounts,
ranked_map: &mut RankedMap,
indexer: &mut RawIndexer<A>,
indexer: &mut RawIndexer,
schema: &Schema,
field_id: FieldId,
document_id: DocumentId,
value: &Value,
) -> MResult<()>
where A: AsRef<[u8]>,
{
let serialized = serde_json::to_vec(value)?;
documents_fields.put_document_field(writer, document_id, field_id, &serialized)?;
@ -373,14 +372,13 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
Ok(())
}
pub fn write_documents_addition_index<A>(
pub fn write_documents_addition_index(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
ranked_map: &RankedMap,
number_of_inserted_documents: usize,
indexer: RawIndexer<A>,
indexer: RawIndexer,
) -> MResult<()>
where A: AsRef<[u8]>,
{
let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory();

View File

@ -12,13 +12,12 @@ use crate::serde::SerializerError;
use crate::store::DiscoverIds;
/// Returns the number of words indexed or `None` if the type is unindexable.
pub fn index_value<A>(
indexer: &mut RawIndexer<A>,
pub fn index_value(
indexer: &mut RawIndexer,
document_id: DocumentId,
indexed_pos: IndexedPos,
value: &Value,
) -> Option<usize>
where A: AsRef<[u8]>,
{
match value {
Value::Null => None,