Mirror of https://github.com/meilisearch/MeiliSearch, synced 2024-11-23 13:24:27 +01:00
fix indexer tests
commit 8843062604 (parent 5e00842087)
@@ -26,6 +26,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 log = "0.4.11"
 meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" }
 meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" }
+meilisearch-tokenizer = { path = "../../Tokenizer" }
 meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" }
 once_cell = "1.5.2"
 ordered-float = { version = "2.0.1", features = ["serde"] }
@@ -1,4 +1,4 @@
 mod dfa;

 pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
File diff suppressed because it is too large
@@ -9,7 +9,7 @@ use fst::{IntoStreamer, Streamer};
 use itertools::{EitherOrBoth, merge_join_by};
 use log::debug;
 use meilisearch_tokenizer::Token;
-use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
+use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
 use sdset::{Set, SetBuf, SetOperation};

 use crate::database::MainT;
@@ -1,9 +1,10 @@
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;
+use std::println;

 use meilisearch_schema::IndexedPos;
-use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
+use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
 use meilisearch_tokenizer::Token;
 use sdset::SetBuf;

@@ -14,9 +15,8 @@ const WORD_LENGTH_LIMIT: usize = 80;

 type Word = Vec<u8>; // TODO make it be a SmallVec

-pub struct RawIndexer<A> {
+pub struct RawIndexer {
     word_limit: usize, // the maximum number of indexed words
-    stop_words: fst::Set<A>,
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
     analyzer: Analyzer,
@@ -27,28 +27,26 @@ pub struct Indexed<'a> {
     pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
 }

-impl<A> RawIndexer<A> {
-    pub fn new(stop_words: fst::Set<A>) -> RawIndexer<A> {
+impl RawIndexer {
+    pub fn new<A: AsRef<[u8]>>(stop_words: fst::Set<A>) -> RawIndexer {
         RawIndexer::with_word_limit(stop_words, 1000)
     }

-    pub fn with_word_limit(stop_words: fst::Set<A>, limit: usize) -> RawIndexer<A> {
+    pub fn with_word_limit<A: AsRef<[u8]>>(stop_words: fst::Set<A>, limit: usize) -> RawIndexer {
         RawIndexer {
             word_limit: limit,
-            stop_words,
             words_doc_indexes: BTreeMap::new(),
             docs_words: HashMap::new(),
-            analyzer: Analyzer::new(AnalyzerConfig::default()),
+            analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words.stream().into_strs().unwrap().into_iter().collect()))
         }
     }
-}

-impl<A: AsRef<[u8]>> RawIndexer<A> {
     pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
         let mut number_of_words = 0;

         let analyzed_text = self.analyzer.analyze(text);
-        for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| !t.is_separator()).enumerate() {
+        for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| t.is_word()).enumerate() {
+            print!("token: {}", token.word);
             let must_continue = index_token(
                 token,
                 token_index,
@@ -56,7 +54,6 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
                 id,
                 indexed_pos,
                 self.word_limit,
-                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -88,6 +85,7 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
         let tokens = analyzed_text
             .tokens()
             .enumerate()
+            .filter(|(_, t)| t.is_word())
             .map(|(i, mut t)| {
                 t.byte_start = t.byte_start + current_byte_offset;
                 t.byte_end = t.byte_end + current_byte_offset;
@@ -103,12 +101,11 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {

             let must_continue = index_token(
                 token,
-                token_index,
                 word_pos,
+                token_index,
                 id,
                 indexed_pos,
                 self.word_limit,
-                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -145,24 +142,23 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
     }
 }

-fn index_token<A>(
+fn index_token(
     token: Token,
     position: usize,
     word_pos: usize,
     id: DocumentId,
     indexed_pos: IndexedPos,
     word_limit: usize,
-    stop_words: &fst::Set<A>,
     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool
-where A: AsRef<[u8]>,
 {
-    if position >= word_limit {
+    println!(" position {}, limit: {}", position, word_limit);
+    if word_pos >= word_limit {
         return false;
     }

-    if !stop_words.contains(&token.word.as_ref()) {
+    if !token.is_stopword() {
         match token_to_docindex(id, indexed_pos, &token, word_pos) {
             Some(docindex) => {
                 let word = Vec::from(token.word.as_ref());
@@ -220,9 +216,6 @@ mod tests {
         assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
         assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-        assert!(words_doc_indexes
-            .get(&"éteindre".to_owned().into_bytes())
-            .is_some());
     }

     #[test]
@@ -242,9 +235,6 @@ mod tests {
         assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
         assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-        assert!(words_doc_indexes
-            .get(&"éteindre".to_owned().into_bytes())
-            .is_some());
     }

     #[test]
@@ -269,9 +259,6 @@ mod tests {
         assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
         assert!(words_doc_indexes.get(&b"de"[..]).is_none());
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-        assert!(words_doc_indexes
-            .get(&"éteindre".to_owned().into_bytes())
-            .is_some());
     }

     #[test]
@@ -303,7 +290,7 @@ mod tests {
         let Indexed {
             words_doc_indexes, ..
         } = indexer.build();
-        assert!(words_doc_indexes.get(&"buffering".to_owned().into_bytes()).is_some());
+        assert!(words_doc_indexes.get(&"request_buffering".to_owned().into_bytes()).is_some());
     }

     #[test]
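Taken together, the RawIndexer hunks above drop the generic stop-word set from the struct and hand it to the Analyzer once, at construction time. Below is a minimal usage sketch in the style of the existing tests; the import paths and the DocumentId/IndexedPos constructors are assumptions for illustration, not part of this commit:

    use meilisearch_schema::IndexedPos;

    use crate::raw_indexer::{Indexed, RawIndexer}; // assumed path inside the core crate
    use crate::DocumentId;                         // tuple-struct constructor assumed

    #[test]
    fn stop_words_go_through_the_analyzer() {
        // The fst set only seeds AnalyzerConfig::default_with_stopwords; it is
        // no longer stored on the struct or consulted by index_token.
        let stop_words = fst::Set::from_iter(vec!["de", "la"]).unwrap();
        let mut indexer = RawIndexer::new(stop_words);

        indexer.index_text(DocumentId(0), IndexedPos(0), "j'ai un aspirateur de compétition");

        let Indexed { words_doc_indexes, .. } = indexer.build();
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        // The stop word is dropped by the analyzer rather than by index_token.
        assert!(words_doc_indexes.get(&b"de"[..]).is_none());
    }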
@@ -110,18 +110,17 @@ pub fn push_documents_addition<D: serde::Serialize>(
 }

 #[allow(clippy::too_many_arguments)]
-fn index_document<A>(
+fn index_document(
     writer: &mut heed::RwTxn<MainT>,
     documents_fields: DocumentsFields,
     documents_fields_counts: DocumentsFieldsCounts,
     ranked_map: &mut RankedMap,
-    indexer: &mut RawIndexer<A>,
+    indexer: &mut RawIndexer,
     schema: &Schema,
     field_id: FieldId,
     document_id: DocumentId,
     value: &Value,
 ) -> MResult<()>
-where A: AsRef<[u8]>,
 {
     let serialized = serde_json::to_vec(value)?;
     documents_fields.put_document_field(writer, document_id, field_id, &serialized)?;
@@ -373,14 +372,13 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
     Ok(())
 }

-pub fn write_documents_addition_index<A>(
+pub fn write_documents_addition_index(
     writer: &mut heed::RwTxn<MainT>,
     index: &store::Index,
     ranked_map: &RankedMap,
     number_of_inserted_documents: usize,
-    indexer: RawIndexer<A>,
+    indexer: RawIndexer,
 ) -> MResult<()>
-where A: AsRef<[u8]>,
 {
     let indexed = indexer.build();
     let mut delta_words_builder = SetBuilder::memory();
@@ -12,13 +12,12 @@ use crate::serde::SerializerError;
 use crate::store::DiscoverIds;

 /// Returns the number of words indexed or `None` if the type is unindexable.
-pub fn index_value<A>(
-    indexer: &mut RawIndexer<A>,
+pub fn index_value(
+    indexer: &mut RawIndexer,
     document_id: DocumentId,
     indexed_pos: IndexedPos,
     value: &Value,
 ) -> Option<usize>
-where A: AsRef<[u8]>,
 {
     match value {
         Value::Null => None,
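With the type parameter gone from index_value as well, callers simply pass the concrete RawIndexer. A hedged call-site sketch follows; the helper name and the JSON value are illustrative, and the surrounding imports are assumed to match this module:

    // Illustrative only: the real call sites live elsewhere in the core crate.
    fn index_title_field(
        indexer: &mut RawIndexer,
        document_id: DocumentId,
        indexed_pos: IndexedPos,
    ) -> Option<usize> {
        let value = serde_json::json!("MeiliSearch is a search engine");
        index_value(indexer, document_id, indexed_pos, &value)
    }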