mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 13:24:27 +01:00
fix indexer tests
This commit is contained in:
parent
5e00842087
commit
8843062604
@ -26,6 +26,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||
log = "0.4.11"
|
||||
meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" }
|
||||
meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" }
|
||||
meilisearch-tokenizer = { path = "../../Tokenizer" }
|
||||
meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" }
|
||||
once_cell = "1.5.2"
|
||||
ordered-float = { version = "2.0.1", features = ["serde"] }
|
||||
|
@ -1,4 +1,4 @@
|
||||
mod dfa;
|
||||
|
||||
|
||||
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -9,7 +9,7 @@ use fst::{IntoStreamer, Streamer};
|
||||
use itertools::{EitherOrBoth, merge_join_by};
|
||||
use log::debug;
|
||||
use meilisearch_tokenizer::Token;
|
||||
use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
|
||||
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
||||
use sdset::{Set, SetBuf, SetOperation};
|
||||
|
||||
use crate::database::MainT;
|
||||
|
@ -1,9 +1,10 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::convert::TryFrom;
|
||||
use std::println;
|
||||
|
||||
use meilisearch_schema::IndexedPos;
|
||||
use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
|
||||
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
||||
use meilisearch_tokenizer::Token;
|
||||
use sdset::SetBuf;
|
||||
|
||||
@ -14,9 +15,8 @@ const WORD_LENGTH_LIMIT: usize = 80;
|
||||
|
||||
type Word = Vec<u8>; // TODO make it be a SmallVec
|
||||
|
||||
pub struct RawIndexer<A> {
|
||||
pub struct RawIndexer {
|
||||
word_limit: usize, // the maximum number of indexed words
|
||||
stop_words: fst::Set<A>,
|
||||
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
|
||||
docs_words: HashMap<DocumentId, Vec<Word>>,
|
||||
analyzer: Analyzer,
|
||||
@ -27,28 +27,26 @@ pub struct Indexed<'a> {
|
||||
pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
|
||||
}
|
||||
|
||||
impl<A> RawIndexer<A> {
|
||||
pub fn new(stop_words: fst::Set<A>) -> RawIndexer<A> {
|
||||
impl RawIndexer {
|
||||
pub fn new<A: AsRef<[u8]>>(stop_words: fst::Set<A>) -> RawIndexer {
|
||||
RawIndexer::with_word_limit(stop_words, 1000)
|
||||
}
|
||||
|
||||
pub fn with_word_limit(stop_words: fst::Set<A>, limit: usize) -> RawIndexer<A> {
|
||||
pub fn with_word_limit<A: AsRef<[u8]>>(stop_words: fst::Set<A>, limit: usize) -> RawIndexer {
|
||||
RawIndexer {
|
||||
word_limit: limit,
|
||||
stop_words,
|
||||
words_doc_indexes: BTreeMap::new(),
|
||||
docs_words: HashMap::new(),
|
||||
analyzer: Analyzer::new(AnalyzerConfig::default()),
|
||||
analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words.stream().into_strs().unwrap().into_iter().collect()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<A: AsRef<[u8]>> RawIndexer<A> {
|
||||
pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
|
||||
let mut number_of_words = 0;
|
||||
|
||||
let analyzed_text = self.analyzer.analyze(text);
|
||||
for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| !t.is_separator()).enumerate() {
|
||||
for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| t.is_word()).enumerate() {
|
||||
print!("token: {}", token.word);
|
||||
let must_continue = index_token(
|
||||
token,
|
||||
token_index,
|
||||
@ -56,7 +54,6 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
|
||||
id,
|
||||
indexed_pos,
|
||||
self.word_limit,
|
||||
&self.stop_words,
|
||||
&mut self.words_doc_indexes,
|
||||
&mut self.docs_words,
|
||||
);
|
||||
@ -88,6 +85,7 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
|
||||
let tokens = analyzed_text
|
||||
.tokens()
|
||||
.enumerate()
|
||||
.filter(|(_, t)| t.is_word())
|
||||
.map(|(i, mut t)| {
|
||||
t.byte_start = t.byte_start + current_byte_offset;
|
||||
t.byte_end = t.byte_end + current_byte_offset;
|
||||
@ -103,12 +101,11 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
|
||||
|
||||
let must_continue = index_token(
|
||||
token,
|
||||
token_index,
|
||||
word_pos,
|
||||
token_index,
|
||||
id,
|
||||
indexed_pos,
|
||||
self.word_limit,
|
||||
&self.stop_words,
|
||||
&mut self.words_doc_indexes,
|
||||
&mut self.docs_words,
|
||||
);
|
||||
@ -145,24 +142,23 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
|
||||
}
|
||||
}
|
||||
|
||||
fn index_token<A>(
|
||||
fn index_token(
|
||||
token: Token,
|
||||
position: usize,
|
||||
word_pos: usize,
|
||||
id: DocumentId,
|
||||
indexed_pos: IndexedPos,
|
||||
word_limit: usize,
|
||||
stop_words: &fst::Set<A>,
|
||||
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
|
||||
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
|
||||
) -> bool
|
||||
where A: AsRef<[u8]>,
|
||||
{
|
||||
if position >= word_limit {
|
||||
println!(" position {}, limit: {}", position, word_limit);
|
||||
if word_pos >= word_limit {
|
||||
return false;
|
||||
}
|
||||
|
||||
if !stop_words.contains(&token.word.as_ref()) {
|
||||
if !token.is_stopword() {
|
||||
match token_to_docindex(id, indexed_pos, &token, word_pos) {
|
||||
Some(docindex) => {
|
||||
let word = Vec::from(token.word.as_ref());
|
||||
@ -220,9 +216,6 @@ mod tests {
|
||||
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
|
||||
assert!(words_doc_indexes
|
||||
.get(&"éteindre".to_owned().into_bytes())
|
||||
.is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -242,9 +235,6 @@ mod tests {
|
||||
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
|
||||
assert!(words_doc_indexes
|
||||
.get(&"éteindre".to_owned().into_bytes())
|
||||
.is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -269,9 +259,6 @@ mod tests {
|
||||
assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
|
||||
assert!(words_doc_indexes.get(&b"de"[..]).is_none());
|
||||
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
|
||||
assert!(words_doc_indexes
|
||||
.get(&"éteindre".to_owned().into_bytes())
|
||||
.is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -303,7 +290,7 @@ mod tests {
|
||||
let Indexed {
|
||||
words_doc_indexes, ..
|
||||
} = indexer.build();
|
||||
assert!(words_doc_indexes.get(&"buffering".to_owned().into_bytes()).is_some());
|
||||
assert!(words_doc_indexes.get(&"request_buffering".to_owned().into_bytes()).is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -110,18 +110,17 @@ pub fn push_documents_addition<D: serde::Serialize>(
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn index_document<A>(
|
||||
fn index_document(
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
documents_fields: DocumentsFields,
|
||||
documents_fields_counts: DocumentsFieldsCounts,
|
||||
ranked_map: &mut RankedMap,
|
||||
indexer: &mut RawIndexer<A>,
|
||||
indexer: &mut RawIndexer,
|
||||
schema: &Schema,
|
||||
field_id: FieldId,
|
||||
document_id: DocumentId,
|
||||
value: &Value,
|
||||
) -> MResult<()>
|
||||
where A: AsRef<[u8]>,
|
||||
{
|
||||
let serialized = serde_json::to_vec(value)?;
|
||||
documents_fields.put_document_field(writer, document_id, field_id, &serialized)?;
|
||||
@ -373,14 +372,13 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_documents_addition_index<A>(
|
||||
pub fn write_documents_addition_index(
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
index: &store::Index,
|
||||
ranked_map: &RankedMap,
|
||||
number_of_inserted_documents: usize,
|
||||
indexer: RawIndexer<A>,
|
||||
indexer: RawIndexer,
|
||||
) -> MResult<()>
|
||||
where A: AsRef<[u8]>,
|
||||
{
|
||||
let indexed = indexer.build();
|
||||
let mut delta_words_builder = SetBuilder::memory();
|
||||
|
@ -12,13 +12,12 @@ use crate::serde::SerializerError;
|
||||
use crate::store::DiscoverIds;
|
||||
|
||||
/// Returns the number of words indexed or `None` if the type is unindexable.
|
||||
pub fn index_value<A>(
|
||||
indexer: &mut RawIndexer<A>,
|
||||
pub fn index_value(
|
||||
indexer: &mut RawIndexer,
|
||||
document_id: DocumentId,
|
||||
indexed_pos: IndexedPos,
|
||||
value: &Value,
|
||||
) -> Option<usize>
|
||||
where A: AsRef<[u8]>,
|
||||
{
|
||||
match value {
|
||||
Value::Null => None,
|
||||
|
Loading…
Reference in New Issue
Block a user