diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs
index 85adaf750..3e0f212f7 100644
--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@@ -37,54 +37,8 @@ impl RawIndexer {
 
     pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize {
         let mut number_of_words = 0;
 
-        let lowercase_text = text.to_lowercase();
-        let deunicoded = deunicode_with_tofu(&lowercase_text, "");
-        // TODO compute the deunicoded version after the cjk check
-        let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
-            Some(deunicoded)
-        } else {
-            None
-        };
-        let iter = Some(lowercase_text).into_iter().chain(next);
-
-        for text in iter {
-            // we must not count 2 times the same words
-            number_of_words = 0;
-
-            for token in Tokenizer::new(&text) {
-                let must_continue = index_token(
-                    token,
-                    id,
-                    attr,
-                    self.word_limit,
-                    &self.stop_words,
-                    &mut self.words_doc_indexes,
-                    &mut self.docs_words,
-                );
-
-                if !must_continue {
-                    break;
-                }
-
-                number_of_words += 1;
-            }
-        }
-
-        number_of_words
-    }
-
-    pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
-    where
-        I: IntoIterator<Item = &'a str, IntoIter = IT>,
-        IT: Iterator<Item = &'a str> + Clone,
-    {
-        // TODO serialize this to one call to the SeqTokenizer loop
-
-        let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
-        let iter = lowercased.iter().map(|t| t.as_str());
-
-        for token in SeqTokenizer::new(iter) {
+        for token in Tokenizer::new(text) {
             let must_continue = index_token(
                 token,
                 id,
@@ -95,27 +49,21 @@ impl RawIndexer {
                 &mut self.docs_words,
             );
 
+            number_of_words += 1;
+
             if !must_continue {
                 break;
             }
         }
 
-        let deunicoded: Vec<_> = lowercased
-            .into_iter()
-            .map(|lowercase_text| {
-                if lowercase_text.contains(is_cjk) {
-                    return lowercase_text;
-                }
-                let deunicoded = deunicode_with_tofu(&lowercase_text, "");
-                if lowercase_text != deunicoded {
-                    deunicoded
-                } else {
-                    lowercase_text
-                }
-            })
-            .collect();
-        let iter = deunicoded.iter().map(|t| t.as_str());
+        number_of_words
+    }
 
+    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
+    where
+        I: IntoIterator<Item = &'a str>,
+    {
+        let iter = iter.into_iter();
         for token in SeqTokenizer::new(iter) {
             let must_continue = index_token(
                 token,
@@ -170,6 +118,12 @@ fn index_token(
         return false;
     }
 
+    let lower = token.word.to_lowercase();
+    let token = Token {
+        word: &lower,
+        ..token
+    };
+
     if !stop_words.contains(&token.word) {
         match token_to_docindex(id, attr, token) {
             Some(docindex) => {
@@ -182,6 +136,28 @@ fn index_token(
             }
             None => return false,
         }
+
+        if !lower.contains(is_cjk) {
+            let unidecoded = deunicode_with_tofu(&lower, "");
+            if unidecoded != lower && !unidecoded.is_empty() {
+                let token = Token {
+                    word: &unidecoded,
+                    ..token
+                };
+
+                match token_to_docindex(id, attr, token) {
+                    Some(docindex) => {
+                        let word = Vec::from(token.word);
+                        words_doc_indexes
+                            .entry(word.clone())
+                            .or_insert_with(Vec::new)
+                            .push(docindex);
+                        docs_words.entry(id).or_insert_with(Vec::new).push(word);
+                    }
+                    None => return false,
+                }
+            }
+        }
     }
 
     true
@@ -224,10 +200,8 @@ mod tests {
         assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
         assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-
-        // with the ugly apostrophe...
         assert!(words_doc_indexes
-            .get(&"l’éteindre".to_owned().into_bytes())
+            .get(&"éteindre".to_owned().into_bytes())
             .is_some());
     }
 
@@ -248,10 +222,8 @@ mod tests {
         assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
         assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-
-        // with the ugly apostrophe...
         assert!(words_doc_indexes
-            .get(&"l’éteindre".to_owned().into_bytes())
+            .get(&"éteindre".to_owned().into_bytes())
             .is_some());
     }
 
@@ -277,10 +249,26 @@ mod tests {
         assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
         assert!(words_doc_indexes.get(&b"de"[..]).is_none());
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-
-        // with the ugly apostrophe...
         assert!(words_doc_indexes
-            .get(&"l’éteindre".to_owned().into_bytes())
+            .get(&"éteindre".to_owned().into_bytes())
+            .is_some());
+    }
+
+    #[test]
+    fn no_empty_unidecode() {
+        let mut indexer = RawIndexer::new(fst::Set::default());
+
+        let docid = DocumentId(0);
+        let attr = SchemaAttr(0);
+        let text = "🇯🇵";
+        indexer.index_text(docid, attr, text);
+
+        let Indexed {
+            words_doc_indexes, ..
+        } = indexer.build();
+
+        assert!(words_doc_indexes
+            .get(&"🇯🇵".to_owned().into_bytes())
+            .is_some());
     }
 }
diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml
index 06a3be9ad..97220ef24 100644
--- a/meilidb-tokenizer/Cargo.toml
+++ b/meilidb-tokenizer/Cargo.toml
@@ -5,4 +5,5 @@ authors = ["Kerollmops "]
 edition = "2018"
 
 [dependencies]
+deunicode = "1.0.0"
 slice-group-by = "0.2.4"
diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs
index 106d0f91f..6bbcbea3d 100644
--- a/meilidb-tokenizer/src/lib.rs
+++ b/meilidb-tokenizer/src/lib.rs
@@ -1,4 +1,5 @@
 use self::SeparatorCategory::*;
+use deunicode::deunicode_char;
 use slice_group_by::StrGroupBy;
 use std::iter::Peekable;
 
@@ -43,7 +44,10 @@ fn is_separator(c: char) -> bool {
 
 fn classify_separator(c: char) -> Option<SeparatorCategory> {
     match c {
-        ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
+        c if c.is_whitespace() => Some(Soft), // whitespaces
+        c if deunicode_char(c) == Some("'") => Some(Soft), // quotes
+        c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes
+        '-' | '_' | '\'' | ':' => Some(Soft),
        '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
         _ => None,
     }
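
For context only, not part of the patch: the change moves lowercasing and deunicoding out of index_text/index_text_seq and into index_token, so each token is indexed in its lowercased form and, when the word is not CJK and its transliteration is non-empty and different, in a deunicoded form as well. Below is a minimal standalone sketch of that normalization order under stated assumptions: normalized_forms and is_cjk_char are hypothetical helpers for illustration (is_cjk_char is a rough stand-in for meilidb-tokenizer's is_cjk), and only deunicode_with_tofu comes from the deunicode crate the patch adds.

use deunicode::deunicode_with_tofu;

// Hypothetical, simplified stand-in for meilidb-tokenizer's `is_cjk`;
// only a few common CJK blocks are checked here.
fn is_cjk_char(c: char) -> bool {
    matches!(c,
        '\u{4E00}'..='\u{9FFF}'   // CJK Unified Ideographs
        | '\u{3040}'..='\u{30FF}' // Hiragana and Katakana
        | '\u{AC00}'..='\u{D7AF}' // Hangul Syllables
    )
}

// Returns the word forms a single token would be indexed under, mirroring the
// order used in `index_token`: lowercase first, then, for non-CJK words, the
// deunicoded form when it is non-empty and different from the lowercased one.
fn normalized_forms(word: &str) -> Vec<String> {
    let lower = word.to_lowercase();
    let mut forms = vec![lower.clone()];

    if !lower.contains(is_cjk_char) {
        let unidecoded = deunicode_with_tofu(&lower, "");
        // Skip empty transliterations (e.g. emoji-only tokens) and duplicates.
        if unidecoded != lower && !unidecoded.is_empty() {
            forms.push(unidecoded);
        }
    }

    forms
}

fn main() {
    println!("{:?}", normalized_forms("Éteindre")); // ["éteindre", "eteindre"]
    println!("{:?}", normalized_forms("hello"));    // ["hello"]
    println!("{:?}", normalized_forms("🇯🇵"));       // ["🇯🇵"] when the transliteration is empty
}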