From ccded7b4299b5923c457147b6fea53e1f583ba77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Nov 2019 16:09:32 +0100 Subject: [PATCH 1/4] Improve the indexer to not deunicode before indexing Revert of #179 --- meilidb-core/src/raw_indexer.rs | 99 ++++++++++++--------------------- 1 file changed, 37 insertions(+), 62 deletions(-) diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 85adaf750..13c87f5c3 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -37,54 +37,8 @@ impl RawIndexer { pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize { let mut number_of_words = 0; - let lowercase_text = text.to_lowercase(); - let deunicoded = deunicode_with_tofu(&lowercase_text, ""); - // TODO compute the deunicoded version after the cjk check - let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded { - Some(deunicoded) - } else { - None - }; - let iter = Some(lowercase_text).into_iter().chain(next); - - for text in iter { - // we must not count 2 times the same words - number_of_words = 0; - - for token in Tokenizer::new(&text) { - let must_continue = index_token( - token, - id, - attr, - self.word_limit, - &self.stop_words, - &mut self.words_doc_indexes, - &mut self.docs_words, - ); - - if !must_continue { - break; - } - - number_of_words += 1; - } - } - - number_of_words - } - - pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I) - where - I: IntoIterator, - IT: Iterator + Clone, - { - // TODO serialize this to one call to the SeqTokenizer loop - - let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect(); - let iter = lowercased.iter().map(|t| t.as_str()); - - for token in SeqTokenizer::new(iter) { + for token in Tokenizer::new(text) { let must_continue = index_token( token, id, attr, self.word_limit, &self.stop_words, &mut self.words_doc_indexes, &mut self.docs_words, ); + number_of_words += 1; + if 
!must_continue { break; } } - let deunicoded: Vec<_> = lowercased - .into_iter() - .map(|lowercase_text| { - if lowercase_text.contains(is_cjk) { - return lowercase_text; - } - let deunicoded = deunicode_with_tofu(&lowercase_text, ""); - if lowercase_text != deunicoded { - deunicoded - } else { - lowercase_text - } - }) - .collect(); - let iter = deunicoded.iter().map(|t| t.as_str()); + number_of_words + } + pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I) + where + I: IntoIterator, + { + let iter = iter.into_iter(); for token in SeqTokenizer::new(iter) { let must_continue = index_token( token, @@ -170,6 +118,12 @@ fn index_token( return false; } + let lower = token.word.to_lowercase(); + let token = Token { + word: &lower, + ..token + }; + if !stop_words.contains(&token.word) { match token_to_docindex(id, attr, token) { Some(docindex) => { @@ -182,6 +136,27 @@ fn index_token( } None => return false, } + + if !lower.contains(is_cjk) { + let unidecoded = deunicode_with_tofu(&lower, ""); + if unidecoded != lower { + let token = Token { + word: &unidecoded, + ..token + }; + match token_to_docindex(id, attr, token) { + Some(docindex) => { + let word = Vec::from(token.word); + words_doc_indexes + .entry(word.clone()) + .or_insert_with(Vec::new) + .push(docindex); + docs_words.entry(id).or_insert_with(Vec::new).push(word); + } + None => return false, + } + } + } } true From de2b8672d48e4d75444cd34e321ba8d14f7e6465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Nov 2019 16:10:13 +0100 Subject: [PATCH 2/4] Make the tokenizer understand strange whitespaces/quotes --- meilidb-tokenizer/Cargo.toml | 1 + meilidb-tokenizer/src/lib.rs | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml index 06a3be9ad..97220ef24 100644 --- a/meilidb-tokenizer/Cargo.toml +++ b/meilidb-tokenizer/Cargo.toml @@ -5,4 +5,5 @@ authors = ["Kerollmops "] 
edition = "2018" [dependencies] +deunicode = "1.0.0" slice-group-by = "0.2.4" diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs index 106d0f91f..6bbcbea3d 100644 --- a/meilidb-tokenizer/src/lib.rs +++ b/meilidb-tokenizer/src/lib.rs @@ -1,4 +1,5 @@ use self::SeparatorCategory::*; +use deunicode::deunicode_char; use slice_group_by::StrGroupBy; use std::iter::Peekable; @@ -43,7 +44,10 @@ fn is_separator(c: char) -> bool { fn classify_separator(c: char) -> Option { match c { - ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft), + c if c.is_whitespace() => Some(Soft), // whitespaces + c if deunicode_char(c) == Some("'") => Some(Soft), // quotes + c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes + '-' | '_' | '\'' | ':' => Some(Soft), '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard), _ => None, } From 4571b80a4945f13b812956ebdb7cd5b7719da7cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Nov 2019 16:10:31 +0100 Subject: [PATCH 3/4] Update the tests --- meilidb-core/src/raw_indexer.rs | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 13c87f5c3..988a5182e 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -199,10 +199,8 @@ mod tests { assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); - - // with the ugly apostrophe... assert!(words_doc_indexes - .get(&"l’éteindre".to_owned().into_bytes()) + .get(&"éteindre".to_owned().into_bytes()) .is_some()); } @@ -223,10 +221,8 @@ mod tests { assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); - - // with the ugly apostrophe... 
assert!(words_doc_indexes - .get(&"l’éteindre".to_owned().into_bytes()) + .get(&"éteindre".to_owned().into_bytes()) .is_some()); } @@ -252,10 +248,8 @@ mod tests { assert!(words_doc_indexes.get(&b"ai"[..]).is_none()); assert!(words_doc_indexes.get(&b"de"[..]).is_none()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); - - // with the ugly apostrophe... assert!(words_doc_indexes - .get(&"l’éteindre".to_owned().into_bytes()) + .get(&"éteindre".to_owned().into_bytes()) .is_some()); } } From 3b1cbed2385edcc776a2b84b6c484a9fe25114a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Nov 2019 16:58:02 +0100 Subject: [PATCH 4/4] Check that the unidecoded words are not empty --- meilidb-core/src/raw_indexer.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 988a5182e..3e0f212f7 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -139,11 +139,12 @@ fn index_token( if !lower.contains(is_cjk) { let unidecoded = deunicode_with_tofu(&lower, ""); - if unidecoded != lower { + if unidecoded != lower && !unidecoded.is_empty() { let token = Token { word: &unidecoded, ..token }; + match token_to_docindex(id, attr, token) { Some(docindex) => { let word = Vec::from(token.word); @@ -252,4 +253,22 @@ mod tests { .get(&"éteindre".to_owned().into_bytes()) .is_some()); } + + #[test] + fn no_empty_unidecode() { + let mut indexer = RawIndexer::new(fst::Set::default()); + + let docid = DocumentId(0); + let attr = SchemaAttr(0); + let text = "🇯🇵"; + indexer.index_text(docid, attr, text); + + let Indexed { + words_doc_indexes, .. + } = indexer.build(); + + assert!(words_doc_indexes + .get(&"🇯🇵".to_owned().into_bytes()) + .is_some()); + } }