3866: Update charabia v0.8.0 r=dureuill a=ManyTheFish

# Pull Request

Update Charabia:
- enhance Japanese segmentation
- enhance Latin tokenization
  - words containing `_` are now properly segmented into several words
  - brackets `{([])}` are no longer considered context separators, so words separated by brackets are now treated as close together by the proximity ranking rule
- fixes #3815
- fixes #3778
- fixes [product#151](https://github.com/meilisearch/product/discussions/151)

> Important note: floating-point numbers are now segmented around the `.`, so `3.22` is segmented as [`3`, `.`, `22`]. However, the dot isn't considered a hard separator, which means that searching for `3.22` still finds documents containing `3.22`. The sketch below illustrates this behavior.
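A minimal sketch of the new segmentation, using charabia's `Tokenize` trait on plain strings; the inputs and the expected outputs in the comments are illustrative assumptions, not taken from this PR's test suite:

```rust
use charabia::Tokenize;

/// Collect the lemma of every token, separators included.
fn lemmas(text: &str) -> Vec<String> {
    text.tokenize().map(|token| token.lemma().to_string()).collect()
}

fn main() {
    // `_` now splits words: expected ["snake", "_", "case"].
    println!("{:?}", lemmas("snake_case"));

    // Floats are split around the dot, which stays a soft separator, so the
    // parts remain adjacent for the proximity ranking rule: expected ["3", ".", "22"].
    println!("{:?}", lemmas("3.22"));

    // Brackets are soft separators too, so `foo` and `bar` stay close
    // together for proximity: expected ["foo", "(", "bar", ")"].
    println!("{:?}", lemmas("foo(bar)"));
}
```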

Co-authored-by: ManyTheFish <many@meilisearch.com>
commit 661d1f90dc
meili-bors[bot] 2023-06-29 15:24:36 +00:00, committed by GitHub
14 changed files with 252 additions and 187 deletions

@@ -128,10 +128,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         .map(|reader| (documents_ids, reader, script_language_docids))
 }
 
-fn extract_tokens_from_document<T: AsRef<[u8]>>(
+fn extract_tokens_from_document(
     obkv: &KvReader<FieldId>,
     searchable_fields: &Option<HashSet<FieldId>>,
-    tokenizer: &Tokenizer<T>,
+    tokenizer: &Tokenizer,
     max_positions_per_attributes: u32,
     buffers: &mut Buffers,
     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
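Charabia v0.8.0 makes `Tokenizer` a plain, non-generic type (the stop-words container is no longer a type parameter), which is why the `T: AsRef<[u8]>` bound disappears from `extract_tokens_from_document`. A minimal sketch of the new type in use; building the tokenizer with `TokenizerBuilder::default()` is an assumption based on charabia's documented API, not code from this diff:

```rust
use charabia::{Tokenizer, TokenizerBuilder};

// Under v0.8.0, a tokenizer can be passed around without type parameters.
fn count_tokens(tokenizer: &Tokenizer, text: &str) -> usize {
    tokenizer.tokenize(text).count()
}

fn main() {
    let mut builder = TokenizerBuilder::default();
    let tokenizer = builder.build();
    // "snake_case" should now yield ["snake", "_", "case"]: 3 tokens.
    println!("{}", count_tokens(&tokenizer, "snake_case"));
}
```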


@@ -1,18 +1,21 @@
 ---
 source: milli/src/update/index_documents/mod.rs
 ---
+0 [1, ]
 1 [2, ]
-10.0 [1, ]
+10 [1, ]
 12 [0, ]
 1344 [3, ]
 2 [0, ]
 23 [5, ]
-25.99 [2, ]
-3.5 [0, ]
+25 [2, ]
+3 [0, ]
 35 [5, ]
 4 [4, ]
 42 [0, 5, ]
 456 [1, ]
+5 [0, ]
+99 [2, ]
 adams [5, ]
 adventure [1, ]
 alice [2, ]
@@ -29,7 +32,7 @@ galaxy [5, ]
 guide [5, ]
 half [4, ]
 harry [4, ]
-hitchhiker' [5, ]
+hitchhiker [5, ]
 hobbit [3, ]
 in [2, ]
 j [3, 4, ]


@@ -1,19 +1,22 @@
 ---
 source: milli/src/update/index_documents/mod.rs
 ---
+0 [1, 7, ]
 1 [2, ]
-10.0 [1, 7, ]
+10 [1, 7, ]
 12 [0, 8, ]
 1344 [3, ]
 1813 [8, ]
 2 [0, 8, ]
 23 [5, ]
-25.99 [2, ]
-3.5 [0, 8, ]
+25 [2, ]
+3 [0, 8, ]
 35 [5, ]
 4 [4, 6, ]
 42 [0, 5, 8, ]
 456 [1, 7, ]
+5 [0, 8, ]
+99 [2, ]
 adams [5, ]
 adventure [1, 7, ]
 alice [2, ]
@@ -31,7 +34,7 @@ galaxy [5, ]
 guide [5, ]
 half [4, 6, ]
 harry [4, 6, ]
-hitchhiker' [5, ]
+hitchhiker [5, ]
 hobbit [3, ]
 in [2, ]
 j [3, 4, 6, 8, ]


@@ -1,7 +1,7 @@
 use std::collections::{BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;
 
-use charabia::{Tokenizer, TokenizerBuilder};
+use charabia::{Normalize, Tokenizer, TokenizerBuilder};
 use deserr::{DeserializeError, Deserr};
 use itertools::Itertools;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -423,6 +423,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         match self.stop_words {
             Setting::Set(ref stop_words) => {
                 let current = self.index.stop_words(self.wtxn)?;
+
+                // Apply an unlossy normalization on stop_words
+                let stop_words = stop_words
+                    .iter()
+                    .map(|w| w.as_str().normalize(&Default::default()).into_owned());
+
                 // since we can't compare a BTreeSet with an FST we are going to convert the
                 // BTreeSet to an FST and then compare bytes per bytes the two FSTs.
                 let fst = fst::Set::from_iter(stop_words)?;
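The rationale for the new normalization step: stop words are matched against tokens that were already normalized at indexing time, so they must undergo the same unlossy normalization before being stored in the FST, otherwise a stop word like `Été` would never match its indexed form. A rough sketch of the call used above; judging by the `.into_owned()` in the diff, `normalize` on `&str` appears to return a `Cow<str>`, but that return type is an assumption:

```rust
use charabia::Normalize;

fn main() {
    // Mirrors the call in the diff: normalize a raw `&str` with default
    // options, then take ownership of the result.
    let normalized = "Été".normalize(&Default::default()).into_owned();
    // Illustrative expectation: lowercased and de-accented, i.e. "ete".
    println!("{normalized}");
}
```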
@@ -446,7 +452,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
+                fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
                     tokenizer
                         .tokenize(text)
                         .filter_map(|token| {
@@ -647,7 +653,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
-                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
+                fn normalize(tokenizer: &Tokenizer, text: &str) -> String {
                     tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
                 }
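For illustration, a self-contained round trip through a helper of the same shape as the updated `normalize` above; the tokenizer construction and the expected output are assumptions for the sketch, not code from this PR:

```rust
use charabia::{Tokenizer, TokenizerBuilder};

// Same shape as the exact-words helper in the diff: tokenize, then
// concatenate the lemma of every token, separators included.
fn normalize(tokenizer: &Tokenizer, text: &str) -> String {
    tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
}

fn main() {
    let mut builder = TokenizerBuilder::default();
    let tokenizer = builder.build();
    // Illustrative expectation: lemmas are lowercased, so the exact word
    // "Hitchhiker" would be stored as "hitchhiker".
    println!("{}", normalize(&tokenizer, "Hitchhiker"));
}
```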


@@ -1,27 +1,28 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
-1_36 [3, ]
-1_37 [4, ]
-1_38 [5, ]
-1_39 [6, ]
-1_40 [7, ]
-1_41 [8, ]
-1_42 [9, ]
-1_43 [10, ]
-1_44 [11, ]
-1_45 [12, ]
-1_46 [13, ]
-1_47 [14, ]
-1_5 [1, ]
-1_52 [15, ]
-1_57 [16, ]
-1_58 [17, ]
-1_68 [18, ]
-1_69 [19, ]
-1_7 [2, ]
-1_71 [21, ]
-2.2 [21, ]
+1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
+2 [21, ]
+36 [3, ]
+37 [4, ]
+38 [5, ]
+39 [6, ]
+40 [7, ]
+41 [8, ]
+42 [9, ]
+43 [10, ]
+44 [11, ]
+45 [12, ]
+46 [13, ]
+47 [14, ]
+5 [1, ]
+52 [15, ]
+57 [16, ]
+58 [17, ]
+68 [18, ]
+69 [19, ]
+7 [2, ]
+71 [21, ]
 abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
 aquarium [5, ]
 art [4, 5, 8, 9, 10, 12, 17, ]


@@ -1,4 +1,25 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
+1 1 36 [3, ]
+1 1 37 [4, ]
+1 1 38 [5, ]
+1 1 39 [6, ]
+1 1 40 [7, ]
+1 1 41 [8, ]
+1 1 42 [9, ]
+1 1 43 [10, ]
+1 1 44 [11, ]
+1 1 45 [12, ]
+1 1 46 [13, ]
+1 1 47 [14, ]
+1 1 5 [1, ]
+1 1 52 [15, ]
+1 1 57 [16, ]
+1 1 58 [17, ]
+1 1 68 [18, ]
+1 1 69 [19, ]
+1 1 7 [2, ]
+1 1 71 [21, ]
+1 2 2 [21, ]


@@ -1,31 +1,31 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
-1.2 [20, 22, ]
-1_36 [3, ]
-1_37 [4, ]
-1_38 [5, ]
-1_39 [6, ]
-1_4 [0, ]
-1_40 [7, ]
-1_41 [8, ]
-1_42 [9, ]
-1_43 [10, ]
-1_44 [11, ]
-1_45 [12, ]
-1_46 [13, ]
-1_47 [14, ]
-1_5 [1, ]
-1_52 [15, ]
-1_57 [16, ]
-1_58 [17, ]
-1_68 [18, ]
-1_69 [19, ]
-1_7 [2, ]
-1_70 [20, ]
-1_71 [21, ]
-1_72 [22, ]
-2.2 [21, ]
+1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ]
+2 [20, 21, 22, ]
+36 [3, ]
+37 [4, ]
+38 [5, ]
+39 [6, ]
+4 [0, ]
+40 [7, ]
+41 [8, ]
+42 [9, ]
+43 [10, ]
+44 [11, ]
+45 [12, ]
+46 [13, ]
+47 [14, ]
+5 [1, ]
+52 [15, ]
+57 [16, ]
+58 [17, ]
+68 [18, ]
+69 [19, ]
+7 [2, ]
+70 [20, ]
+71 [21, ]
+72 [22, ]
 abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
 aquarium [5, ]
 art [4, 5, 8, 9, 10, 12, 17, ]


@@ -1,4 +1,29 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
+1 1 2 [20, 22, ]
+1 1 36 [3, ]
+1 1 37 [4, ]
+1 1 38 [5, ]
+1 1 39 [6, ]
+1 1 4 [0, ]
+1 1 40 [7, ]
+1 1 41 [8, ]
+1 1 42 [9, ]
+1 1 43 [10, ]
+1 1 44 [11, ]
+1 1 45 [12, ]
+1 1 46 [13, ]
+1 1 47 [14, ]
+1 1 5 [1, ]
+1 1 52 [15, ]
+1 1 57 [16, ]
+1 1 58 [17, ]
+1 1 68 [18, ]
+1 1 69 [19, ]
+1 1 7 [2, ]
+1 1 70 [20, ]
+1 1 71 [21, ]
+1 1 72 [22, ]
+1 2 2 [21, ]