mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 20:07:09 +02:00
Merge #3866

3866: Update charabia v0.8.0 r=dureuill a=ManyTheFish

# Pull Request

Update Charabia:
- enhance Japanese segmentation
- enhance Latin tokenization:
  - words containing `_` are now properly segmented into several words
  - brackets `{([])}` are no longer considered context separators, so words separated by brackets are now treated as close together by the proximity ranking rule

- fixes #3815
- fixes #3778
- fixes [product#151](https://github.com/meilisearch/product/discussions/151)

> Important note: float numbers are now segmented around the `.`, so `3.22` is segmented as [`3`, `.`, `22`]; however, the dot is not a hard separator, which means that searching for `3.22` still finds documents containing `3.22`.

Co-authored-by: ManyTheFish <many@meilisearch.com>
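To illustrate the new segmentation behavior, here is a minimal sketch using the same charabia API that the updated tests in this diff use (`TokenizerBuilder::default()`, `builder.build()`, `tokenizer.tokenize()`, `token.lemma()`); the input string is illustrative only and assumes `charabia = "0.8"` as a dependency, it is not taken from the PR:

```rust
use charabia::TokenizerBuilder;

fn main() {
    // Build a tokenizer with default settings, as the updated milli tests do.
    let mut builder = TokenizerBuilder::default();
    let tokenizer = builder.build();

    // With charabia v0.8.0, "snake_case" should be split on the underscore and
    // "3.22" around the dot, while the dot remains a soft separator so the
    // pieces stay adjacent for the proximity ranking rule.
    for token in tokenizer.tokenize("snake_case 3.22") {
        println!("{:?}", token.lemma());
    }
}
```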
Commit 661d1f90dc
14 changed files with 252 additions and 187 deletions
@@ -256,7 +256,8 @@ pub(crate) mod tests {
         let temp_index = temp_index_with_documents();
         let rtxn = temp_index.read_txn().unwrap();
         let mut ctx = SearchContext::new(&temp_index, &rtxn);
-        let tokenizer = TokenizerBuilder::new().build();
+        let mut builder = TokenizerBuilder::default();
+        let tokenizer = builder.build();
         let tokens = tokenizer.tokenize("split this world");
         let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
         let matching_words = MatchingWords::new(ctx, query_terms);
@@ -12,16 +12,16 @@ const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
 const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
 
 /// Structure used to build a Matcher allowing to customize formating tags.
-pub struct MatcherBuilder<'a, A> {
+pub struct MatcherBuilder<'m> {
     matching_words: MatchingWords,
-    tokenizer: Tokenizer<'a, 'a, A>,
+    tokenizer: Tokenizer<'m>,
     crop_marker: Option<String>,
     highlight_prefix: Option<String>,
     highlight_suffix: Option<String>,
 }
 
-impl<'a, A> MatcherBuilder<'a, A> {
-    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
+impl<'m> MatcherBuilder<'m> {
+    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self {
         Self {
             matching_words,
             tokenizer,
@@ -46,7 +46,7 @@ impl<'a, A> MatcherBuilder<'a, A> {
         self
     }
 
-    pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> {
+    pub fn build<'t>(&'m self, text: &'t str) -> Matcher<'t, 'm> {
         let crop_marker = match &self.crop_marker {
             Some(marker) => marker.as_str(),
             None => DEFAULT_CROP_MARKER,
@@ -103,17 +103,17 @@ pub struct MatchBounds {
 
 /// Structure used to analize a string, compute words that match,
 /// and format the source string, returning a highlighted and cropped sub-string.
-pub struct Matcher<'t, 'm, A> {
+pub struct Matcher<'t, 'm> {
     text: &'t str,
     matching_words: &'m MatchingWords,
-    tokenizer: &'m Tokenizer<'m, 'm, A>,
+    tokenizer: &'m Tokenizer<'m>,
     crop_marker: &'m str,
     highlight_prefix: &'m str,
     highlight_suffix: &'m str,
     matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
 }
 
-impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
+impl<'t> Matcher<'t, '_> {
     /// Iterates over tokens and save any of them that matches the query.
     fn compute_matches(&mut self) -> &mut Self {
         /// some words are counted as matches only if they are close together and in the good order,
@@ -503,7 +503,7 @@ mod tests {
     use crate::index::tests::TempIndex;
     use crate::{execute_search, SearchContext};
 
-    impl<'a> MatcherBuilder<'a, &[u8]> {
+    impl<'a> MatcherBuilder<'a> {
         fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
             let mut ctx = SearchContext::new(index, rtxn);
             let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
@@ -530,7 +530,7 @@ mod tests {
                 None => MatchingWords::default(),
             };
 
-            MatcherBuilder::new(matching_words, TokenizerBuilder::new().build())
+            MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer())
         }
     }
 
@@ -690,7 +690,7 @@ mod tests {
         // should crop the phrase instead of croping around the match.
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"… Split The World is a book written by Emily Henry…"
+            @"…Split The World is a book written by Emily Henry…"
         );
 
         // Text containing some matches.
@@ -7,7 +7,7 @@ use crate::{Result, SearchContext, MAX_WORD_LENGTH};
 /// Convert the tokenised search query into a list of located query terms.
 pub fn located_query_terms_from_tokens(
     ctx: &mut SearchContext,
-    query: NormalizedTokenIter<&[u8]>,
+    query: NormalizedTokenIter,
     words_limit: Option<usize>,
 ) -> Result<Vec<LocatedQueryTerm>> {
     let nbr_typos = number_of_typos_allowed(ctx)?;
@@ -303,7 +303,8 @@ mod tests {
 
     #[test]
     fn start_with_hard_separator() -> Result<()> {
-        let tokenizer = TokenizerBuilder::new().build();
+        let mut builder = TokenizerBuilder::default();
+        let tokenizer = builder.build();
         let tokens = tokenizer.tokenize(".");
         let index = temp_index_with_documents();
         let rtxn = index.read_txn()?;
@@ -128,10 +128,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         .map(|reader| (documents_ids, reader, script_language_docids))
 }
 
-fn extract_tokens_from_document<T: AsRef<[u8]>>(
+fn extract_tokens_from_document(
     obkv: &KvReader<FieldId>,
     searchable_fields: &Option<HashSet<FieldId>>,
-    tokenizer: &Tokenizer<T>,
+    tokenizer: &Tokenizer,
     max_positions_per_attributes: u32,
     buffers: &mut Buffers,
     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
@@ -1,18 +1,21 @@
 ---
 source: milli/src/update/index_documents/mod.rs
 ---
+0 [1, ]
 1 [2, ]
-10.0 [1, ]
+10 [1, ]
 12 [0, ]
 1344 [3, ]
 2 [0, ]
 23 [5, ]
-25.99 [2, ]
-3.5 [0, ]
+25 [2, ]
+3 [0, ]
 35 [5, ]
 4 [4, ]
 42 [0, 5, ]
 456 [1, ]
+5 [0, ]
+99 [2, ]
 adams [5, ]
 adventure [1, ]
 alice [2, ]
@@ -29,7 +32,7 @@ galaxy [5, ]
 guide [5, ]
 half [4, ]
 harry [4, ]
-hitchhiker' [5, ]
+hitchhiker [5, ]
 hobbit [3, ]
 in [2, ]
 j [3, 4, ]
@@ -1,19 +1,22 @@
 ---
 source: milli/src/update/index_documents/mod.rs
 ---
+0 [1, 7, ]
 1 [2, ]
-10.0 [1, 7, ]
+10 [1, 7, ]
 12 [0, 8, ]
 1344 [3, ]
 1813 [8, ]
 2 [0, 8, ]
 23 [5, ]
-25.99 [2, ]
-3.5 [0, 8, ]
+25 [2, ]
+3 [0, 8, ]
 35 [5, ]
 4 [4, 6, ]
 42 [0, 5, 8, ]
 456 [1, 7, ]
+5 [0, 8, ]
+99 [2, ]
 adams [5, ]
 adventure [1, 7, ]
 alice [2, ]
@@ -31,7 +34,7 @@ galaxy [5, ]
 guide [5, ]
 half [4, 6, ]
 harry [4, 6, ]
-hitchhiker' [5, ]
+hitchhiker [5, ]
 hobbit [3, ]
 in [2, ]
 j [3, 4, 6, 8, ]
@@ -1,7 +1,7 @@
 use std::collections::{BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;
 
-use charabia::{Tokenizer, TokenizerBuilder};
+use charabia::{Normalize, Tokenizer, TokenizerBuilder};
 use deserr::{DeserializeError, Deserr};
 use itertools::Itertools;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -423,6 +423,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         match self.stop_words {
             Setting::Set(ref stop_words) => {
                 let current = self.index.stop_words(self.wtxn)?;
+
+                // Apply an unlossy normalization on stop_words
+                let stop_words = stop_words
+                    .iter()
+                    .map(|w| w.as_str().normalize(&Default::default()).into_owned());
+
                 // since we can't compare a BTreeSet with an FST we are going to convert the
                 // BTreeSet to an FST and then compare bytes per bytes the two FSTs.
                 let fst = fst::Set::from_iter(stop_words)?;
@@ -446,7 +452,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
+                fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
                     tokenizer
                         .tokenize(text)
                         .filter_map(|token| {
@@ -647,7 +653,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
-                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
+                fn normalize(tokenizer: &Tokenizer, text: &str) -> String {
                     tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
                 }
 
@@ -1,27 +1,28 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
-1_36 [3, ]
-1_37 [4, ]
-1_38 [5, ]
-1_39 [6, ]
-1_40 [7, ]
-1_41 [8, ]
-1_42 [9, ]
-1_43 [10, ]
-1_44 [11, ]
-1_45 [12, ]
-1_46 [13, ]
-1_47 [14, ]
-1_5 [1, ]
-1_52 [15, ]
-1_57 [16, ]
-1_58 [17, ]
-1_68 [18, ]
-1_69 [19, ]
-1_7 [2, ]
-1_71 [21, ]
-2.2 [21, ]
+1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
+2 [21, ]
+36 [3, ]
+37 [4, ]
+38 [5, ]
+39 [6, ]
+40 [7, ]
+41 [8, ]
+42 [9, ]
+43 [10, ]
+44 [11, ]
+45 [12, ]
+46 [13, ]
+47 [14, ]
+5 [1, ]
+52 [15, ]
+57 [16, ]
+58 [17, ]
+68 [18, ]
+69 [19, ]
+7 [2, ]
+71 [21, ]
 abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
 aquarium [5, ]
 art [4, 5, 8, 9, 10, 12, 17, ]
@@ -1,4 +1,25 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
+1 1 36 [3, ]
+1 1 37 [4, ]
+1 1 38 [5, ]
+1 1 39 [6, ]
+1 1 40 [7, ]
+1 1 41 [8, ]
+1 1 42 [9, ]
+1 1 43 [10, ]
+1 1 44 [11, ]
+1 1 45 [12, ]
+1 1 46 [13, ]
+1 1 47 [14, ]
+1 1 5 [1, ]
+1 1 52 [15, ]
+1 1 57 [16, ]
+1 1 58 [17, ]
+1 1 68 [18, ]
+1 1 69 [19, ]
+1 1 7 [2, ]
+1 1 71 [21, ]
+1 2 2 [21, ]
 
@@ -1,31 +1,31 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
-1.2 [20, 22, ]
-1_36 [3, ]
-1_37 [4, ]
-1_38 [5, ]
-1_39 [6, ]
-1_4 [0, ]
-1_40 [7, ]
-1_41 [8, ]
-1_42 [9, ]
-1_43 [10, ]
-1_44 [11, ]
-1_45 [12, ]
-1_46 [13, ]
-1_47 [14, ]
-1_5 [1, ]
-1_52 [15, ]
-1_57 [16, ]
-1_58 [17, ]
-1_68 [18, ]
-1_69 [19, ]
-1_7 [2, ]
-1_70 [20, ]
-1_71 [21, ]
-1_72 [22, ]
-2.2 [21, ]
+1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ]
+2 [20, 21, 22, ]
+36 [3, ]
+37 [4, ]
+38 [5, ]
+39 [6, ]
+4 [0, ]
+40 [7, ]
+41 [8, ]
+42 [9, ]
+43 [10, ]
+44 [11, ]
+45 [12, ]
+46 [13, ]
+47 [14, ]
+5 [1, ]
+52 [15, ]
+57 [16, ]
+58 [17, ]
+68 [18, ]
+69 [19, ]
+7 [2, ]
+70 [20, ]
+71 [21, ]
+72 [22, ]
 abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
 aquarium [5, ]
 art [4, 5, 8, 9, 10, 12, 17, ]
@@ -1,4 +1,29 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
+1 1 2 [20, 22, ]
+1 1 36 [3, ]
+1 1 37 [4, ]
+1 1 38 [5, ]
+1 1 39 [6, ]
+1 1 4 [0, ]
+1 1 40 [7, ]
+1 1 41 [8, ]
+1 1 42 [9, ]
+1 1 43 [10, ]
+1 1 44 [11, ]
+1 1 45 [12, ]
+1 1 46 [13, ]
+1 1 47 [14, ]
+1 1 5 [1, ]
+1 1 52 [15, ]
+1 1 57 [16, ]
+1 1 58 [17, ]
+1 1 68 [18, ]
+1 1 69 [19, ]
+1 1 7 [2, ]
+1 1 70 [20, ]
+1 1 71 [21, ]
+1 1 72 [22, ]
+1 2 2 [21, ]
 