From 84845de9ef23db748e6854aa20263d9de7123f4d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 28 Jun 2023 18:52:32 +0200 Subject: [PATCH] Update Charabia --- Cargo.lock | 220 +++++++++--------- meilisearch/src/search.rs | 8 +- milli/Cargo.toml | 2 +- .../src/search/new/matches/matching_words.rs | 3 +- milli/src/search/new/matches/mod.rs | 22 +- .../src/search/new/query_term/parse_query.rs | 5 +- milli/src/search/new/tests/stop_words.rs | 14 +- .../extract/extract_docid_word_positions.rs | 4 +- milli/src/update/settings.rs | 12 +- 9 files changed, 150 insertions(+), 140 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ccf79f9a2..636066d84 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -152,7 +152,7 @@ dependencies = [ "pin-project-lite", "tokio-rustls 0.23.4", "tokio-util", - "webpki-roots", + "webpki-roots 0.22.6", ] [[package]] @@ -706,23 +706,24 @@ dependencies = [ [[package]] name = "charabia" version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413155d93157bff9130895c3bd83970ac7f35659ca57226a96aa35cf1e8e102c" dependencies = [ + "aho-corasick", "cow-utils", "csv", "deunicode", + "either", "finl_unicode", "fst", "irg-kvariants", "jieba-rs", - "lindera", + "lindera-core", + "lindera-dictionary", + "lindera-tokenizer", "once_cell", "pinyin", "serde", "slice-group-by", "unicode-normalization", - "unicode-segmentation", "wana_kana", "whatlang", ] @@ -2135,15 +2136,6 @@ dependencies = [ "simple_asn1", ] -[[package]] -name = "kanaria" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" -dependencies = [ - "bitflags", -] - [[package]] name = "language-tags" version = "0.3.2" @@ -2211,38 +2203,11 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "lindera" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72be283281bec2768687b1784be03a678609b51f2f90f6f9d9b4f07953e6dd25" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "encoding", - "kanaria", - "lindera-cc-cedict-builder", - "lindera-core", - "lindera-dictionary", - "lindera-filter", - "lindera-ipadic-builder", - "lindera-ko-dic-builder", - "lindera-unidic-builder", - "regex", - "serde", - "serde_json", - "thiserror", - "unicode-blocks", - "unicode-normalization", - "yada", -] - [[package]] name = "lindera-cc-cedict-builder" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10fbafd37adab44ccc2668a40fba2dbc4e665cb3c36018c15dfe2e2b830e28ce" +checksum = "4c6bf79b29a90bcd22036e494d6cc9ac3abe9ab604b21f3258ba6dc1ce501801" dependencies = [ "anyhow", "bincode", @@ -2259,9 +2224,9 @@ dependencies = [ [[package]] name = "lindera-compress" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9196bf5995503f6878a090dfee6114ba86430c72f67ef3624246b564869937" +checksum = "8f2e99e67736352bbb6ed1c273643975822505067ca32194b0981040bc50527a" dependencies = [ "anyhow", "flate2", @@ -2270,9 +2235,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5f0baa9932f682e9c5b388897330f155d3c40de80016e60125897fde5e0e246" +checksum = "7c3935e966409156f22cb4b334b21b0dce84b7aa1cad62214b466489d249c8e5" dependencies = [ "anyhow", "bincode", @@ -2287,9 +2252,9 @@ dependencies = [ [[package]] name = 
"lindera-decompress" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6e63fa6ef0bc3ce2c26d372aa6185b7a316194494a84f81678f5da2893bf4a2" +checksum = "7476406abb63c49d7f59c88b9b868ee8d2981495ea7e2c3ad129902f9916b3c6" dependencies = [ "anyhow", "flate2", @@ -2298,63 +2263,50 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd765c36166016de87a1f447ea971573e4c63e334836c46ad0020f0408c88bfc" +checksum = "808b7d2b3cabc25a4022526d484a4cfd1d5924dc76a26e0379707698841acef2" dependencies = [ "anyhow", "bincode", "byteorder", + "lindera-cc-cedict-builder", "lindera-core", - "lindera-ipadic", - "lindera-ko-dic", - "serde", -] - -[[package]] -name = "lindera-filter" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5345e37fb9521ab3cee19283bed135d46b3521dc1fd13a49fa0992379056203" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "kanaria", - "lindera-core", - "lindera-dictionary", - "once_cell", - "regex", - "serde", - "serde_json", - "unicode-blocks", - "unicode-normalization", - "unicode-segmentation", - "yada", -] - -[[package]] -name = "lindera-ipadic" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60eeb356295f784e7db4cfd2c6772f2bd059e565a7744e246642a07bc333a88a" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-decompress", "lindera-ipadic-builder", - "once_cell", - "tar", + "lindera-ipadic-neologd-builder", + "lindera-ko-dic", + "lindera-ko-dic-builder", + "lindera-unidic", + "lindera-unidic-builder", + "serde", ] [[package]] name = "lindera-ipadic-builder" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a16a2a88db9d956f5086bc976deb9951ca2dbbfef41a002df0a7bfb2c845aab" +checksum = "31f373a280958c930e5ee4a1e4db3a0ee0542afaf02d3b5cacb8cab4e298648e" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding_rs", + "encoding_rs_io", + "env_logger", + "glob", + "lindera-core", + "lindera-decompress", + "log", + "serde", + "yada", +] + +[[package]] +name = "lindera-ipadic-neologd-builder" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92eff98e9ed1a7a412b91709c2343457a04ef02fa0c27c27e3a5892f5591eae9" dependencies = [ "anyhow", "bincode", @@ -2364,7 +2316,6 @@ dependencies = [ "encoding_rs_io", "env_logger", "glob", - "lindera-compress", "lindera-core", "lindera-decompress", "log", @@ -2374,9 +2325,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb479b170a841b8cfbe602d772e30849ffe0562b219190a378368968b8c8f66" +checksum = "74c6d5bf7d8092bd6d10de7a5d74b70ea7cf234586235b0d6cdb903b05a6c9e2" dependencies = [ "bincode", "byteorder", @@ -2391,9 +2342,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b9b58213552560717c48e7833444a20d2d7fe26a6e565f7ce0cbbf85784c7cf" +checksum = "f0a4add6d3c1e41ec9e2690d33e287d0223fb59a30ccee4980c23f31368cae1e" dependencies = [ "anyhow", "bincode", @@ -2410,10 +2361,42 @@ dependencies = [ ] [[package]] -name = 
"lindera-unidic-builder" -version = "0.23.0" +name = "lindera-tokenizer" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6858147cdaf4a7b564c08a247449d3aca38e9b4812499651af08afbf85324596" +checksum = "cb6a8acbd068019d1cdac7316f0dcb87f8e33ede2b13aa237f45114f9750afb8" +dependencies = [ + "bincode", + "byteorder", + "lindera-core", + "lindera-dictionary", + "once_cell", + "serde", + "serde_json", +] + +[[package]] +name = "lindera-unidic" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14abf0613d350b30d3b0406a33b1de8fa8d829f26516909421702174785991c8" +dependencies = [ + "bincode", + "byteorder", + "encoding", + "lindera-core", + "lindera-decompress", + "lindera-unidic-builder", + "once_cell", + "ureq", + "zip", +] + +[[package]] +name = "lindera-unidic-builder" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e204ed53d9bd63227d1e6a6c1f122ca039e00a8634ac32e7fb0281eeec8615c4" dependencies = [ "anyhow", "bincode", @@ -2422,6 +2405,7 @@ dependencies = [ "encoding", "env_logger", "glob", + "lindera-compress", "lindera-core", "lindera-decompress", "log", @@ -3427,7 +3411,7 @@ dependencies = [ "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "webpki-roots", + "webpki-roots 0.22.6", "winreg", ] @@ -4210,12 +4194,6 @@ version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" -[[package]] -name = "unicode-blocks" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943e3f1f50cc455d072e0801ccb71ff893b0c88060b1169f92e35fb5bb881cc6" - [[package]] name = "unicode-ident" version = "1.0.9" @@ -4249,6 +4227,21 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +[[package]] +name = "ureq" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9" +dependencies = [ + "base64 0.21.2", + "log", + "once_cell", + "rustls 0.21.1", + "rustls-webpki", + "url", + "webpki-roots 0.23.1", +] + [[package]] name = "url" version = "2.3.1" @@ -4457,6 +4450,15 @@ dependencies = [ "webpki", ] +[[package]] +name = "webpki-roots" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338" +dependencies = [ + "rustls-webpki", +] + [[package]] name = "whatlang" version = "0.16.2" diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 87cfdadb3..7583c75fd 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -727,10 +727,10 @@ fn extract_field( } } -fn format_fields>( +fn format_fields( document: &Document, field_ids_map: &FieldsIdsMap, - builder: &MatcherBuilder<'_, A>, + builder: &MatcherBuilder, formatted_options: &BTreeMap, compute_matches: bool, displayable_ids: &BTreeSet, @@ -775,9 +775,9 @@ fn format_fields>( Ok((matches_position, document)) } -fn format_value>( +fn format_value( value: Value, - builder: &MatcherBuilder<'_, A>, + builder: &MatcherBuilder, format_options: Option, infos: &mut Vec, compute_matches: bool, diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 08f0c2645..57f6007be 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ bincode = 
"1.3.3" bstr = "1.4.0" bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] } byteorder = "1.4.3" -charabia = { version = "0.7.2", default-features = false } +charabia = { version = "0.8.1", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.8" deserr = "0.5.0" diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 0ba8b198e..2bac05934 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -256,7 +256,8 @@ pub(crate) mod tests { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); let mut ctx = SearchContext::new(&temp_index, &rtxn); - let tokenizer = TokenizerBuilder::new().build(); + let mut builder = TokenizerBuilder::default(); + let tokenizer = builder.build(); let tokens = tokenizer.tokenize("split this world"); let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); let matching_words = MatchingWords::new(ctx, query_terms); diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index ce28e16c1..72e155b3e 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -12,16 +12,16 @@ const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; /// Structure used to build a Matcher allowing to customize formating tags. -pub struct MatcherBuilder<'a, A> { +pub struct MatcherBuilder<'m> { matching_words: MatchingWords, - tokenizer: Tokenizer<'a, 'a, A>, + tokenizer: Tokenizer<'m>, crop_marker: Option, highlight_prefix: Option, highlight_suffix: Option, } -impl<'a, A> MatcherBuilder<'a, A> { - pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { +impl<'m> MatcherBuilder<'m> { + pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self { Self { matching_words, tokenizer, @@ -46,7 +46,7 @@ impl<'a, A> MatcherBuilder<'a, A> { self } - pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { + pub fn build<'t>(&'m self, text: &'t str) -> Matcher<'t, 'm> { let crop_marker = match &self.crop_marker { Some(marker) => marker.as_str(), None => DEFAULT_CROP_MARKER, @@ -103,17 +103,17 @@ pub struct MatchBounds { /// Structure used to analize a string, compute words that match, /// and format the source string, returning a highlighted and cropped sub-string. -pub struct Matcher<'t, 'm, A> { +pub struct Matcher<'t, 'm> { text: &'t str, matching_words: &'m MatchingWords, - tokenizer: &'m Tokenizer<'m, 'm, A>, + tokenizer: &'m Tokenizer<'m>, crop_marker: &'m str, highlight_prefix: &'m str, highlight_suffix: &'m str, matches: Option<(Vec>, Vec)>, } -impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { +impl<'t> Matcher<'t, '_> { /// Iterates over tokens and save any of them that matches the query. fn compute_matches(&mut self) -> &mut Self { /// some words are counted as matches only if they are close together and in the good order, @@ -503,7 +503,7 @@ mod tests { use crate::index::tests::TempIndex; use crate::{execute_search, SearchContext}; - impl<'a> MatcherBuilder<'a, &[u8]> { + impl<'a> MatcherBuilder<'a> { fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self { let mut ctx = SearchContext::new(index, rtxn); let crate::search::PartialSearchResult { located_query_terms, .. 
} = execute_search( @@ -530,7 +530,7 @@ mod tests { None => MatchingWords::default(), }; - MatcherBuilder::new(matching_words, TokenizerBuilder::new().build()) + MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer()) } } @@ -690,7 +690,7 @@ mod tests { // should crop the phrase instead of croping around the match. insta::assert_snapshot!( matcher.format(format_options), - @"… Split The World is a book written by Emily Henry…" + @"…Split The World is a book written by Emily Henry…" ); // Text containing some matches. diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 5e97d6578..64fe07a31 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -7,7 +7,7 @@ use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. pub fn located_query_terms_from_tokens( ctx: &mut SearchContext, - query: NormalizedTokenIter<&[u8]>, + query: NormalizedTokenIter, words_limit: Option, ) -> Result> { let nbr_typos = number_of_typos_allowed(ctx)?; @@ -303,7 +303,8 @@ mod tests { #[test] fn start_with_hard_separator() -> Result<()> { - let tokenizer = TokenizerBuilder::new().build(); + let mut builder = TokenizerBuilder::default(); + let tokenizer = builder.build(); let tokens = tokenizer.tokenize("."); let index = temp_index_with_documents(); let rtxn = index.read_txn()?; diff --git a/milli/src/search/new/tests/stop_words.rs b/milli/src/search/new/tests/stop_words.rs index 4ad587240..63bba3b3b 100644 --- a/milli/src/search/new/tests/stop_words.rs +++ b/milli/src/search/new/tests/stop_words.rs @@ -113,7 +113,7 @@ fn test_ignore_stop_words() { ), Position( Rank { - rank: 9, + rank: 7, max_rank: 11, }, ), @@ -166,7 +166,7 @@ fn test_ignore_stop_words() { ), Position( Rank { - rank: 9, + rank: 7, max_rank: 11, }, ), @@ -219,7 +219,7 @@ fn test_ignore_stop_words() { ), Position( Rank { - rank: 9, + rank: 7, max_rank: 11, }, ), @@ -259,7 +259,7 @@ fn test_ignore_stop_words() { ), Proximity( Rank { - rank: 7, + rank: 1, max_rank: 8, }, ), @@ -271,7 +271,7 @@ fn test_ignore_stop_words() { ), Position( Rank { - rank: 17, + rank: 15, max_rank: 21, }, ), @@ -411,7 +411,7 @@ fn test_stop_words_in_phrase() { ), Proximity( Rank { - rank: 6, + rank: 1, max_rank: 8, }, ), @@ -423,7 +423,7 @@ fn test_stop_words_in_phrase() { ), Position( Rank { - rank: 29, + rank: 27, max_rank: 31, }, ), diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 0cce91938..8985534db 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -128,10 +128,10 @@ pub fn extract_docid_word_positions( .map(|reader| (documents_ids, reader, script_language_docids)) } -fn extract_tokens_from_document>( +fn extract_tokens_from_document( obkv: &KvReader, searchable_fields: &Option>, - tokenizer: &Tokenizer, + tokenizer: &Tokenizer, max_positions_per_attributes: u32, buffers: &mut Buffers, script_language_word_count: &mut HashMap>, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 3e271924b..a1a2a57e4 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,7 +1,7 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::result::Result as StdResult; -use 
charabia::{Tokenizer, TokenizerBuilder};
+use charabia::{Normalize, Tokenizer, TokenizerBuilder};
 use deserr::{DeserializeError, Deserr};
 use itertools::Itertools;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -413,6 +413,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         match self.stop_words {
             Setting::Set(ref stop_words) => {
                 let current = self.index.stop_words(self.wtxn)?;
+
+                // Apply an unlossy normalization on stop_words
+                let stop_words = stop_words
+                    .iter()
+                    .map(|w| w.as_str().normalize(&Default::default()).into_owned());
+
                 // since we can't compare a BTreeSet with an FST we are going to convert the
                 // BTreeSet to an FST and then compare bytes per bytes the two FSTs.
                 let fst = fst::Set::from_iter(stop_words)?;
@@ -436,7 +442,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
+                fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
                     tokenizer
                         .tokenize(text)
                         .filter_map(|token| {
@@ -637,7 +643,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
-                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
+                fn normalize(tokenizer: &Tokenizer, text: &str) -> String {
                     tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
                 }
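
For context, the call-site churn in this patch follows one pattern: charabia 0.8 drops the A: AsRef<[u8]> type parameter from Tokenizer and, as the matching_words.rs hunk suggests, TokenizerBuilder::build() now borrows the builder instead of consuming it, so the builder has to be bound to a variable that outlives the tokenizer. A minimal sketch of that migration, assuming charabia 0.8.1 as pinned in milli/Cargo.toml; the commented stop-words call is a placeholder for builder configuration and is not taken from this patch:

use charabia::TokenizerBuilder;

fn main() {
    // charabia 0.7: `TokenizerBuilder::new().build()` could be chained inline.
    // charabia 0.8: bind the builder first and keep it alive while the
    // tokenizer it produced is in use, since `build()` borrows it.
    let mut builder = TokenizerBuilder::default();
    // Optional configuration would go here, e.g. a stop-word set (placeholder):
    // builder.stop_words(&stop_words);

    let tokenizer = builder.build();
    for token in tokenizer.tokenize("split this world") {
        println!("{:?}", token.lemma());
    }
}

The same API change is presumably why MatcherBuilder, format_fields, format_value and extract_tokens_from_document can shed their A: AsRef<[u8]> parameter in this diff: the 0.8 builder appears to keep that type internal rather than threading it through the tokenizer's signature.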