Mirror of https://github.com/meilisearch/MeiliSearch, synced 2024-11-26 14:54:27 +01:00
Merge #736
736: Update charabia r=curquiza a=ManyTheFish

Update Charabia to the latest version.

> We are now romanizing Chinese characters into Pinyin. Note that we keep the accents because they are in fact never typed directly by the end user; moreover, changing an accent yields a different Chinese character, and I don't have sufficient knowledge to forecast the impact of removing accents in this context.

Co-authored-by: ManyTheFish <many@meilisearch.com>
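As a minimal sketch of the new behavior (assuming charabia 0.7 with its `chinese` feature enabled; the printed lemma is illustrative):

    use charabia::Tokenize;

    fn main() {
        // Chinese text is now normalized to Pinyin lemmas, accents kept,
        // so a word like "化妆包" is indexed under its romanized form
        // (e.g. "huàzhuāngbāo", per the test change further down).
        for token in "化妆包".tokenize() {
            println!("{}", token.lemma());
        }
    }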
This commit is contained in: commit 6a10e85707
@@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] }
 bincode = "1.3.3"
 bstr = "1.0.1"
 byteorder = "1.4.3"
-charabia = { version = "0.6.0", default-features = false }
+charabia = { version = "0.7.0", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.6"
 either = "1.8.0"
@@ -70,6 +70,10 @@ hebrew = ["charabia/hebrew"]
 
 # allow japanese specialized tokenization
 japanese = ["charabia/japanese"]
+japanese-transliteration = ["charabia/japanese-transliteration"]
+
+# allow korean specialized tokenization
+korean = ["charabia/korean"]
 
 # allow thai specialized tokenization
 thai = ["charabia/thai"]
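For a downstream crate, opting into the new tokenization features would look roughly like this (a hypothetical manifest entry; the git URL is an assumption, while the feature names come from the hunk above):

    [dependencies.milli]
    git = "https://github.com/meilisearch/milli"
    default-features = false
    # the two features added in this release
    features = ["japanese-transliteration", "korean"]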
@@ -14,14 +14,14 @@ const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
 /// Structure used to build a Matcher allowing to customize formating tags.
 pub struct MatcherBuilder<'a, A> {
     matching_words: MatchingWords,
-    tokenizer: Tokenizer<'a, A>,
+    tokenizer: Tokenizer<'a, 'a, A>,
     crop_marker: Option<String>,
     highlight_prefix: Option<String>,
     highlight_suffix: Option<String>,
 }
 
 impl<'a, A> MatcherBuilder<'a, A> {
-    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, A>) -> Self {
+    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
         Self {
             matching_words,
             tokenizer,
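Charabia 0.7 adds a second lifetime parameter to `Tokenizer`, which is why the struct and constructor signatures change here; construction through the builder is unaffected. A sketch (assuming charabia's `TokenizerBuilder` API and the `fst` crate for the stop-word set):

    use charabia::TokenizerBuilder;
    use fst::Set;

    fn main() {
        // The stop-word set is borrowed by the tokenizer; the second
        // lifetime in `Tokenizer<'a, 'a, A>` above tracks such borrows.
        let stop_words = Set::from_iter(["the"]).unwrap();
        let mut builder = TokenizerBuilder::new();
        builder.stop_words(&stop_words);
        let tokenizer = builder.build();
        for token in tokenizer.tokenize("the quick brown fox") {
            println!("{:?}: {}", token.kind, token.lemma());
        }
    }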
@@ -106,7 +106,7 @@ pub struct MatchBounds {
 pub struct Matcher<'t, 'm, A> {
     text: &'t str,
     matching_words: &'m MatchingWords,
-    tokenizer: &'m Tokenizer<'m, A>,
+    tokenizer: &'m Tokenizer<'m, 'm, A>,
     crop_marker: &'m str,
     highlight_prefix: &'m str,
     highlight_suffix: &'m str,
@@ -6,7 +6,7 @@ use std::hash::Hash;
 use std::rc::Rc;
 use std::{fmt, mem};
 
-use charabia::classifier::ClassifiedTokenIter;
+use charabia::normalizer::NormalizedTokenIter;
 use charabia::{SeparatorKind, TokenKind};
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
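The import changes because charabia 0.7 folds token classification into the normalization step, so the tokenizer hands back a `NormalizedTokenIter` whose tokens already carry a `TokenKind`. A rough sketch of consuming it (field and variant names per charabia's public API, as I understand it):

    use charabia::{Tokenize, TokenKind};

    fn main() {
        for token in "Hello, world!".tokenize() {
            match token.kind {
                TokenKind::Word => println!("word: {}", token.lemma()),
                TokenKind::Separator(kind) => println!("separator: {:?}", kind),
                _ => (), // stop words and unknown tokens
            }
        }
    }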
@@ -270,7 +270,7 @@ impl<'a> QueryTreeBuilder<'a> {
     /// (the criterion `typo` will be ignored)
     pub fn build<A: AsRef<[u8]>>(
         &self,
-        query: ClassifiedTokenIter<A>,
+        query: NormalizedTokenIter<A>,
     ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
         let primitive_query = create_primitive_query(query, self.words_limit);
         if !primitive_query.is_empty() {
@@ -778,7 +778,7 @@ impl PrimitiveQueryPart {
 /// Create primitive query from tokenized query string,
 /// the primitive query is an intermediate state to build the query tree.
 fn create_primitive_query<A>(
-    query: ClassifiedTokenIter<A>,
+    query: NormalizedTokenIter<A>,
     words_limit: Option<usize>,
 ) -> PrimitiveQuery
 where
@@ -892,7 +892,7 @@ mod test {
         terms_matching_strategy: TermsMatchingStrategy,
         authorize_typos: bool,
         words_limit: Option<usize>,
-        query: ClassifiedTokenIter<A>,
+        query: NormalizedTokenIter<A>,
     ) -> Result<Option<(Operation, PrimitiveQuery)>> {
         let primitive_query = create_primitive_query(query, words_limit);
         if !primitive_query.is_empty() {
@@ -1575,11 +1575,11 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
 
         // Only the first document should match.
-        let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len();
+        let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len();
         assert_eq!(count, 1);
 
         // Only the second document should match.
-        let count = index.word_docids.get(&rtxn, "包").unwrap().unwrap().len();
+        let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len();
         assert_eq!(count, 1);
 
         let mut search = crate::Search::new(&rtxn, &index);
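Since documents are now indexed under Pinyin lemmas, a hypothetical continuation of this test would query with the original Chinese text and rely on the same normalization happening at search time (`Search::query` and `execute` per milli's search API; the assertion is an assumption):

    let mut search = crate::Search::new(&rtxn, &index);
    search.query("化妆包");
    // The query is normalized to "huàzhuāngbāo", so only the
    // first document should match (hypothetical assertion).
    let result = search.execute().unwrap();
    assert_eq!(result.documents_ids.len(), 1);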