4169: update charabia r=curquiza a=ManyTheFish

Update Charabia to v0.8.5 and add the new khmer tokenizer

Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2023-10-26 17:21:30 +00:00 committed by GitHub
commit 9cacc82307
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 761 additions and 32 deletions

782
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -50,6 +50,7 @@ hebrew = ["milli/hebrew"]
japanese = ["milli/japanese"]
# thai specialized tokenization
thai = ["milli/thai"]
# allow greek specialized tokenization
greek = ["milli/greek"]
# allow khmer specialized tokenization
khmer = ["milli/khmer"]

View File

@ -150,6 +150,7 @@ hebrew = ["meilisearch-types/hebrew"]
japanese = ["meilisearch-types/japanese"]
thai = ["meilisearch-types/thai"]
greek = ["meilisearch-types/greek"]
khmer = ["meilisearch-types/khmer"]
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.11/build.zip"

View File

@ -17,7 +17,7 @@ bincode = "1.3.3"
bstr = "1.4.0"
bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
byteorder = "1.4.3"
charabia = { version = "0.8.3", default-features = false }
charabia = { version = "0.8.5", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.8"
deserr = { version = "0.6.0", features = ["actix-web"]}
@ -82,7 +82,7 @@ md5 = "0.7.0"
rand = { version = "0.8.5", features = ["small_rng"] }
[features]
all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek"]
all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek", "charabia/khmer"]
# Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml
@ -106,3 +106,6 @@ thai = ["charabia/thai"]
# allow greek specialized tokenization
greek = ["charabia/greek"]
# allow khmer specialized tokenization
khmer = ["charabia/khmer"]