mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-10 06:58:56 +01:00
39407885c2
3347: Enhance language detection r=irevoire a=ManyTheFish ## Summary Some completely unrelated Languages can share the same characters, in Meilisearch we detect the Languages using `whatlang`, which works well on large texts but fails on small search queries leading to a bad segmentation and normalization of the query. This PR now stores the Languages detected during the indexing in order to reduce the Languages list that can be detected during the search. ## Detail - Create a 19th database mapping the scripts and the Languages detected with the documents where the Language is detected - Fill the newly created database during indexing - Create an allow-list with this database and pass it to Charabia - Add a test ensuring that a Japanese request containing kanjis only is detected as Japanese and not Chinese ## Related issues Fixes #2403 Fixes #3513 Co-authored-by: f3r10 <frledesma@outlook.com> Co-authored-by: ManyTheFish <many@meilisearch.com> Co-authored-by: Many the fish <many@meilisearch.com>
92 lines
2.5 KiB
TOML
92 lines
2.5 KiB
TOML
[package]
name = "milli"
# The edition is set locally; the workspace-inherited variant is left
# commented out further down.
edition = "2018"
publish = false

# Metadata inherited from the workspace manifest.
version.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
readme.workspace = true
# edition.workspace = true
license.workspace = true

[dependencies]
bimap = { version = "0.6.2", features = ["serde"] }
bincode = "1.3.3"
bstr = "1.0.1"
byteorder = "1.4.3"
# Default features are disabled here; this crate's `default` feature
# turns `charabia/default` back on.
charabia = { version = "0.7.1", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.6"
csv = "1.1.6"
deserr = "0.5.0"
either = "1.8.0"
flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
grenad = { version = "0.4.3", default-features = false, features = ["tempfile"] }
# Fetched from the meilisearch/heed git repository at a fixed tag
# rather than from crates.io.
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.5", default-features = false, features = ["lmdb", "sync-read-txn"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memmap2 = "0.5.7"
obkv = "0.2.0"
once_cell = "1.15.0"
ordered-float = "3.2.0"
rayon = "1.5.3"
roaring = "0.10.1"
rstar = { version = "0.9.3", features = ["serde"] }
serde = { version = "1.0.145", features = ["derive"] }
serde_json = { version = "1.0.85", features = ["preserve_order"] }
slice-group-by = "0.3.0"
smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.10.0"
smartstring = "1.0.1"
tempfile = "3.3.0"
thiserror = "1.0.37"
time = { version = "0.3.15", features = ["serde-well-known", "formatting", "parsing", "macros"] }
uuid = { version = "1.1.2", features = ["v4"] }

filter-parser = { path = "../filter-parser" }

# documents words self-join
itertools = "0.10.5"

# logging
log = "0.4.17"
logging_timer = "1.1.0"

# Crates needed only by tests, examples and benchmarks.
[dev-dependencies]
big_s = "1.0.2"
insta = "1.21.0"
maplit = "1.0.2"
md5 = "0.7.0"
rand = { version = "0.8.5", features = ["small_rng"] }

# Extra dev-dependency that is only resolved when building with
# `--cfg fuzzing`.
[target.'cfg(fuzzing)'.dev-dependencies]
fuzzcheck = "0.12.1"

[features]
# `charabia` is declared with `default-features = false`; its default
# feature set is re-enabled through this crate's own default feature.
default = ["charabia/default"]

# Use POSIX semaphores instead of SysV semaphores in LMDB.
# For more information on this feature, see heed's Cargo.toml.
lmdb-posix-sem = ["heed/posix-sem"]

# Allow Chinese specialized tokenization.
chinese = ["charabia/chinese"]

# Allow Hebrew specialized tokenization.
hebrew = ["charabia/hebrew"]

# Allow Japanese specialized tokenization.
japanese = ["charabia/japanese"]
# Forwards to charabia's `japanese-transliteration` feature
# (NOTE(review): exact effect is defined by charabia; confirm there).
japanese-transliteration = ["charabia/japanese-transliteration"]

# Allow Korean specialized tokenization.
korean = ["charabia/korean"]

# Allow Thai specialized tokenization.
thai = ["charabia/thai"]