integrate with meilisearch tokenizer

This commit is contained in:
mpostma 2020-12-23 19:09:01 +01:00
parent 7e1c94ab9c
commit 1ae761311e
No known key found for this signature in database
GPG key ID: CBC8A7C1D7A28C3A
10 changed files with 460 additions and 269 deletions

145
http-ui/Cargo.lock generated
View file

@ -6,6 +6,12 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e"
[[package]]
name = "ahash"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217"
[[package]]
name = "aho-corasick"
version = "0.7.15"
@ -213,6 +219,15 @@ dependencies = [
"jobserver",
]
[[package]]
name = "cedarwood"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d"
dependencies = [
"smallvec",
]
[[package]]
name = "cfg-if"
version = "0.1.10"
@ -225,6 +240,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "character_converter"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c"
dependencies = [
"bincode",
]
[[package]]
name = "chrono"
version = "0.4.19"
@ -265,6 +289,12 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c478836e029dcef17fb47c89023448c64f781a046e0300e257ad8225ae59afab"
[[package]]
name = "cow-utils"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
[[package]]
name = "cpuid-bool"
version = "0.1.2"
@ -368,6 +398,12 @@ dependencies = [
"memchr",
]
[[package]]
name = "deunicode"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80115a2dfde04491e181c2440a39e4be26e52d9ca4e92bed213f65b94e0b8db1"
[[package]]
name = "digest"
version = "0.8.1"
@ -640,6 +676,16 @@ dependencies = [
"tracing-futures",
]
[[package]]
name = "hashbrown"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf"
dependencies = [
"ahash",
"autocfg 1.0.1",
]
[[package]]
name = "hashbrown"
version = "0.9.1"
@ -840,7 +886,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2"
dependencies = [
"autocfg 1.0.1",
"hashbrown",
"hashbrown 0.9.1",
]
[[package]]
@ -897,6 +943,21 @@ dependencies = [
"libc",
]
[[package]]
name = "jieba-rs"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34fbdeee8786790f4a99fa30ff5c5f88aa5183f7583693e3788d17fc8a48f33a"
dependencies = [
"cedarwood",
"fxhash",
"hashbrown 0.9.1",
"lazy_static",
"phf",
"phf_codegen",
"regex",
]
[[package]]
name = "jobserver"
version = "0.1.21"
@ -975,6 +1036,22 @@ version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08"
[[package]]
name = "meilisearch-tokenizer"
version = "0.1.1"
source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#8d91cd52f30aa4b651a085c15056938f7b599646"
dependencies = [
"character_converter",
"cow-utils",
"deunicode",
"fst",
"jieba-rs",
"once_cell",
"slice-group-by",
"unicode-segmentation",
"whatlang",
]
[[package]]
name = "memchr"
version = "2.3.4"
@ -1022,6 +1099,7 @@ dependencies = [
"levenshtein_automata",
"linked-hash-map",
"log",
"meilisearch-tokenizer",
"memmap",
"near-proximity",
"num-traits",
@ -1323,6 +1401,44 @@ dependencies = [
"sha-1 0.8.2",
]
[[package]]
name = "phf"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared",
"rand 0.7.3",
]
[[package]]
name = "phf_shared"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project"
version = "0.4.27"
@ -1461,7 +1577,7 @@ dependencies = [
"rand_isaac",
"rand_jitter",
"rand_os",
"rand_pcg",
"rand_pcg 0.1.2",
"rand_xorshift",
"winapi 0.3.9",
]
@ -1477,6 +1593,7 @@ dependencies = [
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc 0.2.0",
"rand_pcg 0.2.1",
]
[[package]]
@ -1585,6 +1702,15 @@ dependencies = [
"rand_core 0.4.2",
]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "rand_xorshift"
version = "0.1.1"
@ -1787,6 +1913,12 @@ dependencies = [
"libc",
]
[[package]]
name = "siphasher"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7"
[[package]]
name = "slab"
version = "0.4.2"
@ -2280,6 +2412,15 @@ version = "0.10.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
[[package]]
name = "whatlang"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc0289c1d1548414a5645e6583e118e9c569c579ec2a0c32417cc3dbf7a89075"
dependencies = [
"hashbrown 0.7.2",
]
[[package]]
name = "winapi"
version = "0.2.8"

View file

@ -10,6 +10,7 @@ anyhow = "1.0.28"
byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" }
heed = "0.10.5"
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" }
memmap = "0.7.0"
milli = { path = ".." }
once_cell = "1.4.1"