Merge pull request #63 from meilisearch/meilisearch-tokenizer

Meilisearch tokenizer
This commit is contained in:
Clément Renault 2021-01-12 13:26:24 +01:00 committed by GitHub
commit 51d1785576
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 502 additions and 292 deletions

147
Cargo.lock generated
View File

@ -6,6 +6,12 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10" checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10"
[[package]]
name = "ahash"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217"
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "0.7.15" version = "0.7.15"
@ -132,6 +138,15 @@ dependencies = [
"jobserver", "jobserver",
] ]
[[package]]
name = "cedarwood"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d"
dependencies = [
"smallvec",
]
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "0.1.10" version = "0.1.10"
@ -144,6 +159,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "character_converter"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c"
dependencies = [
"bincode",
]
[[package]] [[package]]
name = "chrono" name = "chrono"
version = "0.4.19" version = "0.4.19"
@ -175,6 +199,12 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce90df4c658c62f12d78f7508cf92f9173e5184a539c10bfe54a3107b3ffd0f2" checksum = "ce90df4c658c62f12d78f7508cf92f9173e5184a539c10bfe54a3107b3ffd0f2"
[[package]]
name = "cow-utils"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
[[package]] [[package]]
name = "crc32fast" name = "crc32fast"
version = "1.2.0" version = "1.2.0"
@ -330,6 +360,12 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "deunicode"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80115a2dfde04491e181c2440a39e4be26e52d9ca4e92bed213f65b94e0b8db1"
[[package]] [[package]]
name = "digest" name = "digest"
version = "0.8.1" version = "0.8.1"
@ -381,9 +417,9 @@ checksum = "5f2a4a2034423744d2cc7ca2068453168dcdb82c438419e639a26bd87839c674"
[[package]] [[package]]
name = "fst" name = "fst"
version = "0.4.4" version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7293de202dbfe786c0b3fe6110a027836c5438ed06db7b715c9955ff4bfea51" checksum = "d79238883cf0307100b90aba4a755d8051a3182305dfe7f649a1e9dc0517006f"
[[package]] [[package]]
name = "fxhash" name = "fxhash"
@ -440,6 +476,16 @@ version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d36fab90f82edc3c747f9d438e06cf0a491055896f2a279638bb5beed6c40177" checksum = "d36fab90f82edc3c747f9d438e06cf0a491055896f2a279638bb5beed6c40177"
[[package]]
name = "hashbrown"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf"
dependencies = [
"ahash",
"autocfg",
]
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.9.1" version = "0.9.1"
@ -525,7 +571,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2" checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"hashbrown", "hashbrown 0.9.1",
] ]
[[package]] [[package]]
@ -564,6 +610,21 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "jieba-rs"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34fbdeee8786790f4a99fa30ff5c5f88aa5183f7583693e3788d17fc8a48f33a"
dependencies = [
"cedarwood",
"fxhash",
"hashbrown 0.9.1",
"lazy_static",
"phf",
"phf_codegen",
"regex",
]
[[package]] [[package]]
name = "jobserver" name = "jobserver"
version = "0.1.21" version = "0.1.21"
@ -647,6 +708,22 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
[[package]]
name = "meilisearch-tokenizer"
version = "0.1.1"
source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#147b6154b1b34cb8f5da2df6a416b7da191bc850"
dependencies = [
"character_converter",
"cow-utils",
"deunicode",
"fst",
"jieba-rs",
"once_cell",
"slice-group-by",
"unicode-segmentation",
"whatlang",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.3.3" version = "2.3.3"
@ -696,6 +773,7 @@ dependencies = [
"linked-hash-map", "linked-hash-map",
"log", "log",
"maplit", "maplit",
"meilisearch-tokenizer",
"memmap", "memmap",
"near-proximity", "near-proximity",
"num-traits", "num-traits",
@ -883,6 +961,44 @@ dependencies = [
"sha-1", "sha-1",
] ]
[[package]]
name = "phf"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared",
"rand",
]
[[package]]
name = "phf_shared"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
[[package]] [[package]]
name = "pkg-config" name = "pkg-config"
version = "0.3.19" version = "0.3.19"
@ -962,6 +1078,7 @@ dependencies = [
"rand_chacha", "rand_chacha",
"rand_core", "rand_core",
"rand_hc", "rand_hc",
"rand_pcg",
] ]
[[package]] [[package]]
@ -992,6 +1109,15 @@ dependencies = [
"rand_core", "rand_core",
] ]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core",
]
[[package]] [[package]]
name = "rayon" name = "rayon"
version = "1.3.1" version = "1.3.1"
@ -1182,6 +1308,12 @@ dependencies = [
"opaque-debug", "opaque-debug",
] ]
[[package]]
name = "siphasher"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7"
[[package]] [[package]]
name = "slice-group-by" name = "slice-group-by"
version = "0.2.6" version = "0.2.6"
@ -1558,6 +1690,15 @@ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]]
name = "whatlang"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc0289c1d1548414a5645e6583e118e9c569c579ec2a0c32417cc3dbf7a89075"
dependencies = [
"hashbrown 0.7.2",
]
[[package]] [[package]]
name = "winapi" name = "winapi"
version = "0.3.8" version = "0.3.8"

View File

@ -13,7 +13,7 @@ crossbeam-channel = "0.5.0"
csv = "1.1.3" csv = "1.1.3"
either = "1.6.1" either = "1.6.1"
flate2 = "1.0.17" flate2 = "1.0.17"
fst = "0.4.4" fst = "0.4.5"
fxhash = "0.2.1" fxhash = "0.2.1"
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" }
heed = { version = "0.10.5", default-features = false, features = ["lmdb", "sync-read-txn"] } heed = { version = "0.10.5", default-features = false, features = ["lmdb", "sync-read-txn"] }
@ -21,6 +21,7 @@ human_format = "1.0.3"
jemallocator = "0.3.2" jemallocator = "0.3.2"
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
linked-hash-map = "0.5.3" linked-hash-map = "0.5.3"
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" }
memmap = "0.7.0" memmap = "0.7.0"
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" }
num-traits = "0.2.14" num-traits = "0.2.14"
@ -56,7 +57,7 @@ criterion = "0.3.3"
maplit = "1.0.2" maplit = "1.0.2"
[build-dependencies] [build-dependencies]
fst = "0.4.4" fst = "0.4.5"
[features] [features]
default = [] default = []

147
http-ui/Cargo.lock generated
View File

@ -6,6 +6,12 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e"
[[package]]
name = "ahash"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217"
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "0.7.15" version = "0.7.15"
@ -213,6 +219,15 @@ dependencies = [
"jobserver", "jobserver",
] ]
[[package]]
name = "cedarwood"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d"
dependencies = [
"smallvec",
]
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "0.1.10" version = "0.1.10"
@ -225,6 +240,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "character_converter"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c"
dependencies = [
"bincode",
]
[[package]] [[package]]
name = "chrono" name = "chrono"
version = "0.4.19" version = "0.4.19"
@ -265,6 +289,12 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c478836e029dcef17fb47c89023448c64f781a046e0300e257ad8225ae59afab" checksum = "c478836e029dcef17fb47c89023448c64f781a046e0300e257ad8225ae59afab"
[[package]]
name = "cow-utils"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
[[package]] [[package]]
name = "cpuid-bool" name = "cpuid-bool"
version = "0.1.2" version = "0.1.2"
@ -368,6 +398,12 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "deunicode"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80115a2dfde04491e181c2440a39e4be26e52d9ca4e92bed213f65b94e0b8db1"
[[package]] [[package]]
name = "digest" name = "digest"
version = "0.8.1" version = "0.8.1"
@ -640,6 +676,16 @@ dependencies = [
"tracing-futures", "tracing-futures",
] ]
[[package]]
name = "hashbrown"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf"
dependencies = [
"ahash",
"autocfg 1.0.1",
]
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.9.1" version = "0.9.1"
@ -757,10 +803,12 @@ dependencies = [
"byte-unit", "byte-unit",
"bytes", "bytes",
"flate2", "flate2",
"fst",
"futures", "futures",
"grenad", "grenad",
"heed", "heed",
"log", "log",
"meilisearch-tokenizer",
"memmap", "memmap",
"milli", "milli",
"once_cell", "once_cell",
@ -840,7 +888,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2" checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2"
dependencies = [ dependencies = [
"autocfg 1.0.1", "autocfg 1.0.1",
"hashbrown", "hashbrown 0.9.1",
] ]
[[package]] [[package]]
@ -897,6 +945,21 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "jieba-rs"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34fbdeee8786790f4a99fa30ff5c5f88aa5183f7583693e3788d17fc8a48f33a"
dependencies = [
"cedarwood",
"fxhash",
"hashbrown 0.9.1",
"lazy_static",
"phf",
"phf_codegen",
"regex",
]
[[package]] [[package]]
name = "jobserver" name = "jobserver"
version = "0.1.21" version = "0.1.21"
@ -975,6 +1038,22 @@ version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08"
[[package]]
name = "meilisearch-tokenizer"
version = "0.1.1"
source = "git+https://github.com/meilisearch/Tokenizer.git?branch=token-eq#daeb4a4ac91081f1c592e3ebb3ec5d8dcb4e6976"
dependencies = [
"character_converter",
"cow-utils",
"deunicode",
"fst",
"jieba-rs",
"once_cell",
"slice-group-by",
"unicode-segmentation",
"whatlang",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.3.4" version = "2.3.4"
@ -1022,6 +1101,7 @@ dependencies = [
"levenshtein_automata", "levenshtein_automata",
"linked-hash-map", "linked-hash-map",
"log", "log",
"meilisearch-tokenizer",
"memmap", "memmap",
"near-proximity", "near-proximity",
"num-traits", "num-traits",
@ -1323,6 +1403,44 @@ dependencies = [
"sha-1 0.8.2", "sha-1 0.8.2",
] ]
[[package]]
name = "phf"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared",
"rand 0.7.3",
]
[[package]]
name = "phf_shared"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
[[package]] [[package]]
name = "pin-project" name = "pin-project"
version = "0.4.27" version = "0.4.27"
@ -1461,7 +1579,7 @@ dependencies = [
"rand_isaac", "rand_isaac",
"rand_jitter", "rand_jitter",
"rand_os", "rand_os",
"rand_pcg", "rand_pcg 0.1.2",
"rand_xorshift", "rand_xorshift",
"winapi 0.3.9", "winapi 0.3.9",
] ]
@ -1477,6 +1595,7 @@ dependencies = [
"rand_chacha 0.2.2", "rand_chacha 0.2.2",
"rand_core 0.5.1", "rand_core 0.5.1",
"rand_hc 0.2.0", "rand_hc 0.2.0",
"rand_pcg 0.2.1",
] ]
[[package]] [[package]]
@ -1585,6 +1704,15 @@ dependencies = [
"rand_core 0.4.2", "rand_core 0.4.2",
] ]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core 0.5.1",
]
[[package]] [[package]]
name = "rand_xorshift" name = "rand_xorshift"
version = "0.1.1" version = "0.1.1"
@ -1787,6 +1915,12 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "siphasher"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7"
[[package]] [[package]]
name = "slab" name = "slab"
version = "0.4.2" version = "0.4.2"
@ -2280,6 +2414,15 @@ version = "0.10.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
[[package]]
name = "whatlang"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc0289c1d1548414a5645e6583e118e9c569c579ec2a0c32417cc3dbf7a89075"
dependencies = [
"hashbrown 0.7.2",
]
[[package]] [[package]]
name = "winapi" name = "winapi"
version = "0.2.8" version = "0.2.8"

View File

@ -10,6 +10,7 @@ anyhow = "1.0.28"
byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" }
heed = "0.10.5" heed = "0.10.5"
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" }
memmap = "0.7.0" memmap = "0.7.0"
milli = { path = ".." } milli = { path = ".." }
once_cell = "1.4.1" once_cell = "1.4.1"
@ -31,3 +32,4 @@ warp = "0.2.2"
# logging # logging
log = "0.4.11" log = "0.4.11"
stderrlog = "0.5.0" stderrlog = "0.5.0"
fst = "0.4.5"

View File

@ -27,8 +27,9 @@ use tokio::io::AsyncWriteExt;
use tokio::sync::broadcast; use tokio::sync::broadcast;
use warp::filters::ws::Message; use warp::filters::ws::Message;
use warp::{Filter, http::Response}; use warp::{Filter, http::Response};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use fst::Set;
use milli::tokenizer::{simple_tokenizer, TokenType};
use milli::update::UpdateIndexingStep::*; use milli::update::UpdateIndexingStep::*;
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
@ -121,49 +122,61 @@ pub struct IndexerOpt {
pub indexing_jobs: Option<usize>, pub indexing_jobs: Option<usize>,
} }
fn highlight_record( struct Highlighter<'a, A> {
object: &mut Map<String, Value>, analyzer: Analyzer<'a, A>,
words_to_highlight: &HashSet<String>, }
attributes_to_highlight: &HashSet<String>,
) { impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
// TODO do we need to create a string for element that are not and needs to be highlight? fn new(stop_words: &'a fst::Set<A>) -> Self {
fn highlight_value(value: Value, words_to_highlight: &HashSet<String>) -> Value { let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
Self { analyzer }
}
fn highlight_value(&self, value: Value, words_to_highlight: &HashSet<String>) -> Value {
match value { match value {
Value::Null => Value::Null, Value::Null => Value::Null,
Value::Bool(boolean) => Value::Bool(boolean), Value::Bool(boolean) => Value::Bool(boolean),
Value::Number(number) => Value::Number(number), Value::Number(number) => Value::Number(number),
Value::String(old_string) => { Value::String(old_string) => {
let mut string = String::new(); let mut string = String::new();
for (token_type, token) in simple_tokenizer(&old_string) { let analyzed = self.analyzer.analyze(&old_string);
if token_type == TokenType::Word { for (word, token) in analyzed.reconstruct() {
let lowercase_token = token.to_lowercase(); if token.is_word() {
let to_highlight = words_to_highlight.contains(&lowercase_token); let to_highlight = words_to_highlight.contains(token.text());
if to_highlight { string.push_str("<mark>") } if to_highlight { string.push_str("<mark>") }
string.push_str(token); string.push_str(word);
if to_highlight { string.push_str("</mark>") } if to_highlight { string.push_str("</mark>") }
} else { } else {
string.push_str(token); string.push_str(word);
} }
} }
Value::String(string) Value::String(string)
}, },
Value::Array(values) => { Value::Array(values) => {
Value::Array(values.into_iter() Value::Array(values.into_iter()
.map(|v| highlight_value(v, words_to_highlight)) .map(|v| self.highlight_value(v, words_to_highlight))
.collect()) .collect())
}, },
Value::Object(object) => { Value::Object(object) => {
Value::Object(object.into_iter() Value::Object(object.into_iter()
.map(|(k, v)| (k, highlight_value(v, words_to_highlight))) .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight)))
.collect()) .collect())
}, },
} }
} }
fn highlight_record(
&self,
object: &mut Map<String, Value>,
words_to_highlight: &HashSet<String>,
attributes_to_highlight: &HashSet<String>,
) {
// TODO do we need to create a string for element that are not and needs to be highlight?
for (key, value) in object.iter_mut() { for (key, value) in object.iter_mut() {
if attributes_to_highlight.contains(key) { if attributes_to_highlight.contains(key) {
let old_value = mem::take(value); let old_value = mem::take(value);
*value = highlight_value(old_value, words_to_highlight); *value = self.highlight_value(old_value, words_to_highlight);
}
} }
} }
} }
@ -651,10 +664,13 @@ async fn main() -> anyhow::Result<()> {
None => fields_ids_map.iter().map(|(_, name)| name).map(ToOwned::to_owned).collect(), None => fields_ids_map.iter().map(|(_, name)| name).map(ToOwned::to_owned).collect(),
}; };
let stop_words = fst::Set::default();
let highlighter = Highlighter::new(&stop_words);
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
if !disable_highlighting { if !disable_highlighting {
highlight_record(&mut object, &found_words, &attributes_to_highlight); highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight);
} }
documents.push(object); documents.push(object);
@ -716,7 +732,7 @@ async fn main() -> anyhow::Result<()> {
} }
let file = file.into_std().await; let file = file.into_std().await;
let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; let mmap = unsafe { memmap::Mmap::map(&file).expect("can't map file") };
let method = match update_method.as_deref() { let method = match update_method.as_deref() {
Some("replace") => String::from("replace"), Some("replace") => String::from("replace"),

View File

@ -12,7 +12,6 @@ pub mod facet;
pub mod heed_codec; pub mod heed_codec;
pub mod proximity; pub mod proximity;
pub mod subcommand; pub mod subcommand;
pub mod tokenizer;
pub mod update; pub mod update;
use std::borrow::Cow; use std::borrow::Cow;

View File

@ -1,5 +1,4 @@
use std::str; use meilisearch_tokenizer::{Token, TokenKind};
use crate::tokenizer::{simple_tokenizer, TokenType};
#[derive(Debug)] #[derive(Debug)]
enum State { enum State {
@ -18,138 +17,201 @@ impl State {
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub enum QueryToken<'a> { pub enum QueryToken<'a> {
Free(&'a str), Free(Token<'a>),
Quoted(&'a str), Quoted(Token<'a>),
} }
pub struct QueryTokens<'a> { pub fn query_tokens<'a>(mut tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = QueryToken<'a>> {
state: State, let mut state = State::Free;
iter: Box<dyn Iterator<Item=(TokenType, &'a str)> + 'a>, let f = move || {
}
impl QueryTokens<'_> {
pub fn new(query: &str) -> QueryTokens {
QueryTokens {
state: State::Free,
iter: Box::new(simple_tokenizer(query)),
}
}
}
impl<'a> Iterator for QueryTokens<'a> {
type Item = QueryToken<'a>;
fn next(&mut self) -> Option<Self::Item> {
loop { loop {
match self.iter.next()? { let token = tokens.next()?;
(TokenType::Other, "\"") => self.state.swap(), match token.kind() {
(TokenType::Word, token) => { _ if token.text().trim() == "\"" => state.swap(),
let token = match self.state { TokenKind::Word => {
let token = match state {
State::Quoted => QueryToken::Quoted(token), State::Quoted => QueryToken::Quoted(token),
State::Free => QueryToken::Free(token), State::Free => QueryToken::Free(token),
}; };
return Some(token); return Some(token);
}, },
(_, _) => (), _ => (),
}
} }
} }
};
std::iter::from_fn(f)
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use QueryToken::{Quoted, Free}; use QueryToken::{Quoted, Free};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use fst::Set;
macro_rules! assert_eq_query_token {
($test:expr, Quoted($val:literal)) => {
match $test {
Quoted(val) => assert_eq!(val.text(), $val),
Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()),
}
};
($test:expr, Free($val:literal)) => {
match $test {
Quoted(val) => panic!("expected Free(\"{}\"), found Quoted(\"{}\")", $val, val.text()),
Free(val) => assert_eq!(val.text(), $val),
}
};
}
#[test] #[test]
fn empty() { fn empty() {
let mut iter = QueryTokens::new(""); let stop_words = Set::default();
assert_eq!(iter.next(), None); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert!(iter.next().is_none());
let mut iter = QueryTokens::new(" "); let query = " ";
assert_eq!(iter.next(), None); let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert!(iter.next().is_none());
} }
#[test] #[test]
fn one_quoted_string() { fn one_quoted_string() {
let mut iter = QueryTokens::new("\"hello\""); let stop_words = Set::default();
assert_eq!(iter.next(), Some(Quoted("hello"))); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
assert_eq!(iter.next(), None); let query = "\"hello\"";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
assert!(iter.next().is_none());
} }
#[test] #[test]
fn one_pending_quoted_string() { fn one_pending_quoted_string() {
let mut iter = QueryTokens::new("\"hello"); let stop_words = Set::default();
assert_eq!(iter.next(), Some(Quoted("hello"))); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
assert_eq!(iter.next(), None); let query = "\"hello";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
assert!(iter.next().is_none());
} }
#[test] #[test]
fn one_non_quoted_string() { fn one_non_quoted_string() {
let mut iter = QueryTokens::new("hello"); let stop_words = Set::default();
assert_eq!(iter.next(), Some(Free("hello"))); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
assert_eq!(iter.next(), None); let query = "hello";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
assert!(iter.next().is_none());
} }
#[test] #[test]
fn quoted_directly_followed_by_free_strings() { fn quoted_directly_followed_by_free_strings() {
let mut iter = QueryTokens::new("\"hello\"world"); let stop_words = Set::default();
assert_eq!(iter.next(), Some(Quoted("hello"))); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
assert_eq!(iter.next(), Some(Free("world"))); let query = "\"hello\"world";
assert_eq!(iter.next(), None); let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
assert_eq_query_token!(iter.next().unwrap(), Free("world"));
assert!(iter.next().is_none());
} }
#[test] #[test]
fn free_directly_followed_by_quoted_strings() { fn free_directly_followed_by_quoted_strings() {
let mut iter = QueryTokens::new("hello\"world\""); let stop_words = Set::default();
assert_eq!(iter.next(), Some(Free("hello"))); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
assert_eq!(iter.next(), Some(Quoted("world"))); let query = "hello\"world\"";
assert_eq!(iter.next(), None); let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
assert!(iter.next().is_none());
} }
#[test] #[test]
fn free_followed_by_quoted_strings() { fn free_followed_by_quoted_strings() {
let mut iter = QueryTokens::new("hello \"world\""); let stop_words = Set::default();
assert_eq!(iter.next(), Some(Free("hello"))); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
assert_eq!(iter.next(), Some(Quoted("world"))); let query = "hello \"world\"";
assert_eq!(iter.next(), None); let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
assert!(iter.next().is_none());
} }
#[test] #[test]
fn multiple_spaces_separated_strings() { fn multiple_spaces_separated_strings() {
let mut iter = QueryTokens::new("hello world "); let stop_words = Set::default();
assert_eq!(iter.next(), Some(Free("hello"))); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
assert_eq!(iter.next(), Some(Free("world"))); let query = "hello world ";
assert_eq!(iter.next(), None); let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
assert_eq_query_token!(iter.next().unwrap(), Free("world"));
assert!(iter.next().is_none());
} }
#[test] #[test]
fn multi_interleaved_quoted_free_strings() { fn multi_interleaved_quoted_free_strings() {
let mut iter = QueryTokens::new("hello \"world\" coucou \"monde\""); let stop_words = Set::default();
assert_eq!(iter.next(), Some(Free("hello"))); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
assert_eq!(iter.next(), Some(Quoted("world"))); let query = "hello \"world\" coucou \"monde\"";
assert_eq!(iter.next(), Some(Free("coucou"))); let analyzed = analyzer.analyze(query);
assert_eq!(iter.next(), Some(Quoted("monde"))); let tokens = analyzed.tokens();
assert_eq!(iter.next(), None); let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
assert_eq_query_token!(iter.next().unwrap(), Free("coucou"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("monde"));
assert!(iter.next().is_none());
} }
#[test] #[test]
fn multi_quoted_strings() { fn multi_quoted_strings() {
let mut iter = QueryTokens::new("\"hello world\" coucou \"monde est beau\""); let stop_words = Set::default();
assert_eq!(iter.next(), Some(Quoted("hello"))); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
assert_eq!(iter.next(), Some(Quoted("world"))); let query = "\"hello world\" coucou \"monde est beau\"";
assert_eq!(iter.next(), Some(Free("coucou"))); let analyzed = analyzer.analyze(query);
assert_eq!(iter.next(), Some(Quoted("monde"))); let tokens = analyzed.tokens();
assert_eq!(iter.next(), Some(Quoted("est"))); let mut iter = query_tokens(tokens);
assert_eq!(iter.next(), Some(Quoted("beau"))); assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
assert_eq!(iter.next(), None); assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
assert_eq_query_token!(iter.next().unwrap(), Free("coucou"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("monde"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("est"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("beau"));
assert!(iter.next().is_none());
} }
#[test] #[test]
fn chinese() { fn chinese() {
let mut iter = QueryTokens::new("汽车男生"); let stop_words = Set::default();
assert_eq!(iter.next(), Some(Free("汽车"))); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
assert_eq!(iter.next(), Some(Free("男生"))); let query = "汽车男生";
assert_eq!(iter.next(), None); let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("汽车"));
assert_eq_query_token!(iter.next().unwrap(), Free("男生"));
assert!(iter.next().is_none());
} }
} }

View File

@ -4,10 +4,11 @@ use std::fmt;
use std::time::Instant; use std::time::Instant;
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer, Set};
use levenshtein_automata::DFA; use levenshtein_automata::DFA;
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
use log::debug; use log::debug;
use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::bitmap::RoaringBitmap; use roaring::bitmap::RoaringBitmap;
@ -16,7 +17,7 @@ use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
use crate::mdfs::Mdfs; use crate::mdfs::Mdfs;
use crate::query_tokens::{QueryTokens, QueryToken}; use crate::query_tokens::{query_tokens, QueryToken};
use crate::{Index, FieldId, DocumentId, Criterion}; use crate::{Index, FieldId, DocumentId, Criterion};
pub use self::facet::{FacetCondition, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetCondition, FacetNumberOperator, FacetStringOperator};
@ -68,14 +69,19 @@ impl<'a> Search<'a> {
fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> {
let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2); let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2);
let words: Vec<_> = QueryTokens::new(query).collect(); let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let words: Vec<_> = query_tokens(tokens).collect();
let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace); let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let number_of_words = words.len(); let number_of_words = words.len();
words.into_iter().enumerate().map(|(i, word)| { words.into_iter().enumerate().map(|(i, word)| {
let (word, quoted) = match word { let (word, quoted) = match word {
QueryToken::Free(word) => (word.to_lowercase(), word.len() <= 3), QueryToken::Free(token) => (token.text().to_string(), token.text().len() <= 3),
QueryToken::Quoted(word) => (word.to_lowercase(), true), QueryToken::Quoted(token) => (token.text().to_string(), true),
}; };
let is_last = i + 1 == number_of_words; let is_last = i + 1 == number_of_words;
let is_prefix = is_last && !ends_with_whitespace && !quoted; let is_prefix = is_last && !ends_with_whitespace && !quoted;

View File

@ -1,174 +0,0 @@
use std::{str, iter, mem};
use fst::raw::{Fst, Output};
use once_cell::sync::Lazy;
use slice_group_by::StrGroupBy;
use CharCategory::*;
// Raw bytes of the Chinese words dictionary, embedded into the binary at compile time
// from `$OUT_DIR/chinese-words.fst` (presumably produced by the build script — confirm).
const CHINESE_FST_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/chinese-words.fst"));
// The dictionary parsed as an FST, built lazily on first access.
// The `unwrap` panics on that first access if the embedded bytes are not a valid FST.
static CHINESE_WORDS_FST: Lazy<Fst<&[u8]>> = Lazy::new(|| Fst::new(CHINESE_FST_BYTES).unwrap());
/// Kind of a token produced by `simple_tokenizer`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
/// A word: a run of alphanumeric characters or a Chinese segment.
Word,
/// A run of whitespace characters.
Space,
/// A run of characters that are neither alphanumeric nor whitespace.
Other,
}
pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
text
.linear_group_by_key(CharCategory::new)
.flat_map(|mut string| {
let first = string.chars().next().unwrap();
let category = CharCategory::new(first);
iter::from_fn(move || {
if string.is_empty() { return None }
match category {
Chinese => {
let fst = &CHINESE_WORDS_FST;
match find_longest_prefix(fst, string.as_bytes()) {
Some((_, l)) => {
let s = &string[..l];
string = &string[l..];
Some((TokenType::Word, s))
},
None => {
let first = string.chars().next().unwrap();
let len = first.len_utf8();
let (head, tail) = string.split_at(len);
string = tail;
Some((TokenType::Word, head))
},
}
},
Alphanumeric => Some((TokenType::Word, mem::take(&mut string))),
Space => Some((TokenType::Space, mem::take(&mut string))),
Other => Some((TokenType::Other, mem::take(&mut string))),
}
})
})
}
/// Keeps the text of `Word` tokens and discards every other kind.
///
/// Shaped to be passed directly to `Iterator::filter_map` over the output of
/// `simple_tokenizer`.
pub fn only_token((t, w): (TokenType, &str)) -> Option<&str> {
    match t {
        TokenType::Word => Some(w),
        TokenType::Space | TokenType::Other => None,
    }
}
/// Category of a single character, used as the grouping key that splits the
/// text into runs before tokens are emitted.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum CharCategory {
/// An alphanumeric character for which `is_chinese` returns `true`.
Chinese,
/// Any other alphanumeric character.
Alphanumeric,
/// A non-alphanumeric whitespace character.
Space,
/// Everything else (neither alphanumeric nor whitespace).
Other,
}
impl CharCategory {
    /// Classifies a single character.
    ///
    /// Alphanumeric characters are checked first and split between `Chinese`
    /// and `Alphanumeric`; the remaining characters are `Space` when they are
    /// whitespace and `Other` otherwise.
    fn new(c: char) -> Self {
        match c {
            c if c.is_alphanumeric() && is_chinese(c) => Chinese,
            c if c.is_alphanumeric() => Alphanumeric,
            c if c.is_whitespace() => Space,
            _ => Other,
        }
    }
}
/// Returns `true` when `c` lies in one of the CJK ideograph code point ranges
/// listed below.
///
/// The main range covers the whole CJK Unified Ideographs block
/// (U+4E00..=U+9FFF). It used to stop at U+9FEF, which left the ideographs at
/// the end of the block out of the Chinese category.
fn is_chinese(c: char) -> bool {
    matches!(
        u32::from(c),
        0x4E00..=0x9FFF       // CJK Unified Ideographs (entire block)
        | 0x3400..=0x4DBF     // CJK Unified Ideographs Extension A
        | 0x20000..=0x2A6DF   // CJK Unified Ideographs Extension B
        | 0x2A700..=0x2B73F   // CJK Unified Ideographs Extension C
        | 0x2B740..=0x2B81F   // CJK Unified Ideographs Extension D
        | 0x2B820..=0x2CEAF   // CJK Unified Ideographs Extension E
        | 0x2CEB0..=0x2EBEF   // CJK Unified Ideographs Extension F
        | 0x3007              // U+3007 IDEOGRAPHIC NUMBER ZERO
    )
}
/// Looks up the longest key of `fst` that is a prefix of `value`.
///
/// Returns `Some((output, key_len))` for the longest such key, where `output`
/// is the value the FST associates with that key and `key_len` is the key
/// length in bytes. Returns `None` when no key is a prefix of `value`.
///
/// Handy as a building block for dictionary-based tokenizers.
// Copyright @llogiq
// https://github.com/BurntSushi/fst/pull/104
#[inline]
fn find_longest_prefix(fst: &Fst<&[u8]>, value: &[u8]) -> Option<(u64, usize)> {
    let mut current = fst.root();
    let mut accumulated = Output::zero();
    let mut longest = None;

    // `consumed` is the number of bytes read once `byte` has been followed.
    for (consumed, &byte) in (1..).zip(value) {
        let transition = match current.find_input(byte) {
            Some(index) => current.transition(index),
            // Dead end: nothing longer can match, keep what was found so far.
            None => break,
        };

        current = fst.node(transition.addr);
        accumulated = accumulated.cat(transition.out);

        // Record every final state reached; a later one overrides an earlier one.
        if current.is_final() {
            let output = accumulated.cat(current.final_output());
            longest = Some((output.value(), consumed));
        }
    }

    longest
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Latin text is split into words, whitespace runs and punctuation runs.
    #[test]
    fn without_chinese() {
        let mut iter = simple_tokenizer("hello world!");
        assert_eq!(iter.next(), Some((TokenType::Word, "hello")));
        assert_eq!(iter.next(), Some((TokenType::Space, " ")));
        assert_eq!(iter.next(), Some((TokenType::Word, "world")));
        assert_eq!(iter.next(), Some((TokenType::Other, "!")));
        assert_eq!(iter.next(), None);
    }

    /// A Chinese sentence is segmented into words; `的` comes out as a token
    /// of its own between the two-character words.
    #[test]
    fn only_chinese() {
        let mut iter = simple_tokenizer("今天的天气真好");
        assert_eq!(iter.next(), Some((TokenType::Word, "今天")));
        assert_eq!(iter.next(), Some((TokenType::Word, "的")));
        assert_eq!(iter.next(), Some((TokenType::Word, "天气")));
        assert_eq!(iter.next(), Some((TokenType::Word, "真好")));
        assert_eq!(iter.next(), None);
    }

    /// Chinese and Latin runs are split at the category change even when no
    /// whitespace separates them ("真好Apple", "good今天").
    #[test]
    fn mixup_chinese_with_alphabet() {
        let mut iter = simple_tokenizer("今天的天气真好Apple is good今天的天气真好");
        assert_eq!(iter.next(), Some((TokenType::Word, "今天")));
        assert_eq!(iter.next(), Some((TokenType::Word, "的")));
        assert_eq!(iter.next(), Some((TokenType::Word, "天气")));
        assert_eq!(iter.next(), Some((TokenType::Word, "真好")));
        assert_eq!(iter.next(), Some((TokenType::Word, "Apple")));
        assert_eq!(iter.next(), Some((TokenType::Space, " ")));
        assert_eq!(iter.next(), Some((TokenType::Word, "is")));
        assert_eq!(iter.next(), Some((TokenType::Space, " ")));
        assert_eq!(iter.next(), Some((TokenType::Word, "good")));
        assert_eq!(iter.next(), Some((TokenType::Word, "今天")));
        assert_eq!(iter.next(), Some((TokenType::Word, "的")));
        assert_eq!(iter.next(), Some((TokenType::Word, "天气")));
        assert_eq!(iter.next(), Some((TokenType::Word, "真好")));
        assert_eq!(iter.next(), None);
    }

    /// Every character of this input is expected to be emitted as its own
    /// token, including `𠱁`, which lies outside the Basic Multilingual Plane.
    #[test]
    fn unknown_chinese() {
        let mut iter = simple_tokenizer("被虾头大讚好识𠱁女仔");
        assert_eq!(iter.next(), Some((TokenType::Word, "被")));
        assert_eq!(iter.next(), Some((TokenType::Word, "虾")));
        assert_eq!(iter.next(), Some((TokenType::Word, "头")));
        assert_eq!(iter.next(), Some((TokenType::Word, "大")));
        assert_eq!(iter.next(), Some((TokenType::Word, "讚")));
        assert_eq!(iter.next(), Some((TokenType::Word, "好")));
        assert_eq!(iter.next(), Some((TokenType::Word, "识")));
        assert_eq!(iter.next(), Some((TokenType::Word, "𠱁")));
        assert_eq!(iter.next(), Some((TokenType::Word, "女")));
        assert_eq!(iter.next(), Some((TokenType::Word, "仔")));
        assert_eq!(iter.next(), None);
    }
}

View File

@ -370,6 +370,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let readers = rayon::iter::repeatn(documents, num_threads) let readers = rayon::iter::repeatn(documents, num_threads)
.enumerate() .enumerate()
.map(|(i, documents)| { .map(|(i, documents)| {
let stop_words = fst::Set::default();
let store = Store::new( let store = Store::new(
searchable_fields.clone(), searchable_fields.clone(),
faceted_fields.clone(), faceted_fields.clone(),
@ -379,6 +380,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
chunk_compression_type, chunk_compression_type,
chunk_compression_level, chunk_compression_level,
chunk_fusing_shrink_size, chunk_fusing_shrink_size,
&stop_words,
)?; )?;
store.index( store.index(
documents, documents,

View File

@ -8,20 +8,21 @@ use std::{cmp, iter};
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use bstr::ByteSlice as _; use bstr::ByteSlice as _;
use fst::Set;
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use heed::BytesEncode; use heed::BytesEncode;
use linked_hash_map::LinkedHashMap; use linked_hash_map::LinkedHashMap;
use log::{debug, info}; use log::{debug, info};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use tempfile::tempfile; use tempfile::tempfile;
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
use crate::tokenizer::{simple_tokenizer, only_token}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::update::UpdateIndexingStep; use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId}; use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId};
@ -47,7 +48,7 @@ pub struct Readers {
pub documents: Reader<FileFuse>, pub documents: Reader<FileFuse>,
} }
pub struct Store { pub struct Store<'s, A> {
// Indexing parameters // Indexing parameters
searchable_fields: HashSet<FieldId>, searchable_fields: HashSet<FieldId>,
faceted_fields: HashMap<FieldId, FacetType>, faceted_fields: HashMap<FieldId, FacetType>,
@ -71,9 +72,11 @@ pub struct Store {
// MTBL writers // MTBL writers
docid_word_positions_writer: Writer<File>, docid_word_positions_writer: Writer<File>,
documents_writer: Writer<File>, documents_writer: Writer<File>,
// tokenizer
analyzer: Analyzer<'s, A>,
} }
impl Store { impl<'s, A: AsRef<[u8]>> Store<'s, A> {
pub fn new( pub fn new(
searchable_fields: HashSet<FieldId>, searchable_fields: HashSet<FieldId>,
faceted_fields: HashMap<FieldId, FacetType>, faceted_fields: HashMap<FieldId, FacetType>,
@ -83,7 +86,8 @@ impl Store {
chunk_compression_type: CompressionType, chunk_compression_type: CompressionType,
chunk_compression_level: Option<u32>, chunk_compression_level: Option<u32>,
chunk_fusing_shrink_size: Option<u64>, chunk_fusing_shrink_size: Option<u64>,
) -> anyhow::Result<Store> stop_words: &'s Set<A>,
) -> anyhow::Result<Self>
{ {
// We divide the max memory by the number of sorter the Store have. // We divide the max memory by the number of sorter the Store have.
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4)); let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4));
@ -137,6 +141,8 @@ impl Store {
create_writer(chunk_compression_type, chunk_compression_level, f) create_writer(chunk_compression_type, chunk_compression_level, f)
})?; })?;
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
Ok(Store { Ok(Store {
// Indexing parameters. // Indexing parameters.
searchable_fields, searchable_fields,
@ -161,6 +167,8 @@ impl Store {
// MTBL writers // MTBL writers
docid_word_positions_writer, docid_word_positions_writer,
documents_writer, documents_writer,
// tokenizer
analyzer,
}) })
} }
@ -462,9 +470,13 @@ impl Store {
None => continue, None => continue,
}; };
let tokens = simple_tokenizer(&content).filter_map(only_token); let analyzed = self.analyzer.analyze(&content);
for (pos, token) in tokens.enumerate().take(MAX_POSITION) { let tokens = analyzed
let word = token.to_lowercase(); .tokens()
.filter(|t| t.is_word())
.map(|t| t.text().to_string());
for (pos, word) in tokens.enumerate().take(MAX_POSITION) {
let position = (attr as usize * MAX_POSITION + pos) as u32; let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
} }