mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-27 07:14:26 +01:00
Merge pull request #38 from meilisearch/facet-queries
Introduce a facet filter system
This commit is contained in:
commit
6120f6590b
143
Cargo.lock
generated
143
Cargo.lock
generated
@ -45,6 +45,27 @@ version = "1.2.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
|
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "block-buffer"
|
||||||
|
version = "0.7.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b"
|
||||||
|
dependencies = [
|
||||||
|
"block-padding",
|
||||||
|
"byte-tools",
|
||||||
|
"byteorder",
|
||||||
|
"generic-array",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "block-padding"
|
||||||
|
version = "0.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5"
|
||||||
|
dependencies = [
|
||||||
|
"byte-tools",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bstr"
|
name = "bstr"
|
||||||
version = "0.2.13"
|
version = "0.2.13"
|
||||||
@ -63,6 +84,12 @@ version = "3.4.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820"
|
checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "byte-tools"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "byteorder"
|
name = "byteorder"
|
||||||
version = "1.3.4"
|
version = "1.3.4"
|
||||||
@ -285,12 +312,27 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "digest"
|
||||||
|
version = "0.8.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5"
|
||||||
|
dependencies = [
|
||||||
|
"generic-array",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "either"
|
name = "either"
|
||||||
version = "1.6.1"
|
version = "1.6.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fake-simd"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "flate2"
|
name = "flate2"
|
||||||
version = "1.0.17"
|
version = "1.0.17"
|
||||||
@ -324,6 +366,15 @@ dependencies = [
|
|||||||
"byteorder",
|
"byteorder",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "generic-array"
|
||||||
|
version = "0.12.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c68f0274ae0e023facc3c97b2e00f076be70e254bc851d972503b328db79b2ec"
|
||||||
|
dependencies = [
|
||||||
|
"typenum",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "getrandom"
|
name = "getrandom"
|
||||||
version = "0.1.14"
|
version = "0.1.14"
|
||||||
@ -378,9 +429,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "heed"
|
name = "heed"
|
||||||
version = "0.10.1"
|
version = "0.10.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797"
|
checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"heed-traits",
|
"heed-traits",
|
||||||
@ -617,9 +668,12 @@ dependencies = [
|
|||||||
"maplit",
|
"maplit",
|
||||||
"memmap",
|
"memmap",
|
||||||
"near-proximity",
|
"near-proximity",
|
||||||
|
"num-traits",
|
||||||
"obkv",
|
"obkv",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"ordered-float",
|
"ordered-float",
|
||||||
|
"pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)",
|
||||||
|
"pest_derive",
|
||||||
"rayon",
|
"rayon",
|
||||||
"ringtail",
|
"ringtail",
|
||||||
"roaring",
|
"roaring",
|
||||||
@ -675,9 +729,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num-traits"
|
name = "num-traits"
|
||||||
version = "0.2.12"
|
version = "0.2.14"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611"
|
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"autocfg",
|
"autocfg",
|
||||||
]
|
]
|
||||||
@ -716,6 +770,12 @@ version = "11.1.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c"
|
checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "opaque-debug"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ordered-float"
|
name = "ordered-float"
|
||||||
version = "2.0.0"
|
version = "2.0.0"
|
||||||
@ -741,6 +801,57 @@ version = "2.1.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
|
checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
|
||||||
|
dependencies = [
|
||||||
|
"ucd-trie",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
|
||||||
|
dependencies = [
|
||||||
|
"ucd-trie",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest_derive"
|
||||||
|
version = "2.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0"
|
||||||
|
dependencies = [
|
||||||
|
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"pest_generator",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest_generator"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55"
|
||||||
|
dependencies = [
|
||||||
|
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"pest_meta",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest_meta"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d"
|
||||||
|
dependencies = [
|
||||||
|
"maplit",
|
||||||
|
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"sha-1",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pkg-config"
|
name = "pkg-config"
|
||||||
version = "0.3.19"
|
version = "0.3.19"
|
||||||
@ -1025,6 +1136,18 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sha-1"
|
||||||
|
version = "0.8.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df"
|
||||||
|
dependencies = [
|
||||||
|
"block-buffer",
|
||||||
|
"digest",
|
||||||
|
"fake-simd",
|
||||||
|
"opaque-debug",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "slice-group-by"
|
name = "slice-group-by"
|
||||||
version = "0.2.6"
|
version = "0.2.6"
|
||||||
@ -1233,6 +1356,18 @@ version = "0.1.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "typenum"
|
||||||
|
version = "1.12.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ucd-trie"
|
||||||
|
version = "0.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-bidi"
|
name = "unicode-bidi"
|
||||||
version = "0.3.4"
|
version = "0.3.4"
|
||||||
|
@ -14,13 +14,14 @@ flate2 = "1.0.17"
|
|||||||
fst = "0.4.4"
|
fst = "0.4.4"
|
||||||
fxhash = "0.2.1"
|
fxhash = "0.2.1"
|
||||||
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
|
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
|
||||||
heed = { version = "0.10.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
heed = { version = "0.10.4", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
||||||
human_format = "1.0.3"
|
human_format = "1.0.3"
|
||||||
jemallocator = "0.3.2"
|
jemallocator = "0.3.2"
|
||||||
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||||
linked-hash-map = "0.5.3"
|
linked-hash-map = "0.5.3"
|
||||||
memmap = "0.7.0"
|
memmap = "0.7.0"
|
||||||
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" }
|
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" }
|
||||||
|
num-traits = "0.2.14"
|
||||||
obkv = "0.1.0"
|
obkv = "0.1.0"
|
||||||
once_cell = "1.4.0"
|
once_cell = "1.4.0"
|
||||||
ordered-float = "2.0.0"
|
ordered-float = "2.0.0"
|
||||||
@ -36,6 +37,10 @@ structopt = { version = "0.3.14", default-features = false, features = ["wrap_he
|
|||||||
tempfile = "3.1.0"
|
tempfile = "3.1.0"
|
||||||
uuid = { version = "0.8.1", features = ["v4"] }
|
uuid = { version = "0.8.1", features = ["v4"] }
|
||||||
|
|
||||||
|
# facet filter parser
|
||||||
|
pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" }
|
||||||
|
pest_derive = "2.1.0"
|
||||||
|
|
||||||
# documents words self-join
|
# documents words self-join
|
||||||
itertools = "0.9.0"
|
itertools = "0.9.0"
|
||||||
|
|
||||||
|
70
http-ui/Cargo.lock
generated
70
http-ui/Cargo.lock
generated
@ -654,9 +654,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "heed"
|
name = "heed"
|
||||||
version = "0.10.1"
|
version = "0.10.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797"
|
checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"heed-traits",
|
"heed-traits",
|
||||||
@ -934,6 +934,12 @@ dependencies = [
|
|||||||
"cfg-if 0.1.10",
|
"cfg-if 0.1.10",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "maplit"
|
||||||
|
version = "1.0.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "matches"
|
name = "matches"
|
||||||
version = "0.1.8"
|
version = "0.1.8"
|
||||||
@ -987,9 +993,12 @@ dependencies = [
|
|||||||
"log",
|
"log",
|
||||||
"memmap",
|
"memmap",
|
||||||
"near-proximity",
|
"near-proximity",
|
||||||
|
"num-traits",
|
||||||
"obkv",
|
"obkv",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"ordered-float",
|
"ordered-float",
|
||||||
|
"pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)",
|
||||||
|
"pest_derive",
|
||||||
"rayon",
|
"rayon",
|
||||||
"ringtail",
|
"ringtail",
|
||||||
"roaring",
|
"roaring",
|
||||||
@ -1231,6 +1240,57 @@ version = "2.1.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
|
checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
|
||||||
|
dependencies = [
|
||||||
|
"ucd-trie",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
|
||||||
|
dependencies = [
|
||||||
|
"ucd-trie",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest_derive"
|
||||||
|
version = "2.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0"
|
||||||
|
dependencies = [
|
||||||
|
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"pest_generator",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest_generator"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55"
|
||||||
|
dependencies = [
|
||||||
|
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"pest_meta",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest_meta"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d"
|
||||||
|
dependencies = [
|
||||||
|
"maplit",
|
||||||
|
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"sha-1 0.8.2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pin-project"
|
name = "pin-project"
|
||||||
version = "0.4.27"
|
version = "0.4.27"
|
||||||
@ -2024,6 +2084,12 @@ version = "1.12.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33"
|
checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ucd-trie"
|
||||||
|
version = "0.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicase"
|
name = "unicase"
|
||||||
version = "2.6.0"
|
version = "2.6.0"
|
||||||
|
@ -8,7 +8,7 @@ edition = "2018"
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow = "1.0.28"
|
anyhow = "1.0.28"
|
||||||
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
|
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
|
||||||
heed = "0.10.1"
|
heed = "0.10.4"
|
||||||
memmap = "0.7.0"
|
memmap = "0.7.0"
|
||||||
milli = { path = ".." }
|
milli = { path = ".." }
|
||||||
once_cell = "1.4.1"
|
once_cell = "1.4.1"
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
var request = null;
|
var request = null;
|
||||||
var timeoutID = null;
|
var timeoutID = null;
|
||||||
|
|
||||||
$('#search').on('input', function () {
|
$('#query, #facet').on('input', function () {
|
||||||
var query = $(this).val();
|
var query = $('#query').val();
|
||||||
|
var facet = $('#facet').val();
|
||||||
var timeoutMs = 100;
|
var timeoutMs = 100;
|
||||||
|
|
||||||
if (timeoutID !== null) {
|
if (timeoutID !== null) {
|
||||||
@ -14,7 +15,7 @@ $('#search').on('input', function () {
|
|||||||
type: "POST",
|
type: "POST",
|
||||||
url: "query",
|
url: "query",
|
||||||
contentType: 'application/json',
|
contentType: 'application/json',
|
||||||
data: JSON.stringify({ 'query': query }),
|
data: JSON.stringify({ 'query': query, 'facetCondition': facet }),
|
||||||
contentType: 'application/json',
|
contentType: 'application/json',
|
||||||
success: function (data, textStatus, request) {
|
success: function (data, textStatus, request) {
|
||||||
results.innerHTML = '';
|
results.innerHTML = '';
|
||||||
@ -77,5 +78,5 @@ $('#db-size').text(function(index, text) {
|
|||||||
// We trigger the input when we load the script, this way
|
// We trigger the input when we load the script, this way
|
||||||
// we execute a placeholder search when the input is empty.
|
// we execute a placeholder search when the input is empty.
|
||||||
$(window).on('load', function () {
|
$(window).on('load', function () {
|
||||||
$('#search').trigger('input');
|
$('#query').trigger('input');
|
||||||
});
|
});
|
||||||
|
@ -2,6 +2,7 @@ use std::borrow::Cow;
|
|||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::fs::{File, create_dir_all};
|
use std::fs::{File, create_dir_all};
|
||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
|
use std::num::NonZeroUsize;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
@ -28,7 +29,7 @@ use warp::{Filter, http::Response};
|
|||||||
use milli::tokenizer::{simple_tokenizer, TokenType};
|
use milli::tokenizer::{simple_tokenizer, TokenType};
|
||||||
use milli::update::UpdateIndexingStep::*;
|
use milli::update::UpdateIndexingStep::*;
|
||||||
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
|
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
|
||||||
use milli::{obkv_to_json, Index, UpdateStore, SearchResult};
|
use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
|
||||||
|
|
||||||
static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
|
static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
|
||||||
|
|
||||||
@ -196,6 +197,7 @@ enum UpdateMeta {
|
|||||||
DocumentsAddition { method: String, format: String },
|
DocumentsAddition { method: String, format: String },
|
||||||
ClearDocuments,
|
ClearDocuments,
|
||||||
Settings(Settings),
|
Settings(Settings),
|
||||||
|
Facets(Facets),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
@ -231,6 +233,14 @@ struct Settings {
|
|||||||
faceted_attributes: Option<HashMap<String, String>>,
|
faceted_attributes: Option<HashMap<String, String>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(deny_unknown_fields)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
struct Facets {
|
||||||
|
level_group_size: Option<NonZeroUsize>,
|
||||||
|
min_level_size: Option<NonZeroUsize>,
|
||||||
|
}
|
||||||
|
|
||||||
// Any value that is present is considered Some value, including null.
|
// Any value that is present is considered Some value, including null.
|
||||||
fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
|
fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
|
||||||
where T: Deserialize<'de>,
|
where T: Deserialize<'de>,
|
||||||
@ -399,6 +409,21 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
Ok(_count) => wtxn.commit().map_err(Into::into),
|
Ok(_count) => wtxn.commit().map_err(Into::into),
|
||||||
Err(e) => Err(e.into())
|
Err(e) => Err(e.into())
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
UpdateMeta::Facets(levels) => {
|
||||||
|
// We must use the write transaction of the update here.
|
||||||
|
let mut wtxn = index_cloned.write_txn()?;
|
||||||
|
let mut builder = update_builder.facets(&mut wtxn, &index_cloned);
|
||||||
|
if let Some(value) = levels.level_group_size {
|
||||||
|
builder.level_group_size(value);
|
||||||
|
}
|
||||||
|
if let Some(value) = levels.min_level_size {
|
||||||
|
builder.min_level_size(value);
|
||||||
|
}
|
||||||
|
match builder.execute() {
|
||||||
|
Ok(()) => wtxn.commit().map_err(Into::into),
|
||||||
|
Err(e) => Err(e.into())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -550,9 +575,12 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
.body(include_str!("../public/logo-black.svg"))
|
.body(include_str!("../public/logo-black.svg"))
|
||||||
);
|
);
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
|
#[serde(deny_unknown_fields)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
struct QueryBody {
|
struct QueryBody {
|
||||||
query: Option<String>,
|
query: Option<String>,
|
||||||
|
facet_condition: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
let disable_highlighting = opt.disable_highlighting;
|
let disable_highlighting = opt.disable_highlighting;
|
||||||
@ -569,6 +597,12 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
if let Some(query) = query.query {
|
if let Some(query) = query.query {
|
||||||
search.query(query);
|
search.query(query);
|
||||||
}
|
}
|
||||||
|
if let Some(condition) = query.facet_condition {
|
||||||
|
if !condition.trim().is_empty() {
|
||||||
|
let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap();
|
||||||
|
search.facet_condition(condition);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let SearchResult { found_words, documents_ids } = search.execute().unwrap();
|
let SearchResult { found_words, documents_ids } = search.execute().unwrap();
|
||||||
|
|
||||||
@ -751,6 +785,19 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
Ok(warp::reply())
|
Ok(warp::reply())
|
||||||
});
|
});
|
||||||
|
|
||||||
|
let update_store_cloned = update_store.clone();
|
||||||
|
let update_status_sender_cloned = update_status_sender.clone();
|
||||||
|
let change_facet_levels_route = warp::filters::method::post()
|
||||||
|
.and(warp::path!("facet-level-sizes"))
|
||||||
|
.and(warp::body::json())
|
||||||
|
.map(move |levels: Facets| {
|
||||||
|
let meta = UpdateMeta::Facets(levels);
|
||||||
|
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
|
||||||
|
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
|
||||||
|
eprintln!("update {} registered", update_id);
|
||||||
|
warp::reply()
|
||||||
|
});
|
||||||
|
|
||||||
let update_ws_route = warp::ws()
|
let update_ws_route = warp::ws()
|
||||||
.and(warp::path!("updates" / "ws"))
|
.and(warp::path!("updates" / "ws"))
|
||||||
.map(move |ws: warp::ws::Ws| {
|
.map(move |ws: warp::ws::Ws| {
|
||||||
@ -799,6 +846,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
.or(indexing_json_stream_route)
|
.or(indexing_json_stream_route)
|
||||||
.or(clearing_route)
|
.or(clearing_route)
|
||||||
.or(change_settings_route)
|
.or(change_settings_route)
|
||||||
|
.or(change_facet_levels_route)
|
||||||
.or(update_ws_route);
|
.or(update_ws_route);
|
||||||
|
|
||||||
let addr = SocketAddr::from_str(&opt.http_listen_addr)?;
|
let addr = SocketAddr::from_str(&opt.http_listen_addr)?;
|
||||||
|
@ -55,7 +55,8 @@
|
|||||||
<div class="level-left">
|
<div class="level-left">
|
||||||
<div class="level-item">
|
<div class="level-item">
|
||||||
<div class="field has-addons has-addons-right">
|
<div class="field has-addons has-addons-right">
|
||||||
<input id="search" class="input" type="text" autofocus placeholder="e.g. George Clooney">
|
<input id="query" class="input" type="text" autofocus placeholder="e.g. George Clooney">
|
||||||
|
<input id="facet" class="input" type="text" placeholder="facet filter like released >= 1577836800">
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="level-item"></div>
|
<div class="level-item"></div>
|
||||||
|
86
src/heed_codec/facet/facet_level_value_f64_codec.rs
Normal file
86
src/heed_codec/facet/facet_level_value_f64_codec.rs
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
|
||||||
|
use crate::facet::value_encoding::f64_into_bytes;
|
||||||
|
|
||||||
|
// TODO do not de/serialize right bound when level = 0
|
||||||
|
pub struct FacetLevelValueF64Codec;
|
||||||
|
|
||||||
|
impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
|
||||||
|
type DItem = (u8, u8, f64, f64);
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let (field_id, bytes) = bytes.split_first()?;
|
||||||
|
let (level, bytes) = bytes.split_first()?;
|
||||||
|
|
||||||
|
let (left, right) = if *level != 0 {
|
||||||
|
let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?;
|
||||||
|
let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?;
|
||||||
|
(left, right)
|
||||||
|
} else {
|
||||||
|
let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?;
|
||||||
|
(left, left)
|
||||||
|
};
|
||||||
|
|
||||||
|
Some((*field_id, *level, left, right))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
|
||||||
|
type EItem = (u8, u8, f64, f64);
|
||||||
|
|
||||||
|
fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
let mut buffer = [0u8; 32];
|
||||||
|
|
||||||
|
let len = if *level != 0 {
|
||||||
|
// Write the globally ordered floats.
|
||||||
|
let bytes = f64_into_bytes(*left)?;
|
||||||
|
buffer[..8].copy_from_slice(&bytes[..]);
|
||||||
|
|
||||||
|
let bytes = f64_into_bytes(*right)?;
|
||||||
|
buffer[8..16].copy_from_slice(&bytes[..]);
|
||||||
|
|
||||||
|
// Then the f64 values just to be able to read them back.
|
||||||
|
let bytes = left.to_be_bytes();
|
||||||
|
buffer[16..24].copy_from_slice(&bytes[..]);
|
||||||
|
|
||||||
|
let bytes = right.to_be_bytes();
|
||||||
|
buffer[24..].copy_from_slice(&bytes[..]);
|
||||||
|
|
||||||
|
32 // length
|
||||||
|
} else {
|
||||||
|
// Write the globally ordered floats.
|
||||||
|
let bytes = f64_into_bytes(*left)?;
|
||||||
|
buffer[..8].copy_from_slice(&bytes[..]);
|
||||||
|
|
||||||
|
// Then the f64 values just to be able to read them back.
|
||||||
|
let bytes = left.to_be_bytes();
|
||||||
|
buffer[8..16].copy_from_slice(&bytes[..]);
|
||||||
|
|
||||||
|
16 // length
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut bytes = Vec::with_capacity(len + 2);
|
||||||
|
bytes.push(*field_id);
|
||||||
|
bytes.push(*level);
|
||||||
|
bytes.extend_from_slice(&buffer[..len]);
|
||||||
|
Some(Cow::Owned(bytes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use heed::{BytesEncode, BytesDecode};
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn globally_ordered_f64() {
|
||||||
|
let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap();
|
||||||
|
let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap();
|
||||||
|
assert_eq!((name, level, left, right), (3, 0, 32.0, 32.0));
|
||||||
|
|
||||||
|
let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap();
|
||||||
|
let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap();
|
||||||
|
assert_eq!((name, level, left, right), (3, 1, -32.0, 32.0));
|
||||||
|
}
|
||||||
|
}
|
43
src/heed_codec/facet/facet_level_value_i64_codec.rs
Normal file
43
src/heed_codec/facet/facet_level_value_i64_codec.rs
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
|
||||||
|
use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes};
|
||||||
|
|
||||||
|
pub struct FacetLevelValueI64Codec;
|
||||||
|
|
||||||
|
impl<'a> heed::BytesDecode<'a> for FacetLevelValueI64Codec {
|
||||||
|
type DItem = (u8, u8, i64, i64);
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let (field_id, bytes) = bytes.split_first()?;
|
||||||
|
let (level, bytes) = bytes.split_first()?;
|
||||||
|
|
||||||
|
let left = bytes[..8].try_into().map(i64_from_bytes).ok()?;
|
||||||
|
let right = if *level != 0 {
|
||||||
|
bytes[8..].try_into().map(i64_from_bytes).ok()?
|
||||||
|
} else {
|
||||||
|
left
|
||||||
|
};
|
||||||
|
|
||||||
|
Some((*field_id, *level, left, right))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl heed::BytesEncode<'_> for FacetLevelValueI64Codec {
|
||||||
|
type EItem = (u8, u8, i64, i64);
|
||||||
|
|
||||||
|
fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
let left = i64_into_bytes(*left);
|
||||||
|
let right = i64_into_bytes(*right);
|
||||||
|
|
||||||
|
let mut bytes = Vec::with_capacity(2 + left.len() + right.len());
|
||||||
|
bytes.push(*field_id);
|
||||||
|
bytes.push(*level);
|
||||||
|
bytes.extend_from_slice(&left[..]);
|
||||||
|
if *level != 0 {
|
||||||
|
bytes.extend_from_slice(&right[..]);
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(Cow::Owned(bytes))
|
||||||
|
}
|
||||||
|
}
|
@ -1,50 +0,0 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
use std::convert::TryInto;
|
|
||||||
|
|
||||||
use crate::facet::value_encoding::f64_into_bytes;
|
|
||||||
|
|
||||||
pub struct FacetValueF64Codec;
|
|
||||||
|
|
||||||
impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec {
|
|
||||||
type DItem = (u8, f64);
|
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
|
||||||
let (field_id, buffer) = bytes.split_first()?;
|
|
||||||
let value = buffer[8..].try_into().ok().map(f64::from_be_bytes)?;
|
|
||||||
Some((*field_id, value))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl heed::BytesEncode<'_> for FacetValueF64Codec {
|
|
||||||
type EItem = (u8, f64);
|
|
||||||
|
|
||||||
fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
|
||||||
let mut buffer = [0u8; 16];
|
|
||||||
|
|
||||||
// Write the globally ordered float.
|
|
||||||
let bytes = f64_into_bytes(*value)?;
|
|
||||||
buffer[..8].copy_from_slice(&bytes[..]);
|
|
||||||
|
|
||||||
// Then the f64 value just to be able to read it back.
|
|
||||||
let bytes = value.to_be_bytes();
|
|
||||||
buffer[8..].copy_from_slice(&bytes[..]);
|
|
||||||
|
|
||||||
let mut bytes = Vec::with_capacity(buffer.len() + 1);
|
|
||||||
bytes.push(*field_id);
|
|
||||||
bytes.extend_from_slice(&buffer[..]);
|
|
||||||
Some(Cow::Owned(bytes))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use heed::{BytesEncode, BytesDecode};
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn globally_ordered_f64() {
|
|
||||||
let bytes = FacetValueF64Codec::bytes_encode(&(3, -32.0)).unwrap();
|
|
||||||
let (name, value) = FacetValueF64Codec::bytes_decode(&bytes).unwrap();
|
|
||||||
assert_eq!((name, value), (3, -32.0));
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,28 +0,0 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
use std::convert::TryInto;
|
|
||||||
|
|
||||||
use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes};
|
|
||||||
|
|
||||||
pub struct FacetValueI64Codec;
|
|
||||||
|
|
||||||
impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec {
|
|
||||||
type DItem = (u8, i64);
|
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
|
||||||
let (field_id, buffer) = bytes.split_first()?;
|
|
||||||
let value = buffer.try_into().map(i64_from_bytes).ok()?;
|
|
||||||
Some((*field_id, value))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl heed::BytesEncode<'_> for FacetValueI64Codec {
|
|
||||||
type EItem = (u8, i64);
|
|
||||||
|
|
||||||
fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
|
||||||
let value = i64_into_bytes(*value);
|
|
||||||
let mut bytes = Vec::with_capacity(value.len() + 1);
|
|
||||||
bytes.push(*field_id);
|
|
||||||
bytes.extend_from_slice(&value[..]);
|
|
||||||
Some(Cow::Owned(bytes))
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,7 +1,7 @@
|
|||||||
mod facet_value_f64_codec;
|
mod facet_level_value_f64_codec;
|
||||||
mod facet_value_i64_codec;
|
mod facet_level_value_i64_codec;
|
||||||
mod facet_value_string_codec;
|
mod facet_value_string_codec;
|
||||||
|
|
||||||
pub use self::facet_value_f64_codec::FacetValueF64Codec;
|
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
|
||||||
pub use self::facet_value_i64_codec::FacetValueI64Codec;
|
pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec;
|
||||||
pub use self::facet_value_string_codec::FacetValueStringCodec;
|
pub use self::facet_value_string_codec::FacetValueStringCodec;
|
||||||
|
22
src/index.rs
22
src/index.rs
@ -18,6 +18,7 @@ use crate::{
|
|||||||
|
|
||||||
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
|
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
|
||||||
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
|
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
|
||||||
|
pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids";
|
||||||
pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
|
pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
|
||||||
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
|
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
|
||||||
pub const PRIMARY_KEY_KEY: &str = "primary-key";
|
pub const PRIMARY_KEY_KEY: &str = "primary-key";
|
||||||
@ -224,6 +225,27 @@ impl Index {
|
|||||||
Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default())
|
Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* faceted documents ids */
|
||||||
|
|
||||||
|
/// Writes the documents ids that are faceted under this field id.
|
||||||
|
pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: u8, docids: &RoaringBitmap) -> heed::Result<()> {
|
||||||
|
let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||||
|
buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||||
|
*buffer.last_mut().unwrap() = field_id;
|
||||||
|
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Retrieve all the documents ids that faceted under this field id.
|
||||||
|
pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: u8) -> heed::Result<RoaringBitmap> {
|
||||||
|
let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||||
|
buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||||
|
*buffer.last_mut().unwrap() = field_id;
|
||||||
|
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
|
||||||
|
Some(docids) => Ok(docids),
|
||||||
|
None => Ok(RoaringBitmap::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* words fst */
|
/* words fst */
|
||||||
|
|
||||||
/// Writes the FST which is the words dictionnary of the engine.
|
/// Writes the FST which is the words dictionnary of the engine.
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
#[macro_use] extern crate pest_derive;
|
||||||
|
|
||||||
mod criterion;
|
mod criterion;
|
||||||
mod external_documents_ids;
|
mod external_documents_ids;
|
||||||
mod fields_ids_map;
|
mod fields_ids_map;
|
||||||
@ -24,7 +26,7 @@ pub use self::criterion::{Criterion, default_criteria};
|
|||||||
pub use self::external_documents_ids::ExternalDocumentsIds;
|
pub use self::external_documents_ids::ExternalDocumentsIds;
|
||||||
pub use self::fields_ids_map::FieldsIdsMap;
|
pub use self::fields_ids_map::FieldsIdsMap;
|
||||||
pub use self::index::Index;
|
pub use self::index::Index;
|
||||||
pub use self::search::{Search, SearchResult};
|
pub use self::search::{Search, FacetCondition, SearchResult};
|
||||||
pub use self::heed_codec::{
|
pub use self::heed_codec::{
|
||||||
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
|
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
|
||||||
ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||||
|
29
src/search/facet/grammar.pest
Normal file
29
src/search/facet/grammar.pest
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
key = _{quoted | word}
|
||||||
|
value = _{quoted | word}
|
||||||
|
quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP }
|
||||||
|
string = {char*}
|
||||||
|
word = ${(LETTER | NUMBER | "_" | "-" | ".")+}
|
||||||
|
|
||||||
|
char = _{ !(PEEK | "\\") ~ ANY
|
||||||
|
| "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t")
|
||||||
|
| "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}
|
||||||
|
|
||||||
|
condition = _{between | eq | greater | less | geq | leq | neq}
|
||||||
|
between = {key ~ value ~ "TO" ~ value}
|
||||||
|
geq = {key ~ ">=" ~ value}
|
||||||
|
leq = {key ~ "<=" ~ value}
|
||||||
|
neq = {key ~ "!=" ~ value}
|
||||||
|
eq = {key ~ "=" ~ value}
|
||||||
|
greater = {key ~ ">" ~ value}
|
||||||
|
less = {key ~ "<" ~ value}
|
||||||
|
|
||||||
|
prgm = {SOI ~ expr ~ EOI}
|
||||||
|
expr = _{ ( term ~ (operation ~ term)* ) }
|
||||||
|
term = { ("(" ~ expr ~ ")") | condition | not }
|
||||||
|
operation = _{ and | or }
|
||||||
|
and = {"AND"}
|
||||||
|
or = {"OR"}
|
||||||
|
|
||||||
|
not = {"NOT" ~ term}
|
||||||
|
|
||||||
|
WHITESPACE = _{ " " }
|
655
src/search/facet/mod.rs
Normal file
655
src/search/facet/mod.rs
Normal file
@ -0,0 +1,655 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fmt::Debug;
|
||||||
|
use std::ops::Bound::{self, Unbounded, Included, Excluded};
|
||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use heed::types::{ByteSlice, DecodeIgnore};
|
||||||
|
use log::debug;
|
||||||
|
use num_traits::Bounded;
|
||||||
|
use parser::{PREC_CLIMBER, FilterParser};
|
||||||
|
use pest::error::{Error as PestError, ErrorVariant};
|
||||||
|
use pest::iterators::{Pair, Pairs};
|
||||||
|
use pest::Parser;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::facet::FacetType;
|
||||||
|
use crate::heed_codec::facet::FacetValueStringCodec;
|
||||||
|
use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec};
|
||||||
|
use crate::{Index, FieldsIdsMap, CboRoaringBitmapCodec};
|
||||||
|
|
||||||
|
use self::FacetCondition::*;
|
||||||
|
use self::FacetNumberOperator::*;
|
||||||
|
use self::parser::Rule;
|
||||||
|
|
||||||
|
mod parser;
|
||||||
|
|
||||||
|
#[derive(Debug, Copy, Clone, PartialEq)]
|
||||||
|
pub enum FacetNumberOperator<T> {
|
||||||
|
GreaterThan(T),
|
||||||
|
GreaterThanOrEqual(T),
|
||||||
|
Equal(T),
|
||||||
|
NotEqual(T),
|
||||||
|
LowerThan(T),
|
||||||
|
LowerThanOrEqual(T),
|
||||||
|
Between(T, T),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> FacetNumberOperator<T> {
|
||||||
|
/// This method can return two operations in case it must express
|
||||||
|
/// an OR operation for the between case (i.e. `TO`).
|
||||||
|
fn negate(self) -> (Self, Option<Self>) {
|
||||||
|
match self {
|
||||||
|
GreaterThan(x) => (LowerThanOrEqual(x), None),
|
||||||
|
GreaterThanOrEqual(x) => (LowerThan(x), None),
|
||||||
|
Equal(x) => (NotEqual(x), None),
|
||||||
|
NotEqual(x) => (Equal(x), None),
|
||||||
|
LowerThan(x) => (GreaterThanOrEqual(x), None),
|
||||||
|
LowerThanOrEqual(x) => (GreaterThan(x), None),
|
||||||
|
Between(x, y) => (LowerThan(x), Some(GreaterThan(y))),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
|
pub enum FacetStringOperator {
|
||||||
|
Equal(String),
|
||||||
|
NotEqual(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FacetStringOperator {
|
||||||
|
fn negate(self) -> Self {
|
||||||
|
match self {
|
||||||
|
FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x),
|
||||||
|
FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
|
pub enum FacetCondition {
|
||||||
|
OperatorI64(u8, FacetNumberOperator<i64>),
|
||||||
|
OperatorF64(u8, FacetNumberOperator<f64>),
|
||||||
|
OperatorString(u8, FacetStringOperator),
|
||||||
|
Or(Box<Self>, Box<Self>),
|
||||||
|
And(Box<Self>, Box<Self>),
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_field_id_facet_type<'a>(
|
||||||
|
fields_ids_map: &FieldsIdsMap,
|
||||||
|
faceted_fields: &HashMap<u8, FacetType>,
|
||||||
|
items: &mut Pairs<'a, Rule>,
|
||||||
|
) -> Result<(u8, FacetType), PestError<Rule>>
|
||||||
|
{
|
||||||
|
// lexing ensures that we at least have a key
|
||||||
|
let key = items.next().unwrap();
|
||||||
|
let field_id = fields_ids_map
|
||||||
|
.id(key.as_str())
|
||||||
|
.ok_or_else(|| {
|
||||||
|
PestError::new_from_span(
|
||||||
|
ErrorVariant::CustomError {
|
||||||
|
message: format!(
|
||||||
|
"attribute `{}` not found, available attributes are: {}",
|
||||||
|
key.as_str(),
|
||||||
|
fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", ")
|
||||||
|
),
|
||||||
|
},
|
||||||
|
key.as_span(),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let facet_type = faceted_fields
|
||||||
|
.get(&field_id)
|
||||||
|
.copied()
|
||||||
|
.ok_or_else(|| {
|
||||||
|
PestError::new_from_span(
|
||||||
|
ErrorVariant::CustomError {
|
||||||
|
message: format!(
|
||||||
|
"attribute `{}` is not faceted, available faceted attributes are: {}",
|
||||||
|
key.as_str(),
|
||||||
|
faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::<Vec<_>>().join(", ")
|
||||||
|
),
|
||||||
|
},
|
||||||
|
key.as_span(),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok((field_id, facet_type))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn pest_parse<T>(pair: Pair<Rule>) -> Result<T, pest::error::Error<Rule>>
|
||||||
|
where T: FromStr,
|
||||||
|
T::Err: ToString,
|
||||||
|
{
|
||||||
|
match pair.as_str().parse() {
|
||||||
|
Ok(value) => Ok(value),
|
||||||
|
Err(e) => {
|
||||||
|
Err(PestError::<Rule>::new_from_span(
|
||||||
|
ErrorVariant::CustomError { message: e.to_string() },
|
||||||
|
pair.as_span(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FacetCondition {
|
||||||
|
pub fn from_str(
|
||||||
|
rtxn: &heed::RoTxn,
|
||||||
|
index: &Index,
|
||||||
|
expression: &str,
|
||||||
|
) -> anyhow::Result<FacetCondition>
|
||||||
|
{
|
||||||
|
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||||
|
let faceted_fields = index.faceted_fields(rtxn)?;
|
||||||
|
let lexed = FilterParser::parse(Rule::prgm, expression)?;
|
||||||
|
FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn from_pairs(
|
||||||
|
fim: &FieldsIdsMap,
|
||||||
|
ff: &HashMap<u8, FacetType>,
|
||||||
|
expression: Pairs<Rule>,
|
||||||
|
) -> anyhow::Result<Self>
|
||||||
|
{
|
||||||
|
PREC_CLIMBER.climb(
|
||||||
|
expression,
|
||||||
|
|pair: Pair<Rule>| match pair.as_rule() {
|
||||||
|
Rule::greater => Ok(Self::greater_than(fim, ff, pair)?),
|
||||||
|
Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?),
|
||||||
|
Rule::eq => Ok(Self::equal(fim, ff, pair)?),
|
||||||
|
Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()),
|
||||||
|
Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?),
|
||||||
|
Rule::less => Ok(Self::lower_than(fim, ff, pair)?),
|
||||||
|
Rule::between => Ok(Self::between(fim, ff, pair)?),
|
||||||
|
Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()),
|
||||||
|
Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()),
|
||||||
|
Rule::term => Self::from_pairs(fim, ff, pair.into_inner()),
|
||||||
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
|lhs: anyhow::Result<Self>, op: Pair<Rule>, rhs: anyhow::Result<Self>| {
|
||||||
|
match op.as_rule() {
|
||||||
|
Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))),
|
||||||
|
Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))),
|
||||||
|
_ => unreachable!(),
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn negate(self) -> FacetCondition {
|
||||||
|
match self {
|
||||||
|
OperatorI64(fid, op) => match op.negate() {
|
||||||
|
(op, None) => OperatorI64(fid, op),
|
||||||
|
(a, Some(b)) => Or(Box::new(OperatorI64(fid, a)), Box::new(OperatorI64(fid, b))),
|
||||||
|
},
|
||||||
|
OperatorF64(fid, op) => match op.negate() {
|
||||||
|
(op, None) => OperatorF64(fid, op),
|
||||||
|
(a, Some(b)) => Or(Box::new(OperatorF64(fid, a)), Box::new(OperatorF64(fid, b))),
|
||||||
|
},
|
||||||
|
OperatorString(fid, op) => OperatorString(fid, op.negate()),
|
||||||
|
Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())),
|
||||||
|
And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn between(
|
||||||
|
fields_ids_map: &FieldsIdsMap,
|
||||||
|
faceted_fields: &HashMap<u8, FacetType>,
|
||||||
|
item: Pair<Rule>,
|
||||||
|
) -> anyhow::Result<FacetCondition>
|
||||||
|
{
|
||||||
|
let item_span = item.as_span();
|
||||||
|
let mut items = item.into_inner();
|
||||||
|
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
|
||||||
|
let lvalue = items.next().unwrap();
|
||||||
|
let rvalue = items.next().unwrap();
|
||||||
|
match ftype {
|
||||||
|
FacetType::Integer => {
|
||||||
|
let lvalue = pest_parse(lvalue)?;
|
||||||
|
let rvalue = pest_parse(rvalue)?;
|
||||||
|
Ok(OperatorI64(fid, Between(lvalue, rvalue)))
|
||||||
|
},
|
||||||
|
FacetType::Float => {
|
||||||
|
let lvalue = pest_parse(lvalue)?;
|
||||||
|
let rvalue = pest_parse(rvalue)?;
|
||||||
|
Ok(OperatorF64(fid, Between(lvalue, rvalue)))
|
||||||
|
},
|
||||||
|
FacetType::String => {
|
||||||
|
Err(PestError::<Rule>::new_from_span(
|
||||||
|
ErrorVariant::CustomError {
|
||||||
|
message: format!("invalid operator on a faceted string"),
|
||||||
|
},
|
||||||
|
item_span,
|
||||||
|
).into())
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn equal(
|
||||||
|
fields_ids_map: &FieldsIdsMap,
|
||||||
|
faceted_fields: &HashMap<u8, FacetType>,
|
||||||
|
item: Pair<Rule>,
|
||||||
|
) -> anyhow::Result<FacetCondition>
|
||||||
|
{
|
||||||
|
let mut items = item.into_inner();
|
||||||
|
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
|
||||||
|
let value = items.next().unwrap();
|
||||||
|
match ftype {
|
||||||
|
FacetType::Integer => Ok(OperatorI64(fid, Equal(pest_parse(value)?))),
|
||||||
|
FacetType::Float => Ok(OperatorF64(fid, Equal(pest_parse(value)?))),
|
||||||
|
FacetType::String => {
|
||||||
|
Ok(OperatorString(fid, FacetStringOperator::Equal(value.as_str().to_string())))
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn greater_than(
|
||||||
|
fields_ids_map: &FieldsIdsMap,
|
||||||
|
faceted_fields: &HashMap<u8, FacetType>,
|
||||||
|
item: Pair<Rule>,
|
||||||
|
) -> anyhow::Result<FacetCondition>
|
||||||
|
{
|
||||||
|
let item_span = item.as_span();
|
||||||
|
let mut items = item.into_inner();
|
||||||
|
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
|
||||||
|
let value = items.next().unwrap();
|
||||||
|
match ftype {
|
||||||
|
FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(pest_parse(value)?))),
|
||||||
|
FacetType::Float => Ok(OperatorF64(fid, GreaterThan(pest_parse(value)?))),
|
||||||
|
FacetType::String => {
|
||||||
|
Err(PestError::<Rule>::new_from_span(
|
||||||
|
ErrorVariant::CustomError {
|
||||||
|
message: format!("invalid operator on a faceted string"),
|
||||||
|
},
|
||||||
|
item_span,
|
||||||
|
).into())
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn greater_than_or_equal(
|
||||||
|
fields_ids_map: &FieldsIdsMap,
|
||||||
|
faceted_fields: &HashMap<u8, FacetType>,
|
||||||
|
item: Pair<Rule>,
|
||||||
|
) -> anyhow::Result<FacetCondition>
|
||||||
|
{
|
||||||
|
let item_span = item.as_span();
|
||||||
|
let mut items = item.into_inner();
|
||||||
|
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
|
||||||
|
let value = items.next().unwrap();
|
||||||
|
match ftype {
|
||||||
|
FacetType::Integer => Ok(OperatorI64(fid, GreaterThanOrEqual(pest_parse(value)?))),
|
||||||
|
FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(pest_parse(value)?))),
|
||||||
|
FacetType::String => {
|
||||||
|
Err(PestError::<Rule>::new_from_span(
|
||||||
|
ErrorVariant::CustomError {
|
||||||
|
message: format!("invalid operator on a faceted string"),
|
||||||
|
},
|
||||||
|
item_span,
|
||||||
|
).into())
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn lower_than(
|
||||||
|
fields_ids_map: &FieldsIdsMap,
|
||||||
|
faceted_fields: &HashMap<u8, FacetType>,
|
||||||
|
item: Pair<Rule>,
|
||||||
|
) -> anyhow::Result<FacetCondition>
|
||||||
|
{
|
||||||
|
let item_span = item.as_span();
|
||||||
|
let mut items = item.into_inner();
|
||||||
|
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
|
||||||
|
let value = items.next().unwrap();
|
||||||
|
match ftype {
|
||||||
|
FacetType::Integer => Ok(OperatorI64(fid, LowerThan(pest_parse(value)?))),
|
||||||
|
FacetType::Float => Ok(OperatorF64(fid, LowerThan(pest_parse(value)?))),
|
||||||
|
FacetType::String => {
|
||||||
|
Err(PestError::<Rule>::new_from_span(
|
||||||
|
ErrorVariant::CustomError {
|
||||||
|
message: format!("invalid operator on a faceted string"),
|
||||||
|
},
|
||||||
|
item_span,
|
||||||
|
).into())
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn lower_than_or_equal(
|
||||||
|
fields_ids_map: &FieldsIdsMap,
|
||||||
|
faceted_fields: &HashMap<u8, FacetType>,
|
||||||
|
item: Pair<Rule>,
|
||||||
|
) -> anyhow::Result<FacetCondition>
|
||||||
|
{
|
||||||
|
let item_span = item.as_span();
|
||||||
|
let mut items = item.into_inner();
|
||||||
|
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
|
||||||
|
let value = items.next().unwrap();
|
||||||
|
match ftype {
|
||||||
|
FacetType::Integer => Ok(OperatorI64(fid, LowerThanOrEqual(pest_parse(value)?))),
|
||||||
|
FacetType::Float => Ok(OperatorF64(fid, LowerThanOrEqual(pest_parse(value)?))),
|
||||||
|
FacetType::String => {
|
||||||
|
Err(PestError::<Rule>::new_from_span(
|
||||||
|
ErrorVariant::CustomError {
|
||||||
|
message: format!("invalid operator on a faceted string"),
|
||||||
|
},
|
||||||
|
item_span,
|
||||||
|
).into())
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FacetCondition {
|
||||||
|
/// Aggregates the documents ids that are part of the specified range automatically
|
||||||
|
/// going deeper through the levels.
|
||||||
|
fn explore_facet_levels<'t, T: 't, KC>(
|
||||||
|
rtxn: &'t heed::RoTxn,
|
||||||
|
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||||
|
field_id: u8,
|
||||||
|
level: u8,
|
||||||
|
left: Bound<T>,
|
||||||
|
right: Bound<T>,
|
||||||
|
output: &mut RoaringBitmap,
|
||||||
|
) -> anyhow::Result<()>
|
||||||
|
where
|
||||||
|
T: Copy + PartialEq + PartialOrd + Bounded + Debug,
|
||||||
|
KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
|
||||||
|
KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
|
||||||
|
{
|
||||||
|
match (left, right) {
|
||||||
|
// If the request is an exact value we must go directly to the deepest level.
|
||||||
|
(Included(l), Included(r)) if l == r && level > 0 => {
|
||||||
|
return Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, 0, left, right, output);
|
||||||
|
},
|
||||||
|
// lower TO upper when lower > upper must return no result
|
||||||
|
(Included(l), Included(r)) if l > r => return Ok(()),
|
||||||
|
(Included(l), Excluded(r)) if l >= r => return Ok(()),
|
||||||
|
(Excluded(l), Excluded(r)) if l >= r => return Ok(()),
|
||||||
|
(Excluded(l), Included(r)) if l >= r => return Ok(()),
|
||||||
|
(_, _) => (),
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut left_found = None;
|
||||||
|
let mut right_found = None;
|
||||||
|
|
||||||
|
// We must create a custom iterator to be able to iterate over the
|
||||||
|
// requested range as the range iterator cannot express some conditions.
|
||||||
|
let left_bound = match left {
|
||||||
|
Included(left) => Included((field_id, level, left, T::min_value())),
|
||||||
|
Excluded(left) => Excluded((field_id, level, left, T::min_value())),
|
||||||
|
Unbounded => Unbounded,
|
||||||
|
};
|
||||||
|
let right_bound = Included((field_id, level, T::max_value(), T::max_value()));
|
||||||
|
// We also make sure that we don't decode the data before we are sure we must return it.
|
||||||
|
let iter = db
|
||||||
|
.remap_key_type::<KC>()
|
||||||
|
.lazily_decode_data()
|
||||||
|
.range(rtxn, &(left_bound, right_bound))?
|
||||||
|
.take_while(|r| r.as_ref().map_or(true, |((.., r), _)| {
|
||||||
|
match right {
|
||||||
|
Included(right) => *r <= right,
|
||||||
|
Excluded(right) => *r < right,
|
||||||
|
Unbounded => true,
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
.map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data))));
|
||||||
|
|
||||||
|
debug!("Iterating between {:?} and {:?} (level {})", left, right, level);
|
||||||
|
|
||||||
|
for (i, result) in iter.enumerate() {
|
||||||
|
let ((_fid, level, l, r), docids) = result?;
|
||||||
|
debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len());
|
||||||
|
output.union_with(&docids);
|
||||||
|
// We save the leftest and rightest bounds we actually found at this level.
|
||||||
|
if i == 0 { left_found = Some(l); }
|
||||||
|
right_found = Some(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Can we go deeper?
|
||||||
|
let deeper_level = match level.checked_sub(1) {
|
||||||
|
Some(level) => level,
|
||||||
|
None => return Ok(()),
|
||||||
|
};
|
||||||
|
|
||||||
|
// We must refine the left and right bounds of this range by retrieving the
|
||||||
|
// missing part in a deeper level.
|
||||||
|
match left_found.zip(right_found) {
|
||||||
|
Some((left_found, right_found)) => {
|
||||||
|
// If the bound is satisfied we avoid calling this function again.
|
||||||
|
if !matches!(left, Included(l) if l == left_found) {
|
||||||
|
let sub_right = Excluded(left_found);
|
||||||
|
debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level);
|
||||||
|
Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, left, sub_right, output)?;
|
||||||
|
}
|
||||||
|
if !matches!(right, Included(r) if r == right_found) {
|
||||||
|
let sub_left = Excluded(right_found);
|
||||||
|
debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level);
|
||||||
|
Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, sub_left, right, output)?;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => {
|
||||||
|
// If we found nothing at this level it means that we must find
|
||||||
|
// the same bounds but at a deeper, more precise level.
|
||||||
|
Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, left, right, output)?;
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn evaluate_number_operator<'t, T: 't, KC>(
|
||||||
|
rtxn: &'t heed::RoTxn,
|
||||||
|
index: &Index,
|
||||||
|
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||||
|
field_id: u8,
|
||||||
|
operator: FacetNumberOperator<T>,
|
||||||
|
) -> anyhow::Result<RoaringBitmap>
|
||||||
|
where
|
||||||
|
T: Copy + PartialEq + PartialOrd + Bounded + Debug,
|
||||||
|
KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
|
||||||
|
KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
|
||||||
|
{
|
||||||
|
// Make sure we always bound the ranges with the field id and the level,
|
||||||
|
// as the facets values are all in the same database and prefixed by the
|
||||||
|
// field id and the level.
|
||||||
|
let (left, right) = match operator {
|
||||||
|
GreaterThan(val) => (Excluded(val), Included(T::max_value())),
|
||||||
|
GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())),
|
||||||
|
Equal(val) => (Included(val), Included(val)),
|
||||||
|
NotEqual(val) => {
|
||||||
|
let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?;
|
||||||
|
let docids = Self::evaluate_number_operator::<T, KC>(rtxn, index, db, field_id, Equal(val))?;
|
||||||
|
return Ok(all_documents_ids - docids);
|
||||||
|
},
|
||||||
|
LowerThan(val) => (Included(T::min_value()), Excluded(val)),
|
||||||
|
LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)),
|
||||||
|
Between(left, right) => (Included(left), Included(right)),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Ask for the biggest value that can exist for this specific field, if it exists
|
||||||
|
// that's fine if it don't, the value just before will be returned instead.
|
||||||
|
let biggest_level = db
|
||||||
|
.remap_types::<KC, DecodeIgnore>()
|
||||||
|
.get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))?
|
||||||
|
.and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None });
|
||||||
|
|
||||||
|
match biggest_level {
|
||||||
|
Some(level) => {
|
||||||
|
let mut output = RoaringBitmap::new();
|
||||||
|
Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, level, left, right, &mut output)?;
|
||||||
|
Ok(output)
|
||||||
|
},
|
||||||
|
None => Ok(RoaringBitmap::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn evaluate_string_operator(
|
||||||
|
rtxn: &heed::RoTxn,
|
||||||
|
index: &Index,
|
||||||
|
db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
|
||||||
|
field_id: u8,
|
||||||
|
operator: &FacetStringOperator,
|
||||||
|
) -> anyhow::Result<RoaringBitmap>
|
||||||
|
{
|
||||||
|
match operator {
|
||||||
|
FacetStringOperator::Equal(string) => {
|
||||||
|
match db.get(rtxn, &(field_id, string))? {
|
||||||
|
Some(docids) => Ok(docids),
|
||||||
|
None => Ok(RoaringBitmap::new())
|
||||||
|
}
|
||||||
|
},
|
||||||
|
FacetStringOperator::NotEqual(string) => {
|
||||||
|
let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?;
|
||||||
|
let op = FacetStringOperator::Equal(string.clone());
|
||||||
|
let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?;
|
||||||
|
return Ok(all_documents_ids - docids);
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn evaluate(
|
||||||
|
&self,
|
||||||
|
rtxn: &heed::RoTxn,
|
||||||
|
index: &Index,
|
||||||
|
) -> anyhow::Result<RoaringBitmap>
|
||||||
|
{
|
||||||
|
let db = index.facet_field_id_value_docids;
|
||||||
|
match self {
|
||||||
|
OperatorI64(fid, op) => {
|
||||||
|
Self::evaluate_number_operator::<i64, FacetLevelValueI64Codec>(rtxn, index, db, *fid, *op)
|
||||||
|
},
|
||||||
|
OperatorF64(fid, op) => {
|
||||||
|
Self::evaluate_number_operator::<f64, FacetLevelValueF64Codec>(rtxn, index, db, *fid, *op)
|
||||||
|
},
|
||||||
|
OperatorString(fid, op) => {
|
||||||
|
let db = db.remap_key_type::<FacetValueStringCodec>();
|
||||||
|
Self::evaluate_string_operator(rtxn, index, db, *fid, op)
|
||||||
|
},
|
||||||
|
Or(lhs, rhs) => {
|
||||||
|
let lhs = lhs.evaluate(rtxn, index)?;
|
||||||
|
let rhs = rhs.evaluate(rtxn, index)?;
|
||||||
|
Ok(lhs | rhs)
|
||||||
|
},
|
||||||
|
And(lhs, rhs) => {
|
||||||
|
let lhs = lhs.evaluate(rtxn, index)?;
|
||||||
|
let rhs = rhs.evaluate(rtxn, index)?;
|
||||||
|
Ok(lhs & rhs)
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use crate::update::Settings;
|
||||||
|
use heed::EnvOpenOptions;
|
||||||
|
use maplit::hashmap;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn string() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
// Set the faceted fields to be the channel.
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = Settings::new(&mut wtxn, &index);
|
||||||
|
builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() });
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
// Test that the facet condition is correctly generated.
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap();
|
||||||
|
let expected = OperatorString(1, FacetStringOperator::Equal("ponce".into()));
|
||||||
|
assert_eq!(condition, expected);
|
||||||
|
|
||||||
|
let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap();
|
||||||
|
let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into()));
|
||||||
|
assert_eq!(condition, expected);
|
||||||
|
|
||||||
|
let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap();
|
||||||
|
let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into()));
|
||||||
|
assert_eq!(condition, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn i64() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
// Set the faceted fields to be the channel.
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = Settings::new(&mut wtxn, &index);
|
||||||
|
builder.set_faceted_fields(hashmap!{ "timestamp".into() => "integer".into() });
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
// Test that the facet condition is correctly generated.
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap();
|
||||||
|
let expected = OperatorI64(1, Between(22, 44));
|
||||||
|
assert_eq!(condition, expected);
|
||||||
|
|
||||||
|
let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
|
||||||
|
let expected = Or(
|
||||||
|
Box::new(OperatorI64(1, LowerThan(22))),
|
||||||
|
Box::new(OperatorI64(1, GreaterThan(44))),
|
||||||
|
);
|
||||||
|
assert_eq!(condition, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parentheses() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
// Set the faceted fields to be the channel.
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = Settings::new(&mut wtxn, &index);
|
||||||
|
builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order
|
||||||
|
builder.set_faceted_fields(hashmap!{
|
||||||
|
"channel".into() => "string".into(),
|
||||||
|
"timestamp".into() => "integer".into(),
|
||||||
|
});
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
// Test that the facet condition is correctly generated.
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let condition = FacetCondition::from_str(
|
||||||
|
&rtxn, &index,
|
||||||
|
"channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)",
|
||||||
|
).unwrap();
|
||||||
|
let expected = Or(
|
||||||
|
Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))),
|
||||||
|
Box::new(And(
|
||||||
|
Box::new(OperatorI64(1, Between(22, 44))),
|
||||||
|
Box::new(OperatorString(0, FacetStringOperator::NotEqual("ponce".into()))),
|
||||||
|
))
|
||||||
|
);
|
||||||
|
assert_eq!(condition, expected);
|
||||||
|
|
||||||
|
let condition = FacetCondition::from_str(
|
||||||
|
&rtxn, &index,
|
||||||
|
"channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)",
|
||||||
|
).unwrap();
|
||||||
|
let expected = Or(
|
||||||
|
Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))),
|
||||||
|
Box::new(Or(
|
||||||
|
Box::new(Or(
|
||||||
|
Box::new(OperatorI64(1, LowerThan(22))),
|
||||||
|
Box::new(OperatorI64(1, GreaterThan(44))),
|
||||||
|
)),
|
||||||
|
Box::new(OperatorString(0, FacetStringOperator::Equal("ponce".into()))),
|
||||||
|
)),
|
||||||
|
);
|
||||||
|
assert_eq!(condition, expected);
|
||||||
|
}
|
||||||
|
}
|
12
src/search/facet/parser.rs
Normal file
12
src/search/facet/parser.rs
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use pest::prec_climber::{Operator, Assoc, PrecClimber};
|
||||||
|
|
||||||
|
pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| {
|
||||||
|
use Assoc::*;
|
||||||
|
use Rule::*;
|
||||||
|
pest::prec_climber::PrecClimber::new(vec![Operator::new(or, Left), Operator::new(and, Left)])
|
||||||
|
});
|
||||||
|
|
||||||
|
#[derive(Parser)]
|
||||||
|
#[grammar = "search/facet/grammar.pest"]
|
||||||
|
pub struct FilterParser;
|
@ -1,5 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
|
use std::fmt;
|
||||||
|
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use levenshtein_automata::DFA;
|
use levenshtein_automata::DFA;
|
||||||
@ -8,17 +9,22 @@ use log::debug;
|
|||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use roaring::bitmap::RoaringBitmap;
|
use roaring::bitmap::RoaringBitmap;
|
||||||
|
|
||||||
use crate::query_tokens::{QueryTokens, QueryToken};
|
|
||||||
use crate::mdfs::Mdfs;
|
use crate::mdfs::Mdfs;
|
||||||
|
use crate::query_tokens::{QueryTokens, QueryToken};
|
||||||
use crate::{Index, DocumentId};
|
use crate::{Index, DocumentId};
|
||||||
|
|
||||||
|
pub use self::facet::FacetCondition;
|
||||||
|
|
||||||
// Building these factories is not free.
|
// Building these factories is not free.
|
||||||
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
||||||
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
|
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
|
||||||
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
|
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
|
||||||
|
|
||||||
|
mod facet;
|
||||||
|
|
||||||
pub struct Search<'a> {
|
pub struct Search<'a> {
|
||||||
query: Option<String>,
|
query: Option<String>,
|
||||||
|
facet_condition: Option<FacetCondition>,
|
||||||
offset: usize,
|
offset: usize,
|
||||||
limit: usize,
|
limit: usize,
|
||||||
rtxn: &'a heed::RoTxn<'a>,
|
rtxn: &'a heed::RoTxn<'a>,
|
||||||
@ -27,7 +33,7 @@ pub struct Search<'a> {
|
|||||||
|
|
||||||
impl<'a> Search<'a> {
|
impl<'a> Search<'a> {
|
||||||
pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> {
|
pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> {
|
||||||
Search { query: None, offset: 0, limit: 20, rtxn, index }
|
Search { query: None, facet_condition: None, offset: 0, limit: 20, rtxn, index }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn query(&mut self, query: impl Into<String>) -> &mut Search<'a> {
|
pub fn query(&mut self, query: impl Into<String>) -> &mut Search<'a> {
|
||||||
@ -45,6 +51,11 @@ impl<'a> Search<'a> {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> {
|
||||||
|
self.facet_condition = Some(condition);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
/// Extracts the query words from the query string and returns the DFAs accordingly.
|
/// Extracts the query words from the query string and returns the DFAs accordingly.
|
||||||
/// TODO introduce settings for the number of typos regarding the words lengths.
|
/// TODO introduce settings for the number of typos regarding the words lengths.
|
||||||
fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> {
|
fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> {
|
||||||
@ -135,22 +146,44 @@ impl<'a> Search<'a> {
|
|||||||
|
|
||||||
pub fn execute(&self) -> anyhow::Result<SearchResult> {
|
pub fn execute(&self) -> anyhow::Result<SearchResult> {
|
||||||
let limit = self.limit;
|
let limit = self.limit;
|
||||||
|
|
||||||
let fst = self.index.words_fst(self.rtxn)?;
|
let fst = self.index.words_fst(self.rtxn)?;
|
||||||
|
|
||||||
// Construct the DFAs related to the query words.
|
// Construct the DFAs related to the query words.
|
||||||
let dfas = match self.query.as_deref().map(Self::generate_query_dfas) {
|
let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) {
|
||||||
Some(dfas) if !dfas.is_empty() => dfas,
|
Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?),
|
||||||
_ => {
|
_otherwise => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
// We create the original candidates with the facet conditions results.
|
||||||
|
let facet_candidates = match &self.facet_condition {
|
||||||
|
Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?),
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
debug!("facet candidates: {:?}", facet_candidates);
|
||||||
|
|
||||||
|
let (candidates, derived_words) = match (facet_candidates, derived_words) {
|
||||||
|
(Some(mut facet_candidates), Some(derived_words)) => {
|
||||||
|
let words_candidates = Self::compute_candidates(&derived_words);
|
||||||
|
facet_candidates.intersect_with(&words_candidates);
|
||||||
|
(facet_candidates, derived_words)
|
||||||
|
},
|
||||||
|
(None, Some(derived_words)) => {
|
||||||
|
(Self::compute_candidates(&derived_words), derived_words)
|
||||||
|
},
|
||||||
|
(Some(facet_candidates), None) => {
|
||||||
|
// If the query is not set or results in no DFAs but
|
||||||
|
// there is some facet conditions we return a placeholder.
|
||||||
|
let documents_ids = facet_candidates.iter().take(limit).collect();
|
||||||
|
return Ok(SearchResult { documents_ids, ..Default::default() })
|
||||||
|
},
|
||||||
|
(None, None) => {
|
||||||
// If the query is not set or results in no DFAs we return a placeholder.
|
// If the query is not set or results in no DFAs we return a placeholder.
|
||||||
let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect();
|
let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect();
|
||||||
return Ok(SearchResult { documents_ids, ..Default::default() })
|
return Ok(SearchResult { documents_ids, ..Default::default() })
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
let derived_words = self.fetch_words_docids(&fst, dfas)?;
|
|
||||||
let candidates = Self::compute_candidates(&derived_words);
|
|
||||||
|
|
||||||
debug!("candidates: {:?}", candidates);
|
debug!("candidates: {:?}", candidates);
|
||||||
|
|
||||||
// The mana depth first search is a revised DFS that explore
|
// The mana depth first search is a revised DFS that explore
|
||||||
@ -175,6 +208,18 @@ impl<'a> Search<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl fmt::Debug for Search<'_> {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
let Search { query, facet_condition, offset, limit, rtxn: _, index: _ } = self;
|
||||||
|
f.debug_struct("Search")
|
||||||
|
.field("query", query)
|
||||||
|
.field("facet_condition", facet_condition)
|
||||||
|
.field("offset", offset)
|
||||||
|
.field("limit", limit)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct SearchResult {
|
pub struct SearchResult {
|
||||||
pub found_words: HashSet<String>,
|
pub found_words: HashSet<String>,
|
@ -1,10 +1,10 @@
|
|||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::{str, io};
|
use std::{str, io, fmt};
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use crate::Index;
|
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
use crate::Index;
|
||||||
|
|
||||||
use Command::*;
|
use Command::*;
|
||||||
|
|
||||||
@ -89,6 +89,12 @@ enum Command {
|
|||||||
field_name: String,
|
field_name: String,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/// Outputs some facets statistics for the given facet name.
|
||||||
|
FacetStats {
|
||||||
|
/// The field name in the document.
|
||||||
|
field_name: String,
|
||||||
|
},
|
||||||
|
|
||||||
/// Outputs the total size of all the docid-word-positions keys and values.
|
/// Outputs the total size of all the docid-word-positions keys and values.
|
||||||
TotalDocidWordPositionsSize,
|
TotalDocidWordPositionsSize,
|
||||||
|
|
||||||
@ -165,6 +171,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
|
|||||||
FacetValuesDocids { full_display, field_name } => {
|
FacetValuesDocids { full_display, field_name } => {
|
||||||
facet_values_docids(&index, &rtxn, !full_display, field_name)
|
facet_values_docids(&index, &rtxn, !full_display, field_name)
|
||||||
},
|
},
|
||||||
|
FacetStats { field_name } => facet_stats(&index, &rtxn, field_name),
|
||||||
TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn),
|
TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn),
|
||||||
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
|
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
|
||||||
AverageNumberOfPositionsByWord => {
|
AverageNumberOfPositionsByWord => {
|
||||||
@ -225,46 +232,140 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow:
|
|||||||
Ok(wtr.flush()?)
|
Ok(wtr.flush()?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Helper function that converts the facet value key to a unique type
|
||||||
|
/// that can be used to log or display purposes.
|
||||||
|
fn facet_values_iter<'txn, DC: 'txn, T>(
|
||||||
|
rtxn: &'txn heed::RoTxn,
|
||||||
|
db: heed::Database<heed::types::ByteSlice, DC>,
|
||||||
|
field_id: u8,
|
||||||
|
facet_type: crate::facet::FacetType,
|
||||||
|
string_fn: impl Fn(&str) -> T + 'txn,
|
||||||
|
float_fn: impl Fn(u8, f64, f64) -> T + 'txn,
|
||||||
|
integer_fn: impl Fn(u8, i64, i64) -> T + 'txn,
|
||||||
|
) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(T, DC::DItem)>> + 'txn>>
|
||||||
|
where
|
||||||
|
DC: heed::BytesDecode<'txn>,
|
||||||
|
{
|
||||||
|
use crate::facet::FacetType;
|
||||||
|
use crate::heed_codec::facet::{
|
||||||
|
FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec,
|
||||||
|
};
|
||||||
|
|
||||||
|
let iter = db.prefix_iter(&rtxn, &[field_id])?;
|
||||||
|
match facet_type {
|
||||||
|
FacetType::String => {
|
||||||
|
let iter = iter.remap_key_type::<FacetValueStringCodec>()
|
||||||
|
.map(move |r| r.map(|((_, key), value)| (string_fn(key), value)));
|
||||||
|
Ok(Box::new(iter) as Box<dyn Iterator<Item=_>>)
|
||||||
|
},
|
||||||
|
FacetType::Float => {
|
||||||
|
let iter = iter.remap_key_type::<FacetLevelValueF64Codec>()
|
||||||
|
.map(move |r| r.map(|((_, level, left, right), value)| {
|
||||||
|
(float_fn(level, left, right), value)
|
||||||
|
}));
|
||||||
|
Ok(Box::new(iter))
|
||||||
|
},
|
||||||
|
FacetType::Integer => {
|
||||||
|
let iter = iter.remap_key_type::<FacetLevelValueI64Codec>()
|
||||||
|
.map(move |r| r.map(|((_, level, left, right), value)| {
|
||||||
|
(integer_fn(level, left, right), value)
|
||||||
|
}));
|
||||||
|
Ok(Box::new(iter))
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) -> String {
|
||||||
|
if level == 0 {
|
||||||
|
format!("{:?} (level {})", left, level)
|
||||||
|
} else {
|
||||||
|
format!("{:?} to {:?} (level {})", left, right, level)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
|
fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
|
||||||
use std::cmp::Reverse;
|
use std::cmp::Reverse;
|
||||||
use std::collections::BinaryHeap;
|
use std::collections::BinaryHeap;
|
||||||
use heed::types::{Str, ByteSlice};
|
use heed::types::{Str, ByteSlice};
|
||||||
use crate::heed_codec::BEU32StrCodec;
|
|
||||||
|
let Index {
|
||||||
|
env: _env,
|
||||||
|
main,
|
||||||
|
word_docids,
|
||||||
|
docid_word_positions,
|
||||||
|
word_pair_proximity_docids,
|
||||||
|
facet_field_id_value_docids,
|
||||||
|
documents,
|
||||||
|
} = index;
|
||||||
|
|
||||||
let main_name = "main";
|
let main_name = "main";
|
||||||
let word_docids_name = "word_docids";
|
let word_docids_name = "word_docids";
|
||||||
let docid_word_positions_name = "docid_word_positions";
|
let docid_word_positions_name = "docid_word_positions";
|
||||||
|
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
|
||||||
|
let facet_field_id_value_docids_name = "facet_field_id_value_docids";
|
||||||
|
let documents_name = "documents";
|
||||||
|
|
||||||
let mut heap = BinaryHeap::with_capacity(limit + 1);
|
let mut heap = BinaryHeap::with_capacity(limit + 1);
|
||||||
|
|
||||||
if limit > 0 {
|
if limit > 0 {
|
||||||
let words_fst = index.words_fst(rtxn)?;
|
let words_fst = index.words_fst(rtxn)?;
|
||||||
|
|
||||||
heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name)));
|
heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name)));
|
||||||
if heap.len() > limit { heap.pop(); }
|
if heap.len() > limit { heap.pop(); }
|
||||||
|
|
||||||
if let Some(documents) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents")? {
|
if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
|
||||||
heap.push(Reverse((documents.len(), format!("documents"), main_name)));
|
|
||||||
if heap.len() > limit { heap.pop(); }
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(documents_ids) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
|
|
||||||
heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name)));
|
heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name)));
|
||||||
if heap.len() > limit { heap.pop(); }
|
if heap.len() > limit { heap.pop(); }
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in index.word_docids.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? {
|
for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||||
let (word, value) = result?;
|
let (word, value) = result?;
|
||||||
heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
|
heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
|
||||||
if heap.len() > limit { heap.pop(); }
|
if heap.len() > limit { heap.pop(); }
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, ByteSlice>(rtxn)? {
|
for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||||
let ((docid, word), value) = result?;
|
let ((docid, word), value) = result?;
|
||||||
let key = format!("{} {}", docid, word);
|
let key = format!("{} {}", docid, word);
|
||||||
heap.push(Reverse((value.len(), key, docid_word_positions_name)));
|
heap.push(Reverse((value.len(), key, docid_word_positions_name)));
|
||||||
if heap.len() > limit { heap.pop(); }
|
if heap.len() > limit { heap.pop(); }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||||
|
let ((word1, word2, prox), value) = result?;
|
||||||
|
let key = format!("{} {} {}", word1, word2, prox);
|
||||||
|
heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name)));
|
||||||
|
if heap.len() > limit { heap.pop(); }
|
||||||
|
}
|
||||||
|
|
||||||
|
let faceted_fields = index.faceted_fields(rtxn)?;
|
||||||
|
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||||
|
for (field_id, field_type) in faceted_fields {
|
||||||
|
let facet_name = fields_ids_map.name(field_id).unwrap();
|
||||||
|
|
||||||
|
let db = facet_field_id_value_docids.remap_data_type::<ByteSlice>();
|
||||||
|
let iter = facet_values_iter(
|
||||||
|
rtxn,
|
||||||
|
db,
|
||||||
|
field_id,
|
||||||
|
field_type,
|
||||||
|
|key| key.to_owned(),
|
||||||
|
facet_number_value_to_string,
|
||||||
|
facet_number_value_to_string,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
for result in iter {
|
||||||
|
let (fvalue, value) = result?;
|
||||||
|
let key = format!("{} {}", facet_name, fvalue);
|
||||||
|
heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name)));
|
||||||
|
if heap.len() > limit { heap.pop(); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||||
|
let (id, value) = result?;
|
||||||
|
heap.push(Reverse((value.len(), id.to_string(), documents_name)));
|
||||||
|
if heap.len() > limit { heap.pop(); }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let stdout = io::stdout();
|
let stdout = io::stdout();
|
||||||
@ -298,10 +399,6 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<Strin
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> {
|
fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> {
|
||||||
use crate::facet::FacetType;
|
|
||||||
use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec};
|
|
||||||
use heed::{BytesDecode, Error::Decoding};
|
|
||||||
|
|
||||||
let fields_ids_map = index.fields_ids_map(&rtxn)?;
|
let fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||||
let faceted_fields = index.faceted_fields(&rtxn)?;
|
let faceted_fields = index.faceted_fields(&rtxn)?;
|
||||||
|
|
||||||
@ -310,51 +407,78 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam
|
|||||||
let field_type = faceted_fields.get(&field_id)
|
let field_type = faceted_fields.get(&field_id)
|
||||||
.with_context(|| format!("field {} is not faceted", field_name))?;
|
.with_context(|| format!("field {} is not faceted", field_name))?;
|
||||||
|
|
||||||
let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?;
|
|
||||||
let iter = match field_type {
|
|
||||||
FacetType::String => {
|
|
||||||
let iter = iter
|
|
||||||
.map(|result| result.and_then(|(key, value)| {
|
|
||||||
let (_, key) = FacetValueStringCodec::bytes_decode(key).ok_or(Decoding)?;
|
|
||||||
Ok((key.to_string(), value))
|
|
||||||
}));
|
|
||||||
Box::new(iter) as Box<dyn Iterator<Item=_>>
|
|
||||||
},
|
|
||||||
FacetType::Float => {
|
|
||||||
let iter = iter
|
|
||||||
.map(|result| result.and_then(|(key, value)| {
|
|
||||||
let (_, key) = FacetValueF64Codec::bytes_decode(key).ok_or(Decoding)?;
|
|
||||||
Ok((key.to_string(), value))
|
|
||||||
}));
|
|
||||||
Box::new(iter)
|
|
||||||
},
|
|
||||||
FacetType::Integer => {
|
|
||||||
let iter = iter
|
|
||||||
.map(|result| result.and_then(|(key, value)| {
|
|
||||||
let (_, key) = FacetValueI64Codec::bytes_decode(key).ok_or(Decoding)?;
|
|
||||||
Ok((key.to_string(), value))
|
|
||||||
}));
|
|
||||||
Box::new(iter)
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
let stdout = io::stdout();
|
let stdout = io::stdout();
|
||||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||||
wtr.write_record(&["facet_value", "documents_ids"])?;
|
wtr.write_record(&["facet_value", "documents_count", "documents_ids"])?;
|
||||||
|
|
||||||
|
let db = index.facet_field_id_value_docids;
|
||||||
|
let iter = facet_values_iter(
|
||||||
|
rtxn,
|
||||||
|
db,
|
||||||
|
field_id,
|
||||||
|
*field_type,
|
||||||
|
|key| key.to_owned(),
|
||||||
|
facet_number_value_to_string,
|
||||||
|
facet_number_value_to_string,
|
||||||
|
)?;
|
||||||
|
|
||||||
for result in iter {
|
for result in iter {
|
||||||
let (value, docids) = result?;
|
let (value, docids) = result?;
|
||||||
|
let count = docids.len();
|
||||||
let docids = if debug {
|
let docids = if debug {
|
||||||
format!("{:?}", docids)
|
format!("{:?}", docids)
|
||||||
} else {
|
} else {
|
||||||
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
||||||
};
|
};
|
||||||
wtr.write_record(&[value, docids])?;
|
wtr.write_record(&[value, count.to_string(), docids])?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(wtr.flush()?)
|
Ok(wtr.flush()?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> {
|
||||||
|
let fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||||
|
let faceted_fields = index.faceted_fields(&rtxn)?;
|
||||||
|
|
||||||
|
let field_id = fields_ids_map.id(&field_name)
|
||||||
|
.with_context(|| format!("field {} not found", field_name))?;
|
||||||
|
let field_type = faceted_fields.get(&field_id)
|
||||||
|
.with_context(|| format!("field {} is not faceted", field_name))?;
|
||||||
|
|
||||||
|
let db = index.facet_field_id_value_docids;
|
||||||
|
let iter = facet_values_iter(
|
||||||
|
rtxn,
|
||||||
|
db,
|
||||||
|
field_id,
|
||||||
|
*field_type,
|
||||||
|
|_key| 0u8,
|
||||||
|
|level, _left, _right| level,
|
||||||
|
|level, _left, _right| level,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
println!("The database {:?} facet stats", field_name);
|
||||||
|
|
||||||
|
let mut level_size = 0;
|
||||||
|
let mut current_level = None;
|
||||||
|
for result in iter {
|
||||||
|
let (level, _) = result?;
|
||||||
|
if let Some(current) = current_level {
|
||||||
|
if current != level {
|
||||||
|
println!("\tnumber of groups at level {}: {}", current, level_size);
|
||||||
|
level_size = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
current_level = Some(level);
|
||||||
|
level_size += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(current) = current_level {
|
||||||
|
println!("\tnumber of groups at level {}: {}", current, level_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> {
|
fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> {
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::Write as _;
|
use std::io::Write as _;
|
||||||
|
@ -24,12 +24,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
|
|
||||||
// We retrieve the number of documents ids that we are deleting.
|
// We retrieve the number of documents ids that we are deleting.
|
||||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||||
|
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
|
||||||
|
|
||||||
// We clean some of the main engine datastructures.
|
// We clean some of the main engine datastructures.
|
||||||
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
|
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
|
||||||
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
|
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
|
||||||
self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
|
self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
|
||||||
|
|
||||||
|
// We clean all the faceted documents ids.
|
||||||
|
for (field_id, _) in faceted_fields {
|
||||||
|
self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?;
|
||||||
|
}
|
||||||
|
|
||||||
// Clear the other databases.
|
// Clear the other databases.
|
||||||
word_docids.clear(self.wtxn)?;
|
word_docids.clear(self.wtxn)?;
|
||||||
docid_word_positions.clear(self.wtxn)?;
|
docid_word_positions.clear(self.wtxn)?;
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use fst::IntoStreamer;
|
use fst::IntoStreamer;
|
||||||
|
use heed::types::ByteSlice;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds};
|
use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds};
|
||||||
@ -132,11 +133,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?;
|
let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?;
|
||||||
if let Some((key, mut docids)) = iter.next().transpose()? {
|
if let Some((key, mut docids)) = iter.next().transpose()? {
|
||||||
if key == word.as_ref() {
|
if key == word.as_ref() {
|
||||||
|
let previous_len = docids.len();
|
||||||
docids.difference_with(&self.documents_ids);
|
docids.difference_with(&self.documents_ids);
|
||||||
if docids.is_empty() {
|
if docids.is_empty() {
|
||||||
iter.del_current()?;
|
iter.del_current()?;
|
||||||
*must_remove = true;
|
*must_remove = true;
|
||||||
} else {
|
} else if docids.len() != previous_len {
|
||||||
iter.put_current(key, &docids)?;
|
iter.put_current(key, &docids)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -168,27 +170,37 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
// We delete the documents ids that are under the pairs of words,
|
// We delete the documents ids that are under the pairs of words,
|
||||||
// it is faster and use no memory to iterate over all the words pairs than
|
// it is faster and use no memory to iterate over all the words pairs than
|
||||||
// to compute the cartesian product of every words of the deleted documents.
|
// to compute the cartesian product of every words of the deleted documents.
|
||||||
let mut iter = word_pair_proximity_docids.iter_mut(self.wtxn)?;
|
let mut iter = word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
|
||||||
while let Some(result) = iter.next() {
|
while let Some(result) = iter.next() {
|
||||||
let ((w1, w2, prox), mut docids) = result?;
|
let (bytes, mut docids) = result?;
|
||||||
|
let previous_len = docids.len();
|
||||||
docids.difference_with(&self.documents_ids);
|
docids.difference_with(&self.documents_ids);
|
||||||
if docids.is_empty() {
|
if docids.is_empty() {
|
||||||
iter.del_current()?;
|
iter.del_current()?;
|
||||||
} else {
|
} else if docids.len() != previous_len {
|
||||||
iter.put_current(&(w1, w2, prox), &docids)?;
|
iter.put_current(bytes, &docids)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
drop(iter);
|
drop(iter);
|
||||||
|
|
||||||
|
// Remove the documents ids from the faceted documents ids.
|
||||||
|
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
|
||||||
|
for (field_id, _) in faceted_fields {
|
||||||
|
let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?;
|
||||||
|
docids.difference_with(&self.documents_ids);
|
||||||
|
self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?;
|
||||||
|
}
|
||||||
|
|
||||||
// We delete the documents ids that are under the facet field id values.
|
// We delete the documents ids that are under the facet field id values.
|
||||||
let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?;
|
let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?;
|
||||||
while let Some(result) = iter.next() {
|
while let Some(result) = iter.next() {
|
||||||
let (bytes, mut docids) = result?;
|
let (bytes, mut docids) = result?;
|
||||||
|
let previous_len = docids.len();
|
||||||
docids.difference_with(&self.documents_ids);
|
docids.difference_with(&self.documents_ids);
|
||||||
if docids.is_empty() {
|
if docids.is_empty() {
|
||||||
iter.del_current()?;
|
iter.del_current()?;
|
||||||
} else {
|
} else if docids.len() != previous_len {
|
||||||
iter.put_current(bytes, &docids)?;
|
iter.put_current(bytes, &docids)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
256
src/update/facets.rs
Normal file
256
src/update/facets.rs
Normal file
@ -0,0 +1,256 @@
|
|||||||
|
use std::cmp;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::num::NonZeroUsize;
|
||||||
|
|
||||||
|
use grenad::{CompressionType, Reader, Writer, FileFuse};
|
||||||
|
use heed::types::{ByteSlice, DecodeIgnore};
|
||||||
|
use heed::{BytesEncode, Error};
|
||||||
|
use log::debug;
|
||||||
|
use num_traits::{Bounded, Zero};
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::facet::FacetType;
|
||||||
|
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||||
|
use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec};
|
||||||
|
use crate::Index;
|
||||||
|
use crate::update::index_documents::WriteMethod;
|
||||||
|
use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database};
|
||||||
|
|
||||||
|
pub struct Facets<'t, 'u, 'i> {
|
||||||
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
|
index: &'i Index,
|
||||||
|
pub(crate) chunk_compression_type: CompressionType,
|
||||||
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
|
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||||
|
level_group_size: NonZeroUsize,
|
||||||
|
min_level_size: NonZeroUsize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||||
|
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> {
|
||||||
|
Facets {
|
||||||
|
wtxn,
|
||||||
|
index,
|
||||||
|
chunk_compression_type: CompressionType::None,
|
||||||
|
chunk_compression_level: None,
|
||||||
|
chunk_fusing_shrink_size: None,
|
||||||
|
level_group_size: NonZeroUsize::new(4).unwrap(),
|
||||||
|
min_level_size: NonZeroUsize::new(5).unwrap(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self {
|
||||||
|
self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap();
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self {
|
||||||
|
self.min_level_size = value;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn execute(self) -> anyhow::Result<()> {
|
||||||
|
// We get the faceted fields to be able to create the facet levels.
|
||||||
|
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
|
||||||
|
|
||||||
|
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||||
|
for (field_id, facet_type) in faceted_fields {
|
||||||
|
let (content, documents_ids) = match facet_type {
|
||||||
|
FacetType::Integer => {
|
||||||
|
clear_field_levels::<i64, FacetLevelValueI64Codec>(
|
||||||
|
self.wtxn,
|
||||||
|
self.index.facet_field_id_value_docids,
|
||||||
|
field_id,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let documents_ids = compute_faceted_documents_ids(
|
||||||
|
self.wtxn,
|
||||||
|
self.index.facet_field_id_value_docids,
|
||||||
|
field_id,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let content = compute_facet_levels::<i64, FacetLevelValueI64Codec>(
|
||||||
|
self.wtxn,
|
||||||
|
self.index.facet_field_id_value_docids,
|
||||||
|
self.chunk_compression_type,
|
||||||
|
self.chunk_compression_level,
|
||||||
|
self.chunk_fusing_shrink_size,
|
||||||
|
self.level_group_size,
|
||||||
|
self.min_level_size,
|
||||||
|
field_id,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
(Some(content), documents_ids)
|
||||||
|
},
|
||||||
|
FacetType::Float => {
|
||||||
|
clear_field_levels::<f64, FacetLevelValueF64Codec>(
|
||||||
|
self.wtxn,
|
||||||
|
self.index.facet_field_id_value_docids,
|
||||||
|
field_id,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let documents_ids = compute_faceted_documents_ids(
|
||||||
|
self.wtxn,
|
||||||
|
self.index.facet_field_id_value_docids,
|
||||||
|
field_id,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let content = compute_facet_levels::<f64, FacetLevelValueF64Codec>(
|
||||||
|
self.wtxn,
|
||||||
|
self.index.facet_field_id_value_docids,
|
||||||
|
self.chunk_compression_type,
|
||||||
|
self.chunk_compression_level,
|
||||||
|
self.chunk_fusing_shrink_size,
|
||||||
|
self.level_group_size,
|
||||||
|
self.min_level_size,
|
||||||
|
field_id,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
(Some(content), documents_ids)
|
||||||
|
},
|
||||||
|
FacetType::String => {
|
||||||
|
let documents_ids = compute_faceted_documents_ids(
|
||||||
|
self.wtxn,
|
||||||
|
self.index.facet_field_id_value_docids,
|
||||||
|
field_id,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
(None, documents_ids)
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(content) = content {
|
||||||
|
write_into_lmdb_database(
|
||||||
|
self.wtxn,
|
||||||
|
*self.index.facet_field_id_value_docids.as_polymorph(),
|
||||||
|
content,
|
||||||
|
|_, _| anyhow::bail!("invalid facet level merging"),
|
||||||
|
WriteMethod::GetMergePut,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn clear_field_levels<'t, T: 't, KC>(
|
||||||
|
wtxn: &'t mut heed::RwTxn,
|
||||||
|
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||||
|
field_id: u8,
|
||||||
|
) -> heed::Result<()>
|
||||||
|
where
|
||||||
|
T: Copy + Bounded,
|
||||||
|
KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
|
||||||
|
KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
|
||||||
|
{
|
||||||
|
let left = (field_id, 1, T::min_value(), T::min_value());
|
||||||
|
let right = (field_id, u8::MAX, T::max_value(), T::max_value());
|
||||||
|
let range = left..=right;
|
||||||
|
db.remap_key_type::<KC>().delete_range(wtxn, &range).map(drop)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn compute_facet_levels<'t, T: 't, KC>(
|
||||||
|
rtxn: &'t heed::RoTxn,
|
||||||
|
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||||
|
compression_type: CompressionType,
|
||||||
|
compression_level: Option<u32>,
|
||||||
|
shrink_size: Option<u64>,
|
||||||
|
level_group_size: NonZeroUsize,
|
||||||
|
min_level_size: NonZeroUsize,
|
||||||
|
field_id: u8,
|
||||||
|
) -> anyhow::Result<Reader<FileFuse>>
|
||||||
|
where
|
||||||
|
T: Copy + PartialEq + PartialOrd + Bounded + Zero,
|
||||||
|
KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
|
||||||
|
KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
|
||||||
|
{
|
||||||
|
let first_level_size = db.prefix_iter(rtxn, &[field_id])?
|
||||||
|
.remap_types::<DecodeIgnore, DecodeIgnore>()
|
||||||
|
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
|
||||||
|
|
||||||
|
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
|
||||||
|
// therefore we write the facet levels entries into a grenad file before transfering them.
|
||||||
|
let mut writer = tempfile::tempfile().and_then(|file| {
|
||||||
|
create_writer(compression_type, compression_level, file)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let level_0_range = {
|
||||||
|
let left = (field_id, 0, T::min_value(), T::min_value());
|
||||||
|
let right = (field_id, 0, T::max_value(), T::max_value());
|
||||||
|
left..=right
|
||||||
|
};
|
||||||
|
|
||||||
|
// Groups sizes are always a power of the original level_group_size and therefore a group
|
||||||
|
// always maps groups of the previous level and never splits previous levels groups in half.
|
||||||
|
let group_size_iter = (1u8..)
|
||||||
|
.map(|l| (l, level_group_size.get().pow(l as u32)))
|
||||||
|
.take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
|
||||||
|
|
||||||
|
for (level, group_size) in group_size_iter {
|
||||||
|
let mut left = T::zero();
|
||||||
|
let mut right = T::zero();
|
||||||
|
let mut group_docids = RoaringBitmap::new();
|
||||||
|
|
||||||
|
let db = db.remap_key_type::<KC>();
|
||||||
|
for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() {
|
||||||
|
let ((_field_id, _level, value, _right), docids) = result?;
|
||||||
|
|
||||||
|
if i == 0 {
|
||||||
|
left = value;
|
||||||
|
} else if i % group_size == 0 {
|
||||||
|
// we found the first bound of the next group, we must store the left
|
||||||
|
// and right bounds associated with the docids.
|
||||||
|
write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?;
|
||||||
|
|
||||||
|
// We save the left bound for the new group and also reset the docids.
|
||||||
|
group_docids = RoaringBitmap::new();
|
||||||
|
left = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The right bound is always the bound we run through.
|
||||||
|
group_docids.union_with(&docids);
|
||||||
|
right = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
if !group_docids.is_empty() {
|
||||||
|
write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writer_into_reader(writer, shrink_size)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn compute_faceted_documents_ids(
|
||||||
|
rtxn: &heed::RoTxn,
|
||||||
|
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||||
|
field_id: u8,
|
||||||
|
) -> anyhow::Result<RoaringBitmap>
|
||||||
|
{
|
||||||
|
let mut documents_ids = RoaringBitmap::new();
|
||||||
|
for result in db.prefix_iter(rtxn, &[field_id])? {
|
||||||
|
let (_key, docids) = result?;
|
||||||
|
documents_ids.union_with(&docids);
|
||||||
|
}
|
||||||
|
Ok(documents_ids)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn write_entry<T, KC>(
|
||||||
|
writer: &mut Writer<File>,
|
||||||
|
field_id: u8,
|
||||||
|
level: u8,
|
||||||
|
left: T,
|
||||||
|
right: T,
|
||||||
|
ids: &RoaringBitmap,
|
||||||
|
) -> anyhow::Result<()>
|
||||||
|
where
|
||||||
|
KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
|
||||||
|
{
|
||||||
|
let key = (field_id, level, left, right);
|
||||||
|
let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?;
|
||||||
|
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;
|
||||||
|
writer.insert(&key, &data)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
@ -2,6 +2,7 @@ use std::borrow::Cow;
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, Seek, SeekFrom};
|
use std::io::{self, Seek, SeekFrom};
|
||||||
|
use std::num::NonZeroUsize;
|
||||||
use std::sync::mpsc::sync_channel;
|
use std::sync::mpsc::sync_channel;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
@ -15,7 +16,7 @@ use rayon::prelude::*;
|
|||||||
use rayon::ThreadPool;
|
use rayon::ThreadPool;
|
||||||
|
|
||||||
use crate::index::Index;
|
use crate::index::Index;
|
||||||
use crate::update::UpdateIndexingStep;
|
use crate::update::{Facets, UpdateIndexingStep};
|
||||||
use self::store::{Store, Readers};
|
use self::store::{Store, Readers};
|
||||||
use self::merge_function::{
|
use self::merge_function::{
|
||||||
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
||||||
@ -31,12 +32,12 @@ mod store;
|
|||||||
mod transform;
|
mod transform;
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone)]
|
#[derive(Debug, Copy, Clone)]
|
||||||
enum WriteMethod {
|
pub enum WriteMethod {
|
||||||
Append,
|
Append,
|
||||||
GetMergePut,
|
GetMergePut,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
|
pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
|
||||||
let mut builder = Writer::builder();
|
let mut builder = Writer::builder();
|
||||||
builder.compression_type(typ);
|
builder.compression_type(typ);
|
||||||
if let Some(level) = level {
|
if let Some(level) = level {
|
||||||
@ -45,7 +46,7 @@ fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Re
|
|||||||
builder.build(file)
|
builder.build(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn create_sorter(
|
pub fn create_sorter(
|
||||||
merge: MergeFn,
|
merge: MergeFn,
|
||||||
chunk_compression_type: CompressionType,
|
chunk_compression_type: CompressionType,
|
||||||
chunk_compression_level: Option<u32>,
|
chunk_compression_level: Option<u32>,
|
||||||
@ -71,7 +72,7 @@ fn create_sorter(
|
|||||||
builder.build()
|
builder.build()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> {
|
pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> {
|
||||||
let mut file = writer.into_inner()?;
|
let mut file = writer.into_inner()?;
|
||||||
file.seek(SeekFrom::Start(0))?;
|
file.seek(SeekFrom::Start(0))?;
|
||||||
let file = if let Some(shrink_size) = shrink_size {
|
let file = if let Some(shrink_size) = shrink_size {
|
||||||
@ -82,13 +83,13 @@ fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow:
|
|||||||
Reader::new(file).map_err(Into::into)
|
Reader::new(file).map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_readers(sources: Vec<Reader<FileFuse>>, merge: MergeFn) -> Merger<FileFuse, MergeFn> {
|
pub fn merge_readers(sources: Vec<Reader<FileFuse>>, merge: MergeFn) -> Merger<FileFuse, MergeFn> {
|
||||||
let mut builder = Merger::builder(merge);
|
let mut builder = Merger::builder(merge);
|
||||||
builder.extend(sources);
|
builder.extend(sources);
|
||||||
builder.build()
|
builder.build()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_into_lmdb_database(
|
pub fn merge_into_lmdb_database(
|
||||||
wtxn: &mut heed::RwTxn,
|
wtxn: &mut heed::RwTxn,
|
||||||
database: heed::PolyDatabase,
|
database: heed::PolyDatabase,
|
||||||
sources: Vec<Reader<FileFuse>>,
|
sources: Vec<Reader<FileFuse>>,
|
||||||
@ -132,7 +133,7 @@ fn merge_into_lmdb_database(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_into_lmdb_database(
|
pub fn write_into_lmdb_database(
|
||||||
wtxn: &mut heed::RwTxn,
|
wtxn: &mut heed::RwTxn,
|
||||||
database: heed::PolyDatabase,
|
database: heed::PolyDatabase,
|
||||||
mut reader: Reader<FileFuse>,
|
mut reader: Reader<FileFuse>,
|
||||||
@ -157,7 +158,7 @@ fn write_into_lmdb_database(
|
|||||||
match iter.next().transpose()? {
|
match iter.next().transpose()? {
|
||||||
Some((key, old_val)) if key == k => {
|
Some((key, old_val)) if key == k => {
|
||||||
let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
|
let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
|
||||||
let val = merge(k, &vals).expect("merge failed");
|
let val = merge(k, &vals)?;
|
||||||
iter.put_current(k, &val)?;
|
iter.put_current(k, &val)?;
|
||||||
},
|
},
|
||||||
_ => {
|
_ => {
|
||||||
@ -207,6 +208,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
pub(crate) chunk_compression_level: Option<u32>,
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||||
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
||||||
|
facet_level_group_size: Option<NonZeroUsize>,
|
||||||
|
facet_min_level_size: Option<NonZeroUsize>,
|
||||||
update_method: IndexDocumentsMethod,
|
update_method: IndexDocumentsMethod,
|
||||||
update_format: UpdateFormat,
|
update_format: UpdateFormat,
|
||||||
autogenerate_docids: bool,
|
autogenerate_docids: bool,
|
||||||
@ -225,6 +228,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
chunk_compression_level: None,
|
chunk_compression_level: None,
|
||||||
chunk_fusing_shrink_size: None,
|
chunk_fusing_shrink_size: None,
|
||||||
thread_pool: None,
|
thread_pool: None,
|
||||||
|
facet_level_group_size: None,
|
||||||
|
facet_min_level_size: None,
|
||||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||||
update_format: UpdateFormat::Json,
|
update_format: UpdateFormat::Json,
|
||||||
autogenerate_docids: true,
|
autogenerate_docids: true,
|
||||||
@ -308,8 +313,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
thread_pool: self.thread_pool,
|
thread_pool: self.thread_pool,
|
||||||
};
|
};
|
||||||
let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?;
|
let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?;
|
||||||
|
debug!("documents to delete {:?}", replaced_documents_ids);
|
||||||
deletion_builder.delete_documents(&replaced_documents_ids);
|
deletion_builder.delete_documents(&replaced_documents_ids);
|
||||||
let _deleted_documents_count = deletion_builder.execute()?;
|
let deleted_documents_count = deletion_builder.execute()?;
|
||||||
|
debug!("{} documents actually deleted", deleted_documents_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mmap;
|
let mmap;
|
||||||
@ -327,7 +334,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
enum DatabaseType {
|
enum DatabaseType {
|
||||||
Main,
|
Main,
|
||||||
WordDocids,
|
WordDocids,
|
||||||
FacetValuesDocids,
|
FacetLevel0ValuesDocids,
|
||||||
}
|
}
|
||||||
|
|
||||||
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
|
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
|
||||||
@ -427,7 +434,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
(DatabaseType::Main, main_readers, main_merge as MergeFn),
|
(DatabaseType::Main, main_readers, main_merge as MergeFn),
|
||||||
(DatabaseType::WordDocids, word_docids_readers, word_docids_merge),
|
(DatabaseType::WordDocids, word_docids_readers, word_docids_merge),
|
||||||
(
|
(
|
||||||
DatabaseType::FacetValuesDocids,
|
DatabaseType::FacetLevel0ValuesDocids,
|
||||||
facet_field_value_docids_readers,
|
facet_field_value_docids_readers,
|
||||||
facet_field_value_docids_merge,
|
facet_field_value_docids_merge,
|
||||||
),
|
),
|
||||||
@ -557,7 +564,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
write_method,
|
write_method,
|
||||||
)?;
|
)?;
|
||||||
},
|
},
|
||||||
DatabaseType::FacetValuesDocids => {
|
DatabaseType::FacetLevel0ValuesDocids => {
|
||||||
debug!("Writing the facet values docids into LMDB on disk...");
|
debug!("Writing the facet values docids into LMDB on disk...");
|
||||||
let db = *self.index.facet_field_id_value_docids.as_polymorph();
|
let db = *self.index.facet_field_id_value_docids.as_polymorph();
|
||||||
write_into_lmdb_database(
|
write_into_lmdb_database(
|
||||||
@ -577,6 +584,18 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut builder = Facets::new(self.wtxn, self.index);
|
||||||
|
builder.chunk_compression_type = self.chunk_compression_type;
|
||||||
|
builder.chunk_compression_level = self.chunk_compression_level;
|
||||||
|
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||||
|
if let Some(value) = self.facet_level_group_size {
|
||||||
|
builder.level_group_size(value);
|
||||||
|
}
|
||||||
|
if let Some(value) = self.facet_min_level_size {
|
||||||
|
builder.min_level_size(value);
|
||||||
|
}
|
||||||
|
builder.execute()?;
|
||||||
|
|
||||||
debug_assert_eq!(database_count, total_databases);
|
debug_assert_eq!(database_count, total_databases);
|
||||||
|
|
||||||
info!("Transform output indexed in {:.02?}", before_indexing.elapsed());
|
info!("Transform output indexed in {:.02?}", before_indexing.elapsed());
|
||||||
|
@ -19,7 +19,7 @@ use tempfile::tempfile;
|
|||||||
|
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||||
use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec};
|
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
|
||||||
use crate::tokenizer::{simple_tokenizer, only_token};
|
use crate::tokenizer::{simple_tokenizer, only_token};
|
||||||
use crate::update::UpdateIndexingStep;
|
use crate::update::UpdateIndexingStep;
|
||||||
use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId};
|
use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId};
|
||||||
@ -337,8 +337,8 @@ impl Store {
|
|||||||
for ((field_id, value), docids) in iter {
|
for ((field_id, value), docids) in iter {
|
||||||
let result = match value {
|
let result = match value {
|
||||||
String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned),
|
String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned),
|
||||||
Float(f) => FacetValueF64Codec::bytes_encode(&(field_id, *f)).map(Cow::into_owned),
|
Float(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned),
|
||||||
Integer(i) => FacetValueI64Codec::bytes_encode(&(field_id, i)).map(Cow::into_owned),
|
Integer(i) => FacetLevelValueI64Codec::bytes_encode(&(field_id, 0, i, i)).map(Cow::into_owned),
|
||||||
};
|
};
|
||||||
let key = result.context("could not serialize facet key")?;
|
let key = result.context("could not serialize facet key")?;
|
||||||
let bytes = CboRoaringBitmapCodec::bytes_encode(&docids)
|
let bytes = CboRoaringBitmapCodec::bytes_encode(&docids)
|
||||||
@ -399,7 +399,7 @@ impl Store {
|
|||||||
// We skip documents that must not be indexed by this thread.
|
// We skip documents that must not be indexed by this thread.
|
||||||
if count % num_threads == thread_index {
|
if count % num_threads == thread_index {
|
||||||
// This is a log routine that we do every `log_every_n` documents.
|
// This is a log routine that we do every `log_every_n` documents.
|
||||||
if log_every_n.map_or(false, |len| count % len == 0) {
|
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) {
|
||||||
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
|
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
|
||||||
progress_callback(UpdateIndexingStep::IndexDocuments {
|
progress_callback(UpdateIndexingStep::IndexDocuments {
|
||||||
documents_seen: count,
|
documents_seen: count,
|
||||||
@ -571,7 +571,10 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec
|
|||||||
Value::Null => Ok(()),
|
Value::Null => Ok(()),
|
||||||
Value::Bool(b) => Ok(output.push(Integer(*b as i64))),
|
Value::Bool(b) => Ok(output.push(Integer(*b as i64))),
|
||||||
Value::Number(number) => match ftype {
|
Value::Number(number) => match ftype {
|
||||||
FacetType::String => bail!("invalid facet type, expecting {} found number", ftype),
|
FacetType::String => {
|
||||||
|
let string = SmallString32::from(number.to_string());
|
||||||
|
Ok(output.push(String(string)))
|
||||||
|
},
|
||||||
FacetType::Float => match number.as_f64() {
|
FacetType::Float => match number.as_f64() {
|
||||||
Some(float) => Ok(output.push(Float(OrderedFloat(float)))),
|
Some(float) => Ok(output.push(Float(OrderedFloat(float)))),
|
||||||
None => bail!("invalid facet type, expecting {} found integer", ftype),
|
None => bail!("invalid facet type, expecting {} found integer", ftype),
|
||||||
@ -586,7 +589,7 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
Value::String(string) => {
|
Value::String(string) => {
|
||||||
let string = string.trim();
|
let string = string.trim().to_lowercase();
|
||||||
if string.is_empty() { return Ok(()) }
|
if string.is_empty() { return Ok(()) }
|
||||||
match ftype {
|
match ftype {
|
||||||
FacetType::String => {
|
FacetType::String => {
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
mod available_documents_ids;
|
mod available_documents_ids;
|
||||||
mod clear_documents;
|
mod clear_documents;
|
||||||
mod delete_documents;
|
mod delete_documents;
|
||||||
|
mod facets;
|
||||||
mod index_documents;
|
mod index_documents;
|
||||||
mod settings;
|
mod settings;
|
||||||
mod update_builder;
|
mod update_builder;
|
||||||
@ -11,6 +12,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
|
|||||||
pub use self::clear_documents::ClearDocuments;
|
pub use self::clear_documents::ClearDocuments;
|
||||||
pub use self::delete_documents::DeleteDocuments;
|
pub use self::delete_documents::DeleteDocuments;
|
||||||
pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat};
|
pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat};
|
||||||
|
pub use self::facets::Facets;
|
||||||
pub use self::settings::Settings;
|
pub use self::settings::Settings;
|
||||||
pub use self::update_builder::UpdateBuilder;
|
pub use self::update_builder::UpdateBuilder;
|
||||||
pub use self::update_step::UpdateIndexingStep;
|
pub use self::update_step::UpdateIndexingStep;
|
||||||
|
@ -412,6 +412,23 @@ mod tests {
|
|||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
let fields_ids = index.faceted_fields(&rtxn).unwrap();
|
let fields_ids = index.faceted_fields(&rtxn).unwrap();
|
||||||
assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer });
|
assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer });
|
||||||
|
// Only count the field_id 0 and level 0 facet values.
|
||||||
|
let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count();
|
||||||
|
assert_eq!(count, 3);
|
||||||
|
drop(rtxn);
|
||||||
|
|
||||||
|
// Index a little more documents with new and current facets values.
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..];
|
||||||
|
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||||
|
builder.update_format(UpdateFormat::Csv);
|
||||||
|
builder.execute(content, |_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
// Only count the field_id 0 and level 0 facet values.
|
||||||
|
let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count();
|
||||||
|
assert_eq!(count, 4);
|
||||||
drop(rtxn);
|
drop(rtxn);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,7 @@ use grenad::CompressionType;
|
|||||||
use rayon::ThreadPool;
|
use rayon::ThreadPool;
|
||||||
|
|
||||||
use crate::Index;
|
use crate::Index;
|
||||||
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings};
|
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
|
||||||
|
|
||||||
pub struct UpdateBuilder<'a> {
|
pub struct UpdateBuilder<'a> {
|
||||||
pub(crate) log_every_n: Option<usize>,
|
pub(crate) log_every_n: Option<usize>,
|
||||||
@ -118,4 +118,19 @@ impl<'a> UpdateBuilder<'a> {
|
|||||||
|
|
||||||
builder
|
builder
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn facets<'t, 'u, 'i>(
|
||||||
|
self,
|
||||||
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
|
index: &'i Index,
|
||||||
|
) -> Facets<'t, 'u, 'i>
|
||||||
|
{
|
||||||
|
let mut builder = Facets::new(wtxn, index);
|
||||||
|
|
||||||
|
builder.chunk_compression_type = self.chunk_compression_type;
|
||||||
|
builder.chunk_compression_level = self.chunk_compression_level;
|
||||||
|
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||||
|
|
||||||
|
builder
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user