diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index c5e022bd7..7a07997ae 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -104,7 +104,7 @@ jobs: - uses: helix-editor/rust-toolchain@v1 - name: Run cargo tree without default features and check lindera is not present run: | - if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -vqz lindera; then + if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -qz lindera; then echo "lindera has been found in the sources and it shouldn't" exit 1 fi diff --git a/Cargo.lock b/Cargo.lock index f20d2f289..156e3d146 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,9 +36,9 @@ dependencies = [ [[package]] name = "actix-http" -version = "3.6.0" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d223b13fd481fc0d1f83bb12659ae774d9e3601814c68a0bc539731698cca743" +checksum = "4eb9843d84c775696c37d9a418bbb01b932629d01870722c0f13eb3f95e2536d" dependencies = [ "actix-codec", "actix-rt", @@ -46,7 +46,7 @@ dependencies = [ "actix-tls", "actix-utils", "ahash", - "base64 0.21.7", + "base64 0.22.1", "bitflags 2.5.0", "brotli", "bytes", @@ -85,13 +85,15 @@ dependencies = [ [[package]] name = "actix-router" -version = "0.5.1" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d66ff4d247d2b160861fa2866457e85706833527840e4133f8f49aa423a38799" +checksum = "13d324164c51f63867b57e73ba5936ea151b8a41a1d23d1031eeb9f70d0236f8" dependencies = [ "bytestring", + "cfg-if", "http 0.2.11", "regex", + "regex-lite", "serde", "tracing", ] @@ -138,9 +140,9 @@ dependencies = [ [[package]] name = "actix-tls" -version = "3.3.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4cce60a2f2b477bc72e5cde0af1812a6e82d8fd85b5570a5dcf2a5bf2c5be5f" +checksum = "ac453898d866cdbecdbc2334fe1738c747b4eba14a677261f2b768ba05329389" dependencies = [ "actix-rt", "actix-service", @@ -167,9 +169,9 @@ dependencies = [ [[package]] name = "actix-web" -version = "4.5.1" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a6556ddebb638c2358714d853257ed226ece6023ef9364f23f0c70737ea984" +checksum = "b1cf67dadb19d7c95e5a299e2dda24193b89d5d4f33a3b9800888ede9e19aa32" dependencies = [ "actix-codec", "actix-http", @@ -196,7 +198,7 @@ dependencies = [ "mime", "once_cell", "pin-project-lite", - "regex", + "regex-lite", "serde", "serde_json", "serde_urlencoded", @@ -220,8 +222,9 @@ dependencies = [ [[package]] name = "actix-web-static-files" -version = "3.0.5" -source = "git+https://github.com/kilork/actix-web-static-files.git?rev=2d3b6160#2d3b6160f0de4ba061c5d76b5704f34fb677f6df" +version = "4.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adf6d1ef6d7a60e084f9e0595e2a5234abda14e76c105ecf8e2d0e8800c41a1f" dependencies = [ "actix-web", "derive_more", @@ -378,9 +381,9 @@ dependencies = [ [[package]] name = "arroy" -version = "0.3.1" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9" +checksum = "2ece9e5347e7fdaaea3181dec7f916677ad5f3fcbac183648ce1924eb4aeef9a" dependencies = [ "bytemuck", "byteorder", @@ -613,9 +616,9 @@ dependencies = [ [[package]] name = "brotli" -version = "3.4.0" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f" +checksum = "74f7971dbd9326d58187408ab83117d8ac1bb9c17b085fdacd1cf2f598719b6b" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -624,9 +627,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "2.5.1" +version = "4.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f" +checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -676,9 +679,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" [[package]] name = "bytemuck" -version = "1.15.0" +version = "1.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15" +checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" dependencies = [ "bytemuck_derive", ] @@ -895,9 +898,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "933f20f2269b24d32fd5503e7b3c268af902190daf8d9d2b73ed2e75d77c00b4" +checksum = "11a09ae38cfcc153f01576c3f579dfd916e0320f1b474f298c8d680b2dd92eb6" dependencies = [ "aho-corasick", "cow-utils", @@ -986,7 +989,7 @@ dependencies = [ "anstream", "anstyle", "clap_lex", - "strsim", + "strsim 0.10.0", ] [[package]] @@ -1277,12 +1280,12 @@ dependencies = [ [[package]] name = "darling" -version = "0.20.3" +version = "0.20.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0209d94da627ab5605dcccf08bb18afa5009cfbef48d8a8b7d7bdbc79be25c5e" +checksum = "83b2eb4d90d12bdda5ed17de686c2acb4c57914f8f921b8da7e112b5a36f3fe1" dependencies = [ - "darling_core 0.20.3", - "darling_macro 0.20.3", + "darling_core 0.20.9", + "darling_macro 0.20.9", ] [[package]] @@ -1295,21 +1298,21 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.10.0", "syn 1.0.109", ] [[package]] name = "darling_core" -version = "0.20.3" +version = "0.20.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "177e3443818124b357d8e76f53be906d60937f0d3a90773a664fa63fa253e621" +checksum = "622687fe0bac72a04e5599029151f5796111b90f1baaa9b544d807a5e31cd120" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.11.1", "syn 2.0.60", ] @@ -1326,11 +1329,11 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.20.3" +version = "0.20.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" +checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178" dependencies = [ - "darling_core 0.20.3", + "darling_core 0.20.9", "quote", "syn 2.0.60", ] @@ -1383,6 +1386,15 @@ dependencies = [ "derive_builder_macro 0.13.1", ] +[[package]] +name = "derive_builder" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7" +dependencies = [ + "derive_builder_macro 0.20.0", +] + [[package]] name = "derive_builder_core" version = "0.12.0" @@ -1407,6 +1419,18 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_builder_core" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d" +dependencies = [ + "darling 0.20.9", + "proc-macro2", + "quote", + "syn 2.0.60", +] + [[package]] name = "derive_builder_macro" version = "0.12.0" @@ -1427,6 +1451,16 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_builder_macro" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" +dependencies = [ + "derive_builder_core 0.20.0", + "syn 2.0.60", +] + [[package]] name = "derive_more" version = "0.99.17" @@ -1454,7 +1488,7 @@ dependencies = [ "serde-cs", "serde_json", "serde_urlencoded", - "strsim", + "strsim 0.10.0", ] [[package]] @@ -1707,29 +1741,6 @@ dependencies = [ "syn 2.0.60", ] -[[package]] -name = "env_filter" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" -dependencies = [ - "log", - "regex", -] - -[[package]] -name = "env_logger" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" -dependencies = [ - "anstream", - "anstyle", - "env_filter", - "humantime", - "log", -] - [[package]] name = "equivalent" version = "1.0.1" @@ -1784,7 +1795,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d15473d7f83b54a44826907af16ae5727eaacaf6e53b51474016d3efd9aa35d5" dependencies = [ - "darling 0.20.3", + "darling 0.20.9", "proc-macro2", "quote", "syn 2.0.60", @@ -2262,9 +2273,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "heed" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd" +checksum = "f60d7cff16094be9627830b399c087a25017e93fb3768b87cd656a68ccb1ebe8" dependencies = [ "bitflags 2.5.0", "byteorder", @@ -2379,12 +2390,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - [[package]] name = "hyper" version = "0.14.27" @@ -2450,6 +2455,7 @@ name = "index-scheduler" version = "1.9.0" dependencies = [ "anyhow", + "arroy", "big_s", "bincode", "crossbeam", @@ -2460,6 +2466,7 @@ dependencies = [ "file-store", "flate2", "insta", + "maplit", "meili-snap", "meilisearch-auth", "meilisearch-types", @@ -2778,9 +2785,9 @@ dependencies = [ [[package]] name = "lindera" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1bbf252ea3490053dc397539ece0b510924f2f72605fa28d3e858d86f43ec88" +checksum = "dcd4fa369654517f72c10b24adf03ad4ce69d19facb79c3cb3cf9b4580ac352f" dependencies = [ "lindera-analyzer", "lindera-core", @@ -2791,9 +2798,9 @@ dependencies = [ [[package]] name = "lindera-analyzer" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87febfec0e2859ce2154fb90dd6f66b774ddb0b6e264b44f8e3d1303c9dcedd7" +checksum = "c2cba7fe275cb8ec4c594cfee9cc39e48b71e02a089457d52f3e70dc146a8133" dependencies = [ "anyhow", "bincode", 
@@ -2821,9 +2828,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb91bb8a93ab0f95dbc3c43b5105354bb059134ef731154f75a64b5d919e71d" +checksum = "240adf9faba3f09ad16557aefcd316dd00ebb940ac94334a629660d772f118c1" dependencies = [ "bincode", "byteorder", @@ -2835,29 +2842,21 @@ dependencies = [ [[package]] name = "lindera-cc-cedict-builder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6022a8309a287dbef425fd09a61585351670c83001d74f6c089979e2330b683" +checksum = "f12241f9e74babe708a0b9441d9f3fa67cb29fd01257918f30ffd480ca568820" dependencies = [ "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-compress", "lindera-core", "lindera-decompress", - "log", - "yada", + "lindera-dictionary-builder", ] [[package]] name = "lindera-compress" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32363cbcf433f915e7d77c2a0c410db2d6b23442e80715cf2cf6b9864078a500" +checksum = "50f9f7a858d70ff9e4383cbd507ca9e98c8faf0319e08c10df4c30cb58c9ca6c" dependencies = [ "anyhow", "flate2", @@ -2866,9 +2865,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9a0e858753a02b1a3524fae4fbb11ca4b3a947128fd7854b797386562678be8" +checksum = "7f09810ab98ce2a084d788ac38fbb7b31697f34bc47c61de0d880320a674bd15" dependencies = [ "anyhow", "bincode", @@ -2883,9 +2882,9 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e406345f6f8b665b9a129c67079c18ca9d97e9d171d102b4106a64a592c285e" +checksum = "d53400c9b2dd6b45f82d9fa5b5efe079f3acaf6ce609dba8d42c8a76baaa2b12" dependencies = [ "anyhow", "flate2", @@ -2894,9 +2893,9 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e2a3ec0e5fd6768a27c6ec1040e8470d3a5926418f7afe065859e98aabb3bfe" +checksum = "2053d064a515839250438b8dfa6cf445e2b97633232ded34a54f267e945d196e" dependencies = [ "anyhow", "bincode", @@ -2918,10 +2917,32 @@ dependencies = [ ] [[package]] -name = "lindera-filter" -version = "0.30.0" +name = "lindera-dictionary-builder" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1badaf51bad051185ea4917ba91bbbf2d6f8167e155647e21e0eaaef0982a95d" +checksum = "14f486924055f8bedcc5877572e4dc91fbc10370862430ac2e5f7f0d671a18c8" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "derive_builder 0.20.0", + "encoding", + "encoding_rs", + "encoding_rs_io", + "glob", + "lindera-compress", + "lindera-core", + "lindera-decompress", + "log", + "yada", +] + +[[package]] +name = "lindera-filter" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb3904fc279f0297f6fd6210435adab1f8c82ba84eba8635407c791af51c0d8a" dependencies = [ "anyhow", "csv", @@ -2944,9 +2965,9 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "129ec16366354998f9791467ad38731539197747f649e573ead845358271ce25" +checksum = 
"4aa3ef2f1f6838b0fa2e2fca2896242bb83bc877c1760cdb6fa23449ab95d664" dependencies = [ "bincode", "byteorder", @@ -2958,31 +2979,21 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f0979a56bc57e9c9be2996dff232c47aa146a2e7baebf5dd567e388eba3dd90" +checksum = "a41287db18eadb58d73a04d49778d41c161549fbbbe155d4338976b7b8541c7d" dependencies = [ "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-compress", "lindera-core", "lindera-decompress", - "log", - "serde", - "yada", + "lindera-dictionary-builder", ] [[package]] name = "lindera-ipadic-neologd" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20076660c4e79ef0316735b44e18ec7644e54786acdee8946c972d5f97086d0f" +checksum = "49382256f245078400bf7e72663f9eb30afcd9ed54cd46f29d7db1be529678e1" dependencies = [ "bincode", "byteorder", @@ -2994,31 +3005,21 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd-builder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eccd18ed5f65d1d64ac0cbfa1d6827bfbbaf6530520ae6847e6a91ee38f47e20" +checksum = "5ae9cfd2fda68ef526ef0c7b50c5d4d5582a4daa6ecd0cea9e2b0b62564a2a5d" dependencies = [ "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-compress", "lindera-core", "lindera-decompress", - "log", - "serde", - "yada", + "lindera-dictionary-builder", ] [[package]] name = "lindera-ko-dic" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59073171566c3e498ca048e84c2d0a7e117a42f36c8eb7d7163e65ac38bd6d48" +checksum = "7f86d03a863f3ae1d269e7b7d4dd2cce9385a53463479bafc5d7aa48719f36db" dependencies = [ "bincode", "byteorder", @@ -3034,29 +3035,21 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae176afa8535ca2a5ee9471873f85d531db0a6c32a3c42b41084506aac22b577" +checksum = "bd0f44f2e56358c5879dfb5e7f76cc6ba7853ec31082c4e3f8fb65fb2d849c51" dependencies = [ "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-compress", "lindera-core", "lindera-decompress", - "log", - "yada", + "lindera-dictionary-builder", ] [[package]] name = "lindera-tokenizer" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "457285bdde84571aa510c9e05371904305a55e8a541fa1473d4393062f06932d" +checksum = "7c5182735cdc2832ac757b31e8a5b150a3514357a30efe3dec212f8dcb06ba14" dependencies = [ "bincode", "lindera-core", @@ -3068,9 +3061,9 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5839980be552dfa639b70964c61914a9ad014148663679b0e148aa72e5e30f23" +checksum = "6c63da104728dd1cf14bfa564753cbfa996f6078ed2e23e31475bd1d639fc597" dependencies = [ "bincode", "byteorder", @@ -3086,22 +3079,14 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dcaab8f061d5b944b1e424f49c7efbf8f276e8a72e4f4ff956d01e46d481f008" +checksum = "04acecbc068dac21766a1b7ed1f2608b6f250d10b4f8bff67abc2a00437a0974" dependencies = [ "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-compress", "lindera-core", "lindera-decompress", - "log", - "yada", + "lindera-dictionary-builder", ] [[package]] @@ -3187,9 +3172,9 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da" [[package]] name = "lmdb-master-sys" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a" +checksum = "a5142795c220effa4c8f4813537bd4c88113a07e45e93100ccb2adc5cec6c7f3" dependencies = [ "cc", "doxygen-rs", @@ -4340,6 +4325,12 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "regex-lite" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" + [[package]] name = "regex-syntax" version = "0.8.2" @@ -4388,12 +4379,6 @@ dependencies = [ "winreg", ] -[[package]] -name = "retain_mut" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" - [[package]] name = "ring" version = "0.17.8" @@ -4411,13 +4396,12 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.2" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" +checksum = "7699249cc2c7d71939f30868f47e9d7add0bdc030d90ee10bfd16887ff8bb1c8" dependencies = [ "bytemuck", "byteorder", - "retain_mut", "serde", ] @@ -4900,6 +4884,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.26.2" @@ -5313,9 +5303,9 @@ dependencies = [ [[package]] name = "tracing-actix-web" -version = "0.7.9" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fe0d5feac3f4ca21ba33496bcb1ccab58cca6412b1405ae80f0581541e0ca78" +checksum = "4ee9e39a66d9b615644893ffc1704d2a89b5b315b7fd0228ad3182ca9a306b19" dependencies = [ "actix-web", "mutually_exclusive_features", diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap index 0aad0ea97..a9c76227a 100644 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap @@ -780,7 +780,7 @@ expression: document 1.3484878540039063 ] ], - "userProvided": false + "regenerate": true } } } diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap index f2a5e1d69..e5d28e450 100644 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap @@ -779,7 +779,7 @@ 
expression: document 1.04031240940094 ] ], - "userProvided": false + "regenerate": true } } } diff --git a/dump/tests/assets/v6-with-vectors.dump b/dump/tests/assets/v6-with-vectors.dump index 9f8ed2ba1..8c0505772 100644 Binary files a/dump/tests/assets/v6-with-vectors.dump and b/dump/tests/assets/v6-with-vectors.dump differ diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 21fa34733..aff3b379f 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -40,7 +40,9 @@ ureq = "2.9.7" uuid = { version = "1.6.1", features = ["serde", "v4"] } [dev-dependencies] +arroy = "0.4.0" big_s = "1.0.2" crossbeam = "0.8.4" insta = { version = "1.34.0", features = ["json", "redactions"] } +maplit = "1.0.2" meili-snap = { path = "../meili-snap" } diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 181ac49a3..cd5525eea 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -909,6 +909,7 @@ impl IndexScheduler { let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index.embedding_configs(&rtxn)?; // 3.1. Dump the documents for ret in index.all_documents(&rtxn)? { @@ -951,16 +952,21 @@ impl IndexScheduler { }; for (embedder_name, embeddings) in embeddings { - // don't change the entry if it already exists, because it was user-provided - vectors.entry(embedder_name).or_insert_with(|| { - let embeddings = ExplicitVectors { - embeddings: VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, - ), - user_provided: false, - }; - serde_json::to_value(embeddings).unwrap() - }); + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(id)); + + let embeddings = ExplicitVectors { + embeddings: Some( + VectorOrArrayOfVectors::from_array_of_vectors(embeddings), + ), + regenerate: !user_provided, + }; + vectors.insert( + embedder_name, + serde_json::to_value(embeddings).unwrap(), + ); } }
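Note on the batch.rs hunk above: dumps now write every embedding with an explicit `regenerate` flag derived from the per-embedder `user_provided` bitmap, instead of only filling in missing `_vectors` entries with `userProvided: false`. A minimal sketch of that derivation, using a simplified stand-in for milli's `ExplicitVectors` (the struct below is illustrative, not the real type):

```rust
use roaring::RoaringBitmap;
use serde::Serialize;

// Simplified stand-in for milli's ExplicitVectors (illustrative only).
#[derive(Serialize)]
struct ExplicitVectors {
    embeddings: Option<Vec<Vec<f32>>>,
    regenerate: bool,
}

// An embedding must be regenerated on update unless its docid is recorded
// in the embedder's `user_provided` bitmap.
fn dump_entry(user_provided: &RoaringBitmap, docid: u32, embeddings: Vec<Vec<f32>>) -> ExplicitVectors {
    ExplicitVectors { embeddings: Some(embeddings), regenerate: !user_provided.contains(docid) }
}
```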
diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 5557764e9..0b98cc22a 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -53,6 +53,7 @@ use meilisearch_types::heed::byteorder::BE; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; use meilisearch_types::milli::documents::DocumentsBatchBuilder; +use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; @@ -1459,33 +1460,39 @@ impl IndexScheduler { // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, - embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>, + embedding_configs: Vec<IndexEmbeddingConfig>, ) -> Result<EmbeddingConfigs> { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| { - let prompt = - Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); - // optimistically return existing embedder - { - let embedders = self.embedders.read().unwrap(); - if let Some(embedder) = embedders.get(&embedder_options) { - return Ok((name, (embedder.clone(), prompt))); + .map( + |IndexEmbeddingConfig { + name, + config: milli::vector::EmbeddingConfig { embedder_options, prompt }, + .. + }| { + let prompt = + Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); + // optimistically return existing embedder + { + let embedders = self.embedders.read().unwrap(); + if let Some(embedder) = embedders.get(&embedder_options) { + return Ok((name, (embedder.clone(), prompt))); + } } - } - // add missing embedder - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(meilisearch_types::milli::vector::Error::from) - .map_err(meilisearch_types::milli::Error::from)?, - ); - { - let mut embedders = self.embedders.write().unwrap(); - embedders.insert(embedder_options, embedder.clone()); - } - Ok((name, (embedder, prompt))) - }) + // add missing embedder + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(meilisearch_types::milli::vector::Error::from) + .map_err(meilisearch_types::milli::Error::from)?, + ); + { + let mut embedders = self.embedders.write().unwrap(); + embedders.insert(embedder_options, embedder.clone()); + } + Ok((name, (embedder, prompt))) + }, + ) .collect(); res.map(EmbeddingConfigs::new) } @@ -1748,6 +1755,9 @@ mod tests { use meilisearch_types::milli::update::IndexDocumentsMethod::{ ReplaceDocuments, UpdateDocuments, }; + use meilisearch_types::milli::update::Setting; + use meilisearch_types::milli::vector::settings::EmbeddingSettings; + use meilisearch_types::settings::Unchecked; use meilisearch_types::tasks::IndexSwap; use meilisearch_types::VERSION_FILE_NAME; use tempfile::{NamedTempFile, TempDir}; @@ -1826,6 +1836,7 @@ mod tests { assert_eq!(breakpoint, (Init, false)); let index_scheduler_handle = IndexSchedulerHandle { _tempdir: tempdir, + index_scheduler: index_scheduler.private_clone(), test_breakpoint_rcv: receiver, last_breakpoint: breakpoint.0, }; @@ -1914,6 +1925,7 @@ mod tests { pub struct IndexSchedulerHandle { _tempdir: TempDir, + index_scheduler: IndexScheduler, test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>, last_breakpoint: Breakpoint, } @@ -1931,9 +1943,13 @@ { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") + } + Err(RecvTimeoutError::Disconnected) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler crashed.\n{state}") } - Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."), }; // if we've already encountered a breakpoint we're supposed to be stuck on the false // and we expect the same variant with the true to come now. @@ -1952,9 +1968,13 @@ { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") + } + Err(RecvTimeoutError::Disconnected) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler crashed.\n{state}") } - Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."), }; assert!(!b, "Found the breakpoint handle in a bad state. 
Check your test suite"); @@ -1968,9 +1988,10 @@ mod tests { fn advance_till(&mut self, breakpoints: impl IntoIterator) { for breakpoint in breakpoints { let b = self.advance(); + let state = snapshot_index_scheduler(&self.index_scheduler); assert_eq!( b, breakpoint, - "Was expecting the breakpoint `{:?}` but instead got `{:?}`.", + "Was expecting the breakpoint `{:?}` but instead got `{:?}`.\n{state}", breakpoint, b ); } @@ -1995,6 +2016,7 @@ mod tests { // Wait for one successful batch. #[track_caller] fn advance_one_successful_batch(&mut self) { + self.index_scheduler.assert_internally_consistent(); self.advance_till([Start, BatchCreated]); loop { match self.advance() { @@ -2003,13 +2025,17 @@ mod tests { InsideProcessBatch => (), // the batch went successfully, we can stop the loop and go on with the next states. ProcessBatchSucceeded => break, - AbortedIndexation => panic!("The batch was aborted."), - ProcessBatchFailed => panic!("The batch failed."), + AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), + ProcessBatchFailed => { + while self.advance() != Start {} + panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)) + }, breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } self.advance_till([AfterProcessing]); + self.index_scheduler.assert_internally_consistent(); } // Wait for one failed batch. @@ -2023,8 +2049,8 @@ mod tests { InsideProcessBatch => (), // the batch went failed, we can stop the loop and go on with the next states. ProcessBatchFailed => break, - ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)"), - AbortedIndexation => panic!("The batch was aborted."), + ProcessBatchSucceeded => panic!("The batch succeeded. 
(and it wasn't supposed to sorry)\n{}", snapshot_index_scheduler(&self.index_scheduler)), + AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } @@ -3052,8 +3078,10 @@ mod tests { let rtxn = index.read_txn().unwrap(); let configs = index.embedding_configs(&rtxn).unwrap(); - let (_, embedding_config) = configs.first().unwrap(); - insta::assert_json_snapshot!(embedding_config.embedder_options); + let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap(); + insta::assert_snapshot!(name, @"default"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_json_snapshot!(config.embedder_options); } #[test] @@ -4989,7 +5017,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors"); @@ -5000,7 +5027,7 @@ mod tests { insta::assert_json_snapshot!(task.details); } - handle.advance_n_successful_batches(1); + handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); { @@ -5017,13 +5044,17 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, fakerest_config) = configs.get(0).unwrap(); - insta::assert_json_snapshot!(name, @r###""A_fakerest""###); + let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } = + configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let (name, simple_hf_config) = configs.get(1).unwrap(); - insta::assert_json_snapshot!(name, @r###""B_small_hf""###); + let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } = + configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); @@ -5038,25 +5069,25 @@ mod tests { // add one doc, specifying vectors let doc = serde_json::json!( - { - "id": 0, - "doggo": "Intel", - "breed": "beagle", - "_vectors": { - &fakerest_name: { - // this will never trigger regeneration, which is good because we can't actually generate with - // this embedder - "userProvided": true, - "embeddings": beagle_embed, - }, - &simple_hf_name: { - // this will be regenerated on updates - "userProvided": false, - "embeddings": lab_embed, - }, - "noise": [0.1, 0.2, 0.3] - } - } + { + "id": 0, + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + &fakerest_name: { + // this will never trigger regeneration, which is good because we can't actually generate with + // this embedder + "regenerate": false, + "embeddings": beagle_embed, + }, + &simple_hf_name: { + // this will be regenerated on updates + "regenerate": true, + "embeddings": lab_embed, + }, + "noise": [0.1, 0.2, 0.3] + } + } ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap(); @@ -5078,7 +5109,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: 
"after adding Intel"); @@ -5091,6 +5121,19 @@ mod tests { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + // Ensure the document have been inserted into the relevant bitamp + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = + configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + let embeddings = index.embeddings(&rtxn, 0).unwrap(); assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); @@ -5140,7 +5183,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); @@ -5153,11 +5195,25 @@ mod tests { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + // Ensure the document have been inserted into the relevant bitamp + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = + configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let IndexEmbeddingConfig { name, config: _, user_provided } = + configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + let embeddings = index.embeddings(&rtxn, 0).unwrap(); - // automatically changed to patou + // automatically changed to patou because set to regenerate assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); - // remained beagle because set to userProvided + // remained beagle assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; @@ -5176,4 +5232,578 @@ mod tests { } } } + + #[test] + fn import_vectors_first_and_embedder_later() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "my_doggo_embedder": vec![1; 384], + "unknown embedder": vec![1, 2, 3], + } + }, + { + "id": 2, + "doggo": "max", + "_vectors": { + "my_doggo_embedder": { + "regenerate": false, + "embeddings": vec![2; 384], + }, + "unknown embedder": vec![4, 5], + }, + }, + { + "id": 3, + "doggo": "marcel", + "_vectors": { + "my_doggo_embedder": { + "regenerate": true, + "embeddings": vec![3; 384], + }, + }, + }, + { + "id": 4, + "doggo": "sora", + "_vectors": { + "my_doggo_embedder": { + "regenerate": true, + }, + }, + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"5"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, 
+ + #[test] + fn import_vectors_first_and_embedder_later() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "my_doggo_embedder": vec![1; 384], + "unknown embedder": vec![1, 2, 3], + } + }, + { + "id": 2, + "doggo": "max", + "_vectors": { + "my_doggo_embedder": { + "regenerate": false, + "embeddings": vec![2; 384], + }, + "unknown embedder": vec![4, 5], + }, + }, + { + "id": 3, + "doggo": "marcel", + "_vectors": { + "my_doggo_embedder": { + "regenerate": true, + "embeddings": vec![3; 384], + }, + }, + }, + { + "id": 4, + "doggo": "sora", + "_vectors": { + "my_doggo_embedder": { + "regenerate": true, + }, + }, + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"5"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push"); + + let setting = meilisearch_types::settings::Settings::<Unchecked> { + embedders: Setting::Set(maplit::btreemap! { + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + ..Default::default() + }) + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + // all the vectors linked to the newly specified embedder have been removed; + // only the unknown embedders stay in the document DB + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + // even though we specified the vector for the ID 3, it shouldn't be marked + // as user provided since we explicitly marked it as NOT user provided. 
+ snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "my_doggo_embedder", + config: EmbeddingConfig { + embedder_options: HuggingFace( + EmbedderOptions { + model: "sentence-transformers/all-MiniLM-L6-v2", + revision: Some( + "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + ), + distribution: None, + }, + ), + prompt: PromptData { + template: "{{doc.doggo}}", + }, + }, + user_provided: RoaringBitmap<[1, 2]>, + }, + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + // the document with the id 3 should keep its original embedding + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let mut embeddings = Vec::new(); + + 'vectors: for i in 0..=u8::MAX { + let reader = arroy::Reader::open(&rtxn, i as u16, index.vector_arroy) + .map(Some) + .or_else(|e| match e { + arroy::Error::MissingMetadata(_) => Ok(None), + e => Err(e), + }) + .transpose(); + + let Some(reader) = reader else { + break 'vectors; + }; + + let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap(); + if let Some(embedding) = embedding { + embeddings.push(embedding) + } else { + break 'vectors; + } + } + + snapshot!(embeddings.len(), @"1"); + assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); + + // If we update marcel it should regenerate its embedding automatically + + let content = serde_json::json!( + [ + { + "id": 3, + "doggo": "marvel", + }, + { + "id": 4, + "doggo": "sorry", + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + // the document with the id 3 should have its original embedding updated + let rtxn = index.read_txn().unwrap(); + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let doc = index.documents(&rtxn, Some(docid)).unwrap()[0]; + let doc = obkv_to_json(&field_ids, &field_ids_map, doc.1).unwrap(); + snapshot!(json_string!(doc), @r###" + { + "id": 3, + "doggo": "marvel" + } + "###); + + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + + assert!(!embedding.is_empty()); + assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); + + // the document with the id 4 should generate an embedding + let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + + assert!(!embedding.is_empty()); + }
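The `user_provided: RoaringBitmap<[1, 2]>` snapshot above is the bookkeeping these tests revolve around: for each embedder, the index stores a bitmap of internal docids whose embeddings must never be regenerated. A tiny illustration with the docids from the test above (plain `roaring` crate API, not Meilisearch code):

```rust
use roaring::RoaringBitmap;

fn main() {
    let mut user_provided = RoaringBitmap::new();
    user_provided.insert(1); // doc 1 sent a bare vector: implicitly user-provided
    user_provided.insert(2); // doc 2 sent `"regenerate": false`
    assert!(!user_provided.contains(3)); // doc 3 asked for `"regenerate": true`
}
```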
+ + #[test] + fn delete_document_containing_vector() { + // 1. Add an embedder + // 2. Push two documents containing a simple vector + // 3. Delete the first document + // 4. The user defined roaring bitmap shouldn't contain the id of the first document anymore + // 5. Clear the index + // 6. The user defined roaring bitmap shouldn't contain the id of the second document + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings::<Unchecked> { + embedders: Setting::Set(maplit::btreemap! { + S("manual") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }) + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + index_scheduler + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1")], + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[0]>, + }, + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["manual"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + index_scheduler + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); + let conf = index.embedding_configs(&rtxn).unwrap(); + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[]>, + }, + ] + "###); + }
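The next test covers the reverse direction: when an embedder is deleted from the settings, its user-provided embeddings are written back into the documents' `_vectors` field, marked `"regenerate": false`, so no data is lost. In miniature, mirroring that test's snapshots for its `manual` embedder:

```rust
fn main() {
    // Expected shape of document 0 once the `manual` embedder is removed:
    // its embedding moves back into the document, flagged as never-regenerate.
    let doc = serde_json::json!({
        "id": 0,
        "doggo": "kefir",
        "_vectors": {
            "manual": { "embeddings": [[0.0, 0.0, 0.0]], "regenerate": false }
        }
    });
    println!("{doc}");
}
```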
snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[]>, + }, + ] + "###); + } + + #[test] + fn delete_embedder_with_user_provided_vectors() { + // 1. Add two embedders + // 2. Push two documents containing a simple vector + // 3. The documents must not contain the vectors after the update as they are in the vectors db + // 3. Delete the embedders + // 4. The documents contain the vectors again + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! { + S("manual") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }), + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + ..Default::default() + }), + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + "my_doggo_embedder": vec![1; 384], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel"}]"###); + } + + { + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! 
{ + S("manual") => Setting::Reset, + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + } + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###); + } + + { + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Reset, + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + } + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + + // FIXME: redaction + snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), 
@r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"regenerate\":false},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"regenerate\":false}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"regenerate\":false}}}]""###); + } + } } diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap similarity index 67% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap index 002a42e59..540835dfb 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap @@ -6,10 +6,6 @@ expression: doc "doggo": "Intel", "breed": "beagle", "_vectors": { - "A_fakerest": { - "embeddings": "[vector]", - "userProvided": true - }, "noise": [ 0.1, 0.2, diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap similarity index 67% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap index 718ea229c..bc35d84f6 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap @@ -6,10 +6,6 @@ expression: doc "doggo": "kefir", "breed": "patou", "_vectors": { - "A_fakerest": { - "embeddings": "[vector]", - "userProvided": true - }, "noise": [ 0.1, 0.2, diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap similarity index 100% rename from 
index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap new file mode 100644 index 000000000..d2473d00a --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap @@ -0,0 +1,4 @@ +--- +source: index-scheduler/src/lib.rs +--- +[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],"unknown 
embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"embeddings":[2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0],"regenerate":false},"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"embeddings":[3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0],"regenerate":true}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"embeddings":null,"regenerate":true}}}] diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml index a15fc01f8..bae283137 100644 --- a/meilisearch-types/Cargo.toml +++ 
b/meilisearch-types/Cargo.toml @@ -11,7 +11,7 @@ edition.workspace = true license.workspace = true [dependencies] -actix-web = { version = "4.5.1", default-features = false } +actix-web = { version = "4.6.0", default-features = false } anyhow = "1.0.79" convert_case = "0.6.0" csv = "1.3.0" @@ -30,7 +30,12 @@ serde_json = "1.0.111" tar = "0.4.40" tempfile = "3.9.0" thiserror = "1.0.56" -time = { version = "0.3.31", features = ["serde-well-known", "formatting", "parsing", "macros"] } +time = { version = "0.3.31", features = [ + "serde-well-known", + "formatting", + "parsing", + "macros", +] } tokio = "1.35" uuid = { version = "1.6.1", features = ["serde", "v4"] } diff --git a/meilisearch-types/src/deserr/mod.rs b/meilisearch-types/src/deserr/mod.rs index c593c50fb..1c1b0e987 100644 --- a/meilisearch-types/src/deserr/mod.rs +++ b/meilisearch-types/src/deserr/mod.rs @@ -189,4 +189,6 @@ merge_with_error_impl_take_error_message!(ParseTaskKindError); merge_with_error_impl_take_error_message!(ParseTaskStatusError); merge_with_error_impl_take_error_message!(IndexUidFormatError); merge_with_error_impl_take_error_message!(InvalidSearchSemanticRatio); +merge_with_error_impl_take_error_message!(InvalidSearchRankingScoreThreshold); +merge_with_error_impl_take_error_message!(InvalidSimilarRankingScoreThreshold); merge_with_error_impl_take_error_message!(InvalidSimilarId); diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index d2218807f..f529238e4 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -222,6 +222,7 @@ InvalidApiKeyUid , InvalidRequest , BAD_REQUEST ; InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ; InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ; InvalidDocumentFields , InvalidRequest , BAD_REQUEST ; +InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQUEST ; MissingDocumentFilter , InvalidRequest , BAD_REQUEST ; InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ; InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ; @@ -240,7 +241,11 @@ InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ; InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; +InvalidSimilarRetrieveVectors , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; +InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; +InvalidSimilarRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; +InvalidSearchRetrieveVectors , InvalidRequest , BAD_REQUEST ; InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ; InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ; InvalidSearchFacets , InvalidRequest , BAD_REQUEST ; @@ -268,13 +273,14 @@ InvalidSimilarShowRankingScore , InvalidRequest , BAD_REQUEST ; InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ; InvalidSimilarShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ; InvalidSearchSort , InvalidRequest , BAD_REQUEST ; +InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ; InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ; InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ; InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ; InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ; 
-InvalidSettingsSearchCutoffMs , InvalidRequest , BAD_REQUEST ; +InvalidSettingsSearchCutoffMs , InvalidRequest , BAD_REQUEST ; InvalidSettingsEmbedders , InvalidRequest , BAD_REQUEST ; InvalidSettingsRankingRules , InvalidRequest , BAD_REQUEST ; InvalidSettingsSearchableAttributes , InvalidRequest , BAD_REQUEST ; @@ -379,6 +385,7 @@ impl ErrorCode for milli::Error { Code::IndexPrimaryKeyMultipleCandidatesFound } UserError::PrimaryKeyCannotBeChanged(_) => Code::IndexPrimaryKeyAlreadyExists, + UserError::InvalidDistinctAttribute { .. } => Code::InvalidSearchDistinct, UserError::SortRankingRuleMissing => Code::InvalidSearchSort, UserError::InvalidFacetsDistribution { .. } => Code::InvalidSearchFacets, UserError::InvalidSortableAttribute { .. } => Code::InvalidSearchSort, @@ -391,7 +398,8 @@ impl ErrorCode for milli::Error { UserError::CriterionError(_) => Code::InvalidSettingsRankingRules, UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField, UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions, - UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType, + UserError::InvalidVectorsMapType { .. } + | UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType, UserError::TooManyVectors(_, _) => Code::TooManyVectors, UserError::SortError(_) => Code::InvalidSearchSort, UserError::InvalidMinTypoWordLenSetting(_, _) => { @@ -505,6 +513,21 @@ impl fmt::Display for deserr_codes::InvalidSimilarId { } } +impl fmt::Display for deserr_codes::InvalidSearchRankingScoreThreshold { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`." + ) + } +} + +impl fmt::Display for deserr_codes::InvalidSimilarRankingScoreThreshold { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + deserr_codes::InvalidSearchRankingScoreThreshold.fmt(f) + } +} + #[macro_export] macro_rules! internal_error { ($target:ty : $($other:path), *) => { diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 223d71658..8a9708d29 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -8,6 +8,7 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; +use milli::index::IndexEmbeddingConfig; use milli::proximity::ProximityPrecision; use milli::update::Setting; use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; @@ -672,7 +673,7 @@ pub fn settings( let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() - .map(|(name, config)| (name, Setting::Set(config.into()))) + .map(|IndexEmbeddingConfig { name, config, .. 
}| (name, Setting::Set(config.into()))) .collect(); let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 4ce2b5fb3..ce73ebdcf 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -14,20 +14,20 @@ default-run = "meilisearch" [dependencies] actix-cors = "0.7.0" -actix-http = { version = "3.6.0", default-features = false, features = [ +actix-http = { version = "3.7.0", default-features = false, features = [ "compress-brotli", "compress-gzip", "rustls-0_21", ] } actix-utils = "3.0.1" -actix-web = { version = "4.5.1", default-features = false, features = [ +actix-web = { version = "4.6.0", default-features = false, features = [ "macros", "compress-brotli", "compress-gzip", "cookies", "rustls-0_21", ] } -actix-web-static-files = { git = "https://github.com/kilork/actix-web-static-files.git", rev = "2d3b6160", optional = true } +actix-web-static-files = { version = "4.0.1", optional = true } anyhow = { version = "1.0.79", features = ["backtrace"] } async-stream = "0.3.5" async-trait = "0.1.77" @@ -104,13 +104,13 @@ url = { version = "2.5.0", features = ["serde"] } tracing = "0.1.40" tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } -tracing-actix-web = "0.7.9" +tracing-actix-web = "0.7.10" build-info = { version = "1.7.0", path = "../build-info" } [dev-dependencies] actix-rt = "2.9.0" assert-json-diff = "2.0.2" -brotli = "3.4.0" +brotli = "6.0.0" insta = "1.34.0" manifest-dir-macros = "0.1.18" maplit = "1.0.2" @@ -158,5 +158,5 @@ vietnamese = ["meilisearch-types/vietnamese"] swedish-recomposition = ["meilisearch-types/swedish-recomposition"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip" -sha1 = "e20cc9b390003c6c844f4b8bcc5c5013191a77ff" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip" +sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe" diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 3468ad2c7..6863dc57b 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -74,8 +74,8 @@ pub enum DocumentDeletionKind { #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DocumentFetchKind { - PerDocumentId, - Normal { with_filter: bool, limit: usize, offset: usize }, + PerDocumentId { retrieve_vectors: bool }, + Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } pub trait Analytics: Sync + Send { diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index add430893..94e4684d5 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -597,6 +597,9 @@ pub struct SearchAggregator { // every time a request has a filter, this field must be incremented by one sort_total_number_of_criteria: usize, + // distinct + distinct: bool, + // filter filter_with_geo_radius: bool, filter_with_geo_bounding_box: bool, @@ -622,6 +625,7 @@ pub struct SearchAggregator { // Whether a non-default embedder was specified embedder: bool, hybrid: bool, + retrieve_vectors: bool, // every time a search is done, we increment the counter linked to the used settings matching_strategy: HashMap, @@ -648,6 +652,7 @@ pub struct SearchAggregator { // scoring show_ranking_score: bool, 
show_ranking_score_details: bool, + ranking_score_threshold: bool, } impl SearchAggregator { @@ -661,6 +666,7 @@ impl SearchAggregator { page, hits_per_page, attributes_to_retrieve: _, + retrieve_vectors, attributes_to_crop: _, crop_length, attributes_to_highlight: _, @@ -669,6 +675,7 @@ impl SearchAggregator { show_ranking_score_details, filter, sort, + distinct, facets: _, highlight_pre_tag, highlight_post_tag, @@ -676,6 +683,7 @@ impl SearchAggregator { matching_strategy, attributes_to_search_on, hybrid, + ranking_score_threshold, } = query; let mut ret = Self::default(); @@ -690,6 +698,8 @@ impl SearchAggregator { ret.sort_sum_of_criteria_terms = sort.len(); } + ret.distinct = distinct.is_some(); + if let Some(ref filter) = filter { static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); ret.filter_total_number_of_criteria = 1; @@ -726,6 +736,7 @@ impl SearchAggregator { if let Some(ref vector) = vector { ret.max_vector_size = vector.len(); } + ret.retrieve_vectors |= retrieve_vectors; if query.is_finite_pagination() { let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); @@ -748,6 +759,7 @@ impl SearchAggregator { ret.show_ranking_score = *show_ranking_score; ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); if let Some(hybrid) = hybrid { ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO(); @@ -792,6 +804,7 @@ impl SearchAggregator { sort_with_geo_point, sort_sum_of_criteria_terms, sort_total_number_of_criteria, + distinct, filter_with_geo_radius, filter_with_geo_bounding_box, filter_sum_of_criteria_terms, @@ -800,6 +813,7 @@ impl SearchAggregator { attributes_to_search_on_total_number_of_uses, max_terms_number, max_vector_size, + retrieve_vectors, matching_strategy, max_limit, max_offset, @@ -821,6 +835,7 @@ impl SearchAggregator { hybrid, total_degraded, total_used_negative_operator, + ranking_score_threshold, } = other; if self.timestamp.is_none() { @@ -847,6 +862,9 @@ impl SearchAggregator { self.sort_total_number_of_criteria = self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); + // distinct + self.distinct |= distinct; + // filter self.filter_with_geo_radius |= filter_with_geo_radius; self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; @@ -869,6 +887,7 @@ impl SearchAggregator { // vector self.max_vector_size = self.max_vector_size.max(max_vector_size); + self.retrieve_vectors |= retrieve_vectors; self.semantic_ratio |= semantic_ratio; self.hybrid |= hybrid; self.embedder |= embedder; @@ -904,6 +923,7 @@ impl SearchAggregator { // scoring self.show_ranking_score |= show_ranking_score; self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; } pub fn into_event(self, user: &User, event_name: &str) -> Option { @@ -916,6 +936,7 @@ impl SearchAggregator { sort_with_geo_point, sort_sum_of_criteria_terms, sort_total_number_of_criteria, + distinct, filter_with_geo_radius, filter_with_geo_bounding_box, filter_sum_of_criteria_terms, @@ -924,6 +945,7 @@ impl SearchAggregator { attributes_to_search_on_total_number_of_uses, max_terms_number, max_vector_size, + retrieve_vectors, matching_strategy, max_limit, max_offset, @@ -945,6 +967,7 @@ impl SearchAggregator { hybrid, total_degraded, total_used_negative_operator, + ranking_score_threshold, } = self; if total_received == 0 { @@ -971,6 +994,7 @@ impl SearchAggregator { "with_geoPoint": sort_with_geo_point, 
"avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), }, + "distinct": distinct, "filter": { "with_geoRadius": filter_with_geo_radius, "with_geoBoundingBox": filter_with_geo_bounding_box, @@ -985,6 +1009,7 @@ impl SearchAggregator { }, "vector": { "max_vector_size": max_vector_size, + "retrieve_vectors": retrieve_vectors, }, "hybrid": { "enabled": hybrid, @@ -1015,6 +1040,7 @@ impl SearchAggregator { "scoring": { "show_ranking_score": show_ranking_score, "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, }, }); @@ -1072,6 +1098,7 @@ impl MultiSearchAggregator { page: _, hits_per_page: _, attributes_to_retrieve: _, + retrieve_vectors: _, attributes_to_crop: _, crop_length: _, attributes_to_highlight: _, @@ -1080,6 +1107,7 @@ impl MultiSearchAggregator { show_matches_position: _, filter: _, sort: _, + distinct: _, facets: _, highlight_pre_tag: _, highlight_post_tag: _, @@ -1087,6 +1115,7 @@ impl MultiSearchAggregator { matching_strategy: _, attributes_to_search_on: _, hybrid: _, + ranking_score_threshold: _, } = query; index_uid.as_str() @@ -1234,6 +1263,7 @@ impl FacetSearchAggregator { matching_strategy, attributes_to_search_on, hybrid, + ranking_score_threshold, } = query; let mut ret = Self::default(); @@ -1248,7 +1278,8 @@ impl FacetSearchAggregator { || filter.is_some() || *matching_strategy != MatchingStrategy::default() || attributes_to_search_on.is_some() - || hybrid.is_some(); + || hybrid.is_some() + || ranking_score_threshold.is_some(); ret } @@ -1524,6 +1555,9 @@ pub struct DocumentsFetchAggregator { // if a filter was used per_filter: bool, + #[serde(rename = "vector.retrieve_vectors")] + retrieve_vectors: bool, + // pagination #[serde(rename = "pagination.max_limit")] max_limit: usize, @@ -1533,18 +1567,21 @@ pub struct DocumentsFetchAggregator { impl DocumentsFetchAggregator { pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self { - let (limit, offset) = match query { - DocumentFetchKind::PerDocumentId => (1, 0), - DocumentFetchKind::Normal { limit, offset, .. } => (*limit, *offset), + let (limit, offset, retrieve_vectors) = match query { + DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), + DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => { + (*limit, *offset, *retrieve_vectors) + } }; Self { timestamp: Some(OffsetDateTime::now_utc()), user_agents: extract_user_agents(request).into_iter().collect(), total_received: 1, - per_document_id: matches!(query, DocumentFetchKind::PerDocumentId), + per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. 
} if *with_filter), max_limit: limit, max_offset: offset, + retrieve_vectors, } } @@ -1558,6 +1595,7 @@ impl DocumentsFetchAggregator { per_filter, max_limit, max_offset, + retrieve_vectors, } = other; if self.timestamp.is_none() { @@ -1573,6 +1611,8 @@ impl DocumentsFetchAggregator { self.max_limit = self.max_limit.max(max_limit); self.max_offset = self.max_offset.max(max_offset); + + self.retrieve_vectors |= retrieve_vectors; } pub fn into_event(self, user: &User, event_name: &str) -> Option { @@ -1613,6 +1653,7 @@ pub struct SimilarAggregator { // Whether a non-default embedder was specified embedder: bool, + retrieve_vectors: bool, // pagination max_limit: usize, @@ -1624,6 +1665,7 @@ pub struct SimilarAggregator { // scoring show_ranking_score: bool, show_ranking_score_details: bool, + ranking_score_threshold: bool, } impl SimilarAggregator { @@ -1635,9 +1677,11 @@ impl SimilarAggregator { offset, limit, attributes_to_retrieve: _, + retrieve_vectors, show_ranking_score, show_ranking_score_details, filter, + ranking_score_threshold, } = query; let mut ret = Self::default(); @@ -1675,8 +1719,10 @@ impl SimilarAggregator { ret.show_ranking_score = *show_ranking_score; ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); ret.embedder = embedder.is_some(); + ret.retrieve_vectors = *retrieve_vectors; ret } @@ -1708,6 +1754,8 @@ impl SimilarAggregator { show_ranking_score, show_ranking_score_details, embedder, + ranking_score_threshold, + retrieve_vectors, } = other; if self.timestamp.is_none() { @@ -1737,6 +1785,7 @@ impl SimilarAggregator { } self.embedder |= embedder; + self.retrieve_vectors |= retrieve_vectors; // pagination self.max_limit = self.max_limit.max(max_limit); @@ -1749,6 +1798,7 @@ impl SimilarAggregator { // scoring self.show_ranking_score |= show_ranking_score; self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; } pub fn into_event(self, user: &User, event_name: &str) -> Option { @@ -1769,6 +1819,8 @@ impl SimilarAggregator { show_ranking_score, show_ranking_score_details, embedder, + ranking_score_threshold, + retrieve_vectors, } = self; if total_received == 0 { @@ -1795,6 +1847,9 @@ impl SimilarAggregator { "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), }, + "vector": { + "retrieve_vectors": retrieve_vectors, + }, "hybrid": { "embedder": embedder, }, @@ -1808,6 +1863,7 @@ impl SimilarAggregator { "scoring": { "show_ranking_score": show_ranking_score, "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, }, }); diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 43fab1dae..1f413ec7d 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -16,6 +16,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::update::IndexDocumentsMethod; +use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::DocumentId; use meilisearch_types::star_or::OptionStarOrList; use meilisearch_types::tasks::KindWithContent; @@ -39,7 +40,7 @@ use 
crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{ get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; -use crate::search::parse_filter; +use crate::search::{parse_filter, RetrieveVectors}; use crate::Opt; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { @@ -94,6 +95,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) { pub struct GetDocument { #[deserr(default, error = DeserrQueryParamError)] fields: OptionStarOrList, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: Param, } pub async fn get_document( @@ -107,13 +110,20 @@ pub async fn get_document( debug!(parameters = ?params, "Get document"); let index_uid = IndexUid::try_from(index_uid)?; - analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req); - - let GetDocument { fields } = params.into_inner(); + let GetDocument { fields, retrieve_vectors: param_retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); + let features = index_scheduler.features(); + let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?; + + analytics.get_fetch_documents( + &DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 }, + &req, + ); + let index = index_scheduler.index(&index_uid)?; - let document = retrieve_document(&index, &document_id, attributes_to_retrieve)?; + let document = + retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors)?; debug!(returns = ?document, "Get document"); Ok(HttpResponse::Ok().json(document)) } @@ -153,6 +163,8 @@ pub struct BrowseQueryGet { limit: Param, #[deserr(default, error = DeserrQueryParamError)] fields: OptionStarOrList, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] filter: Option, } @@ -166,6 +178,8 @@ pub struct BrowseQuery { limit: usize, #[deserr(default, error = DeserrJsonError)] fields: Option>, + #[deserr(default, error = DeserrJsonError)] + retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] filter: Option, } @@ -185,6 +199,7 @@ pub async fn documents_by_query_post( with_filter: body.filter.is_some(), limit: body.limit, offset: body.offset, + retrieve_vectors: body.retrieve_vectors, }, &req, ); @@ -201,7 +216,7 @@ pub async fn get_documents( ) -> Result { debug!(parameters = ?params, "Get documents GET"); - let BrowseQueryGet { limit, offset, fields, filter } = params.into_inner(); + let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); let filter = match filter { Some(f) => match serde_json::from_str(&f) { @@ -215,6 +230,7 @@ pub async fn get_documents( offset: offset.0, limit: limit.0, fields: fields.merge_star_and_none(), + retrieve_vectors: retrieve_vectors.0, filter, }; @@ -223,6 +239,7 @@ pub async fn get_documents( with_filter: query.filter.is_some(), limit: query.limit, offset: query.offset, + retrieve_vectors: query.retrieve_vectors, }, &req, ); @@ -236,10 +253,14 @@ fn documents_by_query( query: BrowseQuery, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let BrowseQuery { offset, limit, fields, filter } = query; + let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query; + + let features = index_scheduler.features(); + let retrieve_vectors = RetrieveVectors::new(retrieve_vectors, features)?; let index = index_scheduler.index(&index_uid)?; - let (total, documents) = retrieve_documents(&index, 
offset, limit, filter, fields)?; + let (total, documents) = + retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?; let ret = PaginationView::new(offset, limit, total as usize, documents); @@ -579,13 +600,44 @@ fn some_documents<'a, 't: 'a>( index: &'a Index, rtxn: &'t RoTxn, doc_ids: impl IntoIterator + 'a, + retrieve_vectors: RetrieveVectors, ) -> Result> + 'a, ResponseError> { let fields_ids_map = index.fields_ids_map(rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index.embedding_configs(rtxn)?; Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { - ret.map_err(ResponseError::from).and_then(|(_key, document)| -> Result<_, ResponseError> { - Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?) + ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { + let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; + match retrieve_vectors { + RetrieveVectors::Ignore => {} + RetrieveVectors::Hide => { + document.remove("_vectors"); + } + RetrieveVectors::Retrieve => { + let mut vectors = match document.remove("_vectors") { + Some(Value::Object(map)) => map, + _ => Default::default(), + }; + for (name, vector) in index.embeddings(rtxn, key)? { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == name) + .is_some_and(|conf| conf.user_provided.contains(key)); + let embeddings = ExplicitVectors { + embeddings: Some(vector.into()), + regenerate: !user_provided, + }; + vectors.insert( + name, + serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, + ); + } + document.insert("_vectors".into(), vectors.into()); + } + } + + Ok(document) }) })) } @@ -596,6 +648,7 @@ fn retrieve_documents>( limit: usize, filter: Option, attributes_to_retrieve: Option>, + retrieve_vectors: RetrieveVectors, ) -> Result<(u64, Vec), ResponseError> { let rtxn = index.read_txn()?; let filter = &filter; @@ -620,53 +673,57 @@ fn retrieve_documents>( let (it, number_of_documents) = { let number_of_documents = candidates.len(); ( - some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?, + some_documents( + index, + &rtxn, + candidates.into_iter().skip(offset).take(limit), + retrieve_vectors, + )?, number_of_documents, ) }; - let documents: Result, ResponseError> = it + let documents: Vec<_> = it .map(|document| { Ok(match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document?, - attributes_to_retrieve.iter().map(|s| s.as_ref()), + attributes_to_retrieve.iter().map(|s| s.as_ref()).chain( + (retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"), + ), ), None => document?, }) }) - .collect(); + .collect::>()?; - Ok((number_of_documents, documents?)) + Ok((number_of_documents, documents)) } fn retrieve_document>( index: &Index, doc_id: &str, attributes_to_retrieve: Option>, + retrieve_vectors: RetrieveVectors, ) -> Result { let txn = index.read_txn()?; - let fields_ids_map = index.fields_ids_map(&txn)?; - let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let internal_id = index .external_documents_ids() .get(&txn, doc_id)? .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; - let document = index - .documents(&txn, std::iter::once(internal_id))? - .into_iter() + let document = some_documents(index, &txn, Some(internal_id), retrieve_vectors)? 
.next() - .map(|(_, d)| d) - .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; + .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))??; - let document = meilisearch_types::milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; let document = match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document, - attributes_to_retrieve.iter().map(|s| s.as_ref()), + attributes_to_retrieve + .iter() + .map(|s| s.as_ref()) + .chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")), ), None => document, }; diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 3f05fa846..89d0418b4 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -14,8 +14,8 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, SearchQuery, - DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, + add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, + SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, }; use crate::search_queue::SearchQueue; @@ -46,6 +46,8 @@ pub struct FacetSearchQuery { pub matching_strategy: MatchingStrategy, #[deserr(default, error = DeserrJsonError, default)] pub attributes_to_search_on: Option>, + #[deserr(default, error = DeserrJsonError, default)] + pub ranking_score_threshold: Option, } pub async fn search( @@ -103,6 +105,7 @@ impl From for SearchQuery { matching_strategy, attributes_to_search_on, hybrid, + ranking_score_threshold, } = value; SearchQuery { @@ -112,6 +115,7 @@ impl From for SearchQuery { page: None, hits_per_page: None, attributes_to_retrieve: None, + retrieve_vectors: false, attributes_to_crop: None, crop_length: DEFAULT_CROP_LENGTH(), attributes_to_highlight: None, @@ -120,6 +124,7 @@ impl From for SearchQuery { show_ranking_score_details: false, filter, sort: None, + distinct: None, facets: None, highlight_pre_tag: DEFAULT_HIGHLIGHT_PRE_TAG(), highlight_post_tag: DEFAULT_HIGHLIGHT_POST_TAG(), @@ -128,6 +133,7 @@ impl From for SearchQuery { vector, attributes_to_search_on, hybrid, + ranking_score_threshold, } } } diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 8628da6d9..985864ba5 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -19,9 +19,10 @@ use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; use crate::search::{ - add_search_rules, perform_search, HybridQuery, MatchingStrategy, SearchKind, SearchQuery, - SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, - DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, + add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, + RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, + DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, + DEFAULT_SEARCH_LIMIT, 
DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, }; use crate::search_queue::SearchQueue; @@ -50,6 +51,8 @@ pub struct SearchQueryGet { hits_per_page: Option>, #[deserr(default, error = DeserrQueryParamError)] attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] attributes_to_crop: Option>, #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError)] @@ -60,6 +63,8 @@ pub struct SearchQueryGet { filter: Option, #[deserr(default, error = DeserrQueryParamError)] sort: Option, + #[deserr(default, error = DeserrQueryParamError)] + distinct: Option, #[deserr(default, error = DeserrQueryParamError)] show_matches_position: Param, #[deserr(default, error = DeserrQueryParamError)] @@ -82,6 +87,21 @@ pub struct SearchQueryGet { pub hybrid_embedder: Option, #[deserr(default, error = DeserrQueryParamError)] pub hybrid_semantic_ratio: Option, + #[deserr(default, error = DeserrQueryParamError)] + pub ranking_score_threshold: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)] +#[deserr(try_from(String) = TryFrom::try_from -> InvalidSearchRankingScoreThreshold)] +pub struct RankingScoreThresholdGet(RankingScoreThreshold); + +impl std::convert::TryFrom for RankingScoreThresholdGet { + type Error = InvalidSearchRankingScoreThreshold; + + fn try_from(s: String) -> Result { + let f: f64 = s.parse().map_err(|_| InvalidSearchRankingScoreThreshold)?; + Ok(RankingScoreThresholdGet(RankingScoreThreshold::try_from(f)?)) + } } #[derive(Debug, Clone, Copy, Default, PartialEq, deserr::Deserr)] @@ -137,11 +157,13 @@ impl From for SearchQuery { page: other.page.as_deref().copied(), hits_per_page: other.hits_per_page.as_deref().copied(), attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()), + retrieve_vectors: other.retrieve_vectors.0, attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()), crop_length: other.crop_length.0, attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()), filter, sort: other.sort.map(|attr| fix_sort_query_parameters(&attr)), + distinct: other.distinct, show_matches_position: other.show_matches_position.0, show_ranking_score: other.show_ranking_score.0, show_ranking_score_details: other.show_ranking_score_details.0, @@ -152,6 +174,7 @@ impl From for SearchQuery { matching_strategy: other.matching_strategy, attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()), hybrid, + ranking_score_threshold: other.ranking_score_threshold.map(|o| o.0), } } } @@ -205,10 +228,12 @@ pub async fn search_with_url_query( let features = index_scheduler.features(); let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; - + let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?; let _permit = search_queue.try_get_search_permit().await?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vector) + }) + .await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } @@ -245,10 +270,13 @@ pub async fn search_with_post( let features = index_scheduler.features(); let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; + let retrieve_vectors = 
RetrieveVectors::new(query.retrieve_vectors, features)?; let _permit = search_queue.try_get_search_permit().await?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vectors) + }) + .await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); if search_result.degraded { @@ -270,11 +298,10 @@ pub fn search_kind( features: RoFeatures, ) -> Result { if query.vector.is_some() { - features.check_vector("Passing `vector` as a query parameter")?; + features.check_vector("Passing `vector` as a parameter")?; } - if query.hybrid.is_some() { - features.check_vector("Passing `hybrid` as a query parameter")?; + features.check_vector("Passing `hybrid` as a parameter")?; } // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index da73dd63b..1dd83b09b 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -4,11 +4,7 @@ use deserr::actix_web::{AwebJson, AwebQueryParameter}; use index_scheduler::IndexScheduler; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; -use meilisearch_types::error::deserr_codes::{ - InvalidEmbedder, InvalidSimilarAttributesToRetrieve, InvalidSimilarFilter, InvalidSimilarId, - InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarShowRankingScore, - InvalidSimilarShowRankingScoreDetails, -}; +use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{ErrorCode as _, ResponseError}; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::keys::actions; @@ -21,8 +17,8 @@ use crate::analytics::{Analytics, SimilarAggregator}; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::search::{ - add_search_rules, perform_similar, SearchKind, SimilarQuery, SimilarResult, - DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, + add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, + SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, }; pub fn configure(cfg: &mut web::ServiceConfig) { @@ -42,9 +38,7 @@ pub async fn similar_get( ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let query = params.0.try_into().map_err(|code: InvalidSimilarId| { - ResponseError::from_msg(code.to_string(), code.error_code()) - })?; + let query = params.0.try_into()?; let mut aggregate = SimilarAggregator::from_query(&query, &req); @@ -99,6 +93,8 @@ async fn similar( features.check_vector("Using the similar API")?; + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; + // Tenant token search_rules. if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) { add_search_rules(&mut query.filter, search_rules); @@ -109,8 +105,10 @@ async fn similar( let (embedder_name, embedder) = SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?; - tokio::task::spawn_blocking(move || perform_similar(&index, query, embedder_name, embedder)) - .await? + tokio::task::spawn_blocking(move || { + perform_similar(&index, query, embedder_name, embedder, retrieve_vectors) + }) + .await? 
} #[derive(Debug, deserr::Deserr)] @@ -124,18 +122,35 @@ pub struct SimilarQueryGet { limit: Param, #[deserr(default, error = DeserrQueryParamError)] attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] filter: Option, #[deserr(default, error = DeserrQueryParamError)] show_ranking_score: Param, #[deserr(default, error = DeserrQueryParamError)] show_ranking_score_details: Param, + #[deserr(default, error = DeserrQueryParamError, default)] + pub ranking_score_threshold: Option, #[deserr(default, error = DeserrQueryParamError)] pub embedder: Option, } +#[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)] +#[deserr(try_from(String) = TryFrom::try_from -> InvalidSimilarRankingScoreThreshold)] +pub struct RankingScoreThresholdGet(RankingScoreThresholdSimilar); + +impl std::convert::TryFrom for RankingScoreThresholdGet { + type Error = InvalidSimilarRankingScoreThreshold; + + fn try_from(s: String) -> Result { + let f: f64 = s.parse().map_err(|_| InvalidSimilarRankingScoreThreshold)?; + Ok(RankingScoreThresholdGet(RankingScoreThresholdSimilar::try_from(f)?)) + } +} + impl TryFrom for SimilarQuery { - type Error = InvalidSimilarId; + type Error = ResponseError; fn try_from( SimilarQueryGet { @@ -143,10 +158,12 @@ impl TryFrom for SimilarQuery { offset, limit, attributes_to_retrieve, + retrieve_vectors, filter, show_ranking_score, show_ranking_score_details, embedder, + ranking_score_threshold, }: SimilarQueryGet, ) -> Result { let filter = match filter { @@ -158,14 +175,18 @@ impl TryFrom for SimilarQuery { }; Ok(SimilarQuery { - id: id.0.try_into()?, + id: id.0.try_into().map_err(|code: InvalidSimilarId| { + ResponseError::from_msg(code.to_string(), code.error_code()) + })?, offset: offset.0, limit: limit.0, filter, embedder, attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()), + retrieve_vectors: retrieve_vectors.0, show_ranking_score: show_ranking_score.0, show_ranking_score_details: show_ranking_score_details.0, + ranking_score_threshold: ranking_score_threshold.map(|x| x.0), }) } } diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index a83dc4bc0..1d697dac6 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -15,7 +15,7 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex, + add_search_rules, perform_search, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex, }; use crate::search_queue::SearchQueue; @@ -83,11 +83,14 @@ pub async fn multi_search_with_post( let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features) .with_index(query_index)?; + let retrieve_vector = + RetrieveVectors::new(query.retrieve_vectors, features).with_index(query_index)?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)) - .await - .with_index(query_index)?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vector) + }) + .await + .with_index(query_index)?; search_results.push(SearchResultWithIndex { index_uid: index_uid.into_inner(), diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 
7f258b952..a3a4b48a3 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -15,6 +15,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; +use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; @@ -59,6 +60,8 @@ pub struct SearchQuery { pub hits_per_page: Option<usize>, #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)] pub attributes_to_retrieve: Option<BTreeSet<String>>, + #[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)] pub attributes_to_crop: Option<Vec<String>>, #[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())] @@ -75,6 +78,8 @@ pub struct SearchQuery { pub filter: Option<Value>, #[deserr(default, error = DeserrJsonError<InvalidSearchSort>)] pub sort: Option<Vec<String>>, + #[deserr(default, error = DeserrJsonError<InvalidSearchDistinct>)] + pub distinct: Option<String>, #[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)] pub facets: Option<Vec<String>>, #[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())] @@ -87,6 +92,44 @@ pub struct SearchQuery { pub matching_strategy: MatchingStrategy, #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToSearchOn>, default)] pub attributes_to_search_on: Option<Vec<String>>, + #[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)] + pub ranking_score_threshold: Option<RankingScoreThreshold>, +} + +#[derive(Debug, Clone, Copy, PartialEq, Deserr)] +#[deserr(try_from(f64) = TryFrom::try_from -> InvalidSearchRankingScoreThreshold)] +pub struct RankingScoreThreshold(f64); + +impl std::convert::TryFrom<f64> for RankingScoreThreshold { + type Error = InvalidSearchRankingScoreThreshold; + + fn try_from(f: f64) -> Result<Self, Self::Error> { + // the suggested "fix" is `!(0.0..=1.0).contains(&f)`, which is allegedly less readable + #[allow(clippy::manual_range_contains)] + if f > 1.0 || f < 0.0 { + Err(InvalidSearchRankingScoreThreshold) + } else { + Ok(RankingScoreThreshold(f)) + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Deserr)] +#[deserr(try_from(f64) = TryFrom::try_from -> InvalidSimilarRankingScoreThreshold)] +pub struct RankingScoreThresholdSimilar(f64); + +impl std::convert::TryFrom<f64> for RankingScoreThresholdSimilar { + type Error = InvalidSimilarRankingScoreThreshold; + + fn try_from(f: f64) -> Result<Self, Self::Error> { + // the suggested "fix" is `!(0.0..=1.0).contains(&f)`, which is allegedly less readable + #[allow(clippy::manual_range_contains)] + if f > 1.0 || f < 0.0 { + Err(InvalidSimilarRankingScoreThreshold) + } else { + Ok(Self(f)) + } + } } // Since this structure is logged A LOT we're going to reduce the number of things it logs to the bare minimum.
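
Review note: the two `TryFrom<f64>` impls above use a validated-newtype pattern. The inner `f64` is private and `deserr` routes construction through `try_from`, so any `RankingScoreThreshold` that reaches the search code is already known to lie in `[0.0, 1.0]`. A minimal freestanding sketch of the same idea (hypothetical names, not part of this patch):

    /// Stand-in for `RankingScoreThreshold`: the field is private, so the
    /// only way to obtain a value is the checked conversion below.
    #[derive(Debug, Clone, Copy, PartialEq)]
    pub struct Threshold(f64);

    #[derive(Debug)]
    pub struct OutOfRange;

    impl TryFrom<f64> for Threshold {
        type Error = OutOfRange;

        fn try_from(f: f64) -> Result<Self, Self::Error> {
            // same check as the patch; clippy would rather see
            // `!(0.0..=1.0).contains(&f)`
            if f > 1.0 || f < 0.0 {
                Err(OutOfRange)
            } else {
                Ok(Threshold(f))
            }
        }
    }

    fn main() {
        assert!(Threshold::try_from(0.5).is_ok());
        assert!(Threshold::try_from(1.5).is_err()); // rejected at the API boundary
    }
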
@@ -103,6 +146,7 @@ impl fmt::Debug for SearchQuery { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -111,12 +155,14 @@ impl fmt::Debug for SearchQuery { show_ranking_score_details, filter, sort, + distinct, facets, highlight_pre_tag, highlight_post_tag, crop_marker, matching_strategy, attributes_to_search_on, + ranking_score_threshold, } = self; let mut debug = f.debug_struct("SearchQuery"); @@ -134,6 +180,9 @@ impl fmt::Debug for SearchQuery { if let Some(q) = q { debug.field("q", &q); } + if *retrieve_vectors { + debug.field("retrieve_vectors", &retrieve_vectors); + } if let Some(v) = vector { if v.len() < 10 { debug.field("vector", &v); @@ -156,6 +205,9 @@ impl fmt::Debug for SearchQuery { if let Some(sort) = sort { debug.field("sort", &sort); } + if let Some(distinct) = distinct { + debug.field("distinct", &distinct); + } if let Some(facets) = facets { debug.field("facets", &facets); } @@ -188,6 +240,9 @@ impl fmt::Debug for SearchQuery { debug.field("highlight_pre_tag", &highlight_pre_tag); debug.field("highlight_post_tag", &highlight_post_tag); debug.field("crop_marker", &crop_marker); + if let Some(ranking_score_threshold) = ranking_score_threshold { + debug.field("ranking_score_threshold", &ranking_score_threshold); + } debug.finish() } @@ -328,6 +383,8 @@ pub struct SearchQueryWithIndex { pub hits_per_page: Option, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] pub attributes_to_crop: Option>, #[deserr(default, error = DeserrJsonError, default = DEFAULT_CROP_LENGTH())] @@ -344,6 +401,8 @@ pub struct SearchQueryWithIndex { pub filter: Option, #[deserr(default, error = DeserrJsonError)] pub sort: Option>, + #[deserr(default, error = DeserrJsonError)] + pub distinct: Option, #[deserr(default, error = DeserrJsonError)] pub facets: Option>, #[deserr(default, error = DeserrJsonError, default = DEFAULT_HIGHLIGHT_PRE_TAG())] @@ -356,6 +415,8 @@ pub struct SearchQueryWithIndex { pub matching_strategy: MatchingStrategy, #[deserr(default, error = DeserrJsonError, default)] pub attributes_to_search_on: Option>, + #[deserr(default, error = DeserrJsonError, default)] + pub ranking_score_threshold: Option, } impl SearchQueryWithIndex { @@ -369,6 +430,7 @@ impl SearchQueryWithIndex { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -377,6 +439,7 @@ impl SearchQueryWithIndex { show_matches_position, filter, sort, + distinct, facets, highlight_pre_tag, highlight_post_tag, @@ -384,6 +447,7 @@ impl SearchQueryWithIndex { matching_strategy, attributes_to_search_on, hybrid, + ranking_score_threshold, } = self; ( index_uid, @@ -395,6 +459,7 @@ impl SearchQueryWithIndex { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -403,6 +468,7 @@ impl SearchQueryWithIndex { show_matches_position, filter, sort, + distinct, facets, highlight_pre_tag, highlight_post_tag, @@ -410,6 +476,7 @@ impl SearchQueryWithIndex { matching_strategy, attributes_to_search_on, hybrid, + ranking_score_threshold, // do not use ..Default::default() here, // rather add any missing field from `SearchQuery` to `SearchQueryWithIndex` }, @@ -432,10 +499,14 @@ pub struct SimilarQuery { pub embedder: Option, #[deserr(default, error = DeserrJsonError)] pub 
attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError, default)] pub show_ranking_score: bool, #[deserr(default, error = DeserrJsonError, default)] pub show_ranking_score_details: bool, + #[deserr(default, error = DeserrJsonError, default)] + pub ranking_score_threshold: Option, } #[derive(Debug, Clone, PartialEq, Deserr)] @@ -664,6 +735,13 @@ fn prepare_search<'t>( ) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> { let mut search = index.search(rtxn); search.time_budget(time_budget); + if let Some(ranking_score_threshold) = query.ranking_score_threshold { + search.ranking_score_threshold(ranking_score_threshold.0); + } + + if let Some(distinct) = &query.distinct { + search.distinct(distinct.clone()); + } match search_kind { SearchKind::KeywordOnly => { @@ -705,11 +783,16 @@ fn prepare_search<'t>( .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS); search.exhaustive_number_hits(is_finite_pagination); - search.scoring_strategy(if query.show_ranking_score || query.show_ranking_score_details { - ScoringStrategy::Detailed - } else { - ScoringStrategy::Skip - }); + search.scoring_strategy( + if query.show_ranking_score + || query.show_ranking_score_details + || query.ranking_score_threshold.is_some() + { + ScoringStrategy::Detailed + } else { + ScoringStrategy::Skip + }, + ); // compute the offset on the limit depending on the pagination mode. let (offset, limit) = if is_finite_pagination { @@ -754,6 +837,7 @@ pub fn perform_search( index: &Index, query: SearchQuery, search_kind: SearchKind, + retrieve_vectors: RetrieveVectors, ) -> Result { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -787,32 +871,37 @@ pub fn perform_search( let SearchQuery { q, - vector: _, - hybrid: _, - // already computed from prepare_search - offset: _, limit, page, hits_per_page, attributes_to_retrieve, + // use the enum passed as parameter + retrieve_vectors: _, attributes_to_crop, crop_length, attributes_to_highlight, show_matches_position, show_ranking_score, show_ranking_score_details, - filter: _, sort, facets, highlight_pre_tag, highlight_post_tag, crop_marker, + // already used in prepare_search + vector: _, + hybrid: _, + offset: _, + ranking_score_threshold: _, matching_strategy: _, attributes_to_search_on: _, + filter: _, + distinct: _, } = query; let format = AttributesFormat { attributes_to_retrieve, + retrieve_vectors, attributes_to_highlight, attributes_to_crop, crop_length, @@ -896,6 +985,7 @@ pub fn perform_search( struct AttributesFormat { attributes_to_retrieve: Option>, + retrieve_vectors: RetrieveVectors, attributes_to_highlight: Option>, attributes_to_crop: Option>, crop_length: usize, @@ -908,6 +998,36 @@ struct AttributesFormat { show_ranking_score_details: bool, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RetrieveVectors { + /// Do not touch the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is disabled + Ignore, + /// Remove the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `false` + Hide, + /// Retrieve vectors from the DB and merge them into the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `true` + Retrieve, +} + +impl RetrieveVectors { + pub fn new( + retrieve_vector: bool, + features: index_scheduler::RoFeatures, + ) -> Result { + match (retrieve_vector, 
features.check_vector("Passing `retrieveVectors` as a parameter")) { + (true, Ok(())) => Ok(Self::Retrieve), + (true, Err(error)) => Err(error), + (false, Ok(())) => Ok(Self::Hide), + (false, Err(_)) => Ok(Self::Ignore), + } + } +} + fn make_hits( index: &Index, rtxn: &RoTxn<'_>, @@ -917,10 +1037,32 @@ fn make_hits( document_scores: Vec>, ) -> Result, MeilisearchHttpError> { let fields_ids_map = index.fields_ids_map(rtxn).unwrap(); - let displayed_ids = index - .displayed_fields_ids(rtxn)? - .map(|fields| fields.into_iter().collect::>()) - .unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); + let displayed_ids = + index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::>()); + + let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + + let vectors_is_hidden = match (&displayed_ids, vectors_fid) { + // displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid + (None, _) => false, + // displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field + (Some(_), None) => true, + // displayed_ids is a finit list, so hide if `_vectors` is not part of it + (Some(map), Some(vectors_fid)) => map.contains(&vectors_fid), + }; + + let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors { + if vectors_is_hidden { + RetrieveVectors::Hide + } else { + RetrieveVectors::Retrieve + } + } else { + format.retrieve_vectors + }; + + let displayed_ids = + displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); let fids = |attrs: &BTreeSet| { let mut ids = BTreeSet::new(); for attr in attrs { @@ -943,6 +1085,7 @@ fn make_hits( .intersection(&displayed_ids) .cloned() .collect(); + let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default(); let attr_to_crop = format.attributes_to_crop.unwrap_or_default(); let formatted_options = compute_formatted_options( @@ -976,18 +1119,48 @@ fn make_hits( formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_suffix(format.highlight_post_tag); let mut documents = Vec::new(); + let embedding_configs = index.embedding_configs(rtxn)?; let documents_iter = index.documents(rtxn, documents_ids)?; - for ((_id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { + for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { // First generate a document with all the displayed fields let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?; + let add_vectors_fid = + vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve); + // select the attributes to retrieve let attributes_to_retrieve = to_retrieve_ids .iter() + // skip the vectors_fid if RetrieveVectors::Hide + .filter(|fid| match vectors_fid { + Some(vectors_fid) => { + !(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid) + } + None => true, + }) + // need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve` + .chain(add_vectors_fid.iter()) .map(|&fid| fields_ids_map.name(fid).expect("Missing field name")); let mut document = permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); + if retrieve_vectors == RetrieveVectors::Retrieve { + let mut vectors = match document.remove("_vectors") { + Some(Value::Object(map)) => map, + _ => Default::default(), + }; + for (name, vector) in index.embeddings(rtxn, id)? 
+ fn make_hits( index: &Index, rtxn: &RoTxn<'_>, @@ -917,10 +1037,32 @@ fn make_hits( document_scores: Vec<Vec<ScoreDetails>>, ) -> Result<Vec<SearchHit>, MeilisearchHttpError> { let fields_ids_map = index.fields_ids_map(rtxn).unwrap(); - let displayed_ids = index - .displayed_fields_ids(rtxn)? - .map(|fields| fields.into_iter().collect::<BTreeSet<_>>()) - .unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); + let displayed_ids = + index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::<BTreeSet<_>>()); + + let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + + let vectors_is_hidden = match (&displayed_ids, vectors_fid) { + // displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid + (None, _) => false, + // displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field + (Some(_), None) => true, + // displayed_ids is a finite list, so hide `_vectors` when it is not part of it + (Some(map), Some(vectors_fid)) => !map.contains(&vectors_fid), + }; + + let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors { + if vectors_is_hidden { + RetrieveVectors::Hide + } else { + RetrieveVectors::Retrieve + } + } else { + format.retrieve_vectors + }; + + let displayed_ids = + displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); let fids = |attrs: &BTreeSet<String>| { let mut ids = BTreeSet::new(); for attr in attrs { @@ -943,6 +1085,7 @@ .intersection(&displayed_ids) .cloned() .collect(); + let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default(); let attr_to_crop = format.attributes_to_crop.unwrap_or_default(); let formatted_options = compute_formatted_options( @@ -976,18 +1119,48 @@ formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_suffix(format.highlight_post_tag); let mut documents = Vec::new(); + let embedding_configs = index.embedding_configs(rtxn)?; let documents_iter = index.documents(rtxn, documents_ids)?; - for ((_id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { + for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { // First generate a document with all the displayed fields let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?; + let add_vectors_fid = + vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve); + // select the attributes to retrieve let attributes_to_retrieve = to_retrieve_ids .iter() + // skip the vectors_fid if RetrieveVectors::Hide + .filter(|fid| match vectors_fid { + Some(vectors_fid) => { + !(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid) + } + None => true, + }) + // we also need the existing `_vectors` fid when `RetrieveVectors::Retrieve` is set + .chain(add_vectors_fid.iter()) .map(|&fid| fields_ids_map.name(fid).expect("Missing field name")); let mut document = permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); + if retrieve_vectors == RetrieveVectors::Retrieve { + let mut vectors = match document.remove("_vectors") { + Some(Value::Object(map)) => map, + _ => Default::default(), + }; + for (name, vector) in index.embeddings(rtxn, id)? { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == name) + .is_some_and(|conf| conf.user_provided.contains(id)); + let embeddings = + ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; + vectors.insert(name, serde_json::to_value(embeddings)?); + } + document.insert("_vectors".into(), vectors.into()); + } + let (matches_position, formatted) = format_fields( &displayed_document, &fields_ids_map, @@ -1057,6 +1230,7 @@ pub fn perform_similar( query: SimilarQuery, embedder_name: String, embedder: Arc<Embedder>, + retrieve_vectors: RetrieveVectors, ) -> Result<SimilarResult, ResponseError> { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -1068,8 +1242,10 @@ filter: _, embedder: _, attributes_to_retrieve, + retrieve_vectors: _, show_ranking_score, show_ranking_score_details, + ranking_score_threshold, } = query; // using let-else rather than `?` so that the borrow checker identifies we're always returning here, @@ -1093,6 +1269,10 @@ } } + if let Some(ranking_score_threshold) = ranking_score_threshold { + similar.ranking_score_threshold(ranking_score_threshold.0); + } + let milli::SearchResult { documents_ids, matching_words: _, @@ -1109,6 +1289,7 @@ let format = AttributesFormat { attributes_to_retrieve, + retrieve_vectors, attributes_to_highlight: None, attributes_to_crop: None, crop_length: DEFAULT_CROP_LENGTH(), diff --git a/meilisearch/src/search_queue.rs b/meilisearch/src/search_queue.rs index 415da0c15..4f6dccc42 100644 --- a/meilisearch/src/search_queue.rs +++ b/meilisearch/src/search_queue.rs @@ -40,8 +40,9 @@ pub struct Permit { impl Drop for Permit { fn drop(&mut self) { + let sender = self.sender.clone(); // if the channel is closed then the whole instance is down - let _ = futures::executor::block_on(self.sender.send(())); + std::mem::drop(tokio::spawn(async move { sender.send(()).await })); } } diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs index f914f8dc8..045f8673c 100644 --- a/meilisearch/tests/common/index.rs +++ b/meilisearch/tests/common/index.rs @@ -182,14 +182,10 @@ impl Index<'_> { self.service.get(url).await } - pub async fn get_document( - &self, - id: u64, - options: Option<GetDocumentOptions>, - ) -> (Value, StatusCode) { + pub async fn get_document(&self, id: u64, options: Option<Value>) -> (Value, StatusCode) { let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id); - if let Some(fields) = options.and_then(|o| o.fields) { - let _ = write!(url, "?fields={}", fields.join(",")); + if let Some(options) = options { + write!(url, "{}", yaup::to_string(&options).unwrap()).unwrap(); } self.service.get(url).await } @@ -205,18 +201,11 @@ impl Index<'_> { } pub async fn get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) { - let mut url = format!("/indexes/{}/documents?", urlencode(self.uid.as_ref())); - if let Some(limit) = options.limit { - let _ = write!(url, "limit={}&", limit); - } - - if let Some(offset) = options.offset { - let _ = write!(url, "offset={}&", offset); - } - - if let Some(attributes_to_retrieve) = options.attributes_to_retrieve { - let _ = write!(url, "fields={}&", attributes_to_retrieve.join(",")); - } + let url = format!( + "/indexes/{}/documents{}", + urlencode(self.uid.as_ref()), + yaup::to_string(&options).unwrap() + ); self.service.get(url).await } @@ -435,13 +424,14 @@ impl Index<'_> { } } -pub struct GetDocumentOptions { - pub fields: Option<Vec<String>>, -} -
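Note: the URL-building rework above leans on `yaup` to serialize the options struct into a query string. A sketch of the expected behavior, assuming yaup's `to_string` emits a leading `?` when there is anything to serialize (which is what the `format!` calls above imply; the exact encoding is yaup's):

```rust
// `limit: None` is skipped thanks to skip_serializing_if; `retrieve_vectors`
// is a bare bool, so it is always serialized, even when false.
#[derive(serde::Serialize)]
#[serde(rename_all = "camelCase")]
struct Opts {
    #[serde(skip_serializing_if = "Option::is_none")]
    limit: Option<usize>,
    retrieve_vectors: bool,
}

fn main() {
    let opts = Opts { limit: Some(2), retrieve_vectors: true };
    // expected output: /indexes/pets/documents?limit=2&retrieveVectors=true
    println!("/indexes/pets/documents{}", yaup::to_string(&opts).unwrap());
}
```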
-#[derive(Debug, Default)] +#[derive(Debug, Default, serde::Serialize)] +#[serde(rename_all = "camelCase")] pub struct GetAllDocumentsOptions { + #[serde(skip_serializing_if = "Option::is_none")] pub limit: Option<usize>, + #[serde(skip_serializing_if = "Option::is_none")] pub offset: Option<usize>, - pub attributes_to_retrieve: Option<Vec<String>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub fields: Option<Vec<String>>, + pub retrieve_vectors: bool, } diff --git a/meilisearch/tests/common/mod.rs b/meilisearch/tests/common/mod.rs index 1391cf7cf..1317dbce7 100644 --- a/meilisearch/tests/common/mod.rs +++ b/meilisearch/tests/common/mod.rs @@ -6,7 +6,7 @@ pub mod service; use std::fmt::{self, Display}; #[allow(unused)] -pub use index::{GetAllDocumentsOptions, GetDocumentOptions}; +pub use index::GetAllDocumentsOptions; use meili_snap::json_string; use serde::{Deserialize, Serialize}; #[allow(unused)] @@ -71,7 +71,7 @@ impl Display for Value { write!( f, "{}", - json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }) + json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]", ".processingTimeMs" => "[duration]" }) ) } } diff --git a/meilisearch/tests/documents/errors.rs b/meilisearch/tests/documents/errors.rs index cd2d89813..055f6512f 100644 --- a/meilisearch/tests/documents/errors.rs +++ b/meilisearch/tests/documents/errors.rs @@ -719,7 +719,7 @@ async fn fetch_document_by_filter() { let (response, code) = index.get_document_by_filter(json!(null)).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value type: expected an object, but found null", "code": "bad_request", @@ -730,7 +730,7 @@ let (response, code) = index.get_document_by_filter(json!({ "offset": "doggo" })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value type at `.offset`: expected a positive integer, but found a string: `\"doggo\"`", "code": "invalid_document_offset", @@ -741,7 +741,7 @@ let (response, code) = index.get_document_by_filter(json!({ "limit": "doggo" })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value type at `.limit`: expected a positive integer, but found a string: `\"doggo\"`", "code": "invalid_document_limit", @@ -752,7 +752,7 @@ let (response, code) = index.get_document_by_filter(json!({ "fields": "doggo" })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value type at `.fields`: expected an array, but found a string: `\"doggo\"`", "code": "invalid_document_fields", @@ -763,7 +763,7 @@ let (response, code) = index.get_document_by_filter(json!({ "filter": true })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid syntax for the filter parameter: `expected String, Array, found: true`.", "code": "invalid_document_filter", @@ -774,7 +774,7 @@ let (response, code) = index.get_document_by_filter(json!({ "filter": "cool doggo" })).await; snapshot!(code, @"400 Bad Request"); -
snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `cool doggo`.\n1:11 cool doggo", "code": "invalid_document_filter", @@ -786,7 +786,7 @@ async fn fetch_document_by_filter() { let (response, code) = index.get_document_by_filter(json!({ "filter": "doggo = bernese" })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Attribute `doggo` is not filterable. Available filterable attributes are: `color`.\n1:6 doggo = bernese", "code": "invalid_document_filter", @@ -795,3 +795,70 @@ async fn fetch_document_by_filter() { } "###); } + +#[actix_rt::test] +async fn retrieve_vectors() { + let server = Server::new().await; + let index = server.index("doggo"); + + // GET ALL DOCUMENTS BY QUERY + let (response, _code) = index.get_all_documents_raw("?retrieveVectors=tamo").await; + snapshot!(response, @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); + let (response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await; + snapshot!(response, @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // FETCH ALL DOCUMENTS BY POST + let (response, _code) = + index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await; + snapshot!(response, @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); + let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": true })).await; + snapshot!(response, @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // GET A SINGLE DOCUMENT + let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await; + snapshot!(response, @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); + let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await; + snapshot!(response, @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); +} diff --git a/meilisearch/tests/documents/get_documents.rs b/meilisearch/tests/documents/get_documents.rs index 3b0629fcb..efe4cf8e9 100644 --- a/meilisearch/tests/documents/get_documents.rs +++ b/meilisearch/tests/documents/get_documents.rs @@ -4,7 +4,7 @@ use meili_snap::*; use urlencoding::encode as urlencode; use crate::common::encoder::Encoder; -use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server, Value}; +use crate::common::{GetAllDocumentsOptions, Server, Value}; use crate::json; // TODO: partial test since we are testing error, and error is not yet fully implemented in @@ -59,8 +59,7 @@ async fn get_document() { }) ); - let (response, code) = - index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["id"]) })).await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["id"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -69,9 +68,8 @@ async fn get_document() { }) ); - let (response, code) = index - .get_document(0, Some(GetDocumentOptions { fields: Some(vec!["nested.content"]) })) - .await; + let (response, code) = + index.get_document(0, Some(json!({ "fields": ["nested.content"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -211,7 +209,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["name"]), + fields: Some(vec!["name"]), ..Default::default() }) .await; @@ -225,9 +223,19 @@ async fn test_get_all_documents_attributes_to_retrieve() { assert_eq!(response["limit"], json!(20)); assert_eq!(response["total"], json!(77)); + let (response, code) = index.get_all_documents_raw("?fields=").await; + assert_eq!(code, 200); + assert_eq!(response["results"].as_array().unwrap().len(), 20); + for results in response["results"].as_array().unwrap() { + assert_eq!(results.as_object().unwrap().keys().count(), 0); + } + assert_eq!(response["offset"], json!(0)); + assert_eq!(response["limit"], json!(20)); + assert_eq!(response["total"], json!(77)); + let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec![]), + fields: Some(vec!["wrong"]), ..Default::default() }) .await; @@ -242,22 +250,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["wrong"]), - ..Default::default() - }) - .await; - assert_eq!(code, 200); - assert_eq!(response["results"].as_array().unwrap().len(), 20); - for results in response["results"].as_array().unwrap() { - assert_eq!(results.as_object().unwrap().keys().count(), 0); - } - assert_eq!(response["offset"], json!(0)); - assert_eq!(response["limit"], json!(20)); - assert_eq!(response["total"], json!(77)); - - let (response, code) = index - .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["name", "tags"]), + fields: Some(vec!["name", "tags"]), ..Default::default() }) .await; @@ -270,10 +263,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { } let (response, code) = index - .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["*"]), - ..Default::default() - }) + .get_all_documents(GetAllDocumentsOptions { fields: Some(vec!["*"]),
..Default::default() }) .await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 20); @@ -283,7 +273,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["*", "wrong"]), + fields: Some(vec!["*", "wrong"]), ..Default::default() }) .await; @@ -316,12 +306,10 @@ async fn get_document_s_nested_attributes_to_retrieve() { assert_eq!(code, 202); index.wait_task(1).await; - let (response, code) = - index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["content"] }))).await; assert_eq!(code, 200); assert_eq!(response, json!({})); - let (response, code) = - index.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await; + let (response, code) = index.get_document(1, Some(json!({ "fields": ["content"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -333,9 +321,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { }) ); - let (response, code) = index - .get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) })) - .await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["content.truc"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -343,9 +329,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { "content.truc": "foobar", }) ); - let (response, code) = index - .get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) })) - .await; + let (response, code) = index.get_document(1, Some(json!({ "fields": ["content.truc"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -540,3 +524,207 @@ async fn get_document_by_filter() { } "###); } + +#[actix_rt::test] +async fn get_document_with_vectors() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + // by default you shouldn't see the `_vectors` object + let (documents, _code) = index.get_all_documents(Default::default()).await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = index.get_document(0, None).await; + snapshot!(json_string!(documents), @r###" + { + "id": 0, + "name": "kefir" + } + "###); + + // if we try to retrieve the vectors with the `fields` parameter they + // still shouldn't be displayed + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + fields: Some(vec!["name", "_vectors"]), + ..Default::default() + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { 
+ "name": "kefir" + }, + { + "name": "echo" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = + index.get_document(0, Some(json!({"fields": ["name", "_vectors"]}))).await; + snapshot!(json_string!(documents), @r###" + { + "name": "kefir" + } + "###); + + // If we specify the retrieve vectors boolean and nothing else we should get the vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await; + snapshot!(json_string!(documents), @r###" + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + } + "###); + + // If we specify the retrieve vectors boolean and exclude vectors form the `fields` we should still get the vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + retrieve_vectors: true, + fields: Some(vec!["name"]), + ..Default::default() + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + }, + { + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = + index.get_document(0, Some(json!({"retrieveVectors": true, "fields": ["name"]}))).await; + snapshot!(json_string!(documents), @r###" + { + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + } + "###); +} diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index c8f8ca105..fa402cb41 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1938,3 +1938,210 @@ async fn import_dump_v6_containing_experimental_features() { }) .await; } + +// In this test we must generate the dump ourselves to ensure the +// `user provided` vectors are well set +#[actix_rt::test] +#[cfg_attr(target_os = "windows", ignore)] +async fn generate_and_import_dump_containing_vectors() { + let temp = tempfile::tempdir().unwrap(); + let mut opt = default_settings(temp.path()); + let server = Server::new_with_options(opt.clone()).await.unwrap(); + let (code, _) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + let index = server.index("pets"); + let (response, code) = index + .update_settings(json!( + { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}", + } + } + } + )) + .await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response); + let (response, code) = index + .add_documents( + json!([ + {"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }}, + {"id": 1, "doggo": "echo", "_vectors": { 
"doggo_embedder": { "regenerate": false, "embeddings": vec![1; 384] }}}, + {"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "regenerate": true, "embeddings": vec![2; 384] }}}, + {"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "regenerate": true }}}, + {"id": 4, "doggo": "max" }, + ]), + None, + ) + .await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response); + + let (response, code) = server.create_dump().await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // ========= We made a dump, now we should clear the DB and try to import our dump + drop(server); + tokio::fs::remove_dir_all(&opt.db_path).await.unwrap(); + let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap()); + let dump_path = opt.dump_dir.join(dump_name); + assert!(dump_path.exists(), "path: `{}`", dump_path.display()); + + opt.import_dump = Some(dump_path); + // NOTE: We shouldn't have to change the database path but I lost one hour + // because of a « bad path » error and that fixed it. + opt.db_path = temp.path().join("data.ms"); + + let mut server = Server::new_auth_with_options(opt, temp).await; + server.use_api_key("MASTER_KEY"); + + let (indexes, code) = server.list_indexes(None, None).await; + assert_eq!(code, 200, "{indexes}"); + + snapshot!(indexes["results"].as_array().unwrap().len(), @"1"); + snapshot!(indexes["results"][0]["uid"], @r###""pets""###); + snapshot!(indexes["results"][0]["primaryKey"], @r###""id""###); + + let (response, code) = server.get_features().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let index = server.index("pets"); + + let (response, code) = index.settings().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "displayedAttributes": [ + "*" + ], + "searchableAttributes": [ + "*" + ], + "filterableAttributes": [], + "sortableAttributes": [], + "rankingRules": [ + "words", + "typo", + "proximity", + "attribute", + "sort", + "exactness" + ], + "stopWords": [], + "nonSeparatorTokens": [], + "separatorTokens": [], + "dictionary": [], + "synonyms": {}, + "distinctAttribute": null, + "proximityPrecision": "byWord", + "typoTolerance": { + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 5, + "twoTypos": 9 + }, + "disableOnWords": [], + "disableOnAttributes": [] + }, + "faceting": { + "maxValuesPerFacet": 100, + "sortFacetValuesBy": { + "*": "alpha" + } + }, + "pagination": { + "maxTotalHits": 1000 + }, + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}" + } + }, + "searchCutoffMs": null + } + "###); + + index + .search(json!({"retrieveVectors": true}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###" + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": false + } + } + }, + { + "id": 1, + "doggo": "echo", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": false + } + } + }, + { + "id": 2, 
+ "doggo": "intel", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 3, + "doggo": "bill", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 4, + "doggo": "max", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": true + } + } + } + ] + "###); + }) + .await; +} diff --git a/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap new file mode 100644 index 000000000..4b05d417a --- /dev/null +++ b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap @@ -0,0 +1,25 @@ +--- +source: meilisearch/tests/dumps/mod.rs +--- +{ + "uid": 0, + "indexUid": "pets", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap new file mode 100644 index 000000000..43971924b --- /dev/null +++ b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap @@ -0,0 +1,19 @@ +--- +source: meilisearch/tests/dumps/mod.rs +--- +{ + "uid": 1, + "indexUid": "pets", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 5, + "indexedDocuments": 5 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/integration.rs b/meilisearch/tests/integration.rs index bb77ecc63..78da9825a 100644 --- a/meilisearch/tests/integration.rs +++ b/meilisearch/tests/integration.rs @@ -13,6 +13,7 @@ mod snapshot; mod stats; mod swap_indexes; mod tasks; +mod vector; // Tests are isolated by features in different modules to allow better readability, test // targetability, and improved incremental compilation times. 
diff --git a/meilisearch/tests/search/distinct.rs b/meilisearch/tests/search/distinct.rs index aea98215d..2023c01a8 100644 --- a/meilisearch/tests/search/distinct.rs +++ b/meilisearch/tests/search/distinct.rs @@ -107,6 +107,39 @@ static DOCUMENTS: Lazy<Value> = Lazy::new(|| { ]) }); +static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| { + json!([ + { + "id": 1, + "description": "Leather Jacket", + "brand": "Lee Jeans", + "product_id": "123456", + "color": { "main": "Brown", "pattern": "stripped" }, + }, + { + "id": 2, + "description": "Leather Jacket", + "brand": "Lee Jeans", + "product_id": "123456", + "color": { "main": "Black", "pattern": "stripped" }, + }, + { + "id": 3, + "description": "Leather Jacket", + "brand": "Lee Jeans", + "product_id": "123456", + "color": { "main": "Blue", "pattern": "used" }, + }, + { + "id": 4, + "description": "T-Shirt", + "brand": "Nike", + "product_id": "789012", + "color": { "main": "Blue", "pattern": "stripped" }, + } + ]) +}); + static DOCUMENT_PRIMARY_KEY: &str = "id"; static DOCUMENT_DISTINCT_KEY: &str = "product_id"; @@ -239,3 +272,35 @@ async fn distinct_search_with_pagination_no_ranking() { snapshot!(response["totalPages"], @"2"); snapshot!(response["totalHits"], @"6"); } + +#[actix_rt::test] +async fn distinct_at_search_time() { + let server = Server::new().await; + let index = server.index("tamo"); + + let documents = NESTED_DOCUMENTS.clone(); + index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await; + let (task, _) = index.update_settings_filterable_attributes(json!(["color.main"])).await; + let task = index.wait_task(task.uid()).await; + snapshot!(task, name: "succeed"); + + fn get_hits(response: &Value) -> Vec<String> { + let hits_array = response["hits"] + .as_array() + .unwrap_or_else(|| panic!("{}", &serde_json::to_string_pretty(&response).unwrap())); + hits_array + .iter() + .map(|h| h[DOCUMENT_PRIMARY_KEY].as_number().unwrap().to_string()) + .collect::<Vec<_>>() + } + + let (response, code) = + index.search_post(json!({"page": 1, "hitsPerPage": 3, "distinct": "color.main"})).await; + let hits = get_hits(&response); + snapshot!(code, @"200 OK"); + snapshot!(hits.len(), @"3"); + snapshot!(format!("{:?}", hits), @r###"["1", "2", "3"]"###); + snapshot!(response["page"], @"1"); + snapshot!(response["totalPages"], @"1"); + snapshot!(response["totalHits"], @"3"); +} diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index 0119be59e..b615902c2 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -167,6 +167,74 @@ async fn search_bad_hits_per_page() { "###); } +#[actix_rt::test] +async fn search_bad_attributes_to_retrieve() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"attributesToRetrieve": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.attributesToRetrieve`: expected an array, but found a string: `\"doggo\"`", + "code": "invalid_search_attributes_to_retrieve", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_retrieve" + } + "###); + // Can't make the `attributes_to_retrieve` fail with a get search since it'll accept anything as an array of strings.
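+ // (GET list parameters are deserialized from a comma-separated string, so any single value parses as a one-element array)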
+} + +#[actix_rt::test] +async fn search_bad_retrieve_vectors() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"retrieveVectors": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_post(json!({"retrieveVectors": [true]})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_get("?retrieveVectors=").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_get("?retrieveVectors=doggo").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); +} + #[actix_rt::test] async fn search_bad_attributes_to_crop() { let server = Server::new().await; @@ -321,6 +389,40 @@ async fn search_bad_facets() { // Can't make the `attributes_to_highlight` fail with a get search since it'll accept anything as an array of strings. 
} +#[actix_rt::test] +async fn search_bad_threshold() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"rankingScoreThreshold": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.rankingScoreThreshold`: expected a number, but found a string: `\"doggo\"`", + "code": "invalid_search_ranking_score_threshold", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_ranking_score_threshold" + } + "###); +} + +#[actix_rt::test] +async fn search_invalid_threshold() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"rankingScoreThreshold": 42})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value at `.rankingScoreThreshold`: the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`.", + "code": "invalid_search_ranking_score_threshold", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_ranking_score_threshold" + } + "###); +} + #[actix_rt::test] async fn search_non_filterable_facets() { let server = Server::new().await; @@ -1038,3 +1140,66 @@ async fn search_on_unknown_field_plus_joker() { ) .await; } + +#[actix_rt::test] +async fn distinct_at_search_time() { + let server = Server::new().await; + let index = server.index("tamo"); + let (task, _) = index.create(None).await; + let task = index.wait_task(task.uid()).await; + snapshot!(task, name: "task-succeed"); + + let (response, code) = + index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); + + let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await; + index.wait_task(task.uid()).await; + + let (response, code) = + index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); + + let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await; + index.wait_task(task.uid()).await; + + let (response, code) = + index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. 
Available filterable attributes are: `color, <..hidden-attributes>`.", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); + + let (response, code) = + index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": true})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Invalid value type at `.distinct`: expected a string, but found a boolean: `true`", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); +} diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 9c50df6e1..02768bf60 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -124,32 +124,61 @@ async fn simple_search() { let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": 
{"semanticRatio": 0.8}, "showRankingScore": true}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } +#[actix_rt::test] +async fn limit_offset() { + let server = Server::new().await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true, "offset": 1, "limit": 1}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###); + snapshot!(response["semanticHitCount"], @"0"); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + + let server = Server::new().await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.9}, "retrieveVectors": true, "offset": 1, "limit": 1}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}}]"###); + snapshot!(response["semanticHitCount"], @"1"); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); +} + #[actix_rt::test] async fn simple_search_hf() { let server = Server::new().await; @@ -204,10 +233,10 @@ async fn distribution_shift() { let server = Server::new().await; let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; - let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}}); + let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); let (response, code) = index .update_settings(json!({ @@ -228,7 +257,7 @@ async fn distribution_shift() { let (response, code) = index.search_post(search).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.1920928955078125e-7}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7}]"###); } #[actix_rt::test] @@ -239,20 +268,23 @@ async fn highlighter() { let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, - "attributesToHighlight": [ - "desc" + "retrieveVectors": true, + "attributesToHighlight": [ + "desc", + "_vectors", ], - "highlightPreTag": "**BEGIN**", - "highlightPostTag": "**END**" + "highlightPreTag": "**BEGIN**", + "highlightPostTag": "**END**", })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, + "retrieveVectors": true, "showRankingScore": true, "attributesToHighlight": [ "desc" @@ -262,13 +294,14 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 1.0}, + "retrieveVectors": true, "showRankingScore": true, "attributesToHighlight": [ "desc" @@ -278,7 +311,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -361,12 +394,12 @@ async fn single_document() { let (response, code) = index .search_post( - json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0}"###); + snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0}"###); snapshot!(response["semanticHitCount"], @"1"); } @@ -377,25 +410,25 @@ async fn query_combination() { // search without query and vector, but with hybrid => still placeholder let (response, code) = index - .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // same with a different semantic ratio let (response, code) = index - .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true})) + .search_post(json!({"hybrid": {"semanticRatio": 0.76}, 
"showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // wrong vector dimensions let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"400 Bad Request"); @@ -410,34 +443,34 @@ async fn query_combination() { // full vector let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["semanticHitCount"], @"3"); // full keyword, without a query let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a 
Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, full keyword => keyword let (response, code) = index - .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => let (response, code) = index - .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true})) + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"400 Bad Request"); @@ -453,7 +486,7 @@ async fn query_combination() { // full vector, without a vector => error let (response, code) = index .search_post( - json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), ) .await; @@ -470,11 +503,93 @@ async fn query_combination() { // hybrid without a vector => full keyword let (response, code) = index .search_post( - json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}), + json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } + +#[actix_rt::test] +async fn retrieve_vectors() { + let server = Server::new().await; + let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), + ) + .await; + snapshot!(code, @"200 OK"); + insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###" + [ + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + } + ] + "###); + + // remove `_vectors` from displayed attributes + let (response, code) = + index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), + ) + .await; + snapshot!(code, @"200 OK"); + insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###" + [ + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2" + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3" + }, + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1" + } + ] + "###); +} diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 284b68a15..e239ff767 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -48,6 +48,31 @@ static DOCUMENTS: Lazy = Lazy::new(|| { ]) }); +static SCORE_DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + }, + { + "title": "Batman Returns", + "id": "C", + }, + { + "title": "Batman", + "id": "D", + }, + { + "title": "Badman", + "id": "E", + } + ]) +}); + static NESTED_DOCUMENTS: Lazy = Lazy::new(|| { json!([ { @@ -276,7 +301,7 @@ async fn negative_special_cases_search() { index.add_documents(documents, None).await; index.wait_task(0).await; - index.update_settings(json!({"synonyms": { "escape": ["glass"] }})).await; + index.update_settings(json!({"synonyms": { "escape": ["gläss"] }})).await; index.wait_task(1).await; // There is a synonym for escape -> glass but we don't want "escape", only the derivates: glass @@ -960,6 +985,213 @@ async fn test_score_details() { .await; } +#[actix_rt::test] +async fn test_score() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = SCORE_DOCUMENTS.clone(); + + let res = index.add_documents(json!(documents), None).await; + index.wait_task(res.0.uid()).await; + + index + .search( + json!({ + "q": "Badman the dark knight returns 1", + "showRankingScore": true, + }), + |response, code| { + 
meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_rankingScore": 0.9746605609456898 + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_rankingScore": 0.8055252965383685 + }, + { + "title": "Badman", + "id": "E", + "_rankingScore": 0.16666666666666666 + }, + { + "title": "Batman Returns", + "id": "C", + "_rankingScore": 0.07702020202020202 + }, + { + "title": "Batman", + "id": "D", + "_rankingScore": 0.07702020202020202 + } + ] + "###); + }, + ) + .await; +} + +#[actix_rt::test] +async fn test_score_threshold() { + let query = "Badman dark returns 1"; + let server = Server::new().await; + let index = server.index("test"); + + let documents = SCORE_DOCUMENTS.clone(); + + let res = index.add_documents(json!(documents), None).await; + index.wait_task(res.0.uid()).await; + + index + .search( + json!({ + "q": query, + "showRankingScore": true, + "rankingScoreThreshold": 0.0 + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"5"); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_rankingScore": 0.93430081300813 + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_rankingScore": 0.6685627880184332 + }, + { + "title": "Badman", + "id": "E", + "_rankingScore": 0.25 + }, + { + "title": "Batman Returns", + "id": "C", + "_rankingScore": 0.11553030303030302 + }, + { + "title": "Batman", + "id": "D", + "_rankingScore": 0.11553030303030302 + } + ] + "###); + }, + ) + .await; + + index + .search( + json!({ + "q": query, + "showRankingScore": true, + "rankingScoreThreshold": 0.2 + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"3"###); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_rankingScore": 0.93430081300813 + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_rankingScore": 0.6685627880184332 + }, + { + "title": "Badman", + "id": "E", + "_rankingScore": 0.25 + } + ] + "###); + }, + ) + .await; + + index + .search( + json!({ + "q": query, + "showRankingScore": true, + "rankingScoreThreshold": 0.5 + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"2"###); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_rankingScore": 0.93430081300813 + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_rankingScore": 0.6685627880184332 + } + ] + "###); + }, + ) + .await; + + index + .search( + json!({ + "q": query, + "showRankingScore": true, + "rankingScoreThreshold": 0.8 + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"1"###); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_rankingScore": 0.93430081300813 + } + ] + "###); + }, + ) + .await; + + 
index + .search( + json!({ + "q": query, + "showRankingScore": true, + "rankingScoreThreshold": 1.0 + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"0"###); + // nobody is perfect + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @"[]"); + }, + ) + .await; +} + #[actix_rt::test] async fn test_degraded_score_details() { let server = Server::new().await; @@ -1058,21 +1290,38 @@ async fn experimental_feature_vector_store() { index.add_documents(json!(documents), None).await; index.wait_task(0).await; - let (response, code) = index - .search_post(json!({ + index + .search(json!({ "vector": [1.0, 2.0, 3.0], "showRankingScore": true - })) + }), |response, code|{ + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Passing `vector` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + }) + .await; + index + .search(json!({ + "retrieveVectors": true, + "showRankingScore": true + }), |response, code|{ + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + }) .await; - meili_snap::snapshot!(code, @"400 Bad Request"); - meili_snap::snapshot!(meili_snap::json_string!(response), @r###" - { - "message": "Passing `vector` as a query parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", - "code": "feature_not_enabled", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#feature_not_enabled" - } - "###); let (response, code) = server.set_features(json!({"vectorStore": true})).await; meili_snap::snapshot!(code, @"200 OK"); @@ -1105,6 +1354,7 @@ async fn experimental_feature_vector_store() { .search_post(json!({ "vector": [1.0, 2.0, 3.0], "showRankingScore": true, + "retrieveVectors": true, })) .await; @@ -1116,11 +1366,16 @@ async fn experimental_feature_vector_store() { "title": "Shazam!", "id": "287947", "_vectors": { - "manual": [ - 1.0, - 2.0, - 3.0 - ] + "manual": { + "embeddings": [ + [ + 1.0, + 2.0, + 3.0 + ] + ], + "regenerate": false + } }, "_rankingScore": 1.0 }, @@ -1128,11 +1383,16 @@ async fn experimental_feature_vector_store() { "title": "Captain Marvel", "id": "299537", "_vectors": { - "manual": [ - 1.0, - 2.0, - 54.0 - ] + "manual": { + "embeddings": [ + [ + 1.0, + 2.0, + 54.0 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.9129111766815186 }, @@ -1140,11 +1400,16 @@ async fn experimental_feature_vector_store() { "title": "Gläss", "id": "450465", "_vectors": { - "manual": [ - -100.0, - 340.0, - 90.0 - ] + "manual": { + "embeddings": [ + [ + -100.0, + 340.0, + 90.0 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.8106412887573242 }, @@ -1152,11 +1417,16 @@ async fn experimental_feature_vector_store() { "title": "How to Train Your Dragon: The Hidden World", "id": "166428", "_vectors": { - "manual": [ - -100.0, - 231.0, - 32.0 - ] + "manual": { + "embeddings": [ + [ + -100.0, + 231.0, + 32.0 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.7412010431289673 }, @@ -1164,11 +1434,16 @@ async fn experimental_feature_vector_store() { "title": "Escape Room", "id": "522681", "_vectors": { - "manual": [ - 10.0, - -23.0, - 32.0 - ] + "manual": { + "embeddings": [ + [ + 10.0, + -23.0, + 32.0 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.6972063183784485 } diff --git a/meilisearch/tests/search/snapshots/distinct.rs/distinct_at_search_time/succeed.snap b/meilisearch/tests/search/snapshots/distinct.rs/distinct_at_search_time/succeed.snap new file mode 100644 index 000000000..1b8190c42 --- /dev/null +++ b/meilisearch/tests/search/snapshots/distinct.rs/distinct_at_search_time/succeed.snap @@ -0,0 +1,20 @@ +--- +source: meilisearch/tests/search/distinct.rs +--- +{ + "uid": 1, + "indexUid": "tamo", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "filterableAttributes": [ + "color.main" + ] + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap b/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap new file mode 100644 index 000000000..903e96ffb --- /dev/null +++ b/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap @@ -0,0 +1,18 @@ +--- +source: meilisearch/tests/search/errors.rs +--- +{ + "uid": 0, + "indexUid": "tamo", + "status": "succeeded", + "type": "indexCreation", + "canceledBy": null, + "details": { + "primaryKey": null + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/similar/errors.rs b/meilisearch/tests/similar/errors.rs index a6cb3cdbc..a6d7a3da6 100644 --- 
a/meilisearch/tests/similar/errors.rs +++ b/meilisearch/tests/similar/errors.rs @@ -87,6 +87,68 @@ async fn similar_bad_id() { "###); } +#[actix_rt::test] +async fn similar_bad_ranking_score_threshold() { + let server = Server::new().await; + let index = server.index("test"); + server.set_features(json!({"vectorStore": true})).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + "filterableAttributes": ["title"]})) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let (response, code) = index.similar_post(json!({"rankingScoreThreshold": ["doggo"]})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.rankingScoreThreshold`: expected a number, but found an array: `[\"doggo\"]`", + "code": "invalid_similar_ranking_score_threshold", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_ranking_score_threshold" + } + "###); +} + +#[actix_rt::test] +async fn similar_invalid_ranking_score_threshold() { + let server = Server::new().await; + let index = server.index("test"); + server.set_features(json!({"vectorStore": true})).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + "filterableAttributes": ["title"]})) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let (response, code) = index.similar_post(json!({"rankingScoreThreshold": 42})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value at `.rankingScoreThreshold`: the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`.", + "code": "invalid_similar_ranking_score_threshold", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_ranking_score_threshold" + } + "###); +} + #[actix_rt::test] async fn similar_invalid_id() { let server = Server::new().await; @@ -694,3 +756,54 @@ async fn filter_reserved_geo_point_string() { }) .await; } + +#[actix_rt::test] +async fn similar_bad_retrieve_vectors() { + let server = Server::new().await; + server.set_features(json!({"vectorStore": true})).await; + let index = server.index("test"); + + let (response, code) = index.similar_post(json!({"retrieveVectors": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_post(json!({"retrieveVectors": [true]})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_get("?retrieveVectors=").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in 
parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_get("?retrieveVectors=doggo").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); +} diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index ee78917cb..60a0203ed 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -78,7 +78,7 @@ async fn basic() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 143}), |response, code| { + .similar(json!({"id": 143, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -87,11 +87,16 @@ async fn basic() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } } }, { @@ -99,11 +104,16 @@ async fn basic() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } } }, { @@ -111,11 +121,16 @@ async fn basic() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] + "manual": { + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "regenerate": false + } } }, { @@ -123,11 +138,16 @@ async fn basic() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] + "manual": { + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "regenerate": false + } } } ] @@ -136,7 +156,7 @@ async fn basic() { .await; index - .similar(json!({"id": "299537"}), |response, code| { + .similar(json!({"id": "299537", "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -145,11 +165,16 @@ async fn basic() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] + "manual": { + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "regenerate": false + } } }, { @@ -157,11 +182,16 @@ async fn basic() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] + "manual": { + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "regenerate": false + } } }, { @@ -169,11 +199,16 @@ async fn basic() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } } }, { @@ -181,11 +216,16 @@ async fn basic() { "release_year": 1930, "id": "143", "_vectors": { - "manual": [ - -0.5, - 0.3, - 0.85 - ] + "manual": { + "embeddings": [ + [ + -0.5, + 
0.30000001192092896, + 0.8500000238418579 + ] + ], + "regenerate": false + } } } ] @@ -194,6 +234,285 @@ async fn basic() { .await; } +#[actix_rt::test] +async fn ranking_score_threshold() { + let server = Server::new().await; + let index = server.index("test"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + "filterableAttributes": ["title"]})) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = DOCUMENTS.clone(); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + index + .similar( + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0, "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Escape Room", + "release_year": 2019, + "id": "522681", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } + }, + "_rankingScore": 0.890957772731781 + }, + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } + }, + "_rankingScore": 0.39060014486312866 + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "release_year": 2019, + "id": "166428", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "regenerate": false + } + }, + "_rankingScore": 0.2819308042526245 + }, + { + "title": "Shazam!", + "release_year": 2019, + "id": "287947", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "regenerate": false + } + }, + "_rankingScore": 0.1662663221359253 + } + ] + "###); + }, + ) + .await; + + index + .similar( + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2, "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Escape Room", + "release_year": 2019, + "id": "522681", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } + }, + "_rankingScore": 0.890957772731781 + }, + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } + }, + "_rankingScore": 0.39060014486312866 + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "release_year": 2019, + "id": "166428", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "regenerate": false + } + }, + "_rankingScore": 0.2819308042526245 + } + 
] + "###); + }, + ) + .await; + + index + .similar( + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3, "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Escape Room", + "release_year": 2019, + "id": "522681", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } + }, + "_rankingScore": 0.890957772731781 + }, + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } + }, + "_rankingScore": 0.39060014486312866 + } + ] + "###); + }, + ) + .await; + + index + .similar( + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6, "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Escape Room", + "release_year": 2019, + "id": "522681", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } + }, + "_rankingScore": 0.890957772731781 + } + ] + "###); + }, + ) + .await; + + index + .similar( + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9, "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @"[]"); + }, + ) + .await; +} + #[actix_rt::test] async fn filter() { let server = Server::new().await; @@ -227,71 +546,97 @@ async fn filter() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 522681, "filter": "release_year = 2019"}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "Captain Marvel", - "release_year": 2019, - "id": "299537", - "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] - } - }, - { - "title": "How to Train Your Dragon: The Hidden World", - "release_year": 2019, - "id": "166428", - "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] - } - }, - { - "title": "Shazam!", - "release_year": 2019, - "id": "287947", - "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 522681, "filter": "release_year = 2019", "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } + } + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "release_year": 2019, + "id": "166428", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "regenerate": false + } + } + }, + { + "title": "Shazam!", + "release_year": 2019, + "id": "287947", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "regenerate": false + } + } + } + ] + "###); + }, + ) .await; index - 
.similar(json!({"id": 522681, "filter": "release_year < 2000"}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "All Quiet on the Western Front", - "release_year": 1930, - "id": "143", - "_vectors": { - "manual": [ - -0.5, - 0.3, - 0.85 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 522681, "filter": "release_year < 2000", "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "All Quiet on the Western Front", + "release_year": 1930, + "id": "143", + "_vectors": { + "manual": { + "embeddings": [ + [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + ], + "regenerate": false + } + } + } + ] + "###); + }, + ) .await; } @@ -328,7 +673,7 @@ async fn limit_and_offset() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 143, "limit": 1}), |response, code| { + .similar(json!({"id": 143, "limit": 1, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -337,11 +682,16 @@ async fn limit_and_offset() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } } } ] @@ -350,24 +700,32 @@ async fn limit_and_offset() { .await; index - .similar(json!({"id": 143, "limit": 1, "offset": 1}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "Captain Marvel", - "release_year": 2019, - "id": "299537", - "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 143, "limit": 1, "offset": 1, "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } + } + } + ] + "###); + }, + ) .await; } diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs new file mode 100644 index 000000000..0343ab785 --- /dev/null +++ b/meilisearch/tests/vector/mod.rs @@ -0,0 +1,588 @@ +mod settings; + +use meili_snap::{json_string, snapshot}; + +use crate::common::index::Index; +use crate::common::{GetAllDocumentsOptions, Server}; +use crate::json; + +#[actix_rt::test] +async fn add_remove_user_provided() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + 
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [10, 10, 10] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 10.0, + 10.0, + 10.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let (value, code) = index.delete_document(0).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); +} + +async fn generate_default_user_provided_documents(server: &Server) -> Index { + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, + {"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }}, + {"id": 3, "name": "intel", "_vectors": { "manual": { "regenerate": false, "embeddings": [3, 3, 3] }}}, + {"id": 4, "name": "max", "_vectors": { "manual": { "regenerate": false, "embeddings": [[4, 4, 4], [4, 4, 5]] }}}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + index +} + +#[actix_rt::test] +async fn user_provided_embeddings_error() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + // First case, we forget to specify the `regenerate` + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [0, 0, 0] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 2, + "indexUid": "doggo", + "status": "failed", + "type": 
"documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // Second case, we don't specify anything + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": {}}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 3, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // Third case, we specify something wrong in place of regenerate + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": "yes please" }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 4, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.regenerate`: expected a boolean, but found a string: `\"yes please\"`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 5, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. 
Invalid value type at `.manual.embeddings`: expected null or an array, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 6, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 7, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [23, 0.1, -12], "regenerate": true }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [0.1, [0.2, 0.3]] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 10, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. 
Invalid value type at `.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, 0.2], 0.3] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 11, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected an array, but found a number: `0.3`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, true], 0.3] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 12, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. 
Invalid value type at `.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn clear_documents() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + let (value, _code) = index.clear_all_documents().await; + index.wait_task(value.uid()).await; + + // Make sure the documents DB has been cleared + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [], + "offset": 0, + "limit": 20, + "total": 0 + } + "###); + + // Make sure the arroy DB has been cleared + let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; + snapshot!(documents, @r###" + { + "hits": [], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0, + "semanticHitCount": 0 + } + "###); +} + +#[actix_rt::test] +async fn add_remove_one_vector_4588() { + // https://github.com/meilisearch/meilisearch/issues/4588 + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, name: "settings-processed"); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, name: "document-added"); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": null }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, name: "document-deleted"); + + let (documents, _code) = index.search_post(json!({"vector": [1, 1, 1] })).await; + snapshot!(documents, @r###" + { + "hits": [ + { + "id": 0, + "name": "kefir" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1, + "semanticHitCount": 1 + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); +} diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs new file mode 100644 index 000000000..e53ceb383 --- /dev/null +++ b/meilisearch/tests/vector/settings.rs @@ -0,0 +1,228 @@ +use meili_snap::{json_string, snapshot}; + +use crate::common::{GetAllDocumentsOptions, Server}; +use crate::json; +use 
crate::vector::generate_default_user_provided_documents; + +#[actix_rt::test] +async fn update_embedder() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "manual": {}}, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 2, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + + let ret = server.wait_task(response.uid()).await; + snapshot!(ret, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 2 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn reset_embedder_documents() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + let (response, code) = index.delete_settings().await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + // Make sure the documents are still present + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + limit: None, + offset: None, + retrieve_vectors: false, + fields: None, + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + }, + { + "id": 2, + "name": "billou" + }, + { + "id": 3, + "name": "intel" + }, + { + "id": 4, + "name": "max" + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure we are still able to retrieve their vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "billou", + "_vectors": { + "manual": { + "embeddings": [ + [ + 2.0, + 2.0, + 2.0 + ], + [ + 2.0, + 2.0, + 3.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 3, + "name": "intel", + "_vectors": { + "manual": { + "embeddings": [ + [ + 3.0, + 3.0, + 3.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 4, + "name": "max", + "_vectors": { + "manual": { + "embeddings": [ + [ + 4.0, + 4.0, + 4.0 + ], + [ + 4.0, + 4.0, + 5.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure the arroy DB has been cleared + let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; + snapshot!(json_string!(documents), @r###" + { + "message": "Cannot find embedder with name `default`.", + "code": "invalid_embedder", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_embedder" + } + "###); +} 
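
For context on the new snapshot format exercised throughout these tests: each `_vectors.<embedder>` entry in a document is now either a bare embedding (or array of embeddings), or an object carrying `embeddings` plus a mandatory `regenerate` flag, and omitting `regenerate` from the object form is what produces the `invalid_vectors_type` failures asserted in `user_provided_embeddings_error` above. The sketch below is a hypothetical serde mirror of those two shapes, written only to make the contract explicit; it is not the parser shipped in milli (that lives in its `vector::parsed_vectors` module), and the `VectorsEntry` name is invented.

```rust
use serde::Deserialize;
use serde_json::json;

// Hypothetical mirror of the two accepted `_vectors.<embedder>` shapes.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum VectorsEntry {
    // Explicit form, as rendered by the updated snapshots:
    // { "embeddings": [[1.0, 2.0, 3.0]], "regenerate": false }
    Explicit { embeddings: Option<serde_json::Value>, regenerate: bool },
    // Shorthand form: a bare embedding, or an array of embeddings.
    Implicit(Vec<serde_json::Value>),
}

fn main() {
    // Shorthand input; `add_remove_user_provided` shows it round-trips to the
    // explicit form with `regenerate: false`.
    let implicit: VectorsEntry = serde_json::from_value(json!([0.1, 0.2, 0.3])).unwrap();

    // Explicit form with `regenerate` present parses fine.
    let explicit: VectorsEntry =
        serde_json::from_value(json!({ "embeddings": [[0.1, 0.2, 0.3]], "regenerate": false }))
            .unwrap();

    // An object missing `regenerate` matches neither shape, mirroring the
    // "Missing field `regenerate`" task errors asserted above.
    let missing = serde_json::from_value::<VectorsEntry>(json!({ "embeddings": [[0.1]] }));

    println!("{implicit:?} {explicit:?} missing-regenerate fails: {}", missing.is_err());
}
```

Note the asymmetry the snapshots also capture: vectors supplied by the user come back with `regenerate: false`, while embeddings computed by an embedder (as in `retrieve_vectors`) come back with `regenerate: true`.
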
diff --git a/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-added.snap b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-added.snap new file mode 100644 index 000000000..52d9ad38d --- /dev/null +++ b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-added.snap @@ -0,0 +1,19 @@ +--- +source: meilisearch/tests/vector/mod.rs +--- +{ + "uid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap new file mode 100644 index 000000000..de02d0b1d --- /dev/null +++ b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap @@ -0,0 +1,19 @@ +--- +source: meilisearch/tests/vector/mod.rs +--- +{ + "uid": 2, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/settings-processed.snap b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/settings-processed.snap new file mode 100644 index 000000000..316305fa8 --- /dev/null +++ b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/settings-processed.snap @@ -0,0 +1,23 @@ +--- +source: meilisearch/tests/vector/mod.rs +--- +{ + "uid": 0, + "indexUid": "doggo", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 27ce4a53e..fd7bde99b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ bincode = "1.3.3" bstr = "1.9.0" bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.8.10", default-features = false } +charabia = { version = "0.8.11", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.11" deserr = "0.6.1" @@ -44,7 +44,7 @@ once_cell = "1.19.0" ordered-float = "4.2.0" rand_pcg = { version = "0.3.1", features = ["serde1"] } rayon = "1.8.0" -roaring = "0.10.2" +roaring = { version = "0.10.2", features = ["serde"] } rstar = { version = "0.11.0", features = ["serde"] } serde = { version = "1.0.195", features = ["derive"] } serde_json = { version = "1.0.111", features = ["preserve_order"] } @@ -79,7 +79,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } tiktoken-rs = "0.5.8" liquid = "0.26.4" -arroy = "0.3.1" +arroy = "0.4.0" rand = "0.8.5" tracing = "0.1.40" ureq = { version = "2.9.7", features = ["json"] } diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 2779f5b15..87020994a 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -59,6 +59,7 @@ fn main() -> 
Result<(), Box<dyn Error>> { false, universe, &None, + &None, GeoSortStrategy::default(), 0, 20, @@ -66,6 +67,7 @@ fn main() -> Result<(), Box<dyn Error>> { &mut DefaultSearchLogger, logger, TimeBudget::max(), + None, )?; if let Some((logger, dir)) = detailed_logger { logger.finish(&mut ctx, Path::new(dir))?; diff --git a/milli/src/error.rs b/milli/src/error.rs index 83754afe4..8e03fde4e 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -119,6 +119,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco InvalidVectorDimensions { expected: usize, found: usize }, #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] InvalidVectorsMapType { document_id: String, value: Value }, + #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")] + InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError }, #[error("{0}")] InvalidFilter(String), #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))] InvalidFilterExpression(&'static [&'static str], Value), @@ -134,6 +136,17 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco } )] InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool }, + #[error("Attribute `{}` is not filterable and thus, cannot be used as distinct attribute. {}", + .field, + match .valid_fields.is_empty() { + true => "This index does not have configured filterable attributes.".to_string(), + false => format!("Available filterable attributes are: `{}{}`.", + valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "), + .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""), + ), + } + )] + InvalidDistinctAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool }, #[error("Attribute `{}` is not facet-searchable. {}", .field, match .valid_fields.is_empty() { @@ -270,8 +283,9 @@ impl From<arroy::Error> for Error { arroy::Error::DatabaseFull | arroy::Error::InvalidItemAppend | arroy::Error::UnmatchingDistance { .. } - | arroy::Error::MissingNode - | arroy::Error::MissingMetadata => { + | arroy::Error::NeedBuild(_) + | arroy::Error::MissingKey { .. } + | arroy::Error::MissingMetadata(_) => { Error::InternalError(InternalError::ArroyError(value)) } } diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index a737632a4..13f2f8afc 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -4,6 +4,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::{FieldId, FieldsIdsMap, Weight}; #[derive(Debug, Default, Serialize, Deserialize)] @@ -23,7 +24,13 @@ impl FieldidsWeightsMap { /// Should only be called in the case there are NO searchable attributes. /// All the fields will be inserted in the order of the fields ids map with a weight of 0. pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { - FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() } + FieldidsWeightsMap { + map: fid_map + .iter() + .filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME)) + .map(|(fid, _name)| (fid, 0)) + .collect(), + } } /// Removes a field id from the map, returning the associated weight previously in the map.
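
A short aside on `crate::is_faceted_by`, which the `from_field_id_map_without_searchable` change above and the `nested_ids` helper in the next file both lean on: a field counts as belonging to a prefix only when the prefix is followed by nothing or by a dot, which keeps `doggolution` out of `doggo`'s nested fields while grouping `_vectors.manual` under the reserved `_vectors` name. The function below is a reimplementation written for illustration, under the assumption that it matches milli's semantics; it is not copied from this patch.

```rust
/// Illustrative stand-in for milli's `is_faceted_by`: `field` belongs to
/// `facet` when it is `facet` itself or a dot-nested child of it.
fn is_faceted_by(field: &str, facet: &str) -> bool {
    field.starts_with(facet)
        && field[facet.len()..].chars().next().map_or(true, |c| c == '.')
}

fn main() {
    assert!(is_faceted_by("doggo", "doggo"));
    assert!(is_faceted_by("doggo.breed.name", "doggo"));
    // A bare string prefix is not enough: this is why the `nested_fields`
    // test below excludes `doggolution` from `nested_ids("doggo")`.
    assert!(!is_faceted_by("doggolution", "doggo"));
    // And why `_vectors.manual` is filtered out together with `_vectors`.
    assert!(is_faceted_by("_vectors.manual", "_vectors"));
}
```
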
diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 9c1c87f82..f9d7c3704 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -41,6 +41,16 @@ impl FieldsIdsMap { } } + /// Get the ids of a field and all its nested fields based on its name. + pub fn nested_ids(&self, name: &str) -> Vec { + self.names_ids + .range(name.to_string()..) + .take_while(|(key, _)| key.starts_with(name)) + .filter(|(key, _)| crate::is_faceted_by(key, name)) + .map(|(_name, id)| *id) + .collect() + } + /// Get the id of a field based on its name. pub fn id(&self, name: &str) -> Option { self.names_ids.get(name).copied() @@ -126,4 +136,32 @@ mod tests { assert_eq!(iter.next(), Some((3, "title"))); assert_eq!(iter.next(), None); } + + #[test] + fn nested_fields() { + let mut map = FieldsIdsMap::new(); + + assert_eq!(map.insert("id"), Some(0)); + assert_eq!(map.insert("doggo"), Some(1)); + assert_eq!(map.insert("doggo.name"), Some(2)); + assert_eq!(map.insert("doggolution"), Some(3)); + assert_eq!(map.insert("doggo.breed.name"), Some(4)); + assert_eq!(map.insert("description"), Some(5)); + + insta::assert_debug_snapshot!(map.nested_ids("doggo"), @r###" + [ + 1, + 4, + 2, + ] + "###); + + insta::assert_debug_snapshot!(map.nested_ids("doggo.breed"), @r###" + [ + 4, + ] + "###); + + insta::assert_debug_snapshot!(map.nested_ids("_vector"), @"[]"); + } } diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 7bb874060..a8bb5055e 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -47,6 +47,12 @@ pub struct FacetGroupValue { pub bitmap: RoaringBitmap, } +#[derive(Debug)] +pub struct FacetGroupLazyValue<'b> { + pub size: u8, + pub bitmap_bytes: &'b [u8], +} + pub struct FacetGroupKeyCodec { _phantom: PhantomData, } @@ -69,6 +75,7 @@ where Ok(Cow::Owned(v)) } } + impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec where T: BytesDecode<'a>, @@ -84,6 +91,7 @@ where } pub struct FacetGroupValueCodec; + impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { type EItem = FacetGroupValue; @@ -93,11 +101,23 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { Ok(Cow::Owned(v)) } } + impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { type DItem = FacetGroupValue; + fn bytes_decode(bytes: &'a [u8]) -> Result { let size = bytes[0]; let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?; Ok(FacetGroupValue { size, bitmap }) } } + +pub struct FacetGroupLazyValueCodec; + +impl<'a> heed::BytesDecode<'a> for FacetGroupLazyValueCodec { + type DItem = FacetGroupLazyValue<'a>; + + fn bytes_decode(bytes: &'a [u8]) -> Result { + Ok(FacetGroupLazyValue { size: bytes[0], bitmap_bytes: &bytes[1..] }) + } +} diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 1db518c7d..a04698019 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::io; +use std::io::{self, Cursor}; use std::mem::size_of; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; @@ -57,6 +57,24 @@ impl CboRoaringBitmapCodec { } } + pub fn intersection_with_serialized( + mut bytes: &[u8], + other: &RoaringBitmap, + ) -> io::Result { + // See above `deserialize_from` method for implementation details. 
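+ // The serialized form is either a bare run of native-endian u32s (when the
+ // set holds at most THRESHOLD integers) or a regular RoaringBitmap
+ // serialization, so the two branches below mirror the two on-disk layouts.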
+ if bytes.len() <= THRESHOLD * size_of::() { + let mut bitmap = RoaringBitmap::new(); + while let Ok(integer) = bytes.read_u32::() { + if other.contains(integer) { + bitmap.insert(integer); + } + } + Ok(bitmap) + } else { + other.intersection_with_serialized_unchecked(Cursor::new(bytes)) + } + } + /// Merge serialized CboRoaringBitmaps in a buffer. /// /// if the merged values length is under the threshold, values are directly diff --git a/milli/src/index.rs b/milli/src/index.rs index 3c502d541..0a7a20ce0 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -9,6 +9,7 @@ use heed::types::*; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use roaring::RoaringBitmap; use rstar::RTree; +use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use crate::documents::PrimaryKey; @@ -23,6 +24,7 @@ use crate::heed_codec::{ }; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::{Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, @@ -644,6 +646,7 @@ impl Index { &self, wtxn: &mut RwTxn, user_fields: &[&str], + non_searchable_fields_ids: &[FieldId], fields_ids_map: &FieldsIdsMap, ) -> Result<()> { // We can write the user defined searchable fields as-is. @@ -662,6 +665,7 @@ impl Index { for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) && !real_fields.contains(&field_from_map) + && !non_searchable_fields_ids.contains(&id) { real_fields.push(field_from_map); @@ -708,6 +712,7 @@ impl Index { Ok(self .fields_ids_map(rtxn)? .names() + .filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME)) .map(|field| Cow::Owned(field.to_string())) .collect()) }) @@ -1568,12 +1573,16 @@ impl Index { Ok(script_language) } + /// Put the embedding configs: + /// 1. The name of the embedder + /// 2. The configuration option for this embedder + /// 3. The list of documents with a user provided embedding pub(crate) fn put_embedding_configs( &self, wtxn: &mut RwTxn<'_>, - configs: Vec<(String, EmbeddingConfig)>, + configs: Vec, ) -> heed::Result<()> { - self.main.remap_types::>>().put( + self.main.remap_types::>>().put( wtxn, main_key::EMBEDDING_CONFIGS, &configs, @@ -1584,13 +1593,10 @@ impl Index { self.main.remap_key_type::().delete(wtxn, main_key::EMBEDDING_CONFIGS) } - pub fn embedding_configs( - &self, - rtxn: &RoTxn<'_>, - ) -> Result> { + pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result> { Ok(self .main - .remap_types::>>() + .remap_types::>>() .get(rtxn, main_key::EMBEDDING_CONFIGS)? 
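+ // Same main-database key as before, but the payload is now a
+ // `Vec<IndexEmbeddingConfig>` carrying each embedder's `user_provided`
+ // bitmap next to its configuration.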
.unwrap_or_default()) } @@ -1604,7 +1610,7 @@ impl Index { arroy::Reader::open(rtxn, k, self.vector_arroy) .map(Some) .or_else(|e| match e { - arroy::Error::MissingMetadata => Ok(None), + arroy::Error::MissingMetadata(_) => Ok(None), e => Err(e.into()), }) .transpose() @@ -1637,7 +1643,7 @@ impl Index { let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy) .map(Some) .or_else(|e| match e { - arroy::Error::MissingMetadata => Ok(None), + arroy::Error::MissingMetadata(_) => Ok(None), e => Err(e), }) .transpose(); @@ -1662,6 +1668,13 @@ impl Index { } } +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub config: EmbeddingConfig, + pub user_provided: RoaringBitmap, +} + #[cfg(test)] pub(crate) mod tests { use std::collections::HashSet; @@ -1669,15 +1682,17 @@ pub(crate) mod tests { use big_s::S; use heed::{EnvOpenOptions, RwTxn}; - use maplit::hashset; + use maplit::{btreemap, hashset}; use tempfile::TempDir; use crate::documents::DocumentsBatchReader; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, + self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, + Settings, }; + use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult}; pub(crate) struct TempIndex { @@ -2783,4 +2798,95 @@ pub(crate) mod tests { ] "###); } + + #[test] + fn vectors_are_never_indexed_as_searchable_or_filterable() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "id": 0, "_vectors": { "doggo": [2345] } }, + { "id": 1, "_vectors": { "doggo": [6789] } }, + ])) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @r###"["id"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + drop(rtxn); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]); + settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]); + }) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @"[]"); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + + let mut search = index.search(&rtxn); + let results = search + .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) + .execute() + .unwrap(); + assert!(results.candidates.is_empty()); + + index + .update_settings(|settings| { + settings.set_embedder_settings(btreemap! 
{ S("doggo") => Setting::Set(EmbeddingSettings { dimensions: Setting::Set(1), source: Setting::Set(EmbedderSource::UserProvided), ..EmbeddingSettings::default()}), }); }) .unwrap(); db_snap!(index, fields_ids_map, @r###" 0 id | 1 _vectors | 2 _vectors.doggo | "###); db_snap!(index, searchable_fields, @"[]"); db_snap!(index, fieldids_weights_map, @r###" fid weight "###); let rtxn = index.read_txn().unwrap(); let mut search = index.search(&rtxn); let results = search.query("2345").execute().unwrap(); assert!(results.candidates.is_empty()); let mut search = index.search(&rtxn); let results = search .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) .execute() .unwrap(); assert!(results.candidates.is_empty()); } } diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index d993ef2dc..1e6ea8d88 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -6,9 +6,11 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec, +}; use crate::heed_codec::BytesRefCodec; -use crate::DocumentId; +use crate::{CboRoaringBitmapCodec, DocumentId}; /// Call the given closure on the facet distribution of the candidate documents. /// @@ -31,14 +33,11 @@ pub fn lexicographically_iterate_over_facet_distribution<'t, CB>( where CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>, { + let db = db.remap_data_type::<FacetGroupLazyValueCodec>(); let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback }; - let highest_level = get_highest_level( - rtxn, - db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(), - field_id, - )?; + let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; Ok(()) } else { @@ -75,13 +74,10 @@ where // Represents the list of keys that we must explore. let mut heap = BinaryHeap::new(); - let highest_level = get_highest_level( - rtxn, - db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(), - field_id, - )?; + let db = db.remap_data_type::<FacetGroupLazyValueCodec>(); + let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?
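+ // With `FacetGroupLazyValueCodec`, only the one-byte group size is decoded
+ // eagerly; the bitmap stays serialized until it is intersected with
+ // `candidates`, sparing full decodes for groups that barely overlap it.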
{ // We first fill the heap with values from the highest level let starting_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; @@ -92,7 +88,10 @@ where if key.field_id != field_id { break; } - let intersection = value.bitmap & candidates; + let intersection = CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + candidates, + )?; let count = intersection.len(); if count != 0 { heap.push(LevelEntry { @@ -121,7 +120,10 @@ where if key.field_id != field_id { break; } - let intersection = value.bitmap & candidates; + let intersection = CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + candidates, + )?; let count = intersection.len(); if count != 0 { heap.push(LevelEntry { @@ -146,7 +148,7 @@ where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupLazyValueCodec>, field_id: u16, callback: CB, } @@ -171,7 +173,10 @@ where if key.field_id != self.field_id { return Ok(ControlFlow::Break(())); } - let docids_in_common = value.bitmap & candidates; + let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + candidates, + )?; if !docids_in_common.is_empty() { let any_docid_in_common = docids_in_common.min().unwrap(); match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)? @@ -205,7 +210,10 @@ where if key.field_id != self.field_id { return Ok(ControlFlow::Break(())); } - let docids_in_common = value.bitmap & candidates; + let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + candidates, + )?; if !docids_in_common.is_empty() { let cf = self.iterate( &docids_in_common, diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index e340fbac5..0f8f58771 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -4,9 +4,11 @@ use heed::BytesEncode; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec, +}; use crate::heed_codec::BytesRefCodec; -use crate::Result; +use crate::{CboRoaringBitmapCodec, Result}; /// Find all the document ids for which the given field contains a value contained within /// the two bounds. @@ -16,6 +18,7 @@ pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, + universe: Option<&RoaringBitmap>, docids: &mut RoaringBitmap, ) -> Result<()> where @@ -46,13 +49,15 @@ where } Bound::Unbounded => Bound::Unbounded, }; - let db = db.remap_key_type::>(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; + let db = db.remap_types::, FacetGroupLazyValueCodec>(); + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(starting_left_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(starting_left_bound) = + get_first_facet_value::(rtxn, db, field_id)? 
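+ // The new `universe` parameter lets each matching group be intersected
+ // against the caller's candidate set while still serialized; `None`
+ // preserves the previous behaviour of decoding every bitmap in full.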
+ { let rightmost_bound = - Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded + Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) @@ -64,12 +69,16 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupLazyValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, + /// The subset of documents ids that are useful for this search. + /// Great performance optimizations can be achieved by only fetching values matching this subset. + universe: Option<&'bitmap RoaringBitmap>, docids: &'bitmap mut RoaringBitmap, } + impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { let left_key = @@ -104,7 +113,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { - *self.docids |= value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?, + }; } } Ok(()) @@ -195,7 +210,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { left_condition && right_condition }; if should_take_whole_group { - *self.docids |= &previous_value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, + }; previous_key = next_key; previous_value = next_value; continue; @@ -291,7 +312,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { left_condition && right_condition }; if should_take_whole_group { - *self.docids |= &previous_value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, + }; } else { let level = level - 1; let starting_left_bound = previous_key.left_bound; @@ -365,6 +392,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -384,6 +412,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -418,6 +447,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -439,6 +469,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -474,6 +505,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -499,6 +531,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -537,6 +570,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -556,6 +590,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -571,6 +606,7 @@ mod tests { 0, &Bound::Unbounded, &Bound::Unbounded, + None, &mut docids, ) .unwrap(); @@ -586,6 +622,7 @@ mod tests { 1, &Bound::Unbounded, &Bound::Unbounded, + None, &mut docids, ) .unwrap(); @@ -621,6 +658,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) 
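+ // All pre-existing test call sites pass `None` for the universe (no
+ // candidate restriction), so their expected results are unchanged.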
.unwrap(); @@ -634,6 +672,7 @@ mod tests { 1, &start, &end, + None, &mut docids, ) .unwrap(); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 07fe64510..59a95e5bd 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -36,7 +36,7 @@ pub fn ascending_facet_sort<'t>( candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index dd2692012..29586e4e4 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -19,9 +19,9 @@ pub fn descending_facet_sort<'t>( candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); Ok(itertools::Either::Left(DescendingFacetSort { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index dbd9538a5..c08abc8e0 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -4,7 +4,7 @@ use std::ops::Bound::{self, Excluded, Included}; use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Token}; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use serde_json::Value; use super::facet_range_search; @@ -224,14 +224,14 @@ impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time let filterable_fields = index.filterable_fields(rtxn)?; - - self.inner_evaluate(rtxn, index, &filterable_fields) + self.inner_evaluate(rtxn, index, &filterable_fields, None) } fn evaluate_operator( rtxn: &heed::RoTxn, index: &Index, field_id: FieldId, + universe: Option<&RoaringBitmap>, operator: &Condition<'a>, ) -> Result { let numbers_db = index.facet_id_f64_docids; @@ -291,14 +291,22 @@ impl<'a> Filter<'a> { } Condition::NotEqual(val) => { let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; + let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?; let all_ids = index.documents_ids(rtxn)?; return Ok(all_ids - docids); } }; let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, numbers_db, field_id, left, right, &mut output)?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + left, + right, + universe, + &mut output, + )?; Ok(output) } @@ -310,6 +318,7 @@ impl<'a> Filter<'a> { field_id: 
FieldId, left: Bound, right: Bound, + universe: Option<&RoaringBitmap>, output: &mut RoaringBitmap, ) -> Result<()> { match (left, right) { @@ -321,7 +330,7 @@ impl<'a> Filter<'a> { (_, _) => (), } facet_range_search::find_docids_of_facet_within_bounds::( - rtxn, db, field_id, &left, &right, output, + rtxn, db, field_id, &left, &right, universe, output, )?; Ok(()) @@ -332,31 +341,37 @@ impl<'a> Filter<'a> { rtxn: &heed::RoTxn, index: &Index, filterable_fields: &HashSet, + universe: Option<&RoaringBitmap>, ) -> Result { + if universe.map_or(false, |u| u.is_empty()) { + return Ok(RoaringBitmap::new()); + } + match &self.condition { FilterCondition::Not(f) => { - let all_ids = index.documents_ids(rtxn)?; let selected = Self::inner_evaluate( &(f.as_ref().clone()).into(), rtxn, index, filterable_fields, + universe, )?; - Ok(all_ids - selected) + match universe { + Some(universe) => Ok(universe - selected), + None => { + let all_ids = index.documents_ids(rtxn)?; + Ok(all_ids - selected) + } + } } FilterCondition::In { fid, els } => { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.value()) { - let mut bitmap = RoaringBitmap::new(); - - for el in els { - let op = Condition::Equal(el.clone()); - let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?; - bitmap |= el_bitmap; - } - Ok(bitmap) + els.iter() + .map(|el| Condition::Equal(el.clone())) + .map(|op| Self::evaluate_operator(rtxn, index, fid, universe, &op)) + .union() } else { Ok(RoaringBitmap::new()) } @@ -371,7 +386,7 @@ impl<'a> Filter<'a> { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, fid, op) + Self::evaluate_operator(rtxn, index, fid, universe, op) } else { Ok(RoaringBitmap::new()) } @@ -382,14 +397,11 @@ impl<'a> Filter<'a> { }))? } } - FilterCondition::Or(subfilters) => { - let mut bitmap = RoaringBitmap::new(); - for f in subfilters { - bitmap |= - Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; - } - Ok(bitmap) - } + FilterCondition::Or(subfilters) => subfilters + .iter() + .cloned() + .map(|f| Self::inner_evaluate(&f.into(), rtxn, index, filterable_fields, universe)) + .union(), FilterCondition::And(subfilters) => { let mut subfilters_iter = subfilters.iter(); if let Some(first_subfilter) = subfilters_iter.next() { @@ -398,16 +410,21 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; for f in subfilters_iter { if bitmap.is_empty() { return Ok(bitmap); } + // TODO We are doing the intersections two times, + // it could be more efficient + // Can't I just replace this `&=` by an `=`? bitmap &= Self::inner_evaluate( &(f.clone()).into(), rtxn, index, filterable_fields, + Some(&bitmap), )?; } Ok(bitmap) @@ -507,6 +524,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; let geo_lng_token = Token::new( @@ -539,6 +557,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; let condition_right = FilterCondition::Condition { @@ -552,6 +571,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; left | right @@ -567,6 +587,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )? 
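+ // Every recursive branch now receives the same `universe`: `Not` subtracts
+ // from it instead of from all document ids, and `And` chains pass the
+ // running intersection down so each subfilter evaluates a shrinking set.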
}; diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 34a9cdcb8..858028bb5 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::filter::{BadGeoError, Filter}; pub use self::search::{FacetValueHit, SearchForFacetValues}; -use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; +use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::heed_codec::BytesRefCodec; use crate::{Index, Result}; @@ -54,9 +54,9 @@ pub fn facet_max_value<'t>( } /// Get the first facet value in the facet database -pub(crate) fn get_first_facet_value<'t, BoundCodec>( +pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result> where @@ -78,9 +78,9 @@ where } /// Get the last facet value in the facet database -pub(crate) fn get_last_facet_value<'t, BoundCodec>( +pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result> where @@ -102,9 +102,9 @@ where } /// Get the height of the highest level in the facet database -pub(crate) fn get_highest_level<'t>( +pub(crate) fn get_highest_level<'t, DC>( txn: &'t RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index fc13a5e1e..f7e1aa492 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -159,6 +159,7 @@ impl<'a> Search<'a> { offset: 0, limit: self.limit + self.offset, sort_criteria: self.sort_criteria.clone(), + distinct: self.distinct.clone(), searchable_attributes: self.searchable_attributes, geo_strategy: self.geo_strategy, terms_matching_strategy: self.terms_matching_strategy, @@ -169,6 +170,7 @@ impl<'a> Search<'a> { index: self.index, semantic: self.semantic.clone(), time_budget: self.time_budget.clone(), + ranking_score_threshold: self.ranking_score_threshold, }; let semantic = search.semantic.take(); @@ -176,16 +178,16 @@ impl<'a> Search<'a> { // completely skip semantic search if the results of the keyword search are good enough if self.results_good_enough(&keyword_results, semantic_ratio) { - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); } // no vector search against placeholder search let Some(query) = search.query.take() else { - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); }; // no embedder, no semantic search let Some(SemanticSearch { vector, embedder_name, embedder }) = semantic else { - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); }; let vector_query = match vector { @@ -237,3 +239,44 @@ impl<'a> Search<'a> { true } } + +fn return_keyword_results( + limit: usize, + offset: usize, + SearchResult { + matching_words, + candidates, + mut documents_ids, + mut document_scores, + degraded, + used_negative_operator, + }: SearchResult, +) -> (SearchResult, Option) { + let (documents_ids, document_scores) = if offset >= documents_ids.len() || + // technically 
redundant because documents_ids.len() == document_scores.len(), + // defensive programming + offset >= document_scores.len() { + (vec![], vec![]) + } else { + // PANICS: offset < len + documents_ids.rotate_left(offset); + documents_ids.truncate(limit); + + // PANICS: offset < len + document_scores.rotate_left(offset); + document_scores.truncate(limit); + (documents_ids, document_scores) + }; + ( + SearchResult { + matching_words, + candidates, + documents_ids, + document_scores, + degraded, + used_negative_operator, + }, + Some(0), + ) +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 443e4b9c1..8ae1ebb0f 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,8 +11,8 @@ use self::new::{execute_vector_search, PartialSearchResult}; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::vector::Embedder; use crate::{ - execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, - SearchContext, TimeBudget, + execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index, + Result, SearchContext, TimeBudget, UserError, }; // Building these factories is not free. @@ -40,6 +40,7 @@ pub struct Search<'a> { offset: usize, limit: usize, sort_criteria: Option<Vec<AscDesc>>, + distinct: Option<String>, searchable_attributes: Option<&'a [String]>, geo_strategy: new::GeoSortStrategy, terms_matching_strategy: TermsMatchingStrategy, @@ -50,6 +51,7 @@ pub struct Search<'a> { index: &'a Index, semantic: Option<SemanticSearch>, time_budget: TimeBudget, + ranking_score_threshold: Option<f64>, } impl<'a> Search<'a> { @@ -60,6 +62,7 @@ impl<'a> Search<'a> { offset: 0, limit: 20, sort_criteria: None, + distinct: None, searchable_attributes: None, geo_strategy: new::GeoSortStrategy::default(), terms_matching_strategy: TermsMatchingStrategy::default(), @@ -70,6 +73,7 @@ impl<'a> Search<'a> { index, semantic: None, time_budget: TimeBudget::max(), + ranking_score_threshold: None, } } @@ -103,6 +107,11 @@ impl<'a> Search<'a> { self } + pub fn distinct(&mut self, distinct: String) -> &mut Search<'a> { + self.distinct = Some(distinct); + self + } + pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> { self.searchable_attributes = Some(searchable); self @@ -146,6 +155,11 @@ impl<'a> Search<'a> { self } + pub fn ranking_score_threshold(&mut self, ranking_score_threshold: f64) -> &mut Search<'a> { + self.ranking_score_threshold = Some(ranking_score_threshold); + self + } + pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> { if has_vector_search { let ctx = SearchContext::new(self.index, self.rtxn)?; @@ -162,6 +176,19 @@ impl<'a> Search<'a> { ctx.attributes_to_search_on(searchable_attributes)?; } + if let Some(distinct) = &self.distinct { + let filterable_fields = ctx.index.filterable_fields(ctx.txn)?; + if !crate::is_faceted(distinct, &filterable_fields) { + let (valid_fields, hidden_fields) = + ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?; + return Err(Error::UserError(UserError::InvalidDistinctAttribute { + field: distinct.clone(), + valid_fields, + hidden_fields, + })); + } + } + let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?; let PartialSearchResult { located_query_terms, @@ -178,12 +205,14 @@ self.scoring_strategy, universe, &self.sort_criteria, + &self.distinct, self.geo_strategy, self.offset, self.limit, embedder_name, embedder, self.time_budget.clone(), + self.ranking_score_threshold, )?
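+ // The threshold is only forwarded from here; the pruning itself happens in
+ // `bucket_sort`, which drops whole buckets whose global score falls below
+ // it (see bucket_sort.rs further down).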
} _ => execute_search( @@ -194,6 +223,7 @@ impl<'a> Search<'a> { self.exhaustive_number_hits, universe, &self.sort_criteria, + &self.distinct, self.geo_strategy, self.offset, self.limit, @@ -201,6 +231,7 @@ impl<'a> Search<'a> { &mut DefaultSearchLogger, &mut DefaultSearchLogger, self.time_budget.clone(), + self.ranking_score_threshold, )?, }; @@ -229,6 +260,7 @@ impl fmt::Debug for Search<'_> { offset, limit, sort_criteria, + distinct, searchable_attributes, geo_strategy: _, terms_matching_strategy, @@ -239,6 +271,7 @@ impl fmt::Debug for Search<'_> { index: _, semantic, time_budget, + ranking_score_threshold, } = self; f.debug_struct("Search") .field("query", query) @@ -247,6 +280,7 @@ impl fmt::Debug for Search<'_> { .field("offset", offset) .field("limit", limit) .field("sort_criteria", sort_criteria) + .field("distinct", distinct) .field("searchable_attributes", searchable_attributes) .field("terms_matching_strategy", terms_matching_strategy) .field("scoring_strategy", scoring_strategy) @@ -257,6 +291,7 @@ impl fmt::Debug for Search<'_> { &semantic.as_ref().map(|semantic| &semantic.embedder_name), ) .field("time_budget", time_budget) + .field("ranking_score_threshold", ranking_score_threshold) .finish() } } diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index e9bc5449d..9255e4c09 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -22,18 +22,25 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, mut ranking_rules: Vec>, query: &Q, + distinct: Option<&str>, universe: &RoaringBitmap, from: usize, length: usize, scoring_strategy: ScoringStrategy, logger: &mut dyn SearchLogger, time_budget: TimeBudget, + ranking_score_threshold: Option, ) -> Result { logger.initial_query(query); logger.ranking_rules(&ranking_rules); logger.initial_universe(universe); - let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? 
{ + let distinct_field = match distinct { + Some(distinct) => Some(distinct), + None => ctx.index.distinct_field(ctx.txn)?, + }; + + let distinct_fid = if let Some(field) = distinct_field { ctx.index.fields_ids_map(ctx.txn)?.id(field) } else { None @@ -164,7 +171,19 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( loop { let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]); ranking_rule_scores.push(ScoreDetails::Skipped); + + // remove candidates from the universe without adding them to result if their score is below the threshold + if let Some(ranking_score_threshold) = ranking_score_threshold { + let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); + if current_score < ranking_score_threshold { + all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index]; + back!(); + continue; + } + } + maybe_add_to_results!(bucket); + ranking_rule_scores.pop(); if cur_ranking_rule_index == 0 { @@ -220,6 +239,18 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( debug_assert!( ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates) ); + + // remove candidates from the universe without adding them to result if their score is below the threshold + if let Some(ranking_score_threshold) = ranking_score_threshold { + let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); + if current_score < ranking_score_threshold { + all_candidates -= + next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; + back!(); + continue; + } + } + ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; if cur_ranking_rule_index == ranking_rules_len - 1 diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index f121971b8..77ae5fcd5 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -516,6 +516,7 @@ mod tests { false, universe, &None, + &None, crate::search::new::GeoSortStrategy::default(), 0, 100, @@ -523,6 +524,7 @@ mod tests { &mut crate::DefaultSearchLogger, &mut crate::DefaultSearchLogger, TimeBudget::max(), + None, ) .unwrap(); diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index f1d1db6a9..25ce482d3 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -548,6 +548,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( Ok(()) } +#[tracing::instrument(level = "trace", skip_all, target = "search")] pub fn filtered_universe( index: &Index, txn: &RoTxn<'_>, @@ -567,12 +568,14 @@ pub fn execute_vector_search( scoring_strategy: ScoringStrategy, universe: RoaringBitmap, sort_criteria: &Option>, + distinct: &Option, geo_strategy: geo_sort::Strategy, from: usize, length: usize, embedder_name: &str, embedder: &Embedder, time_budget: TimeBudget, + ranking_score_threshold: Option, ) -> Result { check_sort_criteria(ctx, sort_criteria.as_ref())?; @@ -596,12 +599,14 @@ pub fn execute_vector_search( ctx, ranking_rules, &PlaceholderQuery, + distinct.as_deref(), &universe, from, length, scoring_strategy, placeholder_search_logger, time_budget, + ranking_score_threshold, )?; Ok(PartialSearchResult { @@ -624,6 +629,7 @@ pub fn execute_search( exhaustive_number_hits: bool, mut universe: RoaringBitmap, sort_criteria: &Option>, + distinct: &Option, geo_strategy: geo_sort::Strategy, from: usize, length: usize, @@ -631,6 +637,7 @@ pub fn execute_search( placeholder_search_logger: &mut dyn SearchLogger, query_graph_logger: &mut dyn SearchLogger, time_budget: 
TimeBudget, + ranking_score_threshold: Option, ) -> Result { check_sort_criteria(ctx, sort_criteria.as_ref())?; @@ -713,12 +720,14 @@ pub fn execute_search( ctx, ranking_rules, &graph, + distinct.as_deref(), &universe, from, length, scoring_strategy, query_graph_logger, time_budget, + ranking_score_threshold, )? } else { let ranking_rules = @@ -727,12 +736,14 @@ pub fn execute_search( ctx, ranking_rules, &PlaceholderQuery, + distinct.as_deref(), &universe, from, length, scoring_strategy, placeholder_search_logger, time_budget, + ranking_score_threshold, )? }; @@ -742,7 +753,12 @@ pub fn execute_search( // The candidates is the universe unless the exhaustive number of hits // is requested and a distinct attribute is set. if exhaustive_number_hits { - if let Some(f) = ctx.index.distinct_field(ctx.txn)? { + let distinct_field = match distinct.as_deref() { + Some(distinct) => Some(distinct), + None => ctx.index.distinct_field(ctx.txn)?, + }; + + if let Some(f) = distinct_field { if let Some(distinct_fid) = fields_ids_map.id(f) { all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining; } diff --git a/milli/src/search/new/tests/distinct.rs b/milli/src/search/new/tests/distinct.rs index c54600f27..75c00da2a 100644 --- a/milli/src/search/new/tests/distinct.rs +++ b/milli/src/search/new/tests/distinct.rs @@ -205,8 +205,18 @@ fn create_index() -> TempIndex { index } -fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec { - let vs = collect_field_values(index, txn, index.distinct_field(txn).unwrap().unwrap(), docids); +fn verify_distinct( + index: &Index, + txn: &RoTxn, + distinct: Option<&str>, + docids: &[u32], +) -> Vec { + let vs = collect_field_values( + index, + txn, + distinct.or_else(|| index.distinct_field(txn).unwrap()).unwrap(), + docids, + ); let mut unique = HashSet::new(); for v in vs.iter() { @@ -223,12 +233,49 @@ fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec { fn test_distinct_placeholder_no_ranking_rules() { let index = create_index(); + // Set the letter as filterable and unset the distinct attribute. + index + .update_settings(|s| { + s.set_filterable_fields(hashset! { S("letter") }); + s.reset_distinct_field(); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.distinct(S("letter")); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); + let distinct_values = verify_distinct(&index, &txn, Some("letter"), &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"A\"", + "\"B\"", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"G\"", + "\"H\"", + "\"I\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); +} + +#[test] +fn test_distinct_at_search_placeholder_no_ranking_rules() { + let index = create_index(); + let txn = index.read_txn().unwrap(); let s = Search::new(&txn, &index); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"A\"", @@ -263,7 +310,7 @@ fn test_distinct_placeholder_sort() { let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"E\"", @@ -303,7 +350,7 @@ fn test_distinct_placeholder_sort() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"I\"", @@ -346,7 +393,7 @@ fn test_distinct_placeholder_sort() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"I\"", @@ -399,7 +446,7 @@ fn test_distinct_words() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"A\"", @@ -453,7 +500,7 @@ fn test_distinct_sort_words() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"I\"", @@ -549,7 +596,7 @@ fn test_distinct_typo() { let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"B\"", diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap deleted file mode 100644 index 930a21626..000000000 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap +++ /dev/null @@ -1,244 +0,0 @@ ---- -source: milli/src/search/new/tests/attribute_fid.rs -expression: "format!(\"{document_ids_scores:#?}\")" ---- -[ - ( - 2, - [ - Fid( - Rank { - rank: 19, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 91, - max_rank: 91, - }, - ), - ], - ), - ( - 6, - [ - Fid( - Rank { - rank: 15, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 5, - [ - Fid( - Rank { - rank: 14, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 79, - max_rank: 91, - }, - ), - ], - ), - ( - 4, - [ - Fid( - Rank { - rank: 13, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 77, - max_rank: 91, - }, - ), - ], - ), - ( - 3, - [ - Fid( - Rank { - rank: 12, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 83, - max_rank: 91, - }, - ), - ], - ), - ( - 9, - [ - Fid( - Rank { - rank: 11, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 75, - max_rank: 91, - }, - ), - ], - ), - ( - 8, - [ - Fid( - Rank { - rank: 10, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 79, - max_rank: 91, - }, - ), - ], - ), - ( - 7, - [ - Fid( - Rank { - rank: 10, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 73, - max_rank: 91, - }, - ), - ], - ), - ( - 11, - [ - Fid( - Rank { - rank: 7, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 77, - max_rank: 91, - }, - ), - ], - ), - ( - 10, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 13, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 12, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 78, - max_rank: 91, - }, - ), - ], - ), - ( - 14, - [ - Fid( - Rank { - rank: 5, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 75, - max_rank: 91, - }, - ), - ], - ), - ( - 0, - [ - Fid( - Rank { - rank: 1, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 91, - max_rank: 91, - }, - ), - ], - ), -] diff --git a/milli/src/search/new/tests/stop_words.rs b/milli/src/search/new/tests/stop_words.rs index 629751b48..dc1e45fce 100644 --- a/milli/src/search/new/tests/stop_words.rs +++ b/milli/src/search/new/tests/stop_words.rs @@ -13,7 +13,7 @@ use std::collections::BTreeSet; use std::iter::FromIterator; use crate::index::tests::TempIndex; -use crate::{db_snap, Search, SearchResult, TermsMatchingStrategy}; +use crate::{Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -66,9 +66,10 @@ fn create_index() -> TempIndex { } #[test] +#[cfg(not(feature = "swedish-recomposition"))] fn test_stop_words_not_indexed() { let index = create_index(); - db_snap!(index, word_docids, 
@"6288f9d7db3703b02c57025eb4a69264"); + crate::db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264"); } #[test] diff --git a/milli/src/search/similar.rs b/milli/src/search/similar.rs index 49b7c876f..bf5cc323f 100644 --- a/milli/src/search/similar.rs +++ b/milli/src/search/similar.rs @@ -17,6 +17,7 @@ pub struct Similar<'a> { index: &'a Index, embedder_name: String, embedder: Arc, + ranking_score_threshold: Option, } impl<'a> Similar<'a> { @@ -29,7 +30,17 @@ impl<'a> Similar<'a> { embedder_name: String, embedder: Arc, ) -> Self { - Self { id, filter: None, offset, limit, rtxn, index, embedder_name, embedder } + Self { + id, + filter: None, + offset, + limit, + rtxn, + index, + embedder_name, + embedder, + ranking_score_threshold: None, + } } pub fn filter(&mut self, filter: Filter<'a>) -> &mut Self { @@ -37,8 +48,18 @@ impl<'a> Similar<'a> { self } + pub fn ranking_score_threshold(&mut self, ranking_score_threshold: f64) -> &mut Self { + self.ranking_score_threshold = Some(ranking_score_threshold); + self + } + pub fn execute(&self) -> Result { - let universe = filtered_universe(self.index, self.rtxn, &self.filter)?; + let mut universe = filtered_universe(self.index, self.rtxn, &self.filter)?; + + // we never want to receive the docid + universe.remove(self.id); + + let universe = universe; let embedder_index = self.index @@ -77,6 +98,8 @@ impl<'a> Similar<'a> { let mut documents_seen = RoaringBitmap::new(); documents_seen.insert(self.id); + let mut candidates = universe; + for (docid, distance) in results .into_iter() // skip documents we've already seen & mark that we saw the current document @@ -85,8 +108,6 @@ impl<'a> Similar<'a> { // take **after** filter and skip so that we get exactly limit elements if available .take(self.limit) { - documents_ids.push(docid); - let score = 1.0 - distance; let score = self .embedder @@ -94,14 +115,28 @@ impl<'a> Similar<'a> { .map(|distribution| distribution.shift(score)) .unwrap_or(score); - let score = ScoreDetails::Vector(score_details::Vector { similarity: Some(score) }); + let score_details = + vec![ScoreDetails::Vector(score_details::Vector { similarity: Some(score) })]; - document_scores.push(vec![score]); + let score = ScoreDetails::global_score(score_details.iter()); + + if let Some(ranking_score_threshold) = &self.ranking_score_threshold { + if score < *ranking_score_threshold { + // this document is no longer a candidate + candidates.remove(docid); + // any document after this one is no longer a candidate either, so restrict the set to documents already seen. 
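+ // (Sound because neighbours arrive ordered by ascending distance, so the
+ // shifted scores are non-increasing from this point on.)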
+ candidates &= documents_seen; + break; + } + } + + documents_ids.push(docid); + document_scores.push(score_details); } Ok(SearchResult { matching_words: Default::default(), - candidates: universe, + candidates, documents_ids, document_scores, degraded: false, diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap deleted file mode 100644 index 1d1d629e6..000000000 --- a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/index.rs ---- -age 1 | -id 2 | -name 2 | - diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap deleted file mode 100644 index 1d1d629e6..000000000 --- a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/index.rs ---- -age 1 | -id 2 | -name 2 | - diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 3490b55e4..9eca378a5 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -64,6 +64,13 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; + // Remove all user-provided bits from the configs + let mut configs = self.index.embedding_configs(self.wtxn)?; + for config in configs.iter_mut() { + config.user_provided.clear(); + } + self.index.put_embedding_configs(self.wtxn, configs)?; + // Clear the other databases. external_documents_ids.clear(self.wtxn)?; word_docids.clear(self.wtxn)?; diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 76ec90d65..36fa346a5 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -8,18 +8,19 @@ use std::sync::Arc; use bytemuck::cast_slice; use grenad::Writer; -use itertools::EitherOrBoth; use ordered_float::OrderedFloat; +use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; +use crate::index::IndexEmbeddingConfig; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; -use crate::update::index_documents::helpers::try_split_at; use crate::update::settings::InnerIndexSettingsDiff; -use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; +use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME}; +use crate::vector::settings::{EmbedderAction, ReindexAction}; use crate::vector::Embedder; -use crate::{DocumentId, Result, ThreadPoolNoAbort}; +use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. 
const TRUNCATE_SIZE: usize = size_of::(); @@ -35,6 +36,8 @@ pub struct ExtractedVectorPoints { // embedder pub embedder_name: String, pub embedder: Arc, + pub add_to_user_provided: RoaringBitmap, + pub remove_from_user_provided: RoaringBitmap, } enum VectorStateDelta { @@ -42,12 +45,7 @@ enum VectorStateDelta { // Remove all vectors, generated or manual, from this document NowRemoved, - // Add the manually specified vectors, passed in the other grenad - // Remove any previously generated vectors - // Note: changing the value of the manually specified vector **should not record** this delta - WasGeneratedNowManual(Vec>), - - ManualDelta(Vec>, Vec>), + NowManual(Vec>), // Add the vector computed from the specified prompt // Remove any previous vector @@ -56,14 +54,12 @@ enum VectorStateDelta { } impl VectorStateDelta { - fn into_values(self) -> (bool, String, (Vec>, Vec>)) { + fn into_values(self) -> (bool, String, Vec>) { match self { VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), - VectorStateDelta::WasGeneratedNowManual(add) => { - (true, Default::default(), (Default::default(), add)) - } - VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)), + // We always delete the previous vectors + VectorStateDelta::NowManual(add) => (true, Default::default(), add), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), } } @@ -74,12 +70,27 @@ struct EmbedderVectorExtractor { embedder: Arc, prompt: Arc, - // (docid, _index) -> KvWriterDelAdd -> Vector - manual_vectors_writer: Writer>, // (docid) -> (prompt) prompts_writer: Writer>, // (docid) -> () remove_vectors_writer: Writer>, + // (docid, _index) -> KvWriterDelAdd -> Vector + manual_vectors_writer: Writer>, + // The docids of the documents that contains a user defined embedding + add_to_user_provided: RoaringBitmap, + + action: ExtractionAction, +} + +struct DocumentOperation { + // The docids of the documents that contains an auto-generated embedding + remove_from_user_provided: RoaringBitmap, +} + +enum ExtractionAction { + SettingsFullReindex, + SettingsRegeneratePrompts { old_prompt: Arc }, + DocumentOperation(DocumentOperation), } /// Extracts the embedding vector contained in each document under the `_vectors` field. @@ -89,6 +100,7 @@ struct EmbedderVectorExtractor { pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, + embedders_configs: &[IndexEmbeddingConfig], settings_diff: &InnerIndexSettingsDiff, ) -> Result> { let reindex_vectors = settings_diff.reindex_vectors(); @@ -97,153 +109,207 @@ pub fn extract_vector_points( let new_fields_ids_map = &settings_diff.new.fields_ids_map; // the vector field id may have changed let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); - // filter the old vector fid if the settings has been changed forcing reindexing. 
- let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors); let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); let mut extractors = Vec::new(); - for (embedder_name, (embedder, prompt)) in - settings_diff.new.embedding_configs.clone().into_iter() - { - // (docid, _index) -> KvWriterDelAdd -> Vector - let manual_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); - // (docid) -> (prompt) - let prompts_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + let mut configs = settings_diff.new.embedding_configs.clone().into_inner(); + let old_configs = &settings_diff.old.embedding_configs; - // (docid) -> () - let remove_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + if reindex_vectors { + for (name, action) in settings_diff.embedding_config_updates.iter() { + match action { + EmbedderAction::WriteBackToDocuments(_) => continue, // already deleted + EmbedderAction::Reindex(action) => { + let Some((embedder_name, (embedder, prompt))) = configs.remove_entry(name) + else { + tracing::error!(embedder = name, "Requested embedder config not found"); + continue; + }; - extractors.push(EmbedderVectorExtractor { - embedder_name, - embedder, - prompt, - manual_vectors_writer, - prompts_writer, - remove_vectors_writer, - }); + // (docid, _index) -> KvWriterDelAdd -> Vector + let manual_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> (prompt) + let prompts_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> () + let remove_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + let action = match action { + ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex, + ReindexAction::RegeneratePrompts => { + let Some((_, old_prompt)) = old_configs.get(name) else { + tracing::error!(embedder = name, "Old embedder config not found"); + continue; + }; + + ExtractionAction::SettingsRegeneratePrompts { old_prompt } + } + }; + + extractors.push(EmbedderVectorExtractor { + embedder_name, + embedder, + prompt, + prompts_writer, + remove_vectors_writer, + manual_vectors_writer, + add_to_user_provided: RoaringBitmap::new(), + action, + }); + } + } + } + } else { + // document operation + + for (embedder_name, (embedder, prompt)) in configs.into_iter() { + // (docid, _index) -> KvWriterDelAdd -> Vector + let manual_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> (prompt) + let prompts_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> () + let remove_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + extractors.push(EmbedderVectorExtractor { + embedder_name, + embedder, + prompt, + prompts_writer, + remove_vectors_writer, + manual_vectors_writer, + add_to_user_provided: RoaringBitmap::new(), + action: ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided: RoaringBitmap::new(), + }), 
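+ // In this mode, `remove_from_user_provided` collects docids whose
+ // embeddings switch back to auto-generated so they can be cleared from
+ // the stored `user_provided` bitmap.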
+ }); + } } let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { // this must always be serialized as (docid, external_docid); + const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>(); let (docid_bytes, external_id_bytes) = - try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap(); + try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap(); debug_assert!(from_utf8(external_id_bytes).is_ok()); + let docid = DocumentId::from_be_bytes(docid_bytes); let obkv = obkv::KvReader::new(value); key_buffer.clear(); - key_buffer.extend_from_slice(docid_bytes); + key_buffer.extend_from_slice(docid_bytes.as_slice()); // since we only need the primary key when we throw an error we create this getter to // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; - let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid) - .map_err(|error| error.to_crate_error(document_id().to_string()))?; + let mut parsed_vectors = ParsedVectorsDiff::new( + docid, + embedders_configs, + obkv, + old_vectors_fid, + new_vectors_fid, + ) + .map_err(|error| error.to_crate_error(document_id().to_string()))?; for EmbedderVectorExtractor { embedder_name, embedder: _, prompt, - manual_vectors_writer, prompts_writer, remove_vectors_writer, + manual_vectors_writer, + add_to_user_provided, + action, } in extractors.iter_mut() { - let delta = match parsed_vectors.remove(embedder_name) { - (Some(old), Some(new)) => { - // no autogeneration - let del_vectors = old.into_array_of_vectors(); - let add_vectors = new.into_array_of_vectors(); - - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::ManualDelta(del_vectors, add_vectors) - } - (Some(_old), None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - if document_is_kept { - // becomes autogenerated - VectorStateDelta::NowGenerated(prompt.render( - obkv, - DelAdd::Addition, - new_fields_ids_map, - )?) - } else { - VectorStateDelta::NowRemoved - } - } - (None, Some(new)) => { - // was possibly autogenerated, remove all vectors for that document - let add_vectors = new.into_array_of_vectors(); - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::WasGeneratedNowManual(add_vectors) - } - (None, None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - - if document_is_kept { - // Don't give up if the old prompt was failing - let old_prompt = Some(&prompt) - // TODO: this filter works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed.
- .filter(|_| !settings_diff.reindex_vectors()) - .map(|p| { - p.render(obkv, DelAdd::Deletion, old_fields_ids_map) - .unwrap_or_default() - }); - let new_prompt = - prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt.as_ref() != Some(&new_prompt) { - let old_prompt = old_prompt.unwrap_or_default(); - tracing::trace!( - "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" - ); - VectorStateDelta::NowGenerated(new_prompt) - } else { - tracing::trace!("⏭️ Prompt unmodified, skipping"); - VectorStateDelta::NoChange + let (old, new) = parsed_vectors.remove(embedder_name); + let delta = match action { + ExtractionAction::SettingsFullReindex => match old { + // A full reindex can be triggered either by: + // 1. a new embedder + // 2. an existing embedder changed so that it must regenerate all generated embeddings. + // For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB + VectorState::Inline(vectors) => { + if !vectors.must_regenerate() { + add_to_user_provided.insert(docid); } + + match vectors.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError( + crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ), + )); + } + VectorStateDelta::NowManual(add_vectors) + } + None => VectorStateDelta::NoChange, + } + } + // this happens only when an existing embedder changed. We cannot regenerate userProvided vectors + VectorState::Manual => VectorStateDelta::NoChange, + // generated vectors must be regenerated + VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?, + }, + // prompt regeneration is only triggered for existing embedders + ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { + if old.must_regenerate() { + regenerate_if_prompt_changed( + obkv, + (old_prompt, prompt), + (old_fields_ids_map, new_fields_ids_map), + )? 
} else { - VectorStateDelta::NowRemoved + // we can simply ignore user provided vectors as they are not regenerated and are + // already in the DB since this is an existing embedder + VectorStateDelta::NoChange } } + ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided, + }) => extract_vector_document_diff( + docid, + obkv, + prompt, + (add_to_user_provided, remove_from_user_provided), + (old, new), + (old_fields_ids_map, new_fields_ids_map), + document_id, + )?, }; - // and we finally push the unique vectors into the writer push_vectors_diff( remove_vectors_writer, @@ -251,7 +317,6 @@ pub fn extract_vector_points( manual_vectors_writer, &mut key_buffer, delta, - reindex_vectors, )?; } } @@ -262,43 +327,185 @@ pub fn extract_vector_points( embedder_name, embedder, prompt: _, - manual_vectors_writer, prompts_writer, remove_vectors_writer, + action, + manual_vectors_writer, + add_to_user_provided, } in extractors { - results.push(ExtractedVectorPoints { - // docid, _index -> KvWriterDelAdd -> Vector - manual_vectors: writer_into_reader(manual_vectors_writer)?, - // docid -> () - remove_vectors: writer_into_reader(remove_vectors_writer)?, - // docid -> prompt - prompts: writer_into_reader(prompts_writer)?, + let remove_from_user_provided = + if let ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided, + }) = action + { + remove_from_user_provided + } else { + Default::default() + }; + results.push(ExtractedVectorPoints { + manual_vectors: writer_into_reader(manual_vectors_writer)?, + remove_vectors: writer_into_reader(remove_vectors_writer)?, + prompts: writer_into_reader(prompts_writer)?, embedder, embedder_name, + add_to_user_provided, + remove_from_user_provided, }) } Ok(results) } -/// Computes the diff between both Del and Add numbers and -/// only inserts the parts that differ in the sorter. +fn extract_vector_document_diff( + docid: DocumentId, + obkv: obkv::KvReader<'_, FieldId>, + prompt: &Prompt, + (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), + (old, new): (VectorState, VectorState), + (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), + document_id: impl Fn() -> Value, +) -> Result<VectorStateDelta> { + match (old.must_regenerate(), new.must_regenerate()) { + (true, true) | (false, false) => {} + (true, false) => { + add_to_user_provided.insert(docid); + } + (false, true) => { + remove_from_user_provided.insert(docid); + } + } + + let delta = match (old, new) { + // regardless of the previous state, if a document now contains inline _vectors, they must + // be extracted manually + (_old, VectorState::Inline(new)) => match new.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); + } + + VectorStateDelta::NowManual(add_vectors) + } + None => VectorStateDelta::NoChange, + }, + // no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the + // document changed + (VectorState::Generated, VectorState::Generated) => { + // Do we keep this document?
+ let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + + if document_is_kept { + // Don't give up if the old prompt was failing + let old_prompt = Some(&prompt).map(|p| { + p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default() + }); + let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); + tracing::trace!( + "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" + ); + VectorStateDelta::NowGenerated(new_prompt) + } else { + tracing::trace!("⏭️ Prompt unmodified, skipping"); + VectorStateDelta::NoChange + } + } else { + VectorStateDelta::NowRemoved + } + } + // inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from + // the previous version of the document. + // Manual -> Generated is also not possible without an Inline to the right (which is handled above) + // Generated -> Generated is handled above, so not possible + // As a result, this code is unreachable + (_not_generated, VectorState::Generated) => { + // Do we keep this document? + let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + if document_is_kept { + // becomes autogenerated + VectorStateDelta::NowGenerated(prompt.render( + obkv, + DelAdd::Addition, + new_fields_ids_map, + )?) + } else { + // make sure the document is always removed from user provided on removal + remove_from_user_provided.insert(docid); + VectorStateDelta::NowRemoved + } + } + // inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous + // version of the document. + // however the Rust type system cannot know that. + (_manual, VectorState::Manual) => { + // Do we keep this document? + let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + if document_is_kept { + // if the new version of documents has the vectors in the DB, + // then they are user-provided and nothing possibly changed + VectorStateDelta::NoChange + } else { + // make sure the document is always removed from user provided on removal + remove_from_user_provided.insert(docid); + VectorStateDelta::NowRemoved + } + } + }; + + Ok(delta) +} + +fn regenerate_if_prompt_changed( + obkv: obkv::KvReader<'_, FieldId>, + (old_prompt, new_prompt): (&Prompt, &Prompt), + (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), +) -> Result<VectorStateDelta> { + let old_prompt = + old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default()); + let new_prompt = new_prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + + if new_prompt == old_prompt { + return Ok(VectorStateDelta::NoChange); + } + Ok(VectorStateDelta::NowGenerated(new_prompt)) +} + +fn regenerate_prompt( + obkv: obkv::KvReader<'_, FieldId>, + prompt: &Prompt, + new_fields_ids_map: &FieldsIdsMap, +) -> Result<VectorStateDelta> { + let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + + Ok(VectorStateDelta::NowGenerated(prompt)) +} + +/// We cannot compute the diff between both Del and Add vectors. +/// We'll push every vector and compute the difference later in TypedChunk.
fn push_vectors_diff( remove_vectors_writer: &mut Writer<BufWriter<File>>, prompts_writer: &mut Writer<BufWriter<File>>, manual_vectors_writer: &mut Writer<BufWriter<File>>, key_buffer: &mut Vec<u8>, delta: VectorStateDelta, - reindex_vectors: bool, ) -> Result<()> { - let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); - if must_remove - // TODO: the below condition works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. - && !reindex_vectors - { + let (must_remove, prompt, mut add_vectors) = delta.into_values(); + if must_remove { key_buffer.truncate(TRUNCATE_SIZE); remove_vectors_writer.insert(&key_buffer, [])?; } @@ -308,44 +515,22 @@ fn push_vectors_diff( } // We sort and dedup the vectors - del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); - del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); - let merged_vectors_iter = - itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); // insert vectors into the writer - for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { + for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { // Generate the key by extending the unique index to it. key_buffer.truncate(TRUNCATE_SIZE); let index = u16::try_from(i).unwrap(); key_buffer.extend_from_slice(&index.to_be_bytes()); - match eob { - EitherOrBoth::Both(_, _) => (), // no need to touch anything - EitherOrBoth::Left(vector) => { - // TODO: the below condition works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. - if !reindex_vectors { - // We insert only the Del part of the Obkv to inform - // that we only want to remove all those vectors. - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; - } - } - EitherOrBoth::Right(vector) => { - // We insert only the Add part of the Obkv to inform - // that we only want to remove all those vectors. - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Addition, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; - } - } + // We insert only the Add part of the Obkv to inform + // that we only want to add all those vectors.
+ let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + manual_vectors_writer.insert(&key_buffer, bytes)?; } Ok(()) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 18340a3ae..2feb85414 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -11,7 +11,7 @@ mod extract_word_position_docids; use std::fs::File; use std::io::BufReader; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use crossbeam_channel::Sender; use rayon::prelude::*; @@ -30,8 +30,9 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; +use crate::index::IndexEmbeddingConfig; use crate::update::settings::InnerIndexSettingsDiff; -use crate::{FieldId, Result, ThreadPoolNoAbortBuilder}; +use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; /// Extract data for each databases from obkv documents in parallel. /// Send data in grenad file over provided Sender. @@ -43,6 +44,7 @@ pub(crate) fn data_from_obkv_documents( indexer: GrenadParameters, lmdb_writer_sx: Sender<Result<TypedChunk>>, primary_key_id: FieldId, + embedders_configs: Arc<Vec<IndexEmbeddingConfig>>, settings_diff: Arc<InnerIndexSettingsDiff>, max_positions_per_attributes: Option<u32>, ) -> Result<()> { @@ -55,6 +57,7 @@ pub(crate) fn data_from_obkv_documents( original_documents_chunk, indexer, lmdb_writer_sx.clone(), + embedders_configs.clone(), settings_diff.clone(), ) }) @@ -204,33 +207,47 @@ fn run_extraction_task( }) } +fn request_threads() -> &'static ThreadPoolNoAbort { + static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new(); + + REQUEST_THREADS.get_or_init(|| { + ThreadPoolNoAbortBuilder::new() + .num_threads(crate::vector::REQUEST_PARALLELISM) + .thread_name(|index| format!("embedding-request-{index}")) + .build() + .unwrap() + }) +} + /// Extract chunked data and send it into lmdb_writer_sx sender: /// - documents fn send_original_documents_data( original_documents_chunk: Result<grenad::Reader<BufReader<File>>>, indexer: GrenadParameters, lmdb_writer_sx: Sender<Result<TypedChunk>>, + embedders_configs: Arc<Vec<IndexEmbeddingConfig>>, settings_diff: Arc<InnerIndexSettingsDiff>, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; - let request_threads = ThreadPoolNoAbortBuilder::new() - .num_threads(crate::vector::REQUEST_PARALLELISM) - .thread_name(|index| format!("embedding-request-{index}")) - .build()?; - let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only()) // no point in indexing vectors without embedders && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty()); if index_vectors { let settings_diff = settings_diff.clone(); + let embedders_configs = embedders_configs.clone(); let original_documents_chunk = original_documents_chunk.clone(); let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { - match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) { + match extract_vector_points( + original_documents_chunk.clone(), + indexer, + &embedders_configs, + &settings_diff, + ) { Ok(extracted_vectors) => { for ExtractedVectorPoints { manual_vectors, @@ -238,13 +255,15 @@ fn send_original_documents_data( prompts, embedder_name, embedder, + add_to_user_provided, + remove_from_user_provided, } in extracted_vectors { let embeddings
= match extract_embeddings( prompts, indexer, embedder.clone(), - &request_threads, + request_threads(), ) { Ok(results) => Some(results), Err(error) => { @@ -262,6 +281,8 @@ fn send_original_documents_data( expected_dimension: embedder.dimensions(), manual_vectors, embedder_name, + add_to_user_provided, + remove_from_user_provided, })); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2420463b4..089b56025 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -286,6 +286,7 @@ where settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let settings_diff = Arc::new(settings_diff); + let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?); let backup_pool; let pool = match self.indexer_config.thread_pool { @@ -399,6 +400,7 @@ where pool_params, lmdb_writer_sx.clone(), primary_key_id, + embedders_configs.clone(), settings_diff_cloned, max_positions_per_attributes, ) @@ -501,6 +503,8 @@ where embeddings, manual_vectors, embedder_name, + add_to_user_provided, + remove_from_user_provided, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { @@ -509,6 +513,8 @@ where expected_dimension, manual_vectors, embedder_name, + add_to_user_provided, + remove_from_user_provided, } } otherwise => otherwise, @@ -541,10 +547,11 @@ where pool.install(|| { for k in crate::vector::arroy_db_range_for_embedder(embedder_index) { let writer = arroy::Writer::new(vector_arroy, k, dimension); - if writer.is_empty(wtxn)? { + if writer.need_build(wtxn)? { + writer.build(wtxn, &mut rng, None)?; + } else if writer.is_empty(wtxn)? { break; } - writer.build(wtxn, &mut rng, None)?; } Result::Ok(()) }) @@ -781,6 +788,7 @@ mod tests { use super::*; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; + use crate::index::IndexEmbeddingConfig; use crate::search::TermsMatchingStrategy; use crate::update::Setting; use crate::{db_snap, Filter, Search}; @@ -2616,10 +2624,12 @@ mod tests { let rtxn = index.read_txn().unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - let (embedder_name, embedder) = embedding_configs.pop().unwrap(); + let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } = + embedding_configs.pop().unwrap(); + insta::assert_snapshot!(embedder_name, @"manual"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>"); let embedder = std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); - assert_eq!("manual", embedder_name); let res = index .search(&rtxn) .semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec())) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 876663c92..1dff29a90 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; @@ -27,6 +27,8 @@ use crate::update::del_add::{ use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use 
crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; +use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; use crate::{ is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, }; @@ -806,13 +808,13 @@ impl<'a, 'i> Transform<'a, 'i> { let mut new_inner_settings = old_inner_settings.clone(); new_inner_settings.fields_ids_map = fields_ids_map; - let embedding_configs_updated = false; + let embedding_config_updates = Default::default(); let settings_update_only = false; let settings_diff = InnerIndexSettingsDiff::new( old_inner_settings, new_inner_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, ); @@ -833,10 +835,13 @@ impl<'a, 'i> Transform<'a, 'i> { /// Rebind the field_ids of the provided document to their values /// based on the field_ids_maps difference between the old and the new settings, /// then fill the provided buffers with delta documents using KvWritterDelAdd. + #[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo fn rebind_existing_document( old_obkv: KvReader<FieldId>, settings_diff: &InnerIndexSettingsDiff, modified_faceted_fields: &HashSet<String>, + mut injected_vectors: serde_json::Map<String, serde_json::Value>, + old_vectors_fid: Option<FieldId>, original_obkv_buffer: Option<&mut Vec<u8>>, flattened_obkv_buffer: Option<&mut Vec<u8>>, ) -> Result<()> { @@ -859,9 +864,49 @@ impl<'a, 'i> Transform<'a, 'i> { // The operations that we must perform on the different fields. let mut operations = HashMap::new(); + let mut error_seen = false; let mut obkv_writer = KvWriter::<_, FieldId>::memory(); - for (id, val) in old_obkv.iter() { + 'write_fid: for (id, val) in old_obkv.iter() { + if !injected_vectors.is_empty() { + 'inject_vectors: { + let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; + + if id < vectors_fid { + break 'inject_vectors; + } + + let mut existing_vectors = if id == vectors_fid { + let existing_vectors: std::result::Result< + serde_json::Map<String, serde_json::Value>, + serde_json::Error, + > = serde_json::from_slice(val); + + match existing_vectors { + Ok(existing_vectors) => existing_vectors, + Err(error) => { + if !error_seen { + tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map"); + error_seen = true; + } + Default::default() + } + } + } else { + Default::default() + }; + + existing_vectors.append(&mut injected_vectors); + + operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition); + obkv_writer + .insert(vectors_fid, serde_json::to_vec(&existing_vectors).unwrap())?; + if id == vectors_fid { + continue 'write_fid; + } + } + } + if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { operations.insert(id, DelAddOperation::DeletionAndAddition); obkv_writer.insert(id, val)?; @@ -870,6 +915,15 @@ impl<'a, 'i> Transform<'a, 'i> { obkv_writer.insert(id, val)?; } } + if !injected_vectors.is_empty() { + 'inject_vectors: { + let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; + + operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition); + obkv_writer.insert(vectors_fid, serde_json::to_vec(&injected_vectors).unwrap())?; + } + } + let data = obkv_writer.into_inner()?; let obkv = KvReader::<FieldId>::new(&data); @@ -935,6 +989,35 @@ impl<'a, 'i> Transform<'a, 'i> { None }; + let readers: Result< + BTreeMap<&str, (Vec<arroy::Reader<'_, arroy::distances::Angular>>, &RoaringBitmap)>, + > = settings_diff + .embedding_config_updates + .iter() + .filter_map(|(name, action)| { + if let EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }) = action + { + let readers: Result<Vec<_>> = + self.index.arroy_readers(wtxn, *embedder_id).collect(); + match readers { + Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))), + Err(error) => Some(Err(error)), + } + } else { + None + } + }) + .collect(); + let readers = readers?; + + let old_vectors_fid = settings_diff + .old + .fields_ids_map + .id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + // We initialize the sorter with the user indexing settings.
let mut flattened_sorter = if settings_diff.reindex_searchable() || settings_diff.reindex_facets() { @@ -961,10 +1044,50 @@ InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, )?; + let injected_vectors: std::result::Result< + serde_json::Map<String, serde_json::Value>, + arroy::Error, + > = readers + .iter() + .filter_map(|(name, (readers, user_provided))| { + if !user_provided.contains(docid) { + return None; + } + let mut vectors = Vec::new(); + for reader in readers { + let Some(vector) = reader.item_vector(wtxn, docid).transpose() else { + break; + }; + + match vector { + Ok(vector) => vectors.push(vector), + Err(error) => return Some(Err(error)), + } + } + if vectors.is_empty() { + return None; + } + Some(Ok(( + name.to_string(), + serde_json::to_value(ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + vectors, + )), + regenerate: false, + }) + .unwrap(), + ))) + }) + .collect(); + + let injected_vectors = injected_vectors?; + Self::rebind_existing_document( old_obkv, &settings_diff, &modified_faceted_fields, + injected_vectors, + old_vectors_fid, Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()), )?; @@ -981,6 +1104,23 @@ impl<'a, 'i> Transform<'a, 'i> { } } + let mut writers = Vec::new(); + + // delete all vectors from the embedders that need removal + for (_, (readers, _)) in readers { + for reader in readers { + let dimensions = reader.dimensions(); + let arroy_index = reader.index(); + drop(reader); + let writer = arroy::Writer::new(self.index.vector_arroy, arroy_index, dimensions); + writers.push(writer); + } + } + + for writer in writers { + writer.clear(wtxn)?; + } + let grenad_params = GrenadParameters { chunk_compression_type: self.indexer_settings.chunk_compression_type, chunk_compression_level: self.indexer_settings.chunk_compression_level, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2fbe91685..4737c6b42 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -20,6 +20,7 @@ use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; +use crate::index::IndexEmbeddingConfig; use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; @@ -90,6 +91,8 @@ pub(crate) enum TypedChunk { expected_dimension: usize, manual_vectors: grenad::Reader<BufReader<File>>, embedder_name: String, + add_to_user_provided: RoaringBitmap, + remove_from_user_provided: RoaringBitmap, }, ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } @@ -154,8 +157,11 @@ pub(crate) fn write_typed_chunk_into_index( let mut docids = index.documents_ids(wtxn)?; let mut iter = merger.into_stream_merger_iter()?; - let embedders: BTreeSet<_> = - index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect(); + let embedders: BTreeSet<_> = index + .embedding_configs(wtxn)? + .into_iter() + .map(|IndexEmbeddingConfig { name, .. }| name) + .collect(); + let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()?
{ let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); @@ -181,7 +187,7 @@ pub(crate) fn write_typed_chunk_into_index( // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is break 'vectors Some(addition); }; - vectors.retain_user_provided_vectors(&embedders); + vectors.retain_not_embedded_vectors(&embedders); let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; if vectors.is_empty() { // skip writing empty `_vectors` map @@ -619,6 +625,8 @@ pub(crate) fn write_typed_chunk_into_index( let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); + let mut add_to_user_provided = RoaringBitmap::new(); + let mut remove_from_user_provided = RoaringBitmap::new(); let mut params = None; for typed_chunk in typed_chunks { let TypedChunk::VectorPoints { @@ -627,6 +635,8 @@ pub(crate) fn write_typed_chunk_into_index( embeddings, expected_dimension, embedder_name, + add_to_user_provided: aud, + remove_from_user_provided: rud, } = typed_chunk else { unreachable!(); @@ -639,11 +649,23 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(embeddings) = embeddings { embeddings_builder.push(embeddings.into_cursor()?); } + add_to_user_provided |= aud; + remove_from_user_provided |= rud; } // typed chunks has always at least 1 chunk. let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; + let mut embedding_configs = index.embedding_configs(wtxn)?; + let index_embedder_config = embedding_configs + .iter_mut() + .find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name) + .unwrap(); + index_embedder_config.user_provided -= remove_from_user_provided; + index_embedder_config.user_provided |= add_to_user_provided; + + index.put_embedding_configs(wtxn, embedding_configs)?; + let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, )?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index be9b6b74e..b792cde52 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; use itertools::{EitherOrBoth, Itertools}; +use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; @@ -14,12 +15,18 @@ use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; -use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; +use crate::index::{ + IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, +}; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; -use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; +use crate::vector::settings::{ + check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction, + WriteBackToDocuments, +}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{FieldId, FieldsIdsMap, Index, Result}; 
@@ -490,6 +497,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, &names, + &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME), &fields_ids_map, )?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -919,92 +927,177 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(changed) } - fn update_embedding_configs(&mut self) -> Result<bool> { - let update = match std::mem::take(&mut self.embedder_settings) { - Setting::Set(configs) => { - let mut changed = false; + fn update_embedding_configs(&mut self) -> Result<BTreeMap<String, EmbedderAction>> { + match std::mem::take(&mut self.embedder_settings) { + Setting::Set(configs) => self.update_embedding_configs_set(configs), + Setting::Reset => { + // all vectors should be written back to documents let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap<String, Setting<EmbeddingSettings>> = - old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect(); - - let mut new_configs = BTreeMap::new(); - for joined in old_configs + let remove_all: Result<BTreeMap<String, EmbedderAction>> = old_configs .into_iter() - .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) - { - match joined { - // updated config - EitherOrBoth::Both((name, mut old), (_, new)) => { - changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); - if changed { - tracing::debug!(embedder = name, "need reindex"); - } else { - tracing::debug!(embedder = name, "skip reindex"); - } - let new = validate_embedding_settings(old, &name)?; - new_configs.insert(name, new); - } - // unchanged config - EitherOrBoth::Left((name, setting)) => { - new_configs.insert(name, setting); - } - // new config - EitherOrBoth::Right((name, mut setting)) => { - // apply the default source in case the source was not set so that it gets validated - crate::vector::settings::EmbeddingSettings::apply_default_source( - &mut setting, - ); - crate::vector::settings::EmbeddingSettings::apply_default_openai_model( - &mut setting, - ); - let setting = validate_embedding_settings(setting, &name)?; - changed = true; - new_configs.insert(name, setting); - } - } - } - let new_configs: Vec<(String, EmbeddingConfig)> = new_configs - .into_iter() - .filter_map(|(name, setting)| match setting { - Setting::Set(value) => Some((name, value.into())), - Setting::Reset => None, - Setting::NotSet => Some((name, EmbeddingSettings::default().into())), + .map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> { + let embedder_id = + self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + Ok(( + name, + EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }), + )) }) .collect(); + let remove_all = remove_all?; + self.index.embedder_category_id.clear(self.wtxn)?; - for (index, (embedder_name, _)) in new_configs.iter().enumerate() { - self.index.embedder_category_id.put_with_flags( - self.wtxn, - heed::PutFlags::APPEND, - embedder_name, - &index - .try_into() - .map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?, - )?; - } - - if new_configs.is_empty() { - self.index.delete_embedding_configs(self.wtxn)?; - } else { - self.index.put_embedding_configs(self.wtxn, new_configs)?; - } - changed - } - Setting::Reset => { self.index.delete_embedding_configs(self.wtxn)?; - true + Ok(remove_all) } - Setting::NotSet => false, - }; - - // if any changes force a reindexing - // clear the vector database. - if update { - self.index.vector_arroy.clear(self.wtxn)?; + Setting::NotSet => Ok(Default::default()), } + } - Ok(update) + fn update_embedding_configs_set( + &mut self, + configs: BTreeMap<String, Setting<EmbeddingSettings>>, + ) -> Result<BTreeMap<String, EmbedderAction>> { + use crate::vector::settings::SettingsDiff; + + let old_configs = self.index.embedding_configs(self.wtxn)?; + let old_configs: BTreeMap<String, (EmbeddingSettings, RoaringBitmap)> = old_configs + .into_iter() + .map(|IndexEmbeddingConfig { name, config, user_provided }| { + (name, (config.into(), user_provided)) + }) + .collect(); + let mut updated_configs = BTreeMap::new(); + let mut embedder_actions = BTreeMap::new(); + for joined in old_configs + .into_iter() + .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) + { + match joined { + // updated config + EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => { + let settings_diff = SettingsDiff::from_settings(old, new); + match settings_diff { + SettingsDiff::Remove => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + "removing embedder" + ); + let embedder_id = + self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + // free id immediately + self.index.embedder_category_id.delete(self.wtxn, &name)?; + embedder_actions.insert( + name, + EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }), + ); + } + SettingsDiff::Reindex { action, updated_settings } => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + ?action, + "reindex embedder" + ); + embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action)); + let new = + validate_embedding_settings(Setting::Set(updated_settings), &name)?; + updated_configs.insert(name, (new, user_provided)); + } + SettingsDiff::UpdateWithoutReindex { updated_settings } => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + "update without reindex embedder" + ); + let new = + validate_embedding_settings(Setting::Set(updated_settings), &name)?; + updated_configs.insert(name, (new, user_provided)); + } + } + } + // unchanged config + EitherOrBoth::Left((name, (setting, user_provided))) => { + tracing::debug!(embedder = name, "unchanged embedder"); + updated_configs.insert(name, (Setting::Set(setting), user_provided)); + } + // new config + EitherOrBoth::Right((name, mut setting)) => { + tracing::debug!(embedder = name, "new embedder"); + // apply the default source in case the source was not set so that it gets validated + crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting); + crate::vector::settings::EmbeddingSettings::apply_default_openai_model( + &mut setting, + ); + let setting = validate_embedding_settings(setting, &name)?; + embedder_actions + .insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex)); + updated_configs.insert(name, (setting, RoaringBitmap::new())); + } + } + } + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + for res in self.index.embedder_category_id.iter(self.wtxn)? { + let (_name, id) = res?; + free_indices[id as usize] = false; + } + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + for (name, action) in embedder_actions.iter() { + match action { + EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => { + /* cannot be a new embedder, so has to have an id already */ + } + EmbedderAction::Reindex(ReindexAction::FullReindex) => { + if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() { + let id = find_free_index() + .ok_or(UserError::TooManyEmbedders(updated_configs.len()))?; + tracing::debug!(embedder = name, id, "assigning free id to new embedder"); + self.index.embedder_category_id.put(self.wtxn, name, &id)?; + } + } + EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ } + } + } + let updated_configs: Vec<IndexEmbeddingConfig> = updated_configs + .into_iter() + .filter_map(|(name, (config, user_provided))| match config { + Setting::Set(config) => { + Some(IndexEmbeddingConfig { name, config: config.into(), user_provided }) + } + Setting::Reset => None, + Setting::NotSet => Some(IndexEmbeddingConfig { + name, + config: EmbeddingSettings::default().into(), + user_provided, + }), + }) + .collect(); + if updated_configs.is_empty() { + self.index.delete_embedding_configs(self.wtxn)?; + } else { + self.index.put_embedding_configs(self.wtxn, updated_configs)?; + } + Ok(embedder_actions) } fn update_search_cutoff(&mut self) -> Result<bool> { @@ -1058,13 +1151,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_searchable()?; self.update_exact_attributes()?; self.update_proximity_precision()?; - // TODO: very rough approximation of the needs for reindexing where any change will result in - // a full reindexing. - // What can be done instead: - // 1. Only change the distance on a distance change - // 2. Only change the name -> embedder mapping on a name change - // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage - let embedding_configs_updated = self.update_embedding_configs()?; + + let embedding_config_updates = self.update_embedding_configs()?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; new_inner_settings.recompute_facets(self.wtxn, self.index)?; @@ -1078,7 +1166,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { old_inner_settings, new_inner_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, ); @@ -1094,8 +1182,7 @@ pub struct InnerIndexSettingsDiff { pub(crate) old: InnerIndexSettings, pub(crate) new: InnerIndexSettings, pub(crate) primary_key_id: Option<FieldId>, - // TODO: compare directly the embedders. - pub(crate) embedding_configs_updated: bool, + pub(crate) embedding_config_updates: BTreeMap<String, EmbedderAction>, pub(crate) settings_update_only: bool, /// The set of only the additional searchable fields. /// If any other searchable field has been modified, is set to None.
@@ -1116,7 +1203,7 @@ impl InnerIndexSettingsDiff { old_settings: InnerIndexSettings, new_settings: InnerIndexSettings, primary_key_id: Option, - embedding_configs_updated: bool, + embedding_config_updates: BTreeMap, settings_update_only: bool, ) -> Self { let only_additional_fields = match ( @@ -1153,7 +1240,7 @@ impl InnerIndexSettingsDiff { old: old_settings, new: new_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, only_additional_fields, cache_reindex_searchable_without_user_defined, @@ -1220,7 +1307,7 @@ impl InnerIndexSettingsDiff { } pub fn reindex_vectors(&self) -> bool { - self.embedding_configs_updated + !self.embedding_config_updates.is_empty() } pub fn settings_update_only(&self) -> bool { @@ -1252,6 +1339,8 @@ pub(crate) struct InnerIndexSettings { pub embedding_configs: EmbeddingConfigs, pub existing_fields: HashSet, pub geo_fields_ids: Option<(FieldId, FieldId)>, + pub non_searchable_fields_ids: Vec, + pub non_faceted_fields_ids: Vec, } impl InnerIndexSettings { @@ -1265,8 +1354,8 @@ impl InnerIndexSettings { let user_defined_searchable_fields = user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; - let searchable_fields_ids = index.searchable_fields_ids(rtxn)?; - let faceted_fields_ids = index.faceted_fields_ids(rtxn)?; + let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?; + let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = embedders(index.embedding_configs(rtxn)?)?; @@ -1294,6 +1383,10 @@ impl InnerIndexSettings { None => None, }; + let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); + searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); + faceted_fields_ids.retain(|id| !vectors_fids.contains(id)); + Ok(Self { stop_words, allowed_separators, @@ -1308,6 +1401,8 @@ impl InnerIndexSettings { embedding_configs, existing_fields, geo_fields_ids, + non_searchable_fields_ids: vectors_fids.clone(), + non_faceted_fields_ids: vectors_fids.clone(), }) } @@ -1315,9 +1410,10 @@ impl InnerIndexSettings { pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { let new_facets = self .fields_ids_map - .names() - .filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields)) - .map(|field| field.to_string()) + .iter() + .filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid)) + .filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields)) + .map(|(_fid, field)| field.to_string()) .collect(); index.put_faceted_fields(wtxn, &new_facets)?; @@ -1337,6 +1433,7 @@ impl InnerIndexSettings { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, + &self.non_searchable_fields_ids, &self.fields_ids_map, )?; } @@ -1347,19 +1444,25 @@ impl InnerIndexSettings { } } -fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result { +fn embedders(embedding_configs: Vec) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, EmbeddingConfig { embedder_options, prompt })| { - let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); + .map( + |IndexEmbeddingConfig { + name, + config: EmbeddingConfig { embedder_options, prompt }, + .. 
+ }| { + let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(crate::vector::Error::from) - .map_err(crate::Error::from)?, - ); - Ok((name, (embedder, prompt))) - }) + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(crate::vector::Error::from) + .map_err(crate::Error::from)?, + ); + Ok((name, (embedder, prompt))) + }, + ) .collect(); res.map(EmbeddingConfigs::new) } diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 553c8c3c1..c43fa8bd2 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -152,6 +152,10 @@ impl EmbeddingConfigs { &self.0 } + pub fn into_inner(self) -> HashMap, Arc)> { + self.0 + } + /// Get the name of the default embedder configuration. /// /// The default embedder is determined as follows: diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 2c61baa9e..f555b39ae 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -1,51 +1,119 @@ use std::collections::{BTreeMap, BTreeSet}; +use deserr::{take_cf_content, DeserializeError, Deserr, Sequence}; use obkv::KvReader; use serde_json::{from_slice, Value}; use super::Embedding; +use crate::index::IndexEmbeddingConfig; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; -use crate::{FieldId, InternalError, UserError}; +use crate::{DocumentId, FieldId, InternalError, UserError}; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; -#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Serialize, Debug)] #[serde(untagged)] pub enum Vectors { ImplicitlyUserProvided(VectorOrArrayOfVectors), Explicit(ExplicitVectors), } +impl Deserr for Vectors { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + match value { + deserr::Value::Sequence(_) | deserr::Value::Null => { + Ok(Vectors::ImplicitlyUserProvided(VectorOrArrayOfVectors::deserialize_from_value( + value, location, + )?)) + } + deserr::Value::Map(_) => { + Ok(Vectors::Explicit(ExplicitVectors::deserialize_from_value(value, location)?)) + } + + value => Err(take_cf_content(E::error( + None, + deserr::ErrorKind::IncorrectValueKind { + actual: value, + accepted: &[ + deserr::ValueKind::Sequence, + deserr::ValueKind::Map, + deserr::ValueKind::Null, + ], + }, + location, + ))), + } + } +} + impl Vectors { - pub fn into_array_of_vectors(self) -> Vec { + pub fn must_regenerate(&self) -> bool { match self { - Vectors::ImplicitlyUserProvided(embeddings) - | Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => { - embeddings.into_array_of_vectors().unwrap_or_default() + Vectors::ImplicitlyUserProvided(_) => false, + Vectors::Explicit(ExplicitVectors { regenerate, .. 
}) => *regenerate, + } + } + + pub fn into_array_of_vectors(self) -> Option> { + match self { + Vectors::ImplicitlyUserProvided(embeddings) => { + Some(embeddings.into_array_of_vectors().unwrap_or_default()) + } + Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => { + embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default()) } } } } -#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Serialize, Deserr, Debug)] #[serde(rename_all = "camelCase")] pub struct ExplicitVectors { - pub embeddings: VectorOrArrayOfVectors, - pub user_provided: bool, + #[serde(default)] + #[deserr(default)] + pub embeddings: Option, + pub regenerate: bool, +} + +pub enum VectorState { + Inline(Vectors), + Manual, + Generated, +} + +impl VectorState { + pub fn must_regenerate(&self) -> bool { + match self { + VectorState::Inline(vectors) => vectors.must_regenerate(), + VectorState::Manual => false, + VectorState::Generated => true, + } + } +} + +pub enum VectorsState { + NoVectorsFid, + NoVectorsFieldInDocument, + Vectors(BTreeMap), } pub struct ParsedVectorsDiff { - pub old: Option>, - pub new: Option>, + old: BTreeMap, + new: VectorsState, } impl ParsedVectorsDiff { pub fn new( + docid: DocumentId, + embedders_configs: &[IndexEmbeddingConfig], documents_diff: KvReader<'_, FieldId>, old_vectors_fid: Option, new_vectors_fid: Option, ) -> Result { - let old = match old_vectors_fid + let mut old = match old_vectors_fid .and_then(|vectors_fid| documents_diff.get(vectors_fid)) .map(KvReaderDelAdd::new) .map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) @@ -61,48 +129,84 @@ impl ParsedVectorsDiff { return Err(error); } } - .flatten(); - let new = new_vectors_fid - .and_then(|vectors_fid| documents_diff.get(vectors_fid)) - .map(KvReaderDelAdd::new) - .map(|obkv| to_vector_map(obkv, DelAdd::Addition)) - .transpose()? - .flatten(); + .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); + for embedding_config in embedders_configs { + if embedding_config.user_provided.contains(docid) { + old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual); + } + } + + let new = 'new: { + let Some(new_vectors_fid) = new_vectors_fid else { + break 'new VectorsState::NoVectorsFid; + }; + let Some(bytes) = documents_diff.get(new_vectors_fid) else { + break 'new VectorsState::NoVectorsFieldInDocument; + }; + let obkv = KvReaderDelAdd::new(bytes); + match to_vector_map(obkv, DelAdd::Addition)? 
{ + Some(new) => VectorsState::Vectors(new), + None => VectorsState::NoVectorsFieldInDocument, + } + }; + Ok(Self { old, new }) } - pub fn remove(&mut self, embedder_name: &str) -> (Option, Option) { - let old = self.old.as_mut().and_then(|old| old.remove(embedder_name)); - let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); + pub fn remove(&mut self, embedder_name: &str) -> (VectorState, VectorState) { + let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated); + let state_from_old = match old { + // assume a userProvided is still userProvided + VectorState::Manual => VectorState::Manual, + // generated is still generated + VectorState::Generated => VectorState::Generated, + // weird case that shouldn't happen were the previous docs version is inline, + // but it was removed in the new version + // Since it is not in the new version, we switch to generated + VectorState::Inline(_) => VectorState::Generated, + }; + let new = match &mut self.new { + VectorsState::Vectors(new) => { + new.remove(embedder_name).map(VectorState::Inline).unwrap_or(state_from_old) + } + _ => + // if no `_vectors` field is present in the new document, + // the state depends on the previous version of the document + { + state_from_old + } + }; + (old, new) } } pub struct ParsedVectors(pub BTreeMap); -impl ParsedVectors { - pub fn from_bytes(value: &[u8]) -> Result { - let Ok(value) = from_slice(value) else { - let value = from_slice(value).map_err(Error::InternalSerdeJson)?; - return Err(Error::InvalidMap(value)); - }; +impl Deserr for ParsedVectors { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + let value = >::deserialize_from_value(value, location)?; Ok(ParsedVectors(value)) } +} - pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet) { - self.0.retain(|k, v| match v { - Vectors::ImplicitlyUserProvided(_) => true, - Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => { - *user_provided - // if the embedder is not in the config, then never touch it - || !embedders.contains(k) - } - }); +impl ParsedVectors { + pub fn from_bytes(value: &[u8]) -> Result { + let value: serde_json::Value = from_slice(value).map_err(Error::InternalSerdeJson)?; + deserr::deserialize(value).map_err(|error| Error::InvalidEmbedderConf { error }) + } + + pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet) { + self.0.retain(|k, _v| !embedders.contains(k)) } } pub enum Error { InvalidMap(Value), + InvalidEmbedderConf { error: deserr::errors::JsonError }, InternalSerdeJson(serde_json::Error), } @@ -112,6 +216,12 @@ impl Error { Error::InvalidMap(value) => { crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value }) } + Error::InvalidEmbedderConf { error } => { + crate::Error::UserError(UserError::InvalidVectorsEmbedderConf { + document_id, + error, + }) + } Error::InternalSerdeJson(error) => { crate::Error::InternalError(InternalError::SerdeJson(error)) } @@ -132,13 +242,84 @@ fn to_vector_map( } /// Represents either a vector or an array of multiple vectors. 
-#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Serialize, Debug)] #[serde(transparent)] pub struct VectorOrArrayOfVectors { #[serde(with = "either::serde_untagged_optional")] inner: Option, Embedding>>, } +impl Deserr for VectorOrArrayOfVectors { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + match value { + deserr::Value::Null => Ok(VectorOrArrayOfVectors { inner: None }), + deserr::Value::Sequence(seq) => { + let mut iter = seq.into_iter(); + match iter.next().map(|v| v.into_value()) { + None => { + // With the strange way serde serialize the `Either`, we must send the left part + // otherwise it'll consider we returned [[]] + Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(Vec::new())) }) + } + Some(val @ deserr::Value::Sequence(_)) => { + let first = Embedding::deserialize_from_value(val, location.push_index(0))?; + let mut collect = vec![first]; + let mut tail = iter + .enumerate() + .map(|(i, v)| { + Embedding::deserialize_from_value( + v.into_value(), + location.push_index(i + 1), + ) + }) + .collect::, _>>()?; + collect.append(&mut tail); + + Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(collect)) }) + } + Some( + val @ deserr::Value::Integer(_) + | val @ deserr::Value::NegativeInteger(_) + | val @ deserr::Value::Float(_), + ) => { + let first = ::deserialize_from_value(val, location.push_index(0))?; + let mut embedding = iter + .enumerate() + .map(|(i, v)| { + ::deserialize_from_value( + v.into_value(), + location.push_index(i + 1), + ) + }) + .collect::, _>>()?; + embedding.insert(0, first); + Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Right(embedding)) }) + } + Some(value) => Err(take_cf_content(E::error( + None, + deserr::ErrorKind::IncorrectValueKind { + actual: value, + accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Float], + }, + location.push_index(0), + ))), + } + } + value => Err(take_cf_content(E::error( + None, + deserr::ErrorKind::IncorrectValueKind { + actual: value, + accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Null], + }, + location, + ))), + } + } +} + impl VectorOrArrayOfVectors { pub fn into_array_of_vectors(self) -> Option> { match self.inner? 
{ @@ -150,21 +331,41 @@ impl VectorOrArrayOfVectors { pub fn from_array_of_vectors(array_of_vec: Vec) -> Self { Self { inner: Some(either::Either::Left(array_of_vec)) } } + + pub fn from_vector(vec: Embedding) -> Self { + Self { inner: Some(either::Either::Right(vec)) } + } +} + +impl From for VectorOrArrayOfVectors { + fn from(vec: Embedding) -> Self { + Self::from_vector(vec) + } +} + +impl From> for VectorOrArrayOfVectors { + fn from(vec: Vec) -> Self { + Self::from_array_of_vectors(vec) + } } #[cfg(test)] mod test { use super::VectorOrArrayOfVectors; + fn embedding_from_str(s: &str) -> Result { + let value: serde_json::Value = serde_json::from_str(s).unwrap(); + deserr::deserialize(value) + } + #[test] fn array_of_vectors() { - let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap(); - let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap(); - let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap(); - let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap(); - let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap(); - let two_vecs: VectorOrArrayOfVectors = - serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap(); + let null = embedding_from_str("null").unwrap(); + let empty = embedding_from_str("[]").unwrap(); + let one = embedding_from_str("[0.1]").unwrap(); + let two = embedding_from_str("[0.1, 0.2]").unwrap(); + let one_vec = embedding_from_str("[[0.1, 0.2]]").unwrap(); + let two_vecs = embedding_from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap(); insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null"); insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]"); diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index e786a7164..9c7fb09b1 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -1,4 +1,5 @@ use deserr::Deserr; +use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use super::rest::InputType; @@ -72,6 +73,238 @@ pub fn check_unset( } } +/// Indicates what action should take place during a reindexing operation for an embedder +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum ReindexAction { + /// An indexing operation should take place for this embedder, keeping existing vectors + /// and checking whether the document template changed or not + RegeneratePrompts, + /// An indexing operation should take place for all documents for this embedder, removing existing vectors + /// (except userProvided ones) + FullReindex, +} + +pub enum SettingsDiff { + Remove, + Reindex { action: ReindexAction, updated_settings: EmbeddingSettings }, + UpdateWithoutReindex { updated_settings: EmbeddingSettings }, +} + +pub enum EmbedderAction { + WriteBackToDocuments(WriteBackToDocuments), + Reindex(ReindexAction), +} + +pub struct WriteBackToDocuments { + pub embedder_id: u8, + pub user_provided: RoaringBitmap, +} + +impl SettingsDiff { + pub fn from_settings(old: EmbeddingSettings, new: Setting) -> Self { + match new { + Setting::Set(new) => { + let EmbeddingSettings { + mut source, + mut model, + mut revision, + mut api_key, + mut dimensions, + mut document_template, + mut url, + mut query, + mut input_field, + mut path_to_embeddings, + mut embedding_object, + mut input_type, + mut distribution, + } = old; + + let EmbeddingSettings { + source: new_source, + model: new_model, + revision: new_revision, + api_key: new_api_key, + dimensions: new_dimensions, + document_template: new_document_template, + url: 
diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs
index e786a7164..9c7fb09b1 100644
--- a/milli/src/vector/settings.rs
+++ b/milli/src/vector/settings.rs
@@ -1,4 +1,5 @@
 use deserr::Deserr;
+use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 
 use super::rest::InputType;
@@ -72,6 +73,238 @@ pub fn check_unset<T>(
     }
 }
 
+/// Indicates what action should take place during a reindexing operation for an embedder
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum ReindexAction {
+    /// An indexing operation should take place for this embedder, keeping existing vectors
+    /// and checking whether the document template changed or not
+    RegeneratePrompts,
+    /// An indexing operation should take place for all documents for this embedder, removing existing vectors
+    /// (except userProvided ones)
+    FullReindex,
+}
+
+pub enum SettingsDiff {
+    Remove,
+    Reindex { action: ReindexAction, updated_settings: EmbeddingSettings },
+    UpdateWithoutReindex { updated_settings: EmbeddingSettings },
+}
+
+pub enum EmbedderAction {
+    WriteBackToDocuments(WriteBackToDocuments),
+    Reindex(ReindexAction),
+}
+
+pub struct WriteBackToDocuments {
+    pub embedder_id: u8,
+    pub user_provided: RoaringBitmap,
+}
+
+impl SettingsDiff {
+    pub fn from_settings(old: EmbeddingSettings, new: Setting<EmbeddingSettings>) -> Self {
+        match new {
+            Setting::Set(new) => {
+                let EmbeddingSettings {
+                    mut source,
+                    mut model,
+                    mut revision,
+                    mut api_key,
+                    mut dimensions,
+                    mut document_template,
+                    mut url,
+                    mut query,
+                    mut input_field,
+                    mut path_to_embeddings,
+                    mut embedding_object,
+                    mut input_type,
+                    mut distribution,
+                } = old;
+
+                let EmbeddingSettings {
+                    source: new_source,
+                    model: new_model,
+                    revision: new_revision,
+                    api_key: new_api_key,
+                    dimensions: new_dimensions,
+                    document_template: new_document_template,
+                    url: new_url,
+                    query: new_query,
+                    input_field: new_input_field,
+                    path_to_embeddings: new_path_to_embeddings,
+                    embedding_object: new_embedding_object,
+                    input_type: new_input_type,
+                    distribution: new_distribution,
+                } = new;
+
+                let mut reindex_action = None;
+
+                // **Warning**: do not use short-circuiting `||` here, we want all these operations applied
+                if source.apply(new_source) {
+                    ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
+                    // when the source changes, we need to reapply the default settings for the new source
+                    apply_default_for_source(
+                        &source,
+                        &mut model,
+                        &mut revision,
+                        &mut dimensions,
+                        &mut url,
+                        &mut query,
+                        &mut input_field,
+                        &mut path_to_embeddings,
+                        &mut embedding_object,
+                        &mut input_type,
+                        &mut document_template,
+                    )
+                }
+                if model.apply(new_model) {
+                    ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
+                }
+                if revision.apply(new_revision) {
+                    ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
+                }
+                if dimensions.apply(new_dimensions) {
+                    ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
+                }
+                if url.apply(new_url) {
+                    ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
+                }
+                if query.apply(new_query) {
+                    ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
+                }
+                if input_field.apply(new_input_field) {
+                    ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
+                }
+                if path_to_embeddings.apply(new_path_to_embeddings) {
+                    ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
+                }
+                if embedding_object.apply(new_embedding_object) {
+                    ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
+                }
+                if input_type.apply(new_input_type) {
+                    ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
+                }
+                if document_template.apply(new_document_template) {
+                    ReindexAction::push_action(
+                        &mut reindex_action,
+                        ReindexAction::RegeneratePrompts,
+                    );
+                }
+
+                distribution.apply(new_distribution);
+                api_key.apply(new_api_key);
+
+                let updated_settings = EmbeddingSettings {
+                    source,
+                    model,
+                    revision,
+                    api_key,
+                    dimensions,
+                    document_template,
+                    url,
+                    query,
+                    input_field,
+                    path_to_embeddings,
+                    embedding_object,
+                    input_type,
+                    distribution,
+                };
+
+                match reindex_action {
+                    Some(action) => Self::Reindex { action, updated_settings },
+                    None => Self::UpdateWithoutReindex { updated_settings },
+                }
+            }
+            Setting::Reset => Self::Remove,
+            Setting::NotSet => Self::UpdateWithoutReindex { updated_settings: old },
+        }
+    }
+}
+
+impl ReindexAction {
+    fn push_action(this: &mut Option<Self>, other: Self) {
+        *this = match (*this, other) {
+            (_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex),
+            (Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex),
+            (_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts),
+        }
+    }
+}
+
+#[allow(clippy::too_many_arguments)] // private function
+fn apply_default_for_source(
+    source: &Setting<EmbedderSource>,
+    model: &mut Setting<String>,
+    revision: &mut Setting<String>,
+    dimensions: &mut Setting<usize>,
+    url: &mut Setting<String>,
+    query: &mut Setting<serde_json::Value>,
+    input_field: &mut Setting<Vec<String>>,
+    path_to_embeddings: &mut Setting<Vec<String>>,
+    embedding_object: &mut Setting<Vec<String>>,
+    input_type: &mut Setting<InputType>,
+    document_template: &mut Setting<String>,
+) {
+    match source {
+        Setting::Set(EmbedderSource::HuggingFace) => {
+            *model = Setting::Reset;
+            *revision = Setting::Reset;
+            *dimensions = Setting::NotSet;
+            *url = Setting::NotSet;
+            *query = Setting::NotSet;
+            *input_field = Setting::NotSet;
+            *path_to_embeddings = Setting::NotSet;
+            *embedding_object = Setting::NotSet;
+            *input_type = Setting::NotSet;
+        }
+        Setting::Set(EmbedderSource::Ollama) => {
+            *model = Setting::Reset;
+            *revision = Setting::NotSet;
+            *dimensions = Setting::Reset;
+            *url = Setting::NotSet;
+            *query = Setting::NotSet;
+            *input_field = Setting::NotSet;
+            *path_to_embeddings = Setting::NotSet;
+            *embedding_object = Setting::NotSet;
+            *input_type = Setting::NotSet;
+        }
+        Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => {
+            *model = Setting::Reset;
+            *revision = Setting::NotSet;
+            *dimensions = Setting::NotSet;
+            *url = Setting::NotSet;
+            *query = Setting::NotSet;
+            *input_field = Setting::NotSet;
+            *path_to_embeddings = Setting::NotSet;
+            *embedding_object = Setting::NotSet;
+            *input_type = Setting::NotSet;
+        }
+        Setting::Set(EmbedderSource::Rest) => {
+            *model = Setting::NotSet;
+            *revision = Setting::NotSet;
+            *dimensions = Setting::Reset;
+            *url = Setting::Reset;
+            *query = Setting::Reset;
+            *input_field = Setting::Reset;
+            *path_to_embeddings = Setting::Reset;
+            *embedding_object = Setting::Reset;
+            *input_type = Setting::Reset;
+        }
+        Setting::Set(EmbedderSource::UserProvided) => {
+            *model = Setting::NotSet;
+            *revision = Setting::NotSet;
+            *dimensions = Setting::Reset;
+            *url = Setting::NotSet;
+            *query = Setting::NotSet;
+            *input_field = Setting::NotSet;
+            *path_to_embeddings = Setting::NotSet;
+            *embedding_object = Setting::NotSet;
+            *input_type = Setting::NotSet;
+            *document_template = Setting::NotSet;
+        }
+        Setting::NotSet => {}
+    }
+}
+
 pub fn check_set<T>(
     key: &Setting<T>,
     field: &'static str,
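A concrete scenario for the table of defaults above: switching sources both forces a full reindex and re-defaults the per-source fields. A hedged sketch (it assumes `EmbeddingSettings` derives `Default` and `Setting` implements `PartialEq`, neither of which is shown in this diff):

```rust
// Switching an embedder from huggingFace to rest.
fn source_switch_sketch() {
    let old = EmbeddingSettings {
        source: Setting::Set(EmbedderSource::HuggingFace),
        model: Setting::Set("BAAI/bge-base-en-v1.5".to_string()),
        ..Default::default()
    };
    let new = EmbeddingSettings {
        source: Setting::Set(EmbedderSource::Rest),
        url: Setting::Set("http://localhost:8080/embed".to_string()),
        ..Default::default()
    };
    match SettingsDiff::from_settings(old, Setting::Set(new)) {
        SettingsDiff::Reindex { action, updated_settings } => {
            // a source change always escalates to a full reindex...
            assert_eq!(action, ReindexAction::FullReindex);
            // ...the huggingFace-specific `model` is re-defaulted away,
            // while the REST-specific `url` keeps the user's value
            assert_eq!(updated_settings.model, Setting::NotSet);
            assert_eq!(
                updated_settings.url,
                Setting::Set("http://localhost:8080/embed".to_string())
            );
        }
        _ => unreachable!("changing the source triggers a reindex"),
    }
}
```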
@@ -210,66 +443,6 @@ impl EmbeddingSettings {
             *model = Setting::Set(openai::EmbeddingModel::default().name().to_owned())
         }
     }
-
-    pub(crate) fn apply_and_need_reindex(
-        old: &mut Setting<EmbeddingSettings>,
-        new: Setting<EmbeddingSettings>,
-    ) -> bool {
-        match (old, new) {
-            (
-                Setting::Set(EmbeddingSettings {
-                    source: old_source,
-                    model: old_model,
-                    revision: old_revision,
-                    api_key: old_api_key,
-                    dimensions: old_dimensions,
-                    document_template: old_document_template,
-                    url: old_url,
-                    query: old_query,
-                    input_field: old_input_field,
-                    path_to_embeddings: old_path_to_embeddings,
-                    embedding_object: old_embedding_object,
-                    input_type: old_input_type,
-                    distribution: old_distribution,
-                }),
-                Setting::Set(EmbeddingSettings {
-                    source: new_source,
-                    model: new_model,
-                    revision: new_revision,
-                    api_key: new_api_key,
-                    dimensions: new_dimensions,
-                    document_template: new_document_template,
-                    url: new_url,
-                    query: new_query,
-                    input_field: new_input_field,
-                    path_to_embeddings: new_path_to_embeddings,
-                    embedding_object: new_embedding_object,
-                    input_type: new_input_type,
-                    distribution: new_distribution,
-                }),
-            ) => {
-                let mut needs_reindex = false;
-
-                needs_reindex |= old_source.apply(new_source);
-                needs_reindex |= old_model.apply(new_model);
-                needs_reindex |= old_revision.apply(new_revision);
-                needs_reindex |= old_dimensions.apply(new_dimensions);
-                needs_reindex |= old_document_template.apply(new_document_template);
-                needs_reindex |= old_url.apply(new_url);
-                needs_reindex |= old_query.apply(new_query);
-                needs_reindex |= old_input_field.apply(new_input_field);
-                needs_reindex |= old_path_to_embeddings.apply(new_path_to_embeddings);
-                needs_reindex |= old_embedding_object.apply(new_embedding_object);
-                needs_reindex |= old_input_type.apply(new_input_type);
-
-                old_distribution.apply(new_distribution);
-                old_api_key.apply(new_api_key);
-
-                needs_reindex
-            }
-            (Setting::Reset, Setting::Reset) | (_, Setting::NotSet) => false,
-            _ => true,
-        }
-    }
 }
 
 #[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
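`EmbedderAction` and `WriteBackToDocuments` have no consumer in this hunk; their call site lives elsewhere in the PR. A hedged sketch of the intended shape of that consumption, where the removed `apply_and_need_reindex` could only answer yes or no (the `plan` helper, the placeholder id, and the empty bitmap are hypothetical):

```rust
// Hypothetical consumer: map the settings diff to the action the indexer
// must perform for this embedder.
fn plan(old: EmbeddingSettings, new: Setting<EmbeddingSettings>) -> Option<EmbedderAction> {
    match SettingsDiff::from_settings(old, new) {
        // the embedder was removed: its stored vectors must be written back
        // into the documents before the index drops them
        SettingsDiff::Remove => Some(EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
            embedder_id: 0,                      // placeholder: looked up from the index in practice
            user_provided: RoaringBitmap::new(), // placeholder: the userProvided docids
        })),
        // the settings changed in a way that invalidates prompts or vectors
        SettingsDiff::Reindex { action, .. } => Some(EmbedderAction::Reindex(action)),
        // cosmetic changes only (e.g. apiKey, distribution): nothing to re-index
        SettingsDiff::UpdateWithoutReindex { .. } => None,
    }
}
```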