diff --git a/.cargo/config.toml b/.cargo/config.toml index 35049cbcb..e11d56a31 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,2 +1,2 @@ [alias] -xtask = "run --package xtask --" +xtask = "run --release --package xtask --" diff --git a/.github/workflows/bench-manual.yml b/.github/workflows/bench-manual.yml new file mode 100644 index 000000000..6d8c3a006 --- /dev/null +++ b/.github/workflows/bench-manual.yml @@ -0,0 +1,30 @@ +name: Bench (manual) + +on: + workflow_dispatch: + inputs: + workload: + description: 'The path to the workloads to execute (workloads/...)' + required: true + default: 'workloads/movies.json' + +env: + WORKLOAD_NAME: ${{ github.event.inputs.workload }} + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 180 # 3h + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + - name: Run benchmarks - workload ${WORKLOAD_NAME} - branch ${{ github.ref }} - commit ${{ github.sha }} + run: | + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Manual [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- ${WORKLOAD_NAME} + diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml new file mode 100644 index 000000000..6f4956542 --- /dev/null +++ b/.github/workflows/bench-pr.yml @@ -0,0 +1,46 @@ +name: Bench (PR) +on: + issue_comment: + types: [created] + +permissions: + issues: write + +env: + GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }} + +jobs: + run-benchmarks-on-comment: + if: startsWith(github.event.comment.body, '/bench') + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 180 # 3h + steps: + - name: Check for Command + id: command + uses: xt0rted/slash-command-action@v2 + with: + command: bench + reaction-type: "rocket" + repo-token: ${{ env.GH_TOKEN }} + + - uses: xt0rted/pull-request-comment-branch@v2 + id: comment-branch + with: + repo_token: ${{ env.GH_TOKEN }} + + - uses: actions/checkout@v3 + if: success() + with: + fetch-depth: 0 # fetch full history to be able to get main commit sha + ref: ${{ steps.comment-branch.outputs.head_ref }} + + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + - name: Run benchmarks on PR ${{ github.event.issue.id }} + run: | + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "[Comment](${{ github.event.comment.url }}) on [#${{github.event.issue.id}}](${{ github.event.issue.url }})" -- ${{ steps.command.outputs.command-arguments }} \ No newline at end of file diff --git a/.github/workflows/bench-push-indexing.yml b/.github/workflows/bench-push-indexing.yml new file mode 100644 index 000000000..fd0f19a5a --- /dev/null +++ b/.github/workflows/bench-push-indexing.yml @@ -0,0 +1,25 @@ +name: Indexing bench (push) + +on: + push: + branches: + - main + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 180 # 3h + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Run benchmarks + - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch main - Commit ${{ github.sha }} + run: | + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ 
vars.BENCHMARK_DASHBOARD_URL }}" --reason "Push on `main` [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- workloads/*.json + diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 18e9fc48a..5dbde4301 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -31,17 +31,10 @@ jobs: apt-get update && apt-get install -y curl apt-get install build-essential -y - name: Setup test with Rust stable - if: github.event_name != 'schedule' uses: actions-rs/toolchain@v1 with: toolchain: stable override: true - - name: Setup test with Rust nightly - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' - uses: actions-rs/toolchain@v1 - with: - toolchain: nightly - override: true - name: Cache dependencies uses: Swatinem/rust-cache@v2.7.1 - name: Run cargo check without any default features diff --git a/.gitignore b/.gitignore index 5f660c735..e00f45c1e 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ /data.ms /snapshots /dumps +/bench +/_xtask_benchmark.ms # Snapshots ## ... large diff --git a/Cargo.lock b/Cargo.lock index 7be74bd70..24540c455 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -45,7 +45,7 @@ dependencies = [ "actix-service", "actix-tls", "actix-utils", - "ahash 0.8.8", + "ahash", "base64 0.21.7", "bitflags 2.4.1", "brotli", @@ -181,7 +181,7 @@ dependencies = [ "actix-tls", "actix-utils", "actix-web-codegen", - "ahash 0.8.8", + "ahash", "bytes", "bytestring", "cfg-if", @@ -257,20 +257,9 @@ dependencies = [ [[package]] name = "ahash" -version = "0.7.6" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" -dependencies = [ - "getrandom", - "once_cell", - "version_check", -] - -[[package]] -name = "ahash" -version = "0.8.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42cd52102d3df161c77a887b608d7a4897d7cc112886a9537b738a887a03aaff" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "getrandom", @@ -303,6 +292,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + [[package]] name = "anes" version = "0.1.6" @@ -359,9 +354,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.79" +version = "1.0.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" +checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" dependencies = [ "backtrace", ] @@ -443,6 +438,12 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "atomic" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" + [[package]] name = "atomic-polyfill" version = "0.1.11" @@ -621,10 +622,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "regex-automata 0.4.3", + "regex-automata", "serde", ] +[[package]] +name = "build-info" +version = "1.7.0" +dependencies = [ + "anyhow", + "time", + "vergen-git2", +] + [[package]] name = "bumpalo" version = 
"3.13.0" @@ -709,16 +719,6 @@ dependencies = [ "pkg-config", ] -[[package]] -name = "calendrical_calculations" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8dfe3bc6a50b4667fafdb6d9cf26731c5418c457e317d8166c972014facf9a5d" -dependencies = [ - "core_maths", - "displaydoc", -] - [[package]] name = "camino" version = "1.1.6" @@ -877,9 +877,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.8.5" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffb924701d850fbf0331302e7f9715c04e494b4b9bebb38ac48bdd30924e1936" +checksum = "3a9071b1586dd067b5fdfd2069fab932c047ca5bbce4bd2bdee8af0f4b155053" dependencies = [ "aho-corasick", "cow-utils", @@ -887,15 +887,12 @@ dependencies = [ "deunicode", "either", "fst", - "icu", - "icu_provider", - "icu_provider_blob", "irg-kvariants", "jieba-rs", "lindera-core", "lindera-dictionary", "lindera-tokenizer", - "litemap 0.6.1", + "litemap", "once_cell", "pinyin", "serde", @@ -903,7 +900,7 @@ dependencies = [ "unicode-normalization", "wana_kana", "whatlang", - "zerovec 0.9.6", + "zerovec", ] [[package]] @@ -994,12 +991,6 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" -[[package]] -name = "cobs" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15" - [[package]] name = "color-spantrace" version = "0.2.1" @@ -1090,15 +1081,6 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" -[[package]] -name = "core_maths" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b02505ccb8c50b0aa21ace0fc08c3e53adebd4e58caa18a36152803c7709a3" -dependencies = [ - "libm", -] - [[package]] name = "cow-utils" version = "0.1.2" @@ -1346,15 +1328,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "deduplicating_array" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a636096586ca093a10ac0175bfb384d024089dca0dae54e3e69bc1c1596358e8" -dependencies = [ - "serde", -] - [[package]] name = "deranged" version = "0.3.9" @@ -1382,7 +1355,16 @@ version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" dependencies = [ - "derive_builder_macro", + "derive_builder_macro 0.12.0", +] + +[[package]] +name = "derive_builder" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f59169f400d8087f238c5c0c7db6a28af18681717f3b623227d92f397e938c7" +dependencies = [ + "derive_builder_macro 0.13.1", ] [[package]] @@ -1397,13 +1379,35 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_builder_core" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4ec317cc3e7ef0928b0ca6e4a634a4d6c001672ae210438cf114a83e56b018d" +dependencies = [ + "darling 0.14.4", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "derive_builder_macro" version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" dependencies = [ - "derive_builder_core", + "derive_builder_core 0.12.0", 
+ "syn 1.0.109", +] + +[[package]] +name = "derive_builder_macro" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "870368c3fb35b8031abb378861d4460f573b92238ec2152c927a21f77e3e0127" +dependencies = [ + "derive_builder_core 0.13.1", "syn 1.0.109", ] @@ -1451,9 +1455,9 @@ dependencies = [ [[package]] name = "deunicode" -version = "1.3.3" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1bba4f227a4a53d12b653f50ca7bf10c9119ae2aba56aff9e0338b5c98f36a" +checksum = "3ae2a35373c5c74340b79ae6780b498b2b183915ec5dacf263aac5a099bf485a" [[package]] name = "digest" @@ -1508,17 +1512,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "displaydoc" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.48", -] - [[package]] name = "doc-comment" version = "0.3.3" @@ -1578,12 +1571,6 @@ dependencies = [ "serde", ] -[[package]] -name = "embedded-io" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced" - [[package]] name = "encode_unicode" version = "0.3.6" @@ -1785,6 +1772,7 @@ dependencies = [ "faux", "tempfile", "thiserror", + "tracing", "uuid", ] @@ -1810,17 +1798,6 @@ dependencies = [ "unescaper", ] -[[package]] -name = "fixed_decimal" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5287d527037d0f35c8801880361eb38bb9bce194805350052c2a79538388faeb" -dependencies = [ - "displaydoc", - "smallvec", - "writeable", -] - [[package]] name = "flate2" version = "1.0.28" @@ -2151,11 +2128,11 @@ checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" [[package]] name = "git2" -version = "0.16.1" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf7f68c2995f392c49fffb4f95ae2c873297830eb25c6bc4c114ce8f4562acc" +checksum = "1b3ba52851e73b46a4c3df1d89343741112003f0f6f13beb0dfac9e457c3fdcd" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "libc", "libgit2-sys", "log", @@ -2228,20 +2205,15 @@ dependencies = [ "byteorder", ] -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -dependencies = [ - "ahash 0.7.6", -] - [[package]] name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "heapless" @@ -2426,487 +2398,6 @@ dependencies = [ "tokio-rustls", ] -[[package]] -name = "icu" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30f75f394ebee8d539bef8f6f02ad7b5f41c33de74c9eae1a50337b382a5aab1" -dependencies = [ - "icu_calendar", - "icu_casemap", - "icu_collator", - "icu_collections", - "icu_compactdecimal", - "icu_datetime", - "icu_decimal", - "icu_displaynames", - "icu_list", - "icu_locid", - "icu_locid_transform", - "icu_normalizer", - "icu_plurals", - "icu_properties", - "icu_provider", - "icu_relativetime", - "icu_segmenter", - "icu_timezone", - "icu_transliterate", -] - -[[package]] -name = "icu_calendar" -version = "1.3.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b520c5675775e3838447c33fc55bf558148c6824ef0d20ff7a9e0df7345a281c" -dependencies = [ - "calendrical_calculations", - "displaydoc", - "icu_calendar_data", - "icu_locid", - "icu_locid_transform", - "icu_provider", - "serde", - "tinystr", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_calendar_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75d8d1a514ca7e6dc547be930f2fd661d578909c07cf1c1adade81c3f7a78840" - -[[package]] -name = "icu_casemap" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "976068d7759293cbd9daa0d1669618bb9094c7ee54e546cd8b877dd4fe59007a" -dependencies = [ - "displaydoc", - "icu_casemap_data", - "icu_collections", - "icu_locid", - "icu_properties", - "icu_provider", - "serde", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_casemap_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1251070c14d5b94cd00f97025e9cedce6a6eeb39485e2a226c58432cc4f72ffd" - -[[package]] -name = "icu_collator" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be493c81154545a00fc5196e814cae0e1470bc696d518b5df877049aa6bcefe1" -dependencies = [ - "displaydoc", - "icu_collator_data", - "icu_collections", - "icu_locid", - "icu_locid_transform", - "icu_normalizer", - "icu_properties", - "icu_provider", - "serde", - "smallvec", - "utf16_iter", - "utf8_iter", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_collator_data" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dbe9abe5ce570ad4707026f37bc21ef95c36b945c3c4564b9aa4e2e1c043126" - -[[package]] -name = "icu_collections" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3907b2246e8dd5a29ead8a965e7c0c8a90e9b928e614a4279257d45c5e553e91" -dependencies = [ - "displaydoc", - "serde", - "yoke", - "zerofrom", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_compactdecimal" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a8bb9143e7681fd5f5877c76f7b6365e173545d00d0e12ef23ba1888a996baa" -dependencies = [ - "displaydoc", - "fixed_decimal", - "icu_compactdecimal_data", - "icu_decimal", - "icu_locid_transform", - "icu_plurals", - "icu_provider", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_compactdecimal_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e9b7585f26db531ea5aaedaa68cb66cd2be37fe698b33a289849ff3129545b" - -[[package]] -name = "icu_datetime" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5bf2e6dd961b59ee5935070220915db6cf0ab5137de362964f800c2b7d14fa" -dependencies = [ - "displaydoc", - "either", - "fixed_decimal", - "icu_calendar", - "icu_datetime_data", - "icu_decimal", - "icu_locid", - "icu_locid_transform", - "icu_plurals", - "icu_provider", - "icu_timezone", - "litemap 0.7.1", - "serde", - "smallvec", - "tinystr", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_datetime_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "078b2ed516a2f5054ee7f55b1fe970b92e90ae4cace8a0fe1e5f9fc2e94be609" - -[[package]] -name = "icu_decimal" -version = "1.3.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1986a0b7df834aaddb911b4593c990950ac5606fc83ce9aad4311be80f51e81a" -dependencies = [ - "displaydoc", - "fixed_decimal", - "icu_decimal_data", - "icu_locid", - "icu_locid_transform", - "icu_provider", - "serde", - "writeable", -] - -[[package]] -name = "icu_decimal_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c064b3828953151f8c610bfff6fec776f958641249ebfd1cf36f073f0654e77" - -[[package]] -name = "icu_displaynames" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c98329d348e918ac7e88e6d6613a46bef09ca8a65db4ddf70d86e6eaac0e2ec3" -dependencies = [ - "icu_displaynames_data", - "icu_locid", - "icu_locid_transform", - "icu_provider", - "serde", - "tinystr", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_displaynames_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60f9f56c427f1e80383667e8fb13c07707f6561839283115617cc67307a5d020" - -[[package]] -name = "icu_list" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1a44bbed77a7e7b555f9d7dd4b43f75ec1402b438a901d20451943d50cbd90" -dependencies = [ - "deduplicating_array", - "displaydoc", - "icu_list_data", - "icu_locid_transform", - "icu_provider", - "regex-automata 0.2.0", - "serde", - "writeable", -] - -[[package]] -name = "icu_list_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3237583f0cb7feafabb567c4492fe9ef1d2d4113f6a8798a923273ea5de996d" - -[[package]] -name = "icu_locid" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f284eb342dc49d3e9d9f3b188489d76b5d22dfb1d1a5e0d1941811253bac625c" -dependencies = [ - "displaydoc", - "litemap 0.7.1", - "serde", - "tinystr", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_locid_transform" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6551daf80882d8e68eee186cc19e132d8bde1b1f059a79b93384a5ca0e8fc5e7" -dependencies = [ - "displaydoc", - "icu_locid", - "icu_locid_transform_data", - "icu_provider", - "serde", - "tinystr", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_locid_transform_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a741eba5431f75eb2f1f9022d3cffabcadda6771e54fb4e77c8ba8653e4da44" - -[[package]] -name = "icu_normalizer" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "080fc33a720d50a7342b0c58df010fbcfb842d6f78ef81555f8b1ac6bba57d3c" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_normalizer_data", - "icu_properties", - "icu_provider", - "serde", - "smallvec", - "utf16_iter", - "utf8_iter", - "write16", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_normalizer_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f8d22f74066c2e6442db2a9aa14950278e86719e811e304e48bae03094b369d" - -[[package]] -name = "icu_plurals" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20556516b8be2b2f5dc3d6b23884b65c5c59ed8be0b44c419e4808c9b0792fce" -dependencies = [ - "displaydoc", - "fixed_decimal", - "icu_locid", - "icu_locid_transform", - "icu_plurals_data", - "icu_provider", - "serde", - "zerovec 0.10.0", -] - -[[package]] -name = 
"icu_plurals_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdc552215224997aaaa4e05d95981386d3c52042acebfcc732137d5d9be96a21" - -[[package]] -name = "icu_properties" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3477ae70f8ca8dc08ff7574b5398ed0a2f2e4e6b66bdff2558a92ed67e262be1" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_locid_transform", - "icu_properties_data", - "icu_provider", - "serde", - "tinystr", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_properties_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c8bb3b67a8347e94d580434369e5c7ee89999b9309d04b7cfc88dfaa0f31b59" - -[[package]] -name = "icu_provider" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68acdef80034b5e35d8524e9817479d389a4f9774f3f0cbe1bf3884d80fd5934" -dependencies = [ - "displaydoc", - "icu_locid", - "icu_provider_macros", - "postcard", - "serde", - "stable_deref_trait", - "tinystr", - "writeable", - "yoke", - "zerofrom", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_provider_blob" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31326d28c7f95a964a4f0ee86c24002da5f6db907e3bcb079949b4ff103b6a9" -dependencies = [ - "icu_provider", - "postcard", - "serde", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_provider_macros" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2060258edfcfe32ca7058849bf0f146cb5c59aadbedf480333c0d0002f97bc99" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.48", -] - -[[package]] -name = "icu_relativetime" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4e6c1b531ab35f5b0cb552d3fb8dab1cb49f98e68e12bdc2169ca15e805207c" -dependencies = [ - "displaydoc", - "fixed_decimal", - "icu_decimal", - "icu_locid_transform", - "icu_plurals", - "icu_provider", - "icu_relativetime_data", - "serde", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_relativetime_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71ec2ca0aff8c6865075c6257bc91d21a77acb6465635306a280af89208bed24" - -[[package]] -name = "icu_segmenter" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcb3c1981ce2187a745f391a741cb14e77453325acb3b2e014b05da51c0a39f2" -dependencies = [ - "core_maths", - "displaydoc", - "icu_collections", - "icu_locid", - "icu_provider", - "icu_segmenter_data", - "serde", - "utf8_iter", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_segmenter_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9703f6713044d1c0a1335a6d78ffece4c9380582416ace6feeb608e84d279fc7" - -[[package]] -name = "icu_timezone" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e6401cd210ccda98b2e7fc707831b29c6efe319efbbec460f957b6f331f626" -dependencies = [ - "displaydoc", - "icu_calendar", - "icu_locid", - "icu_provider", - "icu_timezone_data", - "serde", - "tinystr", - "zerotrie", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_timezone_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d7e214a653bac59b768c42f82d252f13af95e8a9cb07b6108b8bc723c561b43" - -[[package]] 
-name = "icu_transliterate" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4bdf006774b5a5898d97af6c95b148d34cd5c87cbed00610ff873e5b5885e28" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_locid", - "icu_normalizer", - "icu_properties", - "icu_provider", - "icu_unicodeset_parse", - "litemap 0.7.1", - "serde", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_unicodeset_parse" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2c3c1ab072cb9ec2dfb377ed7be07bf1bdce055b8324ba6392323f588c38c5a" -dependencies = [ - "icu_collections", - "icu_properties", - "icu_provider", - "tinystr", - "zerovec 0.10.0", -] - [[package]] name = "ident_case" version = "1.0.1" @@ -2938,7 +2429,7 @@ dependencies = [ "bincode", "crossbeam", "csv", - "derive_builder", + "derive_builder 0.12.0", "dump", "enum-iterator", "file-store", @@ -2949,6 +2440,7 @@ dependencies = [ "meilisearch-types", "page_size 0.5.0", "puffin", + "rayon", "roaring", "serde", "serde_json", @@ -2968,7 +2460,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" dependencies = [ "equivalent", - "hashbrown 0.14.3", + "hashbrown", "serde", ] @@ -3054,7 +2546,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" dependencies = [ "hermit-abi", - "rustix 0.38.26", + "rustix 0.38.31", "windows-sys 0.52.0", ] @@ -3090,7 +2582,7 @@ checksum = "93f0c1347cd3ac8d7c6e3a2dc33ac496d365cf09fc0831aa61111e1a6738983e" dependencies = [ "cedarwood", "fxhash", - "hashbrown 0.14.3", + "hashbrown", "lazy_static", "phf", "phf_codegen", @@ -3177,15 +2669,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.150" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "libgit2-sys" -version = "0.14.2+1.5.1" +version = "0.16.2+1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f3d95f6b51075fe9810a7ae22c7095f12b98005ab364d8544797a825ce946a4" +checksum = "ee4126d8b4ee5c9d9ea891dd875cfdc1e9d0950437179104b183d7d8a74d24e8" dependencies = [ "cc", "libc", @@ -3232,9 +2724,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.12" +version = "1.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +checksum = "037731f5d3aaa87a5675e895b63ddff1a87624bc29f77004ea829809654e48f6" dependencies = [ "cc", "libc", @@ -3244,9 +2736,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict-builder" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f567a47e47b5420908424de2c6c5e424e3cafe588d0146bd128c0f3755758a3" +checksum = "a90d23f7cef31c6ab7ac0d4f3b23940754207f7b5a80b080c39193caffe99ac2" dependencies = [ "anyhow", "bincode", @@ -3263,9 +2755,9 @@ dependencies = [ [[package]] name = "lindera-compress" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f3e553d55ebe9881fa5e5de588b0a153456e93564d17dfbef498912caf63a2" +checksum = "1927b7d2bd4ffc19e07691bf8609722663c341f80260a1c636cee8f1ec420dce" 
dependencies = [ "anyhow", "flate2", @@ -3274,9 +2766,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9a2440cc156a4a911a174ec68203543d1efb10df3a700a59b6bf581e453c726" +checksum = "3299caa2b81c9a076535a4651a83bf7d624c15f2349f243187fffc64b5a78251" dependencies = [ "anyhow", "bincode", @@ -3291,9 +2783,9 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e077a410e61c962cb526f71b7effd62ffc607488a8f61869c937582d2ccb529b" +checksum = "7b82b8d2323a67dc8ff0c40751d199b7ba94cd5e3c13a5b31622d318acc79e5b" dependencies = [ "anyhow", "flate2", @@ -3302,9 +2794,9 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f57491adf7b311a3ee87f5e4a36454df16a2ec73de4ef28b2106fac80bd782" +checksum = "cddf783b459d54b130d956889bec052c25fcb478a304e03fa9b2289387572bc5" dependencies = [ "anyhow", "bincode", @@ -3322,9 +2814,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3476ec7748aebd2eb23d496ddfce5e7e0a5c031cffcd214451043e02d029f11" +checksum = "27c708f08f14b0806f6c4cce5324b4bcba27209463026b78c31f399f8be9d30d" dependencies = [ "anyhow", "bincode", @@ -3343,9 +2835,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd-builder" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b1c7576a02d5e4af2bf62de51790a01bc4b8bc0d0b6a6b86a46b157f5cb306d" +checksum = "e5e67eb91652203d202f7d27ead220d1d8c9099552709b8429eae9c70f2312fb" dependencies = [ "anyhow", "bincode", @@ -3364,9 +2856,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b713ecd5b827d7d448c3c5eb3c6d5899ecaf22cd17087599996349a02c76828d" +checksum = "d45da8d9a5888f4d4e78bb29fc82ff9ae519962efb0d2d92343b6cf8e373952f" dependencies = [ "bincode", "byteorder", @@ -3381,9 +2873,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e545752f6487be87b572529ad594cb3b48d2ef20821516f598b2d152d23277b" +checksum = "41c0933295dc945178bbc08f34111dc3ef22bfee38820f78453c8f8d4f3463d1" dependencies = [ "anyhow", "bincode", @@ -3401,9 +2893,9 @@ dependencies = [ [[package]] name = "lindera-tokenizer" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a2d4606a5a4da62ac4a3680ee884a75da7f0c892dc967fc9cb983ceba39a8f" +checksum = "348ce9bb3f2e5edc577420b98cca05b2177f3af50ef5ae278a1d8a1351d56197" dependencies = [ "bincode", "byteorder", @@ -3416,9 +2908,9 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388b1bdf81794b5d5b8057ce0321c58ff4b90d676b637948ccc7863ae2f43d28" +checksum = "74022a57c395ed7e213a9cd5833207e3c583145078ee9a164aeaec68b30c9d8e" dependencies = [ "bincode", "byteorder", @@ -3433,9 +2925,9 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" 
-version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdfa3e29a22c047da57fadd960ff674b720de15a1e2fb35b5ed67f3408afb469" +checksum = "a34e5564ee81af82603cd6a03c3abe6e17cc0ae598bfa5078809f06e59e96e08" dependencies = [ "anyhow", "bincode", @@ -3528,18 +3020,9 @@ dependencies = [ [[package]] name = "litemap" -version = "0.6.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "575d8a551c59104b4df91269921e5eab561aa1b77c618dac0414b5d44a4617de" - -[[package]] -name = "litemap" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a1a2647d5b7134127971a6de0d533c49de2159167e7f259c427195f87168a1" -dependencies = [ - "serde", -] +checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da" [[package]] name = "lmdb-master-sys" @@ -3587,28 +3070,6 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" -[[package]] -name = "logging_timer" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64e96f261d684b7089aa576bb74e823241dccd994b27d30fabf1dcb3af284fe9" -dependencies = [ - "log", - "logging_timer_proc_macros", -] - -[[package]] -name = "logging_timer_proc_macros" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a9062912d7952c5588cc474795e0b9ee008e7e6781127945b85413d4b99d81" -dependencies = [ - "log", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "lz4_flex" version = "0.10.0" @@ -3680,6 +3141,7 @@ dependencies = [ "async-trait", "brotli", "bstr", + "build-info", "byte-unit", "bytes", "cargo_toml", @@ -3751,7 +3213,6 @@ dependencies = [ "url", "urlencoding", "uuid", - "vergen", "walkdir", "yaup", "zip", @@ -3880,7 +3341,6 @@ dependencies = [ "json-depth-checker", "levenshtein_automata", "liquid", - "logging_timer", "maplit", "md5", "meili-snap", @@ -4052,6 +3512,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-integer" version = "0.1.45" @@ -4082,6 +3548,15 @@ dependencies = [ "libc", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + [[package]] name = "number_prefix" version = "0.4.0" @@ -4411,9 +3886,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pinyin" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bd12336e3afa34152e002f57df37a7056778daa59ea542b3473b87f5fb260c4" +checksum = "16f2611cd06a1ac239a0cea4521de9eb068a6ca110324ee00631aa68daa74fc0" [[package]] name = "pkg-config" @@ -4464,17 +3939,6 @@ version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3bccab0e7fd7cc19f820a1c8c91720af652d0c88dc9664dd72aef2614f04af3b" -[[package]] -name = "postcard" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a55c51ee6c0db07e68448e336cf8ea4131a620edefebf9893e759b2d793420f8" -dependencies = [ - "cobs", - "embedded-io", - 
"serde", -] - [[package]] name = "powerfmt" version = "0.2.0" @@ -4655,9 +4119,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" +checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" dependencies = [ "either", "rayon-core", @@ -4676,9 +4140,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", @@ -4708,15 +4172,6 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "redox_syscall" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_users" version = "0.4.3" @@ -4736,19 +4191,10 @@ checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.3", + "regex-automata", "regex-syntax 0.8.2", ] -[[package]] -name = "regex-automata" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782" -dependencies = [ - "memchr", -] - [[package]] name = "regex-automata" version = "0.4.3" @@ -4803,10 +4249,12 @@ dependencies = [ "system-configuration", "tokio", "tokio-rustls", + "tokio-util", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", "webpki-roots", "winreg", @@ -4899,9 +4347,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.26" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9470c4bf8246c8daf25f9598dca807fb6510347b1e1cfa55749113850c79d88a" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ "bitflags 2.4.1", "errno", @@ -5205,9 +4653,6 @@ name = "smallvec" version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2593d31f82ead8df961d8bd23a64c2ccf2eb5dd34b0a34bfb4dd54011c72009e" -dependencies = [ - "serde", -] [[package]] name = "smartstring" @@ -5422,14 +4867,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.4.1", - "rustix 0.38.26", + "rustix 0.38.31", "windows-sys 0.52.0", ] @@ -5489,12 +4933,15 @@ dependencies = [ [[package]] name = "time" -version = "0.3.31" +version = "0.3.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e" +checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" dependencies = [ "deranged", "itoa", + "libc", + "num-conv", + "num_threads", "powerfmt", "serde", "time-core", @@ -5509,24 +4956,14 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" 
[[package]] name = "time-macros" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f" +checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" dependencies = [ + "num-conv", "time-core", ] -[[package]] -name = "tinystr" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5d0e245e80bdc9b4e5356fc45a72184abbc3861992603f515270e9340f5a219" -dependencies = [ - "displaydoc", - "serde", - "zerovec 0.10.0", -] - [[package]] name = "tinytemplate" version = "1.2.1" @@ -5558,7 +4995,7 @@ version = "0.14.1" source = "git+https://github.com/huggingface/tokenizers.git?tag=v0.14.1#6357206cdcce4d78ffb1e0372feb456caea09375" dependencies = [ "aho-corasick", - "derive_builder", + "derive_builder 0.12.0", "esaxx-rs", "getrandom", "itertools 0.11.0", @@ -5754,6 +5191,16 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-serde" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +dependencies = [ + "serde", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.18" @@ -5761,11 +5208,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "nu-ansi-term", + "serde", + "serde_json", "sharded-slab", "smallvec", "thread_local", "tracing-core", "tracing-log", + "tracing-serde", ] [[package]] @@ -5917,24 +5367,12 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" -[[package]] -name = "utf16_iter" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52df8b7fb78e7910d776fccf2e42ceaf3604d55e8e7eb2dbd183cb1441d8a692" - [[package]] name = "utf8-width" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1" -[[package]] -name = "utf8_iter" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64a8922555b9500e3d865caed19330172cd67cbf82203f1a3311d8c305cc9f33" - [[package]] name = "utf8parse" version = "0.2.1" @@ -5943,10 +5381,11 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" +checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" dependencies = [ + "atomic", "getrandom", "serde", ] @@ -5965,18 +5404,42 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vergen" -version = "7.5.1" +version = "9.0.0-beta.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f21b881cd6636ece9735721cf03c1fe1e774fe258683d084bb2812ab67435749" +checksum = "107dc53b443fe8cc380798abb75ad6b7038281165109afea1f1b28bb47047ed5" dependencies = [ "anyhow", - "cfg-if", - "enum-iterator", + "derive_builder 0.13.1", "getset", + "rustversion", + "vergen-lib", +] + +[[package]] +name = "vergen-git2" +version = "1.0.0-beta.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8875c5d71074bb67118774e3d795ab6fe77c3ae3161cb54e19104cabc49487f1" +dependencies = [ + "anyhow", + "derive_builder 0.13.1", "git2", "rustversion", - "thiserror", "time", + "vergen", + "vergen-lib", +] + +[[package]] +name = "vergen-lib" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26ebfba72ba904559f25f41ea1512335b5a46459084258cea0857549d9645187" +dependencies = [ + "anyhow", + "derive_builder 0.13.1", + "getset", + "rustversion", ] [[package]] @@ -6087,6 +5550,19 @@ version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +[[package]] +name = "wasm-streams" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wav" version = "1.0.0" @@ -6114,11 +5590,11 @@ checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10" [[package]] name = "whatlang" -version = "0.16.2" +version = "0.16.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043" +checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0" dependencies = [ - "hashbrown 0.12.3", + "hashbrown", "once_cell", ] @@ -6372,9 +5848,9 @@ checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" [[package]] name = "winnow" -version = "0.5.4" +version = "0.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acaaa1190073b2b101e15083c38ee8ec891b5e05cbee516521e94ec008f61e64" +checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" dependencies = [ "memchr", ] @@ -6389,18 +5865,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "write16" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" - -[[package]] -name = "writeable" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0af0c3d13faebf8dda0b5256fa7096a2d5ccb662f7b9f54a40fe201077ab1c2" - [[package]] name = "xattr" version = "1.0.1" @@ -6414,8 +5878,23 @@ dependencies = [ name = "xtask" version = "1.7.0" dependencies = [ + "anyhow", + "build-info", "cargo_metadata", "clap", + "futures-core", + "futures-util", + "reqwest", + "serde", + "serde_json", + "sha2", + "sysinfo", + "time", + "tokio", + "tracing", + "tracing-subscriber", + "tracing-trace", + "uuid", ] [[package]] @@ -6508,52 +5987,15 @@ dependencies = [ "synstructure", ] -[[package]] -name = "zerotrie" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9685bb4deb98dab812e87c296a9631fc00d7ca4bc5c2c5f304f375bbed711a8a" -dependencies = [ - "displaydoc", - "litemap 0.7.1", - "serde", - "yoke", - "zerofrom", - "zerovec 0.10.0", -] - [[package]] name = "zerovec" -version = "0.9.6" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "591691014119b87047ead4dcf3e6adfbf73cb7c38ab6980d4f18a32138f35d46" +checksum = "eff4439ae91fb5c72b8abc12f3f2dbf51bd27e6eadb9f8a5bc8898dddb0e27ea" dependencies = [ "zerofrom", ] -[[package]] -name = 
"zerovec" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1194130c5b155bf8ae50ab16c86ab758cd695cf9ad176d2f870b744cbdbb572e" -dependencies = [ - "serde", - "yoke", - "zerofrom", - "zerovec-derive", -] - -[[package]] -name = "zerovec-derive" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acabf549809064225ff8878baedc4ce3732ac3b07e7c7ce6e5c2ccdbc485c324" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.48", -] - [[package]] name = "zip" version = "0.6.6" diff --git a/Cargo.toml b/Cargo.toml index 11190025a..1d79fd196 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ members = [ "benchmarks", "fuzzers", "tracing-trace", - "xtask", + "xtask", "build-info", ] [workspace.package] diff --git a/Dockerfile b/Dockerfile index dd2cfc134..5b227e6fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ WORKDIR / ARG COMMIT_SHA ARG COMMIT_DATE ARG GIT_TAG -ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_SEMVER_LIGHTWEIGHT=${GIT_TAG} +ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_DESCRIBE=${GIT_TAG} ENV RUSTFLAGS="-C target-feature=-crt-static" COPY . . diff --git a/build-info/Cargo.toml b/build-info/Cargo.toml new file mode 100644 index 000000000..50854a642 --- /dev/null +++ b/build-info/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "build-info" +version.workspace = true +authors.workspace = true +description.workspace = true +homepage.workspace = true +readme.workspace = true +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +time = { version = "0.3.34", features = ["parsing"] } + +[build-dependencies] +anyhow = "1.0.80" +vergen-git2 = "1.0.0-beta.2" diff --git a/build-info/build.rs b/build-info/build.rs new file mode 100644 index 000000000..b1ec0ab47 --- /dev/null +++ b/build-info/build.rs @@ -0,0 +1,22 @@ +fn main() { + if let Err(err) = emit_git_variables() { + println!("cargo:warning=vergen: {}", err); + } +} + +fn emit_git_variables() -> anyhow::Result<()> { + // Note: any code that needs VERGEN_ environment variables should take care to define them manually in the Dockerfile and pass them + // in the corresponding GitHub workflow (publish_docker.yml). + // This is due to the Dockerfile building the binary outside of the git directory. 
+ let mut builder = vergen_git2::Git2Builder::default(); + + builder.branch(true); + builder.commit_timestamp(true); + builder.commit_message(true); + builder.describe(true, true, None); + builder.sha(false); + + let git2 = builder.build()?; + + vergen_git2::Emitter::default().fail_on_error().add_instructions(&git2)?.emit() +} diff --git a/build-info/src/lib.rs b/build-info/src/lib.rs new file mode 100644 index 000000000..cfcefb4a2 --- /dev/null +++ b/build-info/src/lib.rs @@ -0,0 +1,203 @@ +use time::format_description::well_known::Iso8601; + +#[derive(Debug, Clone)] +pub struct BuildInfo { + pub branch: Option<&'static str>, + pub describe: Option<DescribeResult>, + pub commit_sha1: Option<&'static str>, + pub commit_msg: Option<&'static str>, + pub commit_timestamp: Option<time::OffsetDateTime>, +} + +impl BuildInfo { + pub fn from_build() -> Self { + let branch: Option<&'static str> = option_env!("VERGEN_GIT_BRANCH"); + let describe = DescribeResult::from_build(); + let commit_sha1 = option_env!("VERGEN_GIT_SHA"); + let commit_msg = option_env!("VERGEN_GIT_COMMIT_MESSAGE"); + let commit_timestamp = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP"); + + let commit_timestamp = commit_timestamp.and_then(|commit_timestamp| { + time::OffsetDateTime::parse(commit_timestamp, &Iso8601::DEFAULT).ok() + }); + + Self { branch, describe, commit_sha1, commit_msg, commit_timestamp } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum DescribeResult { + Prototype { name: &'static str }, + Release { version: &'static str, major: u64, minor: u64, patch: u64 }, + Prerelease { version: &'static str, major: u64, minor: u64, patch: u64, rc: u64 }, + NotATag { describe: &'static str }, +} + +impl DescribeResult { + pub fn new(describe: &'static str) -> Self { + if let Some(name) = prototype_name(describe) { + Self::Prototype { name } + } else if let Some(release) = release_version(describe) { + release + } else if let Some(prerelease) = prerelease_version(describe) { + prerelease + } else { + Self::NotATag { describe } + } + } + + pub fn from_build() -> Option<Self> { + let describe: &'static str = option_env!("VERGEN_GIT_DESCRIBE")?; + Some(Self::new(describe)) + } + + pub fn as_tag(&self) -> Option<&'static str> { + match self { + DescribeResult::Prototype { name } => Some(name), + DescribeResult::Release { version, .. } => Some(version), + DescribeResult::Prerelease { version, .. } => Some(version), + DescribeResult::NotATag { describe: _ } => None, + } + } + + pub fn as_prototype(&self) -> Option<&'static str> { + match self { + DescribeResult::Prototype { name } => Some(name), + DescribeResult::Release { .. } + | DescribeResult::Prerelease { .. } + | DescribeResult::NotATag { .. } => None, + } + } +} + +/// Parses the input as a prototype name. +/// +/// Returns `Some(prototype_name)` if the following conditions are met on this value: +/// +/// 1. starts with `prototype-`, +/// 2. ends with `-<some_number>`, +/// 3. does not end with `-<some_number>-<some_number>`. +/// +/// Otherwise, returns `None`.
+fn prototype_name(describe: &'static str) -> Option<&'static str> { + if !describe.starts_with("prototype-") { + return None; + } + + let mut rsplit_prototype = describe.rsplit('-'); + // last component MUST be a number + rsplit_prototype.next()?.parse::<u64>().ok()?; + // the component before the last SHALL NOT be a number + rsplit_prototype.next()?.parse::<u64>().err()?; + + Some(describe) +} + +fn release_version(describe: &'static str) -> Option<DescribeResult> { + if !describe.starts_with('v') { + return None; + } + + // a full release version doesn't contain a `-` + if describe.contains('-') { + return None; + } + + // a full release version parses as vX.Y.Z, with X, Y, Z numbers. + let mut dots = describe[1..].split('.'); + let major: u64 = dots.next()?.parse().ok()?; + let minor: u64 = dots.next()?.parse().ok()?; + let patch: u64 = dots.next()?.parse().ok()?; + + if dots.next().is_some() { + return None; + } + + Some(DescribeResult::Release { version: describe, major, minor, patch }) +} + +fn prerelease_version(describe: &'static str) -> Option<DescribeResult> { + // prerelease version is in the shape vM.N.P-rc.C + let mut hyphen = describe.rsplit('-'); + let prerelease = hyphen.next()?; + if !prerelease.starts_with("rc.") { + return None; + } + + let rc: u64 = prerelease[3..].parse().ok()?; + + let release = hyphen.next()?; + + let DescribeResult::Release { version: _, major, minor, patch } = release_version(release)? + else { + return None; + }; + + Some(DescribeResult::Prerelease { version: describe, major, minor, patch, rc }) +} + +#[cfg(test)] +mod test { + use super::DescribeResult; + + fn assert_not_a_tag(describe: &'static str) { + assert_eq!(DescribeResult::NotATag { describe }, DescribeResult::new(describe)) + } + + fn assert_proto(describe: &'static str) { + assert_eq!(DescribeResult::Prototype { name: describe }, DescribeResult::new(describe)) + } + + fn assert_release(describe: &'static str, major: u64, minor: u64, patch: u64) { + assert_eq!( + DescribeResult::Release { version: describe, major, minor, patch }, + DescribeResult::new(describe) + ) + } + + fn assert_prerelease(describe: &'static str, major: u64, minor: u64, patch: u64, rc: u64) { + assert_eq!( + DescribeResult::Prerelease { version: describe, major, minor, patch, rc }, + DescribeResult::new(describe) + ) + } + + #[test] + fn not_a_tag() { + assert_not_a_tag("whatever-fuzzy"); + assert_not_a_tag("whatever-fuzzy-5-ggg-dirty"); + assert_not_a_tag("whatever-fuzzy-120-ggg-dirty"); + + // technically a tag, but not a proto nor a version, so not parsed as a tag + assert_not_a_tag("whatever"); + + // dirty version + assert_not_a_tag("v1.7.0-1-ggga-dirty"); + assert_not_a_tag("v1.7.0-rc.1-1-ggga-dirty"); + + // after version + assert_not_a_tag("v1.7.0-1-ggga"); + assert_not_a_tag("v1.7.0-rc.1-1-ggga"); + + // after proto + assert_not_a_tag("prototype-tag-0-1-ggga"); + assert_not_a_tag("prototype-tag-0-1-ggga-dirty"); + } + + #[test] + fn prototype() { + assert_proto("prototype-tag-0"); + assert_proto("prototype-tag-10"); + assert_proto("prototype-long-name-tag-10"); + } + + #[test] + fn release() { + assert_release("v1.7.2", 1, 7, 2); + } + + #[test] + fn prerelease() { + assert_prerelease("v1.7.2-rc.3", 1, 7, 2, 3); + } +} diff --git a/dump/src/reader/compat/v2_to_v3.rs b/dump/src/reader/compat/v2_to_v3.rs index 1d4238290..82a3b9e84 100644 --- a/dump/src/reader/compat/v2_to_v3.rs +++ b/dump/src/reader/compat/v2_to_v3.rs @@ -1,4 +1,3 @@ -use std::convert::TryInto; use std::str::FromStr; use time::OffsetDateTime; diff --git a/file-store/Cargo.toml
b/file-store/Cargo.toml index 1b1b0cff5..5fae1aab4 100644 --- a/file-store/Cargo.toml +++ b/file-store/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true [dependencies] tempfile = "3.9.0" thiserror = "1.0.56" +tracing = "0.1.40" uuid = { version = "1.6.1", features = ["serde", "v4"] } [dev-dependencies] diff --git a/file-store/src/lib.rs b/file-store/src/lib.rs index 15c4168bc..c8b3849ab 100644 --- a/file-store/src/lib.rs +++ b/file-store/src/lib.rs @@ -61,7 +61,13 @@ impl FileStore { /// Returns the file corresponding to the requested uuid. pub fn get_update(&self, uuid: Uuid) -> Result<StdFile> { let path = self.get_update_path(uuid); - let file = StdFile::open(path)?; + let file = match StdFile::open(path) { + Ok(file) => file, + Err(e) => { + tracing::error!("Can't access update file {uuid}: {e}"); + return Err(e.into()); + } + }; Ok(file) } @@ -96,8 +102,12 @@ impl FileStore { pub fn delete(&self, uuid: Uuid) -> Result<()> { let path = self.path.join(uuid.to_string()); - std::fs::remove_file(path)?; - Ok(()) + if let Err(e) = std::fs::remove_file(path) { + tracing::error!("Can't delete file {uuid}: {e}"); + Err(e.into()) + } else { + Ok(()) + } } /// List the Uuids of the files in the FileStore diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 890312854..c758f1114 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -23,6 +23,7 @@ meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } page_size = "0.5.0" puffin = { version = "0.16.0", features = ["serialization"] } +rayon = "1.8.1" roaring = { version = "0.10.2", features = ["serde"] } serde = { version = "1.0.195", features = ["derive"] } serde_json = { version = "1.0.111", features = ["preserve_order"] } diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 8e2eb26a0..b7e31c136 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -142,22 +142,28 @@ pub(crate) enum IndexOperation { impl Batch { /// Return the task ids associated with this batch. - pub fn ids(&self) -> Vec<TaskId> { + pub fn ids(&self) -> RoaringBitmap { match self { Batch::TaskCancelation { task, .. } | Batch::Dump(task) | Batch::IndexCreation { task, .. } - | Batch::IndexUpdate { task, .. } => vec![task.uid], + | Batch::IndexUpdate { task, .. } => { + RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() + } Batch::SnapshotCreation(tasks) | Batch::TaskDeletions(tasks) - | Batch::IndexDeletion { tasks, .. } => tasks.iter().map(|task| task.uid).collect(), + | Batch::IndexDeletion { tasks, .. } => { + RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid)) + } Batch::IndexOperation { op, .. } => match op { IndexOperation::DocumentOperation { tasks, .. } | IndexOperation::Settings { tasks, .. } | IndexOperation::DocumentClear { tasks, .. } => { - tasks.iter().map(|task| task.uid).collect() + RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid)) + } + IndexOperation::IndexDocumentDeletionByFilter { task, .. } => { + RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() + } - IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid], IndexOperation::SettingsAndDocumentOperation { document_import_tasks: tasks, settings_tasks: other, @@ -167,9 +173,11 @@ impl Batch { cleared_tasks: tasks, settings_tasks: other, ..
- } => tasks.iter().chain(other).map(|task| task.uid).collect(), + } => RoaringBitmap::from_iter(tasks.iter().chain(other).map(|task| task.uid)), }, - Batch::IndexSwap { task } => vec![task.uid], + Batch::IndexSwap { task } => { + RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() + } } } diff --git a/index-scheduler/src/features.rs b/index-scheduler/src/features.rs index 4fd5bd0e7..3be18a3f1 100644 --- a/index-scheduler/src/features.rs +++ b/index-scheduler/src/features.rs @@ -48,7 +48,7 @@ impl RoFeatures { Ok(()) } else { Err(FeatureNotEnabledError { - disabled_action: "getting logs through the `/logs/stream` route", + disabled_action: "Modifying logs through the `/logs/*` routes", feature: "logs route", issue_link: "https://github.com/orgs/meilisearch/discussions/721", } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 1c3b93bce..adb3d4942 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -37,8 +37,8 @@ use std::fs::File; use std::io::{self, BufReader, Read}; use std::ops::{Bound, RangeBounds}; use std::path::{Path, PathBuf}; -use std::sync::atomic::AtomicBool; -use std::sync::atomic::Ordering::Relaxed; +use std::sync::atomic::Ordering::{self, Relaxed}; +use std::sync::atomic::{AtomicBool, AtomicU32}; use std::sync::{Arc, RwLock}; use std::time::Duration; @@ -60,6 +60,8 @@ use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmap use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use puffin::FrameView; +use rayon::current_num_threads; +use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; use synchronoise::SignalEvent; use time::format_description::well_known::Rfc3339; @@ -1170,15 +1172,13 @@ impl IndexScheduler { drop(rtxn); // 1. store the starting date with the bitmap of processing tasks. - let mut ids = batch.ids(); - ids.sort_unstable(); + let ids = batch.ids(); let processed_tasks = ids.len(); - let processing_tasks = RoaringBitmap::from_sorted_iter(ids.iter().copied()).unwrap(); let started_at = OffsetDateTime::now_utc(); // We reset the must_stop flag to be sure that we don't stop processing tasks self.must_stop_processing.reset(); - self.processing_tasks.write().unwrap().start_processing_at(started_at, processing_tasks); + self.processing_tasks.write().unwrap().start_processing_at(started_at, ids.clone()); #[cfg(test)] self.breakpoint(Breakpoint::BatchCreated); @@ -1207,6 +1207,9 @@ impl IndexScheduler { #[cfg(test)] self.breakpoint(Breakpoint::ProcessBatchSucceeded); + let mut success = 0; + let mut failure = 0; + #[allow(unused_variables)] for (i, mut task) in tasks.into_iter().enumerate() { task.started_at = Some(started_at); @@ -1219,13 +1222,15 @@ impl IndexScheduler { }, )?; + match task.error { + Some(_) => failure += 1, + None => success += 1, + } + self.update_task(&mut wtxn, &task) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?; - if let Err(e) = self.delete_persisted_task_data(&task) { - tracing::error!("Failure to delete the content files associated with task {}. Error: {e}", task.uid); - } } - tracing::info!("A batch of tasks was successfully completed."); + tracing::info!("A batch of tasks was successfully completed with {success} successful tasks and {failure} failed tasks."); } // If we have an abortion error we must stop the tick here and re-schedule tasks. 
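A note on the `Batch::ids` change above: `RoaringBitmap::from_sorted_iter` fails if its input is not in increasing order, which is why the single-task arms unwrap an iterator of one element, while the multi-task arms go through the infallible `FromIterator` path. A minimal, self-contained check of both constructors (the values are made up):

```rust
use roaring::RoaringBitmap;

fn main() {
    // `from_sorted_iter` is fallible: it rejects out-of-order input, so a
    // one-element iterator is always safe to unwrap.
    let single = RoaringBitmap::from_sorted_iter(std::iter::once(42u32)).unwrap();
    assert_eq!(single.len(), 1);

    // `from_iter` accepts any order and collapses duplicates.
    let many = RoaringBitmap::from_iter([7u32, 3, 3, 10]);
    assert_eq!(many.len(), 3);
}
```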
Err(Error::Milli(milli::Error::InternalError( @@ -1236,6 +1241,7 @@ impl IndexScheduler { self.breakpoint(Breakpoint::AbortedIndexation); wtxn.abort(); + tracing::info!("A batch of tasks was aborted."); // We make sure that we don't call `stop_processing` on the `processing_tasks`, // this is because we want to let the next tick call `create_next_batch` and keep // the `started_at` date times and `processings` of the current processing tasks. @@ -1257,6 +1263,8 @@ impl IndexScheduler { self.index_mapper.resize_index(&wtxn, &index_uid)?; wtxn.abort(); + tracing::info!("The max database size was reached. Resizing the index."); + return Ok(TickOutcome::TickAgain(0)); } // In case of a failure we must get back and patch all the tasks with the error. @@ -1264,7 +1272,7 @@ impl IndexScheduler { #[cfg(test)] self.breakpoint(Breakpoint::ProcessBatchFailed); let error: ResponseError = err.into(); - for id in ids { + for id in ids.iter() { let mut task = self .get_task(&wtxn, id) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? @@ -1278,9 +1286,8 @@ impl IndexScheduler { #[cfg(test)] self.maybe_fail(tests::FailureLocation::UpdatingTaskAfterProcessBatchFailure)?; - if let Err(e) = self.delete_persisted_task_data(&task) { - tracing::error!("Failure to delete the content files associated with task {}. Error: {e}", task.uid); - } + tracing::info!("Batch failed {}", error); + self.update_task(&mut wtxn, &task) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?; } @@ -1294,6 +1301,28 @@ impl IndexScheduler { wtxn.commit().map_err(Error::HeedTransaction)?; + // Once the tasks are committed, we should delete all the associated update files ASAP to avoid leaking files in case of a restart + tracing::debug!("Deleting the update files"); + + // We take one read transaction **per thread**. Then, every thread is going to pull out new IDs from the roaring bitmap with the help of an atomic shared index into the bitmap + let idx = AtomicU32::new(0); + (0..current_num_threads()).into_par_iter().try_for_each(|_| -> Result<()> { + let rtxn = self.read_txn()?; + while let Some(id) = ids.select(idx.fetch_add(1, Ordering::Relaxed)) { + let task = self + .get_task(&rtxn, id) + .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? + .ok_or(Error::CorruptedTaskQueue)?; + if let Err(e) = self.delete_persisted_task_data(&task) { + tracing::error!( + "Failure to delete the content files associated with task {}. Error: {e}", + task.uid + ); + } + } + Ok(()) + })?; + // We shouldn't crash the tick function if we can't send data to the webhook. let _ = self.notify_webhook(&processed); @@ -1366,7 +1395,9 @@ impl IndexScheduler { // let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); - let request = ureq::post(url).set("Content-Encoding", "gzip"); + let request = ureq::post(url) + .set("Content-Encoding", "gzip") + .set("Content-Type", "application/x-ndjson"); let request = match &self.webhook_authorization_header { Some(header) => request.set("Authorization", header), None => request, @@ -1706,7 +1737,7 @@ pub enum TickOutcome { /// The scheduler should immediately attempt another `tick`. /// /// The `usize` field contains the number of processed tasks. - TickAgain(usize), + TickAgain(u64), /// The scheduler should wait for an external signal before attempting another `tick`. 
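The update-file cleanup added above uses a simple work-claiming scheme: one rayon job per thread, each bumping a shared `AtomicU32` to claim the next id from the bitmap. A stripped-down sketch of that pattern follows; the id slice and `process` callback are placeholders, not the scheduler types:

```rust
use std::sync::atomic::{AtomicU32, Ordering};

use rayon::current_num_threads;
use rayon::prelude::*;

// Each worker claims the next position through a shared atomic counter, so the
// ids are spread across threads without pre-partitioning the list.
fn process_ids_in_parallel(ids: &[u32], process: impl Fn(u32) + Sync) {
    let cursor = AtomicU32::new(0);
    (0..current_num_threads()).into_par_iter().for_each(|_| loop {
        let pos = cursor.fetch_add(1, Ordering::Relaxed) as usize;
        match ids.get(pos) {
            Some(&id) => process(id),
            None => break,
        }
    });
}

fn main() {
    let ids: Vec<u32> = (0..100).collect();
    process_ids_in_parallel(&ids, |id| {
        // Stand-in for `delete_persisted_task_data`.
        println!("deleting update file for task {id}");
    });
}
```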
WaitForSignal, } diff --git a/index-scheduler/src/uuid_codec.rs b/index-scheduler/src/uuid_codec.rs index 54020fa3c..92dc70b0c 100644 --- a/index-scheduler/src/uuid_codec.rs +++ b/index-scheduler/src/uuid_codec.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::convert::TryInto; use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode}; use uuid::Uuid; diff --git a/meilisearch-auth/src/store.rs b/meilisearch-auth/src/store.rs index 276c035b0..1eebd3fe9 100644 --- a/meilisearch-auth/src/store.rs +++ b/meilisearch-auth/src/store.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use std::cmp::Reverse; use std::collections::HashSet; -use std::convert::{TryFrom, TryInto}; use std::fs::create_dir_all; use std::path::Path; use std::result::Result as StdResult; diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml index b9edb4c1e..7709d33d7 100644 --- a/meilisearch-types/Cargo.toml +++ b/meilisearch-types/Cargo.toml @@ -54,3 +54,5 @@ thai = ["milli/thai"] greek = ["milli/greek"] # allow khmer specialized tokenization khmer = ["milli/khmer"] +# allow vietnamese specialized tokenization +vietnamese = ["milli/vietnamese"] diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index f8a50238a..04b919904 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -104,9 +104,10 @@ serde_urlencoded = "0.7.1" termcolor = "1.4.1" url = { version = "2.5.0", features = ["serde"] } tracing = "0.1.40" -tracing-subscriber = "0.3.18" +tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.9" +build-info = { version = "1.7.0", path = "../build-info" } [dev-dependencies] actix-rt = "2.9.0" @@ -131,7 +132,6 @@ reqwest = { version = "0.11.23", features = [ sha-1 = { version = "0.10.1", optional = true } static-files = { version = "0.2.3", optional = true } tempfile = { version = "3.9.0", optional = true } -vergen = { version = "7.5.1", default-features = false, features = ["git"] } zip = { version = "0.6.6", optional = true } [features] @@ -154,6 +154,7 @@ japanese = ["meilisearch-types/japanese"] thai = ["meilisearch-types/thai"] greek = ["meilisearch-types/greek"] khmer = ["meilisearch-types/khmer"] +vietnamese = ["meilisearch-types/vietnamese"] [package.metadata.mini-dashboard] assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip" diff --git a/meilisearch/build.rs b/meilisearch/build.rs index c839b6e33..dc24b0449 100644 --- a/meilisearch/build.rs +++ b/meilisearch/build.rs @@ -1,17 +1,4 @@ -use vergen::{vergen, Config, SemverKind}; - fn main() { - // Note: any code that needs VERGEN_ environment variables should take care to define them manually in the Dockerfile and pass them - // in the corresponding GitHub workflow (publish_docker.yml). - // This is due to the Dockerfile building the binary outside of the git directory. 
- let mut config = Config::default(); - // allow using non-annotated tags - *config.git_mut().semver_kind_mut() = SemverKind::Lightweight; - - if let Err(e) = vergen(config) { - println!("cargo:warning=vergen: {}", e); - } - #[cfg(feature = "mini-dashboard")] mini_dashboard::setup_mini_dashboard().expect("Could not load the mini-dashboard assets"); } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 8bb7e8d81..7dfc52900 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -28,7 +28,9 @@ use super::{ config_user_id_path, DocumentDeletionKind, DocumentFetchKind, MEILISEARCH_CONFIG_PATH, }; use crate::analytics::Analytics; -use crate::option::{default_http_addr, IndexerOpts, MaxMemory, MaxThreads, ScheduleSnapshot}; +use crate::option::{ + default_http_addr, IndexerOpts, LogMode, MaxMemory, MaxThreads, ScheduleSnapshot, +}; use crate::routes::indexes::documents::UpdateDocumentsQuery; use crate::routes::indexes::facet_search::FacetSearchQuery; use crate::routes::tasks::TasksFilterQuery; @@ -250,10 +252,12 @@ impl super::Analytics for SegmentAnalytics { struct Infos { env: String, experimental_enable_metrics: bool, + experimental_logs_mode: LogMode, experimental_replication_parameters: bool, experimental_enable_logs_route: bool, experimental_reduce_indexing_memory_usage: bool, experimental_max_number_of_batched_tasks: usize, + gpu_enabled: bool, db_path: bool, import_dump: bool, dump_dir: bool, @@ -289,6 +293,7 @@ impl From for Infos { let Opt { db_path, experimental_enable_metrics, + experimental_logs_mode, experimental_replication_parameters, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, @@ -337,9 +342,11 @@ impl From for Infos { Self { env, experimental_enable_metrics, + experimental_logs_mode, experimental_replication_parameters, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, + gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), import_dump: import_dump.is_some(), dump_dir: dump_dir != PathBuf::from("dumps/"), @@ -466,7 +473,9 @@ impl Segment { create_all_stats(index_scheduler.into(), auth_controller.into(), &AuthFilter::default()) { // Replace the version number with the prototype name if any. 
- let version = if let Some(prototype) = crate::prototype_name() { + let version = if let Some(prototype) = build_info::DescribeResult::from_build() + .and_then(|describe| describe.as_prototype()) + { prototype } else { env!("CARGO_PKG_VERSION") diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 1ab161564..820f1ae42 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -97,11 +97,25 @@ pub type LogRouteType = tracing_subscriber::filter::Filtered< tracing_subscriber::Registry, >; +pub type SubscriberForSecondLayer = tracing_subscriber::layer::Layered< + tracing_subscriber::reload::Layer, + tracing_subscriber::Registry, +>; + +pub type LogStderrHandle = + tracing_subscriber::reload::Handle; + +pub type LogStderrType = tracing_subscriber::filter::Filtered< + Box + Send + Sync>, + Targets, + SubscriberForSecondLayer, +>; + pub fn create_app( index_scheduler: Data, auth_controller: Data, opt: Opt, - logs: LogRouteHandle, + logs: (LogRouteHandle, LogStderrHandle), analytics: Arc, enable_dashboard: bool, ) -> actix_web::App< @@ -412,6 +426,9 @@ fn import_dump( let reader = BufReader::new(file); let reader = DocumentsBatchReader::from_reader(reader)?; + let embedder_configs = index.embedding_configs(&wtxn)?; + let embedders = index_scheduler.embedders(embedder_configs)?; + let builder = milli::update::IndexDocuments::new( &mut wtxn, &index, @@ -424,6 +441,8 @@ fn import_dump( || false, )?; + let builder = builder.with_embedders(embedders); + let (builder, user_result) = builder.add_documents(reader)?; let user_result = user_result?; tracing::info!(documents_found = user_result, "{} documents found.", user_result); @@ -447,7 +466,7 @@ pub fn configure_data( index_scheduler: Data, auth: Data, opt: &Opt, - logs: LogRouteHandle, + (logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle), analytics: Arc, ) { let http_payload_size_limit = opt.http_payload_size_limit.get_bytes() as usize; @@ -455,7 +474,8 @@ pub fn configure_data( .app_data(index_scheduler) .app_data(auth) .app_data(web::Data::from(analytics)) - .app_data(web::Data::new(logs)) + .app_data(web::Data::new(logs_route)) + .app_data(web::Data::new(logs_stderr)) .app_data(web::Data::new(opt.clone())) .app_data( web::JsonConfig::default() @@ -516,30 +536,3 @@ pub fn dashboard(config: &mut web::ServiceConfig, enable_frontend: bool) { pub fn dashboard(config: &mut web::ServiceConfig, _enable_frontend: bool) { config.service(web::resource("/").route(web::get().to(routes::running))); } - -/// Parses the output of -/// [`VERGEN_GIT_SEMVER_LIGHTWEIGHT`](https://docs.rs/vergen/latest/vergen/struct.Git.html#instructions) -/// as a prototype name. -/// -/// Returns `Some(prototype_name)` if the following conditions are met on this value: -/// -/// 1. starts with `prototype-`, -/// 2. ends with `-`, -/// 3. does not end with `-`. -/// -/// Otherwise, returns `None`. 
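The prototype naming convention enforced here (and by the new `build-info` helpers above): the tag must start with `prototype-` and end with `-<number>`, but not with `-<number>-<number>`. A standalone sketch of that check, with invented tag names:

```rust
// Sketch of the rule: `prototype-<name>-<n>` where the last dash-separated
// component is a number and the one before it is not, so a commit made after
// the tag (e.g. `prototype-x-0-1`) is rejected.
fn looks_like_prototype(tag: &str) -> bool {
    if !tag.starts_with("prototype-") {
        return false;
    }
    let mut parts = tag.rsplit('-');
    let last_is_number = parts.next().map_or(false, |p| p.parse::<u64>().is_ok());
    let previous_is_number = parts.next().map_or(false, |p| p.parse::<u64>().is_ok());
    last_is_number && !previous_is_number
}

fn main() {
    assert!(looks_like_prototype("prototype-my-feature-0"));
    assert!(!looks_like_prototype("prototype-my-feature-0-1"));
    assert!(!looks_like_prototype("v1.7.0"));
}
```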
-pub fn prototype_name() -> Option<&'static str> { - let prototype: &'static str = option_env!("VERGEN_GIT_SEMVER_LIGHTWEIGHT")?; - - if !prototype.starts_with("prototype-") { - return None; - } - - let mut rsplit_prototype = prototype.rsplit('-'); - // last component MUST be a number - rsplit_prototype.next()?.parse::().ok()?; - // before than last component SHALL NOT be a number - rsplit_prototype.next()?.parse::().err()?; - - Some(prototype) -} diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index 1e067b43e..af02f58e1 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -1,5 +1,5 @@ use std::env; -use std::io::{stderr, Write}; +use std::io::{stderr, LineWriter, Write}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -10,8 +10,10 @@ use actix_web::HttpServer; use index_scheduler::IndexScheduler; use is_terminal::IsTerminal; use meilisearch::analytics::Analytics; +use meilisearch::option::LogMode; use meilisearch::{ - analytics, create_app, prototype_name, setup_meilisearch, LogRouteHandle, LogRouteType, Opt, + analytics, create_app, setup_meilisearch, LogRouteHandle, LogRouteType, LogStderrHandle, + LogStderrType, Opt, SubscriberForSecondLayer, }; use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE}; use mimalloc::MiMalloc; @@ -23,28 +25,44 @@ use tracing_subscriber::Layer; #[global_allocator] static ALLOC: MiMalloc = MiMalloc; -fn default_layer() -> LogRouteType { +fn default_log_route_layer() -> LogRouteType { None.with_filter(tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF)) } +fn default_log_stderr_layer(opt: &Opt) -> LogStderrType { + let layer = tracing_subscriber::fmt::layer() + .with_writer(|| LineWriter::new(std::io::stderr())) + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE); + + let layer = match opt.experimental_logs_mode { + LogMode::Human => Box::new(layer) + as Box + Send + Sync>, + LogMode::Json => Box::new(layer.json()) + as Box + Send + Sync>, + }; + + layer.with_filter( + tracing_subscriber::filter::Targets::new() + .with_target("", LevelFilter::from_str(&opt.log_level.to_string()).unwrap()), + ) +} + /// does all the setup before meilisearch is launched -fn setup(opt: &Opt) -> anyhow::Result { - let (route_layer, route_layer_handle) = tracing_subscriber::reload::Layer::new(default_layer()); +fn setup(opt: &Opt) -> anyhow::Result<(LogRouteHandle, LogStderrHandle)> { + let (route_layer, route_layer_handle) = + tracing_subscriber::reload::Layer::new(default_log_route_layer()); let route_layer: tracing_subscriber::reload::Layer<_, _> = route_layer; - let subscriber = tracing_subscriber::registry().with(route_layer).with( - tracing_subscriber::fmt::layer() - .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE) - .with_filter( - tracing_subscriber::filter::LevelFilter::from_str(&opt.log_level.to_string()) - .unwrap(), - ), - ); + let (stderr_layer, stderr_layer_handle) = + tracing_subscriber::reload::Layer::new(default_log_stderr_layer(opt)); + let route_layer: tracing_subscriber::reload::Layer<_, _> = route_layer; + + let subscriber = tracing_subscriber::registry().with(route_layer).with(stderr_layer); // set the subscriber as the default for the application tracing::subscriber::set_global_default(subscriber).unwrap(); - Ok(route_layer_handle) + Ok((route_layer_handle, stderr_layer_handle)) } fn on_panic(info: &std::panic::PanicInfo) { @@ -110,7 +128,7 @@ async fn run_http( index_scheduler: Arc, auth_controller: Arc, opt: Opt, - 
logs: LogRouteHandle, + logs: (LogRouteHandle, LogStderrHandle), analytics: Arc, ) -> anyhow::Result<()> { let enable_dashboard = &opt.env == "development"; @@ -145,8 +163,8 @@ pub fn print_launch_resume( analytics: Arc, config_read_from: Option, ) { - let commit_sha = option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"); - let commit_date = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP").unwrap_or("unknown"); + let build_info = build_info::BuildInfo::from_build(); + let protocol = if opt.ssl_cert_path.is_some() && opt.ssl_key_path.is_some() { "https" } else { "http" }; let ascii_name = r#" @@ -171,10 +189,18 @@ pub fn print_launch_resume( eprintln!("Database path:\t\t{:?}", opt.db_path); eprintln!("Server listening on:\t\"{}://{}\"", protocol, opt.http_addr); eprintln!("Environment:\t\t{:?}", opt.env); - eprintln!("Commit SHA:\t\t{:?}", commit_sha.to_string()); - eprintln!("Commit date:\t\t{:?}", commit_date.to_string()); + eprintln!("Commit SHA:\t\t{:?}", build_info.commit_sha1.unwrap_or("unknown")); + eprintln!( + "Commit date:\t\t{:?}", + build_info + .commit_timestamp + .and_then(|commit_timestamp| commit_timestamp + .format(&time::format_description::well_known::Rfc3339) + .ok()) + .unwrap_or("unknown".into()) + ); eprintln!("Package version:\t{:?}", env!("CARGO_PKG_VERSION").to_string()); - if let Some(prototype) = prototype_name() { + if let Some(prototype) = build_info.describe.and_then(|describe| describe.as_prototype()) { eprintln!("Prototype:\t\t{:?}", prototype); } diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 657be00d0..43bf2c62c 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -1,4 +1,3 @@ -use std::convert::TryFrom; use std::env::VarError; use std::ffi::OsStr; use std::fmt::Display; @@ -51,6 +50,7 @@ const MEILI_IGNORE_MISSING_DUMP: &str = "MEILI_IGNORE_MISSING_DUMP"; const MEILI_IGNORE_DUMP_IF_DB_EXISTS: &str = "MEILI_IGNORE_DUMP_IF_DB_EXISTS"; const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR"; const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL"; +const MEILI_EXPERIMENTAL_LOGS_MODE: &str = "MEILI_EXPERIMENTAL_LOGS_MODE"; const MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS: &str = "MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS"; const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; @@ -80,6 +80,39 @@ const DEFAULT_LOG_EVERY_N: usize = 100_000; pub const INDEX_SIZE: u64 = 2 * 1024 * 1024 * 1024 * 1024; // 2 TiB pub const TASK_DB_SIZE: u64 = 20 * 1024 * 1024 * 1024; // 20 GiB +#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum LogMode { + #[default] + Human, + Json, +} + +impl Display for LogMode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + LogMode::Human => Display::fmt("HUMAN", f), + LogMode::Json => Display::fmt("JSON", f), + } + } +} + +impl FromStr for LogMode { + type Err = LogModeError; + + fn from_str(s: &str) -> Result { + match s.trim().to_lowercase().as_str() { + "human" => Ok(LogMode::Human), + "json" => Ok(LogMode::Json), + _ => Err(LogModeError(s.to_owned())), + } + } +} + +#[derive(Debug, thiserror::Error)] +#[error("Unsupported log mode level `{0}`. 
Supported values are `HUMAN` and `JSON`.")] +pub struct LogModeError(String); + #[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] #[serde(rename_all = "UPPERCASE")] pub enum LogLevel { @@ -311,9 +344,16 @@ pub struct Opt { #[serde(default)] pub experimental_enable_metrics: bool, + /// Experimental logs mode feature. For more information, see: + /// + /// Change the mode of the logs on the console. + #[clap(long, env = MEILI_EXPERIMENTAL_LOGS_MODE, default_value_t)] + #[serde(default)] + pub experimental_logs_mode: LogMode, + /// Experimental logs route feature. For more information, see: /// - /// Enables the log route on the `POST /logs/stream` endpoint and the `DELETE /logs/stream` to stop receiving logs. + /// Enables the log routes on the `POST /logs/stream`, `POST /logs/stderr` endpoints, and the `DELETE /logs/stream` to stop receiving logs. #[clap(long, env = MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE)] #[serde(default)] pub experimental_enable_logs_route: bool, @@ -433,6 +473,7 @@ impl Opt { #[cfg(feature = "analytics")] no_analytics, experimental_enable_metrics, + experimental_logs_mode, experimental_enable_logs_route, experimental_replication_parameters, experimental_reduce_indexing_memory_usage, @@ -491,6 +532,10 @@ impl Opt { MEILI_EXPERIMENTAL_ENABLE_METRICS, experimental_enable_metrics.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_LOGS_MODE, + experimental_logs_mode.to_string(), + ); export_to_env_if_not_present( MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS, experimental_replication_parameters.to_string(), diff --git a/meilisearch/src/routes/api_key.rs b/meilisearch/src/routes/api_key.rs index 597d04486..0bd4b9d59 100644 --- a/meilisearch/src/routes/api_key.rs +++ b/meilisearch/src/routes/api_key.rs @@ -10,7 +10,7 @@ use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{Code, ResponseError}; -use meilisearch_types::keys::{Action, CreateApiKey, Key, PatchApiKey}; +use meilisearch_types::keys::{CreateApiKey, Key, PatchApiKey}; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use uuid::Uuid; diff --git a/meilisearch/src/routes/logs.rs b/meilisearch/src/routes/logs.rs index d95f80bb8..57e2cbd22 100644 --- a/meilisearch/src/routes/logs.rs +++ b/meilisearch/src/routes/logs.rs @@ -22,14 +22,15 @@ use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::LogRouteHandle; +use crate::{LogRouteHandle, LogStderrHandle}; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("stream") .route(web::post().to(SeqHandler(get_logs))) .route(web::delete().to(SeqHandler(cancel_logs))), - ); + ) + .service(web::resource("stderr").route(web::post().to(SeqHandler(update_stderr_target)))); } #[derive(Debug, Default, Clone, Copy, Deserr, PartialEq, Eq)] @@ -37,6 +38,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { pub enum LogMode { #[default] Human, + Json, Profile, } @@ -165,7 +167,18 @@ fn make_layer< let fmt_layer = tracing_subscriber::fmt::layer() .with_writer(move || LogWriter { sender: sender.clone() }) - .with_span_events(tracing_subscriber::fmt::format::FmtSpan::ACTIVE); + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE); + + let stream = byte_stream(receiver, guard); + (Box::new(fmt_layer) as Box + Send + 
Sync>, Box::pin(stream)) + } + LogMode::Json => { + let (sender, receiver) = tokio::sync::mpsc::unbounded_channel(); + + let fmt_layer = tracing_subscriber::fmt::layer() + .with_writer(move || LogWriter { sender: sender.clone() }) + .json() + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE); let stream = byte_stream(receiver, guard); (Box::new(fmt_layer) as Box + Send + Sync>, Box::pin(stream)) @@ -279,3 +292,27 @@ pub async fn cancel_logs( Ok(HttpResponse::NoContent().finish()) } + +#[derive(Debug, Deserr)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +pub struct UpdateStderrLogs { + #[deserr(default = "info".parse().unwrap(), try_from(&String) = MyTargets::from_str -> DeserrJsonError)] + target: MyTargets, +} + +pub async fn update_stderr_target( + index_scheduler: GuardedData, Data>, + logs: Data, + body: AwebJson, +) -> Result { + index_scheduler.features().check_logs_route()?; + + let opt = body.into_inner(); + + logs.modify(|layer| { + *layer.filter_mut() = opt.target.0.clone(); + }) + .unwrap(); + + Ok(HttpResponse::NoContent().finish()) +} diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 249103e12..1c1465582 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -359,12 +359,18 @@ async fn get_version( ) -> HttpResponse { analytics.publish("Version Seen".to_string(), json!(null), Some(&req)); - let commit_sha = option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"); - let commit_date = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP").unwrap_or("unknown"); + let build_info = build_info::BuildInfo::from_build(); HttpResponse::Ok().json(VersionResponse { - commit_sha: commit_sha.to_string(), - commit_date: commit_date.to_string(), + commit_sha: build_info.commit_sha1.unwrap_or("unknown").to_string(), + commit_date: build_info + .commit_timestamp + .and_then(|commit_timestamp| { + commit_timestamp + .format(&time::format_description::well_known::Iso8601::DEFAULT) + .ok() + }) + .unwrap_or("unknown".into()), pkg_version: env!("CARGO_PKG_VERSION").to_string(), }) } diff --git a/meilisearch/tests/common/server.rs b/meilisearch/tests/common/server.rs index 134124cc8..41607f76d 100644 --- a/meilisearch/tests/common/server.rs +++ b/meilisearch/tests/common/server.rs @@ -9,7 +9,7 @@ use actix_web::http::StatusCode; use byte_unit::{Byte, ByteUnit}; use clap::Parser; use meilisearch::option::{IndexerOpts, MaxMemory, Opt}; -use meilisearch::{analytics, create_app, setup_meilisearch}; +use meilisearch::{analytics, create_app, setup_meilisearch, SubscriberForSecondLayer}; use once_cell::sync::Lazy; use tempfile::TempDir; use tokio::time::sleep; @@ -87,12 +87,20 @@ impl Server { tracing_subscriber::reload::Layer::new(None.with_filter( tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF), )); + let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new( + (Box::new( + tracing_subscriber::fmt::layer() + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE), + ) + as Box + Send + Sync>) + .with_filter(tracing_subscriber::filter::Targets::new()), + ); actix_web::test::init_service(create_app( self.service.index_scheduler.clone().into(), self.service.auth.clone().into(), self.service.options.clone(), - route_layer_handle, + (route_layer_handle, stderr_layer_handle), analytics::MockAnalytics::new(&self.service.options), true, )) diff --git a/meilisearch/tests/common/service.rs b/meilisearch/tests/common/service.rs index 4c23a18d8..cd78253aa 
100644 --- a/meilisearch/tests/common/service.rs +++ b/meilisearch/tests/common/service.rs @@ -5,7 +5,7 @@ use actix_web::http::StatusCode; use actix_web::test; use actix_web::test::TestRequest; use index_scheduler::IndexScheduler; -use meilisearch::{analytics, create_app, Opt}; +use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer}; use meilisearch_auth::AuthController; use tracing::level_filters::LevelFilter; use tracing_subscriber::Layer; @@ -111,12 +111,20 @@ impl Service { tracing_subscriber::reload::Layer::new(None.with_filter( tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF), )); + let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new( + (Box::new( + tracing_subscriber::fmt::layer() + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE), + ) + as Box + Send + Sync>) + .with_filter(tracing_subscriber::filter::Targets::new()), + ); let app = test::init_service(create_app( self.index_scheduler.clone().into(), self.auth.clone().into(), self.options.clone(), - route_layer_handle, + (route_layer_handle, stderr_layer_handle), analytics::MockAnalytics::new(&self.options), true, )) diff --git a/meilisearch/tests/documents/update_documents.rs b/meilisearch/tests/documents/update_documents.rs index b4f61bf99..a5d466513 100644 --- a/meilisearch/tests/documents/update_documents.rs +++ b/meilisearch/tests/documents/update_documents.rs @@ -1,4 +1,4 @@ -use meili_snap::snapshot; +use meili_snap::{json_string, snapshot}; use crate::common::encoder::Encoder; use crate::common::{GetAllDocumentsOptions, Server}; @@ -209,3 +209,93 @@ async fn error_update_documents_missing_document_id() { "https://docs.meilisearch.com/errors#missing_document_id" ); } + +#[actix_rt::test] +async fn update_faceted_document() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "rankingRules": ["facet:asc"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(0).await; + + let documents: Vec<_> = (0..1000) + .map(|id| { + json!({ + "doc_id": id, + "facet": (id/3), + }) + }) + .collect(); + + let (_response, code) = index.add_documents(documents.into(), None).await; + assert_eq!(code, 202); + + index.wait_task(1).await; + + let documents = json!([ + { + "doc_id": 9, + "facet": 1.5, + } + ]); + + let (response, code) = index.update_documents(documents, None).await; + assert_eq!(code, 202, "response: {}", response); + + index.wait_task(2).await; + + index + .search(json!({"limit": 10}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "doc_id": 0, + "facet": 0 + }, + { + "doc_id": 1, + "facet": 0 + }, + { + "doc_id": 2, + "facet": 0 + }, + { + "doc_id": 3, + "facet": 1 + }, + { + "doc_id": 4, + "facet": 1 + }, + { + "doc_id": 5, + "facet": 1 + }, + { + "doc_id": 9, + "facet": 1.5 + }, + { + "doc_id": 6, + "facet": 2 + }, + { + "doc_id": 7, + "facet": 2 + }, + { + "doc_id": 8, + "facet": 2 + } + ] + "###); + }) + .await; +} diff --git a/meilisearch/tests/logs/error.rs b/meilisearch/tests/logs/error.rs index 4f4d741e3..93dcccd66 100644 --- a/meilisearch/tests/logs/error.rs +++ b/meilisearch/tests/logs/error.rs @@ -89,7 +89,7 @@ async fn logs_stream_bad_mode() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Unknown value `tamo` at `.mode`: expected one of `human`, `profile`", + "message": "Unknown value `tamo` at `.mode`: expected 
one of `human`, `json`, `profile`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" @@ -146,7 +146,7 @@ async fn logs_stream_bad_profile_memory() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Unknown value `fmt` at `.mode`: expected one of `human`, `profile`", + "message": "Unknown value `fmt` at `.mode`: expected one of `human`, `json`, `profile`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" @@ -162,7 +162,7 @@ async fn logs_stream_without_enabling_the_route() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "getting logs through the `/logs/stream` route requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721", + "message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721", "code": "feature_not_enabled", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#feature_not_enabled" @@ -173,7 +173,18 @@ async fn logs_stream_without_enabling_the_route() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "getting logs through the `/logs/stream` route requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721", + "message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + let (response, code) = server.service.post("/logs/stderr", json!({})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. 
See https://github.com/orgs/meilisearch/discussions/721", "code": "feature_not_enabled", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#feature_not_enabled" diff --git a/meilisearch/tests/logs/mod.rs b/meilisearch/tests/logs/mod.rs index 0002fe33c..3b36d78f8 100644 --- a/meilisearch/tests/logs/mod.rs +++ b/meilisearch/tests/logs/mod.rs @@ -5,7 +5,7 @@ use std::str::FromStr; use actix_web::http::header::ContentType; use meili_snap::snapshot; -use meilisearch::{analytics, create_app, Opt}; +use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer}; use tracing::level_filters::LevelFilter; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::Layer; @@ -27,18 +27,25 @@ async fn basic_test_log_stream_route() { tracing_subscriber::reload::Layer::new(None.with_filter( tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF), )); + let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new( + (Box::new( + tracing_subscriber::fmt::layer() + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE), + ) as Box + Send + Sync>) + .with_filter(tracing_subscriber::filter::Targets::new()), + ); let subscriber = tracing_subscriber::registry().with(route_layer).with( tracing_subscriber::fmt::layer() .with_span_events(tracing_subscriber::fmt::format::FmtSpan::ACTIVE) - .with_filter(tracing_subscriber::filter::LevelFilter::from_str("INFO").unwrap()), + .with_filter(tracing_subscriber::filter::LevelFilter::from_str("OFF").unwrap()), ); let app = actix_web::test::init_service(create_app( server.service.index_scheduler.clone().into(), server.service.auth.clone().into(), server.service.options.clone(), - route_layer_handle, + (route_layer_handle, stderr_layer_handle), analytics::MockAnalytics::new(&server.service.options), true, )) diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index a18a93edb..b01ef3d5a 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use actix_http::body::MessageBody; use actix_web::dev::{ServiceFactory, ServiceResponse}; use actix_web::web::{Bytes, Data}; -use actix_web::{post, App, HttpResponse, HttpServer}; +use actix_web::{post, App, HttpRequest, HttpResponse, HttpServer}; use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tokio::sync::mpsc; @@ -17,7 +17,17 @@ use crate::common::{default_settings, Server}; use crate::json; #[post("/")] -async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { +async fn forward_body( + req: HttpRequest, + sender: Data>>, + body: Bytes, +) -> HttpResponse { + let headers = req.headers(); + assert_eq!(headers.get("content-type").unwrap(), "application/x-ndjson"); + assert_eq!(headers.get("transfer-encoding").unwrap(), "chunked"); + assert_eq!(headers.get("accept-encoding").unwrap(), "gzip"); + assert_eq!(headers.get("content-encoding").unwrap(), "gzip"); + let body = body.to_vec(); sender.send(body).unwrap(); HttpResponse::Ok().into() diff --git a/meilitool/src/uuid_codec.rs b/meilitool/src/uuid_codec.rs index 54020fa3c..92dc70b0c 100644 --- a/meilitool/src/uuid_codec.rs +++ b/meilitool/src/uuid_codec.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::convert::TryInto; use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode}; use uuid::Uuid; diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 4bc05d2cc..1dfa495ea 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ bincode = "1.3.3" 
bstr = "1.9.0" bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.8.5", default-features = false } +charabia = { version = "0.8.7", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.11" deserr = "0.6.1" @@ -70,13 +70,13 @@ itertools = "0.11.0" # profiling puffin = "0.16.0" -# logging -logging_timer = "1.1.0" csv = "1.3.0" candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } candle-transformers = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } -tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = ["onig"] } +tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = [ + "onig", +] } hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [ "online", ] } @@ -102,7 +102,16 @@ meili-snap = { path = "../meili-snap" } rand = { version = "0.8.5", features = ["small_rng"] } [features] -all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek", "charabia/khmer"] +all-tokenizations = [ + "charabia/chinese", + "charabia/hebrew", + "charabia/japanese", + "charabia/thai", + "charabia/korean", + "charabia/greek", + "charabia/khmer", + "charabia/vietnamese", +] # Use POSIX semaphores instead of SysV semaphores in LMDB # For more information on this feature, see heed's Cargo.toml @@ -130,5 +139,7 @@ greek = ["charabia/greek"] # allow khmer specialized tokenization khmer = ["charabia/khmer"] +vietnamese = ["charabia/vietnamese"] + # allow CUDA support, see cuda = ["candle-core/cuda"] diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index e7bafaf70..02528e378 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -15,7 +15,7 @@ pub struct BucketSortOutput { // TODO: would probably be good to regroup some of these inside of a struct? 
#[allow(clippy::too_many_arguments)] -#[logging_timer::time] +#[tracing::instrument(level = "trace", skip_all, target = "search::bucket_sort")] pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, mut ranking_rules: Vec>, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 7b3b1d5b2..ae661e3f6 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -191,7 +191,7 @@ fn resolve_maximally_reduced_query_graph( Ok(docids) } -#[logging_timer::time] +#[tracing::instrument(level = "trace", skip_all, target = "search")] fn resolve_universe( ctx: &mut SearchContext, initial_universe: &RoaringBitmap, @@ -557,7 +557,7 @@ pub fn execute_vector_search( } #[allow(clippy::too_many_arguments)] -#[logging_timer::time] +#[tracing::instrument(level = "trace", skip_all, target = "search")] pub fn execute_search( ctx: &mut SearchContext, query: Option<&str>, @@ -577,6 +577,9 @@ pub fn execute_search( let mut located_query_terms = None; let query_terms = if let Some(query) = query { + let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder"); + let entered = span.enter(); + // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. let mut tokbuilder = TokenizerBuilder::new(); @@ -605,7 +608,12 @@ pub fn execute_search( } let tokenizer = tokbuilder.build(); + drop(entered); + + let span = tracing::trace_span!(target: "search::tokens", "tokenize"); + let entered = span.enter(); let tokens = tokenizer.tokenize(query); + drop(entered); let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?; if query_terms.is_empty() { diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index d5dfbbcd0..02754929a 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -6,9 +6,10 @@ use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use heed::types::DecodeIgnore; -use super::*; +use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; -use crate::search::new::query_term::TwoTypoTerm; +use crate::search::new::interner::{DedupInterner, Interned}; +use crate::search::new::query_term::{Lazy, TwoTypoTerm}; use crate::search::new::{limits, SearchContext}; use crate::search::{build_dfa, get_first}; use crate::{Result, MAX_WORD_LENGTH}; diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 6760c8be7..a37e60ed0 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -7,7 +7,6 @@ use std::collections::BTreeSet; use std::iter::FromIterator; use std::ops::RangeInclusive; -use compute_derivations::partially_initialized_term_from_word; use either::Either; pub use ntypo_subset::NTypoTermSubset; pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed}; diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 865075d97..ea997a41a 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -1,11 +1,15 @@ +use std::collections::BTreeSet; + use charabia::normalizer::NormalizedTokenIter; use charabia::{SeparatorKind, TokenKind}; -use super::*; +use 
super::compute_derivations::partially_initialized_term_from_word; +use super::{LocatedQueryTerm, ZeroTypoTerm}; +use crate::search::new::query_term::{Lazy, Phrase, QueryTerm}; use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. -#[logging_timer::time] +#[tracing::instrument(level = "trace", skip_all, target = "search::query")] pub fn located_query_terms_from_tokens( ctx: &mut SearchContext, query: NormalizedTokenIter, @@ -225,7 +229,7 @@ pub fn make_ngram( } struct PhraseBuilder { - words: Vec>>, + words: Vec>>, start: u16, end: u16, } diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 3bd4cf5f5..888b1c4eb 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,7 +1,7 @@ use std::fs::File; use std::io::BufReader; -use grenad::CompressionType; +use grenad::{CompressionType, Merger}; use heed::types::Bytes; use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn}; use roaring::RoaringBitmap; @@ -14,6 +14,7 @@ use crate::heed_codec::facet::{ use crate::heed_codec::BytesRefCodec; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; +use crate::update::MergeFn; use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result}; /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases @@ -28,7 +29,7 @@ pub struct FacetsUpdateBulk<'i> { facet_type: FacetType, field_ids: Vec, // None if level 0 does not need to be updated - delta_data: Option>>, + delta_data: Option, MergeFn>>, } impl<'i> FacetsUpdateBulk<'i> { @@ -36,7 +37,7 @@ impl<'i> FacetsUpdateBulk<'i> { index: &'i Index, field_ids: Vec, facet_type: FacetType, - delta_data: grenad::Reader>, + delta_data: Merger, MergeFn>, group_size: u8, min_level_size: u8, ) -> FacetsUpdateBulk<'i> { @@ -65,7 +66,7 @@ impl<'i> FacetsUpdateBulk<'i> { } } - #[logging_timer::time("FacetsUpdateBulk::{}")] + #[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::bulk")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self; @@ -89,7 +90,7 @@ impl<'i> FacetsUpdateBulk<'i> { /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type pub(crate) struct FacetsUpdateBulkInner { pub db: heed::Database, FacetGroupValueCodec>, - pub delta_data: Option>, + pub delta_data: Option>, pub group_size: u8, pub min_level_size: u8, } @@ -129,8 +130,8 @@ impl FacetsUpdateBulkInner { if self.db.is_empty(wtxn)? { let mut buffer = Vec::new(); let mut database = self.db.iter_mut(wtxn)?.remap_types::(); - let mut cursor = delta_data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = delta_data.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { if !valid_lmdb_key(key) { continue; } @@ -154,8 +155,8 @@ impl FacetsUpdateBulkInner { let mut buffer = Vec::new(); let database = self.db.remap_types::(); - let mut cursor = delta_data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = delta_data.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? 
{ if !valid_lmdb_key(key) { continue; } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 78db218e3..798e0fe3d 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,6 +1,7 @@ use std::fs::File; use std::io::BufReader; +use grenad::Merger; use heed::types::{Bytes, DecodeIgnore}; use heed::{BytesDecode, Error, RoTxn, RwTxn}; use obkv::KvReader; @@ -14,31 +15,56 @@ use crate::heed_codec::BytesRefCodec; use crate::search::facet::get_highest_level; use crate::update::del_add::DelAdd; use crate::update::index_documents::valid_lmdb_key; +use crate::update::MergeFn; use crate::{CboRoaringBitmapCodec, Index, Result}; -enum InsertionResult { +/// Enum used as a return value for the facet incremental indexing. +/// +/// - `ModificationResult::InPlace` means that modifying the `facet_value` into the `level` did not have +/// an effect on the number of keys in that level. Therefore, it did not increase the number of children +/// of the parent node. +/// +/// - `ModificationResult::Insert` means that modifying the `facet_value` into the `level` resulted +/// in the addition of a new key in that level, and that therefore the number of children +/// of the parent node should be incremented. +/// +/// - `ModificationResult::Remove` means that modifying the `facet_value` into the `level` resulted in a change in the +/// number of keys in the level. For example, removing a document id from the facet value `3` could +/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted +/// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must +/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. +/// +/// - `ModificationResult::Reduce/Expand` means that modifying the `facet_value` into the `level` resulted in a change in the +/// bounds of the keys of the level. For example, removing a document id from the facet value +/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, +/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). +/// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust +/// its left bound as well. +/// +/// - `ModificationResult::Nothing` means that modifying the `facet_value` didn't have any impact into the `level`. +/// This case is reachable when a document id is removed from a sub-level node but is still present in another one. +/// For example, removing `2` from a document containing `2` and `3`, the document id will removed form the `level 0` but should remain in the group node [1..4] in `level 1`. +enum ModificationResult { InPlace, Expand, Insert, -} -enum DeletionResult { - InPlace, Reduce { next: Option> }, Remove { next: Option> }, + Nothing, } /// Algorithm to incrementally insert and delete elememts into the /// `facet_id_(string/f64)_docids` databases. 
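To make the `ModificationResult` contract documented above concrete, here is a reduced sketch of how a parent level would adjust its child count based on what the level below reports; it mirrors the documentation rather than the actual milli code:

```rust
/// Only `Insert` and `Remove` change how many children the parent node has;
/// the other variants only touch bitmaps or key bounds.
#[allow(dead_code)]
enum ModificationResult {
    InPlace,
    Expand,
    Insert,
    Reduce { next: Option<Vec<u8>> },
    Remove { next: Option<Vec<u8>> },
    Nothing,
}

fn child_count_delta(result: &ModificationResult) -> i64 {
    match result {
        // A new key appeared in the level below: one more child.
        ModificationResult::Insert => 1,
        // A key disappeared from the level below: one child fewer.
        ModificationResult::Remove { .. } => -1,
        // Bitmap or bound updates leave the child count untouched.
        ModificationResult::InPlace
        | ModificationResult::Expand
        | ModificationResult::Reduce { .. }
        | ModificationResult::Nothing => 0,
    }
}

fn main() {
    assert_eq!(child_count_delta(&ModificationResult::Insert), 1);
    assert_eq!(child_count_delta(&ModificationResult::Remove { next: None }), -1);
    assert_eq!(child_count_delta(&ModificationResult::Reduce { next: None }), 0);
}
```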
pub struct FacetsUpdateIncremental { inner: FacetsUpdateIncrementalInner, - delta_data: grenad::Reader>, + delta_data: Merger, MergeFn>, } impl FacetsUpdateIncremental { pub fn new( index: &Index, facet_type: FacetType, - delta_data: grenad::Reader>, + delta_data: Merger, MergeFn>, group_size: u8, min_level_size: u8, max_group_size: u8, @@ -61,34 +87,59 @@ impl FacetsUpdateIncremental { } } + #[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::incremental")] pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> { - let mut cursor = self.delta_data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut current_field_id = None; + let mut facet_level_may_be_updated = false; + let mut iter = self.delta_data.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { if !valid_lmdb_key(key) { continue; } + let key = FacetGroupKeyCodec::::bytes_decode(key) .map_err(heed::Error::Encoding)?; - let value = KvReader::new(value); + if facet_level_may_be_updated + && current_field_id.map_or(false, |fid| fid != key.field_id) + { + // Only add or remove a level after making all the field modifications. + self.inner.add_or_delete_level(wtxn, current_field_id.unwrap())?; + facet_level_may_be_updated = false; + } + current_field_id = Some(key.field_id); + + let value = KvReader::new(value); let docids_to_delete = value .get(DelAdd::Deletion) .map(CboRoaringBitmapCodec::bytes_decode) - .map(|o| o.map_err(heed::Error::Encoding)); + .map(|o| o.map_err(heed::Error::Encoding)) + .transpose()?; let docids_to_add = value .get(DelAdd::Addition) .map(CboRoaringBitmapCodec::bytes_decode) - .map(|o| o.map_err(heed::Error::Encoding)); + .map(|o| o.map_err(heed::Error::Encoding)) + .transpose()?; - if let Some(docids_to_delete) = docids_to_delete { - let docids_to_delete = docids_to_delete?; - self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?; + let level_size_changed = self.inner.modify( + wtxn, + key.field_id, + key.left_bound, + docids_to_add.as_ref(), + docids_to_delete.as_ref(), + )?; + + if level_size_changed { + // if a node has been added or removed from the highest level, + // we may have to update the facet level. 
+ facet_level_may_be_updated = true; } + } - if let Some(docids_to_add) = docids_to_add { - let docids_to_add = docids_to_add?; - self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?; + if let Some(field_id) = current_field_id { + if facet_level_may_be_updated { + self.inner.add_or_delete_level(wtxn, field_id)?; } } @@ -162,138 +213,78 @@ impl FacetsUpdateIncrementalInner { /// /// ## Return /// See documentation of `insert_in_level` - fn insert_in_level_0( + fn modify_in_level_0( &self, txn: &mut RwTxn, field_id: u16, facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { + add_docids: Option<&RoaringBitmap>, + del_docids: Option<&RoaringBitmap>, + ) -> Result { let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; - let value = FacetGroupValue { bitmap: docids.clone(), size: 1 }; - let mut level0_prefix = vec![]; - level0_prefix.extend_from_slice(&field_id.to_be_bytes()); - level0_prefix.push(0); - - let mut iter = - self.db.remap_types::().prefix_iter(txn, &level0_prefix)?; - - if iter.next().is_none() { - drop(iter); - self.db.put(txn, &key, &value)?; - Ok(InsertionResult::Insert) - } else { - drop(iter); - let old_value = self.db.get(txn, &key)?; - match old_value { - Some(mut updated_value) => { - // now merge the two - updated_value.bitmap |= value.bitmap; - self.db.put(txn, &key, &updated_value)?; - Ok(InsertionResult::InPlace) - } - None => { + let old_value = self.db.get(txn, &key)?; + match (old_value, add_docids, del_docids) { + // Addition + deletion on an existing value + (Some(FacetGroupValue { bitmap, .. }), Some(add_docids), Some(del_docids)) => { + let value = FacetGroupValue { bitmap: (bitmap - del_docids) | add_docids, size: 1 }; + self.db.put(txn, &key, &value)?; + Ok(ModificationResult::InPlace) + } + // Addition on an existing value + (Some(FacetGroupValue { bitmap, .. }), Some(add_docids), None) => { + let value = FacetGroupValue { bitmap: bitmap | add_docids, size: 1 }; + self.db.put(txn, &key, &value)?; + Ok(ModificationResult::InPlace) + } + // Addition of a new value (ignore deletion) + (None, Some(add_docids), _) => { + let value = FacetGroupValue { bitmap: add_docids.clone(), size: 1 }; + self.db.put(txn, &key, &value)?; + Ok(ModificationResult::Insert) + } + // Deletion on an existing value, fully delete the key if the resulted value is empty. + (Some(FacetGroupValue { mut bitmap, .. }), None, Some(del_docids)) => { + bitmap -= del_docids; + if bitmap.is_empty() { + // Full deletion + let mut next_key = None; + if let Some((next, _)) = + self.db.remap_data_type::().get_greater_than(txn, &key)? + { + if next.field_id == field_id && next.level == 0 { + next_key = Some(next.left_bound.to_vec()); + } + } + self.db.delete(txn, &key)?; + Ok(ModificationResult::Remove { next: next_key }) + } else { + // Partial deletion + let value = FacetGroupValue { bitmap, size: 1 }; self.db.put(txn, &key, &value)?; - Ok(InsertionResult::Insert) + Ok(ModificationResult::InPlace) } } + // Otherwise do nothing (None + no addition + deletion == Some + no addition + no deletion == Nothing), + // may be unreachable at some point. + (None, None, _) | (Some(_), None, None) => Ok(ModificationResult::Nothing), } } - /// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`. - /// This function works recursively. + /// Split a level node into two balanced nodes. /// - /// ## Return - /// Returns the effect of adding the facet value to the database on the given `level`. 
- /// - /// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have - /// an effect on the number of keys in that level. Therefore, it did not increase the number of children - /// of the parent node. - /// - /// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted - /// in the addition of a new key in that level, and that therefore the number of children - /// of the parent node should be incremented. - fn insert_in_level( + /// # Return + /// Returns `ModificationResult::Insert` if the split is successful. + fn split_group( &self, txn: &mut RwTxn, field_id: u16, level: u8, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { - if level == 0 { - return self.insert_in_level_0(txn, field_id, facet_value, docids); - } - - let max_group_size = self.max_group_size; - - let result = self.insert_in_level(txn, field_id, level - 1, facet_value, docids)?; - // level below inserted an element - - let (insertion_key, insertion_value) = - self.find_insertion_key_value(field_id, level, facet_value, txn)?; - - match result { - // because we know that we inserted in place, the facet_value is not a new one - // thus it doesn't extend a group, and thus the insertion key computed above is - // still correct - InsertionResult::InPlace => { - let mut updated_value = insertion_value; - updated_value.bitmap |= docids; - self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - - return Ok(InsertionResult::InPlace); - } - InsertionResult::Expand => {} - InsertionResult::Insert => {} - } - - // Here we know that inserting the facet value in the level below resulted in the creation - // of a new key. Therefore, it may be the case that we need to modify the left bound of the - // insertion key (see documentation of `find_insertion_key_value` for an example of when that - // could happen). - let (insertion_key, insertion_key_was_modified) = { - let mut new_insertion_key = insertion_key.clone(); - let mut key_should_be_modified = false; - - if facet_value < insertion_key.left_bound.as_slice() { - new_insertion_key.left_bound = facet_value.to_vec(); - key_should_be_modified = true; - } - if key_should_be_modified { - let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; - assert!(is_deleted); - self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; - } - (new_insertion_key, key_should_be_modified) - }; - // Now we know that the insertion key contains the `facet_value`. - - // We still need to update the insertion value by: - // 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`) - // 2. Merge the previous docids with the new one - let mut updated_value = insertion_value; - - if matches!(result, InsertionResult::Insert) { - updated_value.size += 1; - } - - if updated_value.size < max_group_size { - updated_value.bitmap |= docids; - self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - if insertion_key_was_modified { - return Ok(InsertionResult::Expand); - } else { - return Ok(InsertionResult::InPlace); - } - } - - // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` - // Therefore it must be split into two nodes. 
- - let size_left = updated_value.size / 2; - let size_right = updated_value.size - size_left; + insertion_key: FacetGroupKey>, + insertion_value: FacetGroupValue, + ) -> Result { + let size_left = insertion_value.size / 2; + let size_right = insertion_value.size - size_left; let level_below = level - 1; @@ -347,34 +338,228 @@ impl FacetsUpdateIncrementalInner { self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; - Ok(InsertionResult::Insert) + Ok(ModificationResult::Insert) } - /// Insert the given facet value and corresponding document ids in the database. - pub fn insert( + /// Remove the docids still present in the related sub-level nodes from the del_docids. + /// + /// This process is needed to avoid removing docids from a group node where the docid is present in several sub-nodes. + fn trim_del_docids<'a>( + &self, + txn: &mut RwTxn, + field_id: u16, + level: u8, + insertion_key: &FacetGroupKey>, + insertion_value_size: usize, + del_docids: &'a RoaringBitmap, + ) -> Result> { + let level_below = level - 1; + + let start_key = FacetGroupKey { + field_id, + level: level_below, + left_bound: insertion_key.left_bound.as_slice(), + }; + + let mut del_docids = std::borrow::Cow::Borrowed(del_docids); + let iter = self.db.range(txn, &(start_key..))?.take(insertion_value_size); + for next in iter { + let (_, value) = next?; + // if a sublevel bitmap as common docids with del_docids, + // then these docids shouldn't be removed and so, remove them from the deletion list. + if !value.bitmap.is_disjoint(&del_docids) { + *del_docids.to_mut() -= value.bitmap; + } + } + + Ok(del_docids) + } + + /// Modify the given facet value and corresponding document ids in all the levels of the database up to the given `level`. + /// This function works recursively. + /// + /// ## Return + /// Returns the effect of modifying the facet value to the database on the given `level`. + /// + fn modify_in_level( + &self, + txn: &mut RwTxn, + field_id: u16, + level: u8, + facet_value: &[u8], + add_docids: Option<&RoaringBitmap>, + del_docids: Option<&RoaringBitmap>, + ) -> Result { + if level == 0 { + return self.modify_in_level_0(txn, field_id, facet_value, add_docids, del_docids); + } + + let result = + self.modify_in_level(txn, field_id, level - 1, facet_value, add_docids, del_docids)?; + // level below inserted an element + + if let ModificationResult::Nothing = result { + // if the previous level has not been modified, + // early return ModificationResult::Nothing. + return Ok(ModificationResult::Nothing); + } + + let (insertion_key, insertion_value) = + self.find_insertion_key_value(field_id, level, facet_value, txn)?; + let insertion_value_size = insertion_value.size as usize; + + let mut insertion_value_was_modified = false; + let mut updated_value = insertion_value; + + if let ModificationResult::Insert = result { + // if a key has been inserted in the sub-level raise the value size. + updated_value.size += 1; + insertion_value_was_modified = true; + } else if let ModificationResult::Remove { .. } = result { + if updated_value.size <= 1 { + // if the only remaining node is the one to delete, + // delete the key instead and early return. 
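
The `trim_del_docids` helper above exists because a docid can appear under several children of the same group node: it may only be removed from the group once no child references it anymore. A minimal sketch of that trimming with a `Cow` to avoid cloning when nothing changes (`trim_deletions` and `sub_node_bitmaps` are illustrative stand-ins for the LMDB range iteration):

use std::borrow::Cow;
use roaring::RoaringBitmap;

/// Keep only the docids that are not present in any sibling sub-node,
/// cloning the deletion bitmap lazily.
fn trim_deletions<'a>(
    del_docids: &'a RoaringBitmap,
    sub_node_bitmaps: &[RoaringBitmap],
) -> Cow<'a, RoaringBitmap> {
    let mut del = Cow::Borrowed(del_docids);
    for bitmap in sub_node_bitmaps {
        // A docid still referenced by a sibling sub-node must stay in the group
        // node, so it is removed from the deletion set.
        if !bitmap.is_disjoint(&del) {
            *del.to_mut() -= bitmap;
        }
    }
    del
}

fn main() {
    let del: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
    let siblings = vec![[2u32].into_iter().collect::<RoaringBitmap>()];
    let trimmed = trim_deletions(&del, &siblings);
    // 2 is still present in a sibling, so only 1 and 3 remain deletable.
    assert_eq!(trimmed.iter().collect::<Vec<_>>(), vec![1, 3]);
}
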
+ let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + return Ok(result); + } else { + // Reduce the value size + updated_value.size -= 1; + insertion_value_was_modified = true; + } + } + + let (insertion_key, insertion_key_modification) = + if let ModificationResult::InPlace = result { + (insertion_key, ModificationResult::InPlace) + } else { + // Inserting or deleting the facet value in the level below resulted in the creation + // of a new key. Therefore, it may be the case that we need to modify the left bound of the + // insertion key (see documentation of `find_insertion_key_value` for an example of when that + // could happen). + let mut new_insertion_key = insertion_key.clone(); + let mut key_modification = ModificationResult::InPlace; + + if let ModificationResult::Remove { next } | ModificationResult::Reduce { next } = + result + { + // if the deleted facet_value is the left_bound of the current node, + // the left_bound should be updated reducing the current node. + let reduced_range = facet_value == insertion_key.left_bound; + if reduced_range { + new_insertion_key.left_bound = next.clone().unwrap(); + key_modification = ModificationResult::Reduce { next }; + } + } else if facet_value < insertion_key.left_bound.as_slice() { + // if the added facet_value is the under the left_bound of the current node, + // the left_bound should be updated expanding the current node. + new_insertion_key.left_bound = facet_value.to_vec(); + key_modification = ModificationResult::Expand; + } + + if matches!( + key_modification, + ModificationResult::Expand | ModificationResult::Reduce { .. } + ) { + // if the node should be updated, delete it, it will be recreated using a new key later. + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + } + (new_insertion_key, key_modification) + }; + + if updated_value.size < self.max_group_size { + // If there are docids to delete, trim them avoiding unexpected removal. + if let Some(del_docids) = del_docids + .map(|ids| { + self.trim_del_docids( + txn, + field_id, + level, + &insertion_key, + insertion_value_size, + ids, + ) + }) + .transpose()? + .filter(|ids| !ids.is_empty()) + { + updated_value.bitmap -= &*del_docids; + insertion_value_was_modified = true; + } + + if let Some(add_docids) = add_docids { + updated_value.bitmap |= add_docids; + insertion_value_was_modified = true; + } + + if insertion_value_was_modified + || matches!( + insertion_key_modification, + ModificationResult::Expand | ModificationResult::Reduce { .. } + ) + { + // if any modification occured, insert it in the database. + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; + Ok(insertion_key_modification) + } else { + // this case is reachable when a docid is removed from a sub-level node but is still present in another one. + // For instance, a document containing 2 and 3, if 2 is removed, the docid should remain in the group node [1..4]. + Ok(ModificationResult::Nothing) + } + } else { + // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` + // Therefore it must be split into two nodes. + self.split_group(txn, field_id, level, insertion_key, updated_value) + } + } + + /// Modify the given facet value and corresponding document ids in the database. + /// If no more document ids correspond to the facet value, delete it completely. 
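
The left-bound bookkeeping above is the subtle part of `modify_in_level`: an insertion below the current left bound expands the node, while deleting the value that is the left bound reduces it to the next remaining key. A small sketch of just that decision (the `BoundChange` enum and `new_left_bound` function are illustrative, not milli APIs):

/// Outcome of re-deriving a group node's left bound (illustrative only).
#[derive(Debug, PartialEq)]
enum BoundChange {
    Unchanged,
    Expand(Vec<u8>),         // an added value becomes the new, smaller left bound
    Reduce(Option<Vec<u8>>), // the old left bound disappeared, use the next key
}

/// Decide how the left bound of a node changes after touching `touched_value`.
fn new_left_bound(
    current: &[u8],
    touched_value: &[u8],
    value_was_removed: bool,
    next_left_bound: Option<&[u8]>,
) -> BoundChange {
    if value_was_removed {
        if touched_value == current {
            // The bound itself is gone: shrink the node to the next remaining key.
            BoundChange::Reduce(next_left_bound.map(|bound| bound.to_vec()))
        } else {
            BoundChange::Unchanged
        }
    } else if touched_value < current {
        // A value below the bound was added: widen the node to include it.
        BoundChange::Expand(touched_value.to_vec())
    } else {
        BoundChange::Unchanged
    }
}

fn main() {
    // Adding b"apple" under a node whose left bound is b"banana" expands it.
    assert_eq!(
        new_left_bound(b"banana", b"apple", false, None),
        BoundChange::Expand(b"apple".to_vec()),
    );
    // Removing the left bound itself shifts the node to the next remaining key.
    assert_eq!(
        new_left_bound(b"banana", b"banana", true, Some(&b"cherry"[..])),
        BoundChange::Reduce(Some(b"cherry".to_vec())),
    );
}
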
+ /// + /// ## Return + /// Returns `true` if some tree-nodes of the highest level have been removed or added implying a potential + /// addition or deletion of a facet level. + /// Otherwise returns `false` if the tree-nodes have been modified in place. + pub fn modify( &self, txn: &mut RwTxn, field_id: u16, facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result<()> { - if docids.is_empty() { - return Ok(()); + add_docids: Option<&RoaringBitmap>, + del_docids: Option<&RoaringBitmap>, + ) -> Result { + if add_docids.map_or(true, RoaringBitmap::is_empty) + && del_docids.map_or(true, RoaringBitmap::is_empty) + { + return Ok(false); } - let group_size = self.group_size; let highest_level = get_highest_level(txn, self.db, field_id)?; - let result = self.insert_in_level(txn, field_id, highest_level, facet_value, docids)?; + let result = self.modify_in_level( + txn, + field_id, + highest_level, + facet_value, + add_docids, + del_docids, + )?; match result { - InsertionResult::InPlace => return Ok(()), - InsertionResult::Expand => return Ok(()), - InsertionResult::Insert => {} + ModificationResult::InPlace + | ModificationResult::Expand + | ModificationResult::Nothing + | ModificationResult::Reduce { .. } => Ok(false), + ModificationResult::Insert | ModificationResult::Remove { .. } => Ok(true), } + } - // Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`. - // If it has, we must build an addition level above it. - + /// Check whether the highest level has exceeded `min_level_size` * `self.group_size`. + /// If it has, we must build an addition level above it. + /// Then check whether the highest level is under `min_level_size`. + /// If it has, we must remove the complete level. + pub(crate) fn add_or_delete_level(&self, txn: &mut RwTxn, field_id: u16) -> Result<()> { + let highest_level = get_highest_level(txn, self.db, field_id)?; let mut highest_level_prefix = vec![]; highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); highest_level_prefix.push(highest_level); @@ -382,14 +567,48 @@ impl FacetsUpdateIncrementalInner { let size_highest_level = self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?.count(); - if size_highest_level < self.group_size as usize * self.min_level_size as usize { - return Ok(()); + if size_highest_level >= self.group_size as usize * self.min_level_size as usize { + self.add_level(txn, field_id, highest_level, &highest_level_prefix, size_highest_level) + } else if size_highest_level < self.min_level_size as usize && highest_level != 0 { + self.delete_level(txn, &highest_level_prefix) + } else { + Ok(()) } + } + /// Delete a level. + fn delete_level(&self, txn: &mut RwTxn, highest_level_prefix: &[u8]) -> Result<()> { + let mut to_delete = vec![]; + let mut iter = + self.db.remap_types::().prefix_iter(txn, highest_level_prefix)?; + for el in iter.by_ref() { + let (k, _) = el?; + to_delete.push( + FacetGroupKeyCodec::::bytes_decode(k) + .map_err(Error::Encoding)? + .into_owned(), + ); + } + drop(iter); + for k in to_delete { + self.db.delete(txn, &k.as_ref())?; + } + Ok(()) + } + + /// Build an additional level for the field id. 
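
`add_or_delete_level` above only compares the width of the highest level against two thresholds. A sketch of that decision, using 5 for `min_level_size` (the `FACET_MIN_LEVEL_SIZE` visible in `facet/mod.rs`) and 4 for `group_size`, purely for illustration:

/// Outcome of `add_or_delete_level` for the highest facet level (illustrative enum).
#[derive(Debug, PartialEq)]
enum LevelAction {
    AddLevel,
    DeleteLevel,
    Nothing,
}

/// Grow a new level when the top level gets wide enough, drop the top level
/// when it shrinks below `min_level_size` (but never remove level 0).
fn level_action(
    highest_level: u8,
    size_highest_level: usize,
    group_size: u8,
    min_level_size: u8,
) -> LevelAction {
    if size_highest_level >= group_size as usize * min_level_size as usize {
        LevelAction::AddLevel
    } else if size_highest_level < min_level_size as usize && highest_level != 0 {
        LevelAction::DeleteLevel
    } else {
        LevelAction::Nothing
    }
}

fn main() {
    // 20 keys or more on the top level → build a parent level above it.
    assert_eq!(level_action(1, 20, 4, 5), LevelAction::AddLevel);
    // A non-zero top level that falls under 5 keys is removed entirely.
    assert_eq!(level_action(2, 3, 4, 5), LevelAction::DeleteLevel);
    // Level 0 is never deleted, however small it gets.
    assert_eq!(level_action(0, 3, 4, 5), LevelAction::Nothing);
}
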
+ fn add_level( + &self, + txn: &mut RwTxn, + field_id: u16, + highest_level: u8, + highest_level_prefix: &[u8], + size_highest_level: usize, + ) -> Result<()> { let mut groups_iter = self .db .remap_types::() - .prefix_iter(txn, &highest_level_prefix)?; + .prefix_iter(txn, highest_level_prefix)?; let nbr_new_groups = size_highest_level / self.group_size as usize; let nbr_leftover_elements = size_highest_level % self.group_size as usize; @@ -398,7 +617,7 @@ impl FacetsUpdateIncrementalInner { for _ in 0..nbr_new_groups { let mut first_key = None; let mut values = RoaringBitmap::new(); - for _ in 0..group_size { + for _ in 0..self.group_size { let (key_bytes, value_i) = groups_iter.next().unwrap()?; let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) .map_err(Error::Encoding)?; @@ -413,7 +632,7 @@ impl FacetsUpdateIncrementalInner { level: highest_level + 1, left_bound: first_key.unwrap().left_bound, }; - let value = FacetGroupValue { size: group_size, bitmap: values }; + let value = FacetGroupValue { size: self.group_size, bitmap: values }; to_add.push((key.into_owned(), value)); } // now we add the rest of the level, in case its size is > group_size * min_level_size @@ -448,173 +667,6 @@ impl FacetsUpdateIncrementalInner { } Ok(()) } - - /// Delete the given document id from the given facet value in the database, from level 0 to the - /// the given level. - /// - /// ## Return - /// Returns the effect of removing the document id from the database on the given `level`. - /// - /// - `DeletionResult::InPlace` means that deleting the document id did not have - /// an effect on the keys in that level. - /// - /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the - /// number of keys in the level. For example, removing a document id from the facet value `3` could - /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted - /// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must - /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. - /// - /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the - /// bounds of the keys of the level. For example, removing a document id from the facet value - /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, - /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). - /// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust - /// its left bound as well. 
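
`add_level` packs the keys of the current highest level into parent nodes of `group_size` entries, plus one smaller trailing node for the leftovers. The grouping arithmetic in isolation (illustrative helper, not milli code):

/// Sizes of the parent nodes built above a level of `size_highest_level` keys.
fn parent_level_group_sizes(size_highest_level: usize, group_size: usize) -> Vec<usize> {
    let nbr_new_groups = size_highest_level / group_size;
    let nbr_leftover_elements = size_highest_level % group_size;

    let mut sizes = vec![group_size; nbr_new_groups];
    if nbr_leftover_elements != 0 {
        sizes.push(nbr_leftover_elements);
    }
    sizes
}

fn main() {
    // 22 keys with a group size of 4 → five full parent nodes and one node of 2.
    assert_eq!(parent_level_group_sizes(22, 4), vec![4, 4, 4, 4, 4, 2]);
    assert_eq!(parent_level_group_sizes(20, 4), vec![4, 4, 4, 4, 4]);
}
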
- fn delete_in_level( - &self, - txn: &mut RwTxn, - field_id: u16, - level: u8, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { - if level == 0 { - return self.delete_in_level_0(txn, field_id, facet_value, docids); - } - let (deletion_key, mut bitmap) = - self.find_insertion_key_value(field_id, level, facet_value, txn)?; - - let result = self.delete_in_level(txn, field_id, level - 1, facet_value, docids)?; - - let mut decrease_size = false; - let next_key = match result { - DeletionResult::InPlace => { - bitmap.bitmap -= docids; - self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; - return Ok(DeletionResult::InPlace); - } - DeletionResult::Reduce { next } => next, - DeletionResult::Remove { next } => { - decrease_size = true; - next - } - }; - // If either DeletionResult::Reduce or DeletionResult::Remove was returned, - // then we may need to adjust the left_bound of the deletion key. - - // If DeletionResult::Remove was returned, then we need to decrease the group - // size of the deletion key. - let mut updated_value = bitmap; - if decrease_size { - updated_value.size -= 1; - } - - if updated_value.size == 0 { - self.db.delete(txn, &deletion_key.as_ref())?; - Ok(DeletionResult::Remove { next: next_key }) - } else { - let mut updated_deletion_key = deletion_key.clone(); - let reduced_range = facet_value == deletion_key.left_bound; - if reduced_range { - updated_deletion_key.left_bound = next_key.clone().unwrap(); - } - updated_value.bitmap -= docids; - let _ = self.db.delete(txn, &deletion_key.as_ref())?; - self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; - if reduced_range { - Ok(DeletionResult::Reduce { next: next_key }) - } else { - Ok(DeletionResult::InPlace) - } - } - } - - fn delete_in_level_0( - &self, - txn: &mut RwTxn, - field_id: u16, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { - let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; - let mut bitmap = self.db.get(txn, &key)?.unwrap().bitmap; - bitmap -= docids; - - if bitmap.is_empty() { - let mut next_key = None; - if let Some((next, _)) = - self.db.remap_data_type::().get_greater_than(txn, &key)? - { - if next.field_id == field_id && next.level == 0 { - next_key = Some(next.left_bound.to_vec()); - } - } - self.db.delete(txn, &key)?; - Ok(DeletionResult::Remove { next: next_key }) - } else { - self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?; - Ok(DeletionResult::InPlace) - } - } - - pub fn delete( - &self, - txn: &mut RwTxn, - field_id: u16, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result<()> { - if self - .db - .remap_data_type::() - .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })? - .is_none() - { - return Ok(()); - } - let highest_level = get_highest_level(txn, self.db, field_id)?; - - let result = self.delete_in_level(txn, field_id, highest_level, facet_value, docids)?; - match result { - DeletionResult::InPlace => return Ok(()), - DeletionResult::Reduce { .. } => return Ok(()), - DeletionResult::Remove { .. } => {} - } - - // if we either removed a key from the highest level, its size may have fallen - // below `min_level_size`, in which case we need to remove the entire level - - let mut highest_level_prefix = vec![]; - highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); - highest_level_prefix.push(highest_level); - - if highest_level == 0 - || self - .db - .remap_types::() - .prefix_iter(txn, &highest_level_prefix)? 
- .count() - >= self.min_level_size as usize - { - return Ok(()); - } - let mut to_delete = vec![]; - let mut iter = - self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?; - for el in iter.by_ref() { - let (k, _) = el?; - to_delete.push( - FacetGroupKeyCodec::::bytes_decode(k) - .map_err(Error::Encoding)? - .into_owned(), - ); - } - drop(iter); - for k in to_delete { - self.db.delete(txn, &k.as_ref())?; - } - Ok(()) - } } impl<'a> FacetGroupKey<&'a [u8]> { diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 400507c97..0af64c4c5 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -79,12 +79,9 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5; use std::collections::BTreeSet; use std::fs::File; use std::io::BufReader; -use std::iter::FromIterator; -use charabia::normalizer::{Normalize, NormalizerOption}; -use grenad::{CompressionType, SortAlgorithm}; -use heed::types::{Bytes, DecodeIgnore, SerdeJson}; -use heed::BytesEncode; +use grenad::Merger; +use heed::types::{Bytes, DecodeIgnore}; use time::OffsetDateTime; use tracing::debug; @@ -93,9 +90,9 @@ use super::FacetsUpdateBulk; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::BytesRefCodec; -use crate::update::index_documents::create_sorter; -use crate::update::merge_btreeset_string; -use crate::{BEU16StrCodec, Index, Result, MAX_FACET_VALUE_LENGTH}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; +use crate::update::MergeFn; +use crate::{try_split_array_at, FieldId, Index, Result}; pub mod bulk; pub mod incremental; @@ -108,16 +105,20 @@ pub struct FacetsUpdate<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, - delta_data: grenad::Reader>, + delta_data: Merger, MergeFn>, + normalized_delta_data: Option, MergeFn>>, group_size: u8, max_group_size: u8, min_level_size: u8, + data_size: u64, } impl<'i> FacetsUpdate<'i> { pub fn new( index: &'i Index, facet_type: FacetType, - delta_data: grenad::Reader>, + delta_data: Merger, MergeFn>, + normalized_delta_data: Option, MergeFn>>, + data_size: u64, ) -> Self { let database = match facet_type { FacetType::String => { @@ -135,18 +136,20 @@ impl<'i> FacetsUpdate<'i> { min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, delta_data, + normalized_delta_data, + data_size, } } pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { - if self.delta_data.is_empty() { + if self.data_size == 0 { return Ok(()); } debug!("Computing and writing the facet values levels docids into LMDB on disk..."); self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // See self::comparison_bench::benchmark_facet_indexing - if self.delta_data.len() >= (self.database.len(wtxn)? / 50) { + if self.data_size >= (self.database.len(wtxn)? / 500) { let field_ids = self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); let bulk_update = FacetsUpdateBulk::new( @@ -170,96 +173,110 @@ impl<'i> FacetsUpdate<'i> { incremental_update.execute(wtxn)?; } - // We clear the list of normalized-for-search facets - // and the previous FSTs to compute everything from scratch - self.index.facet_id_normalized_string_strings.clear(wtxn)?; - self.index.facet_id_string_fst.clear(wtxn)?; - - // As we can't use the same write transaction to read and write in two different databases - // we must create a temporary sorter that we will write into LMDB afterward. 
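
With `data_size` now carried alongside the merger, the bulk-versus-incremental choice in `execute` above becomes a plain ratio test: rebuild the facet levels in bulk when the delta touches at least 1/500th of the database (previously 1/50th), otherwise apply it incrementally. As a sketch:

/// `delta_len` is the number of entries in the facet delta and `database_len`
/// the number of entries already stored in the facet database.
fn use_bulk_update(delta_len: u64, database_len: u64) -> bool {
    delta_len >= database_len / 500
}

fn main() {
    assert!(use_bulk_update(2_000, 1_000_000)); // 0.2% of the database → bulk rebuild
    assert!(!use_bulk_update(1_999, 1_000_000)); // just under → incremental update
}
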
- // As multiple unnormalized facet values can become the same normalized facet value - // we must merge them together. - let mut sorter = create_sorter( - SortAlgorithm::Unstable, - merge_btreeset_string, - CompressionType::None, - None, - None, - None, - ); - - // We iterate on the list of original, semi-normalized, facet values - // and normalize them for search, inserting them in LMDB in any given order. - let options = NormalizerOption { lossy: true, ..Default::default() }; - let database = self.index.facet_id_string_docids.remap_data_type::(); - for result in database.iter(wtxn)? { - let (facet_group_key, ()) = result?; - if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key { - let mut normalized_facet = left_bound.normalize(&options); - let normalized_truncated_facet: String; - if normalized_facet.len() > MAX_FACET_VALUE_LENGTH { - normalized_truncated_facet = normalized_facet - .char_indices() - .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); - normalized_facet = normalized_truncated_facet.into(); - } - let set = BTreeSet::from_iter(std::iter::once(left_bound)); - let key = (field_id, normalized_facet.as_ref()); - let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; - let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; - sorter.insert(key, val)?; - } + match self.normalized_delta_data { + Some(data) => index_facet_search(wtxn, data, self.index), + None => Ok(()), } - - // In this loop we don't need to take care of merging bitmaps - // as the grenad sorter already merged them for us. - let mut merger_iter = sorter.into_stream_merger_iter()?; - while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? { - self.index.facet_id_normalized_string_strings.remap_types::().put( - wtxn, - key_bytes, - btreeset_bytes, - )?; - } - - // We compute one FST by string facet - let mut text_fsts = vec![]; - let mut current_fst: Option<(u16, fst::SetBuilder>)> = None; - let database = - self.index.facet_id_normalized_string_strings.remap_data_type::(); - for result in database.iter(wtxn)? { - let ((field_id, normalized_facet), _) = result?; - current_fst = match current_fst.take() { - Some((fid, fst_builder)) if fid != field_id => { - let fst = fst_builder.into_set(); - text_fsts.push((fid, fst)); - Some((field_id, fst::SetBuilder::memory())) - } - Some((field_id, fst_builder)) => Some((field_id, fst_builder)), - None => Some((field_id, fst::SetBuilder::memory())), - }; - - if let Some((_, fst_builder)) = current_fst.as_mut() { - fst_builder.insert(normalized_facet)?; - } - } - - if let Some((field_id, fst_builder)) = current_fst { - let fst = fst_builder.into_set(); - text_fsts.push((field_id, fst)); - } - - // We write those FSTs in LMDB now - for (field_id, fst) in text_fsts { - self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?; - } - - Ok(()) } } +fn index_facet_search( + wtxn: &mut heed::RwTxn, + normalized_delta_data: Merger, MergeFn>, + index: &Index, +) -> Result<()> { + let mut iter = normalized_delta_data.into_stream_merger_iter()?; + while let Some((key_bytes, delta_bytes)) = iter.next()? { + let deladd_reader = KvReaderDelAdd::new(delta_bytes); + + let database_set = index + .facet_id_normalized_string_strings + .remap_key_type::() + .get(wtxn, key_bytes)? 
+ .unwrap_or_default(); + + let add_set = deladd_reader + .get(DelAdd::Addition) + .and_then(|bytes| serde_json::from_slice::>(bytes).ok()) + .unwrap_or_default(); + + let del_set = match deladd_reader + .get(DelAdd::Deletion) + .and_then(|bytes| serde_json::from_slice::>(bytes).ok()) + { + Some(del_set) => { + let (field_id_bytes, _) = try_split_array_at(key_bytes).unwrap(); + let field_id = FieldId::from_be_bytes(field_id_bytes); + let mut set = BTreeSet::new(); + for facet in del_set { + let key = FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() }; + // Check if the referenced value doesn't exist anymore before deleting it. + if index + .facet_id_string_docids + .remap_data_type::() + .get(wtxn, &key)? + .is_none() + { + set.insert(facet); + } + } + set + } + None => BTreeSet::new(), + }; + + let set: BTreeSet<_> = + database_set.difference(&del_set).chain(add_set.iter()).cloned().collect(); + + if set.is_empty() { + index + .facet_id_normalized_string_strings + .remap_key_type::() + .delete(wtxn, key_bytes)?; + } else { + index + .facet_id_normalized_string_strings + .remap_key_type::() + .put(wtxn, key_bytes, &set)?; + } + } + + // We clear the FST of normalized-for-search to compute everything from scratch. + index.facet_id_string_fst.clear(wtxn)?; + // We compute one FST by string facet + let mut text_fsts = vec![]; + let mut current_fst: Option<(u16, fst::SetBuilder>)> = None; + let database = index.facet_id_normalized_string_strings.remap_data_type::(); + for result in database.iter(wtxn)? { + let ((field_id, normalized_facet), _) = result?; + current_fst = match current_fst.take() { + Some((fid, fst_builder)) if fid != field_id => { + let fst = fst_builder.into_set(); + text_fsts.push((fid, fst)); + Some((field_id, fst::SetBuilder::memory())) + } + Some((field_id, fst_builder)) => Some((field_id, fst_builder)), + None => Some((field_id, fst::SetBuilder::memory())), + }; + + if let Some((_, fst_builder)) = current_fst.as_mut() { + fst_builder.insert(normalized_facet)?; + } + } + + if let Some((field_id, fst_builder)) = current_fst { + let fst = fst_builder.into_set(); + text_fsts.push((field_id, fst)); + } + + // We write those FSTs in LMDB now + for (field_id, fst) in text_fsts { + index.facet_id_string_fst.put(wtxn, &field_id, &fst)?; + } + + Ok(()) +} + #[cfg(test)] pub(crate) mod test_helpers { use std::cell::Cell; @@ -268,6 +285,7 @@ pub(crate) mod test_helpers { use std::marker::PhantomData; use std::rc::Rc; + use grenad::MergerBuilder; use heed::types::Bytes; use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; use roaring::RoaringBitmap; @@ -280,7 +298,8 @@ pub(crate) mod test_helpers { use crate::search::facet::get_highest_level; use crate::snapshot_tests::display_bitmap; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; - use crate::update::FacetsUpdateIncrementalInner; + use crate::update::index_documents::merge_deladd_cbo_roaring_bitmaps; + use crate::update::{FacetsUpdateIncrementalInner, MergeFn}; use crate::CboRoaringBitmapCodec; /// Utility function to generate a string whose position in a lexicographically @@ -410,7 +429,8 @@ pub(crate) mod test_helpers { max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(key).unwrap(); - update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); + update.modify(wtxn, field_id, &key_bytes, Some(docids), None).unwrap(); + update.add_or_delete_level(wtxn, field_id).unwrap(); } pub fn delete_single_docid<'a>( &self, @@ -436,7 +456,8 @@ pub(crate) mod test_helpers { 
max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(key).unwrap(); - update.delete(wtxn, field_id, &key_bytes, docids).unwrap(); + update.modify(wtxn, field_id, &key_bytes, None, Some(docids)).unwrap(); + update.add_or_delete_level(wtxn, field_id).unwrap(); } pub fn bulk_insert<'a, 'b>( @@ -463,10 +484,13 @@ pub(crate) mod test_helpers { } writer.finish().unwrap(); let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + builder.push(reader.into_cursor().unwrap()); + let merger = builder.build(); let update = FacetsUpdateBulkInner { db: self.content, - delta_data: Some(reader), + delta_data: Some(merger), group_size: self.group_size.get(), min_level_size: self.min_level_size.get(), }; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index d568154b2..dc4886f00 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -26,7 +26,7 @@ pub fn extract_docid_word_positions( obkv_documents: grenad::Reader, indexer: GrenadParameters, searchable_fields: &Option>, - stop_words: Option<&fst::Set<&[u8]>>, + stop_words: Option<&fst::Set>>, allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: Option, @@ -181,11 +181,11 @@ fn searchable_fields_changed( /// Factorize tokenizer building. fn tokenizer_builder<'a>( - stop_words: Option<&'a fst::Set<&[u8]>>, + stop_words: Option<&'a fst::Set>>, allowed_separators: Option<&'a [&str]>, dictionary: Option<&'a [&str]>, script_language: Option<&'a HashMap>>, -) -> TokenizerBuilder<'a, &'a [u8]> { +) -> TokenizerBuilder<'a, Vec> { let mut tokenizer_builder = TokenizerBuilder::new(); if let Some(stop_words) = stop_words { tokenizer_builder.stop_words(stop_words); @@ -211,7 +211,7 @@ fn lang_safe_tokens_from_document<'a>( obkv: &KvReader, searchable_fields: &Option>, tokenizer: &Tokenizer, - stop_words: Option<&fst::Set<&[u8]>>, + stop_words: Option<&fst::Set>>, allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: u32, diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index d14be7464..8fdd11ee7 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -1,15 +1,21 @@ +use std::collections::BTreeSet; use std::fs::File; use std::io::BufReader; +use std::iter::FromIterator; use std::{io, str}; +use charabia::normalizer::{Normalize, NormalizerOption}; +use heed::types::SerdeJson; use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; -use crate::heed_codec::StrRefCodec; -use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; -use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps; -use crate::{FieldId, Result}; +use crate::heed_codec::{BEU16StrCodec, StrRefCodec}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::index_documents::helpers::{ + merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, +}; +use 
crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet string and the documents ids where this facet string appear. /// @@ -19,10 +25,11 @@ use crate::{FieldId, Result}; pub fn extract_facet_string_docids( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, -) -> Result>> { +) -> Result<(grenad::Reader>, grenad::Reader>)> { puffin::profile_function!(); let max_memory = indexer.max_memory_by_thread(); + let options = NormalizerOption { lossy: true, ..Default::default() }; let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, @@ -30,12 +37,30 @@ pub fn extract_facet_string_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory, + max_memory.map(|m| m / 2), + ); + + let mut normalized_facet_string_docids_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_deladd_btreeset_string, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), ); let mut buffer = Vec::new(); let mut cursor = docid_fid_facet_string.into_cursor()?; while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { + let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes); + + // nothing to do if we delete and re-add the value. + if deladd_reader.get(DelAdd::Deletion).is_some() + && deladd_reader.get(DelAdd::Addition).is_some() + { + continue; + } + let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); @@ -44,17 +69,46 @@ pub fn extract_facet_string_docids( let document_id = u32::from_be_bytes(document_id_bytes); let normalized_value = str::from_utf8(normalized_value_bytes)?; + + // Facet search normalization + { + let mut hyper_normalized_value = normalized_value.normalize(&options); + let normalized_truncated_facet: String; + if hyper_normalized_value.len() > MAX_FACET_VALUE_LENGTH { + normalized_truncated_facet = hyper_normalized_value + .char_indices() + .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + hyper_normalized_value = normalized_truncated_facet.into(); + } + let set = BTreeSet::from_iter(std::iter::once(normalized_value)); + + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + for (deladd_key, _) in deladd_reader.iter() { + let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; + obkv.insert(deladd_key, val)?; + } + obkv.finish()?; + + let key = (field_id, hyper_normalized_value.as_ref()); + let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; + normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?; + } + let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); buffer.clear(); let mut obkv = KvWriterDelAdd::new(&mut buffer); - for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() { + for (deladd_key, _) in deladd_reader.iter() { obkv.insert(deladd_key, document_id.to_ne_bytes())?; } obkv.finish()?; facet_string_docids_sorter.insert(&key_bytes, &buffer)?; } - sorter_into_reader(facet_string_docids_sorter, indexer) + let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?; + sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized)) } diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs 
b/milli/src/update/index_documents/extract/extract_vector_points.rs index 117f6cc8c..ece841659 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -257,6 +257,7 @@ fn push_vectors_diff( key_buffer: &mut Vec, delta: VectorStateDelta, ) -> Result<()> { + puffin::profile_function!(); let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); if must_remove { key_buffer.truncate(TRUNCATE_SIZE); @@ -332,13 +333,14 @@ fn extract_vectors( } } -#[logging_timer::time] +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] pub fn extract_embeddings( // docid, prompt prompt_reader: grenad::Reader, indexer: GrenadParameters, embedder: Arc, ) -> Result>> { + puffin::profile_function!(); let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 44f54ff26..43f3f4947 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -15,7 +15,6 @@ use std::io::BufReader; use crossbeam_channel::Sender; use rayon::prelude::*; -use tracing::debug; use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_facet_number_docids::extract_facet_number_docids; @@ -29,10 +28,7 @@ use self::extract_vector_points::{ use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; -use super::helpers::{ - as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, - MergeFn, MergeableReader, -}; +use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; use crate::proximity::ProximityPrecision; use crate::vector::EmbeddingConfigs; @@ -52,7 +48,7 @@ pub(crate) fn data_from_obkv_documents( primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, field_id_map: FieldsIdsMap, - stop_words: Option>, + stop_words: Option>>, allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: Option, @@ -62,227 +58,170 @@ pub(crate) fn data_from_obkv_documents( ) -> Result<()> { puffin::profile_function!(); - original_obkv_chunks - .par_bridge() - .map(|original_documents_chunk| { - send_original_documents_data( - original_documents_chunk, - indexer, - lmdb_writer_sx.clone(), - field_id_map.clone(), - embedders.clone(), - ) - }) - .collect::>()?; - - #[allow(clippy::type_complexity)] - let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> = - flattened_obkv_chunks - .par_bridge() - .map(|flattened_obkv_chunks| { - send_and_extract_flattened_documents_data( - flattened_obkv_chunks, - indexer, - lmdb_writer_sx.clone(), - &searchable_fields, - &faceted_fields, - primary_key_id, - geo_fields_ids, - &stop_words, - &allowed_separators, - &dictionary, - max_positions_per_attributes, - ) - }) - .collect(); - - let ( - docid_word_positions_chunks, - ( - fid_docid_facet_numbers_chunks, - ( - fid_docid_facet_strings_chunks, - ( - facet_is_null_docids_chunks, - (facet_is_empty_docids_chunks, facet_exists_docids_chunks), - ), - ), - ), - ) = result?; - - // merge facet_exists_docids and send 
them as a typed chunk - { - let lmdb_writer_sx = lmdb_writer_sx.clone(); - rayon::spawn(move || { - debug!(database = "facet-id-exists-docids", "merge"); - match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { - Ok(reader) => { - let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); - } - Err(e) => { - let _ = lmdb_writer_sx.send(Err(e)); - } - } - }); - } - - // merge facet_is_null_docids and send them as a typed chunk - { - let lmdb_writer_sx = lmdb_writer_sx.clone(); - rayon::spawn(move || { - debug!(database = "facet-id-is-null-docids", "merge"); - match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { - Ok(reader) => { - let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); - } - Err(e) => { - let _ = lmdb_writer_sx.send(Err(e)); - } - } - }); - } - - // merge facet_is_empty_docids and send them as a typed chunk - { - let lmdb_writer_sx = lmdb_writer_sx.clone(); - rayon::spawn(move || { - debug!(database = "facet-id-is-empty-docids", "merge"); - match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { - Ok(reader) => { - let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); - } - Err(e) => { - let _ = lmdb_writer_sx.send(Err(e)); - } - } - }); - } - - if proximity_precision == ProximityPrecision::ByWord { - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks.clone(), - indexer, - lmdb_writer_sx.clone(), - extract_word_pair_proximity_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::WordPairProximityDocids, - "word-pair-proximity-docids", - ); - } - - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks.clone(), - indexer, - lmdb_writer_sx.clone(), - extract_fid_word_count_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::FieldIdWordCountDocids, - "field-id-wordcount-docids", - ); - - spawn_extraction_task::< - _, - _, - Vec<( - grenad::Reader>, - grenad::Reader>, - grenad::Reader>, - )>, - >( - docid_word_positions_chunks.clone(), - indexer, - lmdb_writer_sx.clone(), - move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), - merge_deladd_cbo_roaring_bitmaps, - |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| { - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - } + let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join( + || { + original_obkv_chunks + .par_bridge() + .map(|original_documents_chunk| { + send_original_documents_data( + original_documents_chunk, + indexer, + lmdb_writer_sx.clone(), + field_id_map.clone(), + embedders.clone(), + ) + }) + .collect::>() + }, + || { + flattened_obkv_chunks + .par_bridge() + .map(|flattened_obkv_chunks| { + send_and_extract_flattened_documents_data( + flattened_obkv_chunks, + indexer, + lmdb_writer_sx.clone(), + &searchable_fields, + &faceted_fields, + primary_key_id, + geo_fields_ids, + &stop_words, + &allowed_separators, + &dictionary, + max_positions_per_attributes, + ) + }) + .map(|result| { + if let Ok(( + ref docid_word_positions_chunk, + (ref fid_docid_facet_numbers_chunk, ref fid_docid_facet_strings_chunk), + )) = result + { + run_extraction_task::<_, _, grenad::Reader>>( + docid_word_positions_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_fid_word_count_docids, + TypedChunk::FieldIdWordCountDocids, + "field-id-wordcount-docids", + ); + + let 
exact_attributes = exact_attributes.clone(); + run_extraction_task::< + _, + _, + ( + grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + ), + >( + docid_word_positions_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + move |doc_word_pos, indexer| { + extract_word_docids(doc_word_pos, indexer, &exact_attributes) + }, + |( + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + )| { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } + }, + "word-docids", + ); + + run_extraction_task::<_, _, grenad::Reader>>( + docid_word_positions_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_word_position_docids, + TypedChunk::WordPositionDocids, + "word-position-docids", + ); + + run_extraction_task::< + _, + _, + (grenad::Reader>, grenad::Reader>), + >( + fid_docid_facet_strings_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_facet_string_docids, + TypedChunk::FieldIdFacetStringDocids, + "field-id-facet-string-docids", + ); + + run_extraction_task::<_, _, grenad::Reader>>( + fid_docid_facet_numbers_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_facet_number_docids, + TypedChunk::FieldIdFacetNumberDocids, + "field-id-facet-number-docids", + ); + + if proximity_precision == ProximityPrecision::ByWord { + run_extraction_task::<_, _, grenad::Reader>>( + docid_word_positions_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_word_pair_proximity_docids, + TypedChunk::WordPairProximityDocids, + "word-pair-proximity-docids", + ); + } + } + + Ok(()) + }) + .collect::>() }, - "word-docids", ); - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks.clone(), - indexer, - lmdb_writer_sx.clone(), - extract_word_position_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::WordPositionDocids, - "word-position-docids", - ); - - spawn_extraction_task::<_, _, Vec>>>( - fid_docid_facet_strings_chunks, - indexer, - lmdb_writer_sx.clone(), - extract_facet_string_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::FieldIdFacetStringDocids, - "field-id-facet-string-docids", - ); - - spawn_extraction_task::<_, _, Vec>>>( - fid_docid_facet_numbers_chunks, - indexer, - lmdb_writer_sx, - extract_facet_number_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::FieldIdFacetNumberDocids, - "field-id-facet-number-docids", - ); - - Ok(()) + original_pipeline_result.and(flattened_pipeline_result) } /// Spawn a new task to extract data for a specific DB using extract_fn. /// Generated grenad chunks are merged using the merge_fn. /// The result of merged chunks is serialized as TypedChunk using the serialize_fn /// and sent into lmdb_writer_sx. 
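
The restructured `data_from_obkv_documents` above runs the original-documents pipeline and the flattened-documents pipeline concurrently with `rayon::join` and combines their outcomes with `Result::and`. A minimal, self-contained illustration of that pattern (the two closures stand in for the real pipelines):

use rayon::prelude::*;

fn main() {
    let original_docs: Vec<u32> = (0..100).collect();
    let flattened_docs: Vec<u32> = (0..100).collect();

    // Each closure stands in for one extraction pipeline and returns a Result.
    let (original_result, flattened_result): (Result<u64, String>, Result<u64, String>) =
        rayon::join(
            || Ok(original_docs.par_iter().map(|n| u64::from(*n)).sum()),
            || Ok(flattened_docs.par_iter().map(|n| u64::from(*n) * 2).sum()),
        );

    // Mirrors `original_pipeline_result.and(flattened_pipeline_result)`:
    // the combined result is Ok only if both pipelines succeeded.
    assert_eq!(original_result.and(flattened_result), Ok(9900));
}
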
-fn spawn_extraction_task( - chunks: Vec>, +fn run_extraction_task( + chunk: grenad::Reader, indexer: GrenadParameters, lmdb_writer_sx: Sender>, extract_fn: FE, - merge_fn: MergeFn, serialize_fn: FS, name: &'static str, ) where - FE: Fn(grenad::Reader, GrenadParameters) -> Result + FE: Fn(grenad::Reader, GrenadParameters) -> Result + Sync + Send + 'static, - FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static, - M: MergeableReader + FromParallelIterator + Send + 'static, - M::Output: Send, + FS: Fn(M) -> TypedChunk + Sync + Send + 'static, + M: Send, { let current_span = tracing::Span::current(); rayon::spawn(move || { - let child_span = - tracing::trace_span!(target: "", parent: ¤t_span, "extract_multiple_chunks"); + let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks"); let _entered = child_span.enter(); - puffin::profile_scope!("extract_multiple_chunksdexing::details, ", name); - let chunks: Result = - chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer)).collect(); - let current_span = tracing::Span::current(); - - rayon::spawn(move || match chunks { - Ok(chunks) => { - let child_span = tracing::trace_span!(target: "", parent: ¤t_span, "merge_multiple_chunks"); - let _entered = child_span.enter(); - debug!(database = name, "merge"); - puffin::profile_scope!("merge_multiple_chunks", name); - let reader = chunks.merge(merge_fn, &indexer); - let _ = lmdb_writer_sx.send(reader.map(serialize_fn)); + puffin::profile_scope!("extract_multiple_chunks", name); + match extract_fn(chunk, indexer) { + Ok(chunk) => { + let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk))); } Err(e) => { let _ = lmdb_writer_sx.send(Err(e)); } - }) - }); + } + }) } /// Extract chunked data and send it into lmdb_writer_sx sender: @@ -340,7 +279,7 @@ fn send_original_documents_data( }); // TODO: create a custom internal error - lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap(); + let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))); Ok(()) } @@ -360,22 +299,13 @@ fn send_and_extract_flattened_documents_data( faceted_fields: &HashSet, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, - stop_words: &Option>, + stop_words: &Option>>, allowed_separators: &Option<&[&str]>, dictionary: &Option<&[&str]>, max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, - ( - grenad::Reader, - ( - grenad::Reader, - ( - grenad::Reader>, - (grenad::Reader>, grenad::Reader>), - ), - ), - ), + (grenad::Reader, grenad::Reader), )> { let flattened_documents_chunk = flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -446,16 +376,17 @@ fn send_and_extract_flattened_documents_data( fid_docid_facet_strings_chunk.clone(), ))); - Ok(( - fid_docid_facet_numbers_chunk, - ( - fid_docid_facet_strings_chunk, - ( - fid_facet_is_null_docids_chunk, - (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), - ), - ), - )) + let _ = lmdb_writer_sx + .send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk))); + + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids( + fid_facet_is_empty_docids_chunk, + ))); + + let _ = lmdb_writer_sx + .send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk))); + + Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk)) }, ); diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs 
index 3e63fcf77..b0e3654a9 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -90,90 +90,6 @@ pub unsafe fn as_cloneable_grenad( Ok(reader) } -pub trait MergeableReader -where - Self: Sized, -{ - type Output; - - fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result; -} - -impl MergeableReader for Vec>> { - type Output = grenad::Reader>; - - fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { - let mut merger = MergerBuilder::new(merge_fn); - self.into_iter().try_for_each(|r| merger.push(r))?; - merger.finish(params) - } -} - -impl MergeableReader for Vec<(grenad::Reader>, grenad::Reader>)> { - type Output = (grenad::Reader>, grenad::Reader>); - - fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { - let mut m1 = MergerBuilder::new(merge_fn); - let mut m2 = MergerBuilder::new(merge_fn); - for (r1, r2) in self.into_iter() { - m1.push(r1)?; - m2.push(r2)?; - } - Ok((m1.finish(params)?, m2.finish(params)?)) - } -} - -impl MergeableReader - for Vec<( - grenad::Reader>, - grenad::Reader>, - grenad::Reader>, - )> -{ - type Output = ( - grenad::Reader>, - grenad::Reader>, - grenad::Reader>, - ); - - fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { - let mut m1 = MergerBuilder::new(merge_fn); - let mut m2 = MergerBuilder::new(merge_fn); - let mut m3 = MergerBuilder::new(merge_fn); - for (r1, r2, r3) in self.into_iter() { - m1.push(r1)?; - m2.push(r2)?; - m3.push(r3)?; - } - Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?)) - } -} - -struct MergerBuilder(grenad::MergerBuilder); - -impl MergerBuilder { - fn new(merge_fn: MergeFn) -> Self { - Self(grenad::MergerBuilder::new(merge_fn)) - } - - fn push(&mut self, reader: grenad::Reader) -> Result<()> { - self.0.push(reader.into_cursor()?); - Ok(()) - } - - fn finish(self, params: &GrenadParameters) -> Result>> { - let merger = self.0.build(); - let mut writer = create_writer( - params.chunk_compression_type, - params.chunk_compression_level, - tempfile::tempfile()?, - ); - merger.write_into_stream_writer(&mut writer)?; - - writer_into_reader(writer) - } -} - #[derive(Debug, Clone, Copy)] pub struct GrenadParameters { pub chunk_compression_type: CompressionType, diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index d355ead68..a265d152f 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -35,27 +35,6 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul } } -pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - // TODO improve the perf by using a `#[borrow] Cow`. - let strings: BTreeSet = values - .iter() - .map(AsRef::as_ref) - .map(serde_json::from_slice::>) - .map(StdResult::unwrap) - .reduce(|mut current, new| { - for x in new { - current.insert(x); - } - current - }) - .unwrap(); - Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap())) - } -} - pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { Ok(values[0].clone()) } @@ -243,3 +222,40 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>( buffer, )?) } + +/// Do a union of BtreeSet on both sides of a DelAdd obkv +/// separately and outputs a new DelAdd with both unions. 
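
The `merge_deladd_btreeset_string` function added below unions the deletion sides together and the addition sides together, keeping the two independent. A sketch of those semantics with a plain struct standing in for the DelAdd obkv (`DelAddSets` and `merge_deladd_sets` are illustrative names):

use std::collections::BTreeSet;

#[derive(Debug, Default, PartialEq)]
struct DelAddSets {
    del: BTreeSet<String>,
    add: BTreeSet<String>,
}

/// Union every deletion side into one set and every addition side into another.
fn merge_deladd_sets(values: &[DelAddSets]) -> DelAddSets {
    let mut merged = DelAddSets::default();
    for value in values {
        merged.del.extend(value.del.iter().cloned());
        merged.add.extend(value.add.iter().cloned());
    }
    merged
}

fn main() {
    let a = DelAddSets {
        del: BTreeSet::from(["blue".to_string()]),
        add: BTreeSet::from(["bleu".to_string()]),
    };
    let b = DelAddSets {
        del: BTreeSet::from(["red".to_string()]),
        add: BTreeSet::from(["bleu".to_string(), "rouge".to_string()]),
    };
    let merged = merge_deladd_sets(&[a, b]);
    assert_eq!(merged.del.len(), 2);
    assert_eq!(merged.add.len(), 2); // "bleu" is deduplicated by the BTreeSet
}
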
+pub fn merge_deladd_btreeset_string<'a>( + _key: &[u8], + values: &[Cow<'a, [u8]>], +) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + // Retrieve the bitmaps from both sides + let mut del_set = BTreeSet::new(); + let mut add_set = BTreeSet::new(); + for value in values { + let obkv = KvReaderDelAdd::new(value); + if let Some(bytes) = obkv.get(DelAdd::Deletion) { + let set = serde_json::from_slice::>(bytes).unwrap(); + for value in set { + del_set.insert(value); + } + } + if let Some(bytes) = obkv.get(DelAdd::Addition) { + let set = serde_json::from_slice::>(bytes).unwrap(); + for value in set { + add_set.insert(value); + } + } + } + + let mut output_deladd_obkv = KvWriterDelAdd::memory(); + let del = serde_json::to_vec(&del_set).unwrap(); + output_deladd_obkv.insert(DelAdd::Deletion, &del)?; + let add = serde_json::to_vec(&add_set).unwrap(); + output_deladd_obkv.insert(DelAdd::Addition, &add)?; + output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) + } +} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 1e29c0240..5d8f16fae 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -10,10 +10,10 @@ use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader, - GrenadParameters, MergeableReader, + GrenadParameters, }; pub use merge_functions::{ - keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps, + keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions, MergeFn, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 36aa94964..7499b68e5 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -5,29 +5,29 @@ mod transform; mod typed_chunk; use std::collections::{HashMap, HashSet}; -use std::io::{Cursor, Read, Seek}; +use std::io::{Read, Seek}; use std::iter::FromIterator; use std::num::NonZeroU32; use std::result::Result as StdResult; use crossbeam_channel::{Receiver, Sender}; +use grenad::{Merger, MergerBuilder}; use heed::types::Str; use heed::Database; use rand::SeedableRng; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use slice_group_by::GroupBy; -use tracing::debug_span; -use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; +use tracing::debug; +use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk}; use self::enrich::enrich_documents_batch; pub use self::enrich::{extract_finite_float_from_value, DocumentId}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, - fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, - merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, - merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader, - ClonableMmap, MergeFn, + fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps, + valid_lmdb_key, 
write_sorter_into_database, writer_into_reader, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; @@ -95,8 +95,8 @@ pub struct IndexDocumentsConfig { impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA> where - FP: Fn(UpdateIndexingStep) + Sync, - FA: Fn() -> bool + Sync, + FP: Fn(UpdateIndexingStep) + Sync + Send, + FA: Fn() -> bool + Sync + Send, { pub fn new( wtxn: &'t mut heed::RwTxn<'i>, @@ -284,7 +284,7 @@ where #[tracing::instrument( level = "trace", skip_all, - target = "profile::indexing::details", + target = "indexing::details", name = "index_documents_raw" )] pub fn execute_raw(self, output: TransformOutput) -> Result @@ -326,9 +326,6 @@ where } }; - let original_documents = grenad::Reader::new(original_documents)?; - let flattened_documents = grenad::Reader::new(flattened_documents)?; - // create LMDB writer channel let (lmdb_writer_sx, lmdb_writer_rx): ( Sender>, @@ -367,11 +364,7 @@ where let stop_words = self.index.stop_words(self.wtxn)?; let separators = self.index.allowed_separators(self.wtxn)?; - let separators: Option> = - separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); let dictionary = self.index.dictionary(self.wtxn)?; - let dictionary: Option> = - dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default(); @@ -381,141 +374,204 @@ where max_memory: self.indexer_config.max_memory, max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. }; - let documents_chunk_size = - self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB + let documents_chunk_size = match self.indexer_config.documents_chunk_size { + Some(chunk_size) => chunk_size, + None => { + let default_chunk_size = 1024 * 1024 * 4; // 4MiB + let min_chunk_size = 1024 * 512; // 512KiB + + // compute the chunk size from the number of available threads and the inputed data size. + let total_size = flattened_documents.metadata().map(|m| m.len()); + let current_num_threads = pool.current_num_threads(); + // if we have more than 2 thread, create a number of chunk equal to 3/4 threads count + let chunk_count = if current_num_threads > 2 { + (current_num_threads * 3 / 4).max(2) + } else { + current_num_threads + }; + total_size + .map_or(default_chunk_size, |size| (size as usize) / chunk_count) + .max(min_chunk_size) + } + }; + + let original_documents = grenad::Reader::new(original_documents)?; + let flattened_documents = grenad::Reader::new(flattened_documents)?; + let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; let cloned_embedder = self.embedders.clone(); + let mut final_documents_ids = RoaringBitmap::new(); + let mut databases_seen = 0; + let mut word_position_docids = None; + let mut word_fid_docids = None; + let mut word_docids = None; + let mut exact_word_docids = None; + let mut chunk_accumulator = ChunkAccumulator::default(); + let mut dimension = HashMap::new(); + let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); + let current_span = tracing::Span::current(); // Run extraction pipeline in parallel. 
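
When `documents_chunk_size` is not configured, the code above derives it from the flattened documents' size and the thread pool: roughly 3/4 of the threads each get a chunk, clamped to at least 512 KiB, with 4 MiB as the fallback when the size is unknown. The heuristic in isolation (illustrative helper; `current_num_threads` mirrors `pool.current_num_threads()`):

/// Sketch of the dynamic chunk-size heuristic: split the input across roughly
/// 3/4 of the available threads, never below 512 KiB per chunk, and fall back
/// to 4 MiB when the input size is unknown.
fn documents_chunk_size(
    configured: Option<usize>,
    total_size: Option<u64>,
    current_num_threads: usize,
) -> usize {
    match configured {
        Some(chunk_size) => chunk_size,
        None => {
            let default_chunk_size = 1024 * 1024 * 4; // 4 MiB
            let min_chunk_size = 1024 * 512; // 512 KiB

            // With more than 2 threads, aim for a chunk count of 3/4 of the threads.
            let chunk_count = if current_num_threads > 2 {
                (current_num_threads * 3 / 4).max(2)
            } else {
                current_num_threads
            };
            total_size
                .map_or(default_chunk_size, |size| size as usize / chunk_count)
                .max(min_chunk_size)
        }
    }
}

fn main() {
    // 64 MiB of documents on a 16-thread pool → 12 chunks of a bit over 5 MiB.
    assert_eq!(
        documents_chunk_size(None, Some(64 * 1024 * 1024), 16),
        64 * 1024 * 1024 / 12,
    );
    // Tiny inputs still get chunks of at least 512 KiB.
    assert_eq!(documents_chunk_size(None, Some(100), 16), 512 * 1024);
}
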
pool.install(|| { - let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks"); + rayon::spawn(move || { + let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks"); let _enter = child_span.enter(); puffin::profile_scope!("extract_and_send_grenad_chunks"); - // split obkv file into several chunks - let original_chunk_iter = - grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size); + // split obkv file into several chunks + let original_chunk_iter = + grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size); - // split obkv file into several chunks - let flattened_chunk_iter = - grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); + // split obkv file into several chunks + let flattened_chunk_iter = + grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); - let result = original_chunk_iter.and_then(|original_chunk| { - let flattened_chunk = flattened_chunk_iter?; - // extract all databases from the chunked obkv douments - extract::data_from_obkv_documents( - original_chunk, - flattened_chunk, - pool_params, - lmdb_writer_sx.clone(), - searchable_fields, - faceted_fields, - primary_key_id, - geo_fields_ids, - field_id_map, - stop_words, - separators.as_deref(), - dictionary.as_deref(), - max_positions_per_attributes, - exact_attributes, - proximity_precision, - cloned_embedder, - ) + let separators: Option> = + separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); + let dictionary: Option> = + dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); + let result = original_chunk_iter.and_then(|original_chunk| { + let flattened_chunk = flattened_chunk_iter?; + // extract all databases from the chunked obkv douments + extract::data_from_obkv_documents( + original_chunk, + flattened_chunk, + pool_params, + lmdb_writer_sx.clone(), + searchable_fields, + faceted_fields, + primary_key_id, + geo_fields_ids, + field_id_map, + stop_words, + separators.as_deref(), + dictionary.as_deref(), + max_positions_per_attributes, + exact_attributes, + proximity_precision, + cloned_embedder, + ) + }); + + if let Err(e) = result { + let _ = lmdb_writer_sx.send(Err(e)); + } + + // needs to be dropped to avoid channel waiting lock. + drop(lmdb_writer_sx); }); - if let Err(e) = result { - let _ = lmdb_writer_sx.send(Err(e)); - } + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); - // needs to be dropped to avoid channel waiting lock. - drop(lmdb_writer_sx); - }); + loop { + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } - let index_is_empty = self.index.number_of_documents(self.wtxn)? 
== 0; - let mut final_documents_ids = RoaringBitmap::new(); + match lmdb_writer_rx.clone().recv_timeout(std::time::Duration::from_millis(500)) { + Err(status) => { + if let Some(typed_chunks) = chunk_accumulator.pop_longest() { + let (docids, is_merged_database) = + write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?; + if !docids.is_empty() { + final_documents_ids |= docids; + let documents_seen_count = final_documents_ids.len(); + (self.progress)(UpdateIndexingStep::IndexDocuments { + documents_seen: documents_seen_count as usize, + total_documents: documents_count, + }); + debug!(documents = documents_seen_count, total = documents_count, "Seen"); + } + if is_merged_database { + databases_seen += 1; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + } + // If no more chunk remains in the chunk accumulator and the channel is disconected, break. + } else if status == crossbeam_channel::RecvTimeoutError::Disconnected { + break; + } else { + rayon::yield_now(); + } + } + Ok(result) => { + let typed_chunk = match result? { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => { + let cloneable_chunk = + unsafe { as_cloneable_grenad(&word_docids_reader)? }; + let word_docids = word_docids.get_or_insert_with(|| { + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn) + }); + word_docids.push(cloneable_chunk.into_cursor()?); + let cloneable_chunk = + unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; + let exact_word_docids = + exact_word_docids.get_or_insert_with(|| { + MergerBuilder::new( + merge_deladd_cbo_roaring_bitmaps as MergeFn, + ) + }); + exact_word_docids.push(cloneable_chunk.into_cursor()?); + let cloneable_chunk = + unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; + let word_fid_docids = word_fid_docids.get_or_insert_with(|| { + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn) + }); + word_fid_docids.push(cloneable_chunk.into_cursor()?); + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } + } + TypedChunk::WordPositionDocids(chunk) => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + let word_position_docids = + word_position_docids.get_or_insert_with(|| { + MergerBuilder::new( + merge_deladd_cbo_roaring_bitmaps as MergeFn, + ) + }); + word_position_docids.push(cloneable_chunk.into_cursor()?); + TypedChunk::WordPositionDocids(chunk) + } + TypedChunk::VectorPoints { + expected_dimension, + remove_vectors, + embeddings, + manual_vectors, + embedder_name, + } => { + dimension.insert(embedder_name.clone(), expected_dimension); + TypedChunk::VectorPoints { + remove_vectors, + embeddings, + expected_dimension, + manual_vectors, + embedder_name, + } + } + otherwise => otherwise, + }; - let mut databases_seen = 0; - (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen, - total_databases: TOTAL_POSTING_DATABASE_COUNT, - }); - - let mut word_position_docids = None; - let mut word_fid_docids = None; - let mut word_docids = None; - let mut exact_word_docids = None; - - let mut dimension = HashMap::new(); - - for result in lmdb_writer_rx { - if (self.should_abort)() { - return Err(Error::InternalError(InternalError::AbortedIndexation)); - } - - let typed_chunk = match result? 
{ - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - } => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; - word_docids = Some(cloneable_chunk); - let cloneable_chunk = - unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; - exact_word_docids = Some(cloneable_chunk); - let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; - word_fid_docids = Some(cloneable_chunk); - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, + chunk_accumulator.insert(typed_chunk); } } - TypedChunk::WordPositionDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; - word_position_docids = Some(cloneable_chunk); - TypedChunk::WordPositionDocids(chunk) - } - TypedChunk::VectorPoints { - expected_dimension, - remove_vectors, - embeddings, - manual_vectors, - embedder_name, - } => { - dimension.insert(embedder_name.clone(), expected_dimension); - TypedChunk::VectorPoints { - remove_vectors, - embeddings, - expected_dimension, - manual_vectors, - embedder_name, - } - } - otherwise => otherwise, - }; + } - let (docids, is_merged_database) = - write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?; - if !docids.is_empty() { - final_documents_ids |= docids; - let documents_seen_count = final_documents_ids.len(); - (self.progress)(UpdateIndexingStep::IndexDocuments { - documents_seen: documents_seen_count as usize, - total_documents: documents_count, - }); - debug_span!("Seen", documents = documents_seen_count, total = documents_count); - } - if is_merged_database { - databases_seen += 1; - (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen, - total_databases: TOTAL_POSTING_DATABASE_COUNT, - }); - } - } + Ok(()) + })?; // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; @@ -548,10 +604,10 @@ where } self.execute_prefix_databases( - word_docids, - exact_word_docids, - word_position_docids, - word_fid_docids, + word_docids.map(MergerBuilder::build), + exact_word_docids.map(MergerBuilder::build), + word_position_docids.map(MergerBuilder::build), + word_fid_docids.map(MergerBuilder::build), )?; Ok(number_of_documents) @@ -565,10 +621,10 @@ where )] pub fn execute_prefix_databases( self, - word_docids: Option>, - exact_word_docids: Option>, - word_position_docids: Option>, - word_fid_docids: Option>, + word_docids: Option>, + exact_word_docids: Option>, + word_position_docids: Option>, + word_fid_docids: Option>, ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -751,7 +807,7 @@ where )] fn execute_word_prefix_docids( txn: &mut heed::RwTxn, - reader: grenad::Reader>, + merger: Merger, word_docids_db: Database, word_prefix_docids_db: Database, indexer_config: &IndexerConfig, @@ -761,13 +817,12 @@ fn execute_word_prefix_docids( ) -> Result<()> { puffin::profile_function!(); - let cursor = reader.into_cursor()?; let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db); builder.chunk_compression_type = indexer_config.chunk_compression_type; builder.chunk_compression_level = indexer_config.chunk_compression_level; builder.max_nb_chunks = indexer_config.max_nb_chunks; builder.max_memory = indexer_config.max_memory; - builder.execute(cursor, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?; + builder.execute(merger, new_prefix_fst_words, 
common_prefix_fst_words, del_prefix_fst_words)?; Ok(()) } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index af828fee6..6aad290e5 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -5,27 +5,64 @@ use std::io::{self, BufReader}; use bytemuck::allocation::pod_collect_to_vec; use charabia::{Language, Script}; -use grenad::MergerBuilder; +use grenad::{Merger, MergerBuilder}; use heed::types::Bytes; -use heed::{PutFlags, RwTxn}; +use heed::RwTxn; use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; use super::helpers::{ - self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, - valid_lmdb_key, CursorClonableMmap, + self, keep_first, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, valid_lmdb_key, + CursorClonableMmap, }; -use super::{ClonableMmap, MergeFn}; +use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; -use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; +use crate::update::index_documents::helpers::{ + as_cloneable_grenad, keep_latest_obkv, try_split_array_at, +}; use crate::{ lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError, }; +/// This struct accumulates and group the TypedChunks +/// and is able to give the biggest accumulated group to index them all together +/// with a merger. +#[derive(Default)] +pub(crate) struct ChunkAccumulator { + inner: Vec>, +} + +impl ChunkAccumulator { + pub fn pop_longest(&mut self) -> Option> { + match self.inner.iter().max_by_key(|v| v.len()) { + Some(left) => { + let position = self.inner.iter().position(|right| left.len() == right.len()); + position.map(|p| self.inner.remove(p)).filter(|v| !v.is_empty()) + } + None => None, + } + } + + pub fn insert(&mut self, chunk: TypedChunk) { + match self + .inner + .iter() + .position(|right| right.first().map_or(false, |right| chunk.mergeable_with(right))) + { + Some(position) => { + let v = self.inner.get_mut(position).unwrap(); + v.push(chunk); + } + None => self.inner.push(vec![chunk]), + } + } +} + pub(crate) enum TypedChunk { FieldIdDocidFacetStrings(grenad::Reader), FieldIdDocidFacetNumbers(grenad::Reader), @@ -38,7 +75,7 @@ pub(crate) enum TypedChunk { }, WordPositionDocids(grenad::Reader>), WordPairProximityDocids(grenad::Reader>), - FieldIdFacetStringDocids(grenad::Reader>), + FieldIdFacetStringDocids((grenad::Reader>, grenad::Reader>)), FieldIdFacetNumberDocids(grenad::Reader>), FieldIdFacetExistsDocids(grenad::Reader>), FieldIdFacetIsNullDocids(grenad::Reader>), @@ -54,6 +91,33 @@ pub(crate) enum TypedChunk { ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } +impl TypedChunk { + fn mergeable_with(&self, other: &Self) -> bool { + use TypedChunk::*; + match (self, other) { + (FieldIdDocidFacetStrings(_), FieldIdDocidFacetStrings(_)) + | (FieldIdDocidFacetNumbers(_), FieldIdDocidFacetNumbers(_)) + | (Documents(_), Documents(_)) + | (FieldIdWordCountDocids(_), FieldIdWordCountDocids(_)) + | (WordDocids { .. }, WordDocids { .. 
}) + | (WordPositionDocids(_), WordPositionDocids(_)) + | (WordPairProximityDocids(_), WordPairProximityDocids(_)) + | (FieldIdFacetStringDocids(_), FieldIdFacetStringDocids(_)) + | (FieldIdFacetNumberDocids(_), FieldIdFacetNumberDocids(_)) + | (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_)) + | (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_)) + | (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_)) + | (GeoPoints(_), GeoPoints(_)) + | (ScriptLanguageDocids(_), ScriptLanguageDocids(_)) => true, + ( + VectorPoints { embedder_name: left, expected_dimension: left_dim, .. }, + VectorPoints { embedder_name: right, expected_dimension: right_dim, .. }, + ) => left == right && left_dim == right_dim, + _ => false, + } + } +} + impl TypedChunk { pub fn to_debug_string(&self) -> String { match self { @@ -85,7 +149,7 @@ impl TypedChunk { TypedChunk::WordPairProximityDocids(grenad) => { format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::FieldIdFacetStringDocids(grenad) => { + TypedChunk::FieldIdFacetStringDocids((grenad, _)) => { format!("FieldIdFacetStringDocids {{ number_of_entries: {} }}", grenad.len()) } TypedChunk::FieldIdFacetNumberDocids(grenad) => { @@ -117,23 +181,32 @@ impl TypedChunk { /// Return new documents seen. #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] pub(crate) fn write_typed_chunk_into_index( - typed_chunk: TypedChunk, + typed_chunks: Vec, index: &Index, wtxn: &mut RwTxn, - index_is_empty: bool, ) -> Result<(RoaringBitmap, bool)> { - puffin::profile_function!(typed_chunk.to_debug_string()); + puffin::profile_function!(typed_chunks[0].to_debug_string()); let mut is_merged_database = false; - match typed_chunk { - TypedChunk::Documents(obkv_documents_iter) => { + match typed_chunks[0] { + TypedChunk::Documents(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "documents"); let _entered = span.enter(); + + let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::Documents(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); let mut operations: Vec = Default::default(); let mut docids = index.documents_ids(wtxn)?; - let mut cursor = obkv_documents_iter.into_cursor()?; - while let Some((key, reader)) = cursor.move_on_next()? { + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, reader)) = iter.next()? 
{ let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let reader: KvReader = KvReader::new(reader); @@ -174,59 +247,91 @@ pub(crate) fn write_typed_chunk_into_index( external_documents_docids.apply(wtxn, operations)?; index.put_documents_ids(wtxn, &docids)?; } - TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { + TypedChunk::FieldIdWordCountDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids"); let _entered = span.enter(); - append_entries_into_database( - fid_word_count_docids_iter, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.field_id_word_count_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - } => { + TypedChunk::WordDocids { .. } => { let span = tracing::trace_span!(target: "indexing::write_db", "word_docids"); let _entered = span.enter(); - let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; - append_entries_into_database( - word_docids_iter.clone(), + + let mut word_docids_builder = + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut exact_word_docids_builder = + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut word_fid_docids_builder = + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut fst_merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } = typed_chunk + else { + unreachable!(); + }; + let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?; + let clonable_exact_word_docids = + unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; + + word_docids_builder.push(word_docids_reader.into_cursor()?); + exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?); + word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?); + fst_merger_builder.push(clonable_word_docids.into_cursor()?); + fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?); + } + + let word_docids_merger = word_docids_builder.build(); + write_entries_into_database( + word_docids_merger, &index.word_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; - let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; - append_entries_into_database( - exact_word_docids_iter.clone(), + let exact_word_docids_merger = exact_word_docids_builder.build(); + write_entries_into_database( + exact_word_docids_merger, &index.exact_word_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; - let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; - append_entries_into_database( - word_fid_docids_iter, + let word_fid_docids_merger = word_fid_docids_builder.build(); + write_entries_into_database( + word_fid_docids_merger, &index.word_fid_docids, wtxn, - 
index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; // create fst from word docids - let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?; + let fst_merger = fst_merger_builder.build(); + let fst = merge_word_docids_reader_into_fst(fst_merger)?; let db_fst = index.words_fst(wtxn)?; // merge new fst with database fst @@ -237,98 +342,202 @@ pub(crate) fn write_typed_chunk_into_index( index.put_words_fst(wtxn, &fst)?; is_merged_database = true; } - TypedChunk::WordPositionDocids(word_position_docids_iter) => { + TypedChunk::WordPositionDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids"); let _entered = span.enter(); - append_entries_into_database( - word_position_docids_iter, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::WordPositionDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.word_position_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { + TypedChunk::FieldIdFacetNumberDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids"); let _entered = span.enter(); - let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut data_size = 0; + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk + else { + unreachable!(); + }; + + data_size += facet_id_number_docids.len(); + builder.push(facet_id_number_docids.into_cursor()?); + } + let merger = builder.build(); + + let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size); indexer.execute(wtxn)?; is_merged_database = true; } - TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => { + TypedChunk::FieldIdFacetStringDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids"); let _entered = span.enter(); - let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter); + + let mut facet_id_string_builder = + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut normalized_facet_id_string_builder = + MergerBuilder::new(merge_deladd_btreeset_string as MergeFn); + let mut data_size = 0; + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdFacetStringDocids(( + facet_id_string_docids, + normalized_facet_id_string_docids, + )) = typed_chunk + else { + unreachable!(); + }; + + data_size += facet_id_string_docids.len(); + facet_id_string_builder.push(facet_id_string_docids.into_cursor()?); + normalized_facet_id_string_builder + .push(normalized_facet_id_string_docids.into_cursor()?); + } + let facet_id_string_merger = facet_id_string_builder.build(); + let normalized_facet_id_string_merger = normalized_facet_id_string_builder.build(); + + let indexer = FacetsUpdate::new( + index, + FacetType::String, + facet_id_string_merger, + Some(normalized_facet_id_string_merger), + data_size, + ); indexer.execute(wtxn)?; is_merged_database = true; } - 
TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { + TypedChunk::FieldIdFacetExistsDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids"); let _entered = span.enter(); - append_entries_into_database( - facet_id_exists_docids, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.facet_id_exists_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => { + TypedChunk::FieldIdFacetIsNullDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids"); let _entered = span.enter(); - append_entries_into_database( - facet_id_is_null_docids, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.facet_id_is_null_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => { - let span = tracing::trace_span!(target: "profile::indexing::write_db", "field_id_facet_is_empty_docids"); + TypedChunk::FieldIdFacetIsEmptyDocids(_) => { + let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids"); let _entered = span.enter(); - append_entries_into_database( - facet_id_is_empty_docids, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.facet_id_is_empty_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { + TypedChunk::WordPairProximityDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids"); let _entered = span.enter(); - append_entries_into_database( - word_pair_proximity_docids_iter, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.word_pair_proximity_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::FieldIdDocidFacetNumbers(fid_docid_facet_number) => { + TypedChunk::FieldIdDocidFacetNumbers(_) => { let span = tracing::trace_span!(target: "indexing::write_db", 
"field_id_docid_facet_numbers"); let _entered = span.enter(); + + let mut builder = MergerBuilder::new(keep_first as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + let index_fid_docid_facet_numbers = index.field_id_docid_facet_f64s.remap_types::(); - let mut cursor = fid_docid_facet_number.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { let reader = KvReaderDelAdd::new(value); if valid_lmdb_key(key) { match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { @@ -344,14 +553,25 @@ pub(crate) fn write_typed_chunk_into_index( } } } - TypedChunk::FieldIdDocidFacetStrings(fid_docid_facet_string) => { + TypedChunk::FieldIdDocidFacetStrings(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings"); let _entered = span.enter(); + + let mut builder = MergerBuilder::new(keep_first as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + let index_fid_docid_facet_strings = index.field_id_docid_facet_strings.remap_types::(); - let mut cursor = fid_docid_facet_string.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { let reader = KvReaderDelAdd::new(value); if valid_lmdb_key(key) { match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { @@ -367,14 +587,25 @@ pub(crate) fn write_typed_chunk_into_index( } } } - TypedChunk::GeoPoints(geo_points) => { + TypedChunk::GeoPoints(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "geo_points"); let _entered = span.enter(); + + let mut builder = MergerBuilder::new(keep_first as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::GeoPoints(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?; - let mut cursor = geo_points.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { // convert the key back to a u32 (4 bytes) let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); @@ -393,15 +624,38 @@ pub(crate) fn write_typed_chunk_into_index( index.put_geo_rtree(wtxn, &rtree)?; index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; } - TypedChunk::VectorPoints { - remove_vectors, - manual_vectors, - embeddings, - expected_dimension, - embedder_name, - } => { + TypedChunk::VectorPoints { .. 
} => { let span = tracing::trace_span!(target: "indexing::write_db", "vector_points"); let _entered = span.enter(); + + let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); + let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); + let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); + let mut params = None; + for typed_chunk in typed_chunks { + let TypedChunk::VectorPoints { + remove_vectors, + manual_vectors, + embeddings, + expected_dimension, + embedder_name, + } = typed_chunk + else { + unreachable!(); + }; + + params = Some((expected_dimension, embedder_name)); + + remove_vectors_builder.push(remove_vectors.into_cursor()?); + manual_vectors_builder.push(manual_vectors.into_cursor()?); + if let Some(embeddings) = embeddings { + embeddings_builder.push(embeddings.into_cursor()?); + } + } + + // typed chunks has always at least 1 chunk. + let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; + let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, )?; @@ -419,8 +673,9 @@ pub(crate) fn write_typed_chunk_into_index( let writers = writers?; // remove vectors for docids we want them removed - let mut cursor = remove_vectors.into_cursor()?; - while let Some((key, _)) = cursor.move_on_next()? { + let merger = remove_vectors_builder.build(); + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, _)) = iter.next()? { let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); for writer in &writers { @@ -432,40 +687,39 @@ pub(crate) fn write_typed_chunk_into_index( } // add generated embeddings - if let Some(embeddings) = embeddings { - let mut cursor = embeddings.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - let data = pod_collect_to_vec(value); - // it is a code error to have embeddings and not expected_dimension - let embeddings = - crate::vector::Embeddings::from_inner(data, expected_dimension) - // code error if we somehow got the wrong dimension - .unwrap(); + let merger = embeddings_builder.build(); + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? 
{ + let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); + let data = pod_collect_to_vec(value); + // it is a code error to have embeddings and not expected_dimension + let embeddings = crate::vector::Embeddings::from_inner(data, expected_dimension) + // code error if we somehow got the wrong dimension + .unwrap(); - if embeddings.embedding_count() > usize::from(u8::MAX) { - let external_docid = if let Ok(Some(Ok(index))) = index - .external_id_of(wtxn, std::iter::once(docid)) - .map(|it| it.into_iter().next()) - { - index - } else { - format!("internal docid={docid}") - }; - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - external_docid, - embeddings.embedding_count(), - ))); - } - for (embedding, writer) in embeddings.iter().zip(&writers) { - writer.add_item(wtxn, docid, embedding)?; - } + if embeddings.embedding_count() > usize::from(u8::MAX) { + let external_docid = if let Ok(Some(Ok(index))) = index + .external_id_of(wtxn, std::iter::once(docid)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + }; + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + external_docid, + embeddings.embedding_count(), + ))); + } + for (embedding, writer) in embeddings.iter().zip(&writers) { + writer.add_item(wtxn, docid, embedding)?; } } // perform the manual diff - let mut cursor = manual_vectors.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let merger = manual_vectors_builder.build(); + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { // convert the key back to a u32 (4 bytes) let (left, _index) = try_split_array_at(key).unwrap(); let docid = DocumentId::from_be_bytes(left); @@ -519,26 +773,30 @@ pub(crate) fn write_typed_chunk_into_index( tracing::debug!("Finished vector chunk for {}", embedder_name); } - TypedChunk::ScriptLanguageDocids(sl_map) => { + TypedChunk::ScriptLanguageDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "script_language_docids"); let _entered = span.enter(); - for (key, (deletion, addition)) in sl_map { - let mut db_key_exists = false; - let final_value = match index.script_language_docids.get(wtxn, &key)? { - Some(db_values) => { - db_key_exists = true; - (db_values - deletion) | addition - } - None => addition, - }; - if final_value.is_empty() { - // If the database entry exists, delete it. - if db_key_exists { - index.script_language_docids.delete(wtxn, &key)?; + for typed_chunk in typed_chunks { + let TypedChunk::ScriptLanguageDocids(sl_map) = typed_chunk else { unreachable!() }; + for (key, (deletion, addition)) in sl_map { + let mut db_key_exists = false; + let final_value = match index.script_language_docids.get(wtxn, &key)? { + Some(db_values) => { + db_key_exists = true; + (db_values - deletion) | addition + } + None => addition, + }; + + if final_value.is_empty() { + // If the database entry exists, delete it. 
+ if db_key_exists { + index.script_language_docids.delete(wtxn, &key)?; + } + } else { + index.script_language_docids.put(wtxn, &key, &final_value)?; } - } else { - index.script_language_docids.put(wtxn, &key, &final_value)?; } } } @@ -557,13 +815,9 @@ fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { } fn merge_word_docids_reader_into_fst( - word_docids_iter: grenad::Reader>, - exact_word_docids_iter: grenad::Reader>, + merger: Merger, ) -> Result>> { - let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn); - merger_builder.push(word_docids_iter.into_cursor()?); - merger_builder.push(exact_word_docids_iter.into_cursor()?); - let mut iter = merger_builder.build().into_stream_merger_iter()?; + let mut iter = merger.into_stream_merger_iter()?; let mut builder = fst::SetBuilder::memory(); while let Some((k, _)) = iter.next()? { @@ -577,10 +831,9 @@ fn merge_word_docids_reader_into_fst( /// merge_values function is used if an entry already exist in the database. #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] fn write_entries_into_database( - data: grenad::Reader, + merger: Merger, database: &heed::Database, wtxn: &mut RwTxn, - index_is_empty: bool, serialize_value: FS, merge_values: FM, ) -> Result<()> @@ -589,22 +842,17 @@ where FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, { - puffin::profile_function!(format!("number of entries: {}", data.len())); - + puffin::profile_function!(); let mut buffer = Vec::new(); let database = database.remap_types::(); - let mut cursor = data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { if valid_lmdb_key(key) { buffer.clear(); - let value = if index_is_empty { - Some(serialize_value(value, &mut buffer)?) - } else { - match database.get(wtxn, key)? { - Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, - None => Some(serialize_value(value, &mut buffer)?), - } + let value = match database.get(wtxn, key)? { + Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, + None => Some(serialize_value(value, &mut buffer)?), }; match value { Some(value) => database.put(wtxn, key, value)?, @@ -614,62 +862,5 @@ where } } } - - Ok(()) -} - -/// Write provided entries in database using serialize_value function. -/// merge_values function is used if an entry already exist in the database. -/// All provided entries must be ordered. -/// If the index is not empty, write_entries_into_database is called instead. -#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] -fn append_entries_into_database( - data: grenad::Reader, - database: &heed::Database, - wtxn: &mut RwTxn, - index_is_empty: bool, - serialize_value: FS, - merge_values: FM, -) -> Result<()> -where - R: io::Read + io::Seek, - FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, - FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, - K: for<'a> heed::BytesDecode<'a>, -{ - puffin::profile_function!(format!("number of entries: {}", data.len())); - - if !index_is_empty { - return write_entries_into_database( - data, - database, - wtxn, - false, - serialize_value, - merge_values, - ); - } - - let mut buffer = Vec::new(); - let mut database = database.iter_mut(wtxn)?.remap_types::(); - - let mut cursor = data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? 
{ - if valid_lmdb_key(key) { - debug_assert!( - K::bytes_decode(key).is_ok(), - "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}", - key.len(), - &key - ); - buffer.clear(); - let value = serialize_value(value, &mut buffer)?; - unsafe { - // safety: We do not keep a reference to anything that lives inside the database - database.put_current_with_options::(PutFlags::APPEND, key, value)? - }; - } - } - Ok(()) } diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 66c52a52f..195b95d1e 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -3,9 +3,8 @@ pub use self::clear_documents::ClearDocuments; pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ - merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, - MergeFn, + merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId, + IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn, }; pub use self::indexer_config::IndexerConfig; pub use self::settings::{validate_embedding_settings, Setting, Settings}; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 99c6c815e..1db066058 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -47,7 +47,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> { )] pub fn execute( self, - mut new_word_docids_iter: grenad::ReaderCursor, + new_word_docids: grenad::Merger, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -68,7 +68,8 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> { if !common_prefix_fst_words.is_empty() { let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.move_on_next()? { + let mut new_word_docids_iter = new_word_docids.into_stream_merger_iter()?; + while let Some((word, data)) = new_word_docids_iter.next()? { current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs index a05eb8721..272d465fd 100644 --- a/milli/src/update/words_prefix_integer_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -52,7 +52,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> { )] pub fn execute( self, - new_word_integer_docids: grenad::Reader, + new_word_integer_docids: grenad::Merger, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -69,14 +69,14 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> { self.max_memory, ); - let mut new_word_integer_docids_iter = new_word_integer_docids.into_cursor()?; - if !common_prefix_fst_words.is_empty() { // We fetch all the new common prefixes between the previous and new prefix fst. let mut buffer = Vec::new(); let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_word_integer_docids_iter.move_on_next()? { + let mut new_word_integer_docids_iter = + new_word_integer_docids.into_stream_merger_iter()?; + while let Some((key, data)) = new_word_integer_docids_iter.next()? 
{ let (word, pos) = StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?; diff --git a/milli/src/vector/error.rs b/milli/src/vector/error.rs index 3673c85e3..fbe4ee878 100644 --- a/milli/src/vector/error.rs +++ b/milli/src/vector/error.rs @@ -59,8 +59,8 @@ pub enum EmbedErrorKind { OpenAiAuth(OpenAiError), #[error("sent too many requests to OpenAI: {0}")] OpenAiTooManyRequests(OpenAiError), - #[error("received internal error from OpenAI: {0}")] - OpenAiInternalServerError(OpenAiError), + #[error("received internal error from OpenAI: {0:?}")] + OpenAiInternalServerError(Option), #[error("sent too many tokens in a request to OpenAI: {0}")] OpenAiTooManyTokens(OpenAiError), #[error("received unhandled HTTP status code {0} from OpenAI")] @@ -106,7 +106,7 @@ impl EmbedError { Self { kind: EmbedErrorKind::OpenAiTooManyRequests(inner), fault: FaultSource::Runtime } } - pub(crate) fn openai_internal_server_error(inner: OpenAiError) -> EmbedError { + pub(crate) fn openai_internal_server_error(inner: Option) -> EmbedError { Self { kind: EmbedErrorKind::OpenAiInternalServerError(inner), fault: FaultSource::Runtime } } diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 99b7bff7e..6aa324da9 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -261,3 +261,7 @@ impl DistributionShift { score } } + +pub const fn is_cuda_enabled() -> bool { + cfg!(feature = "cuda") +} diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index cbddddfb7..33442dda4 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -178,6 +178,8 @@ impl Embedder { retry.into_duration(attempt) } }?; + + let retry_duration = retry_duration.min(std::time::Duration::from_secs(60)); // don't wait more than a minute tracing::warn!( "Attempt #{}, retrying after {}ms.", attempt, @@ -220,24 +222,12 @@ impl Embedder { error_response.error, ))); } - StatusCode::INTERNAL_SERVER_ERROR => { - let error_response: OpenAiErrorResponse = response - .json() - .await - .map_err(EmbedError::openai_unexpected) - .map_err(Retry::retry_later)?; + StatusCode::INTERNAL_SERVER_ERROR + | StatusCode::BAD_GATEWAY + | StatusCode::SERVICE_UNAVAILABLE => { + let error_response: Result = response.json().await; return Err(Retry::retry_later(EmbedError::openai_internal_server_error( - error_response.error, - ))); - } - StatusCode::SERVICE_UNAVAILABLE => { - let error_response: OpenAiErrorResponse = response - .json() - .await - .map_err(EmbedError::openai_unexpected) - .map_err(Retry::retry_later)?; - return Err(Retry::retry_later(EmbedError::openai_internal_server_error( - error_response.error, + error_response.ok().map(|error_response| error_response.error), ))); } StatusCode::BAD_REQUEST => { @@ -248,14 +238,14 @@ impl Embedder { .map_err(EmbedError::openai_unexpected) .map_err(Retry::retry_later)?; - tracing::warn!("OpenAI: input was too long, retrying on tokenized version. For best performance, limit the size of your prompt."); + tracing::warn!("OpenAI: received `BAD_REQUEST`. Input was maybe too long, retrying on tokenized version. 
For best performance, limit the size of your prompt."); return Err(Retry::retry_tokenized(EmbedError::openai_too_many_tokens( error_response.error, ))); } code => { - return Err(Retry::give_up(EmbedError::openai_unhandled_status_code( + return Err(Retry::retry_later(EmbedError::openai_unhandled_status_code( code.as_u16(), ))); } diff --git a/tracing-trace/src/processor/span_stats.rs b/tracing-trace/src/processor/span_stats.rs index f3e6238ff..584fe53f8 100644 --- a/tracing-trace/src/processor/span_stats.rs +++ b/tracing-trace/src/processor/span_stats.rs @@ -1,4 +1,5 @@ use std::collections::{BTreeMap, HashMap}; +use std::ops::Range; use std::time::Duration; use serde::{Deserialize, Serialize}; @@ -16,6 +17,51 @@ enum SpanStatus { pub struct CallStats { pub call_count: usize, pub time: u64, + pub self_time: u64, +} + +#[derive(Debug, Default)] +pub struct SelfTime { + child_ranges: Vec>, +} + +impl SelfTime { + pub fn new() -> Self { + Default::default() + } + + pub fn add_child_range(&mut self, child_range: Range) { + self.child_ranges.push(child_range) + } + + pub fn self_duration(&mut self, self_range: Range) -> Duration { + if self.child_ranges.is_empty() { + return self_range.end - self_range.start; + } + + // by sorting child ranges by their start time, + // we make sure that no child will start before the last one we visited. + self.child_ranges + .sort_by(|left, right| left.start.cmp(&right.start).then(left.end.cmp(&right.end))); + // self duration computed by adding all the segments where the span is not executing a child + let mut self_duration = Duration::from_nanos(0); + + // last point in time where we are certain that this span was not executing a child. + let mut committed_point = self_range.start; + + for child_range in &self.child_ranges { + if child_range.start > committed_point { + // we add to the self duration the point between the end of the latest span and the beginning of the next span + self_duration += child_range.start - committed_point; + } + if committed_point < child_range.end { + // then we set ourselves to the end of the latest span + committed_point = child_range.end; + } + } + + self_duration + } } pub fn to_call_stats( @@ -23,6 +69,9 @@ pub fn to_call_stats( ) -> Result, Error> { let mut calls = HashMap::new(); let mut spans = HashMap::new(); + let mut last_point = Duration::from_nanos(0); + let mut first_point = None; + let mut total_self_time = SelfTime::new(); for entry in trace { let entry = entry?; match entry { @@ -31,10 +80,11 @@ pub fn to_call_stats( } Entry::NewThread(_) => {} Entry::NewSpan(span) => { - spans.insert(span.id, (span, SpanStatus::Outside)); + spans.insert(span.id, (span, SpanStatus::Outside, SelfTime::new())); } Entry::SpanEnter(SpanEnter { id, time, memory: _ }) => { - let (_, status) = spans.get_mut(&id).unwrap(); + first_point.get_or_insert(time); + let (_, status, _) = spans.get_mut(&id).unwrap(); let SpanStatus::Outside = status else { continue; @@ -43,18 +93,32 @@ pub fn to_call_stats( *status = SpanStatus::Inside(time); } Entry::SpanExit(SpanExit { id, time: end, memory: _ }) => { - let (span, status) = spans.get_mut(&id).unwrap(); + let (span, status, self_time) = spans.get_mut(&id).unwrap(); let SpanStatus::Inside(begin) = status else { continue; }; let begin = *begin; + if last_point < end { + last_point = end; + } + *status = SpanStatus::Outside; + let self_range = begin..end; + + let self_duration = self_time.self_duration(self_range.clone()); + *self_time = SelfTime::new(); + let span = *span; + if let Some(parent_id) = 
span.parent_id { + let (_, _, parent_self_time) = spans.get_mut(&parent_id).unwrap(); + parent_self_time.add_child_range(self_range.clone()) + } + total_self_time.add_child_range(self_range); let (_, call_list) = calls.get_mut(&span.call_id).unwrap(); - call_list.push(end - begin); + call_list.push((end - begin, self_duration)); } Entry::SpanClose(SpanClose { id, time: _ }) => { spans.remove(&id); @@ -63,17 +127,31 @@ pub fn to_call_stats( } } + let total_self_time = first_point + .map(|first_point| (first_point, total_self_time.self_duration(first_point..last_point))); + Ok(calls .into_iter() .map(|(_, (call_site, calls))| (site_to_string(call_site), calls_to_stats(calls))) + .chain(total_self_time.map(|(first_point, total_self_time)| { + ( + "::meta::total".to_string(), + CallStats { + call_count: 1, + time: (last_point - first_point).as_nanos() as u64, + self_time: total_self_time.as_nanos() as u64, + }, + ) + })) .collect()) } fn site_to_string(call_site: NewCallsite) -> String { format!("{}::{}", call_site.target, call_site.name) } -fn calls_to_stats(calls: Vec) -> CallStats { +fn calls_to_stats(calls: Vec<(Duration, Duration)>) -> CallStats { let nb = calls.len(); - let sum: Duration = calls.iter().sum(); - CallStats { call_count: nb, time: sum.as_nanos() as u64 } + let sum: Duration = calls.iter().map(|(total, _)| total).sum(); + let self_sum: Duration = calls.iter().map(|(_, self_duration)| self_duration).sum(); + CallStats { call_count: nb, time: sum.as_nanos() as u64, self_time: self_sum.as_nanos() as u64 } } diff --git a/workloads/hackernews.json b/workloads/hackernews.json new file mode 100644 index 000000000..0a99b69ff --- /dev/null +++ b/workloads/hackernews.json @@ -0,0 +1,164 @@ +{ + "name": "hackernews.ndjson_1M", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-100_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-100_000.ndjson", + "sha256": "60ecd23485d560edbd90d9ca31f0e6dba1455422f2a44e402600fbb5f7f1b213" + }, + "hackernews-200_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-200_000.ndjson", + "sha256": "785b0271fdb47cba574fab617d5d332276b835c05dd86e4a95251cf7892a1685" + }, + "hackernews-300_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-300_000.ndjson", + "sha256": "de73c7154652eddfaf69cdc3b2f824d5c452f095f40a20a1c97bb1b5c4d80ab2" + }, + "hackernews-400_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-400_000.ndjson", + "sha256": "c1b00a24689110f366447e434c201c086d6f456d54ed1c4995894102794d8fe7" + }, + "hackernews-500_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-500_000.ndjson", + "sha256": "ae98f9dbef8193d750e3e2dbb6a91648941a1edca5f6e82c143e7996f4840083" + }, + "hackernews-600_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-600_000.ndjson", + "sha256": "b495fdc72c4a944801f786400f22076ab99186bee9699f67cbab2f21f5b74dbe" + }, + "hackernews-700_000.ndjson": { + "local_location": null, + "remote_location": 
"https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-700_000.ndjson", + "sha256": "4b2c63974f3dabaa4954e3d4598b48324d03c522321ac05b0d583f36cb78a28b" + }, + "hackernews-800_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-800_000.ndjson", + "sha256": "cb7b6afe0e6caa1be111be256821bc63b0771b2a0e1fad95af7aaeeffd7ba546" + }, + "hackernews-900_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-900_000.ndjson", + "sha256": "e1154ddcd398f1c867758a93db5bcb21a07b9e55530c188a2917fdef332d3ba9" + }, + "hackernews-1_000_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-1_000_000.ndjson", + "sha256": "27e25efd0b68b159b8b21350d9af76938710cb29ce0393fa71b41c4f3c630ffe" + } + }, + "commands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time" + ], + "searchableAttributes": [ + "title" + ], + "filterableAttributes": [ + "by" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-100_000.ndjson" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-200_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-300_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-400_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-500_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-600_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-700_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-800_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-900_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-1_000_000.ndjson" + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file diff --git a/workloads/movies-nothreads.json b/workloads/movies-nothreads.json new file mode 100644 index 000000000..175daacf9 --- /dev/null +++ b/workloads/movies-nothreads.json @@ -0,0 +1,44 @@ +{ + "name": "movies.json,no-threads", + "run_count": 2, + "extra_cli_args": [ + "--max-indexing-threads=1" + ], + "assets": { + "movies.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies.json", + "sha256": "5b6e4cb660bc20327776e8a33ea197b43d9ec84856710ead1cc87ab24df77de1" + } + }, + 
"commands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "title", + "overview" + ], + "filterableAttributes": [ + "genres", + "release_date" + ], + "sortableAttributes": [ + "release_date" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "movies.json" + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file diff --git a/workloads/movies.json b/workloads/movies.json new file mode 100644 index 000000000..445ff3aca --- /dev/null +++ b/workloads/movies.json @@ -0,0 +1,42 @@ +{ + "name": "movies.json", + "run_count": 10, + "extra_cli_args": [], + "assets": { + "movies.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies.json", + "sha256": "5b6e4cb660bc20327776e8a33ea197b43d9ec84856710ead1cc87ab24df77de1" + } + }, + "commands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "title", + "overview" + ], + "filterableAttributes": [ + "genres", + "release_date" + ], + "sortableAttributes": [ + "release_date" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "movies.json" + }, + "synchronous": "WaitForTask" + } + ] +} diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 07271ea09..562dfddb3 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -11,5 +11,34 @@ license.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +anyhow = "1.0.79" +build-info = { version = "1.7.0", path = "../build-info" } cargo_metadata = "0.18.1" clap = { version = "4.4.14", features = ["derive"] } +futures-core = "0.3.30" +futures-util = "0.3.30" +reqwest = { version = "0.11.23", features = [ + "stream", + "json", + "rustls-tls", +], default_features = false } +serde = { version = "1.0.195", features = ["derive"] } +serde_json = "1.0.111" +sha2 = "0.10.8" +sysinfo = "0.30.5" +time = { version = "0.3.32", features = [ + "serde", + "serde-human-readable", + "macros", +] } +tokio = { version = "1.35.1", features = [ + "rt", + "net", + "time", + "process", + "signal", +] } +tracing = "0.1.40" +tracing-subscriber = "0.3.18" +tracing-trace = { version = "0.1.0", path = "../tracing-trace" } +uuid = { version = "1.7.0", features = ["v7", "serde"] } diff --git a/xtask/src/bench/assets.rs b/xtask/src/bench/assets.rs new file mode 100644 index 000000000..241928dbf --- /dev/null +++ b/xtask/src/bench/assets.rs @@ -0,0 +1,250 @@ +use std::collections::BTreeMap; +use std::io::{Read as _, Seek as _, Write as _}; + +use anyhow::{bail, Context}; +use futures_util::TryStreamExt as _; +use serde::Deserialize; +use sha2::Digest; + +use super::client::Client; + +#[derive(Deserialize, Clone)] +pub struct Asset { + pub local_location: Option, + pub remote_location: Option, + #[serde(default)] + pub format: AssetFormat, + pub sha256: Option, +} + +#[derive(Deserialize, Default, Copy, Clone)] +pub enum AssetFormat { + #[default] + Auto, + Json, + NdJson, + Raw, +} + +impl AssetFormat { + pub fn to_content_type(self, filename: &str) -> &'static str { + match self { + AssetFormat::Auto => Self::auto_detect(filename).to_content_type(filename), + AssetFormat::Json => "application/json", + AssetFormat::NdJson => "application/x-ndjson", + AssetFormat::Raw => 
"application/octet-stream", + } + } + + fn auto_detect(filename: &str) -> Self { + let path = std::path::Path::new(filename); + match path.extension().and_then(|extension| extension.to_str()) { + Some(extension) if extension.eq_ignore_ascii_case("json") => Self::Json, + Some(extension) if extension.eq_ignore_ascii_case("ndjson") => Self::NdJson, + extension => { + tracing::warn!(asset = filename, ?extension, "asset has format `Auto`, but extension was not recognized. Specify `Raw` format to suppress this warning."); + AssetFormat::Raw + } + } + } +} + +pub fn fetch_asset( + name: &str, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<(std::fs::File, AssetFormat)> { + let asset = + assets.get(name).with_context(|| format!("could not find asset with name '{name}'"))?; + let filename = if let Some(local_filename) = &asset.local_location { + local_filename.clone() + } else { + format!("{asset_folder}/{name}") + }; + + Ok(( + std::fs::File::open(&filename) + .with_context(|| format!("could not open asset '{name}' at '{filename}'"))?, + asset.format, + )) +} + +#[tracing::instrument(skip(client, assets), fields(asset_count = assets.len()))] +pub async fn fetch_assets( + client: &Client, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + let mut download_tasks = tokio::task::JoinSet::new(); + for (name, asset) in assets { + // trying local + if let Some(local) = &asset.local_location { + match std::fs::File::open(local) { + Ok(file) => { + if check_sha256(name, asset, file)? { + continue; + } else { + tracing::warn!(asset = name, file = local, "found local resource for asset but hash differed, skipping to asset store"); + } + } + Err(error) => match error.kind() { + std::io::ErrorKind::NotFound => { /* file does not exist, go to remote, no need for logs */ + } + _ => tracing::warn!( + error = &error as &dyn std::error::Error, + "error checking local resource, skipping to asset store" + ), + }, + } + } + + // checking asset store + let store_filename = format!("{}/{}", asset_folder, name); + + match std::fs::File::open(&store_filename) { + Ok(file) => { + if check_sha256(name, asset, file)? 
{ + continue; + } else { + tracing::warn!(asset = name, file = store_filename, "found resource for asset in asset store, but hash differed, skipping to remote method"); + } + } + Err(error) => match error.kind() { + std::io::ErrorKind::NotFound => { /* file does not exist, go to remote, no need for logs */ + } + _ => tracing::warn!( + error = &error as &dyn std::error::Error, + "error checking resource in store, skipping to remote method" + ), + }, + } + + // downloading remote + match &asset.remote_location { + Some(location) => { + std::fs::create_dir_all(asset_folder).with_context(|| format!("could not create asset folder at {asset_folder}"))?; + download_tasks.spawn({ + let client = client.clone(); + let name = name.to_string(); + let location = location.to_string(); + let store_filename = store_filename.clone(); + let asset = asset.clone(); + download_asset(client, name, asset, location, store_filename)}); + }, + None => bail!("asset {name} has no remote location, but was not found locally or in the asset store"), + } + } + + while let Some(res) = download_tasks.join_next().await { + res.context("download task panicked")?.context("download task failed")?; + } + + Ok(()) +} + +fn check_sha256(name: &str, asset: &Asset, mut file: std::fs::File) -> anyhow::Result { + let mut bytes = Vec::new(); + file.read_to_end(&mut bytes).with_context(|| format!("hashing file for asset {name}"))?; + let mut file_hash = sha2::Sha256::new(); + file_hash.update(&bytes); + let file_hash = file_hash.finalize(); + let file_hash = format!("{:x}", file_hash); + tracing::debug!(hash = file_hash, "hashed local file"); + + Ok(match &asset.sha256 { + Some(hash) => { + tracing::debug!(hash, "hash from workload"); + if hash.to_ascii_lowercase() == file_hash { + true + } else { + tracing::warn!( + file_hash, + asset_hash = hash.to_ascii_lowercase(), + "hashes don't match" + ); + false + } + } + None => { + tracing::warn!(sha256 = file_hash, "Skipping hash for asset {name} that doesn't have one. 
Please add it to workload file"); + true + } + }) +} + +#[tracing::instrument(skip(client, asset, name), fields(asset = name))] +async fn download_asset( + client: Client, + name: String, + asset: Asset, + src: String, + dest_filename: String, +) -> anyhow::Result<()> { + let context = || format!("failure downloading asset {name} from {src}"); + + let response = client.get(&src).send().await.with_context(context)?; + + let file = std::fs::File::options() + .create(true) + .truncate(true) + .write(true) + .read(true) + .open(&dest_filename) + .with_context(|| format!("creating destination file {dest_filename}")) + .with_context(context)?; + + let mut dest = std::io::BufWriter::new( + file.try_clone().context("cloning I/O handle").with_context(context)?, + ); + + let total_len: Option = response + .headers() + .get(reqwest::header::CONTENT_LENGTH) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse().ok()); + + let progress = tokio::spawn({ + let name = name.clone(); + async move { + loop { + match file.metadata().context("could not get file metadata") { + Ok(metadata) => { + let len = metadata.len(); + tracing::info!( + asset = name, + downloaded_bytes = len, + total_bytes = total_len, + "asset download in progress" + ); + } + Err(error) => { + tracing::warn!(%error, "could not get file metadata"); + } + } + tokio::time::sleep(std::time::Duration::from_secs(60)).await; + } + } + }); + + let writing_context = || format!("while writing to destination file at {dest_filename}"); + + let mut response = response.bytes_stream(); + + while let Some(bytes) = + response.try_next().await.context("while downloading file").with_context(context)? + { + dest.write_all(&bytes).with_context(writing_context).with_context(context)?; + } + + progress.abort(); + + let mut file = dest.into_inner().with_context(writing_context).with_context(context)?; + + file.rewind().context("while rewinding asset file")?; + + if !check_sha256(&name, &asset, file)? 
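
The "sha256" fields in the workload files above are verified by check_sha256. A small sketch for producing such a digest for a new asset, using the same sha2 crate pinned in xtask/Cargo.toml (the path is illustrative):

    use sha2::Digest;

    fn main() -> anyhow::Result<()> {
        let bytes = std::fs::read("bench/assets/movies.json")?;
        let mut hasher = sha2::Sha256::new();
        hasher.update(&bytes);
        // hex-encoded digest, ready to paste into the workload's "sha256" field
        println!("{:x}", hasher.finalize());
        Ok(())
    }
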
{ + bail!("asset '{name}': sha256 mismatch for file {dest_filename} downloaded from {src}") + } + + Ok(()) +} diff --git a/xtask/src/bench/client.rs b/xtask/src/bench/client.rs new file mode 100644 index 000000000..3e46615cc --- /dev/null +++ b/xtask/src/bench/client.rs @@ -0,0 +1,80 @@ +use anyhow::Context; +use serde::Deserialize; + +#[derive(Debug, Clone)] +pub struct Client { + base_url: Option, + client: reqwest::Client, +} + +impl Client { + pub fn new( + base_url: Option, + api_key: Option<&str>, + timeout: Option, + ) -> anyhow::Result { + let mut headers = reqwest::header::HeaderMap::new(); + if let Some(api_key) = api_key { + headers.append( + reqwest::header::AUTHORIZATION, + reqwest::header::HeaderValue::from_str(&format!("Bearer {api_key}")) + .context("Invalid authorization header")?, + ); + } + + let client = reqwest::ClientBuilder::new().default_headers(headers); + let client = if let Some(timeout) = timeout { client.timeout(timeout) } else { client }; + let client = client.build()?; + Ok(Self { base_url, client }) + } + + pub fn request(&self, method: reqwest::Method, route: &str) -> reqwest::RequestBuilder { + if let Some(base_url) = &self.base_url { + if route.is_empty() { + self.client.request(method, base_url) + } else { + self.client.request(method, format!("{}/{}", base_url, route)) + } + } else { + self.client.request(method, route) + } + } + + pub fn get(&self, route: &str) -> reqwest::RequestBuilder { + self.request(reqwest::Method::GET, route) + } + + pub fn put(&self, route: &str) -> reqwest::RequestBuilder { + self.request(reqwest::Method::PUT, route) + } + + pub fn post(&self, route: &str) -> reqwest::RequestBuilder { + self.request(reqwest::Method::POST, route) + } + + pub fn delete(&self, route: &str) -> reqwest::RequestBuilder { + self.request(reqwest::Method::DELETE, route) + } +} + +#[derive(Debug, Clone, Copy, Deserialize)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +pub enum Method { + Get, + Post, + Patch, + Delete, + Put, +} + +impl From for reqwest::Method { + fn from(value: Method) -> Self { + match value { + Method::Get => Self::GET, + Method::Post => Self::POST, + Method::Patch => Self::PATCH, + Method::Delete => Self::DELETE, + Method::Put => Self::PUT, + } + } +} diff --git a/xtask/src/bench/command.rs b/xtask/src/bench/command.rs new file mode 100644 index 000000000..0f0b5d213 --- /dev/null +++ b/xtask/src/bench/command.rs @@ -0,0 +1,194 @@ +use std::collections::BTreeMap; +use std::fmt::Display; +use std::io::Read as _; + +use anyhow::{bail, Context as _}; +use serde::Deserialize; + +use super::assets::{fetch_asset, Asset}; +use super::client::{Client, Method}; + +#[derive(Clone, Deserialize)] +pub struct Command { + pub route: String, + pub method: Method, + #[serde(default)] + pub body: Body, + #[serde(default)] + pub synchronous: SyncMode, +} + +#[derive(Default, Clone, Deserialize)] +#[serde(untagged)] +pub enum Body { + Inline { + inline: serde_json::Value, + }, + Asset { + asset: String, + }, + #[default] + Empty, +} + +impl Body { + pub fn get( + self, + assets: &BTreeMap, + asset_folder: &str, + ) -> anyhow::Result, &'static str)>> { + Ok(match self { + Body::Inline { inline: body } => Some(( + serde_json::to_vec(&body) + .context("serializing to bytes") + .context("while getting inline body")?, + "application/json", + )), + Body::Asset { asset: name } => Some({ + let context = || format!("while getting body from asset '{name}'"); + let (mut file, format) = + fetch_asset(&name, assets, asset_folder).with_context(context)?; + let 
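
client.rs wraps reqwest::Client so the bearer token is installed once as a default header and every route is joined onto the base URL. A standalone sketch of that pattern (URL, key and timeout are placeholders):

    use reqwest::header::{HeaderMap, HeaderValue, AUTHORIZATION};

    fn build_client(api_key: Option<&str>) -> anyhow::Result<reqwest::Client> {
        let mut headers = HeaderMap::new();
        if let Some(key) = api_key {
            headers.insert(AUTHORIZATION, HeaderValue::from_str(&format!("Bearer {key}"))?);
        }
        Ok(reqwest::ClientBuilder::new()
            .default_headers(headers)
            .timeout(std::time::Duration::from_secs(60))
            .build()?)
    }

    fn main() -> anyhow::Result<()> {
        let client = build_client(Some("MASTER_KEY"))?;
        // mirrors Client::request's URL composition: "{base_url}/{route}"
        let _request = client.get(format!("{}/{}", "http://127.0.0.1:7700", "health"));
        // calling .send().await on the builder inside a runtime would issue the request
        Ok(())
    }
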
mut buf = Vec::new(); + file.read_to_end(&mut buf).with_context(context)?; + (buf, format.to_content_type(&name)) + }), + Body::Empty => None, + }) + } +} + +impl Display for Command { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?} {} ({:?})", self.method, self.route, self.synchronous) + } +} + +#[derive(Default, Debug, Clone, Copy, Deserialize)] +pub enum SyncMode { + DontWait, + #[default] + WaitForResponse, + WaitForTask, +} + +pub async fn run_batch( + client: &Client, + batch: &[Command], + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + let [.., last] = batch else { return Ok(()) }; + let sync = last.synchronous; + + let mut tasks = tokio::task::JoinSet::new(); + + for command in batch { + // FIXME: you probably don't want to copy assets everytime here + tasks.spawn({ + let client = client.clone(); + let command = command.clone(); + let assets = assets.clone(); + let asset_folder = asset_folder.to_owned(); + + async move { run(client, command, &assets, &asset_folder).await } + }); + } + + while let Some(result) = tasks.join_next().await { + result + .context("panicked while executing command")? + .context("error while executing command")?; + } + + match sync { + SyncMode::DontWait => {} + SyncMode::WaitForResponse => {} + SyncMode::WaitForTask => wait_for_tasks(client).await?, + } + + Ok(()) +} + +async fn wait_for_tasks(client: &Client) -> anyhow::Result<()> { + loop { + let response = client + .get("tasks?statuses=enqueued,processing") + .send() + .await + .context("could not wait for tasks")?; + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response to JSON") + .context("could not wait for tasks")?; + match response.get("total") { + Some(serde_json::Value::Number(number)) => { + let number = number.as_u64().with_context(|| { + format!("waiting for tasks: could not parse 'total' as integer, got {}", number) + })?; + if number == 0 { + break; + } else { + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + continue; + } + } + Some(thing_else) => { + bail!(format!( + "waiting for tasks: could not parse 'total' as a number, got '{thing_else}'" + )) + } + None => { + bail!(format!( + "waiting for tasks: expected response to contain 'total', got '{response}'" + )) + } + } + } + Ok(()) +} + +#[tracing::instrument(skip(client, command, assets, asset_folder), fields(command = %command))] +pub async fn run( + client: Client, + mut command: Command, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + // memtake the body here to leave an empty body in its place, so that command is not partially moved-out + let body = std::mem::take(&mut command.body) + .get(assets, asset_folder) + .with_context(|| format!("while getting body for command {command}"))?; + + let request = client.request(command.method.into(), &command.route); + + let request = if let Some((body, content_type)) = body { + request.body(body).header(reqwest::header::CONTENT_TYPE, content_type) + } else { + request + }; + + let response = + request.send().await.with_context(|| format!("error sending command: {}", command))?; + + let code = response.status(); + if code.is_client_error() { + tracing::error!(%command, %code, "error in workload file"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("parsing error in workload file when sending command")?; + bail!("error in workload file: server responded with error 
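
run_batch receives slices produced by split_inclusive in workload.rs, so every DontWait command is sent concurrently with the commands that follow it, up to and including the next command that waits. A self-contained sketch of that grouping, mirroring the start of the hackernews workload above (the enum here is a local copy for illustration):

    #[derive(Debug, PartialEq)]
    enum SyncMode { DontWait, WaitForResponse, WaitForTask }

    fn main() {
        let commands = [
            SyncMode::DontWait,        // PATCH settings
            SyncMode::WaitForTask,     // first document batch
            SyncMode::WaitForResponse, // second document batch
            SyncMode::WaitForResponse, // third document batch
        ];
        let batches: Vec<&[SyncMode]> = commands
            .split_inclusive(|mode| !matches!(mode, SyncMode::DontWait))
            .collect();
        // the settings call and the first document batch are sent concurrently,
        // then each remaining command forms its own batch
        assert_eq!(batches, [&commands[0..2], &commands[2..3], &commands[3..4]]);
    }
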
code {code} and '{response}'") + } else if code.is_server_error() { + tracing::error!(%command, %code, "server error"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("parsing server error when sending command")?; + bail!("server error: server responded with error code {code} and '{response}'") + } + + Ok(()) +} diff --git a/xtask/src/bench/dashboard.rs b/xtask/src/bench/dashboard.rs new file mode 100644 index 000000000..833426207 --- /dev/null +++ b/xtask/src/bench/dashboard.rs @@ -0,0 +1,167 @@ +use std::collections::BTreeMap; + +use anyhow::{bail, Context}; +use serde_json::json; +use tokio::signal::ctrl_c; +use tokio::task::AbortHandle; +use tracing_trace::processor::span_stats::CallStats; +use uuid::Uuid; + +use super::client::Client; +use super::env_info; +use super::workload::Workload; + +pub async fn cancel_on_ctrl_c( + invocation_uuid: Uuid, + dashboard_client: Client, + abort_handle: AbortHandle, +) { + tracing::info!("press Ctrl-C to cancel the invocation"); + match ctrl_c().await { + Ok(()) => { + tracing::info!(%invocation_uuid, "received Ctrl-C, cancelling invocation"); + mark_as_failed(dashboard_client, invocation_uuid, None).await; + abort_handle.abort(); + } + Err(error) => tracing::warn!( + error = &error as &dyn std::error::Error, + "failed to listen to Ctrl-C signal, invocation won't be canceled on Ctrl-C" + ), + } +} + +pub async fn mark_as_failed( + dashboard_client: Client, + invocation_uuid: Uuid, + failure_reason: Option, +) { + let response = dashboard_client + .post("cancel-invocation") + .json(&json!({ + "invocation_uuid": invocation_uuid, + "failure_reason": failure_reason, + })) + .send() + .await; + let response = match response { + Ok(response) => response, + Err(response_error) => { + tracing::error!(error = &response_error as &dyn std::error::Error, %invocation_uuid, "could not mark invocation as failed"); + return; + } + }; + + if !response.status().is_success() { + tracing::error!( + %invocation_uuid, + "could not mark invocation as failed: {}", + response.text().await.unwrap() + ); + return; + } + tracing::warn!(%invocation_uuid, "marked invocation as failed or canceled"); +} + +pub async fn send_machine_info( + dashboard_client: &Client, + env: &env_info::Environment, +) -> anyhow::Result<()> { + let response = dashboard_client + .put("machine") + .json(&json!({"hostname": env.hostname})) + .send() + .await + .context("sending machine information")?; + if !response.status().is_success() { + bail!( + "could not send machine information: {} {}", + response.status(), + response.text().await.unwrap_or_else(|_| "unknown".into()) + ); + } + Ok(()) +} + +pub async fn create_invocation( + dashboard_client: &Client, + build_info: build_info::BuildInfo, + commit_message: &str, + env: env_info::Environment, + max_workloads: usize, + reason: Option<&str>, +) -> anyhow::Result { + let response = dashboard_client + .put("invocation") + .json(&json!({ + "commit": { + "sha1": build_info.commit_sha1, + "message": commit_message, + "commit_date": build_info.commit_timestamp, + "branch": build_info.branch, + "tag": build_info.describe.and_then(|describe| describe.as_tag()), + }, + "machine_hostname": env.hostname, + "max_workloads": max_workloads, + "reason": reason + })) + .send() + .await + .context("sending invocation")?; + if !response.status().is_success() { + bail!( + "could not send new invocation: {}", + response.text().await.unwrap_or_else(|_| "unknown".into()) + ); + } + let 
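
cancel_on_ctrl_c above pairs a Ctrl-C listener with the AbortHandle of the running invocation. A minimal sketch of that pattern, assuming tokio with the "rt", "time" and "signal" features as pinned in xtask/Cargo.toml (the dashboard call is elided):

    fn main() -> anyhow::Result<()> {
        let rt = tokio::runtime::Builder::new_current_thread().enable_io().enable_time().build()?;
        rt.block_on(async {
            let work = tokio::spawn(async {
                // stands in for the workload runs
                tokio::time::sleep(std::time::Duration::from_secs(3600)).await;
            });
            let abort_handle = work.abort_handle();
            tokio::spawn(async move {
                if tokio::signal::ctrl_c().await.is_ok() {
                    // a real runner reports the failure (mark_as_failed) before aborting
                    abort_handle.abort();
                }
            });
            // a JoinError with is_cancelled() == true indicates the Ctrl-C path
            let _ = work.await;
            anyhow::Ok(())
        })
    }
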
invocation_uuid: Uuid = + response.json().await.context("could not deserialize invocation response as JSON")?; + Ok(invocation_uuid) +} + +pub async fn create_workload( + dashboard_client: &Client, + invocation_uuid: Uuid, + workload: &Workload, +) -> anyhow::Result { + let response = dashboard_client + .put("workload") + .json(&json!({ + "invocation_uuid": invocation_uuid, + "name": &workload.name, + "max_runs": workload.run_count, + })) + .send() + .await + .context("could not create new workload")?; + + if !response.status().is_success() { + bail!("creating new workload failed: {}", response.text().await.unwrap()) + } + + let workload_uuid: Uuid = + response.json().await.context("could not deserialize JSON as UUID")?; + Ok(workload_uuid) +} + +pub async fn create_run( + dashboard_client: Client, + workload_uuid: Uuid, + report: &BTreeMap, +) -> anyhow::Result<()> { + let response = dashboard_client + .put("run") + .json(&json!({ + "workload_uuid": workload_uuid, + "data": report + })) + .send() + .await + .context("sending new run")?; + if !response.status().is_success() { + bail!( + "sending new run failed: {}", + response.text().await.unwrap_or_else(|_| "unknown".into()) + ) + } + Ok(()) +} diff --git a/xtask/src/bench/env_info.rs b/xtask/src/bench/env_info.rs new file mode 100644 index 000000000..08dacf915 --- /dev/null +++ b/xtask/src/bench/env_info.rs @@ -0,0 +1,75 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct Environment { + pub hostname: Option, + pub cpu: String, + + /// Advertised or nominal clock speed in Hertz. + pub clock_speed: u64, + + /// Total number of bytes of memory provided by the system. */ + pub memory: u64, + pub os_type: String, + pub software: Vec, + + pub user_name: String, + + /// Is set true when the data was gathered by a manual run, + /// possibly on a developer machine, instead of the usual benchmark server. 
+ pub manual_run: bool, +} + +impl Environment { + pub fn generate_from_current_config() -> Self { + use sysinfo::System; + + let unknown_string = String::from("Unknown"); + let mut system = System::new(); + system.refresh_cpu(); + system.refresh_cpu_frequency(); + system.refresh_memory(); + + let (cpu, frequency) = match system.cpus().first() { + Some(cpu) => ( + format!("{} @ {:.2}GHz", cpu.brand(), cpu.frequency() as f64 / 1000.0), + cpu.frequency() * 1_000_000, + ), + None => (unknown_string.clone(), 0), + }; + + let mut software = Vec::new(); + if let Some(distribution) = System::name() { + software + .push(VersionInfo { name: distribution, version: String::from("distribution") }); + } + if let Some(kernel) = System::kernel_version() { + software.push(VersionInfo { name: kernel, version: String::from("kernel") }); + } + if let Some(os) = System::os_version() { + software.push(VersionInfo { name: os, version: String::from("kernel-release") }); + } + if let Some(arch) = System::cpu_arch() { + software.push(VersionInfo { name: arch, version: String::from("arch") }); + } + + Self { + hostname: System::host_name(), + cpu, + clock_speed: frequency, + memory: system.total_memory(), + os_type: System::long_os_version().unwrap_or(unknown_string.clone()), + user_name: System::name().unwrap_or(unknown_string.clone()), + manual_run: false, + software, + } + } +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct VersionInfo { + pub name: String, + pub version: String, +} diff --git a/xtask/src/bench/meili_process.rs b/xtask/src/bench/meili_process.rs new file mode 100644 index 000000000..99f6f4ea6 --- /dev/null +++ b/xtask/src/bench/meili_process.rs @@ -0,0 +1,112 @@ +use std::collections::BTreeMap; + +use anyhow::{bail, Context as _}; + +use super::assets::Asset; +use super::client::Client; +use super::workload::Workload; + +pub async fn kill(mut meilisearch: tokio::process::Child) { + if let Err(error) = meilisearch.kill().await { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server" + ) + } +} + +#[tracing::instrument] +pub async fn build() -> anyhow::Result<()> { + let mut command = tokio::process::Command::new("cargo"); + command.arg("build").arg("--release").arg("-p").arg("meilisearch"); + + command.kill_on_drop(true); + + let mut builder = command.spawn().context("error building Meilisearch")?; + + if !builder.wait().await.context("could not build Meilisearch")?.success() { + bail!("failed building Meilisearch") + } + + Ok(()) +} + +#[tracing::instrument(skip(client, master_key, workload), fields(workload = workload.name))] +pub async fn start( + client: &Client, + master_key: Option<&str>, + workload: &Workload, + asset_folder: &str, +) -> anyhow::Result { + let mut command = tokio::process::Command::new("cargo"); + command + .arg("run") + .arg("--release") + .arg("-p") + .arg("meilisearch") + .arg("--bin") + .arg("meilisearch") + .arg("--"); + + command.arg("--db-path").arg("./_xtask_benchmark.ms"); + if let Some(master_key) = master_key { + command.arg("--master-key").arg(master_key); + } + command.arg("--experimental-enable-logs-route"); + + for extra_arg in workload.extra_cli_args.iter() { + command.arg(extra_arg); + } + + command.kill_on_drop(true); + + let mut meilisearch = command.spawn().context("Error starting Meilisearch")?; + + wait_for_health(client, &mut meilisearch, &workload.assets, asset_folder).await?; + + Ok(meilisearch) +} + +async fn wait_for_health( + client: &Client, + 
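
generate_from_current_config relies on sysinfo 0.30, where host name, OS and kernel queries are associated functions on System while CPU and memory data need an explicit refresh. A trimmed, runnable sketch of those calls (output formatting is illustrative):

    use sysinfo::System;

    fn main() {
        let mut system = System::new();
        system.refresh_cpu();
        system.refresh_memory();

        let cpu = system
            .cpus()
            .first()
            .map(|cpu| format!("{} @ {:.2}GHz", cpu.brand(), cpu.frequency() as f64 / 1000.0))
            .unwrap_or_else(|| "Unknown".into());

        println!("hostname: {:?}", System::host_name());
        println!("cpu: {cpu}");
        println!("memory: {} bytes", system.total_memory());
    }
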
meilisearch: &mut tokio::process::Child, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + for i in 0..100 { + let res = super::command::run(client.clone(), health_command(), assets, asset_folder).await; + if res.is_ok() { + // check that this is actually the current Meilisearch instance that answered us + if let Some(exit_code) = + meilisearch.try_wait().context("cannot check Meilisearch server process status")? + { + tracing::error!("Got an health response from a different process"); + bail!("Meilisearch server exited early with code {exit_code}"); + } + + return Ok(()); + } + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + // check whether the Meilisearch instance exited early (cut the wait) + if let Some(exit_code) = + meilisearch.try_wait().context("cannot check Meilisearch server process status")? + { + bail!("Meilisearch server exited early with code {exit_code}"); + } + tracing::debug!(attempt = i, "Waiting for Meilisearch to go up"); + } + bail!("meilisearch is not responding") +} + +fn health_command() -> super::command::Command { + super::command::Command { + route: "/health".into(), + method: super::client::Method::Get, + body: Default::default(), + synchronous: super::command::SyncMode::WaitForResponse, + } +} + +pub fn delete_db() { + let _ = std::fs::remove_dir_all("./_xtask_benchmark.ms"); +} diff --git a/xtask/src/bench/mod.rs b/xtask/src/bench/mod.rs new file mode 100644 index 000000000..62c11b604 --- /dev/null +++ b/xtask/src/bench/mod.rs @@ -0,0 +1,203 @@ +mod assets; +mod client; +mod command; +mod dashboard; +mod env_info; +mod meili_process; +mod workload; + +use std::path::PathBuf; + +use anyhow::Context; +use clap::Parser; +use tracing_subscriber::fmt::format::FmtSpan; +use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::Layer; + +use self::client::Client; +use self::workload::Workload; + +pub fn default_http_addr() -> String { + "127.0.0.1:7700".to_string() +} +pub fn default_report_folder() -> String { + "./bench/reports/".into() +} + +pub fn default_asset_folder() -> String { + "./bench/assets/".into() +} + +pub fn default_log_filter() -> String { + "info".into() +} + +pub fn default_dashboard_url() -> String { + "http://localhost:9001".into() +} + +/// Run benchmarks from a workload +#[derive(Parser, Debug)] +pub struct BenchDeriveArgs { + /// Filename of the workload file, pass multiple filenames + /// to run multiple workloads in the specified order. + /// + /// Each workload run will get its own report file. + #[arg(value_name = "WORKLOAD_FILE", last = false)] + workload_file: Vec, + + /// URL of the dashboard. + #[arg(long, default_value_t = default_dashboard_url())] + dashboard_url: String, + + /// Directory to output reports. + #[arg(long, default_value_t = default_report_folder())] + report_folder: String, + + /// Directory to store the remote assets. 
+ #[arg(long, default_value_t = default_asset_folder())] + asset_folder: String, + + /// Log directives + #[arg(short, long, default_value_t = default_log_filter())] + log_filter: String, + + /// Benchmark dashboard API key + #[arg(long)] + api_key: Option, + + /// Meilisearch master keys + #[arg(long)] + master_key: Option, + + /// Authentication bearer for fetching assets + #[arg(long)] + assets_key: Option, + + /// Reason for the benchmark invocation + #[arg(short, long)] + reason: Option, +} + +pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { + // setup logs + let filter: tracing_subscriber::filter::Targets = + args.log_filter.parse().context("invalid --log-filter")?; + + let subscriber = tracing_subscriber::registry().with( + tracing_subscriber::fmt::layer() + .with_span_events(FmtSpan::NEW | FmtSpan::CLOSE) + .with_filter(filter), + ); + tracing::subscriber::set_global_default(subscriber).context("could not setup logging")?; + + // fetch environment and build info + let env = env_info::Environment::generate_from_current_config(); + let build_info = build_info::BuildInfo::from_build(); + + // tokio runtime + let rt = tokio::runtime::Builder::new_current_thread().enable_io().enable_time().build()?; + let _scope = rt.enter(); + + // setup clients + let assets_client = + Client::new(None, args.assets_key.as_deref(), Some(std::time::Duration::from_secs(3600)))?; // 1h + + let dashboard_client = Client::new( + Some(format!("{}/api/v1", args.dashboard_url)), + args.api_key.as_deref(), + Some(std::time::Duration::from_secs(60)), + )?; + + // reporting uses its own client because keeping the stream open to wait for entries + // blocks any other requests + // Also we don't want any pesky timeout because we don't know how much time it will take to recover the full trace + let logs_client = Client::new( + Some("http://127.0.0.1:7700/logs/stream".into()), + args.master_key.as_deref(), + None, + )?; + + let meili_client = Client::new( + Some("http://127.0.0.1:7700".into()), + args.master_key.as_deref(), + Some(std::time::Duration::from_secs(60)), + )?; + + // enter runtime + + rt.block_on(async { + dashboard::send_machine_info(&dashboard_client, &env).await?; + + let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap(); + let max_workloads = args.workload_file.len(); + let reason: Option<&str> = args.reason.as_deref(); + let invocation_uuid = dashboard::create_invocation(&dashboard_client, build_info, commit_message, env, max_workloads, reason).await?; + + tracing::info!(workload_count = args.workload_file.len(), "handling workload files"); + + // main task + let workload_runs = tokio::spawn( + { + let dashboard_client = dashboard_client.clone(); + async move { + for workload_file in args.workload_file.iter() { + let workload: Workload = serde_json::from_reader( + std::fs::File::open(workload_file) + .with_context(|| format!("error opening {}", workload_file.display()))?, + ) + .with_context(|| format!("error parsing {} as JSON", workload_file.display()))?; + + workload::execute( + &assets_client, + &dashboard_client, + &logs_client, + &meili_client, + invocation_uuid, + args.master_key.as_deref(), + workload, + &args, + ) + .await?; + } + Ok::<(), anyhow::Error>(()) + }}); + + // handle ctrl-c + let abort_handle = workload_runs.abort_handle(); + tokio::spawn({ + let dashboard_client = dashboard_client.clone(); + dashboard::cancel_on_ctrl_c(invocation_uuid, dashboard_client, abort_handle) + }); + + // wait for the end of the main task, 
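
BenchDeriveArgs is a clap derive, so `cargo xtask bench` accepts any number of workload files plus the dashboard and key options. A trimmed, standalone mirror showing how a command line maps onto those fields (only a few fields are reproduced; names and defaults follow the ones above):

    use clap::Parser;

    #[derive(Parser, Debug)]
    struct Args {
        #[arg(value_name = "WORKLOAD_FILE")]
        workload_file: Vec<std::path::PathBuf>,
        #[arg(long, default_value_t = String::from("http://localhost:9001"))]
        dashboard_url: String,
        #[arg(long)]
        api_key: Option<String>,
        #[arg(short, long)]
        reason: Option<String>,
    }

    fn main() {
        let args = Args::parse_from([
            "bench",
            "--api-key", "SECRET",
            "--reason", "Manual run",
            "workloads/movies.json",
            "workloads/movies-nothreads.json",
        ]);
        assert_eq!(args.workload_file.len(), 2);
        assert_eq!(args.dashboard_url, "http://localhost:9001");
        println!("{args:?}");
    }
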
handle result + match workload_runs.await { + Ok(Ok(_)) => { + tracing::info!("Success"); + Ok::<(), anyhow::Error>(()) + } + Ok(Err(error)) => { + tracing::error!(%invocation_uuid, error = %error, "invocation failed, attempting to report the failure to dashboard"); + dashboard::mark_as_failed(dashboard_client, invocation_uuid, Some(error.to_string())).await; + tracing::warn!(%invocation_uuid, "invocation marked as failed following error"); + Err(error) + }, + Err(join_error) => { + match join_error.try_into_panic() { + Ok(panic) => { + tracing::error!("invocation panicked, attempting to report the failure to dashboard"); + dashboard::mark_as_failed(dashboard_client, invocation_uuid, Some("Panicked".into())).await; + std::panic::resume_unwind(panic) + } + Err(_) => { + tracing::warn!("task was canceled"); + Ok(()) + } + } + }, + } + + })?; + + Ok(()) +} diff --git a/xtask/src/bench/workload.rs b/xtask/src/bench/workload.rs new file mode 100644 index 000000000..b3e952f29 --- /dev/null +++ b/xtask/src/bench/workload.rs @@ -0,0 +1,262 @@ +use std::collections::BTreeMap; +use std::fs::File; +use std::io::{Seek as _, Write as _}; + +use anyhow::{bail, Context as _}; +use futures_util::TryStreamExt as _; +use serde::Deserialize; +use serde_json::json; +use tokio::task::JoinHandle; +use uuid::Uuid; + +use super::assets::Asset; +use super::client::Client; +use super::command::SyncMode; +use super::BenchDeriveArgs; +use crate::bench::{assets, dashboard, meili_process}; + +#[derive(Deserialize)] +pub struct Workload { + pub name: String, + pub run_count: u16, + pub extra_cli_args: Vec, + pub assets: BTreeMap, + pub commands: Vec, +} + +async fn run_commands( + dashboard_client: &Client, + logs_client: &Client, + meili_client: &Client, + workload_uuid: Uuid, + workload: &Workload, + args: &BenchDeriveArgs, + run_number: u16, +) -> anyhow::Result>> { + let report_folder = &args.report_folder; + let workload_name = &workload.name; + + std::fs::create_dir_all(report_folder) + .with_context(|| format!("could not create report directory at {report_folder}"))?; + + let trace_filename = format!("{report_folder}/{workload_name}-{run_number}-trace.json"); + let report_filename = format!("{report_folder}/{workload_name}-{run_number}-report.json"); + + let report_handle = start_report(logs_client, trace_filename).await?; + + for batch in workload + .commands + .as_slice() + .split_inclusive(|command| !matches!(command.synchronous, SyncMode::DontWait)) + { + super::command::run_batch(meili_client, batch, &workload.assets, &args.asset_folder) + .await?; + } + + let processor = + stop_report(dashboard_client, logs_client, workload_uuid, report_filename, report_handle) + .await?; + + Ok(processor) +} + +#[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner +#[tracing::instrument(skip(assets_client, dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = workload.name))] +pub async fn execute( + assets_client: &Client, + dashboard_client: &Client, + logs_client: &Client, + meili_client: &Client, + invocation_uuid: Uuid, + master_key: Option<&str>, + workload: Workload, + args: &BenchDeriveArgs, +) -> anyhow::Result<()> { + assets::fetch_assets(assets_client, &workload.assets, &args.asset_folder).await?; + + let workload_uuid = + dashboard::create_workload(dashboard_client, invocation_uuid, &workload).await?; + + let mut tasks = Vec::new(); + + for i in 0..workload.run_count { + tasks.push( + execute_run( + dashboard_client, + logs_client, + 
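
The Workload struct above is what the JSON workload files deserialize into. A standalone sketch with trimmed mirror types (assets and command bodies are reduced to serde_json::Value so the snippet stands alone), fed with the header of movies-nothreads.json:

    use std::collections::BTreeMap;

    use serde::Deserialize;

    #[derive(Deserialize, Debug)]
    struct Workload {
        name: String,
        run_count: u16,
        extra_cli_args: Vec<String>,
        assets: BTreeMap<String, serde_json::Value>,
        commands: Vec<serde_json::Value>,
    }

    fn main() -> anyhow::Result<()> {
        let raw = r#"{
            "name": "movies.json,no-threads",
            "run_count": 2,
            "extra_cli_args": ["--max-indexing-threads=1"],
            "assets": {},
            "commands": []
        }"#;
        let workload: Workload = serde_json::from_str(raw)?;
        assert_eq!(workload.name, "movies.json,no-threads");
        assert_eq!(workload.run_count, 2);
        assert_eq!(workload.extra_cli_args, ["--max-indexing-threads=1"]);
        assert!(workload.assets.is_empty());
        assert!(workload.commands.is_empty());
        println!("{workload:?}");
        Ok(())
    }
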
meili_client, + workload_uuid, + master_key, + &workload, + args, + i, + ) + .await?, + ); + } + + let mut reports = Vec::with_capacity(workload.run_count as usize); + + for task in tasks { + reports.push( + task.await + .context("task panicked while processing report")? + .context("task failed while processing report")?, + ); + } + + tracing::info!(workload = workload.name, "Successful workload"); + + Ok(()) +} + +#[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner +#[tracing::instrument(skip(dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = %workload.name))] +async fn execute_run( + dashboard_client: &Client, + logs_client: &Client, + meili_client: &Client, + workload_uuid: Uuid, + master_key: Option<&str>, + workload: &Workload, + args: &BenchDeriveArgs, + run_number: u16, +) -> anyhow::Result>> { + meili_process::delete_db(); + + meili_process::build().await?; + let meilisearch = + meili_process::start(meili_client, master_key, workload, &args.asset_folder).await?; + + let processor = run_commands( + dashboard_client, + logs_client, + meili_client, + workload_uuid, + workload, + args, + run_number, + ) + .await?; + + meili_process::kill(meilisearch).await; + + tracing::info!(run_number, "Successful run"); + + Ok(processor) +} + +async fn start_report( + logs_client: &Client, + filename: String, +) -> anyhow::Result>> { + let report_file = std::fs::File::options() + .create(true) + .truncate(true) + .write(true) + .read(true) + .open(&filename) + .with_context(|| format!("could not create file at {filename}"))?; + let mut report_file = std::io::BufWriter::new(report_file); + + let response = logs_client + .post("") + .json(&json!({ + "mode": "profile", + "target": "indexing::=trace" + })) + .send() + .await + .context("failed to start report")?; + + let code = response.status(); + if code.is_client_error() { + tracing::error!(%code, "request error when trying to start report"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("response error when trying to start report")?; + bail!( + "request error when trying to start report: server responded with error code {code} and '{response}'" + ) + } else if code.is_server_error() { + tracing::error!(%code, "server error when trying to start report"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("response error trying to start report")?; + bail!("server error when trying to start report: server responded with error code {code} and '{response}'") + } + + Ok(tokio::task::spawn(async move { + let mut stream = response.bytes_stream(); + while let Some(bytes) = stream.try_next().await.context("while waiting for report")? 
{ + report_file + .write_all(&bytes) + .with_context(|| format!("while writing report to {filename}"))?; + } + report_file.into_inner().with_context(|| format!("while writing report to {filename}")) + })) +} + +async fn stop_report( + dashboard_client: &Client, + logs_client: &Client, + workload_uuid: Uuid, + filename: String, + report_handle: tokio::task::JoinHandle>, +) -> anyhow::Result>> { + let response = logs_client.delete("").send().await.context("while stopping report")?; + if !response.status().is_success() { + bail!("received HTTP {} while stopping report", response.status()) + } + + let mut file = tokio::time::timeout(std::time::Duration::from_secs(1000), report_handle) + .await + .context("while waiting for the end of the report")? + .context("report writing task panicked")? + .context("while writing report")?; + + file.rewind().context("while rewinding report file")?; + + let process_handle = tokio::task::spawn({ + let dashboard_client = dashboard_client.clone(); + async move { + let span = tracing::info_span!("processing trace to report", filename); + let _guard = span.enter(); + let report = tracing_trace::processor::span_stats::to_call_stats( + tracing_trace::TraceReader::new(std::io::BufReader::new(file)), + ) + .context("could not convert trace to report")?; + let context = || format!("writing report to {filename}"); + + dashboard::create_run(dashboard_client, workload_uuid, &report).await?; + + let mut output_file = std::io::BufWriter::new( + std::fs::File::options() + .create(true) + .truncate(true) + .write(true) + .read(true) + .open(&filename) + .with_context(context)?, + ); + + for (key, value) in report { + serde_json::to_writer(&mut output_file, &json!({key: value})) + .context("serializing span stat")?; + writeln!(&mut output_file).with_context(context)?; + } + output_file.flush().with_context(context)?; + let mut output_file = output_file.into_inner().with_context(context)?; + + output_file.rewind().context("could not rewind output_file").with_context(context)?; + + Ok(output_file) + } + }); + + Ok(process_handle) +} diff --git a/xtask/src/lib.rs b/xtask/src/lib.rs new file mode 100644 index 000000000..cbda260db --- /dev/null +++ b/xtask/src/lib.rs @@ -0,0 +1 @@ +pub mod bench; diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 6570dc67b..b81424666 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use clap::Parser; +use xtask::bench::BenchDeriveArgs; /// List features available in the workspace #[derive(Parser, Debug)] @@ -17,13 +18,16 @@ struct ListFeaturesDeriveArgs { #[command(bin_name = "cargo xtask")] enum Command { ListFeatures(ListFeaturesDeriveArgs), + Bench(BenchDeriveArgs), } -fn main() { +fn main() -> anyhow::Result<()> { let args = Command::parse(); match args { Command::ListFeatures(args) => list_features(args), + Command::Bench(args) => xtask::bench::run(args)?, } + Ok(()) } fn list_features(args: ListFeaturesDeriveArgs) {
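
With main.rs wired up, `cargo xtask bench -- <WORKLOAD_FILE>...` runs each workload and leaves one report per run under the report folder (./bench/reports/ by default). stop_report writes each report as one JSON object per line, mapping a span name to its call stats. A sketch for reading such a file back (the path is illustrative):

    use std::io::BufRead as _;

    fn main() -> anyhow::Result<()> {
        let file = std::fs::File::open("bench/reports/movies.json-0-report.json")?;
        for line in std::io::BufReader::new(file).lines() {
            let line = line?;
            if line.trim().is_empty() {
                continue;
            }
            let entry: serde_json::Value = serde_json::from_str(&line)?;
            if let Some(object) = entry.as_object() {
                for (span, stats) in object {
                    println!("{span}: {stats}");
                }
            }
        }
        Ok(())
    }
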