diff --git a/.github/workflows/benchmarks-pr.yml b/.github/workflows/benchmarks-pr.yml
index aa784296a..30baa294e 100644
--- a/.github/workflows/benchmarks-pr.yml
+++ b/.github/workflows/benchmarks-pr.yml
@@ -90,7 +90,8 @@ jobs:
           set -x
           export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
           export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
-          echo 'Here are your benchmarks diff 👊' >> body.txt
+          export bench_name=$(echo ${{ steps.command.outputs.command-arguments }})
+          echo "Here are your $bench_name benchmarks diff 👊" >> body.txt
           echo '```' >> body.txt
           ./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
           echo '```' >> body.txt
diff --git a/.github/workflows/publish-apt-brew-pkg.yml b/.github/workflows/publish-apt-brew-pkg.yml
index 452776e38..11893bae0 100644
--- a/.github/workflows/publish-apt-brew-pkg.yml
+++ b/.github/workflows/publish-apt-brew-pkg.yml
@@ -50,7 +50,7 @@ jobs:
     needs: check-version
     steps:
       - name: Create PR to Homebrew
-        uses: mislav/bump-homebrew-formula-action@v2
+        uses: mislav/bump-homebrew-formula-action@v3
        with:
          formula-name: meilisearch
          formula-path: Formula/m/meilisearch.rb
diff --git a/.github/workflows/publish-docker-images.yml b/.github/workflows/publish-docker-images.yml
index 051fb105d..1ee8ba4d0 100644
--- a/.github/workflows/publish-docker-images.yml
+++ b/.github/workflows/publish-docker-images.yml
@@ -63,7 +63,7 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Login to Docker Hub
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
diff --git a/.github/workflows/sdks-tests.yml b/.github/workflows/sdks-tests.yml
index 05cf6b91c..7b6ea74de 100644
--- a/.github/workflows/sdks-tests.yml
+++ b/.github/workflows/sdks-tests.yml
@@ -160,7 +160,7 @@ jobs:
         with:
           repository: meilisearch/meilisearch-js
       - name: Setup node
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
         with:
           cache: 'yarn'
       - name: Install dependencies
@@ -318,7 +318,7 @@ jobs:
         with:
           repository: meilisearch/meilisearch-js-plugins
       - name: Setup node
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
        with:
          cache: yarn
       - name: Install dependencies
diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml
index a44a843fe..ed9cafa79 100644
--- a/.github/workflows/test-suite.yml
+++ b/.github/workflows/test-suite.yml
@@ -43,7 +43,7 @@ jobs:
           toolchain: nightly
           override: true
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
       - name: Run cargo check without any default features
         uses: actions-rs/cargo@v1
         with:
@@ -65,7 +65,7 @@ jobs:
     steps:
      - uses: actions/checkout@v3
      - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
      - name: Run cargo check without any default features
        uses: actions-rs/cargo@v1
        with:
@@ -149,7 +149,7 @@ jobs:
          toolchain: stable
          override: true
      - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
      - name: Run tests in debug
        uses: actions-rs/cargo@v1
        with:
@@ -168,7 +168,7 @@ jobs:
          override: true
          components: clippy
      - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
      - name: Run cargo clippy
        uses: actions-rs/cargo@v1
        with:
@@ -187,7 +187,7 @@ jobs:
          override: true
          components: rustfmt
      - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
      - name: Run cargo fmt
        # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
        # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
diff --git a/Cargo.lock b/Cargo.lock
index 0a6f5850b..fda5f2493 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -520,6 +520,9 @@ name = "bitflags"
 version = "2.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
+dependencies = [
+ "serde",
+]

 [[package]]
 name = "block-buffer"
@@ -1255,6 +1258,15 @@ dependencies = [
  "syn 2.0.28",
 ]

+[[package]]
+name = "doxygen-rs"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bff670ea0c9bbb8414e7efa6e23ebde2b8f520a7eef78273a3918cf1903e7505"
+dependencies = [
+ "phf",
+]
+
 [[package]]
 name = "dump"
 version = "1.5.1"
@@ -1731,12 +1743,13 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"

 [[package]]
 name = "grenad"
-version = "0.4.4"
+version = "0.4.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93"
+checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c"
 dependencies = [
  "bytemuck",
  "byteorder",
+ "rayon",
  "tempfile",
 ]

@@ -1810,36 +1823,40 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"

 [[package]]
 name = "heed"
-version = "0.12.7"
-source = "git+https://github.com/meilisearch/heed?tag=v0.12.7#061a5276b1f336f5f3302bee291e336041d88632"
+version = "0.20.0-alpha.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9648a50991c86df7d00c56c268c27754fcf4c80be2ba57fc4a00dc928c6fe934"
 dependencies = [
+ "bitflags 2.3.3",
+ "bytemuck",
  "byteorder",
  "heed-traits",
  "heed-types",
  "libc",
- "lmdb-rkv-sys",
+ "lmdb-master-sys",
  "once_cell",
- "page_size 0.4.2",
+ "page_size 0.6.0",
  "synchronoise",
  "url",
- "zerocopy",
 ]

 [[package]]
 name = "heed-traits"
-version = "0.7.0"
-source = "git+https://github.com/meilisearch/heed?tag=v0.12.7#061a5276b1f336f5f3302bee291e336041d88632"
+version = "0.20.0-alpha.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ab0b7d9cde969ad36dde692e487dc89d97f7168bf6a7bd3b894ad4bf7278298"

 [[package]]
 name = "heed-types"
-version = "0.7.2"
-source = "git+https://github.com/meilisearch/heed?tag=v0.12.7#061a5276b1f336f5f3302bee291e336041d88632"
+version = "0.20.0-alpha.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0cb3567a7363f28b597bf6e9897b9466397951dd0e52df2c8196dd8a71af44a"
 dependencies = [
  "bincode",
+ "byteorder",
  "heed-traits",
  "serde",
  "serde_json",
- "zerocopy",
 ]

 [[package]]
@@ -2967,11 +2984,13 @@ dependencies = [
 ]

 [[package]]
-name = "lmdb-rkv-sys"
-version = "0.15.1"
-source = "git+https://github.com/meilisearch/lmdb-rs#501aa34a1ab7f092e3ff54a6c22ff6c55931a2d8"
+name = "lmdb-master-sys"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "629c123f5321b48fa4f8f4d3b868165b748d9ba79c7103fb58e3a94f736bcedd"
 dependencies = [
  "cc",
+ "doxygen-rs",
  "libc",
  "pkg-config",
 ]

@@ -3281,6 +3300,7 @@ dependencies = [
  "logging_timer",
  "maplit",
  "md5",
+ "meili-snap",
  "memmap2",
  "mimalloc",
  "obkv",
@@ -3443,9 +3463,9 @@ dependencies = [

 [[package]]
 name = "obkv"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
+checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080"

 [[package]]
 name = "once_cell"
@@ -3470,9 +3490,9 @@ dependencies = [

 [[package]]
 name = "page_size"
-version = "0.4.2"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd"
+checksum = "1b7663cbd190cfd818d08efa8497f6cd383076688c49a391ef7c0d03cd12b561"
 dependencies = [
  "libc",
  "winapi",
@@ -3480,9 +3500,9 @@ dependencies = [

 [[package]]
 name = "page_size"
-version = "0.5.0"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b7663cbd190cfd818d08efa8497f6cd383076688c49a391ef7c0d03cd12b561"
+checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da"
 dependencies = [
  "libc",
  "winapi",
@@ -3628,6 +3648,7 @@ version = "0.11.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
 dependencies = [
+ "phf_macros",
  "phf_shared",
 ]

@@ -3651,6 +3672,19 @@ dependencies = [
  "rand",
 ]

+[[package]]
+name = "phf_macros"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.28",
+]
+
 [[package]]
 name = "phf_shared"
 version = "0.11.2"
@@ -4477,18 +4511,6 @@ dependencies = [
  "crossbeam-queue",
 ]

-[[package]]
-name = "synstructure"
-version = "0.12.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 1.0.109",
- "unicode-xid",
-]
-
 [[package]]
 name = "synstructure"
 version = "0.13.0"
@@ -5357,28 +5379,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "syn 2.0.28",
- "synstructure 0.13.0",
-]
-
-[[package]]
-name = "zerocopy"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46"
-dependencies = [
- "byteorder",
- "zerocopy-derive",
-]
-
-[[package]]
-name = "zerocopy-derive"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb"
-dependencies = [
- "proc-macro2",
- "syn 1.0.109",
- "synstructure 0.12.6",
+ "synstructure",
 ]

 [[package]]
@@ -5399,7 +5400,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "syn 2.0.28",
- "synstructure 0.13.0",
+ "synstructure",
 ]

 [[package]]
diff --git a/README.md b/README.md
index cb9475dea..88621729d 100644
--- a/README.md
+++ b/README.md
@@ -25,12 +25,6 @@

⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍

----
-
-### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A!
-
----
-
 Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.

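Before the code changes below, a note on the dependency story: the Cargo.lock above moves `heed` from a Meilisearch-pinned git tag (v0.12.7) to crates.io `0.20.0-alpha.9`, swapping `lmdb-rkv-sys` for `lmdb-master-sys` and dropping `zerocopy`. The minimal sketch below illustrates the shape of the 0.20-alpha API as it is used throughout the rest of this diff: plain integer key codecs such as `U32<BE>` replace the `zerocopy` wrappers (`OwnedType<BEU32>`), and `RwTxn::abort` becomes infallible. The `tasks` database name and the `tempfile` scratch directory are illustrative assumptions, not part of this PR.

```rust
use heed::byteorder::BE;
use heed::types::{Str, U32};
use heed::{Database, EnvOpenOptions};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Illustrative scratch directory; the PR itself opens real index/task envs.
    let dir = tempfile::tempdir()?;
    let env = EnvOpenOptions::new().max_dbs(1).open(dir.path())?;

    // heed 0.20 takes plain integer codecs such as `U32<BE>` as key types,
    // so keys are passed as `&u32` instead of `&BEU32::new(...)`.
    let mut wtxn = env.write_txn()?;
    let db: Database<U32<BE>, Str> = env.create_database(&mut wtxn, Some("tasks"))?;
    db.put(&mut wtxn, &0, "first task")?;
    wtxn.commit()?;

    // `RwTxn::abort` now consumes the transaction and returns `()`,
    // which is why the diff drops the `.unwrap()`/`.map_err(...)` calls.
    let wtxn = env.write_txn()?;
    wtxn.abort();
    Ok(())
}
```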
diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs
index 9446c0b0f..0c19b89cf 100644
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@@ -6,9 +6,7 @@ use std::path::Path;

 use criterion::{criterion_group, criterion_main, Criterion};
 use milli::heed::{EnvOpenOptions, RwTxn};
-use milli::update::{
-    DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
-};
+use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::Index;
 use rand::seq::SliceRandom;
 use rand_chacha::rand_core::SeedableRng;
@@ -38,7 +36,7 @@ fn setup_index() -> Index {
 }

 fn setup_settings<'t>(
-    wtxn: &mut RwTxn<'t, '_>,
+    wtxn: &mut RwTxn<'t>,
     index: &'t Index,
     primary_key: &str,
     searchable_fields: &[&str],
@@ -266,17 +264,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
                (index, document_ids_to_delete)
            },
            move |(index, document_ids_to_delete)| {
-                let mut wtxn = index.write_txn().unwrap();
-
-                for ids in document_ids_to_delete {
-                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-                    builder.delete_documents(&ids);
-                    builder.execute().unwrap();
-                }
-
-                wtxn.commit().unwrap();
-
-                index.prepare_for_closing().wait();
+                delete_documents_from_ids(index, document_ids_to_delete)
            },
        )
    });
@@ -613,17 +601,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
                (index, document_ids_to_delete)
            },
            move |(index, document_ids_to_delete)| {
-                let mut wtxn = index.write_txn().unwrap();
-
-                for ids in document_ids_to_delete {
-                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-                    builder.delete_documents(&ids);
-                    builder.execute().unwrap();
-                }
-
-                wtxn.commit().unwrap();
-
-                index.prepare_for_closing().wait();
+                delete_documents_from_ids(index, document_ids_to_delete)
            },
        )
    });
@@ -875,22 +853,31 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
                (index, document_ids_to_delete)
            },
            move |(index, document_ids_to_delete)| {
-                let mut wtxn = index.write_txn().unwrap();
-
-                for ids in document_ids_to_delete {
-                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-                    builder.delete_documents(&ids);
-                    builder.execute().unwrap();
-                }
-
-                wtxn.commit().unwrap();
-
-                index.prepare_for_closing().wait();
+                delete_documents_from_ids(index, document_ids_to_delete)
            },
        )
    });
 }

+fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) {
+    let mut wtxn = index.write_txn().unwrap();
+
+    let indexer_config = IndexerConfig::default();
+    for ids in document_ids_to_delete {
+        let config = IndexDocumentsConfig::default();
+
+        let mut builder =
+            IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
+                .unwrap();
+        (builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap();
+        builder.execute().unwrap();
+    }
+
+    wtxn.commit().unwrap();
+
+    index.prepare_for_closing().wait();
+}
+
 fn indexing_movies_in_three_batches(c: &mut Criterion) {
     let mut group = c.benchmark_group("indexing");
     group.sample_size(BENCHMARK_ITERATION);
@@ -1112,17 +1099,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
                (index, document_ids_to_delete)
            },
            move |(index, document_ids_to_delete)| {
-                let mut wtxn = index.write_txn().unwrap();
-
-                for ids in document_ids_to_delete {
-                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-                    builder.delete_documents(&ids);
-                    builder.execute().unwrap();
-                }
-
-                wtxn.commit().unwrap();
-
-                index.prepare_for_closing().wait();
+                delete_documents_from_ids(index, document_ids_to_delete)
            },
        )
    });
@@ -1338,17 +1315,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
                (index, document_ids_to_delete)
            },
            move |(index, document_ids_to_delete)| {
-                let mut wtxn = index.write_txn().unwrap();
-
-                for ids in document_ids_to_delete {
-                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-                    builder.delete_documents(&ids);
-                    builder.execute().unwrap();
-                }
-
-                wtxn.commit().unwrap();
-
-                index.prepare_for_closing().wait();
+                delete_documents_from_ids(index, document_ids_to_delete)
            },
        )
    });
diff --git a/config.toml b/config.toml
index c47989f56..bbd70a63f 100644
--- a/config.toml
+++ b/config.toml
@@ -129,3 +129,6 @@ experimental_enable_metrics = false

 # Experimental RAM reduction during indexing, do not use in production, see:
 experimental_reduce_indexing_memory_usage = false
+
+# Experimentally reduces the maximum number of tasks that will be processed at once, see:
+# experimental_max_number_of_batched_tasks = 100
diff --git a/dump/src/lib.rs b/dump/src/lib.rs
index fa3cfb49a..15b281c41 100644
--- a/dump/src/lib.rs
+++ b/dump/src/lib.rs
@@ -267,6 +267,7 @@ pub(crate) mod test {
            dictionary: Setting::NotSet,
            synonyms: Setting::NotSet,
            distinct_attribute: Setting::NotSet,
+            proximity_precision: Setting::NotSet,
            typo_tolerance: Setting::NotSet,
            faceting: Setting::Set(FacetingSettings {
                max_values_per_facet: Setting::Set(111),
diff --git a/dump/src/reader/compat/v5_to_v6.rs b/dump/src/reader/compat/v5_to_v6.rs
index 9e938d756..8a0d6e5e1 100644
--- a/dump/src/reader/compat/v5_to_v6.rs
+++ b/dump/src/reader/compat/v5_to_v6.rs
@@ -345,6 +345,7 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
            dictionary: v6::Setting::NotSet,
            synonyms: settings.synonyms.into(),
            distinct_attribute: settings.distinct_attribute.into(),
+            proximity_precision: v6::Setting::NotSet,
            typo_tolerance: match settings.typo_tolerance {
                v5::Setting::Set(typo) => v6::Setting::Set(v6::TypoTolerance {
                    enabled: typo.enabled.into(),
diff --git a/dump/src/reader/mod.rs b/dump/src/reader/mod.rs
index af02888d2..5bbf4ec4d 100644
--- a/dump/src/reader/mod.rs
+++ b/dump/src/reader/mod.rs
@@ -13,12 +13,12 @@ use crate::{Result, Version};

 mod compat;

-pub(self) mod v1;
-pub(self) mod v2;
-pub(self) mod v3;
-pub(self) mod v4;
-pub(self) mod v5;
-pub(self) mod v6;
+mod v1;
+mod v2;
+mod v3;
+mod v4;
+mod v5;
+mod v6;

 pub type Document = serde_json::Map<String, serde_json::Value>;
 pub type UpdateFile = dyn Iterator<Item = Result<Document>>;
@@ -526,12 +526,12 @@ pub(crate) mod test {
        assert!(indexes.is_empty());

        // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
        {
          "uid": "products",
          "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.688964637Z",
+          "updatedAt": "2022-10-09T20:27:23.951017769Z"
        }
        "###);

@@ -541,12 +541,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");

        // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
        {
          "uid": "movies",
          "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.197788495Z",
+          "updatedAt": "2022-10-09T20:28:01.93111053Z"
        }
        "###);

@@ -571,12 +571,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");

        // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
        {
          "uid": "dnd_spells",
          "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:24.242683494Z",
+          "updatedAt": "2022-10-09T20:27:24.312809641Z"
        }
        "###);

@@ -617,12 +617,12 @@ pub(crate) mod test {
        assert!(indexes.is_empty());

        // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
        {
          "uid": "products",
          "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.595257Z",
+          "updatedAt": "2023-01-30T16:25:58.70348Z"
        }
        "###);

@@ -632,12 +632,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");

        // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
        {
          "uid": "movies",
          "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.192178Z",
+          "updatedAt": "2023-01-30T16:25:56.455714Z"
        }
        "###);

@@ -647,12 +647,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");

        // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
        {
          "uid": "dnd_spells",
          "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:58.876405Z",
+          "updatedAt": "2023-01-30T16:25:59.079906Z"
        }
        "###);

diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap
deleted file mode 100644
index 92fc61d72..000000000
--- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap
+++ /dev/null
@@ -1,24 +0,0 @@
----
-source: dump/src/reader/mod.rs
-expression: spells.settings().unwrap()
----
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [],
-  "sortableAttributes": [],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness"
-  ],
-  "stopWords": [],
-  "synonyms": {},
-  "distinctAttribute": null
-}
diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap
deleted file mode 100644
index b0b54c136..000000000
--- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap
+++ /dev/null
@@ -1,38 +0,0 @@
----
-source: dump/src/reader/mod.rs
-expression: products.settings().unwrap()
----
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [],
-  "sortableAttributes": [],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness"
-  ],
-  "stopWords": [],
-  "synonyms": {
-    "android": [
-      "phone",
-      "smartphone"
-    ],
-    "iphone": [
-      "phone",
-      "smartphone"
-    ],
-    "phone": [
-      "android",
-      "iphone",
-      "smartphone"
-    ]
-  },
-  "distinctAttribute": null
-}
diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap
deleted file mode 100644
index 5c12a0438..000000000
--- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap
+++ /dev/null
@@ -1,31 +0,0 @@
----
-source: dump/src/reader/mod.rs
-expression: movies.settings().unwrap()
----
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [
-    "genres",
-    "id"
-  ],
-  "sortableAttributes": [
-    "genres",
-    "id"
-  ],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness",
-    "release_date:asc"
-  ],
-  "stopWords": [],
-  "synonyms": {},
-  "distinctAttribute": null
-}
diff --git a/dump/src/reader/v1/settings.rs b/dump/src/reader/v1/settings.rs
index 2f7976534..94343d150 100644
--- a/dump/src/reader/v1/settings.rs
+++ b/dump/src/reader/v1/settings.rs
@@ -56,8 +56,7 @@ pub enum RankingRule {
     Desc(String),
 }

-static ASC_DESC_REGEX: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap());
+static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"(asc|desc)\(([\w_-]+)\)").unwrap());

 impl FromStr for RankingRule {
     type Err = ();
diff --git a/dump/src/reader/v2/mod.rs b/dump/src/reader/v2/mod.rs
index 4016e6341..a0ff13a3b 100644
--- a/dump/src/reader/v2/mod.rs
+++ b/dump/src/reader/v2/mod.rs
@@ -46,6 +46,7 @@ pub type Checked = settings::Checked;
 pub type Unchecked = settings::Unchecked;

 pub type Task = updates::UpdateEntry;
+pub type Kind = updates::UpdateMeta;

 // everything related to the errors
 pub type ResponseError = errors::ResponseError;
@@ -107,8 +108,11 @@ impl V2Reader {
     pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> {
         Ok(self.index_uuid.iter().map(|index| -> Result<_> {
             V2IndexReader::new(
-                index.uid.clone(),
                 &self.dump.path().join("indexes").join(format!("index-{}", index.uuid)),
+                index,
+                BufReader::new(
+                    File::open(self.dump.path().join("updates").join("data.jsonl")).unwrap(),
+                ),
             )
         }))
     }
@@ -143,16 +147,41 @@ pub struct V2IndexReader {
 }

 impl V2IndexReader {
-    pub fn new(name: String, path: &Path) -> Result<Self> {
+    pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader<File>) -> Result<Self> {
         let meta = File::open(path.join("meta.json"))?;
         let meta: DumpMeta = serde_json::from_reader(meta)?;

+        let mut created_at = None;
+        let mut updated_at = None;
+
+        for line in tasks.lines() {
+            let task: Task = serde_json::from_str(&line?)?;
+            if !(task.uuid == index_uuid.uuid && task.is_finished()) {
+                continue;
+            }
+
+            let new_created_at = match task.update.meta() {
+                Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(),
+                _ => None,
+            };
+            let new_updated_at = task.update.finished_at();
+
+            if created_at.is_none() || created_at > new_created_at {
+                created_at = new_created_at;
+            }
+
+            if updated_at.is_none() || updated_at < new_updated_at {
+                updated_at = new_updated_at;
+            }
+        }
+
+        let current_time = OffsetDateTime::now_utc();
+
         let metadata = IndexMetadata {
-            uid: name,
+            uid: index_uuid.uid.clone(),
             primary_key: meta.primary_key,
-            // FIXME: Iterate over the whole task queue to find the creation and last update date.
-            created_at: OffsetDateTime::now_utc(),
-            updated_at: OffsetDateTime::now_utc(),
+            created_at: created_at.unwrap_or(current_time),
+            updated_at: updated_at.unwrap_or(current_time),
         };

         let ret = V2IndexReader {
@@ -248,12 +277,12 @@ pub(crate) mod test {
        assert!(indexes.is_empty());

        // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
        {
          "uid": "products",
          "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.688964637Z",
+          "updatedAt": "2022-10-09T20:27:23.951017769Z"
        }
        "###);

@@ -263,12 +292,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");

        // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
        {
          "uid": "movies",
          "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.197788495Z",
+          "updatedAt": "2022-10-09T20:28:01.93111053Z"
        }
        "###);

@@ -293,12 +322,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");

        // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
        {
          "uid": "dnd_spells",
          "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:24.242683494Z",
+          "updatedAt": "2022-10-09T20:27:24.312809641Z"
        }
        "###);

@@ -340,12 +369,12 @@ pub(crate) mod test {
        assert!(indexes.is_empty());

        // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
        {
          "uid": "products",
          "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.595257Z",
+          "updatedAt": "2023-01-30T16:25:58.70348Z"
        }
        "###);

@@ -355,12 +384,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");

        // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
        {
          "uid": "movies",
          "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.192178Z",
+          "updatedAt": "2023-01-30T16:25:56.455714Z"
        }
        "###);

@@ -370,12 +399,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");

        // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
        {
          "uid": "dnd_spells",
          "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:58.876405Z",
+          "updatedAt": "2023-01-30T16:25:59.079906Z"
        }
        "###);

diff --git a/dump/src/reader/v2/updates.rs b/dump/src/reader/v2/updates.rs
index 33d88d46f..bf1227c7a 100644
--- a/dump/src/reader/v2/updates.rs
+++ b/dump/src/reader/v2/updates.rs
@@ -227,4 +227,14 @@ impl UpdateStatus {
             _ => None,
         }
     }
+
+    pub fn finished_at(&self) -> Option<OffsetDateTime> {
+        match self {
+            UpdateStatus::Processing(_) => None,
+            UpdateStatus::Enqueued(_) => None,
+            UpdateStatus::Processed(u) => Some(u.processed_at),
+            UpdateStatus::Aborted(_) => None,
+            UpdateStatus::Failed(u) => Some(u.failed_at),
+        }
+    }
 }
diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs
index 5760c8865..fa5b70606 100644
--- a/filter-parser/src/lib.rs
+++ b/filter-parser/src/lib.rs
@@ -564,10 +564,10 @@ pub mod tests {

     #[test]
     fn parse_escaped() {
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
+        insta::assert_display_snapshot!(p(r"title = 'foo\\'"), @r#"{title} = {foo\}"#);
+        insta::assert_display_snapshot!(p(r"title = 'foo\\\\'"), @r#"{title} = {foo\\}"#);
+        insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\'"), @r#"{title} = {foo\\\}"#);
+        insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\\\'"), @r#"{title} = {foo\\\\}"#);
         // but it also works with other sequencies
         insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
     }
diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs
index 63d5ac384..1d70cb025 100644
--- a/filter-parser/src/value.rs
+++ b/filter-parser/src/value.rs
@@ -270,8 +270,8 @@ pub mod test {
            ("aaaa", "", rtok("", "aaaa"), "aaaa"),
            (r#"aa"aa"#, r#""aa"#, rtok("", "aa"), "aa"),
            (r#"aa\"aa"#, r#""#, rtok("", r#"aa\"aa"#), r#"aa"aa"#),
-            (r#"aa\\\aa"#, r#""#, rtok("", r#"aa\\\aa"#), r#"aa\\\aa"#),
-            (r#"aa\\"\aa"#, r#""\aa"#, rtok("", r#"aa\\"#), r#"aa\\"#),
+            (r"aa\\\aa", r#""#, rtok("", r"aa\\\aa"), r"aa\\\aa"),
+            (r#"aa\\"\aa"#, r#""\aa"#, rtok("", r"aa\\"), r"aa\\"),
            (r#"aa\\\"\aa"#, r#""#, rtok("", r#"aa\\\"\aa"#), r#"aa\\"\aa"#),
            (r#"\"\""#, r#""#, rtok("", r#"\"\""#), r#""""#),
        ];
@@ -301,12 +301,12 @@ pub mod test {
        );
        // simple quote
        assert_eq!(
-            unescape(Span::new_extra(r#"Hello \'World\'"#, ""), '\''),
+            unescape(Span::new_extra(r"Hello \'World\'", ""), '\''),
            r#"Hello 'World'"#.to_string()
        );
        assert_eq!(
-            unescape(Span::new_extra(r#"Hello \\\'World\\\'"#, ""), '\''),
-            r#"Hello \\'World\\'"#.to_string()
+            unescape(Span::new_extra(r"Hello \\\'World\\\'", ""), '\''),
+            r"Hello \\'World\\'".to_string()
        );
    }
@@ -335,19 +335,19 @@ pub mod test {
            ("\"cha'nnel\"", "cha'nnel", false),
            ("I'm tamo", "I", false),
            // escaped thing but not quote
-            (r#""\\""#, r#"\"#, true),
-            (r#""\\\\\\""#, r#"\\\"#, true),
-            (r#""aa\\aa""#, r#"aa\aa"#, true),
+            (r#""\\""#, r"\", true),
+            (r#""\\\\\\""#, r"\\\", true),
+            (r#""aa\\aa""#, r"aa\aa", true),
            // with double quote
            (r#""Hello \"world\"""#, r#"Hello "world""#, true),
            (r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
            (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
            (r#""\"\"""#, r#""""#, true),
            // with simple quote
-            (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
-            (r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
+            (r"'Hello \'world\''", r#"Hello 'world'"#, true),
+            (r"'Hello \\\'world\\\''", r"Hello \'world\'", true),
            (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
-            (r#"'\'\''"#, r#"''"#, true),
+            (r"'\'\''", r#"''"#, true),
        ];

        for (input, expected, escaped) in test_case {
diff --git a/fuzzers/src/bin/fuzz-indexing.rs b/fuzzers/src/bin/fuzz-indexing.rs
index 1d53e069c..baf705709 100644
--- a/fuzzers/src/bin/fuzz-indexing.rs
+++ b/fuzzers/src/bin/fuzz-indexing.rs
@@ -113,7 +113,7 @@ fn main() {
                    index.documents(&wtxn, res.documents_ids).unwrap();
                    progression.fetch_add(1, Ordering::Relaxed);
                }
-                wtxn.abort().unwrap();
+                wtxn.abort();
            });
            if let err @ Err(_) = handle.join() {
                stop.store(true, Ordering::Relaxed);
diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml
index 9e7c2ae4b..c4a37b7d6 100644
--- a/index-scheduler/Cargo.toml
+++ b/index-scheduler/Cargo.toml
@@ -22,7 +22,7 @@ log = "0.4.17"
 meilisearch-auth = { path = "../meilisearch-auth" }
 meilisearch-types = { path = "../meilisearch-types" }
 page_size = "0.5.0"
-puffin = "0.16.0"
+puffin = { version = "0.16.0", features = ["serialization"] }
 roaring = { version = "0.10.1", features = ["serde"] }
 serde = { version = "1.0.160", features = ["derive"] }
 serde_json = { version = "1.0.95", features = ["preserve_order"] }
diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index aa93cda2a..94a8b3f07 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -24,16 +24,15 @@ use std::fs::{self, File};
 use std::io::BufWriter;

 use dump::IndexMetadata;
-use log::{debug, error, info};
+use log::{debug, error, info, trace};
 use meilisearch_types::error::Code;
 use meilisearch_types::heed::{RoTxn, RwTxn};
 use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
 use meilisearch_types::milli::heed::CompactionOption;
 use meilisearch_types::milli::update::{
-    DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
-    Settings as MilliSettings,
+    IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
 };
-use meilisearch_types::milli::{self, Filter, BEU32};
+use meilisearch_types::milli::{self, Filter};
 use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
 use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task};
 use meilisearch_types::{compression, Index, VERSION_FILE_NAME};
@@ -44,7 +43,7 @@ use uuid::Uuid;

 use crate::autobatcher::{self, BatchKind};
 use crate::utils::{self, swap_index_uid_in_task};
-use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId};
+use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId};

 /// Represents a combination of tasks that can all be processed at the same time.
 ///
@@ -105,12 +104,6 @@ pub(crate) enum IndexOperation {
        operations: Vec<DocumentOperation>,
        tasks: Vec<Task>,
    },
-    DocumentDeletion {
-        index_uid: String,
-        // The vec associated with each document deletion tasks.
-        documents: Vec<Vec<String>>,
-        tasks: Vec<Task>,
-    },
    IndexDocumentDeletionByFilter {
        index_uid: String,
        task: Task,
@@ -162,7 +155,6 @@ impl Batch {
            }
            Batch::IndexOperation { op, .. } => match op {
                IndexOperation::DocumentOperation { tasks, .. }
-                | IndexOperation::DocumentDeletion { tasks, .. }
                | IndexOperation::Settings { tasks, .. }
                | IndexOperation::DocumentClear { tasks, .. } => {
                    tasks.iter().map(|task| task.uid).collect()
                }
@@ -227,7 +219,6 @@ impl IndexOperation {
    pub fn index_uid(&self) -> &str {
        match self {
            IndexOperation::DocumentOperation { index_uid, .. }
-            | IndexOperation::DocumentDeletion { index_uid, .. }
            | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
            | IndexOperation::DocumentClear { index_uid, .. }
            | IndexOperation::Settings { index_uid, .. }
@@ -243,9 +234,6 @@ impl fmt::Display for IndexOperation {
            IndexOperation::DocumentOperation { .. } => {
                f.write_str("IndexOperation::DocumentOperation")
            }
-            IndexOperation::DocumentDeletion { .. } => {
-                f.write_str("IndexOperation::DocumentDeletion")
-            }
            IndexOperation::IndexDocumentDeletionByFilter { .. } => {
                f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
            }
@@ -348,18 +336,27 @@ impl IndexScheduler {
            BatchKind::DocumentDeletion { deletion_ids } => {
                let tasks = self.get_existing_tasks(rtxn, deletion_ids)?;

-                let mut documents = Vec::new();
+                let mut operations = Vec::with_capacity(tasks.len());
+                let mut documents_counts = Vec::with_capacity(tasks.len());
                for task in &tasks {
                    match task.kind {
                        KindWithContent::DocumentDeletion { ref documents_ids, .. } => {
-                            documents.push(documents_ids.clone())
+                            operations.push(DocumentOperation::Delete(documents_ids.clone()));
+                            documents_counts.push(documents_ids.len() as u64);
                        }
                        _ => unreachable!(),
                    }
                }

                Ok(Some(Batch::IndexOperation {
-                    op: IndexOperation::DocumentDeletion { index_uid, documents, tasks },
+                    op: IndexOperation::DocumentOperation {
+                        index_uid,
+                        primary_key: None,
+                        method: IndexDocumentsMethod::ReplaceDocuments,
+                        documents_counts,
+                        operations,
+                        tasks,
+                    },
                    must_create_index,
                }))
            }
@@ -587,7 +584,9 @@ impl IndexScheduler {
        let index_tasks = self.index_tasks(rtxn, index_name)? & enqueued;

        // If autobatching is disabled we only take one task at a time.
-        let tasks_limit = if self.autobatching_enabled { usize::MAX } else { 1 };
+        // Otherwise, we take only a maximum of tasks to create batches.
+        let tasks_limit =
+            if self.autobatching_enabled { self.max_number_of_batched_tasks } else { 1 };

        let enqueued = index_tasks
            .into_iter()
@@ -718,7 +717,7 @@ impl IndexScheduler {

        // 2. Snapshot the index-scheduler LMDB env
        //
-        // When we call copy_to_path, LMDB opens a read transaction by itself,
+        // When we call copy_to_file, LMDB opens a read transaction by itself,
        // we can't provide our own. It is an issue as we would like to know
        // the update files to copy but new ones can be enqueued between the copy
        // of the env and the new transaction we open to retrieve the enqueued tasks.
@@ -731,7 +730,7 @@ impl IndexScheduler {
        // 2.1 First copy the LMDB env of the index-scheduler
        let dst = temp_snapshot_dir.path().join("tasks");
        fs::create_dir_all(&dst)?;
-        self.env.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?;
+        self.env.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;

        // 2.2 Create a read transaction on the index-scheduler
        let rtxn = self.env.read_txn()?;
@@ -756,7 +755,7 @@ impl IndexScheduler {
            let index = self.index_mapper.index(&rtxn, name)?;
            let dst = temp_snapshot_dir.path().join("indexes").join(uuid.to_string());
            fs::create_dir_all(&dst)?;
-            index.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?;
+            index.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
        }

        drop(rtxn);
@@ -769,7 +768,7 @@ impl IndexScheduler {
            .map_size(1024 * 1024 * 1024) // 1 GiB
            .max_dbs(2)
            .open(&self.auth_path)?;
-        auth.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?;
+        auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;

        // 5. Copy and tarball the flat snapshot
        // 5.1 Find the original name of the database
@@ -825,6 +824,10 @@ impl IndexScheduler {
        // 2. dump the tasks
        let mut dump_tasks = dump.create_tasks_queue()?;
        for ret in self.all_tasks.iter(&rtxn)? {
+            if self.must_stop_processing.get() {
+                return Err(Error::AbortedTask);
+            }
+
            let (_, mut t) = ret?;
            let status = t.status;
            let content_file = t.content_uuid();
@@ -845,6 +848,9 @@ impl IndexScheduler {

            // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
            if let Some(content_file) = content_file {
+                if self.must_stop_processing.get() {
+                    return Err(Error::AbortedTask);
+                }
                if status == Status::Enqueued {
                    let content_file = self.file_store.get_update(content_file)?;

@@ -884,6 +890,9 @@ impl IndexScheduler {

                // 3.1. Dump the documents
                for ret in index.all_documents(&rtxn)? {
+                    if self.must_stop_processing.get() {
+                        return Err(Error::AbortedTask);
+                    }
                    let (_id, doc) = ret?;
                    let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
                    index_dumper.push_document(&document)?;
@@ -903,6 +912,9 @@ impl IndexScheduler {
            "[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
        )).unwrap();

+        if self.must_stop_processing.get() {
+            return Err(Error::AbortedTask);
+        }
        let path = self.dumps_path.join(format!("{}.dump", dump_uid));
        let file = File::create(path)?;
        dump.persist_to(BufWriter::new(file))?;
@@ -1096,7 +1108,7 @@ impl IndexScheduler {
        for task_id in &index_lhs_task_ids | &index_rhs_task_ids {
            let mut task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?;
            swap_index_uid_in_task(&mut task, (lhs, rhs));
-            self.all_tasks.put(wtxn, &BEU32::new(task_id), &task)?;
+            self.all_tasks.put(wtxn, &task_id, &task)?;
        }

        // 4. remove the task from indexuid = before_name
@@ -1122,7 +1134,7 @@ impl IndexScheduler {
    /// The list of processed tasks.
    fn apply_index_operation<'i>(
        &self,
-        index_wtxn: &mut RwTxn<'i, '_>,
+        index_wtxn: &mut RwTxn<'i>,
        index: &'i Index,
        operation: IndexOperation,
    ) -> Result<Vec<Task>> {
@@ -1195,7 +1207,7 @@ impl IndexScheduler {
                    index,
                    indexer_config,
                    config,
-                    |indexing_step| debug!("update: {:?}", indexing_step),
+                    |indexing_step| trace!("update: {:?}", indexing_step),
                    || must_stop_processing.get(),
                )?;

@@ -1242,7 +1254,8 @@ impl IndexScheduler {
                            let (new_builder, user_result) =
                                builder.remove_documents(document_ids)?;
                            builder = new_builder;
-
+                            // Uses Invariant: remove documents actually always returns Ok for the inner result
+                            let count = user_result.unwrap();
                            let provided_ids =
                                if let Some(Details::DocumentDeletion { provided_ids, .. }) =
                                    task.details
@@ -1253,23 +1266,11 @@ impl IndexScheduler {
                                    unreachable!();
                                };

-                            match user_result {
-                                Ok(count) => {
-                                    task.status = Status::Succeeded;
-                                    task.details = Some(Details::DocumentDeletion {
-                                        provided_ids,
-                                        deleted_documents: Some(count),
-                                    });
-                                }
-                                Err(e) => {
-                                    task.status = Status::Failed;
-                                    task.details = Some(Details::DocumentDeletion {
-                                        provided_ids,
-                                        deleted_documents: Some(0),
-                                    });
-                                    task.error = Some(milli::Error::from(e).into());
-                                }
-                            }
+                            task.status = Status::Succeeded;
+                            task.details = Some(Details::DocumentDeletion {
+                                provided_ids,
+                                deleted_documents: Some(count),
+                            });
                        }
                    }
                }
@@ -1284,31 +1285,13 @@ impl IndexScheduler {
                        milli::update::Settings::new(index_wtxn, index, indexer_config);
                    builder.reset_primary_key();
                    builder.execute(
-                        |indexing_step| debug!("update: {:?}", indexing_step),
+                        |indexing_step| trace!("update: {:?}", indexing_step),
                        || must_stop_processing.clone().get(),
                    )?;
                }

                Ok(tasks)
            }
-            IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
-                let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?;
-                documents.iter().flatten().for_each(|id| {
-                    builder.delete_external_id(id);
-                });
-
-                let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?;
-
-                for (task, documents) in tasks.iter_mut().zip(documents) {
-                    task.status = Status::Succeeded;
-                    task.details = Some(Details::DocumentDeletion {
-                        provided_ids: documents.len(),
-                        deleted_documents: Some(deleted_documents.min(documents.len() as u64)),
-                    });
-                }
-
-                Ok(tasks)
-            }
            IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
                let filter =
                    if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
@@ -1318,7 +1301,13 @@ impl IndexScheduler {
                    } else {
                        unreachable!()
                    };
-                let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
+                let deleted_documents = delete_document_by_filter(
+                    index_wtxn,
+                    filter,
+                    self.index_mapper.indexer_config(),
+                    self.must_stop_processing.clone(),
+                    index,
+                );
                let original_filter = if let Some(Details::DocumentDeletionByFilter {
                    original_filter,
                    deleted_documents: _,
@@ -1356,6 +1345,9 @@ impl IndexScheduler {

                for (task, (_, settings)) in tasks.iter_mut().zip(settings) {
                    let checked_settings = settings.clone().check();
+                    if checked_settings.proximity_precision.set().is_some() {
+                        self.features.features().check_proximity_precision()?;
+                    }
                    task.details = Some(Details::SettingsUpdate { settings: Box::new(settings) });
                    apply_settings_to_builder(&checked_settings, &mut builder);

@@ -1492,10 +1484,9 @@ impl IndexScheduler {
        }

        for task in to_delete_tasks.iter() {
-            self.all_tasks.delete(wtxn, &BEU32::new(task))?;
+            self.all_tasks.delete(wtxn, &task)?;
        }
        for canceled_by in affected_canceled_by {
-            let canceled_by = BEU32::new(canceled_by);
            if let Some(mut tasks) = self.canceled_by.get(wtxn, &canceled_by)? {
                tasks -= &to_delete_tasks;
                if tasks.is_empty() {
@@ -1543,15 +1534,17 @@ impl IndexScheduler {
            task.details = task.details.map(|d| d.to_failed());
            self.update_task(wtxn, &task)?;
        }
-        self.canceled_by.put(wtxn, &BEU32::new(cancel_task_id), &tasks_to_cancel)?;
+        self.canceled_by.put(wtxn, &cancel_task_id, &tasks_to_cancel)?;

        Ok(content_files_to_delete)
    }
 }

 fn delete_document_by_filter<'a>(
-    wtxn: &mut RwTxn<'a, '_>,
+    wtxn: &mut RwTxn<'a>,
     filter: &serde_json::Value,
+    indexer_config: &IndexerConfig,
+    must_stop_processing: MustStopProcessing,
     index: &'a Index,
 ) -> Result<u64> {
     let filter = Filter::from_json(filter)?;
@@ -1562,9 +1555,26 @@ fn delete_document_by_filter<'a>(
            }
            e => e.into(),
        })?;
-        let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
-        delete_operation.delete_documents(&candidates);
-        delete_operation.execute().map(|result| result.deleted_documents)?
+
+        let config = IndexDocumentsConfig {
+            update_method: IndexDocumentsMethod::ReplaceDocuments,
+            ..Default::default()
+        };
+
+        let mut builder = milli::update::IndexDocuments::new(
+            wtxn,
+            index,
+            indexer_config,
+            config,
+            |indexing_step| debug!("update: {:?}", indexing_step),
+            || must_stop_processing.get(),
+        )?;
+
+        let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?;
+        builder = new_builder;
+
+        let _ = builder.execute()?;
+
+        count
    } else {
        0
    })
diff --git a/index-scheduler/src/error.rs b/index-scheduler/src/error.rs
index ddc6960f7..bbe526460 100644
--- a/index-scheduler/src/error.rs
+++ b/index-scheduler/src/error.rs
@@ -108,6 +108,8 @@ pub enum Error {
     TaskDeletionWithEmptyQuery,
     #[error("Query parameters to filter the tasks to cancel are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")]
     TaskCancelationWithEmptyQuery,
+    #[error("Aborted task")]
+    AbortedTask,

     #[error(transparent)]
     Dump(#[from] dump::Error),
@@ -175,6 +177,7 @@ impl Error {
            | Error::TaskNotFound(_)
            | Error::TaskDeletionWithEmptyQuery
            | Error::TaskCancelationWithEmptyQuery
+            | Error::AbortedTask
            | Error::Dump(_)
            | Error::Heed(_)
            | Error::Milli(_)
@@ -236,6 +239,9 @@ impl ErrorCode for Error {
            Error::TaskDatabaseUpdate(_) => Code::Internal,
            Error::CreateBatch(_) => Code::Internal,

+            // This one should never be seen by the end user
+            Error::AbortedTask => Code::Internal,
+
            #[cfg(test)]
            Error::PlannedFailure => Code::Internal,
        }
diff --git a/index-scheduler/src/features.rs b/index-scheduler/src/features.rs
index 1db27bcd5..ae2823c30 100644
--- a/index-scheduler/src/features.rs
+++ b/index-scheduler/src/features.rs
@@ -81,6 +81,19 @@ impl RoFeatures {
            .into())
        }
    }
+
+    pub fn check_proximity_precision(&self) -> Result<()> {
+        if self.runtime.proximity_precision {
+            Ok(())
+        } else {
+            Err(FeatureNotEnabledError {
+                disabled_action: "Using `proximityPrecision` index setting",
+                feature: "proximity precision",
+                issue_link: "https://github.com/orgs/meilisearch/discussions/710",
+            }
+            .into())
+        }
+    }
 }

 impl FeatureData {
diff --git a/index-scheduler/src/index_mapper/index_map.rs b/index-scheduler/src/index_mapper/index_map.rs
index a24213558..f8080d23b 100644
--- a/index-scheduler/src/index_mapper/index_map.rs
+++ b/index-scheduler/src/index_mapper/index_map.rs
@@ -1,12 +1,8 @@
-/// the map size to use when we don't succeed in reading it in indexes.
-const DEFAULT_MAP_SIZE: usize = 10 * 1024 * 1024 * 1024; // 10 GiB
-
 use std::collections::BTreeMap;
 use std::path::Path;
 use std::time::Duration;

-use meilisearch_types::heed::flags::Flags;
-use meilisearch_types::heed::{EnvClosingEvent, EnvOpenOptions};
+use meilisearch_types::heed::{EnvClosingEvent, EnvFlags, EnvOpenOptions};
 use meilisearch_types::milli::Index;
 use time::OffsetDateTime;
 use uuid::Uuid;
@@ -236,7 +232,7 @@ impl IndexMap {
        enable_mdb_writemap: bool,
        map_size_growth: usize,
    ) {
-        let map_size = index.map_size().unwrap_or(DEFAULT_MAP_SIZE) + map_size_growth;
+        let map_size = index.map_size() + map_size_growth;
        let closing_event = index.prepare_for_closing();
        let generation = self.next_generation();
        self.unavailable.insert(
@@ -309,7 +305,7 @@ fn create_or_open_index(
     options.map_size(clamp_to_page_size(map_size));
     options.max_readers(1024);
     if enable_mdb_writemap {
-        unsafe { options.flag(Flags::MdbWriteMap) };
+        unsafe { options.flags(EnvFlags::WRITE_MAP) };
     }

     if let Some((created, updated)) = date {
@@ -388,7 +384,7 @@ mod tests {

    fn assert_index_size(index: Index, expected: usize) {
        let expected = clamp_to_page_size(expected);
-        let index_map_size = index.map_size().unwrap();
+        let index_map_size = index.map_size();
        assert_eq!(index_map_size, expected);
    }
 }
diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs
index 6096bad38..bd8fa5148 100644
--- a/index-scheduler/src/insta_snapshot.rs
+++ b/index-scheduler/src/insta_snapshot.rs
@@ -1,7 +1,7 @@
 use std::collections::BTreeSet;
 use std::fmt::Write;

-use meilisearch_types::heed::types::{OwnedType, SerdeBincode, SerdeJson, Str};
+use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str};
 use meilisearch_types::heed::{Database, RoTxn};
 use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
 use meilisearch_types::tasks::{Details, Task};
@@ -30,6 +30,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
        index_mapper,
        features: _,
        max_number_of_tasks: _,
+        max_number_of_batched_tasks: _,
        puffin_frame: _,
        wake_up: _,
        dumps_path: _,
@@ -115,7 +116,7 @@ pub fn snapshot_bitmap(r: &RoaringBitmap) -> String {
     snap
 }

-pub fn snapshot_all_tasks(rtxn: &RoTxn, db: Database<OwnedType<BEU32>, SerdeJson<Task>>) -> String {
+pub fn snapshot_all_tasks(rtxn: &RoTxn, db: Database<BEU32, SerdeJson<Task>>) -> String {
     let mut snap = String::new();
     let iter = db.iter(rtxn).unwrap();
     for next in iter {
@@ -125,10 +126,7 @@ pub fn snapshot_all_tasks(rtxn: &RoTxn, db: Database<OwnedType<BEU32>, SerdeJson
     snap
 }

-pub fn snapshot_date_db(
-    rtxn: &RoTxn,
-    db: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
-) -> String {
+pub fn snapshot_date_db(rtxn: &RoTxn, db: Database<BEI128, CboRoaringBitmapCodec>) -> String {
     let mut snap = String::new();
     let iter = db.iter(rtxn).unwrap();
     for next in iter {
@@ -248,10 +246,7 @@ pub fn snapshot_index_tasks(rtxn: &RoTxn, db: Database<Str, RoaringBitmapCodec>)
     }
     snap
 }
-pub fn snapshot_canceled_by(
-    rtxn: &RoTxn,
-    db: Database<OwnedType<BEU32>, RoaringBitmapCodec>,
-) -> String {
+pub fn snapshot_canceled_by(rtxn: &RoTxn, db: Database<BEU32, RoaringBitmapCodec>) -> String {
     let mut snap = String::new();
     let iter = db.iter(rtxn).unwrap();
     for next in iter {
diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs
index 95902aa15..a1b6497d9 100644
--- a/index-scheduler/src/lib.rs
+++ b/index-scheduler/src/lib.rs
@@ -47,8 +47,9 @@ pub use features::RoFeatures;
 use file_store::FileStore;
 use meilisearch_types::error::ResponseError;
 use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures};
-use meilisearch_types::heed::types::{OwnedType, SerdeBincode, SerdeJson, Str};
-use meilisearch_types::heed::{self, Database, Env, RoTxn, RwTxn};
+use meilisearch_types::heed::byteorder::BE;
+use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128};
+use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn};
 use meilisearch_types::milli::documents::DocumentsBatchBuilder;
 use meilisearch_types::milli::update::IndexerConfig;
 use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
@@ -64,8 +65,7 @@ use uuid::Uuid;
 use crate::index_mapper::IndexMapper;
 use crate::utils::{check_index_swap_validity, clamp_to_page_size};

-pub(crate) type BEI128 =
-    meilisearch_types::heed::zerocopy::I128<meilisearch_types::heed::byteorder::BigEndian>;
+pub(crate) type BEI128 = I128<BE>;

 /// Defines a subset of tasks to be retrieved from the [`IndexScheduler`].
 ///
@@ -258,6 +258,9 @@ pub struct IndexSchedulerOptions {
     /// The maximum number of tasks stored in the task queue before starting
     /// to auto schedule task deletions.
     pub max_number_of_tasks: usize,
+    /// If the autobatcher is allowed to automatically batch tasks
+    /// it will only batch this defined number of tasks at once.
+    pub max_number_of_batched_tasks: usize,
     /// The experimental features enabled for this instance.
     pub instance_features: InstanceTogglableFeatures,
 }
@@ -278,7 +281,7 @@ pub struct IndexScheduler {
     pub(crate) file_store: FileStore,

     // The main database, it contains all the tasks accessible by their Id.
-    pub(crate) all_tasks: Database<OwnedType<BEU32>, SerdeJson<Task>>,
+    pub(crate) all_tasks: Database<BEU32, SerdeJson<Task>>,

     /// All the tasks ids grouped by their status.
     // TODO we should not be able to serialize a `Status::Processing` in this database.
@@ -289,16 +292,16 @@ pub struct IndexScheduler {
     pub(crate) index_tasks: Database<Str, CboRoaringBitmapCodec>,

     /// Store the tasks that were canceled by a task uid
-    pub(crate) canceled_by: Database<OwnedType<BEU32>, RoaringBitmapCodec>,
+    pub(crate) canceled_by: Database<BEU32, RoaringBitmapCodec>,

     /// Store the task ids of tasks which were enqueued at a specific date
-    pub(crate) enqueued_at: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
+    pub(crate) enqueued_at: Database<BEI128, CboRoaringBitmapCodec>,

     /// Store the task ids of finished tasks which started being processed at a specific date
-    pub(crate) started_at: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
+    pub(crate) started_at: Database<BEI128, CboRoaringBitmapCodec>,

     /// Store the task ids of tasks which finished at a specific date
-    pub(crate) finished_at: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
+    pub(crate) finished_at: Database<BEI128, CboRoaringBitmapCodec>,

     /// In charge of creating, opening, storing and returning indexes.
     pub(crate) index_mapper: IndexMapper,
@@ -316,6 +319,9 @@ pub struct IndexScheduler {
     /// the finished tasks automatically.
     pub(crate) max_number_of_tasks: usize,

+    /// The maximum number of tasks that will be batched together.
+    pub(crate) max_number_of_batched_tasks: usize,
+
     /// A frame to output the indexation profiling files to disk.
     pub(crate) puffin_frame: Arc<puffin::GlobalFrameView>,

@@ -373,6 +379,7 @@ impl IndexScheduler {
            wake_up: self.wake_up.clone(),
            autobatching_enabled: self.autobatching_enabled,
            max_number_of_tasks: self.max_number_of_tasks,
+            max_number_of_batched_tasks: self.max_number_of_batched_tasks,
            puffin_frame: self.puffin_frame.clone(),
            snapshots_path: self.snapshots_path.clone(),
            dumps_path: self.dumps_path.clone(),
@@ -471,6 +478,7 @@ impl IndexScheduler {
            puffin_frame: Arc::new(puffin::GlobalFrameView::default()),
            autobatching_enabled: options.autobatching_enabled,
            max_number_of_tasks: options.max_number_of_tasks,
+            max_number_of_batched_tasks: options.max_number_of_batched_tasks,
            dumps_path: options.dumps_path,
            snapshots_path: options.snapshots_path,
            auth_path: options.auth_path,
@@ -730,9 +738,7 @@ impl IndexScheduler {
        if let Some(canceled_by) = &query.canceled_by {
            let mut all_canceled_tasks = RoaringBitmap::new();
            for cancel_task_uid in canceled_by {
-                if let Some(canceled_by_uid) =
-                    self.canceled_by.get(rtxn, &BEU32::new(*cancel_task_uid))?
-                {
+                if let Some(canceled_by_uid) = self.canceled_by.get(rtxn, cancel_task_uid)? {
                    all_canceled_tasks |= canceled_by_uid;
                }
            }
@@ -983,7 +989,7 @@ impl IndexScheduler {
        // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task
        if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty())
-            && (self.env.non_free_pages_size()? * 100) / self.env.map_size()? as u64 > 50
+            && (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 50
        {
            return Err(Error::NoSpaceLeftInTaskQueue);
        }
@@ -1009,7 +1015,7 @@ impl IndexScheduler {
        // Get rid of the mutability.
        let task = task;
-        self.all_tasks.append(&mut wtxn, &BEU32::new(task.uid), &task)?;
+        self.all_tasks.put_with_flags(&mut wtxn, PutFlags::APPEND, &task.uid, &task)?;

        for index in task.indexes() {
            self.update_index(&mut wtxn, index, |bitmap| {
@@ -1183,10 +1189,11 @@ impl IndexScheduler {
                // If we have an abortion error we must stop the tick here and re-schedule tasks.
                Err(Error::Milli(milli::Error::InternalError(
                    milli::InternalError::AbortedIndexation,
-                ))) => {
+                )))
+                | Err(Error::AbortedTask) => {
                    #[cfg(test)]
                    self.breakpoint(Breakpoint::AbortedIndexation);
-                    wtxn.abort().map_err(Error::HeedTransaction)?;
+                    wtxn.abort();

                    // We make sure that we don't call `stop_processing` on the `processing_tasks`,
                    // this is because we want to let the next tick call `create_next_batch` and keep
@@ -1207,7 +1214,7 @@ impl IndexScheduler {
                    let index_uid = index_uid.unwrap();
                    // fixme: handle error more gracefully? not sure when this could happen
                    self.index_mapper.resize_index(&wtxn, &index_uid)?;
-                    wtxn.abort().map_err(Error::HeedTransaction)?;
+                    wtxn.abort();

                    return Ok(TickOutcome::TickAgain(0));
                }
@@ -1353,7 +1360,7 @@ impl IndexScheduler {

 pub struct Dump<'a> {
     index_scheduler: &'a IndexScheduler,
-    wtxn: RwTxn<'a, 'a>,
+    wtxn: RwTxn<'a>,

     indexes: HashMap<String, RoaringBitmap>,
     statuses: HashMap<Status, RoaringBitmap>,
@@ -1468,7 +1475,7 @@ impl<'a> Dump<'a> {
            },
        };

-        self.index_scheduler.all_tasks.put(&mut self.wtxn, &BEU32::new(task.uid), &task)?;
+        self.index_scheduler.all_tasks.put(&mut self.wtxn, &task.uid, &task)?;

        for index in task.indexes() {
            match self.indexes.get_mut(index) {
@@ -1510,8 +1517,8 @@ impl<'a> Dump<'a> {
            }
        }

-        self.statuses.entry(task.status).or_insert(RoaringBitmap::new()).insert(task.uid);
-        self.kinds.entry(task.kind.as_kind()).or_insert(RoaringBitmap::new()).insert(task.uid);
+        self.statuses.entry(task.status).or_default().insert(task.uid);
+        self.kinds.entry(task.kind.as_kind()).or_default().insert(task.uid);

        Ok(task)
    }
@@ -1639,6 +1646,7 @@ mod tests {
            indexer_config,
            autobatching_enabled: true,
            max_number_of_tasks: 1_000_000,
+            max_number_of_batched_tasks: usize::MAX,
            instance_features: Default::default(),
        };
        configuration(&mut options);
@@ -4339,4 +4347,26 @@ mod tests {
        }
        "###);
    }
+
+    #[test]
+    fn cancel_processing_dump() {
+        let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
+
+        let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None };
+        let dump_cancellation = KindWithContent::TaskCancelation {
+            query: "cancel dump".to_owned(),
+            tasks: RoaringBitmap::from_iter([0]),
+        };
+        let _ = index_scheduler.register(dump_creation).unwrap();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register");
+        handle.advance_till([Start, BatchCreated, InsideProcessBatch]);
+
+        let _ = index_scheduler.register(dump_cancellation).unwrap();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered");
+
+        snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation");
+
+        handle.advance_one_successful_batch();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed");
+    }
 }
diff --git a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap
new file mode 100644
index 000000000..ce0343975
--- /dev/null
+++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap
@@ -0,0 +1,35 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,]
diff --git a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap
new file mode 100644
index 000000000..f3d7b363f
--- /dev/null
+++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap
@@ -0,0 +1,45 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: canceled, canceled_by: 1, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+1 {uid: 1, status: succeeded, details: { matched_tasks: 1, canceled_tasks: Some(0), original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
+----------------------------------------------------------------------
+### Status:
+enqueued []
+succeeded [1,]
+canceled [0,]
+----------------------------------------------------------------------
+### Kind:
+"taskCancelation" [1,]
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+1 [0,]
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Finished At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
diff --git a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap
new file mode 100644
index 000000000..72ae58e00
--- /dev/null
+++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap
@@ -0,0 +1,38 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[0,]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+1 {uid: 1, status: enqueued, details: { matched_tasks: 1, canceled_tasks: None, original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,1,]
+----------------------------------------------------------------------
+### Kind:
+"taskCancelation" [1,]
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
diff --git a/index-scheduler/src/utils.rs b/index-scheduler/src/utils.rs
index 3971d9116..9f6f90db2 100644
--- a/index-scheduler/src/utils.rs
+++ b/index-scheduler/src/utils.rs
@@ -3,9 +3,9 @@ use std::collections::{BTreeSet, HashSet};
 use std::ops::Bound;
 
-use meilisearch_types::heed::types::{DecodeIgnore, OwnedType};
+use meilisearch_types::heed::types::DecodeIgnore;
 use meilisearch_types::heed::{Database, RoTxn, RwTxn};
-use meilisearch_types::milli::{CboRoaringBitmapCodec, BEU32};
+use meilisearch_types::milli::CboRoaringBitmapCodec;
 use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status};
 use roaring::{MultiOps, RoaringBitmap};
 use time::OffsetDateTime;
@@ -18,7 +18,7 @@ impl IndexScheduler {
     }
 
     pub(crate) fn last_task_id(&self, rtxn: &RoTxn) -> Result<Option<TaskId>> {
-        Ok(self.all_tasks.remap_data_type::<DecodeIgnore>().last(rtxn)?.map(|(k, _)| k.get() + 1))
+        Ok(self.all_tasks.remap_data_type::<DecodeIgnore>().last(rtxn)?.map(|(k, _)| k + 1))
     }
 
     pub(crate) fn next_task_id(&self, rtxn: &RoTxn) -> Result<TaskId> {
@@ -26,7 +26,7 @@ impl IndexScheduler {
     }
 
     pub(crate) fn get_task(&self, rtxn: &RoTxn, task_id: TaskId) -> Result<Option<Task>> {
-        Ok(self.all_tasks.get(rtxn, &BEU32::new(task_id))?)
+        Ok(self.all_tasks.get(rtxn, &task_id)?)
     }
 
     /// Convert an iterator to a `Vec` of tasks. The tasks MUST exist or a
@@ -88,7 +88,7 @@ impl IndexScheduler {
             }
         }
 
-        self.all_tasks.put(wtxn, &BEU32::new(task.uid), task)?;
+        self.all_tasks.put(wtxn, &task.uid, task)?;
         Ok(())
     }
 
@@ -169,11 +169,11 @@ impl IndexScheduler {
 
 pub(crate) fn insert_task_datetime(
     wtxn: &mut RwTxn,
-    database: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
+    database: Database<BEI128, CboRoaringBitmapCodec>,
     time: OffsetDateTime,
     task_id: TaskId,
 ) -> Result<()> {
-    let timestamp = BEI128::new(time.unix_timestamp_nanos());
+    let timestamp = time.unix_timestamp_nanos();
     let mut task_ids = database.get(wtxn, &timestamp)?.unwrap_or_default();
     task_ids.insert(task_id);
     database.put(wtxn, &timestamp, &RoaringBitmap::from_iter(task_ids))?;
@@ -182,11 +182,11 @@ pub(crate) fn insert_task_datetime(
 
 pub(crate) fn remove_task_datetime(
     wtxn: &mut RwTxn,
-    database: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
+    database: Database<BEI128, CboRoaringBitmapCodec>,
     time: OffsetDateTime,
     task_id: TaskId,
 ) -> Result<()> {
-    let timestamp = BEI128::new(time.unix_timestamp_nanos());
+    let timestamp = time.unix_timestamp_nanos();
     if let Some(mut existing) = database.get(wtxn, &timestamp)? {
         existing.remove(task_id);
         if existing.is_empty() {
@@ -202,7 +202,7 @@ pub(crate) fn remove_task_datetime(
 pub(crate) fn keep_tasks_within_datetimes(
     rtxn: &RoTxn,
     tasks: &mut RoaringBitmap,
-    database: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
+    database: Database<BEI128, CboRoaringBitmapCodec>,
     after: Option<OffsetDateTime>,
     before: Option<OffsetDateTime>,
 ) -> Result<()> {
@@ -213,8 +213,8 @@ pub(crate) fn keep_tasks_within_datetimes(
         (Some(after), Some(before)) => (Bound::Excluded(*after), Bound::Excluded(*before)),
     };
     let mut collected_task_ids = RoaringBitmap::new();
-    let start = map_bound(start, |b| BEI128::new(b.unix_timestamp_nanos()));
-    let end = map_bound(end, |b| BEI128::new(b.unix_timestamp_nanos()));
+    let start = map_bound(start, |b| b.unix_timestamp_nanos());
+    let end = map_bound(end, |b| b.unix_timestamp_nanos());
     let iter = database.range(rtxn, &(start, end))?;
     for r in iter {
         let (_timestamp, task_ids) = r?;
@@ -337,8 +337,6 @@ impl IndexScheduler {
         let rtxn = self.env.read_txn().unwrap();
         for task in self.all_tasks.iter(&rtxn).unwrap() {
             let (task_id, task) = task.unwrap();
-            let task_id = task_id.get();
-
             let task_index_uid = task.index_uid().map(ToOwned::to_owned);
 
             let Task {
@@ -361,16 +359,13 @@ impl IndexScheduler {
                     .unwrap()
                     .contains(task.uid));
             }
-            let db_enqueued_at = self
-                .enqueued_at
-                .get(&rtxn, &BEI128::new(enqueued_at.unix_timestamp_nanos()))
-                .unwrap()
-                .unwrap();
+            let db_enqueued_at =
+                self.enqueued_at.get(&rtxn, &enqueued_at.unix_timestamp_nanos()).unwrap().unwrap();
             assert!(db_enqueued_at.contains(task_id));
             if let Some(started_at) = started_at {
                 let db_started_at = self
                     .started_at
-                    .get(&rtxn, &BEI128::new(started_at.unix_timestamp_nanos()))
+                    .get(&rtxn, &started_at.unix_timestamp_nanos())
                     .unwrap()
                     .unwrap();
                 assert!(db_started_at.contains(task_id));
@@ -378,7 +373,7 @@ impl IndexScheduler {
             if let Some(finished_at) = finished_at {
                 let db_finished_at = self
                     .finished_at
-                    .get(&rtxn, &BEI128::new(finished_at.unix_timestamp_nanos()))
+                    .get(&rtxn, &finished_at.unix_timestamp_nanos())
                     .unwrap()
                     .unwrap();
                 assert!(db_finished_at.contains(task_id));
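With the wrappers gone, the datetime databases above are keyed directly by `i128` nanosecond timestamps, and `keep_tasks_within_datetimes` is a plain range scan between two `Bound`s. A hedged sketch of the same access pattern, assuming heed 0.20-alpha codecs (`values_between` and the `Vec<u32>` payload are illustrative, not the scheduler's real types):

```rust
use std::ops::Bound;

use heed::byteorder::BigEndian;
use heed::types::{SerdeJson, I128};
use time::OffsetDateTime;

// Big-endian keeps numeric order equal to LMDB's byte order, like milli's BEI128.
type BEI128 = I128<BigEndian>;

/// Collect all values stored under timestamps strictly between `after` and
/// `before`, mirroring the exclusive bounds used by the diff above.
fn values_between(
    rtxn: &heed::RoTxn,
    db: heed::Database<BEI128, SerdeJson<Vec<u32>>>,
    after: OffsetDateTime,
    before: OffsetDateTime,
) -> heed::Result<Vec<u32>> {
    let range = (
        Bound::Excluded(after.unix_timestamp_nanos()),
        Bound::Excluded(before.unix_timestamp_nanos()),
    );
    let mut out = Vec::new();
    for entry in db.range(rtxn, &range)? {
        let (_nanos, ids) = entry?;
        out.extend(ids);
    }
    Ok(out)
}
```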
diff --git a/index-scheduler/src/uuid_codec.rs b/index-scheduler/src/uuid_codec.rs
index 70a92ca94..54020fa3c 100644
--- a/index-scheduler/src/uuid_codec.rs
+++ b/index-scheduler/src/uuid_codec.rs
@@ -1,7 +1,7 @@
 use std::borrow::Cow;
 use std::convert::TryInto;
 
-use meilisearch_types::heed::{BytesDecode, BytesEncode};
+use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode};
 use uuid::Uuid;
 
 /// A heed codec for value of struct Uuid.
@@ -10,15 +10,15 @@ pub struct UuidCodec;
 impl<'a> BytesDecode<'a> for UuidCodec {
     type DItem = Uuid;
 
-    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
-        bytes.try_into().ok().map(Uuid::from_bytes)
+    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
+        bytes.try_into().map(Uuid::from_bytes).map_err(Into::into)
     }
 }
 
 impl BytesEncode<'_> for UuidCodec {
     type EItem = Uuid;
 
-    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
-        Some(Cow::Borrowed(item.as_bytes()))
+    fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
+        Ok(Cow::Borrowed(item.as_bytes()))
     }
 }
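This codec rewrite (and the `KeyIdActionCodec` one just below) follows the same heed 0.20 trait change: `bytes_decode`/`bytes_encode` now return `Result<_, BoxedError>` instead of `Option`, so decoding failures carry a real error instead of being silently swallowed. A minimal custom codec in the new style (a hypothetical `PairCodec`, not part of this diff, assuming heed 0.20-alpha):

```rust
use std::borrow::Cow;

use heed::{BoxedError, BytesDecode, BytesEncode};

/// Encodes a `(u32, u32)` pair as 8 big-endian bytes.
pub struct PairCodec;

impl<'a> BytesDecode<'a> for PairCodec {
    type DItem = (u32, u32);

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        // `?` converts both the `&str` and the `TryFromSliceError` into a BoxedError.
        let left: [u8; 4] = bytes.get(..4).ok_or("slice too short")?.try_into()?;
        let right: [u8; 4] = bytes.get(4..8).ok_or("slice too short")?.try_into()?;
        Ok((u32::from_be_bytes(left), u32::from_be_bytes(right)))
    }
}

impl BytesEncode<'_> for PairCodec {
    type EItem = (u32, u32);

    fn bytes_encode((l, r): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
        let mut bytes = Vec::with_capacity(8);
        bytes.extend_from_slice(&l.to_be_bytes());
        bytes.extend_from_slice(&r.to_be_bytes());
        Ok(Cow::Owned(bytes))
    }
}
```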
diff --git a/meilisearch-auth/src/store.rs b/meilisearch-auth/src/store.rs
index 28ec8b5e4..276c035b0 100644
--- a/meilisearch-auth/src/store.rs
+++ b/meilisearch-auth/src/store.rs
@@ -4,17 +4,20 @@ use std::collections::HashSet;
 use std::convert::{TryFrom, TryInto};
 use std::fs::create_dir_all;
 use std::path::Path;
+use std::result::Result as StdResult;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 
 use hmac::{Hmac, Mac};
+use meilisearch_types::heed::BoxedError;
 use meilisearch_types::index_uid_pattern::IndexUidPattern;
 use meilisearch_types::keys::KeyId;
 use meilisearch_types::milli;
-use meilisearch_types::milli::heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
+use meilisearch_types::milli::heed::types::{Bytes, DecodeIgnore, SerdeJson};
 use meilisearch_types::milli::heed::{Database, Env, EnvOpenOptions, RwTxn};
 use sha2::Sha256;
+use thiserror::Error;
 use time::OffsetDateTime;
 use uuid::fmt::Hyphenated;
 use uuid::Uuid;
@@ -30,7 +33,7 @@ const KEY_ID_ACTION_INDEX_EXPIRATION_DB_NAME: &str = "keyid-action-index-expiration";
 #[derive(Clone)]
 pub struct HeedAuthStore {
     env: Arc<Env>,
-    keys: Database<ByteSlice, SerdeJson<Key>>,
+    keys: Database<Bytes, SerdeJson<Key>>,
     action_keyid_index_expiration: Database<KeyIdActionCodec, SerdeJson<Option<OffsetDateTime>>>,
     should_close_on_drop: bool,
 }
@@ -276,7 +279,7 @@ impl HeedAuthStore {
     fn delete_key_from_inverted_db(&self, wtxn: &mut RwTxn, key: &KeyId) -> Result<()> {
         let mut iter = self
             .action_keyid_index_expiration
-            .remap_types::<ByteSlice, DecodeIgnore>()
+            .remap_types::<Bytes, DecodeIgnore>()
             .prefix_iter_mut(wtxn, key.as_bytes())?;
         while iter.next().transpose()?.is_some() {
             // safety: we don't keep references from inside the LMDB database.
@@ -294,23 +297,24 @@ pub struct KeyIdActionCodec;
 impl<'a> milli::heed::BytesDecode<'a> for KeyIdActionCodec {
     type DItem = (KeyId, Action, Option<&'a [u8]>);
 
-    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
-        let (key_id_bytes, action_bytes) = try_split_array_at(bytes)?;
-        let (action_bytes, index) = match try_split_array_at(action_bytes)? {
-            (action, []) => (action, None),
-            (action, index) => (action, Some(index)),
-        };
+    fn bytes_decode(bytes: &'a [u8]) -> StdResult<Self::DItem, BoxedError> {
+        let (key_id_bytes, action_bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
+        let (&action_byte, index) =
+            match try_split_array_at(action_bytes).ok_or(SliceTooShortError)? {
+                ([action], []) => (action, None),
+                ([action], index) => (action, Some(index)),
+            };
         let key_id = Uuid::from_bytes(*key_id_bytes);
-        let action = Action::from_repr(u8::from_be_bytes(*action_bytes))?;
+        let action = Action::from_repr(action_byte).ok_or(InvalidActionError { action_byte })?;
 
-        Some((key_id, action, index))
+        Ok((key_id, action, index))
     }
 }
 
 impl<'a> milli::heed::BytesEncode<'a> for KeyIdActionCodec {
     type EItem = (&'a KeyId, &'a Action, Option<&'a [u8]>);
 
-    fn bytes_encode((key_id, action, index): &Self::EItem) -> Option<Cow<[u8]>> {
+    fn bytes_encode((key_id, action, index): &Self::EItem) -> StdResult<Cow<[u8]>, BoxedError> {
         let mut bytes = Vec::new();
 
         bytes.extend_from_slice(key_id.as_bytes());
@@ -320,10 +324,20 @@ impl<'a> milli::heed::BytesEncode<'a> for KeyIdActionCodec {
             bytes.extend_from_slice(index);
         }
 
-        Some(Cow::Owned(bytes))
+        Ok(Cow::Owned(bytes))
     }
 }
 
+#[derive(Error, Debug)]
+#[error("the slice is too short")]
+pub struct SliceTooShortError;
+
+#[derive(Error, Debug)]
+#[error("cannot construct a valid Action from {action_byte}")]
+pub struct InvalidActionError {
+    pub action_byte: u8,
+}
+
 pub fn generate_key_as_hexa(uid: Uuid, master_key: &[u8]) -> String {
     // format uid as hyphenated allowing user to generate their own keys.
     let mut uid_buffer = [0; Hyphenated::LENGTH];
diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml
index 639596fa6..136e167de 100644
--- a/meilisearch-types/Cargo.toml
+++ b/meilisearch-types/Cargo.toml
@@ -15,7 +15,7 @@ actix-web = { version = "4.3.1", default-features = false }
 anyhow = "1.0.70"
 convert_case = "0.6.0"
 csv = "1.2.1"
-deserr = { version = "0.6.0", features = ["actix-web"]}
+deserr = { version = "0.6.0", features = ["actix-web"] }
 either = { version = "1.8.1", features = ["serde"] }
 enum-iterator = "1.4.0"
 file-store = { path = "../file-store" }
diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs
index 4b6711601..b1dc6b777 100644
--- a/meilisearch-types/src/error.rs
+++ b/meilisearch-types/src/error.rs
@@ -252,6 +252,7 @@ InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
 InvalidSearchSort , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ;
+InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ;
@@ -324,7 +325,6 @@ impl ErrorCode for milli::Error {
                 UserError::SerdeJson(_)
                 | UserError::InvalidLmdbOpenOptions
                 | UserError::DocumentLimitReached
-                | UserError::AccessingSoftDeletedDocument { .. }
                 | UserError::UnknownInternalDocumentId { .. } => Code::Internal,
                 UserError::InvalidStoreFile => Code::InvalidStoreFile,
                 UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,
@@ -387,11 +387,11 @@ impl ErrorCode for HeedError {
             HeedError::Mdb(MdbError::Invalid) => Code::InvalidStoreFile,
             HeedError::Io(e) => e.error_code(),
             HeedError::Mdb(_)
-            | HeedError::Encoding
-            | HeedError::Decoding
+            | HeedError::Encoding(_)
+            | HeedError::Decoding(_)
             | HeedError::InvalidDatabaseTyping
             | HeedError::DatabaseClosing
-            | HeedError::BadOpenOptions => Code::Internal,
+            | HeedError::BadOpenOptions { ..
} => Code::Internal, } } } diff --git a/meilisearch-types/src/features.rs b/meilisearch-types/src/features.rs index 33afe2d24..f0cbce340 100644 --- a/meilisearch-types/src/features.rs +++ b/meilisearch-types/src/features.rs @@ -7,6 +7,7 @@ pub struct RuntimeTogglableFeatures { pub vector_store: bool, pub metrics: bool, pub export_puffin_reports: bool, + pub proximity_precision: bool, } #[derive(Default, Debug, Clone, Copy)] diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 7bef64d4b..487354b8e 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -8,6 +8,7 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; +use milli::proximity::ProximityPrecision; use milli::update::Setting; use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; use serde::{Deserialize, Serialize, Serializer}; @@ -186,6 +187,9 @@ pub struct Settings { #[deserr(default, error = DeserrJsonError)] pub distinct_attribute: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default, error = DeserrJsonError)] + pub proximity_precision: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] pub typo_tolerance: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] @@ -214,6 +218,7 @@ impl Settings { separator_tokens: Setting::Reset, dictionary: Setting::Reset, distinct_attribute: Setting::Reset, + proximity_precision: Setting::Reset, typo_tolerance: Setting::Reset, faceting: Setting::Reset, pagination: Setting::Reset, @@ -234,6 +239,7 @@ impl Settings { dictionary, synonyms, distinct_attribute, + proximity_precision, typo_tolerance, faceting, pagination, @@ -252,6 +258,7 @@ impl Settings { dictionary, synonyms, distinct_attribute, + proximity_precision, typo_tolerance, faceting, pagination, @@ -296,6 +303,7 @@ impl Settings { separator_tokens: self.separator_tokens, dictionary: self.dictionary, distinct_attribute: self.distinct_attribute, + proximity_precision: self.proximity_precision, typo_tolerance: self.typo_tolerance, faceting: self.faceting, pagination: self.pagination, @@ -390,6 +398,12 @@ pub fn apply_settings_to_builder( Setting::NotSet => (), } + match settings.proximity_precision { + Setting::Set(ref precision) => builder.set_proximity_precision((*precision).into()), + Setting::Reset => builder.reset_proximity_precision(), + Setting::NotSet => (), + } + match settings.typo_tolerance { Setting::Set(ref value) => { match value.enabled { @@ -509,6 +523,8 @@ pub fn settings( let distinct_field = index.distinct_field(rtxn)?.map(String::from); + let proximity_precision = index.proximity_precision(rtxn)?.map(ProximityPrecisionView::from); + let synonyms = index.user_defined_synonyms(rtxn)?; let min_typo_word_len = MinWordSizeTyposSetting { @@ -532,7 +548,10 @@ pub fn settings( let faceting = FacetingSettings { max_values_per_facet: Setting::Set( - index.max_values_per_facet(rtxn)?.unwrap_or(DEFAULT_VALUES_PER_FACET), + index + .max_values_per_facet(rtxn)? + .map(|x| x as usize) + .unwrap_or(DEFAULT_VALUES_PER_FACET), ), sort_facet_values_by: Setting::Set( index @@ -545,7 +564,10 @@ pub fn settings( let pagination = PaginationSettings { max_total_hits: Setting::Set( - index.pagination_max_total_hits(rtxn)?.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS), + index + .pagination_max_total_hits(rtxn)? 
+ .map(|x| x as usize) + .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS), ), }; @@ -569,6 +591,10 @@ pub fn settings( Some(field) => Setting::Set(field), None => Setting::Reset, }, + proximity_precision: match proximity_precision { + Some(precision) => Setting::Set(precision), + None => Setting::Reset, + }, synonyms: Setting::Set(synonyms), typo_tolerance: Setting::Set(typo_tolerance), faceting: Setting::Set(faceting), @@ -673,6 +699,31 @@ impl From for Criterion { } } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +pub enum ProximityPrecisionView { + WordScale, + AttributeScale, +} + +impl From for ProximityPrecisionView { + fn from(value: ProximityPrecision) -> Self { + match value { + ProximityPrecision::WordScale => ProximityPrecisionView::WordScale, + ProximityPrecision::AttributeScale => ProximityPrecisionView::AttributeScale, + } + } +} +impl From for ProximityPrecision { + fn from(value: ProximityPrecisionView) -> Self { + match value { + ProximityPrecisionView::WordScale => ProximityPrecision::WordScale, + ProximityPrecisionView::AttributeScale => ProximityPrecision::AttributeScale, + } + } +} + #[cfg(test)] pub(crate) mod test { use super::*; @@ -692,6 +743,7 @@ pub(crate) mod test { dictionary: Setting::NotSet, synonyms: Setting::NotSet, distinct_attribute: Setting::NotSet, + proximity_precision: Setting::NotSet, typo_tolerance: Setting::NotSet, faceting: Setting::NotSet, pagination: Setting::NotSet, @@ -716,6 +768,7 @@ pub(crate) mod test { dictionary: Setting::NotSet, synonyms: Setting::NotSet, distinct_attribute: Setting::NotSet, + proximity_precision: Setting::NotSet, typo_tolerance: Setting::NotSet, faceting: Setting::NotSet, pagination: Setting::NotSet, diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index e14116645..c59b38fa6 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -39,7 +39,7 @@ byte-unit = { version = "4.0.19", default-features = false, features = [ bytes = "1.4.0" clap = { version = "4.2.1", features = ["derive", "env"] } crossbeam-channel = "0.5.8" -deserr = { version = "0.6.0", features = ["actix-web"]} +deserr = { version = "0.6.0", features = ["actix-web"] } dump = { path = "../dump" } either = "1.8.1" env_logger = "0.10.0" diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 2f0014ab7..f75516731 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -251,6 +251,7 @@ struct Infos { env: String, experimental_enable_metrics: bool, experimental_reduce_indexing_memory_usage: bool, + experimental_max_number_of_batched_tasks: usize, db_path: bool, import_dump: bool, dump_dir: bool, @@ -285,6 +286,7 @@ impl From for Infos { db_path, experimental_enable_metrics, experimental_reduce_indexing_memory_usage, + experimental_max_number_of_batched_tasks, http_addr, master_key: _, env, @@ -340,6 +342,7 @@ impl From for Infos { ignore_snapshot_if_db_exists, http_addr: http_addr != default_http_addr(), http_payload_size_limit, + experimental_max_number_of_batched_tasks, log_level: log_level.to_string(), max_indexing_memory, max_indexing_threads, diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 603d8ff86..e0f488eea 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -234,6 +234,7 @@ fn open_or_create_database_unchecked( 
indexer_config: (&opt.indexer_options).try_into()?, autobatching_enabled: true, max_number_of_tasks: 1_000_000, + max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks, index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize, index_count: DEFAULT_INDEX_COUNT, instance_features, @@ -362,7 +363,7 @@ fn import_dump( update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }, - |indexing_step| log::debug!("update: {:?}", indexing_step), + |indexing_step| log::trace!("update: {:?}", indexing_step), || false, )?; @@ -397,6 +398,7 @@ pub fn configure_data( .app_data(web::Data::from(analytics)) .app_data( web::JsonConfig::default() + .limit(http_payload_size_limit) .content_type(|mime| mime == mime::APPLICATION_JSON) .error_handler(|err, req: &HttpRequest| match err { JsonPayloadError::ContentType => match req.headers().get(CONTENT_TYPE) { diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index b8489c3e3..1ed20f5b5 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -51,6 +51,8 @@ const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE"; +const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str = + "MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS"; const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml"; const DEFAULT_DB_PATH: &str = "./data.ms"; @@ -301,6 +303,11 @@ pub struct Opt { #[serde(default)] pub experimental_reduce_indexing_memory_usage: bool, + /// Experimentally reduces the maximum number of tasks that will be processed at once, see: + #[clap(long, env = MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS, default_value_t = default_limit_batched_tasks())] + #[serde(default = "default_limit_batched_tasks")] + pub experimental_max_number_of_batched_tasks: usize, + #[serde(flatten)] #[clap(flatten)] pub indexer_options: IndexerOpts, @@ -371,6 +378,7 @@ impl Opt { max_index_size: _, max_task_db_size: _, http_payload_size_limit, + experimental_max_number_of_batched_tasks, ssl_cert_path, ssl_key_path, ssl_auth_path, @@ -392,8 +400,8 @@ impl Opt { config_file_path: _, #[cfg(feature = "analytics")] no_analytics, - experimental_enable_metrics: enable_metrics_route, - experimental_reduce_indexing_memory_usage: reduce_indexing_memory_usage, + experimental_enable_metrics, + experimental_reduce_indexing_memory_usage, } = self; export_to_env_if_not_present(MEILI_DB_PATH, db_path); export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr); @@ -409,6 +417,10 @@ impl Opt { MEILI_HTTP_PAYLOAD_SIZE_LIMIT, http_payload_size_limit.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS, + experimental_max_number_of_batched_tasks.to_string(), + ); if let Some(ssl_cert_path) = ssl_cert_path { export_to_env_if_not_present(MEILI_SSL_CERT_PATH, ssl_cert_path); } @@ -433,11 +445,11 @@ impl Opt { export_to_env_if_not_present(MEILI_LOG_LEVEL, log_level.to_string()); export_to_env_if_not_present( MEILI_EXPERIMENTAL_ENABLE_METRICS, - enable_metrics_route.to_string(), + experimental_enable_metrics.to_string(), ); export_to_env_if_not_present( MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE, - reduce_indexing_memory_usage.to_string(), + experimental_reduce_indexing_memory_usage.to_string(), ); indexer_options.export_to_env(); } @@ -727,6 +739,10 @@ fn default_http_payload_size_limit() -> Byte 
{ Byte::from_str(DEFAULT_HTTP_PAYLOAD_SIZE_LIMIT).unwrap() } +fn default_limit_batched_tasks() -> usize { + usize::MAX +} + fn default_snapshot_dir() -> PathBuf { PathBuf::from(DEFAULT_SNAPSHOT_DIR) } diff --git a/meilisearch/src/routes/features.rs b/meilisearch/src/routes/features.rs index e7fd8de22..29e922eba 100644 --- a/meilisearch/src/routes/features.rs +++ b/meilisearch/src/routes/features.rs @@ -48,6 +48,8 @@ pub struct RuntimeTogglableFeatures { pub metrics: Option, #[deserr(default)] pub export_puffin_reports: Option, + #[deserr(default)] + pub proximity_precision: Option, } async fn patch_features( @@ -70,6 +72,10 @@ async fn patch_features( .0 .export_puffin_reports .unwrap_or(old_features.export_puffin_reports), + proximity_precision: new_features + .0 + .proximity_precision + .unwrap_or(old_features.proximity_precision), }; // explicitly destructure for analytics rather than using the `Serialize` implementation, because @@ -80,6 +86,7 @@ async fn patch_features( vector_store, metrics, export_puffin_reports, + proximity_precision, } = new_features; analytics.publish( @@ -89,6 +96,7 @@ async fn patch_features( "vector_store": vector_store, "metrics": metrics, "export_puffin_reports": export_puffin_reports, + "proximity_precision": proximity_precision, }), Some(&req), ); diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 2afc1b5fb..6d59f60dd 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -3,7 +3,7 @@ use std::io::ErrorKind; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; use actix_web::{web, HttpMessage, HttpRequest, HttpResponse}; -use bstr::ByteSlice; +use bstr::ByteSlice as _; use deserr::actix_web::{AwebJson, AwebQueryParameter}; use deserr::Deserr; use futures::StreamExt; @@ -612,8 +612,8 @@ fn retrieve_document>( let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); let internal_id = index - .external_documents_ids(&txn)? - .get(doc_id.as_bytes()) + .external_documents_ids() + .get(&txn, doc_id)? .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; let document = index diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 3921b535e..c22db24f0 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -78,6 +78,7 @@ macro_rules! 
make_setting_route { let body = body.into_inner(); + #[allow(clippy::redundant_closure_call)] $analytics(&body, &req); let new_settings = Settings { @@ -434,6 +435,30 @@ make_setting_route!( } ); +make_setting_route!( + "/proximity-precision", + put, + meilisearch_types::settings::ProximityPrecisionView, + meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsProximityPrecision, + >, + proximity_precision, + "proximityPrecision", + analytics, + |precision: &Option, req: &HttpRequest| { + use serde_json::json; + analytics.publish( + "ProximityPrecision Updated".to_string(), + json!({ + "proximity_precision": { + "set": precision.is_some(), + } + }), + Some(req), + ); + } +); + make_setting_route!( "/ranking-rules", put, @@ -540,6 +565,7 @@ generate_configure!( displayed_attributes, searchable_attributes, distinct_attribute, + proximity_precision, stop_words, separator_tokens, non_separator_tokens, @@ -593,6 +619,9 @@ pub async fn update_all( "distinct_attribute": { "set": new_settings.distinct_attribute.as_ref().set().is_some() }, + "proximity_precision": { + "set": new_settings.proximity_precision.as_ref().set().is_some() + }, "typo_tolerance": { "enabled": new_settings.typo_tolerance .as_ref() diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index 3a028022a..bcb8bb2a1 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -46,49 +46,46 @@ pub async fn multi_search_with_post( // Explicitly expect a `(ResponseError, usize)` for the error type rather than `ResponseError` only, // so that `?` doesn't work if it doesn't use `with_index`, ensuring that it is not forgotten in case of code // changes. - let search_results: Result<_, (ResponseError, usize)> = (|| { - async { - let mut search_results = Vec::with_capacity(queries.len()); - for (query_index, (index_uid, mut query)) in - queries.into_iter().map(SearchQueryWithIndex::into_index_query).enumerate() + let search_results: Result<_, (ResponseError, usize)> = async { + let mut search_results = Vec::with_capacity(queries.len()); + for (query_index, (index_uid, mut query)) in + queries.into_iter().map(SearchQueryWithIndex::into_index_query).enumerate() + { + debug!("multi-search #{query_index}: called with params: {:?}", query); + + // Check index from API key + if !index_scheduler.filters().is_index_authorized(&index_uid) { + return Err(AuthenticationError::InvalidToken).with_index(query_index); + } + // Apply search rules from tenant token + if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) { - debug!("multi-search #{query_index}: called with params: {:?}", query); + add_search_rules(&mut query, search_rules); + } - // Check index from API key - if !index_scheduler.filters().is_index_authorized(&index_uid) { - return Err(AuthenticationError::InvalidToken).with_index(query_index); - } - // Apply search rules from tenant token - if let Some(search_rules) = - index_scheduler.filters().get_index_search_rules(&index_uid) - { - add_search_rules(&mut query, search_rules); - } + let index = index_scheduler + .index(&index_uid) + .map_err(|err| { + let mut err = ResponseError::from(err); + // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but + // here the resource not found is not part of the URL. 
+ err.code = StatusCode::BAD_REQUEST; + err + }) + .with_index(query_index)?; - let index = index_scheduler - .index(&index_uid) - .map_err(|err| { - let mut err = ResponseError::from(err); - // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but - // here the resource not found is not part of the URL. - err.code = StatusCode::BAD_REQUEST; - err - }) + let search_result = + tokio::task::spawn_blocking(move || perform_search(&index, query, features)) + .await .with_index(query_index)?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, features)) - .await - .with_index(query_index)?; - - search_results.push(SearchResultWithIndex { - index_uid: index_uid.into_inner(), - result: search_result.with_index(query_index)?, - }); - } - Ok(search_results) + search_results.push(SearchResultWithIndex { + index_uid: index_uid.into_inner(), + result: search_result.with_index(query_index)?, + }); } - })() + Ok(search_results) + } .await; if search_results.is_ok() { diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index c9ebed80e..41f073b48 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -360,6 +360,7 @@ fn prepare_search<'t>( let max_total_hits = index .pagination_max_total_hits(rtxn) .map_err(milli::Error::from)? + .map(|x| x as usize) .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS); search.exhaustive_number_hits(is_finite_pagination); @@ -586,6 +587,7 @@ pub fn perform_search( let max_values_by_facet = index .max_values_per_facet(&rtxn) .map_err(milli::Error::from)? + .map(|x| x as usize) .unwrap_or(DEFAULT_VALUES_PER_FACET); facet_distribution.max_values_per_facet(max_values_by_facet); diff --git a/meilisearch/tests/assets/v6_v1.6.0_use_deactivated_experimental_setting.dump b/meilisearch/tests/assets/v6_v1.6.0_use_deactivated_experimental_setting.dump new file mode 100644 index 000000000..4f50733fd Binary files /dev/null and b/meilisearch/tests/assets/v6_v1.6.0_use_deactivated_experimental_setting.dump differ diff --git a/meilisearch/tests/documents/delete_documents.rs b/meilisearch/tests/documents/delete_documents.rs index b3f04aea0..5a15e95ff 100644 --- a/meilisearch/tests/documents/delete_documents.rs +++ b/meilisearch/tests/documents/delete_documents.rs @@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() { "canceledBy": null, "details": { "providedIds": 0, - "deletedDocuments": 4, + "deletedDocuments": 2, "originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]" }, "error": null, diff --git a/meilisearch/tests/dumps/data.rs b/meilisearch/tests/dumps/data.rs index 5df09bfd1..d353aaf1d 100644 --- a/meilisearch/tests/dumps/data.rs +++ b/meilisearch/tests/dumps/data.rs @@ -20,6 +20,8 @@ pub enum GetDump { RubyGemsWithSettingsV4, TestV5, + + TestV6WithExperimental, } impl GetDump { @@ -68,6 +70,10 @@ impl GetDump { GetDump::TestV5 => { exist_relative_path!("tests/assets/v5_v0.28.0_test_dump.dump").into() } + GetDump::TestV6WithExperimental => exist_relative_path!( + "tests/assets/v6_v1.6.0_use_deactivated_experimental_setting.dump" + ) + .into(), } } } diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index e60893d4e..9e949436a 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -59,6 +59,7 @@ async fn import_dump_v1_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -219,6 +220,7 @@ async fn 
import_dump_v1_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -365,6 +367,7 @@ async fn import_dump_v1_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -497,6 +500,7 @@ async fn import_dump_v2_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -641,6 +645,7 @@ async fn import_dump_v2_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -784,6 +789,7 @@ async fn import_dump_v2_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -916,6 +922,7 @@ async fn import_dump_v3_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1060,6 +1067,7 @@ async fn import_dump_v3_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1203,6 +1211,7 @@ async fn import_dump_v3_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1335,6 +1344,7 @@ async fn import_dump_v4_movie_raw() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1479,6 +1489,7 @@ async fn import_dump_v4_movie_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1622,6 +1633,7 @@ async fn import_dump_v4_rubygems_with_settings() { "dictionary": [], "synonyms": {}, "distinctAttribute": null, + "proximityPrecision": null, "typoTolerance": { "enabled": true, "minWordSizeForTypos": { @@ -1810,3 +1822,108 @@ async fn import_dump_v5() { json_string!(tasks, { ".results[].details.dumpUid" => "[uid]", ".results[].duration" => "[duration]" , ".results[].startedAt" => "[date]" , ".results[].finishedAt" => "[date]" }) ); } + +#[actix_rt::test] +async fn import_dump_v6_containing_experimental_features() { + let temp = tempfile::tempdir().unwrap(); + + let options = Opt { + import_dump: Some(GetDump::TestV6WithExperimental.path()), + ..default_settings(temp.path()) + }; + let mut server = Server::new_auth_with_options(options, temp).await; + server.use_api_key("MASTER_KEY"); + + let (indexes, code) = server.list_indexes(None, None).await; + assert_eq!(code, 200, "{indexes}"); + + assert_eq!(indexes["results"].as_array().unwrap().len(), 1); + assert_eq!(indexes["results"][0]["uid"], json!("movies")); + assert_eq!(indexes["results"][0]["primaryKey"], json!("id")); + + let (response, code) = server.get_features().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "scoreDetails": false, + "vectorStore": false, + "metrics": false, + "exportPuffinReports": false, + 
"proximityPrecision": false + } + "###); + + let index = server.index("movies"); + + let (response, code) = index.settings().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "displayedAttributes": [ + "*" + ], + "searchableAttributes": [ + "*" + ], + "filterableAttributes": [], + "sortableAttributes": [], + "rankingRules": [ + "words", + "typo", + "proximity" + ], + "stopWords": [], + "nonSeparatorTokens": [], + "separatorTokens": [], + "dictionary": [], + "synonyms": {}, + "distinctAttribute": null, + "proximityPrecision": "attributeScale", + "typoTolerance": { + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 5, + "twoTypos": 9 + }, + "disableOnWords": [], + "disableOnAttributes": [] + }, + "faceting": { + "maxValuesPerFacet": 100, + "sortFacetValuesBy": { + "*": "alpha" + } + }, + "pagination": { + "maxTotalHits": 1000 + } + } + "###); + + // the expected order is [1, 3, 2] instead of [3, 1, 2] + // because the attribute scale doesn't make the difference between 1 and 3. + index + .search(json!({"q": "the soup of day"}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish" + }, + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + }, + { + "id": 2, + "a": "Soup of day", + "b": "many the lazy fish" + } + ] + "###); + }) + .await; +} diff --git a/meilisearch/tests/features/mod.rs b/meilisearch/tests/features/mod.rs index abb006ac8..812a5c6bb 100644 --- a/meilisearch/tests/features/mod.rs +++ b/meilisearch/tests/features/mod.rs @@ -21,7 +21,8 @@ async fn experimental_features() { "scoreDetails": false, "vectorStore": false, "metrics": false, - "exportPuffinReports": false + "exportPuffinReports": false, + "proximityPrecision": false } "###); @@ -33,7 +34,8 @@ async fn experimental_features() { "scoreDetails": false, "vectorStore": true, "metrics": false, - "exportPuffinReports": false + "exportPuffinReports": false, + "proximityPrecision": false } "###); @@ -45,7 +47,8 @@ async fn experimental_features() { "scoreDetails": false, "vectorStore": true, "metrics": false, - "exportPuffinReports": false + "exportPuffinReports": false, + "proximityPrecision": false } "###); @@ -58,7 +61,8 @@ async fn experimental_features() { "scoreDetails": false, "vectorStore": true, "metrics": false, - "exportPuffinReports": false + "exportPuffinReports": false, + "proximityPrecision": false } "###); @@ -71,7 +75,8 @@ async fn experimental_features() { "scoreDetails": false, "vectorStore": true, "metrics": false, - "exportPuffinReports": false + "exportPuffinReports": false, + "proximityPrecision": false } "###); } @@ -91,7 +96,8 @@ async fn experimental_feature_metrics() { "scoreDetails": false, "vectorStore": false, "metrics": true, - "exportPuffinReports": false + "exportPuffinReports": false, + "proximityPrecision": false } "###); @@ -146,7 +152,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`, `metrics`, `exportPuffinReports`", + "message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`, `metrics`, `exportPuffinReports`, `proximityPrecision`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" diff --git 
a/meilisearch/tests/search/distinct.rs b/meilisearch/tests/search/distinct.rs index 14ce88da2..aea98215d 100644 --- a/meilisearch/tests/search/distinct.rs +++ b/meilisearch/tests/search/distinct.rs @@ -4,7 +4,7 @@ use once_cell::sync::Lazy; use crate::common::{Server, Value}; use crate::json; -pub(self) static DOCUMENTS: Lazy = Lazy::new(|| { +static DOCUMENTS: Lazy = Lazy::new(|| { json!([ { "id": 1, @@ -107,8 +107,8 @@ pub(self) static DOCUMENTS: Lazy = Lazy::new(|| { ]) }); -pub(self) static DOCUMENT_PRIMARY_KEY: &str = "id"; -pub(self) static DOCUMENT_DISTINCT_KEY: &str = "product_id"; +static DOCUMENT_PRIMARY_KEY: &str = "id"; +static DOCUMENT_DISTINCT_KEY: &str = "product_id"; /// testing: https://github.com/meilisearch/meilisearch/issues/4078 #[actix_rt::test] diff --git a/meilisearch/tests/search/facet_search.rs b/meilisearch/tests/search/facet_search.rs index 1b06f1b98..8c1229f1a 100644 --- a/meilisearch/tests/search/facet_search.rs +++ b/meilisearch/tests/search/facet_search.rs @@ -4,7 +4,7 @@ use once_cell::sync::Lazy; use crate::common::{Server, Value}; use crate::json; -pub(self) static DOCUMENTS: Lazy = Lazy::new(|| { +static DOCUMENTS: Lazy = Lazy::new(|| { json!([ { "title": "Shazam!", diff --git a/meilisearch/tests/search/geo.rs b/meilisearch/tests/search/geo.rs index 67a4ca7df..5c6bb78a1 100644 --- a/meilisearch/tests/search/geo.rs +++ b/meilisearch/tests/search/geo.rs @@ -4,7 +4,7 @@ use once_cell::sync::Lazy; use crate::common::{Server, Value}; use crate::json; -pub(self) static DOCUMENTS: Lazy = Lazy::new(|| { +static DOCUMENTS: Lazy = Lazy::new(|| { json!([ { "id": 1, diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 0cf322401..00678f7d4 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -15,7 +15,7 @@ use once_cell::sync::Lazy; use crate::common::{Server, Value}; use crate::json; -pub(self) static DOCUMENTS: Lazy = Lazy::new(|| { +static DOCUMENTS: Lazy = Lazy::new(|| { json!([ { "title": "Shazam!", @@ -40,7 +40,7 @@ pub(self) static DOCUMENTS: Lazy = Lazy::new(|| { ]) }); -pub(self) static NESTED_DOCUMENTS: Lazy = Lazy::new(|| { +static NESTED_DOCUMENTS: Lazy = Lazy::new(|| { json!([ { "id": 852, diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index b5c4644a5..0ea556b94 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -54,7 +54,7 @@ async fn get_settings() { let (response, code) = index.settings().await; assert_eq!(code, 200); let settings = response.as_object().unwrap(); - assert_eq!(settings.keys().len(), 14); + assert_eq!(settings.keys().len(), 15); assert_eq!(settings["displayedAttributes"], json!(["*"])); assert_eq!(settings["searchableAttributes"], json!(["*"])); assert_eq!(settings["filterableAttributes"], json!([])); diff --git a/meilisearch/tests/settings/mod.rs b/meilisearch/tests/settings/mod.rs index 70125a360..ccb4139e6 100644 --- a/meilisearch/tests/settings/mod.rs +++ b/meilisearch/tests/settings/mod.rs @@ -1,4 +1,5 @@ mod distinct; mod errors; mod get_settings; +mod proximity_settings; mod tokenizer_customization; diff --git a/meilisearch/tests/settings/proximity_settings.rs b/meilisearch/tests/settings/proximity_settings.rs new file mode 100644 index 000000000..d445adbfa --- /dev/null +++ b/meilisearch/tests/settings/proximity_settings.rs @@ -0,0 +1,396 @@ +use meili_snap::{json_string, snapshot}; +use once_cell::sync::Lazy; + +use crate::common::Server; +use 
crate::json; + +static DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish", + }, + { + "id": 2, + "a": "Soup of day", + "b": "many the lazy fish", + }, + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish", + }, + ]) +}); + +#[actix_rt::test] +async fn attribute_scale_search() { + let server = Server::new().await; + let (response, code) = server.set_features(json!({"proximityPrecision": true})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "scoreDetails": false, + "vectorStore": false, + "metrics": false, + "exportPuffinReports": false, + "proximityPrecision": true + } + "###); + let index = server.index("test"); + + index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(0).await; + + let (response, code) = index + .update_settings(json!({ + "proximityPrecision": "attributeScale", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(1).await; + + // the expected order is [1, 3, 2] instead of [3, 1, 2] + // because the attribute scale doesn't make the difference between 1 and 3. + index + .search(json!({"q": "the soup of day"}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish" + }, + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + }, + { + "id": 2, + "a": "Soup of day", + "b": "many the lazy fish" + } + ] + "###); + }) + .await; + + // the expected order is [1, 2, 3] instead of [1, 3, 2] + // because the attribute scale sees all the word in the same attribute + // and so doesn't make the difference between the documents. + index + .search(json!({"q": "many the fish"}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish" + }, + { + "id": 2, + "a": "Soup of day", + "b": "many the lazy fish" + }, + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn attribute_scale_phrase_search() { + let server = Server::new().await; + let (response, code) = server.set_features(json!({"proximityPrecision": true})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "scoreDetails": false, + "vectorStore": false, + "metrics": false, + "exportPuffinReports": false, + "proximityPrecision": true + } + "###); + let index = server.index("test"); + + index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(0).await; + + let (_response, _code) = index + .update_settings(json!({ + "proximityPrecision": "attributeScale", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + index.wait_task(1).await; + + // the expected order is [1, 3] instead of [3, 1] + // because the attribute scale doesn't make the difference between 1 and 3. + // But 2 shouldn't be returned because "the" is not in the same attribute. 
+ index + .search(json!({"q": "\"the soup of day\""}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish" + }, + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + } + ] + "###); + }) + .await; + + // the expected order is [1, 2, 3] instead of [1, 3] + // because the attribute scale sees all the word in the same attribute + // and so doesn't make the difference between the documents. + index + .search(json!({"q": "\"many the fish\""}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish" + }, + { + "id": 2, + "a": "Soup of day", + "b": "many the lazy fish" + }, + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn word_scale_set_and_reset() { + let server = Server::new().await; + let (response, code) = server.set_features(json!({"proximityPrecision": true})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "scoreDetails": false, + "vectorStore": false, + "metrics": false, + "exportPuffinReports": false, + "proximityPrecision": true + } + "###); + let index = server.index("test"); + + index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(0).await; + + // Set and reset the setting ensuring the swap between the 2 settings is applied. + let (_response, _code) = index + .update_settings(json!({ + "proximityPrecision": "attributeScale", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + index.wait_task(1).await; + + let (_response, _code) = index + .update_settings(json!({ + "proximityPrecision": "wordScale", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + index.wait_task(2).await; + + // [3, 1, 2] + index + .search(json!({"q": "the soup of day"}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + }, + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish" + }, + { + "id": 2, + "a": "Soup of day", + "b": "many the lazy fish" + } + ] + "###); + }) + .await; + + // [1, 3, 2] + index + .search(json!({"q": "many the fish"}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish" + }, + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + }, + { + "id": 2, + "a": "Soup of day", + "b": "many the lazy fish" + } + ] + "###); + }) + .await; + + // [3] + index + .search(json!({"q": "\"the soup of day\""}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + } + ] + "###); + }) + .await; + + // [1, 3] + index + .search(json!({"q": "\"many the fish\""}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish" + }, + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn attribute_scale_default_ranking_rules() { + let server = Server::new().await; + let (response, code) = server.set_features(json!({"proximityPrecision": 
true})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "scoreDetails": false, + "vectorStore": false, + "metrics": false, + "exportPuffinReports": false, + "proximityPrecision": true + } + "###); + let index = server.index("test"); + + index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(0).await; + + let (response, code) = index + .update_settings(json!({ + "proximityPrecision": "attributeScale" + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(1).await; + + // the expected order is [3, 1, 2] + index + .search(json!({"q": "the soup of day"}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + }, + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish" + }, + { + "id": 2, + "a": "Soup of day", + "b": "many the lazy fish" + } + ] + "###); + }) + .await; + + // the expected order is [1, 3, 2] instead of [1, 3] + // because the attribute scale sees all the word in the same attribute + // and so doesn't remove the document 2. + index + .search(json!({"q": "\"many the fish\""}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "many the fish" + }, + { + "id": 3, + "a": "the Soup of day", + "b": "many the fish" + }, + { + "id": 2, + "a": "Soup of day", + "b": "many the lazy fish" + } + ] + "###); + }) + .await; +} diff --git a/meilitool/src/main.rs b/meilitool/src/main.rs index 2b40e42c2..f199df216 100644 --- a/meilitool/src/main.rs +++ b/meilitool/src/main.rs @@ -7,8 +7,8 @@ use clap::{Parser, Subcommand}; use dump::{DumpWriter, IndexMetadata}; use file_store::FileStore; use meilisearch_auth::AuthController; -use meilisearch_types::heed::types::{OwnedType, SerdeJson, Str}; -use meilisearch_types::heed::{Database, Env, EnvOpenOptions, PolyDatabase, RoTxn, RwTxn}; +use meilisearch_types::heed::types::{SerdeJson, Str}; +use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::{obkv_to_json, BEU32}; use meilisearch_types::tasks::{Status, Task}; @@ -148,15 +148,17 @@ fn try_opening_poly_database( env: &Env, rtxn: &RoTxn, db_name: &str, -) -> anyhow::Result { - env.open_poly_database(rtxn, Some(db_name)) +) -> anyhow::Result> { + env.database_options() + .name(db_name) + .open(rtxn) .with_context(|| format!("While opening the {db_name:?} poly database"))? 
.with_context(|| format!("Missing the {db_name:?} poly database")) } fn try_clearing_poly_database( wtxn: &mut RwTxn, - database: PolyDatabase, + database: Database, db_name: &str, ) -> anyhow::Result<()> { database.clear(wtxn).with_context(|| format!("While clearing the {db_name:?} database")) @@ -212,7 +214,7 @@ fn export_a_dump( eprintln!("Successfully dumped {count} keys!"); let rtxn = env.read_txn()?; - let all_tasks: Database, SerdeJson> = + let all_tasks: Database> = try_opening_database(&env, &rtxn, "all-tasks")?; let index_mapping: Database = try_opening_database(&env, &rtxn, "index-mapping")?; diff --git a/meilitool/src/uuid_codec.rs b/meilitool/src/uuid_codec.rs index 70a92ca94..54020fa3c 100644 --- a/meilitool/src/uuid_codec.rs +++ b/meilitool/src/uuid_codec.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::convert::TryInto; -use meilisearch_types::heed::{BytesDecode, BytesEncode}; +use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode}; use uuid::Uuid; /// A heed codec for value of struct Uuid. @@ -10,15 +10,15 @@ pub struct UuidCodec; impl<'a> BytesDecode<'a> for UuidCodec { type DItem = Uuid; - fn bytes_decode(bytes: &'a [u8]) -> Option { - bytes.try_into().ok().map(Uuid::from_bytes) + fn bytes_decode(bytes: &'a [u8]) -> Result { + bytes.try_into().map(Uuid::from_bytes).map_err(Into::into) } } impl BytesEncode<'_> for UuidCodec { type EItem = Uuid; - fn bytes_encode(item: &Self::EItem) -> Option> { - Some(Cow::Borrowed(item.as_bytes())) + fn bytes_encode(item: &Self::EItem) -> Result, BoxedError> { + Ok(Cow::Borrowed(item.as_bytes())) } } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index cf5fe9726..8aa2a6f3f 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -20,17 +20,17 @@ byteorder = "1.4.3" charabia = { version = "0.8.5", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.8" -deserr = { version = "0.6.0", features = ["actix-web"]} +deserr = "0.6.0" either = { version = "1.8.1", features = ["serde"] } flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" -grenad = { version = "0.4.4", default-features = false, features = [ - "tempfile", +grenad = { version = "0.4.5", default-features = false, features = [ + "rayon", "tempfile" ] } -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [ - "lmdb", "read-txn-no-tls" +heed = { version = "0.20.0-alpha.9", default-features = false, features = [ + "serde-json", "serde-bincode", "read-txn-no-tls" ] } indexmap = { version = "2.0.0", features = ["serde"] } instant-distance = { version = "0.6.1", features = ["with-serde"] } @@ -79,6 +79,7 @@ big_s = "1.0.2" insta = "1.29.0" maplit = "1.0.2" md5 = "0.7.0" +meili-snap = { path = "../meili-snap" } rand = { version = "0.8.5", features = ["small_rng"] } [features] diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 7c037b3bf..a874ac17e 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,5 +1,6 @@ mod builder; mod enriched; +mod primary_key; mod reader; mod serde_impl; @@ -11,6 +12,7 @@ use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; use obkv::KvReader; +pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY}; pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; @@ 
-87,6 +89,12 @@ impl DocumentsBatchIndex {
     }
 }
 
+impl FieldIdMapper for DocumentsBatchIndex {
+    fn id(&self, name: &str) -> Option<FieldId> {
+        self.id(name)
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub enum Error {
     #[error("Error parsing number {value:?} at line {line}: {error}")]
diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs
new file mode 100644
index 000000000..16a95c21f
--- /dev/null
+++ b/milli/src/documents/primary_key.rs
@@ -0,0 +1,172 @@
+use std::iter;
+use std::result::Result as StdResult;
+
+use serde_json::Value;
+
+use crate::{FieldId, InternalError, Object, Result, UserError};
+
+/// The symbol used to define levels in a nested primary key.
+const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
+
+/// The default primary key that is used when not specified.
+pub const DEFAULT_PRIMARY_KEY: &str = "id";
+
+/// Trait for objects that can map the name of a field to its [`FieldId`].
+pub trait FieldIdMapper {
+    /// Attempts to map the passed name to its [`FieldId`].
+    ///
+    /// `None` if the field with this name was not found.
+    fn id(&self, name: &str) -> Option<FieldId>;
+}
+
+/// A type that represents the kind of primary key that has been set
+/// for this index, either a classic flat one or a nested one.
+#[derive(Debug, Clone, Copy)]
+pub enum PrimaryKey<'a> {
+    Flat { name: &'a str, field_id: FieldId },
+    Nested { name: &'a str },
+}
+
+pub enum DocumentIdExtractionError {
+    InvalidDocumentId(UserError),
+    MissingDocumentId,
+    TooManyDocumentIds(usize),
+}
+
+impl<'a> PrimaryKey<'a> {
+    pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> {
+        Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
+            Self::Nested { name: path }
+        } else {
+            let field_id = fields.id(path)?;
+            Self::Flat { name: path, field_id }
+        })
+    }
+
+    pub fn name(&self) -> &str {
+        match self {
+            PrimaryKey::Flat { name, .. } => name,
+            PrimaryKey::Nested { name } => name,
+        }
+    }
+
+    pub fn document_id(
+        &self,
+        document: &obkv::KvReader<FieldId>,
+        fields: &impl FieldIdMapper,
+    ) -> Result<StdResult<String, DocumentIdExtractionError>> {
+        match self {
+            PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
+                Some(document_id_bytes) => {
+                    let document_id = serde_json::from_slice(document_id_bytes)
+                        .map_err(InternalError::SerdeJson)?;
+                    match validate_document_id_value(document_id)? {
+                        Ok(document_id) => Ok(Ok(document_id)),
+                        Err(user_error) => {
+                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
+                        }
+                    }
+                }
+                None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
+            },
+            nested @ PrimaryKey::Nested { .. } => {
+                let mut matching_documents_ids = Vec::new();
+                for (first_level_name, right) in nested.possible_level_names() {
+                    if let Some(field_id) = fields.id(first_level_name) {
+                        if let Some(value_bytes) = document.get(field_id) {
+                            let object = serde_json::from_slice(value_bytes)
+                                .map_err(InternalError::SerdeJson)?;
+                            fetch_matching_values(object, right, &mut matching_documents_ids);
+
+                            if matching_documents_ids.len() >= 2 {
+                                return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(
+                                    matching_documents_ids.len(),
+                                )));
+                            }
+                        }
+                    }
+                }
+
+                match matching_documents_ids.pop() {
+                    Some(document_id) => match validate_document_id_value(document_id)? {
+                        Ok(document_id) => Ok(Ok(document_id)),
+                        Err(user_error) => {
+                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
+                        }
+                    },
+                    None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
+                }
+            }
+        }
+    }
+
+    /// Returns an `Iterator` that gives all the possible field names the primary key
+    /// can have depending on the first level name and the depth of the objects.
+    pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
+        let name = self.name();
+        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
+            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
+            .chain(iter::once((name, "")))
+    }
+}
+
+fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
+    match value {
+        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
+        otherwise => output.push(otherwise),
+    }
+}
+
+fn fetch_matching_values_in_object(
+    object: Object,
+    selector: &str,
+    base_key: &str,
+    output: &mut Vec<Value>,
+) {
+    for (key, value) in object {
+        let base_key = if base_key.is_empty() {
+            key.to_string()
+        } else {
+            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
+        };
+
+        if starts_with(selector, &base_key) {
+            match value {
+                Value::Object(object) => {
+                    fetch_matching_values_in_object(object, selector, &base_key, output)
+                }
+                value => output.push(value),
+            }
+        }
+    }
+}
+
+fn starts_with(selector: &str, key: &str) -> bool {
+    selector.strip_prefix(key).map_or(false, |tail| {
+        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
+    })
+}
+
+// FIXME: move to a DocumentId struct
+
+fn validate_document_id(document_id: &str) -> Option<&str> {
+    if !document_id.is_empty()
+        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
+    {
+        Some(document_id)
+    } else {
+        None
+    }
+}
+
+pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
+    match document_id {
+        Value::String(string) => match validate_document_id(&string) {
+            Some(s) if s.len() == string.len() => Ok(Ok(string)),
+            Some(s) => Ok(Ok(s.to_string())),
+            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
+        },
+        Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
+        content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
+    }
+}
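The new `primary_key.rs` module above resolves flat keys eagerly but keeps nested paths as-is, splitting them lazily in `possible_level_names`. Below is a minimal sketch (not part of the patch) of how the two variants behave, assuming the re-exports added to `milli/src/documents/mod.rs`; the `Mapper` type is made up for illustration, milli's real mappers being `FieldsIdsMap` and `DocumentsBatchIndex`:

```rust
use std::collections::HashMap;

use milli::documents::{FieldIdMapper, PrimaryKey};
use milli::FieldId;

// Illustrative mapper backed by a plain HashMap.
struct Mapper(HashMap<String, FieldId>);

impl FieldIdMapper for Mapper {
    fn id(&self, name: &str) -> Option<FieldId> {
        self.0.get(name).copied()
    }
}

fn main() {
    let fields = Mapper(HashMap::from([("id".to_string(), 0)]));

    // A flat path resolves to its field id immediately, or `None` if unknown.
    let flat = PrimaryKey::new("id", &fields).unwrap();
    assert_eq!(flat.name(), "id");

    // A nested path is kept whole; `possible_level_names` yields every
    // (first level, remainder) split, ending with the full path itself.
    let nested = PrimaryKey::new("user.profile.id", &fields).unwrap();
    let levels: Vec<_> = nested.possible_level_names().collect();
    assert_eq!(
        levels,
        vec![("user", "profile.id"), ("user.profile", "id"), ("user.profile.id", "")]
    );
}
```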
diff --git a/milli/src/error.rs b/milli/src/error.rs
index e9e1fddd3..cbbd8a3e5 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry {
 
 #[derive(Error, Debug)]
 pub enum UserError {
-    #[error("A soft deleted internal document id have been used: `{document_id}`.")]
-    AccessingSoftDeletedDocument { document_id: DocumentId },
     #[error("A document cannot contain more than 65,535 fields.")]
     AttributeLimitReached,
     #[error(transparent)]
@@ -154,7 +152,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
         valid_fields: BTreeSet<String>,
         hidden_fields: bool,
     },
-    #[error("{}", HeedError::BadOpenOptions)]
+    #[error("an environment is already opened with different options")]
     InvalidLmdbOpenOptions,
     #[error("You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.")]
     SortRankingRuleMissing,
@@ -328,11 +326,12 @@ impl From<HeedError> for Error {
             HeedError::Mdb(MdbError::MapFull) => UserError(MaxDatabaseSizeReached),
             HeedError::Mdb(MdbError::Invalid) => UserError(InvalidStoreFile),
             HeedError::Mdb(error) => InternalError(Store(error)),
-            HeedError::Encoding => InternalError(Serialization(Encoding { db_name: None })),
-            HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })),
+            // TODO use the encoding
+            HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })),
+            HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })),
             HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
             HeedError::DatabaseClosing => InternalError(DatabaseClosing),
-            HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions),
+            HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions),
         }
     }
 }
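The next file swaps the FST-based `ExternalDocumentsIds` for a plain `heed` database keyed by external id. Before the diff itself, here is a sketch of how the new API is meant to be driven from inside the crate; `rename_external_id`, the `movie-*` keys, and the ids are hypothetical, and a write transaction is assumed to be open:

```rust
use heed::RwTxn;

use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::Index;

// Hypothetical helper, for illustration only: re-keys one document.
fn rename_external_id(index: &Index, wtxn: &mut RwTxn) -> heed::Result<()> {
    let external_ids = index.external_documents_ids();

    // One delete and one create; several operations on the same external id
    // are documented as having an unspecified result.
    external_ids.apply(
        wtxn,
        vec![
            DocumentOperation {
                external_id: "movie-42".to_string(),
                internal_id: 42,
                kind: DocumentOperationKind::Delete,
            },
            DocumentOperation {
                external_id: "movie-43".to_string(),
                internal_id: 42,
                kind: DocumentOperationKind::Create,
            },
        ],
    )?;

    // Reads take any transaction that derefs to a `RoTxn`.
    assert_eq!(external_ids.get(wtxn, "movie-43")?, Some(42));
    Ok(())
}
```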
diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index 36b147336..361617826 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -1,159 +1,75 @@
-use std::borrow::Cow;
 use std::collections::HashMap;
-use std::convert::TryInto;
-use std::{fmt, str};
 
-use fst::map::IndexedValue;
-use fst::{IntoStreamer, Streamer};
-use roaring::RoaringBitmap;
+use heed::types::Str;
+use heed::{Database, RoIter, RoTxn, RwTxn};
 
-const DELETED_ID: u64 = u64::MAX;
+use crate::{DocumentId, BEU32};
 
-pub struct ExternalDocumentsIds<'a> {
-    pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
-    pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
-    soft_deleted_docids: RoaringBitmap,
+pub enum DocumentOperationKind {
+    Create,
+    Delete,
 }
 
-impl<'a> ExternalDocumentsIds<'a> {
-    pub fn new(
-        hard: fst::Map<Cow<'a, [u8]>>,
-        soft: fst::Map<Cow<'a, [u8]>>,
-        soft_deleted_docids: RoaringBitmap,
-    ) -> ExternalDocumentsIds<'a> {
-        ExternalDocumentsIds { hard, soft, soft_deleted_docids }
-    }
+pub struct DocumentOperation {
+    pub external_id: String,
+    pub internal_id: DocumentId,
+    pub kind: DocumentOperationKind,
+}
 
-    pub fn into_static(self) -> ExternalDocumentsIds<'static> {
-        ExternalDocumentsIds {
-            hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
-            soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
-            soft_deleted_docids: self.soft_deleted_docids,
-        }
+pub struct ExternalDocumentsIds(Database<Str, BEU32>);
+
+impl ExternalDocumentsIds {
+    pub fn new(db: Database<Str, BEU32>) -> ExternalDocumentsIds {
+        ExternalDocumentsIds(db)
     }
 
-    /// Returns `true` if hard and soft external documents lists are empty.
-    pub fn is_empty(&self) -> bool {
-        self.hard.is_empty() && self.soft.is_empty()
+    /// Returns `true` if the external documents ids map is empty.
+    pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> {
+        self.0.is_empty(rtxn).map_err(Into::into)
     }
 
-    pub fn get<A: AsRef<str>>(&self, external_id: A) -> Option<u32> {
-        let external_id = external_id.as_ref();
-        match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
-            Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
-                Some(id.try_into().unwrap())
-            }
-            _otherwise => None,
-        }
-    }
-
-    /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
-    /// don't contain any soft deleted document id.
-    pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
-        let mut new_hard_builder = fst::MapBuilder::memory();
-
-        let union_op = self.hard.op().add(&self.soft).r#union();
-        let mut iter = union_op.into_stream();
-        while let Some((external_id, docids)) = iter.next() {
-            // prefer selecting the ids from soft, always
-            let id = indexed_last_value(docids).unwrap();
-            if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) {
-                new_hard_builder.insert(external_id, id)?;
-            }
-        }
-        drop(iter);
-
-        // Delete soft map completely
-        self.soft = fst::Map::default().map_data(Cow::Owned)?;
-        // We save the new map as the new hard map.
-        self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
-
-        Ok(())
-    }
-
-    pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
-        let union_op = self.soft.op().add(other).r#union();
-
-        let mut new_soft_builder = fst::MapBuilder::memory();
-        let mut iter = union_op.into_stream();
-        while let Some((external_id, marked_docids)) = iter.next() {
-            let id = indexed_last_value(marked_docids).unwrap();
-            new_soft_builder.insert(external_id, id)?;
-        }
-
-        drop(iter);
-
-        // We save the new map as the new soft map.
-        self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
-        self.merge_soft_into_hard()
+    pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<DocumentId>> {
+        self.0.get(rtxn, external_id.as_ref())
    }
 
-    /// An helper function to debug this type, returns an `HashMap` of both,
-    /// soft and hard fst maps, combined.
-    pub fn to_hash_map(&self) -> HashMap<String, u32> {
-        let mut map = HashMap::new();
-
-        let union_op = self.hard.op().add(&self.soft).r#union();
-        let mut iter = union_op.into_stream();
-        while let Some((external_id, marked_docids)) = iter.next() {
-            let id = indexed_last_value(marked_docids).unwrap();
-            if id != DELETED_ID {
-                let external_id = str::from_utf8(external_id).unwrap();
-                map.insert(external_id.to_owned(), id.try_into().unwrap());
-            }
-        }
-
-        map
+    /// A helper function to debug this type, returns a `HashMap` mapping the
+    /// external ids to the internal ones.
+    pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, DocumentId>> {
+        let mut map = HashMap::default();
+        for result in self.0.iter(rtxn)? {
+            let (external, internal) = result?;
+            map.insert(external.to_owned(), internal);
+        }
+        Ok(map)
     }
 
-    /// Return an fst of the combined hard and soft deleted ID.
-    pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
-        if self.soft.is_empty() {
-            return Ok(Cow::Borrowed(&self.hard));
-        }
-        let union_op = self.hard.op().add(&self.soft).r#union();
-
-        let mut iter = union_op.into_stream();
-        let mut new_hard_builder = fst::MapBuilder::memory();
-        while let Some((external_id, marked_docids)) = iter.next() {
-            let value = indexed_last_value(marked_docids).unwrap();
-            if value != DELETED_ID {
-                new_hard_builder.insert(external_id, value)?;
+    /// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
+    ///
+    /// If the list contains multiple operations on the same external id, then the result is unspecified.
+    ///
+    /// # Panics
+    ///
+    /// - If attempting to delete a document that doesn't exist
+    pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> {
+        for DocumentOperation { external_id, internal_id, kind } in operations {
+            match kind {
+                DocumentOperationKind::Create => {
+                    self.0.put(wtxn, &external_id, &internal_id)?;
+                }
+                DocumentOperationKind::Delete => {
+                    if !self.0.delete(wtxn, &external_id)? {
+                        panic!("Attempting to delete a non-existing document")
+                    }
+                }
             }
         }
-        drop(iter);
-
-        Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
-    }
-
-    fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
-        if self.soft.len() >= self.hard.len() / 2 {
-            self.hard = self.to_fst()?.into_owned();
-            self.soft = fst::Map::default().map_data(Cow::Owned)?;
-        }
-        Ok(())
     }
-}
 
-impl fmt::Debug for ExternalDocumentsIds<'_> {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish()
+    /// Returns an iterator over all the external ids.
+ pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result> { + self.0.iter(rtxn) } } - -impl Default for ExternalDocumentsIds<'static> { - fn default() -> Self { - ExternalDocumentsIds { - hard: fst::Map::default().map_data(Cow::Owned).unwrap(), - soft: fst::Map::default().map_data(Cow::Owned).unwrap(), - soft_deleted_docids: RoaringBitmap::new(), - } - } -} - -/// Returns the value of the `IndexedValue` with the highest _index_. -fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option { - indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value) -} diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 810ff755b..9c1c87f82 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -81,6 +81,12 @@ impl Default for FieldsIdsMap { } } +impl crate::documents::FieldIdMapper for FieldsIdsMap { + fn id(&self, name: &str) -> Option { + self.id(name) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/milli/src/heed_codec/beu16_str_codec.rs b/milli/src/heed_codec/beu16_str_codec.rs index d1b85d47f..ba04f0900 100644 --- a/milli/src/heed_codec/beu16_str_codec.rs +++ b/milli/src/heed_codec/beu16_str_codec.rs @@ -2,26 +2,28 @@ use std::borrow::Cow; use std::convert::TryInto; use std::str; +use heed::BoxedError; + pub struct BEU16StrCodec; impl<'a> heed::BytesDecode<'a> for BEU16StrCodec { type DItem = (u16, &'a str); - fn bytes_decode(bytes: &'a [u8]) -> Option { + fn bytes_decode(bytes: &'a [u8]) -> Result { let (n_bytes, str_bytes) = bytes.split_at(2); - let n = n_bytes.try_into().map(u16::from_be_bytes).ok()?; - let s = str::from_utf8(str_bytes).ok()?; - Some((n, s)) + let n = n_bytes.try_into().map(u16::from_be_bytes)?; + let s = str::from_utf8(str_bytes)?; + Ok((n, s)) } } impl<'a> heed::BytesEncode<'a> for BEU16StrCodec { type EItem = (u16, &'a str); - fn bytes_encode((n, s): &Self::EItem) -> Option> { + fn bytes_encode((n, s): &Self::EItem) -> Result, BoxedError> { let mut bytes = Vec::with_capacity(s.len() + 2); bytes.extend_from_slice(&n.to_be_bytes()); bytes.extend_from_slice(s.as_bytes()); - Some(Cow::Owned(bytes)) + Ok(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/beu32_str_codec.rs b/milli/src/heed_codec/beu32_str_codec.rs index c525d6b5b..762e31ca4 100644 --- a/milli/src/heed_codec/beu32_str_codec.rs +++ b/milli/src/heed_codec/beu32_str_codec.rs @@ -2,26 +2,28 @@ use std::borrow::Cow; use std::convert::TryInto; use std::str; +use heed::BoxedError; + pub struct BEU32StrCodec; impl<'a> heed::BytesDecode<'a> for BEU32StrCodec { type DItem = (u32, &'a str); - fn bytes_decode(bytes: &'a [u8]) -> Option { + fn bytes_decode(bytes: &'a [u8]) -> Result { let (n_bytes, str_bytes) = bytes.split_at(4); - let n = n_bytes.try_into().map(u32::from_be_bytes).ok()?; - let s = str::from_utf8(str_bytes).ok()?; - Some((n, s)) + let n = n_bytes.try_into().map(u32::from_be_bytes)?; + let s = str::from_utf8(str_bytes)?; + Ok((n, s)) } } impl<'a> heed::BytesEncode<'a> for BEU32StrCodec { type EItem = (u32, &'a str); - fn bytes_encode((n, s): &Self::EItem) -> Option> { + fn bytes_encode((n, s): &Self::EItem) -> Result, BoxedError> { let mut bytes = Vec::with_capacity(s.len() + 4); bytes.extend_from_slice(&n.to_be_bytes()); bytes.extend_from_slice(s.as_bytes()); - Some(Cow::Owned(bytes)) + Ok(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/byte_slice_ref.rs b/milli/src/heed_codec/byte_slice_ref.rs index 48eda63c5..a4b5748f1 100644 --- a/milli/src/heed_codec/byte_slice_ref.rs +++ 
b/milli/src/heed_codec/byte_slice_ref.rs @@ -1,23 +1,23 @@ use std::borrow::Cow; -use heed::{BytesDecode, BytesEncode}; +use heed::{BoxedError, BytesDecode, BytesEncode}; -/// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated +/// A codec for values of type `&[u8]`. Unlike `Bytes`, its `EItem` and `DItem` associated /// types are equivalent (= `&'a [u8]`) and these values can reside within another structure. -pub struct ByteSliceRefCodec; +pub struct BytesRefCodec; -impl<'a> BytesEncode<'a> for ByteSliceRefCodec { +impl<'a> BytesEncode<'a> for BytesRefCodec { type EItem = &'a [u8]; - fn bytes_encode(item: &'a Self::EItem) -> Option> { - Some(Cow::Borrowed(item)) + fn bytes_encode(item: &'a Self::EItem) -> Result, BoxedError> { + Ok(Cow::Borrowed(item)) } } -impl<'a> BytesDecode<'a> for ByteSliceRefCodec { +impl<'a> BytesDecode<'a> for BytesRefCodec { type DItem = &'a [u8]; - fn bytes_decode(bytes: &'a [u8]) -> Option { - Some(bytes) + fn bytes_decode(bytes: &'a [u8]) -> Result { + Ok(bytes) } } diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs index cc9919ad2..7e281adfa 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs @@ -1,8 +1,9 @@ use std::borrow::Cow; use std::marker::PhantomData; -use heed::{BytesDecode, BytesEncode}; +use heed::{BoxedError, BytesDecode, BytesEncode}; +use crate::heed_codec::SliceTooShortError; use crate::{try_split_array_at, DocumentId, FieldId}; pub struct FieldDocIdFacetCodec(PhantomData); @@ -13,16 +14,16 @@ where { type DItem = (FieldId, DocumentId, C::DItem); - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + fn bytes_decode(bytes: &'a [u8]) -> Result { + let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?; let field_id = u16::from_be_bytes(field_id_bytes); - let (document_id_bytes, bytes) = try_split_array_at(bytes)?; + let (document_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?; let document_id = u32::from_be_bytes(document_id_bytes); let value = C::bytes_decode(bytes)?; - Some((field_id, document_id, value)) + Ok((field_id, document_id, value)) } } @@ -32,13 +33,15 @@ where { type EItem = (FieldId, DocumentId, C::EItem); - fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option> { + fn bytes_encode( + (field_id, document_id, value): &'a Self::EItem, + ) -> Result, BoxedError> { let mut bytes = Vec::with_capacity(32); bytes.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes bytes.extend_from_slice(&document_id.to_be_bytes()); // 4 bytes let value_bytes = C::bytes_encode(value)?; // variable length, if f64 -> 16 bytes, if string -> large, potentially bytes.extend_from_slice(&value_bytes); - Some(Cow::Owned(bytes)) + Ok(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index d36ec8434..7bb874060 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -5,8 +5,8 @@ use std::borrow::Cow; use std::convert::TryFrom; use std::marker::PhantomData; -use heed::types::{DecodeIgnore, OwnedType}; -use heed::{BytesDecode, BytesEncode}; +use heed::types::DecodeIgnore; +use heed::{BoxedError, BytesDecode, BytesEncode}; use roaring::RoaringBitmap; pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec; @@ -18,7 +18,7 @@ pub type FieldDocIdFacetF64Codec = 
FieldDocIdFacetCodec; pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec; pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec; -pub type FieldIdCodec = OwnedType; +pub type FieldIdCodec = BEU16; /// Tries to split a slice in half at the given middle point, /// `None` if the slice is too short. @@ -58,7 +58,7 @@ where { type EItem = FacetGroupKey; - fn bytes_encode(value: &'a Self::EItem) -> Option> { + fn bytes_encode(value: &'a Self::EItem) -> Result, BoxedError> { let mut v = vec![]; v.extend_from_slice(&value.field_id.to_be_bytes()); v.extend_from_slice(&[value.level]); @@ -66,7 +66,7 @@ where let bound = T::bytes_encode(&value.left_bound)?; v.extend_from_slice(&bound); - Some(Cow::Owned(v)) + Ok(Cow::Owned(v)) } } impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec @@ -75,11 +75,11 @@ where { type DItem = FacetGroupKey; - fn bytes_decode(bytes: &'a [u8]) -> Option { - let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?); + fn bytes_decode(bytes: &'a [u8]) -> Result { + let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1])?); let level = bytes[2]; let bound = T::bytes_decode(&bytes[3..])?; - Some(FacetGroupKey { field_id: fid, level, left_bound: bound }) + Ok(FacetGroupKey { field_id: fid, level, left_bound: bound }) } } @@ -87,17 +87,17 @@ pub struct FacetGroupValueCodec; impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { type EItem = FacetGroupValue; - fn bytes_encode(value: &'a Self::EItem) -> Option> { + fn bytes_encode(value: &'a Self::EItem) -> Result, BoxedError> { let mut v = vec![value.size]; CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); - Some(Cow::Owned(v)) + Ok(Cow::Owned(v)) } } impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { type DItem = FacetGroupValue; - fn bytes_decode(bytes: &'a [u8]) -> Option { + fn bytes_decode(bytes: &'a [u8]) -> Result { let size = bytes[0]; - let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?; - Some(FacetGroupValue { size, bitmap }) + let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?; + Ok(FacetGroupValue { size, bitmap }) } } diff --git a/milli/src/heed_codec/facet/ordered_f64_codec.rs b/milli/src/heed_codec/facet/ordered_f64_codec.rs index 5ac9ffcfc..b692b2363 100644 --- a/milli/src/heed_codec/facet/ordered_f64_codec.rs +++ b/milli/src/heed_codec/facet/ordered_f64_codec.rs @@ -1,37 +1,45 @@ use std::borrow::Cow; use std::convert::TryInto; -use heed::BytesDecode; +use heed::{BoxedError, BytesDecode}; +use thiserror::Error; use crate::facet::value_encoding::f64_into_bytes; +use crate::heed_codec::SliceTooShortError; pub struct OrderedF64Codec; impl<'a> BytesDecode<'a> for OrderedF64Codec { type DItem = f64; - fn bytes_decode(bytes: &'a [u8]) -> Option { + fn bytes_decode(bytes: &'a [u8]) -> Result { if bytes.len() < 16 { - return None; + Err(SliceTooShortError.into()) + } else { + bytes[8..].try_into().map(f64::from_be_bytes).map_err(Into::into) } - let f = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; - Some(f) } } impl heed::BytesEncode<'_> for OrderedF64Codec { type EItem = f64; - fn bytes_encode(f: &Self::EItem) -> Option> { + fn bytes_encode(f: &Self::EItem) -> Result, BoxedError> { let mut buffer = [0u8; 16]; // write the globally ordered float - let bytes = f64_into_bytes(*f)?; + let bytes = f64_into_bytes(*f).ok_or(InvalidGloballyOrderedFloatError { float: *f })?; buffer[..8].copy_from_slice(&bytes[..]); // Then the f64 value just to be able to read it back let bytes = f.to_be_bytes(); 
buffer[8..16].copy_from_slice(&bytes[..]); - Some(Cow::Owned(buffer.to_vec())) + Ok(Cow::Owned(buffer.to_vec())) } } + +#[derive(Error, Debug)] +#[error("the float {float} cannot be converted to a globally ordered representation")] +pub struct InvalidGloballyOrderedFloatError { + float: f64, +} diff --git a/milli/src/heed_codec/field_id_word_count_codec.rs b/milli/src/heed_codec/field_id_word_count_codec.rs index aca7a80c4..19d8d63c6 100644 --- a/milli/src/heed_codec/field_id_word_count_codec.rs +++ b/milli/src/heed_codec/field_id_word_count_codec.rs @@ -1,5 +1,8 @@ use std::borrow::Cow; +use heed::BoxedError; + +use super::SliceTooShortError; use crate::{try_split_array_at, FieldId}; pub struct FieldIdWordCountCodec; @@ -7,21 +10,21 @@ pub struct FieldIdWordCountCodec; impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec { type DItem = (FieldId, u8); - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + fn bytes_decode(bytes: &'a [u8]) -> Result { + let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?; let field_id = u16::from_be_bytes(field_id_bytes); - let ([word_count], _nothing) = try_split_array_at(bytes)?; - Some((field_id, word_count)) + let ([word_count], _nothing) = try_split_array_at(bytes).ok_or(SliceTooShortError)?; + Ok((field_id, word_count)) } } impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec { type EItem = (FieldId, u8); - fn bytes_encode((field_id, word_count): &Self::EItem) -> Option> { + fn bytes_encode((field_id, word_count): &Self::EItem) -> Result, BoxedError> { let mut bytes = Vec::with_capacity(2 + 1); bytes.extend_from_slice(&field_id.to_be_bytes()); bytes.push(*word_count); - Some(Cow::Owned(bytes)) + Ok(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/fst_set_codec.rs b/milli/src/heed_codec/fst_set_codec.rs index fc79acf29..b402c8ff3 100644 --- a/milli/src/heed_codec/fst_set_codec.rs +++ b/milli/src/heed_codec/fst_set_codec.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use fst::Set; -use heed::{BytesDecode, BytesEncode}; +use heed::{BoxedError, BytesDecode, BytesEncode}; /// A codec for values of type `Set<&[u8]>`. 
pub struct FstSetCodec; @@ -9,15 +9,15 @@ pub struct FstSetCodec; impl<'a> BytesEncode<'a> for FstSetCodec { type EItem = Set>; - fn bytes_encode(item: &'a Self::EItem) -> Option> { - Some(Cow::Borrowed(item.as_fst().as_bytes())) + fn bytes_encode(item: &'a Self::EItem) -> Result, BoxedError> { + Ok(Cow::Borrowed(item.as_fst().as_bytes())) } } impl<'a> BytesDecode<'a> for FstSetCodec { type DItem = Set<&'a [u8]>; - fn bytes_decode(bytes: &'a [u8]) -> Option { - Set::new(bytes).ok() + fn bytes_decode(bytes: &'a [u8]) -> Result { + Set::new(bytes).map_err(Into::into) } } diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index d04eaa644..449d1955c 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -12,8 +12,10 @@ mod str_beu32_codec; mod str_ref; mod str_str_u8_codec; -pub use byte_slice_ref::ByteSliceRefCodec; +pub use byte_slice_ref::BytesRefCodec; +use heed::BoxedError; pub use str_ref::StrRefCodec; +use thiserror::Error; pub use self::beu16_str_codec::BEU16StrCodec; pub use self::beu32_str_codec::BEU32StrCodec; @@ -31,5 +33,9 @@ pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; pub trait BytesDecodeOwned { type DItem; - fn bytes_decode_owned(bytes: &[u8]) -> Option; + fn bytes_decode_owned(bytes: &[u8]) -> Result; } + +#[derive(Error, Debug)] +#[error("the slice is too short")] +pub struct SliceTooShortError; diff --git a/milli/src/heed_codec/obkv_codec.rs b/milli/src/heed_codec/obkv_codec.rs index 6dad771a8..d2408c87d 100644 --- a/milli/src/heed_codec/obkv_codec.rs +++ b/milli/src/heed_codec/obkv_codec.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; +use heed::BoxedError; use obkv::{KvReaderU16, KvWriterU16}; pub struct ObkvCodec; @@ -7,15 +8,15 @@ pub struct ObkvCodec; impl<'a> heed::BytesDecode<'a> for ObkvCodec { type DItem = KvReaderU16<'a>; - fn bytes_decode(bytes: &'a [u8]) -> Option { - Some(KvReaderU16::new(bytes)) + fn bytes_decode(bytes: &'a [u8]) -> Result { + Ok(KvReaderU16::new(bytes)) } } impl heed::BytesEncode<'_> for ObkvCodec { type EItem = KvWriterU16>; - fn bytes_encode(item: &Self::EItem) -> Option> { - item.clone().into_inner().map(Cow::Owned).ok() + fn bytes_encode(item: &Self::EItem) -> Result, BoxedError> { + item.clone().into_inner().map(Cow::Owned).map_err(Into::into) } } diff --git a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs index 9ad2e9707..c5e7e3e89 100644 --- a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::convert::TryInto; use std::mem::size_of; -use heed::BytesDecode; +use heed::{BoxedError, BytesDecode}; use roaring::RoaringBitmap; use crate::heed_codec::BytesDecodeOwned; @@ -19,22 +19,22 @@ impl BoRoaringBitmapCodec { impl BytesDecode<'_> for BoRoaringBitmapCodec { type DItem = RoaringBitmap; - fn bytes_decode(bytes: &[u8]) -> Option { + fn bytes_decode(bytes: &[u8]) -> Result { let mut bitmap = RoaringBitmap::new(); for chunk in bytes.chunks(size_of::()) { - let bytes = chunk.try_into().ok()?; + let bytes = chunk.try_into()?; bitmap.push(u32::from_ne_bytes(bytes)); } - Some(bitmap) + Ok(bitmap) } } impl BytesDecodeOwned for BoRoaringBitmapCodec { type DItem = RoaringBitmap; - fn bytes_decode_owned(bytes: &[u8]) -> Option { + fn bytes_decode_owned(bytes: &[u8]) -> Result { Self::bytes_decode(bytes) } } @@ -42,9 +42,9 @@ impl BytesDecodeOwned for BoRoaringBitmapCodec { impl 
heed::BytesEncode<'_> for BoRoaringBitmapCodec { type EItem = RoaringBitmap; - fn bytes_encode(item: &Self::EItem) -> Option> { + fn bytes_encode(item: &Self::EItem) -> Result, BoxedError> { let mut out = Vec::new(); BoRoaringBitmapCodec::serialize_into(item, &mut out); - Some(Cow::Owned(out)) + Ok(Cow::Owned(out)) } } diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index bf76287d8..dcab42c0a 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -3,9 +3,11 @@ use std::io; use std::mem::size_of; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; +use heed::BoxedError; use roaring::RoaringBitmap; use crate::heed_codec::BytesDecodeOwned; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; /// This is the limit where using a byteorder became less size efficient /// than using a direct roaring encoding, it is also the point where we are able @@ -60,12 +62,16 @@ impl CboRoaringBitmapCodec { /// if the merged values length is under the threshold, values are directly /// serialized in the buffer else a RoaringBitmap is created from the /// values and is serialized in the buffer. - pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { + pub fn merge_into(slices: I, buffer: &mut Vec) -> io::Result<()> + where + I: IntoIterator, + A: AsRef<[u8]>, + { let mut roaring = RoaringBitmap::new(); let mut vec = Vec::new(); for bytes in slices { - if bytes.len() <= THRESHOLD * size_of::() { + if bytes.as_ref().len() <= THRESHOLD * size_of::() { let mut reader = bytes.as_ref(); while let Ok(integer) = reader.read_u32::() { vec.push(integer); @@ -85,7 +91,7 @@ impl CboRoaringBitmapCodec { } } else { // We can unwrap safely because the vector is sorted upper. - let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap(); + let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap(); roaring.serialize_into(buffer)?; } } else { @@ -95,31 +101,58 @@ impl CboRoaringBitmapCodec { Ok(()) } + + /// Merges a DelAdd delta into a CboRoaringBitmap. 
+    pub fn merge_deladd_into<'a>(
+        deladd: KvReaderDelAdd<'_>,
+        previous: &[u8],
+        buffer: &'a mut Vec<u8>,
+    ) -> io::Result<Option<&'a [u8]>> {
+        // Deserialize the bitmap that is already there
+        let mut previous = Self::deserialize_from(previous)?;
+
+        // Remove the integers we no longer want from the previous bitmap
+        if let Some(value) = deladd.get(DelAdd::Deletion) {
+            previous -= Self::deserialize_from(value)?;
+        }
+
+        // Insert the new integers we want into the previous bitmap
+        if let Some(value) = deladd.get(DelAdd::Addition) {
+            previous |= Self::deserialize_from(value)?;
+        }
+
+        if previous.is_empty() {
+            return Ok(None);
+        }
+
+        Self::serialize_into(&previous, buffer);
+        Ok(Some(&buffer[..]))
+    }
 }
 
 impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
     type DItem = RoaringBitmap;
 
-    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
-        Self::deserialize_from(bytes).ok()
+    fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
+        Self::deserialize_from(bytes).map_err(Into::into)
     }
 }
 
 impl BytesDecodeOwned for CboRoaringBitmapCodec {
     type DItem = RoaringBitmap;
 
-    fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
-        Self::deserialize_from(bytes).ok()
+    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
+        Self::deserialize_from(bytes).map_err(Into::into)
     }
 }
 
 impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
     type EItem = RoaringBitmap;
 
-    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
+    fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
         let mut vec = Vec::with_capacity(Self::serialized_size(item));
         Self::serialize_into(item, &mut vec);
-        Some(Cow::Owned(vec))
+        Ok(Cow::Owned(vec))
     }
 }
diff --git a/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs
index f982cc105..aa532ffdd 100644
--- a/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs
+++ b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs
@@ -1,5 +1,6 @@
 use std::borrow::Cow;
 
+use heed::BoxedError;
 use roaring::RoaringBitmap;
 
 use crate::heed_codec::BytesDecodeOwned;
@@ -9,25 +10,25 @@ pub struct RoaringBitmapCodec;
 impl heed::BytesDecode<'_> for RoaringBitmapCodec {
     type DItem = RoaringBitmap;
 
-    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
-        RoaringBitmap::deserialize_unchecked_from(bytes).ok()
+    fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
+        RoaringBitmap::deserialize_unchecked_from(bytes).map_err(Into::into)
     }
 }
 
 impl BytesDecodeOwned for RoaringBitmapCodec {
     type DItem = RoaringBitmap;
 
-    fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
-        RoaringBitmap::deserialize_from(bytes).ok()
+    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
+        RoaringBitmap::deserialize_from(bytes).map_err(Into::into)
     }
 }
 
 impl heed::BytesEncode<'_> for RoaringBitmapCodec {
     type EItem = RoaringBitmap;
 
-    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
+    fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
         let mut bytes = Vec::with_capacity(item.serialized_size());
-        item.serialize_into(&mut bytes).ok()?;
-        Some(Cow::Owned(bytes))
+        item.serialize_into(&mut bytes)?;
+        Ok(Cow::Owned(bytes))
     }
 }
diff --git a/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs
index 8fae60df7..cf4997d26 100644
--- a/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs
+++ b/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs
@@ -1,6 +1,6 @@
 use std::mem;
 
-use heed::BytesDecode;
+use heed::{BoxedError, BytesDecode};
 
 use crate::heed_codec::BytesDecodeOwned;
 
@@ -9,15 +9,15 @@ pub struct
BoRoaringBitmapLenCodec; impl BytesDecode<'_> for BoRoaringBitmapLenCodec { type DItem = u64; - fn bytes_decode(bytes: &[u8]) -> Option { - Some((bytes.len() / mem::size_of::()) as u64) + fn bytes_decode(bytes: &[u8]) -> Result { + Ok((bytes.len() / mem::size_of::()) as u64) } } impl BytesDecodeOwned for BoRoaringBitmapLenCodec { type DItem = u64; - fn bytes_decode_owned(bytes: &[u8]) -> Option { + fn bytes_decode_owned(bytes: &[u8]) -> Result { Self::bytes_decode(bytes) } } diff --git a/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs index 5719a538a..c2565c939 100644 --- a/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs @@ -1,6 +1,6 @@ use std::mem; -use heed::BytesDecode; +use heed::{BoxedError, BytesDecode}; use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec}; use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD; @@ -11,7 +11,7 @@ pub struct CboRoaringBitmapLenCodec; impl BytesDecode<'_> for CboRoaringBitmapLenCodec { type DItem = u64; - fn bytes_decode(bytes: &[u8]) -> Option { + fn bytes_decode(bytes: &[u8]) -> Result { if bytes.len() <= THRESHOLD * mem::size_of::() { // If there is threshold or less than threshold integers that can fit into this array // of bytes it means that we used the ByteOrder codec serializer. @@ -27,7 +27,7 @@ impl BytesDecode<'_> for CboRoaringBitmapLenCodec { impl BytesDecodeOwned for CboRoaringBitmapLenCodec { type DItem = u64; - fn bytes_decode_owned(bytes: &[u8]) -> Option { + fn bytes_decode_owned(bytes: &[u8]) -> Result { Self::bytes_decode(bytes) } } diff --git a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs index a9b0506ff..578cb31e2 100644 --- a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs @@ -2,6 +2,7 @@ use std::io::{self, BufRead, Read}; use std::mem; use byteorder::{LittleEndian, ReadBytesExt}; +use heed::BoxedError; use crate::heed_codec::BytesDecodeOwned; @@ -56,16 +57,16 @@ impl RoaringBitmapLenCodec { impl heed::BytesDecode<'_> for RoaringBitmapLenCodec { type DItem = u64; - fn bytes_decode(bytes: &[u8]) -> Option { - RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok() + fn bytes_decode(bytes: &[u8]) -> Result { + RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into) } } impl BytesDecodeOwned for RoaringBitmapLenCodec { type DItem = u64; - fn bytes_decode_owned(bytes: &[u8]) -> Option { - RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok() + fn bytes_decode_owned(bytes: &[u8]) -> Result { + RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into) } } diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs index 83e8a7241..ef2ad4bec 100644 --- a/milli/src/heed_codec/script_language_codec.rs +++ b/milli/src/heed_codec/script_language_codec.rs @@ -1,30 +1,31 @@ use std::borrow::Cow; +use std::ffi::CStr; use std::str; use charabia::{Language, Script}; +use heed::BoxedError; pub struct ScriptLanguageCodec; impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec { type DItem = (Script, Language); - fn bytes_decode(bytes: &'a [u8]) -> Option { - let sep = bytes.iter().position(|b| *b == 0)?; - let (s_bytes, l_bytes) = 
bytes.split_at(sep); - let script = str::from_utf8(s_bytes).ok()?; + fn bytes_decode(bytes: &'a [u8]) -> Result { + let cstr = CStr::from_bytes_until_nul(bytes)?; + let script = cstr.to_str()?; let script_name = Script::from_name(script); - let lan = str::from_utf8(l_bytes).ok()?; // skip '\0' byte between the two strings. - let lan_name = Language::from_name(&lan[1..]); + let lan = str::from_utf8(&bytes[script.len() + 1..])?; + let lan_name = Language::from_name(lan); - Some((script_name, lan_name)) + Ok((script_name, lan_name)) } } impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec { type EItem = (Script, Language); - fn bytes_encode((script, lan): &Self::EItem) -> Option> { + fn bytes_encode((script, lan): &Self::EItem) -> Result, BoxedError> { let script_name = script.name().as_bytes(); let lan_name = lan.name().as_bytes(); @@ -33,6 +34,6 @@ impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec { bytes.push(0); bytes.extend_from_slice(lan_name); - Some(Cow::Owned(bytes)) + Ok(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/str_beu32_codec.rs b/milli/src/heed_codec/str_beu32_codec.rs index cce849e37..e3ffddcd6 100644 --- a/milli/src/heed_codec/str_beu32_codec.rs +++ b/milli/src/heed_codec/str_beu32_codec.rs @@ -3,37 +3,41 @@ use std::convert::TryInto; use std::mem::size_of; use std::str; +use heed::BoxedError; + +use super::SliceTooShortError; + pub struct StrBEU32Codec; impl<'a> heed::BytesDecode<'a> for StrBEU32Codec { type DItem = (&'a str, u32); - fn bytes_decode(bytes: &'a [u8]) -> Option { + fn bytes_decode(bytes: &'a [u8]) -> Result { let footer_len = size_of::(); if bytes.len() < footer_len { - return None; + return Err(SliceTooShortError.into()); } let (word, bytes) = bytes.split_at(bytes.len() - footer_len); - let word = str::from_utf8(word).ok()?; - let pos = bytes.try_into().map(u32::from_be_bytes).ok()?; + let word = str::from_utf8(word)?; + let pos = bytes.try_into().map(u32::from_be_bytes)?; - Some((word, pos)) + Ok((word, pos)) } } impl<'a> heed::BytesEncode<'a> for StrBEU32Codec { type EItem = (&'a str, u32); - fn bytes_encode((word, pos): &Self::EItem) -> Option> { + fn bytes_encode((word, pos): &Self::EItem) -> Result, BoxedError> { let pos = pos.to_be_bytes(); let mut bytes = Vec::with_capacity(word.len() + pos.len()); bytes.extend_from_slice(word.as_bytes()); bytes.extend_from_slice(&pos[..]); - Some(Cow::Owned(bytes)) + Ok(Cow::Owned(bytes)) } } @@ -42,26 +46,27 @@ pub struct StrBEU16Codec; impl<'a> heed::BytesDecode<'a> for StrBEU16Codec { type DItem = (&'a str, u16); - fn bytes_decode(bytes: &'a [u8]) -> Option { + fn bytes_decode(bytes: &'a [u8]) -> Result { let footer_len = size_of::(); if bytes.len() < footer_len + 1 { - return None; + return Err(SliceTooShortError.into()); } let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len); - let (_, word) = word_plus_nul_byte.split_last()?; - let word = str::from_utf8(word).ok()?; - let pos = bytes.try_into().map(u16::from_be_bytes).ok()?; + // unwrap: we just checked the footer + 1 above. 
+ let (_, word) = word_plus_nul_byte.split_last().unwrap(); + let word = str::from_utf8(word)?; + let pos = bytes.try_into().map(u16::from_be_bytes)?; - Some((word, pos)) + Ok((word, pos)) } } impl<'a> heed::BytesEncode<'a> for StrBEU16Codec { type EItem = (&'a str, u16); - fn bytes_encode((word, pos): &Self::EItem) -> Option> { + fn bytes_encode((word, pos): &Self::EItem) -> Result, BoxedError> { let pos = pos.to_be_bytes(); let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len()); @@ -69,6 +74,6 @@ impl<'a> heed::BytesEncode<'a> for StrBEU16Codec { bytes.push(0); bytes.extend_from_slice(&pos[..]); - Some(Cow::Owned(bytes)) + Ok(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/str_ref.rs b/milli/src/heed_codec/str_ref.rs index ced5cc65e..bdf262a46 100644 --- a/milli/src/heed_codec/str_ref.rs +++ b/milli/src/heed_codec/str_ref.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; -use heed::{BytesDecode, BytesEncode}; +use heed::{BoxedError, BytesDecode, BytesEncode}; /// A codec for values of type `&str`. Unlike `Str`, its `EItem` and `DItem` associated /// types are equivalent (= `&'a str`) and these values can reside within another structure. @@ -8,15 +8,14 @@ pub struct StrRefCodec; impl<'a> BytesEncode<'a> for StrRefCodec { type EItem = &'a str; - fn bytes_encode(item: &'a &'a str) -> Option> { - Some(Cow::Borrowed(item.as_bytes())) + fn bytes_encode(item: &'a &'a str) -> Result, BoxedError> { + Ok(Cow::Borrowed(item.as_bytes())) } } impl<'a> BytesDecode<'a> for StrRefCodec { type DItem = &'a str; - fn bytes_decode(bytes: &'a [u8]) -> Option { - let s = std::str::from_utf8(bytes).ok()?; - Some(s) + fn bytes_decode(bytes: &'a [u8]) -> Result { + std::str::from_utf8(bytes).map_err(Into::into) } } diff --git a/milli/src/heed_codec/str_str_u8_codec.rs b/milli/src/heed_codec/str_str_u8_codec.rs index 60be8ddc7..0aedf0c94 100644 --- a/milli/src/heed_codec/str_str_u8_codec.rs +++ b/milli/src/heed_codec/str_str_u8_codec.rs @@ -1,32 +1,36 @@ use std::borrow::Cow; +use std::ffi::CStr; use std::str; +use heed::BoxedError; + +use super::SliceTooShortError; + pub struct U8StrStrCodec; impl<'a> heed::BytesDecode<'a> for U8StrStrCodec { type DItem = (u8, &'a str, &'a str); - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (n, bytes) = bytes.split_first()?; - let s1_end = bytes.iter().position(|b| *b == 0)?; - let (s1_bytes, rest) = bytes.split_at(s1_end); - let s2_bytes = &rest[1..]; - let s1 = str::from_utf8(s1_bytes).ok()?; - let s2 = str::from_utf8(s2_bytes).ok()?; - Some((*n, s1, s2)) + fn bytes_decode(bytes: &'a [u8]) -> Result { + let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?; + let cstr = CStr::from_bytes_until_nul(bytes)?; + let s1 = cstr.to_str()?; + // skip '\0' byte between the two strings. 
+ let s2 = str::from_utf8(&bytes[s1.len() + 1..])?; + Ok((*n, s1, s2)) } } impl<'a> heed::BytesEncode<'a> for U8StrStrCodec { type EItem = (u8, &'a str, &'a str); - fn bytes_encode((n, s1, s2): &Self::EItem) -> Option> { + fn bytes_encode((n, s1, s2): &Self::EItem) -> Result, BoxedError> { let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); bytes.push(*n); bytes.extend_from_slice(s1.as_bytes()); bytes.push(0); bytes.extend_from_slice(s2.as_bytes()); - Some(Cow::Owned(bytes)) + Ok(Cow::Owned(bytes)) } } pub struct UncheckedU8StrStrCodec; @@ -34,24 +38,25 @@ pub struct UncheckedU8StrStrCodec; impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec { type DItem = (u8, &'a [u8], &'a [u8]); - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (n, bytes) = bytes.split_first()?; - let s1_end = bytes.iter().position(|b| *b == 0)?; - let (s1_bytes, rest) = bytes.split_at(s1_end); - let s2_bytes = &rest[1..]; - Some((*n, s1_bytes, s2_bytes)) + fn bytes_decode(bytes: &'a [u8]) -> Result { + let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?; + let cstr = CStr::from_bytes_until_nul(bytes)?; + let s1_bytes = cstr.to_bytes(); + // skip '\0' byte between the two strings. + let s2_bytes = &bytes[s1_bytes.len() + 1..]; + Ok((*n, s1_bytes, s2_bytes)) } } impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec { type EItem = (u8, &'a [u8], &'a [u8]); - fn bytes_encode((n, s1, s2): &Self::EItem) -> Option> { + fn bytes_encode((n, s1, s2): &Self::EItem) -> Result, BoxedError> { let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); bytes.push(*n); bytes.extend_from_slice(s1); bytes.push(0); bytes.extend_from_slice(s2); - Some(Cow::Owned(bytes)) + Ok(Cow::Owned(bytes)) } } diff --git a/milli/src/index.rs b/milli/src/index.rs index d563f852b..01a01ac37 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,20 +1,18 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fs::File; -use std::mem::size_of; use std::path::Path; use charabia::{Language, Script}; -use heed::flags::Flags; use heed::types::*; -use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn}; +use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use roaring::RoaringBitmap; use rstar::RTree; use time::OffsetDateTime; use crate::distance::NDotProductPoint; +use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; -use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, @@ -23,12 +21,13 @@ use crate::heed_codec::facet::{ use crate::heed_codec::{ BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec, }; +use crate::proximity::ProximityPrecision; use crate::readable_slices::ReadableSlices; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, OrderBy, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, - BEU32, + BEU32, BEU64, }; /// The HNSW data-structure that we serialize, fill and search in. 
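Two heed-0.20 changes recur throughout the `index.rs` diff below: codecs are now fallible (`Result<_, BoxedError>` instead of `Option`), and the formerly dynamically-typed `PolyDatabase` used for `main` becomes a `Database<Unspecified, Unspecified>` that is remapped with concrete codecs at each call site. A self-contained sketch of both, where `CommaPairCodec`, `read_pair`, and `"some-key"` are all hypothetical and only illustrate the pattern:

```rust
use std::borrow::Cow;

use heed::types::Str;
use heed::{BoxedError, BytesDecode, BytesEncode, Database, RoTxn, Unspecified};

// A toy codec following the new fallible interface: failures carry a
// boxed `std::error::Error` instead of collapsing into a bare `None`.
struct CommaPairCodec;

impl<'a> BytesEncode<'a> for CommaPairCodec {
    type EItem = (&'a str, &'a str);

    fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let (left, right) = *item;
        Ok(Cow::Owned(format!("{left},{right}").into_bytes()))
    }
}

impl<'a> BytesDecode<'a> for CommaPairCodec {
    type DItem = (&'a str, &'a str);

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let s = std::str::from_utf8(bytes)?; // the `Utf8Error` boxes into `BoxedError`
        s.split_once(',').ok_or_else(|| "missing comma separator".into())
    }
}

// The untyped database is declared once and remapped at every access,
// mirroring how `main` is used throughout this patch.
fn read_pair<'t>(
    main: Database<Unspecified, Unspecified>,
    rtxn: &'t RoTxn,
) -> heed::Result<Option<(&'t str, &'t str)>> {
    main.remap_types::<Str, CommaPairCodec>().get(rtxn, "some-key")
}
```

Any decoding failure surfaces as `heed::Error::Decoding(_)`, which the `error.rs` hunk earlier in this patch maps onto milli's internal serialization errors.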
@@ -42,7 +41,6 @@ pub mod main_key { pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; - pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids"; pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; @@ -54,17 +52,13 @@ pub mod main_key { /// It is concatenated with a big-endian encoded number (non-human readable). /// e.g. vector-hnsw0x0032. pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw"; - pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; - pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; - pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const STOP_WORDS_KEY: &str = "stop-words"; pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens"; pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens"; pub const DICTIONARY_KEY: &str = "dictionary"; - pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; pub const SYNONYMS_KEY: &str = "synonyms"; pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms"; pub const WORDS_FST_KEY: &str = "words-fst"; @@ -79,6 +73,7 @@ pub mod main_key { pub const MAX_VALUES_PER_FACET: &str = "max-values-per-facet"; pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by"; pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits"; + pub const PROXIMITY_PRECISION: &str = "proximity-precision"; } pub mod db_name { @@ -87,10 +82,9 @@ pub mod db_name { pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids"; + pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; - pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; - pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids"; pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; @@ -116,26 +110,25 @@ pub struct Index { pub(crate) env: heed::Env, /// Contains many different types (e.g. the fields ids map). - pub(crate) main: PolyDatabase, + pub(crate) main: Database, + + /// Maps the external documents ids with the internal document id. + pub external_documents_ids: Database, /// A word and all the documents ids containing the word. - pub word_docids: Database, + pub word_docids: Database, /// A word and all the documents ids containing the word, from attributes for which typos are not allowed. - pub exact_word_docids: Database, + pub exact_word_docids: Database, /// A prefix of word and all the documents ids containing this prefix. 
- pub word_prefix_docids: Database, + pub word_prefix_docids: Database, /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. - pub exact_word_prefix_docids: Database, + pub exact_word_prefix_docids: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, - /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. - pub word_prefix_pair_proximity_docids: Database, - /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears. - pub prefix_word_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. pub word_position_docids: Database, @@ -166,7 +159,7 @@ pub struct Index { /// Maps the facet field id of the normalized-for-search string facets with their original versions. pub facet_id_normalized_string_strings: Database>>, /// Maps the facet field id of the string facets with an FST containing all the facets values. - pub facet_id_string_fst: Database, FstSetCodec>, + pub facet_id_string_fst: Database, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, @@ -174,10 +167,10 @@ pub struct Index { pub field_id_docid_facet_strings: Database, /// Maps a vector id to the document id that have it. - pub vector_id_docid: Database, OwnedType>, + pub vector_id_docid: Database, /// Maps the document id to the document as an obkv store. - pub(crate) documents: Database, ObkvCodec>, + pub(crate) documents: Database, } impl Index { @@ -189,13 +182,14 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(25); - unsafe { options.flag(Flags::MdbAlwaysFreePages) }; + options.max_dbs(24); let env = options.open(path)?; let mut wtxn = env.write_txn()?; - let main = env.create_poly_database(&mut wtxn, Some(MAIN))?; + let main = env.database_options().name(MAIN).create(&mut wtxn)?; let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?; + let external_documents_ids = + env.create_database(&mut wtxn, Some(EXTERNAL_DOCUMENTS_IDS))?; let exact_word_docids = env.create_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?; let exact_word_prefix_docids = @@ -204,10 +198,6 @@ impl Index { env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let script_language_docids = env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?; - let word_prefix_pair_proximity_docids = - env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; - let prefix_word_pair_proximity_docids = - env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?; let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?; let field_id_word_count_docids = @@ -241,14 +231,13 @@ impl Index { Ok(Index { env, main, + external_documents_ids, word_docids, exact_word_docids, word_prefix_docids, exact_word_prefix_docids, word_pair_proximity_docids, script_language_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, word_position_docids, word_fid_docids, word_prefix_position_docids, @@ -275,24 +264,16 @@ impl Index { fn set_creation_dates( env: &heed::Env, - main: PolyDatabase, + main: Database, created_at: OffsetDateTime, 
updated_at: OffsetDateTime, ) -> heed::Result<()> { let mut txn = env.write_txn()?; // The db was just created, we update its metadata with the relevant information. - if main.get::<_, Str, SerdeJson>(&txn, main_key::CREATED_AT_KEY)?.is_none() - { - main.put::<_, Str, SerdeJson>( - &mut txn, - main_key::UPDATED_AT_KEY, - &updated_at, - )?; - main.put::<_, Str, SerdeJson>( - &mut txn, - main_key::CREATED_AT_KEY, - &created_at, - )?; + let main = main.remap_types::>(); + if main.get(&txn, main_key::CREATED_AT_KEY)?.is_none() { + main.put(&mut txn, main_key::UPDATED_AT_KEY, &updated_at)?; + main.put(&mut txn, main_key::CREATED_AT_KEY, &created_at)?; txn.commit()?; } Ok(()) @@ -329,12 +310,12 @@ impl Index { /// /// This value is the maximum between the map size passed during the opening of the index /// and the on-disk size of the index at the time of opening. - pub fn map_size(&self) -> Result { - Ok(self.env.map_size()?) + pub fn map_size(&self) -> usize { + self.env.info().map_size } - pub fn copy_to_path>(&self, path: P, option: CompactionOption) -> Result { - self.env.copy_to_path(path, option).map_err(Into::into) + pub fn copy_to_file>(&self, path: P, option: CompactionOption) -> Result { + self.env.copy_to_file(path, option).map_err(Into::into) } /// Returns an `EnvClosingEvent` that can be used to wait for the closing event, @@ -354,45 +335,29 @@ impl Index { wtxn: &mut RwTxn, docids: &RoaringBitmap, ) -> heed::Result<()> { - self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) + self.main.remap_types::().put( + wtxn, + main_key::DOCUMENTS_IDS_KEY, + docids, + ) } /// Returns the internal documents ids. pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result { Ok(self .main - .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)? + .remap_types::() + .get(rtxn, main_key::DOCUMENTS_IDS_KEY)? .unwrap_or_default()) } /// Returns the number of documents indexed in the database. pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result { - let count = - self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; - Ok(count.unwrap_or_default()) - } - - /* deleted documents ids */ - - /// Writes the soft deleted documents ids. - pub(crate) fn put_soft_deleted_documents_ids( - &self, - wtxn: &mut RwTxn, - docids: &RoaringBitmap, - ) -> heed::Result<()> { - self.main.put::<_, Str, RoaringBitmapCodec>( - wtxn, - main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY, - docids, - ) - } - - /// Returns the soft deleted documents ids. - pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self + let count = self .main - .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)? - .unwrap_or_default()) + .remap_types::() + .get(rtxn, main_key::DOCUMENTS_IDS_KEY)?; + Ok(count.unwrap_or_default()) } /* primary key */ @@ -400,60 +365,25 @@ impl Index { /// Writes the documents primary key, this is the field name that is used to store the id. pub(crate) fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { self.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, primary_key) + self.main.remap_types::().put(wtxn, main_key::PRIMARY_KEY_KEY, primary_key) } /// Deletes the primary key of the documents, this can be done to reset indexes settings. 
pub(crate) fn delete_primary_key(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::PRIMARY_KEY_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::PRIMARY_KEY_KEY) } /// Returns the documents primary key, `None` if it hasn't been defined. pub fn primary_key<'t>(&self, rtxn: &'t RoTxn) -> heed::Result> { - self.main.get::<_, Str, Str>(rtxn, main_key::PRIMARY_KEY_KEY) + self.main.remap_types::().get(rtxn, main_key::PRIMARY_KEY_KEY) } /* external documents ids */ - /// Writes the external documents ids and internal ids (i.e. `u32`). - pub(crate) fn put_external_documents_ids( - &self, - wtxn: &mut RwTxn, - external_documents_ids: &ExternalDocumentsIds<'_>, - ) -> heed::Result<()> { - let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids; - let hard = hard.as_fst().as_bytes(); - let soft = soft.as_fst().as_bytes(); - self.main.put::<_, Str, ByteSlice>( - wtxn, - main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, - hard, - )?; - self.main.put::<_, Str, ByteSlice>( - wtxn, - main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, - soft, - )?; - Ok(()) - } - /// Returns the external documents ids map which associate the external ids /// with the internal ids (i.e. `u32`). - pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result> { - let hard = - self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; - let soft = - self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; - let hard = match hard { - Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, - None => fst::Map::default().map_data(Cow::Owned)?, - }; - let soft = match soft { - Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?, - None => fst::Map::default().map_data(Cow::Owned)?, - }; - let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?; - Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids)) + pub fn external_documents_ids(&self) -> ExternalDocumentsIds { + ExternalDocumentsIds::new(self.external_documents_ids) } /* fields ids map */ @@ -465,7 +395,11 @@ impl Index { wtxn: &mut RwTxn, map: &FieldsIdsMap, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) + self.main.remap_types::>().put( + wtxn, + main_key::FIELDS_IDS_MAP_KEY, + map, + ) } /// Returns the fields ids map which associate the documents keys with an internal field id @@ -473,7 +407,8 @@ impl Index { pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result { Ok(self .main - .get::<_, Str, SerdeJson>(rtxn, main_key::FIELDS_IDS_MAP_KEY)? + .remap_types::>() + .get(rtxn, main_key::FIELDS_IDS_MAP_KEY)? .unwrap_or_default()) } @@ -485,19 +420,24 @@ impl Index { wtxn: &mut RwTxn, rtree: &RTree, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode>>(wtxn, main_key::GEO_RTREE_KEY, rtree) + self.main.remap_types::>>().put( + wtxn, + main_key::GEO_RTREE_KEY, + rtree, + ) } /// Delete the `rtree` which associates coordinates to documents ids. pub(crate) fn delete_geo_rtree(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::GEO_RTREE_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::GEO_RTREE_KEY) } /// Returns the `rtree` which associates coordinates to documents ids. pub fn geo_rtree(&self, rtxn: &RoTxn) -> Result>> { match self .main - .get::<_, Str, SerdeBincode>>(rtxn, main_key::GEO_RTREE_KEY)? + .remap_types::>>() + .get(rtxn, main_key::GEO_RTREE_KEY)? 
{ Some(rtree) => Ok(Some(rtree)), None => Ok(None), @@ -512,7 +452,7 @@ impl Index { wtxn: &mut RwTxn, docids: &RoaringBitmap, ) -> heed::Result<()> { - self.main.put::<_, Str, RoaringBitmapCodec>( + self.main.remap_types::().put( wtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY, docids, @@ -521,14 +461,15 @@ impl Index { /// Delete the documents ids that are faceted with a _geo field. pub(crate) fn delete_geo_faceted_documents_ids(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY) } /// Retrieve all the documents ids that are faceted with a _geo field. pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { match self .main - .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY)? + .remap_types::() + .get(rtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY)? { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), @@ -543,22 +484,22 @@ impl Index { self.delete_vector_hnsw(wtxn)?; let chunk_size = 1024 * 1024 * (1024 + 512); // 1.5 GiB - let bytes = bincode::serialize(hnsw).map_err(|_| heed::Error::Encoding)?; + let bytes = bincode::serialize(hnsw).map_err(Into::into).map_err(heed::Error::Encoding)?; for (i, chunk) in bytes.chunks(chunk_size).enumerate() { let i = i as u32; let mut key = main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes().to_vec(); key.extend_from_slice(&i.to_be_bytes()); - self.main.put::<_, ByteSlice, ByteSlice>(wtxn, &key, chunk)?; + self.main.remap_types::().put(wtxn, &key, chunk)?; } Ok(()) } /// Delete the `hnsw`. pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result { - let mut iter = self.main.prefix_iter_mut::<_, ByteSlice, DecodeIgnore>( - wtxn, - main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes(), - )?; + let mut iter = self + .main + .remap_types::() + .prefix_iter_mut(wtxn, main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes())?; let mut deleted = false; while iter.next().transpose()?.is_some() { // We do not keep a reference to the key or the value. @@ -570,8 +511,10 @@ impl Index { /// Returns the `hnsw`. pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result> { let mut slices = Vec::new(); - for result in - self.main.prefix_iter::<_, Str, ByteSlice>(rtxn, main_key::VECTOR_HNSW_KEY_PREFIX)? + for result in self + .main + .remap_types::() + .prefix_iter(rtxn, main_key::VECTOR_HNSW_KEY_PREFIX)? { let (_, slice) = result?; slices.push(slice); @@ -581,7 +524,11 @@ impl Index { Ok(None) } else { let readable_slices: ReadableSlices<_> = slices.into_iter().collect(); - Ok(Some(bincode::deserialize_from(readable_slices).map_err(|_| heed::Error::Decoding)?)) + Ok(Some( + bincode::deserialize_from(readable_slices) + .map_err(Into::into) + .map_err(heed::Error::Decoding)?, + )) } } @@ -594,7 +541,7 @@ impl Index { wtxn: &mut RwTxn, distribution: &FieldDistribution, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>( + self.main.remap_types::>().put( wtxn, main_key::FIELD_DISTRIBUTION_KEY, distribution, @@ -606,7 +553,8 @@ impl Index { pub fn field_distribution(&self, rtxn: &RoTxn) -> heed::Result { Ok(self .main - .get::<_, Str, SerdeJson>(rtxn, main_key::FIELD_DISTRIBUTION_KEY)? + .remap_types::>() + .get(rtxn, main_key::FIELD_DISTRIBUTION_KEY)? 
.unwrap_or_default()) } @@ -619,7 +567,7 @@ impl Index { wtxn: &mut RwTxn, fields: &[&str], ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>( + self.main.remap_types::>().put( wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields, @@ -629,13 +577,15 @@ impl Index { /// Deletes the displayed fields ids, this will make the engine to display /// all the documents attributes in the order of the `FieldsIdsMap`. pub(crate) fn delete_displayed_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::DISPLAYED_FIELDS_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::DISPLAYED_FIELDS_KEY) } /// Returns the displayed fields in the order they were set by the user. If it returns /// `None` it means that all the attributes are set as displayed in the order of the `FieldsIdsMap`. pub fn displayed_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { - self.main.get::<_, Str, SerdeBincode>>(rtxn, main_key::DISPLAYED_FIELDS_KEY) + self.main + .remap_types::>>() + .get(rtxn, main_key::DISPLAYED_FIELDS_KEY) } /// Identical to `displayed_fields`, but returns the ids instead. @@ -715,7 +665,7 @@ impl Index { /// Writes the searchable fields, when this list is specified, only these are indexed. fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>( + self.main.remap_types::>().put( wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields, @@ -724,13 +674,15 @@ impl Index { /// Deletes the searchable fields, when no fields are specified, all fields are indexed. fn delete_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::SEARCHABLE_FIELDS_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::SEARCHABLE_FIELDS_KEY) } /// Returns the searchable fields, those are the fields that are indexed, /// if the searchable fields aren't there it means that **all** the fields are indexed. pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { - self.main.get::<_, Str, SerdeBincode>>(rtxn, main_key::SEARCHABLE_FIELDS_KEY) + self.main + .remap_types::>>() + .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY) } /// Identical to `searchable_fields`, but returns the ids instead. @@ -756,7 +708,7 @@ impl Index { wtxn: &mut RwTxn, fields: &[&str], ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<_>>( + self.main.remap_types::>().put( wtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY, &fields, @@ -768,7 +720,7 @@ impl Index { &self, wtxn: &mut RwTxn, ) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) } /// Returns the user defined searchable fields. @@ -777,7 +729,8 @@ impl Index { rtxn: &'t RoTxn, ) -> heed::Result>> { self.main - .get::<_, Str, SerdeBincode>>(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) + .remap_types::>>() + .get(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) } /* filterable fields */ @@ -788,19 +741,24 @@ impl Index { wtxn: &mut RwTxn, fields: &HashSet, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) + self.main.remap_types::>().put( + wtxn, + main_key::FILTERABLE_FIELDS_KEY, + fields, + ) } /// Deletes the filterable fields ids in the database. 
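// Aside: every settings accessor in this file now follows the same
// put/delete/get trio. Sketched in isolation for the displayed fields (the
// key constant is illustrative); `None` on read means "display all fields in
// `FieldsIdsMap` order", matching the doc comments around this point:
use heed::types::{SerdeBincode, Str};
use heed::{Database, RoTxn, RwTxn, Unspecified};

const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";

fn put_displayed_fields(
    main: Database<Unspecified, Unspecified>,
    wtxn: &mut RwTxn,
    fields: &Vec<String>,
) -> heed::Result<()> {
    main.remap_types::<Str, SerdeBincode<Vec<String>>>().put(wtxn, DISPLAYED_FIELDS_KEY, fields)
}

fn delete_displayed_fields(
    main: Database<Unspecified, Unspecified>,
    wtxn: &mut RwTxn,
) -> heed::Result<bool> {
    // Only the key type matters for a delete, hence `remap_key_type`.
    main.remap_key_type::<Str>().delete(wtxn, DISPLAYED_FIELDS_KEY)
}

fn displayed_fields(
    main: Database<Unspecified, Unspecified>,
    rtxn: &RoTxn,
) -> heed::Result<Option<Vec<String>>> {
    main.remap_types::<Str, SerdeBincode<Vec<String>>>().get(rtxn, DISPLAYED_FIELDS_KEY)
}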
pub(crate) fn delete_filterable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::FILTERABLE_FIELDS_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::FILTERABLE_FIELDS_KEY) } /// Returns the filterable fields names. pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result> { Ok(self .main - .get::<_, Str, SerdeJson<_>>(rtxn, main_key::FILTERABLE_FIELDS_KEY)? + .remap_types::>() + .get(rtxn, main_key::FILTERABLE_FIELDS_KEY)? .unwrap_or_default()) } @@ -827,19 +785,24 @@ impl Index { wtxn: &mut RwTxn, fields: &HashSet, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::SORTABLE_FIELDS_KEY, fields) + self.main.remap_types::>().put( + wtxn, + main_key::SORTABLE_FIELDS_KEY, + fields, + ) } /// Deletes the sortable fields ids in the database. pub(crate) fn delete_sortable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::SORTABLE_FIELDS_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::SORTABLE_FIELDS_KEY) } /// Returns the sortable fields names. pub fn sortable_fields(&self, rtxn: &RoTxn) -> heed::Result> { Ok(self .main - .get::<_, Str, SerdeJson<_>>(rtxn, main_key::SORTABLE_FIELDS_KEY)? + .remap_types::>() + .get(rtxn, main_key::SORTABLE_FIELDS_KEY)? .unwrap_or_default()) } @@ -858,14 +821,19 @@ impl Index { wtxn: &mut RwTxn, fields: &HashSet, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::HIDDEN_FACETED_FIELDS_KEY, fields) + self.main.remap_types::>().put( + wtxn, + main_key::HIDDEN_FACETED_FIELDS_KEY, + fields, + ) } /// Returns the faceted fields names. pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result> { Ok(self .main - .get::<_, Str, SerdeJson<_>>(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)? + .remap_types::>() + .get(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)? .unwrap_or_default()) } @@ -926,51 +894,13 @@ impl Index { /* faceted documents ids */ - /// Writes the documents ids that are faceted under this field id for the given facet type. - pub fn put_faceted_documents_ids( - &self, - wtxn: &mut RwTxn, - field_id: FieldId, - facet_type: FacetType, - docids: &RoaringBitmap, - ) -> heed::Result<()> { - let key = match facet_type { - FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, - FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, - }; - let mut buffer = vec![0u8; key.len() + size_of::()]; - buffer[..key.len()].copy_from_slice(key.as_bytes()); - buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); - self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) - } - - /// Retrieve all the documents ids that are faceted under this field id for the given facet type. - pub fn faceted_documents_ids( - &self, - rtxn: &RoTxn, - field_id: FieldId, - facet_type: FacetType, - ) -> heed::Result { - let key = match facet_type { - FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, - FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, - }; - let mut buffer = vec![0u8; key.len() + size_of::()]; - buffer[..key.len()].copy_from_slice(key.as_bytes()); - buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); - match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? 
{ - Some(docids) => Ok(docids), - None => Ok(RoaringBitmap::new()), - } - } - /// Retrieve all the documents which contain this field id set as null pub fn null_faceted_documents_ids( &self, rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - match self.facet_id_is_null_docids.get(rtxn, &BEU16::new(field_id))? { + match self.facet_id_is_null_docids.get(rtxn, &field_id)? { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), } @@ -982,7 +912,7 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - match self.facet_id_is_empty_docids.get(rtxn, &BEU16::new(field_id))? { + match self.facet_id_is_empty_docids.get(rtxn, &field_id)? { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), } @@ -994,7 +924,7 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - match self.facet_id_exists_docids.get(rtxn, &BEU16::new(field_id))? { + match self.facet_id_exists_docids.get(rtxn, &field_id)? { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), } @@ -1007,15 +937,15 @@ impl Index { wtxn: &mut RwTxn, distinct_field: &str, ) -> heed::Result<()> { - self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field) + self.main.remap_types::().put(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field) } pub fn distinct_field<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { - self.main.get::<_, Str, Str>(rtxn, main_key::DISTINCT_FIELD_KEY) + self.main.remap_types::().get(rtxn, main_key::DISTINCT_FIELD_KEY) } pub(crate) fn delete_distinct_field(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::DISTINCT_FIELD_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::DISTINCT_FIELD_KEY) } /* criteria */ @@ -1025,15 +955,23 @@ impl Index { wtxn: &mut RwTxn, criteria: &[Criterion], ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) + self.main.remap_types::>().put( + wtxn, + main_key::CRITERIA_KEY, + &criteria, + ) } pub(crate) fn delete_criteria(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::CRITERIA_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::CRITERIA_KEY) } pub fn criteria(&self, rtxn: &RoTxn) -> heed::Result> { - match self.main.get::<_, Str, SerdeJson>>(rtxn, main_key::CRITERIA_KEY)? { + match self + .main + .remap_types::>>() + .get(rtxn, main_key::CRITERIA_KEY)? + { Some(criteria) => Ok(criteria), None => Ok(default_criteria()), } @@ -1047,12 +985,16 @@ impl Index { wtxn: &mut RwTxn, fst: &fst::Set, ) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) + self.main.remap_types::().put( + wtxn, + main_key::WORDS_FST_KEY, + fst.as_fst().as_bytes(), + ) } /// Returns the FST which is the words dictionary of the engine. pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::WORDS_FST_KEY)? { + match self.main.remap_types::().get(rtxn, main_key::WORDS_FST_KEY)? 
{ Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), None => Ok(fst::Set::default().map_data(Cow::Owned)?), } @@ -1065,15 +1007,19 @@ impl Index { wtxn: &mut RwTxn, fst: &fst::Set, ) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) + self.main.remap_types::().put( + wtxn, + main_key::STOP_WORDS_KEY, + fst.as_fst().as_bytes(), + ) } pub(crate) fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::STOP_WORDS_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::STOP_WORDS_KEY) } pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::STOP_WORDS_KEY)? { + match self.main.remap_types::().get(rtxn, main_key::STOP_WORDS_KEY)? { Some(bytes) => Ok(Some(fst::Set::new(bytes)?)), None => Ok(None), } @@ -1086,18 +1032,22 @@ impl Index { wtxn: &mut RwTxn, set: &BTreeSet, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::NON_SEPARATOR_TOKENS_KEY, set) + self.main.remap_types::>().put( + wtxn, + main_key::NON_SEPARATOR_TOKENS_KEY, + set, + ) } pub(crate) fn delete_non_separator_tokens(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::NON_SEPARATOR_TOKENS_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::NON_SEPARATOR_TOKENS_KEY) } pub fn non_separator_tokens(&self, rtxn: &RoTxn) -> Result>> { - Ok(self.main.get::<_, Str, SerdeBincode>>( - rtxn, - main_key::NON_SEPARATOR_TOKENS_KEY, - )?) + Ok(self + .main + .remap_types::>>() + .get(rtxn, main_key::NON_SEPARATOR_TOKENS_KEY)?) } /* separator tokens */ @@ -1107,17 +1057,22 @@ impl Index { wtxn: &mut RwTxn, set: &BTreeSet, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SEPARATOR_TOKENS_KEY, set) + self.main.remap_types::>().put( + wtxn, + main_key::SEPARATOR_TOKENS_KEY, + set, + ) } pub(crate) fn delete_separator_tokens(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::SEPARATOR_TOKENS_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::SEPARATOR_TOKENS_KEY) } pub fn separator_tokens(&self, rtxn: &RoTxn) -> Result>> { Ok(self .main - .get::<_, Str, SerdeBincode>>(rtxn, main_key::SEPARATOR_TOKENS_KEY)?) + .remap_types::>>() + .get(rtxn, main_key::SEPARATOR_TOKENS_KEY)?) } /* separators easing method */ @@ -1147,17 +1102,18 @@ impl Index { wtxn: &mut RwTxn, set: &BTreeSet, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::DICTIONARY_KEY, set) + self.main.remap_types::>().put(wtxn, main_key::DICTIONARY_KEY, set) } pub(crate) fn delete_dictionary(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::DICTIONARY_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::DICTIONARY_KEY) } pub fn dictionary(&self, rtxn: &RoTxn) -> Result>> { Ok(self .main - .get::<_, Str, SerdeBincode>>(rtxn, main_key::DICTIONARY_KEY)?) + .remap_types::>>() + .get(rtxn, main_key::DICTIONARY_KEY)?) 
} /* synonyms */ @@ -1168,8 +1124,12 @@ impl Index { synonyms: &HashMap, Vec>>, user_defined_synonyms: &BTreeMap>, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)?; - self.main.put::<_, Str, SerdeBincode<_>>( + self.main.remap_types::>().put( + wtxn, + main_key::SYNONYMS_KEY, + synonyms, + )?; + self.main.remap_types::>().put( wtxn, main_key::USER_DEFINED_SYNONYMS_KEY, user_defined_synonyms, @@ -1177,8 +1137,8 @@ impl Index { } pub(crate) fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY)?; - self.main.delete::<_, Str>(wtxn, main_key::USER_DEFINED_SYNONYMS_KEY) + self.main.remap_key_type::().delete(wtxn, main_key::SYNONYMS_KEY)?; + self.main.remap_key_type::().delete(wtxn, main_key::USER_DEFINED_SYNONYMS_KEY) } pub fn user_defined_synonyms( @@ -1187,14 +1147,16 @@ impl Index { ) -> heed::Result>> { Ok(self .main - .get::<_, Str, SerdeBincode<_>>(rtxn, main_key::USER_DEFINED_SYNONYMS_KEY)? + .remap_types::>() + .get(rtxn, main_key::USER_DEFINED_SYNONYMS_KEY)? .unwrap_or_default()) } pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result, Vec>>> { Ok(self .main - .get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)? + .remap_types::>() + .get(rtxn, main_key::SYNONYMS_KEY)? .unwrap_or_default()) } @@ -1215,7 +1177,7 @@ impl Index { wtxn: &mut RwTxn, fst: &fst::Set, ) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>( + self.main.remap_types::().put( wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes(), @@ -1224,7 +1186,7 @@ impl Index { /// Returns the FST which is the words prefixes dictionnary of the engine. pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? { + match self.main.remap_types::().get(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? { Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), None => Ok(fst::Set::default().map_data(Cow::Owned)?), } @@ -1246,15 +1208,10 @@ impl Index { rtxn: &'t RoTxn, ids: impl IntoIterator + 'a, ) -> Result)>> + 'a> { - let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; - Ok(ids.into_iter().map(move |id| { - if soft_deleted_documents.contains(id) { - return Err(UserError::AccessingSoftDeletedDocument { document_id: id })?; - } let kv = self .documents - .get(rtxn, &BEU32::new(id))? + .get(rtxn, &id)? .ok_or(UserError::UnknownInternalDocumentId { document_id: id })?; Ok((id, kv)) })) @@ -1277,6 +1234,36 @@ impl Index { self.iter_documents(rtxn, self.documents_ids(rtxn)?) } + pub fn external_id_of<'a, 't: 'a>( + &'a self, + rtxn: &'t RoTxn, + ids: impl IntoIterator + 'a, + ) -> Result> + 'a> { + let fields = self.fields_ids_map(rtxn)?; + + // uses precondition "never called on an empty index" + let primary_key = self.primary_key(rtxn)?.ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + })?; + let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| { + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldName { + field_name: primary_key.to_owned(), + process: "external_id_of", + }) + })?; + Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> { + let (_docid, obkv) = entry?; + match primary_key.document_id(&obkv, &fields)? 
{ + Ok(document_id) => Ok(document_id), + Err(_) => Err(InternalError::DocumentsError( + crate::documents::Error::InvalidDocumentFormat, + ) + .into()), + } + })) + } + pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { FacetDistribution::new(rtxn, self) } @@ -1289,7 +1276,8 @@ impl Index { pub fn created_at(&self, rtxn: &RoTxn) -> Result { Ok(self .main - .get::<_, Str, SerdeJson>(rtxn, main_key::CREATED_AT_KEY)? + .remap_types::>() + .get(rtxn, main_key::CREATED_AT_KEY)? .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, key: Some(main_key::CREATED_AT_KEY), @@ -1300,7 +1288,8 @@ impl Index { pub fn updated_at(&self, rtxn: &RoTxn) -> Result { Ok(self .main - .get::<_, Str, SerdeJson>(rtxn, main_key::UPDATED_AT_KEY)? + .remap_types::>() + .get(rtxn, main_key::UPDATED_AT_KEY)? .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, key: Some(main_key::UPDATED_AT_KEY), @@ -1312,14 +1301,18 @@ impl Index { wtxn: &mut RwTxn, time: &OffsetDateTime, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>(wtxn, main_key::UPDATED_AT_KEY, time) + self.main.remap_types::>().put( + wtxn, + main_key::UPDATED_AT_KEY, + time, + ) } pub fn authorize_typos(&self, txn: &RoTxn) -> heed::Result { // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We // identify 0 as being false, and anything else as true. The absence of a value is true, // because by default, we authorize typos. - match self.main.get::<_, Str, OwnedType>(txn, main_key::AUTHORIZE_TYPOS)? { + match self.main.remap_types::().get(txn, main_key::AUTHORIZE_TYPOS)? { Some(0) => Ok(false), _ => Ok(true), } @@ -1329,7 +1322,7 @@ impl Index { // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We // identify 0 as being false, and anything else as true. The absence of a value is true, // because by default, we authorize typos. - self.main.put::<_, Str, OwnedType>(txn, main_key::AUTHORIZE_TYPOS, &(flag as u8))?; + self.main.remap_types::().put(txn, main_key::AUTHORIZE_TYPOS, &(flag as u8))?; Ok(()) } @@ -1340,7 +1333,8 @@ impl Index { // because by default, we authorize typos. Ok(self .main - .get::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN)? + .remap_types::() + .get(txn, main_key::ONE_TYPO_WORD_LEN)? .unwrap_or(DEFAULT_MIN_WORD_LEN_ONE_TYPO)) } @@ -1348,7 +1342,7 @@ impl Index { // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We // identify 0 as being false, and anything else as true. The absence of a value is true, // because by default, we authorize typos. - self.main.put::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN, &val)?; + self.main.remap_types::().put(txn, main_key::ONE_TYPO_WORD_LEN, &val)?; Ok(()) } @@ -1358,7 +1352,8 @@ impl Index { // because by default, we authorize typos. Ok(self .main - .get::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN)? + .remap_types::() + .get(txn, main_key::TWO_TYPOS_WORD_LEN)? .unwrap_or(DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) } @@ -1366,13 +1361,13 @@ impl Index { // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We // identify 0 as being false, and anything else as true. The absence of a value is true, // because by default, we authorize typos. 
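// The convention described just above, in isolation: heed has no boolean
// codec, so flags are stored through `U8`, and a missing key falls back to
// the default. A sketch assuming heed 0.20 (the key constant is
// illustrative):
use heed::types::{Str, U8};
use heed::{Database, RoTxn, RwTxn, Unspecified};

const AUTHORIZE_TYPOS_KEY: &str = "authorize-typos";

fn put_authorize_typos(
    main: Database<Unspecified, Unspecified>,
    wtxn: &mut RwTxn,
    flag: bool,
) -> heed::Result<()> {
    main.remap_types::<Str, U8>().put(wtxn, AUTHORIZE_TYPOS_KEY, &(flag as u8))
}

fn authorize_typos(
    main: Database<Unspecified, Unspecified>,
    rtxn: &RoTxn,
) -> heed::Result<bool> {
    match main.remap_types::<Str, U8>().get(rtxn, AUTHORIZE_TYPOS_KEY)? {
        Some(0) => Ok(false),
        // Any non-zero value, or no value at all, means typos are authorized.
        _ => Ok(true),
    }
}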
- self.main.put::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?; + self.main.remap_types::().put(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?; Ok(()) } /// List the words on which typo are not allowed pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result>>> { - match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? { + match self.main.remap_types::().get(txn, main_key::EXACT_WORDS)? { Some(bytes) => Ok(Some(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?)), None => Ok(None), } @@ -1383,7 +1378,7 @@ impl Index { txn: &mut RwTxn, words: &fst::Set, ) -> Result<()> { - self.main.put::<_, Str, ByteSlice>( + self.main.remap_types::().put( txn, main_key::EXACT_WORDS, words.as_fst().as_bytes(), @@ -1395,7 +1390,8 @@ impl Index { pub fn exact_attributes<'t>(&self, txn: &'t RoTxn) -> Result> { Ok(self .main - .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? + .remap_types::>>() + .get(txn, main_key::EXACT_ATTRIBUTES)? .unwrap_or_default()) } @@ -1408,34 +1404,36 @@ impl Index { /// Writes the exact attributes to the database. pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; + self.main.remap_types::>().put( + txn, + main_key::EXACT_ATTRIBUTES, + &attrs, + )?; Ok(()) } /// Clears the exact attributes from the store. pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES) + self.main.remap_key_type::().delete(txn, main_key::EXACT_ATTRIBUTES) } - pub fn max_values_per_facet(&self, txn: &RoTxn) -> heed::Result> { - self.main.get::<_, Str, OwnedType>(txn, main_key::MAX_VALUES_PER_FACET) + pub fn max_values_per_facet(&self, txn: &RoTxn) -> heed::Result> { + self.main.remap_types::().get(txn, main_key::MAX_VALUES_PER_FACET) } - pub(crate) fn put_max_values_per_facet(&self, txn: &mut RwTxn, val: usize) -> heed::Result<()> { - self.main.put::<_, Str, OwnedType>(txn, main_key::MAX_VALUES_PER_FACET, &val) + pub(crate) fn put_max_values_per_facet(&self, txn: &mut RwTxn, val: u64) -> heed::Result<()> { + self.main.remap_types::().put(txn, main_key::MAX_VALUES_PER_FACET, &val) } pub(crate) fn delete_max_values_per_facet(&self, txn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(txn, main_key::MAX_VALUES_PER_FACET) + self.main.remap_key_type::().delete(txn, main_key::MAX_VALUES_PER_FACET) } pub fn sort_facet_values_by(&self, txn: &RoTxn) -> heed::Result> { let mut orders = self .main - .get::<_, Str, SerdeJson>>( - txn, - main_key::SORT_FACET_VALUES_BY, - )? + .remap_types::>>() + .get(txn, main_key::SORT_FACET_VALUES_BY)? .unwrap_or_default(); // Insert the default ordering if it is not already overwritten by the user. 
orders.entry("*".to_string()).or_insert(OrderBy::Lexicographic); @@ -1447,27 +1445,49 @@ impl Index { txn: &mut RwTxn, val: &HashMap, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<_>>(txn, main_key::SORT_FACET_VALUES_BY, &val) + self.main.remap_types::>().put(txn, main_key::SORT_FACET_VALUES_BY, &val) } pub(crate) fn delete_sort_facet_values_by(&self, txn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(txn, main_key::SORT_FACET_VALUES_BY) + self.main.remap_key_type::().delete(txn, main_key::SORT_FACET_VALUES_BY) } - pub fn pagination_max_total_hits(&self, txn: &RoTxn) -> heed::Result> { - self.main.get::<_, Str, OwnedType>(txn, main_key::PAGINATION_MAX_TOTAL_HITS) + pub fn pagination_max_total_hits(&self, txn: &RoTxn) -> heed::Result> { + self.main.remap_types::().get(txn, main_key::PAGINATION_MAX_TOTAL_HITS) } pub(crate) fn put_pagination_max_total_hits( &self, txn: &mut RwTxn, - val: usize, + val: u64, ) -> heed::Result<()> { - self.main.put::<_, Str, OwnedType>(txn, main_key::PAGINATION_MAX_TOTAL_HITS, &val) + self.main.remap_types::().put(txn, main_key::PAGINATION_MAX_TOTAL_HITS, &val) } pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS) + self.main.remap_key_type::().delete(txn, main_key::PAGINATION_MAX_TOTAL_HITS) + } + + pub fn proximity_precision(&self, txn: &RoTxn) -> heed::Result> { + self.main + .remap_types::>() + .get(txn, main_key::PROXIMITY_PRECISION) + } + + pub(crate) fn put_proximity_precision( + &self, + txn: &mut RwTxn, + val: ProximityPrecision, + ) -> heed::Result<()> { + self.main.remap_types::>().put( + txn, + main_key::PROXIMITY_PRECISION, + &val, + ) + } + + pub(crate) fn delete_proximity_precision(&self, txn: &mut RwTxn) -> heed::Result { + self.main.remap_key_type::().delete(txn, main_key::PROXIMITY_PRECISION) } /* script language docids */ @@ -1477,14 +1497,10 @@ impl Index { rtxn: &RoTxn, key: &(Script, Language), ) -> heed::Result> { - let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; - let doc_ids = self.script_language_docids.get(rtxn, key)?; - Ok(doc_ids.map(|ids| ids - soft_deleted_documents)) + self.script_language_docids.get(rtxn, key) } pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result>> { - let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; - let mut script_language: HashMap> = HashMap::new(); let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new(); let mut total = 0; @@ -1492,7 +1508,7 @@ impl Index { let ((script, language), docids) = sl?; // keep only Languages that contains at least 1 document. 
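// For reference, the filtering that the surrounding `script_language` hunks
// no longer need: with soft-deletion gone, the stored bitmaps are
// authoritative and nothing has to be subtracted at read time. What the
// removed code computed, as a standalone sketch:
use roaring::RoaringBitmap;

fn visible_docids(stored: &RoaringBitmap, soft_deleted: &RoaringBitmap) -> RoaringBitmap {
    // Set difference: every document id in `stored` that was not soft-deleted.
    stored - soft_deleted
}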
- let remaining_documents_count = (docids - &soft_deleted_documents).len(); + let remaining_documents_count = docids.len(); total += remaining_documents_count; if remaining_documents_count > 0 { script_language_doc_count.push((script, language, remaining_documents_count)); @@ -1528,8 +1544,7 @@ pub(crate) mod tests { use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig, - IndexDocumentsMethod, IndexerConfig, Settings, + self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult}; @@ -1566,7 +1581,7 @@ pub(crate) mod tests { } pub fn add_documents_using_wtxn<'t, R>( &'t self, - wtxn: &mut RwTxn<'t, '_>, + wtxn: &mut RwTxn<'t>, documents: DocumentsBatchReader, ) -> Result<(), crate::error::Error> where @@ -1610,7 +1625,7 @@ pub(crate) mod tests { } pub fn update_settings_using_wtxn<'t>( &'t self, - wtxn: &mut RwTxn<'t, '_>, + wtxn: &mut RwTxn<'t>, update: impl Fn(&mut Settings), ) -> Result<(), crate::error::Error> { let mut builder = update::Settings::new(wtxn, &self.inner, &self.indexer_config); @@ -1619,16 +1634,36 @@ pub(crate) mod tests { Ok(()) } - pub fn delete_document(&self, external_document_id: &str) { + pub fn delete_documents_using_wtxn<'t>( + &'t self, + wtxn: &mut RwTxn<'t>, + external_document_ids: Vec, + ) { + let builder = IndexDocuments::new( + wtxn, + self, + &self.indexer_config, + self.index_documents_config.clone(), + |_| (), + || false, + ) + .unwrap(); + let (builder, user_error) = builder.remove_documents(external_document_ids).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + } + + pub fn delete_documents(&self, external_document_ids: Vec) { let mut wtxn = self.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, self).unwrap(); - delete.strategy(self.index_documents_config.deletion_strategy); + self.delete_documents_using_wtxn(&mut wtxn, external_document_ids); - delete.delete_external_id(external_document_id); - delete.execute().unwrap(); wtxn.commit().unwrap(); } + + pub fn delete_document(&self, external_document_id: &str) { + self.delete_documents(vec![external_document_id.to_string()]) + } } #[test] @@ -1942,9 +1977,7 @@ pub(crate) mod tests { use big_s::S; use maplit::hashset; - let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; - let index = index; + let index = TempIndex::new(); index .update_settings(|settings| { @@ -1963,14 +1996,12 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 0 0 1 1 2 2 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); db_snap!(index, facet_id_f64_docids, 1, @r###" 1 0 0 1 [0, ] 1 0 1 1 [1, ] @@ -1986,44 +2017,37 @@ pub(crate) mod tests { } index.add_documents(documents!(docs)).unwrap(); - db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 0 4 - 1 5 - 2 6 + docids: + 0 0 + 1 1 + 2 2 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[0, 1, 2, ]"); db_snap!(index, facet_id_f64_docids, 2, @r###" - 1 0 0 1 [0, ] - 1 0 1 1 [1, 4, ] - 1 0 2 1 [2, 5, ] - 1 0 3 1 [3, 6, ] + 1 0 1 1 [0, ] + 1 0 2 1 [1, ] + 1 0 3 1 [2, 
3, ] "###); index .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }])) .unwrap(); - db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - 3 7 - hard: - 0 4 - 1 5 - 2 6 + docids: + 0 0 + 1 1 + 2 2 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[0, 1, 2, 3, ]"); db_snap!(index, facet_id_f64_docids, 3, @r###" - 1 0 0 1 [0, ] - 1 0 1 1 [1, 4, ] - 1 0 2 1 [2, 5, ] - 1 0 3 1 [3, 6, ] - 1 0 4 1 [7, ] + 1 0 1 1 [0, ] + 1 0 2 1 [1, ] + 1 0 3 1 [2, ] + 1 0 4 1 [3, ] "###); index @@ -2032,300 +2056,30 @@ pub(crate) mod tests { }) .unwrap(); - db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - hard: - 0 4 - 1 5 - 2 6 - 3 7 + docids: + 0 0 + 1 1 + 2 2 + 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); db_snap!(index, facet_id_f64_docids, 3, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [7, ] + 0 0 0 1 [0, ] + 0 0 1 1 [1, ] + 0 0 2 1 [2, ] + 0 0 3 1 [3, ] + 1 0 1 1 [0, ] + 1 0 2 1 [1, ] + 1 0 3 1 [2, ] + 1 0 4 1 [3, ] "###); } - #[test] - fn replace_documents_in_batches_external_ids_and_soft_deletion_check() { - use big_s::S; - use maplit::hashset; - - let mut index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("doggo") }); - }) - .unwrap(); - - let add_documents = |index: &TempIndex, docs: Vec>| { - let mut wtxn = index.write_txn().unwrap(); - let mut builder = IndexDocuments::new( - &mut wtxn, - index, - &index.indexer_config, - index.index_documents_config.clone(), - |_| (), - || false, - ) - .unwrap(); - for docs in docs { - (builder, _) = builder.add_documents(documents!(docs)).unwrap(); - } - builder.execute().unwrap(); - wtxn.commit().unwrap(); - }; - // First Batch - { - let mut docs1 = vec![]; - for i in 0..4 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1]); - - db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 0 - 1 1 - 2 2 - 3 3 - "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [0, ] - 1 0 1 1 [1, ] - 1 0 2 1 [2, ] - 1 0 3 1 [3, ] - "###); - } - // Second Batch: replace the documents with soft-deletion - { - index.index_documents_config.deletion_strategy = - crate::update::DeletionStrategy::AlwaysSoft; - let mut docs1 = vec![]; - for i in 0..3 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i+1 } - )); - } - let mut docs2 = vec![]; - for i in 0..3 { - docs2.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1, docs2]); - - db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, ]"); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [0, 4, ] - 1 0 1 1 [1, 5, ] - 1 0 2 1 [2, 6, ] - 1 0 3 1 [3, ] - "###); - } - let rtxn = index.read_txn().unwrap(); - let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), 
obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(3), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [4]).unwrap()[0]; - - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(0), - "doggo": Number(0), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [5]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(1), - "doggo": Number(1), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [6]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(2), - "doggo": Number(2), - } - "###); - drop(rtxn); - // Third Batch: replace the documents with soft-deletion again - { - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; - let mut docs1 = vec![]; - for i in 0..3 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i+1 } - )); - } - let mut docs2 = vec![]; - for i in 0..4 { - docs2.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1, docs2]); - - db_snap!(index, documents_ids, @"[3, 7, 8, 9, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 7 - 1 8 - 2 9 - 3 3 - "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, 4, 5, 6, ]"); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [0, 4, 7, ] - 1 0 1 1 [1, 5, 8, ] - 1 0 2 1 [2, 6, 9, ] - 1 0 3 1 [3, ] - "###); - } - let rtxn = index.read_txn().unwrap(); - let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(3), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [7]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(0), - "doggo": Number(0), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [8]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(1), - "doggo": Number(1), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [9]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(2), - "doggo": Number(2), - } - "###); - drop(rtxn); - - // Fourth Batch: replace the documents without soft-deletion - { - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; - let mut docs1 = vec![]; - for i in 0..3 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i+2 } - )); - } - let mut docs2 = vec![]; - for i in 0..1 { - docs2.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1, docs2]); - - db_snap!(index, documents_ids, @"[3, 10, 11, 12, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 10 - 1 11 - 2 12 - 3 3 - "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [10, ] - 1 0 3 1 [3, 11, ] - 1 0 4 1 [12, ] - "###); - - let rtxn = 
index.read_txn().unwrap(); - let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(3), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [10]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(0), - "doggo": Number(0), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [11]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(1), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [12]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(2), - "doggo": Number(4), - } - "###); - drop(rtxn); - } - } - #[test] fn bug_3021_first() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; index @@ -2343,23 +2097,18 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 34 1 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); index.delete_document("34"); db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 34 1 + docids: 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); index .update_settings(|s| { @@ -2371,11 +2120,9 @@ pub(crate) mod tests { // do not contain any entry for previously soft-deleted document ids db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - hard: + docids: 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); // So that this document addition works correctly now. // It would be wrongly interpreted as a replacement before @@ -2383,24 +2130,19 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, 4, @r###" - soft: - hard: + docids: 34 1 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); // We do the test again, but deleting the document with id 0 instead of id 1 now index.delete_document("38"); db_snap!(index, documents_ids, @"[1, ]"); db_snap!(index, external_documents_ids, 5, @r###" - soft: - hard: + docids: 34 1 - 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 5, @"[0, ]"); index .update_settings(|s| { @@ -2410,11 +2152,9 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[1, ]"); db_snap!(index, external_documents_ids, 6, @r###" - soft: - hard: + docids: 34 1 "###); - db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); // And adding lots of documents afterwards instead of just one. // These extra subtests don't add much, but it's better than nothing. 
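// Note on the deletion flow exercised by these tests: with `DeleteDocuments`
// gone, removals ride the same `IndexDocuments` batch builder as additions
// (see the `delete_documents_using_wtxn` helper earlier in this diff). The
// call-site shape, with `unwrap`-style error handling as in the tests:
//
//     let (builder, user_error) = builder.remove_documents(external_ids)?;
//     user_error?;          // per-document failures are reported separately
//     builder.execute()?;   // additions and removals apply as one batch
//     wtxn.commit()?;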
@@ -2422,8 +2162,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); db_snap!(index, external_documents_ids, 7, @r###" - soft: - hard: + docids: 34 1 38 0 39 2 @@ -2431,14 +2170,38 @@ pub(crate) mod tests { 41 3 42 5 "###); - db_snap!(index, soft_deleted_documents_ids, 7, @"[]"); + } + + #[test] + fn simple_delete() { + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + index + .add_documents(documents!([ + { "id": 30 }, + { "id": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + docids: + 30 0 + 34 1"###); + + index.delete_document("34"); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + docids: + 30 0 + "###); } #[test] fn bug_3021_second() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index @@ -2456,23 +2219,18 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 30 0 34 1 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); index.delete_document("34"); db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: + docids: 30 0 - 34 1 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); index .update_settings(|s| { @@ -2484,11 +2242,9 @@ pub(crate) mod tests { // do not contain any entry for previously soft-deleted document ids db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - hard: + docids: 30 0 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); // So that when we add a new document index.add_documents(documents!({ "primary_key": 35, "b": 2 })).unwrap(); @@ -2497,12 +2253,10 @@ pub(crate) mod tests { // The external documents ids don't have several external ids pointing to the same // internal document id db_snap!(index, external_documents_ids, 4, @r###" - soft: - hard: + docids: 30 0 35 1 "###); - db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); // And when we add 34 again, we don't replace document 35 index.add_documents(documents!({ "primary_key": 34, "a": 1 })).unwrap(); @@ -2510,13 +2264,11 @@ pub(crate) mod tests { // And document 35 still exists, is not deleted db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, 5, @r###" - soft: - hard: + docids: 30 0 34 2 35 1 "###); - db_snap!(index, soft_deleted_documents_ids, 5, @"[]"); let rtxn = index.read_txn().unwrap(); let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0]; @@ -2548,8 +2300,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); db_snap!(index, external_documents_ids, 6, @r###" - soft: - hard: + docids: 30 0 34 2 35 1 @@ -2557,14 +2308,12 @@ pub(crate) mod tests { 38 4 39 5 "###); - db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); } #[test] fn bug_3021_third() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index @@ -2583,38 +2332,29 @@ pub(crate) mod tests { 
db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 3 0 4 1 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); index.delete_document("3"); db_snap!(index, documents_ids, @"[1, 2, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 3 0 + docids: 4 1 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]"); - - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap(); - db_snap!(index, documents_ids, @"[2, 3, ]"); + db_snap!(index, documents_ids, @"[1, 2, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 4 3 + docids: + 4 1 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); index .add_documents(documents!([ @@ -2622,15 +2362,13 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: + docids: 3 0 - 4 3 + 4 1 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); } #[test] @@ -2638,7 +2376,6 @@ pub(crate) mod tests { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index .update_settings(|settings| { @@ -2655,12 +2392,10 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: + docids: 11 0 4 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); index .add_documents(documents!([ @@ -2669,31 +2404,23 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: - 1 3 + docids: + 1 2 11 0 - 4 2 + 4 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[1, ]"); - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysHard); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_documents(Default::default()); - db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: - 1 3 + docids: + 1 2 11 0 - 4 2 + 4 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); index .add_documents(documents!([ @@ -2702,15 +2429,13 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, documents_ids, @"[0, 1, 4, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: - 1 4 + docids: + 1 2 11 0 4 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]"); let rtxn = index.read_txn().unwrap(); let search = Search::new(&rtxn, &index); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index cfa438609..acea72c41 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -66,9 +66,9 @@ pub use self::search::{ pub type Result = std::result::Result; pub type Attribute = u32; -pub type BEU16 = heed::zerocopy::U16; -pub type BEU32 = heed::zerocopy::U32; -pub type BEU64 = heed::zerocopy::U64; +pub type BEU16 = heed::types::U16; +pub type BEU32 = heed::types::U32; +pub type BEU64 = 
heed::types::U64<heed::byteorder::BE>;
 pub type DocumentId = u32;
 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
 pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
diff --git a/milli/src/proximity.rs b/milli/src/proximity.rs
index 8261015a3..2745527c1 100644
--- a/milli/src/proximity.rs
+++ b/milli/src/proximity.rs
@@ -1,5 +1,7 @@
 use std::cmp;
+use serde::{Deserialize, Serialize};
+
 use crate::{relative_from_absolute_position, Position};
 pub const MAX_DISTANCE: u32 = 4;
@@ -25,3 +27,11 @@ pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
 pub fn path_proximity(path: &[Position]) -> u32 {
     path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>()
 }
+
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "camelCase")]
+pub enum ProximityPrecision {
+    #[default]
+    WordScale,
+    AttributeScale,
+}
diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs
index acf117ef6..90da16797 100644
--- a/milli/src/search/facet/facet_distribution.rs
+++ b/milli/src/search/facet/facet_distribution.rs
@@ -2,7 +2,7 @@ use std::collections::{BTreeMap, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::{fmt, mem};
-use heed::types::ByteSlice;
+use heed::types::Bytes;
 use heed::BytesDecode;
 use indexmap::IndexMap;
 use roaring::RoaringBitmap;
@@ -13,7 +13,7 @@ use crate::facet::FacetType;
 use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec,
 };
-use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec};
+use crate::heed_codec::{BytesRefCodec, StrRefCodec};
 use crate::search::facet::facet_distribution_iter::{
     count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution,
 };
@@ -105,7 +105,7 @@ impl<'a> FacetDistribution<'a> {
             key_buffer.truncate(mem::size_of::<FieldId>());
             key_buffer.extend_from_slice(&docid.to_be_bytes());
             let iter = db
-                .remap_key_type::<ByteSlice>()
+                .remap_key_type::<Bytes>()
                 .prefix_iter(self.rtxn, &key_buffer)?
                 .remap_key_type::<FieldDocIdFacetF64Codec>();
@@ -129,7 +129,7 @@ impl<'a> FacetDistribution<'a> {
             key_buffer.truncate(mem::size_of::<FieldId>());
             key_buffer.extend_from_slice(&docid.to_be_bytes());
             let iter = db
-                .remap_key_type::<ByteSlice>()
+                .remap_key_type::<Bytes>()
                 .prefix_iter(self.rtxn, &key_buffer)?
                 .remap_key_type::<FieldDocIdFacetStringCodec>();
@@ -172,9 +172,7 @@ impl<'a> FacetDistribution<'a> {
         search_function(
             self.rtxn,
-            self.index
-                .facet_id_f64_docids
-                .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
+            self.index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
             field_id,
             candidates,
             |facet_key, nbr_docids, _| {
@@ -203,9 +201,7 @@ impl<'a> FacetDistribution<'a> {
         search_function(
             self.rtxn,
-            self.index
-                .facet_id_string_docids
-                .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
+            self.index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
             field_id,
             candidates,
             |facet_key, nbr_docids, any_docid| {
diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs
index 722a30e6d..d993ef2dc 100644
--- a/milli/src/search/facet/facet_distribution_iter.rs
+++ b/milli/src/search/facet/facet_distribution_iter.rs
@@ -7,7 +7,7 @@ use roaring::RoaringBitmap;
 use super::{get_first_facet_value, get_highest_level};
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
-use crate::heed_codec::ByteSliceRefCodec;
+use crate::heed_codec::BytesRefCodec;
 use crate::DocumentId;
 /// Call the given closure on the facet distribution of the candidate documents.
@@ -23,7 +23,7 @@ use crate::DocumentId;
 /// keep iterating over the different facet values or stop.
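// A quick check of the `ProximityPrecision` serialization introduced in the
// proximity.rs hunk above: with `rename_all = "camelCase"`, the unit variants
// round-trip as "wordScale" and "attributeScale". Self-contained sketch
// (assumes the `serde` and `serde_json` crates):
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
pub enum ProximityPrecision {
    #[default]
    WordScale,
    AttributeScale,
}

fn main() -> serde_json::Result<()> {
    assert_eq!(serde_json::to_string(&ProximityPrecision::WordScale)?, r#""wordScale""#);
    let parsed: ProximityPrecision = serde_json::from_str(r#""attributeScale""#)?;
    assert_eq!(parsed, ProximityPrecision::AttributeScale);
    // `Default` makes `WordScale` the fallback when the setting is absent.
    assert_eq!(ProximityPrecision::default(), ProximityPrecision::WordScale);
    Ok(())
}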
pub fn lexicographically_iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: &RoaringBitmap, callback: CB, @@ -34,11 +34,11 @@ where let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback }; let highest_level = get_highest_level( rtxn, - db.remap_key_type::>(), + db.remap_key_type::>(), field_id, )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; Ok(()) } else { @@ -48,7 +48,7 @@ where pub fn count_iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: &RoaringBitmap, mut callback: CB, @@ -77,11 +77,11 @@ where let mut heap = BinaryHeap::new(); let highest_level = get_highest_level( rtxn, - db.remap_key_type::>(), + db.remap_key_type::>(), field_id, )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { // We first fill the heap with values from the highest level let starting_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; @@ -146,7 +146,7 @@ where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, callback: CB, } diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 26854bc1a..f1a26ded5 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; -use crate::heed_codec::ByteSliceRefCodec; +use crate::heed_codec::BytesRefCodec; use crate::Result; /// Find all the document ids for which the given field contains a value contained within @@ -25,11 +25,11 @@ where let inner; let left = match left { Bound::Included(left) => { - inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; + inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?; Bound::Included(inner.as_ref()) } Bound::Excluded(left) => { - inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; + inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?; Bound::Excluded(inner.as_ref()) } Bound::Unbounded => Bound::Unbounded, @@ -37,25 +37,22 @@ where let inner; let right = match right { Bound::Included(right) => { - inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; + inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?; Bound::Included(inner.as_ref()) } Bound::Excluded(right) => { - inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; + inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?; Bound::Excluded(inner.as_ref()) } Bound::Unbounded => Bound::Unbounded, }; - let db = db.remap_key_type::>(); + let db = db.remap_key_type::>(); let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let 
Some(starting_left_bound) = - get_first_facet_value::(rtxn, db, field_id)? - { - let rightmost_bound = Bound::Included( - get_last_facet_value::(rtxn, db, field_id)?.unwrap(), - ); // will not fail because get_first_facet_value succeeded + if let Some(starting_left_bound) = get_first_facet_value::(rtxn, db, field_id)? { + let rightmost_bound = + Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) @@ -67,7 +64,7 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 892401c08..20c277c63 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -5,7 +5,7 @@ use super::{get_first_facet_value, get_highest_level}; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; -use crate::heed_codec::ByteSliceRefCodec; +use crate::heed_codec::BytesRefCodec; /// Return an iterator which iterates over the given candidate documents in /// ascending order of their facet value for the given field id. @@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec; /// The documents returned by the iterator are grouped by the facet values that /// determined their rank. For example, given the documents: /// -/// ```ignore +/// ```text /// 0: { "colour": ["blue", "green"] } /// 1: { "colour": ["blue", "red"] } /// 2: { "colour": ["orange", "red"] } @@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec; /// ``` /// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator /// over the following elements: -/// ```ignore +/// ```text /// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue" /// [3] // same for "green" /// [2] // same for "orange" @@ -31,12 +31,12 @@ use crate::heed_codec::ByteSliceRefCodec; /// Note that once a document id is returned by the iterator, it is never returned again. pub fn ascending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); @@ -53,14 +53,12 @@ pub fn ascending_facet_sort<'t>( struct AscendingFacetSort<'t, 'e> { rtxn: &'t heed::RoTxn<'e>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, #[allow(clippy::type_complexity)] stack: Vec<( RoaringBitmap, - std::iter::Take< - heed::RoRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, - >, + std::iter::Take, FacetGroupValueCodec>>, )>, } diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 549f50f0a..ae6eb60d0 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -7,21 +7,21 @@ use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; -use crate::heed_codec::ByteSliceRefCodec; +use crate::heed_codec::BytesRefCodec; /// See documentationg for [`ascending_facet_sort`](super::ascending_facet_sort). /// /// This function does the same thing, but in the opposite order. pub fn descending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); Ok(itertools::Either::Left(DescendingFacetSort { @@ -37,13 +37,13 @@ pub fn descending_facet_sort<'t>( struct DescendingFacetSort<'t> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, #[allow(clippy::type_complexity)] stack: Vec<( RoaringBitmap, std::iter::Take< - heed::RoRevRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + heed::RoRevRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, >, Bound<&'t [u8]>, )>, @@ -100,7 +100,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { *right_bound = Bound::Excluded(left_bound); let iter = match self .db - .remap_key_type::>() + .remap_key_type::>() .rev_range(self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow)) { Ok(iter) => iter, @@ -123,7 +123,7 @@ mod tests { use roaring::RoaringBitmap; use crate::heed_codec::facet::FacetGroupKeyCodec; - use crate::heed_codec::ByteSliceRefCodec; + use crate::heed_codec::BytesRefCodec; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::tests::{ @@ -144,7 +144,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).collect::(); let mut results = String::new(); - let db = index.content.remap_key_type::>(); + let db = index.content.remap_key_type::>(); let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); for el in iter { let (docids, _) = 
el.unwrap(); @@ -167,7 +167,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).collect::(); let mut results = String::new(); - let db = index.content.remap_key_type::>(); + let db = index.content.remap_key_type::>(); let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap(); for el in iter { let (docids, _) = el.unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index c4cdb37e6..dbd9538a5 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -223,12 +223,9 @@ impl<'a> Filter<'a> { impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time - let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; - // and finally we delete all the soft_deleted_documents, again, only once at the very end self.inner_evaluate(rtxn, index, &filterable_fields) - .map(|result| result - soft_deleted_documents) } fn evaluate_operator( diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ebc9e1da0..f44d6a153 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,13 +1,13 @@ pub use facet_sort_ascending::ascending_facet_sort; pub use facet_sort_descending::descending_facet_sort; -use heed::types::{ByteSlice, DecodeIgnore}; +use heed::types::{Bytes, DecodeIgnore}; use heed::{BytesDecode, RoTxn}; use roaring::RoaringBitmap; pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::filter::{BadGeoError, Filter}; use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; -use crate::heed_codec::ByteSliceRefCodec; +use crate::heed_codec::BytesRefCodec; use crate::{Index, Result}; mod facet_distribution; mod facet_distribution_iter; @@ -22,8 +22,10 @@ fn facet_extreme_value<'t>( let extreme_value = if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) }; let (_, extreme_value) = extreme_value?; - - Ok(OrderedF64Codec::bytes_decode(extreme_value)) + OrderedF64Codec::bytes_decode(extreme_value) + .map(Some) + .map_err(heed::Error::Decoding) + .map_err(Into::into) } pub fn facet_min_value<'t>( @@ -32,7 +34,7 @@ pub fn facet_min_value<'t>( field_id: u16, candidates: RoaringBitmap, ) -> Result> { - let db = index.facet_id_f64_docids.remap_key_type::>(); + let db = index.facet_id_f64_docids.remap_key_type::>(); let it = ascending_facet_sort(rtxn, db, field_id, candidates)?; facet_extreme_value(it) } @@ -43,7 +45,7 @@ pub fn facet_max_value<'t>( field_id: u16, candidates: RoaringBitmap, ) -> Result> { - let db = index.facet_id_f64_docids.remap_key_type::>(); + let db = index.facet_id_f64_docids.remap_key_type::>(); let it = descending_facet_sort(rtxn, db, field_id, candidates)?; facet_extreme_value(it) } @@ -51,7 +53,7 @@ pub fn facet_max_value<'t>( /// Get the first facet value in the facet database pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -60,13 +62,12 @@ where let mut level0prefix = vec![]; level0prefix.extend_from_slice(&field_id.to_be_bytes()); level0prefix.push(0); - let mut level0_iter_forward = db - .as_polymorph() - .prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, 
level0prefix.as_slice())?; + let mut level0_iter_forward = + db.remap_types::().prefix_iter(txn, level0prefix.as_slice())?; if let Some(first) = level0_iter_forward.next() { let (first_key, _) = first?; let first_key = FacetGroupKeyCodec::::bytes_decode(first_key) - .ok_or(heed::Error::Encoding)?; + .map_err(heed::Error::Decoding)?; Ok(Some(first_key.left_bound)) } else { Ok(None) @@ -76,7 +77,7 @@ where /// Get the last facet value in the facet database pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -85,13 +86,12 @@ where let mut level0prefix = vec![]; level0prefix.extend_from_slice(&field_id.to_be_bytes()); level0prefix.push(0); - let mut level0_iter_backward = db - .as_polymorph() - .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?; + let mut level0_iter_backward = + db.remap_types::().rev_prefix_iter(txn, level0prefix.as_slice())?; if let Some(last) = level0_iter_backward.next() { let (last_key, _) = last?; let last_key = FacetGroupKeyCodec::::bytes_decode(last_key) - .ok_or(heed::Error::Encoding)?; + .map_err(heed::Error::Decoding)?; Ok(Some(last_key.left_bound)) } else { Ok(None) @@ -101,17 +101,17 @@ where /// Get the height of the highest level in the facet database pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); Ok(db - .as_polymorph() - .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, field_id_prefix)? + .remap_types::() + .rev_prefix_iter(txn, field_id_prefix)? .next() .map(|el| { let (key, _) = el.unwrap(); - let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); key.level }) .unwrap_or(0)) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 786b565ae..ee8cd1faf 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,8 +17,7 @@ use crate::error::UserError; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::{ - execute_search, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, Result, - SearchContext, BEU16, + execute_search, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, Result, SearchContext, }; // Building these factories is not free. @@ -299,7 +298,7 @@ impl<'a> SearchForFacetValues<'a> { None => return Ok(Vec::new()), }; - let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &BEU16::new(fid))? { + let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? 
{ Some(fst) => fst, None => return Ok(vec![]), }; diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index ce846009a..76948f1ed 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -3,16 +3,17 @@ use std::collections::hash_map::Entry; use std::hash::Hash; use fxhash::FxHashMap; -use heed::types::ByteSlice; +use heed::types::Bytes; use heed::{BytesEncode, Database, RoTxn}; use roaring::RoaringBitmap; use super::interner::Interned; use super::Word; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; +use crate::proximity::ProximityPrecision; use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; use crate::{ - CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, + CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, }; /// A cache storing pointers to values in the LMDB databases. @@ -25,7 +26,7 @@ pub struct DatabaseCache<'ctx> { pub word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>>, pub word_prefix_pair_proximity_docids: - FxHashMap<(u8, Interned, Interned), Option>>, + FxHashMap<(u8, Interned, Interned), Option>, pub prefix_word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>>, pub word_docids: FxHashMap, Option>>, @@ -50,7 +51,7 @@ impl<'ctx> DatabaseCache<'ctx> { cache_key: K1, db_key: &'v KC::EItem, cache: &mut FxHashMap>>, - db: Database, + db: Database, ) -> Result> where K1: Copy + Eq + Hash, @@ -63,12 +64,14 @@ impl<'ctx> DatabaseCache<'ctx> { } match cache.get(&cache_key).unwrap() { - Some(Cow::Borrowed(bytes)) => { - DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) - } - Some(Cow::Owned(bytes)) => { - DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) - } + Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes) + .map(Some) + .map_err(heed::Error::Decoding) + .map_err(Into::into), + Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes) + .map(Some) + .map_err(heed::Error::Decoding) + .map_err(Into::into), None => Ok(None), } } @@ -78,7 +81,7 @@ impl<'ctx> DatabaseCache<'ctx> { cache_key: K1, db_keys: &'v [KC::EItem], cache: &mut FxHashMap>>, - db: Database, + db: Database, merger: MergeFn, ) -> Result> where @@ -110,12 +113,14 @@ impl<'ctx> DatabaseCache<'ctx> { } match cache.get(&cache_key).unwrap() { - Some(Cow::Borrowed(bytes)) => { - DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) - } - Some(Cow::Owned(bytes)) => { - DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) - } + Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes) + .map(Some) + .map_err(heed::Error::Decoding) + .map_err(Into::into), + Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes) + .map(Some) + .map_err(heed::Error::Decoding) + .map_err(Into::into), None => Ok(None), } } @@ -165,16 +170,16 @@ impl<'ctx> SearchContext<'ctx> { word, &keys[..], &mut self.db_cache.word_docids, - self.index.word_fid_docids.remap_data_type::(), + self.index.word_fid_docids.remap_data_type::(), merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, word, self.word_interner.get(word).as_str(), &mut self.db_cache.word_docids, - self.index.word_docids.remap_data_type::(), + self.index.word_docids.remap_data_type::(), ), } } @@ -194,16 +199,16 @@ impl<'ctx> SearchContext<'ctx> { word, &keys[..], &mut 
self.db_cache.exact_word_docids, - self.index.word_fid_docids.remap_data_type::(), + self.index.word_fid_docids.remap_data_type::(), merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, word, self.word_interner.get(word).as_str(), &mut self.db_cache.exact_word_docids, - self.index.exact_word_docids.remap_data_type::(), + self.index.exact_word_docids.remap_data_type::(), ), } } @@ -244,16 +249,16 @@ impl<'ctx> SearchContext<'ctx> { prefix, &keys[..], &mut self.db_cache.word_prefix_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), + self.index.word_prefix_fid_docids.remap_data_type::(), merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, prefix, self.word_interner.get(prefix).as_str(), &mut self.db_cache.word_prefix_docids, - self.index.word_prefix_docids.remap_data_type::(), + self.index.word_prefix_docids.remap_data_type::(), ), } } @@ -273,16 +278,16 @@ impl<'ctx> SearchContext<'ctx> { prefix, &keys[..], &mut self.db_cache.exact_word_prefix_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), + self.index.word_prefix_fid_docids.remap_data_type::(), merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, prefix, self.word_interner.get(prefix).as_str(), &mut self.db_cache.exact_word_prefix_docids, - self.index.exact_word_prefix_docids.remap_data_type::(), + self.index.exact_word_prefix_docids.remap_data_type::(), ), } } @@ -293,17 +298,67 @@ impl<'ctx> SearchContext<'ctx> { word2: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - (proximity, word1, word2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(word2).as_str(), - ), - &mut self.db_cache.word_pair_proximity_docids, - self.index.word_pair_proximity_docids.remap_data_type::(), - ) + match self.index.proximity_precision(self.txn)?.unwrap_or_default() { + ProximityPrecision::AttributeScale => { + // Force proximity to 0 because: + // in AttributeScale, there are only 2 possible distances: + // 1. words in same attribute: in that the DB contains (0, word1, word2) + // 2. words in different attributes: no DB entry for these two words. + let proximity = 0; + let docids = if let Some(docids) = + self.db_cache.word_pair_proximity_docids.get(&(proximity, word1, word2)) + { + docids + .as_ref() + .map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d)) + .transpose() + .map_err(heed::Error::Decoding)? + } else { + // Compute the distance at the attribute level and store it in the cache. + let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { + fids + } else { + self.index.fields_ids_map(self.txn)?.ids().collect() + }; + let mut docids = RoaringBitmap::new(); + for fid in fids { + // for each field, intersect left word bitmap and right word bitmap, + // then merge the result in a global bitmap before storing it in the cache. 
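+                        // NOTE: get_db_word_fid_docids is itself cached in db_cache.word_fid_docids,
+                        // so each (word, fid) pair touches LMDB at most once per search.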
+ let word1_docids = self.get_db_word_fid_docids(word1, fid)?; + let word2_docids = self.get_db_word_fid_docids(word2, fid)?; + if let (Some(word1_docids), Some(word2_docids)) = + (word1_docids, word2_docids) + { + docids |= word1_docids & word2_docids; + } + } + let encoded = CboRoaringBitmapCodec::bytes_encode(&docids) + .map(Cow::into_owned) + .map(Cow::Owned) + .map(Some) + .map_err(heed::Error::Decoding)?; + self.db_cache + .word_pair_proximity_docids + .insert((proximity, word1, word2), encoded); + Some(docids) + }; + + Ok(docids) + } + ProximityPrecision::WordScale => { + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + self.txn, + (proximity, word1, word2), + &( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(word2).as_str(), + ), + &mut self.db_cache.word_pair_proximity_docids, + self.index.word_pair_proximity_docids.remap_data_type::(), + ) + } + } } pub fn get_db_word_pair_proximity_docids_len( @@ -312,54 +367,107 @@ impl<'ctx> SearchContext<'ctx> { word2: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( - self.txn, - (proximity, word1, word2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(word2).as_str(), - ), - &mut self.db_cache.word_pair_proximity_docids, - self.index.word_pair_proximity_docids.remap_data_type::(), - ) + match self.index.proximity_precision(self.txn)?.unwrap_or_default() { + ProximityPrecision::AttributeScale => Ok(self + .get_db_word_pair_proximity_docids(word1, word2, proximity)? + .map(|d| d.len())), + ProximityPrecision::WordScale => { + DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( + self.txn, + (proximity, word1, word2), + &( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(word2).as_str(), + ), + &mut self.db_cache.word_pair_proximity_docids, + self.index.word_pair_proximity_docids.remap_data_type::(), + ) + } + } } pub fn get_db_word_prefix_pair_proximity_docids( &mut self, word1: Interned, prefix2: Interned, - proximity: u8, + mut proximity: u8, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - (proximity, word1, prefix2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(prefix2).as_str(), - ), - &mut self.db_cache.word_prefix_pair_proximity_docids, - self.index.word_prefix_pair_proximity_docids.remap_data_type::(), - ) + let proximity_precision = self.index.proximity_precision(self.txn)?.unwrap_or_default(); + if proximity_precision == ProximityPrecision::AttributeScale { + // Force proximity to 0 because: + // in AttributeScale, there are only 2 possible distances: + // 1. words in same attribute: in that the DB contains (0, word1, word2) + // 2. words in different attributes: no DB entry for these two words. + proximity = 0; + } + + let docids = if let Some(docids) = + self.db_cache.word_prefix_pair_proximity_docids.get(&(proximity, word1, prefix2)) + { + docids.clone() + } else { + let prefix_docids = match proximity_precision { + ProximityPrecision::AttributeScale => { + // Compute the distance at the attribute level and store it in the cache. + let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { + fids + } else { + self.index.fields_ids_map(self.txn)?.ids().collect() + }; + let mut prefix_docids = RoaringBitmap::new(); + // for each field, intersect left word bitmap and right word bitmap, + // then merge the result in a global bitmap before storing it in the cache. 
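+                        // Same per-field intersection as for plain word pairs above, but the
+                        // right-hand side is resolved through the prefix database (word_prefix_fid_docids).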
+ for fid in fids { + let word1_docids = self.get_db_word_fid_docids(word1, fid)?; + let prefix2_docids = self.get_db_word_prefix_fid_docids(prefix2, fid)?; + if let (Some(word1_docids), Some(prefix2_docids)) = + (word1_docids, prefix2_docids) + { + prefix_docids |= word1_docids & prefix2_docids; + } + } + prefix_docids + } + ProximityPrecision::WordScale => { + // compute docids using prefix iter and store the result in the cache. + let key = U8StrStrCodec::bytes_encode(&( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(prefix2).as_str(), + )) + .unwrap() + .into_owned(); + let mut prefix_docids = RoaringBitmap::new(); + let remap_key_type = self + .index + .word_pair_proximity_docids + .remap_key_type::() + .prefix_iter(self.txn, &key)?; + for result in remap_key_type { + let (_, docids) = result?; + + prefix_docids |= docids; + } + prefix_docids + } + }; + self.db_cache + .word_prefix_pair_proximity_docids + .insert((proximity, word1, prefix2), Some(prefix_docids.clone())); + Some(prefix_docids) + }; + Ok(docids) } + pub fn get_db_prefix_word_pair_proximity_docids( &mut self, left_prefix: Interned, right: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - (proximity, left_prefix, right), - &( - proximity, - self.word_interner.get(left_prefix).as_str(), - self.word_interner.get(right).as_str(), - ), - &mut self.db_cache.prefix_word_pair_proximity_docids, - self.index.prefix_word_pair_proximity_docids.remap_data_type::(), - ) + // only accept exact matches on reverted positions + self.get_db_word_pair_proximity_docids(left_prefix, right, proximity) } pub fn get_db_word_fid_docids( @@ -377,7 +485,7 @@ impl<'ctx> SearchContext<'ctx> { (word, fid), &(self.word_interner.get(word).as_str(), fid), &mut self.db_cache.word_fid_docids, - self.index.word_fid_docids.remap_data_type::(), + self.index.word_fid_docids.remap_data_type::(), ) } @@ -396,7 +504,7 @@ impl<'ctx> SearchContext<'ctx> { (word_prefix, fid), &(self.word_interner.get(word_prefix).as_str(), fid), &mut self.db_cache.word_prefix_fid_docids, - self.index.word_prefix_fid_docids.remap_data_type::(), + self.index.word_prefix_fid_docids.remap_data_type::(), ) } @@ -410,7 +518,7 @@ impl<'ctx> SearchContext<'ctx> { let remap_key_type = self .index .word_fid_docids - .remap_types::() + .remap_types::() .prefix_iter(self.txn, &key)? .remap_key_type::(); for result in remap_key_type { @@ -436,7 +544,7 @@ impl<'ctx> SearchContext<'ctx> { let remap_key_type = self .index .word_prefix_fid_docids - .remap_types::() + .remap_types::() .prefix_iter(self.txn, &key)? .remap_key_type::(); for result in remap_key_type { @@ -464,7 +572,7 @@ impl<'ctx> SearchContext<'ctx> { (word, position), &(self.word_interner.get(word).as_str(), position), &mut self.db_cache.word_position_docids, - self.index.word_position_docids.remap_data_type::(), + self.index.word_position_docids.remap_data_type::(), ) } @@ -478,7 +586,7 @@ impl<'ctx> SearchContext<'ctx> { (word_prefix, position), &(self.word_interner.get(word_prefix).as_str(), position), &mut self.db_cache.word_prefix_position_docids, - self.index.word_prefix_position_docids.remap_data_type::(), + self.index.word_prefix_position_docids.remap_data_type::(), ) } @@ -492,7 +600,7 @@ impl<'ctx> SearchContext<'ctx> { let remap_key_type = self .index .word_position_docids - .remap_types::() + .remap_types::() .prefix_iter(self.txn, &key)? 
.remap_key_type::(); for result in remap_key_type { @@ -523,7 +631,7 @@ impl<'ctx> SearchContext<'ctx> { let remap_key_type = self .index .word_prefix_position_docids - .remap_types::() + .remap_types::() .prefix_iter(self.txn, &key)? .remap_key_type::(); for result in remap_key_type { diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs index e90ffe878..25ea0b0a3 100644 --- a/milli/src/search/new/distinct.rs +++ b/milli/src/search/new/distinct.rs @@ -1,4 +1,4 @@ -use heed::types::{ByteSlice, Str, Unit}; +use heed::types::{Bytes, Str, Unit}; use heed::{Database, RoPrefix, RoTxn}; use roaring::RoaringBitmap; @@ -8,7 +8,7 @@ const DOCID_SIZE: usize = 4; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec, }; -use crate::heed_codec::ByteSliceRefCodec; +use crate::heed_codec::BytesRefCodec; use crate::{Index, Result, SearchContext}; pub struct DistinctOutput { @@ -71,7 +71,7 @@ pub fn distinct_single_docid( /// Return all the docids containing the given value in the given field fn facet_value_docids( - database: Database, FacetGroupValueCodec>, + database: Database, FacetGroupValueCodec>, txn: &RoTxn, field_id: u16, facet_value: &[u8], @@ -87,12 +87,12 @@ fn facet_number_values<'a>( field_id: u16, index: &Index, txn: &'a RoTxn, -) -> Result, Unit>> { +) -> Result, Unit>> { let key = facet_values_prefix_key(field_id, docid); let iter = index .field_id_docid_facet_f64s - .remap_key_type::() + .remap_key_type::() .prefix_iter(txn, &key)? .remap_key_type(); @@ -105,12 +105,12 @@ pub fn facet_string_values<'a>( field_id: u16, index: &Index, txn: &'a RoTxn, -) -> Result, Str>> { +) -> Result, Str>> { let key = facet_values_prefix_key(field_id, docid); let iter = index .field_id_docid_facet_strings - .remap_key_type::() + .remap_key_type::() .prefix_iter(txn, &key)? .remap_types(); diff --git a/milli/src/search/new/geo_sort.rs b/milli/src/search/new/geo_sort.rs index bd9546048..b2e3a2f3d 100644 --- a/milli/src/search/new/geo_sort.rs +++ b/milli/src/search/new/geo_sort.rs @@ -1,7 +1,7 @@ use std::collections::VecDeque; use std::iter::FromIterator; -use heed::types::{ByteSlice, Unit}; +use heed::types::{Bytes, Unit}; use heed::{RoPrefix, RoTxn}; use roaring::RoaringBitmap; use rstar::RTree; @@ -34,7 +34,7 @@ fn facet_number_values<'a>( let iter = index .field_id_docid_facet_f64s - .remap_key_type::() + .remap_key_type::() .prefix_iter(txn, &key)? 
.remap_key_type(); @@ -163,7 +163,7 @@ impl GeoSort { // computing the distance between two points is expensive thus we cache the result documents .sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize); - self.cached_sorted_docids.extend(documents.into_iter()); + self.cached_sorted_docids.extend(documents); }; Ok(()) diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index c2d325a86..e94be2e77 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -228,7 +228,7 @@ impl Ord for Interned { impl PartialOrd for Interned { fn partial_cmp(&self, other: &Self) -> Option { - self.idx.partial_cmp(&other.idx) + Some(self.cmp(other)) } } @@ -241,7 +241,7 @@ impl PartialEq for Interned { } impl Clone for Interned { fn clone(&self) -> Self { - Self { idx: self.idx, _phantom: PhantomData } + *self } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index ba29dbd1f..a1b5da4e8 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -52,7 +52,6 @@ use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; use crate::{ AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, - BEU32, }; /// A structure used throughout the execution of a search query. @@ -469,8 +468,8 @@ pub fn execute_search( let mut docids = Vec::new(); let mut uniq_docids = RoaringBitmap::new(); for instant_distance::Item { distance: _, pid, point: _ } in neighbors { - let index = BEU32::new(pid.into_inner()); - let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap().get(); + let index = pid.into_inner(); + let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap(); if universe.contains(docid) && uniq_docids.insert(docid) { docids.push(docid); if docids.len() == (from + length) { @@ -627,7 +626,8 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec> field: field.to_string(), valid_fields, hidden_fields, - })?; + } + .into()); } Member::Geo(_) if !sortable_fields.contains("_geo") => { let (valid_fields, hidden_fields) = @@ -637,7 +637,8 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec> field: "_geo".to_string(), valid_fields, hidden_fields, - })?; + } + .into()); } _ => (), } diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 72a427379..6760c8be7 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -175,7 +175,7 @@ impl QueryTermSubset { pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option { let original = ctx.term_interner.get(self.original); - let Some(use_prefix_db) = original.zero_typo.use_prefix_db else { return None }; + let use_prefix_db = original.zero_typo.use_prefix_db?; let word = match &self.zero_typo_subset { NTypoTermSubset::All => Some(use_prefix_db), NTypoTermSubset::Subset { words, phrases: _ } => { diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index 6f7321e7b..fb234b293 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -4,7 +4,7 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait, SearchContext}; use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; -use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec}; +use crate::heed_codec::{BytesRefCodec, StrRefCodec}; use 
crate::score_details::{self, ScoreDetails}; use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; use crate::{FieldId, Index, Result}; @@ -100,11 +100,11 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, let number_db = ctx .index .facet_id_f64_docids - .remap_key_type::>(); + .remap_key_type::>(); let string_db = ctx .index .facet_id_string_docids - .remap_key_type::>(); + .remap_key_type::>(); let (number_iter, string_iter) = if self.is_ascending { let number_iter = ascending_facet_sort( diff --git a/milli/src/search/new/tests/attribute_fid.rs b/milli/src/search/new/tests/attribute_fid.rs index 09e52a394..38225404c 100644 --- a/milli/src/search/new/tests/attribute_fid.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -124,8 +124,7 @@ fn test_attribute_fid_simple() { s.query("the quick brown fox jumps over the lazy dog"); s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); } @@ -142,7 +141,6 @@ fn test_attribute_fid_ngrams() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); } diff --git a/milli/src/search/new/tests/attribute_position.rs b/milli/src/search/new/tests/attribute_position.rs index 1513528ec..68c5de540 100644 --- a/milli/src/search/new/tests/attribute_position.rs +++ b/milli/src/search/new/tests/attribute_position.rs @@ -141,8 +141,7 @@ fn test_attribute_position_simple() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); } #[test] @@ -158,8 +157,7 @@ fn test_attribute_position_repeated() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); } @@ -176,8 +174,7 @@ fn test_attribute_position_different_fields() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); } @@ -194,7 +191,6 @@ fn test_attribute_position_ngrams() { let SearchResult { documents_ids, document_scores, .. 
} = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); } diff --git a/milli/src/search/new/tests/exactness.rs b/milli/src/search/new/tests/exactness.rs index a486342c1..c52006e3d 100644 --- a/milli/src/search/new/tests/exactness.rs +++ b/milli/src/search/new/tests/exactness.rs @@ -478,8 +478,7 @@ fn test_exactness_simple_ordered() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -511,8 +510,7 @@ fn test_exactness_simple_reversed() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -535,8 +533,7 @@ fn test_exactness_simple_reversed() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -566,8 +563,7 @@ fn test_exactness_simple_random() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -596,8 +592,7 @@ fn test_exactness_attribute_starts_with_simple() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -623,8 +618,7 @@ fn test_exactness_attribute_starts_with_phrase() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -644,8 +638,7 @@ fn test_exactness_attribute_starts_with_phrase() { let SearchResult { documents_ids, document_scores, .. 
} = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -674,8 +667,7 @@ fn test_exactness_all_candidates_with_typo() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -711,8 +703,7 @@ fn test_exactness_after_words() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -760,8 +751,7 @@ fn test_words_after_exactness() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 9, 18, 8, 17, 16, 6, 7, 15, 5, 14, 4, 13, 3, 12, 2, 1, 11]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -809,8 +799,7 @@ fn test_proximity_after_exactness() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 4, 5, 8, 7, 3, 6]"); @@ -847,8 +836,7 @@ fn test_proximity_after_exactness() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -881,8 +869,7 @@ fn test_exactness_followed_by_typo_prefer_no_typo_prefix() { let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 4, 3]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -917,8 +904,7 @@ fn test_typo_followed_by_exactness() { let SearchResult { documents_ids, document_scores, .. 
} = s.execute().unwrap(); - let document_ids_scores: Vec<_> = - documents_ids.iter().zip(document_scores.into_iter()).collect(); + let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); insta::assert_snapshot!(format!("{document_ids_scores:#?}")); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 4, 3]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index 4d340ae1c..2d181a537 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -371,7 +371,7 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best s"); let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -379,13 +379,13 @@ fn test_proximity_prefix_db() { insta::assert_debug_snapshot!(texts, @r###" [ "\"this is the best summer meal\"", - "\"summer best\"", "\"this is the best meal of summer\"", - "\"summer x best\"", "\"this is the best meal I have ever had in such a beautiful summer day\"", "\"this is the best cooked meal of the summer\"", "\"this is the best meal of the summer\"", "\"summer x y best\"", + "\"summer x best\"", + "\"summer best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", ] "###); @@ -423,17 +423,17 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best win"); let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ + "\"this is the best winter meal\"", + "\"this is the best meal of winter\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", "\"this is the best cooked meal of the winter\"", "\"this is the best meal of the winter\"", - "\"this is the best meal of winter\"", - "\"this is the best winter meal\"", "\"winter x y best\"", "\"winter x best\"", "\"winter best\"", @@ -471,20 +471,20 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best wi"); let SearchResult { documents_ids, document_scores, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"this is the best winter meal\"", - "\"winter best\"", "\"this is the best meal of winter\"", - "\"winter x best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", "\"this is the best cooked meal of the winter\"", "\"this is the best meal of the winter\"", "\"winter x y best\"", + "\"winter x best\"", + "\"winter best\"", ] "###); } diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap index 8f3b964c1..efcfef7f1 100644 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" }, ), ], - [ - Proximity( - Rank { - rank: 3, - max_rank: 4, - }, - ), - ], [ Proximity( Rank { @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 2, + rank: 1, + max_rank: 4, + }, + ), + ], + [ + Proximity( + Rank { + rank: 1, max_rank: 4, }, ), diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap index 1ee6bfc91..242bc3424 100644 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" }, ), ], - [ - Proximity( - Rank { - rank: 3, - max_rank: 4, - }, - ), - ], [ Proximity( Rank { @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 2, + rank: 1, + max_rank: 4, + }, + ), + ], + [ + Proximity( + Rank { + rank: 1, max_rank: 4, }, ), diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap index 5129f1b3b..efcfef7f1 100644 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap @@ -6,7 +6,7 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 1, + rank: 4, max_rank: 4, }, ), @@ -14,7 +14,7 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 1, + rank: 2, max_rank: 4, }, ), diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs index aa6aa971f..8fdf52d44 100644 --- a/milli/src/search/new/tests/sort.rs +++ b/milli/src/search/new/tests/sort.rs @@ -13,6 +13,7 @@ This module tests the `sort` ranking rule: use big_s::S; use maplit::hashset; +use meili_snap::insta; use crate::index::tests::TempIndex; use crate::search::new::tests::collect_field_values; diff 
--git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 158f515b8..28c4cb45c 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -4,9 +4,8 @@ use std::path::Path; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; -use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index}; +use crate::{make_db_snap_from_iter, obkv_to_json, Index}; #[track_caller] pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) { @@ -98,7 +97,6 @@ Create a snapshot test of the given database. - `facet_id_string_docids` - `documents_ids` - `stop_words` - - `soft_deleted_documents_ids` - `field_distribution` - `fields_ids_map` - `geo_faceted_documents_ids` @@ -221,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String { &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) }) } -pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { - make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( - (proximity, word1, prefix), - b, - )| { - &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) - }) -} -pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String { - make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |( - (proximity, prefix, word2), - b, - )| { - &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b)) - }) -} pub fn snap_word_position_docids(index: &Index) -> String { make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) @@ -308,12 +290,6 @@ pub fn snap_stop_words(index: &Index) -> String { let snap = format!("{stop_words:?}"); snap } -pub fn snap_soft_deleted_documents_ids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); - - display_bitmap(&soft_deleted_documents_ids) -} pub fn snap_field_distributions(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let mut snap = String::new(); @@ -340,50 +316,21 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { } pub fn snap_external_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); - let ExternalDocumentsIds { soft, hard, .. 
} = index.external_documents_ids(&rtxn).unwrap(); + let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap(); + // ensure fixed order (not guaranteed by hashmap) + let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect(); + external_ids.sort_by(|(l, _), (r, _)| l.cmp(r)); let mut snap = String::new(); - writeln!(&mut snap, "soft:").unwrap(); - let stream_soft = soft.stream(); - let soft_external_ids = stream_soft.into_str_vec().unwrap(); - for (key, id) in soft_external_ids { - writeln!(&mut snap, "{key:<24} {id}").unwrap(); - } - writeln!(&mut snap, "hard:").unwrap(); - let stream_hard = hard.stream(); - let hard_external_ids = stream_hard.into_str_vec().unwrap(); - for (key, id) in hard_external_ids { + writeln!(&mut snap, "docids:").unwrap(); + for (key, id) in external_ids { writeln!(&mut snap, "{key:<24} {id}").unwrap(); } snap } -pub fn snap_number_faceted_documents_ids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let mut snap = String::new(); - for field_id in fields_ids_map.ids() { - let number_faceted_documents_ids = - index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap(); - writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) - .unwrap(); - } - snap -} -pub fn snap_string_faceted_documents_ids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let mut snap = String::new(); - for field_id in fields_ids_map.ids() { - let string_faceted_documents_ids = - index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap(); - writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) - .unwrap(); - } - snap -} pub fn snap_words_fst(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let words_fst = index.words_fst(&rtxn).unwrap(); @@ -516,9 +463,6 @@ macro_rules! full_snap_of_db { ($index:ident, stop_words) => {{ $crate::snapshot_tests::snap_stop_words(&$index) }}; - ($index:ident, soft_deleted_documents_ids) => {{ - $crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index) - }}; ($index:ident, field_distribution) => {{ $crate::snapshot_tests::snap_field_distributions(&$index) }}; @@ -531,12 +475,6 @@ macro_rules! 
full_snap_of_db { ($index:ident, external_documents_ids) => {{ $crate::snapshot_tests::snap_external_documents_ids(&$index) }}; - ($index:ident, number_faceted_documents_ids) => {{ - $crate::snapshot_tests::snap_number_faceted_documents_ids(&$index) - }}; - ($index:ident, string_faceted_documents_ids) => {{ - $crate::snapshot_tests::snap_string_faceted_documents_ids(&$index) - }}; ($index:ident, words_fst) => {{ $crate::snapshot_tests::snap_words_fst(&$index) }}; diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs index 784bee5a7..f460693ba 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_documents_ids.rs @@ -8,16 +8,11 @@ pub struct AvailableDocumentsIds { } impl AvailableDocumentsIds { - pub fn from_documents_ids( - docids: &RoaringBitmap, - soft_deleted_docids: &RoaringBitmap, - ) -> AvailableDocumentsIds { - let used_docids = docids | soft_deleted_docids; - - match used_docids.max() { + pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds { + match docids.max() { Some(last_id) => { let mut available = RoaringBitmap::from_iter(0..last_id); - available -= used_docids; + available -= docids; let iter = match last_id.checked_add(1) { Some(id) => id..=u32::max_value(), @@ -50,7 +45,7 @@ mod tests { #[test] fn empty() { let base = RoaringBitmap::new(); - let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); + let left = AvailableDocumentsIds::from_documents_ids(&base); let right = 0..=u32::max_value(); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } @@ -63,28 +58,8 @@ mod tests { base.insert(100); base.insert(405); - let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); + let left = AvailableDocumentsIds::from_documents_ids(&base); let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } - - #[test] - fn soft_deleted() { - let mut base = RoaringBitmap::new(); - base.insert(0); - base.insert(10); - base.insert(100); - base.insert(405); - - let mut soft_deleted = RoaringBitmap::new(); - soft_deleted.insert(1); - soft_deleted.insert(11); - soft_deleted.insert(101); - soft_deleted.insert(406); - - let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted); - let right = - (0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n)); - left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); - } } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ab42fd854..59adda3e8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,16 +1,16 @@ +use heed::RwTxn; use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::facet::FacetType; -use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; +use crate::{FieldDistribution, Index, Result}; -pub struct ClearDocuments<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, +pub struct ClearDocuments<'t, 'i> { + wtxn: &'t mut RwTxn<'i>, index: &'i Index, } -impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> ClearDocuments<'t, 'u, 'i> { +impl<'t, 'i> ClearDocuments<'t, 'i> { + pub fn new(wtxn: &'t mut RwTxn<'i>, index: &'i Index) -> ClearDocuments<'t, 'i> { ClearDocuments { wtxn, index } } @@ -21,13 +21,12 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { let Index { env: 
_env, main: _main, + external_documents_ids, word_docids, exact_word_docids, word_prefix_docids, exact_word_prefix_docids, word_pair_proximity_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, word_position_docids, word_fid_docids, field_id_word_count_docids, @@ -51,43 +50,23 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We retrieve the number of documents ids that we are deleting. let number_of_documents = self.index.number_of_documents(self.wtxn)?; - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; - self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &empty_roaring)?; - self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; self.index.delete_vector_hnsw(self.wtxn)?; - // We clean all the faceted documents ids. - for field_id in faceted_fields { - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::Number, - &empty_roaring, - )?; - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::String, - &empty_roaring, - )?; - } - // Clear the other databases. + external_documents_ids.clear(self.wtxn)?; word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; exact_word_prefix_docids.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; - word_prefix_pair_proximity_docids.clear(self.wtxn)?; - prefix_word_pair_proximity_docids.clear(self.wtxn)?; word_position_docids.clear(self.wtxn)?; word_fid_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; @@ -140,7 +119,7 @@ mod tests { assert!(index.words_fst(&rtxn).unwrap().is_empty()); assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); - assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); + assert!(index.external_documents_ids().is_empty(&rtxn).unwrap()); assert!(index.documents_ids(&rtxn).unwrap().is_empty()); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); assert!(index.geo_rtree(&rtxn).unwrap().is_none()); @@ -150,7 +129,6 @@ mod tests { assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); - assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs new file mode 100644 index 000000000..794beb5df --- /dev/null +++ b/milli/src/update/del_add.rs @@ -0,0 +1,125 @@ +use obkv::Key; + +pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>; +pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>; + +/// DelAdd defines the new value to add in the database and the old value to delete from the database. +/// +/// It's used in an OBKV to be serialized in grenad files.
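+/// As a minimal sketch of the intended round-trip (assuming the obkv `KvWriter`/`KvReader` API behind the aliases above; obkv requires keys to be inserted in ascending order, which `Deletion = 0` before `Addition = 1` satisfies): +/// +/// let mut buffer = Vec::new(); +/// let mut writer = KvWriterDelAdd::new(&mut buffer); +/// writer.insert(DelAdd::Deletion, b"old value")?; +/// writer.insert(DelAdd::Addition, b"new value")?; +/// writer.finish()?; +/// assert_eq!(KvReaderDelAdd::new(&buffer).get(DelAdd::Addition), Some(&b"new value"[..]));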
+#[repr(u8)] +#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)] +pub enum DelAdd { + Deletion = 0, + Addition = 1, +} + +impl Key for DelAdd { + const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>(); + type BYTES = [u8; Self::BYTES_SIZE]; + + fn to_be_bytes(&self) -> Self::BYTES { + u8::to_be_bytes(*self as u8) + } + + fn from_be_bytes(array: Self::BYTES) -> Self { + match u8::from_be_bytes(array) { + 0 => Self::Deletion, + 1 => Self::Addition, + otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise), + } + } +} + +/// Creates a Kv<K, Kv<DelAdd, value>> from a Kv<K, value>. +/// +/// Deletion: put all the values under DelAdd::Deletion, +/// Addition: put all the values under DelAdd::Addition, +/// DeletionAndAddition: put all the values under both DelAdd::Deletion and DelAdd::Addition. +pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>( + reader: obkv::KvReader<K>, + operation: DelAddOperation, + buffer: &mut Vec<u8>, +) -> Result<(), std::io::Error> { + let mut writer = obkv::KvWriter::new(buffer); + let mut value_buffer = Vec::new(); + for (key, value) in reader.iter() { + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) { + value_writer.insert(DelAdd::Deletion, value)?; + } + if matches!(operation, DelAddOperation::Addition | DelAddOperation::DeletionAndAddition) { + value_writer.insert(DelAdd::Addition, value)?; + } + value_writer.finish()?; + writer.insert(key, &value_buffer)?; + } + + writer.finish() +} + +/// Enum controlling the side of the DelAdd obkv in which the provided value will be written. +#[derive(Debug, Clone, Copy)] +pub enum DelAddOperation { + Deletion, + Addition, + DeletionAndAddition, +} + +/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>, +/// +/// putting each deletion obkv's keys under a DelAdd::Deletion +/// and putting each addition obkv's keys under a DelAdd::Addition. +pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>( + deletion: obkv::KvReader<K>, + addition: obkv::KvReader<K>, + buffer: &mut Vec<u8>, +) -> Result<(), std::io::Error> { + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + let mut writer = obkv::KvWriter::new(buffer); + let mut value_buffer = Vec::new(); + + for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) { + value_buffer.clear(); + match eob { + Left((k, v)) => { + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Deletion, v).unwrap(); + writer.insert(k, value_writer.into_inner()?).unwrap(); + } + Right((k, v)) => { + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Addition, v).unwrap(); + writer.insert(k, value_writer.into_inner()?).unwrap(); + } + Both((k, deletion), (_, addition)) => { + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Deletion, deletion).unwrap(); + value_writer.insert(DelAdd::Addition, addition).unwrap(); + writer.insert(k, value_writer.into_inner()?).unwrap(); + } + } + } + + writer.finish() +} + +pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool { + del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition) +} + +/// A function that extracts and returns the Add side of a DelAdd obkv. +/// This is useful when there is no previous value in the database and +/// therefore we don't need to do a diff with what's already there. +/// +/// If there is no Add side we currently write an empty buffer +/// which is a valid CboRoaringBitmap.
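+/// +/// A sketch of the expected call pattern (hypothetical inputs; `del_add_bytes` stands for some serialized DelAdd obkv): +/// +/// let mut scratch = Vec::new(); +/// let add_side = deladd_serialize_add_side(del_add_bytes, &mut scratch)?; +/// // `add_side` is the raw Addition payload, or an empty slice when that side is absent.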
+#[allow(clippy::ptr_arg)] // required to avoid signature mismatch +pub fn deladd_serialize_add_side<'a>( + obkv: &'a [u8], + _buffer: &mut Vec<u8>, +) -> crate::Result<&'a [u8]> { + Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) +} diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs deleted file mode 100644 index 164ad0c7e..000000000 --- a/milli/src/update/delete_documents.rs +++ /dev/null @@ -1,1255 +0,0 @@ -use std::collections::btree_map::Entry; -use std::collections::{BTreeSet, HashMap, HashSet}; - -use fst::IntoStreamer; -use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice}; -use heed::{BytesDecode, BytesEncode, Database, RwIter}; -use instant_distance::PointId; -use roaring::RoaringBitmap; -use serde::{Deserialize, Serialize}; -use time::OffsetDateTime; - -use super::facet::delete::FacetsDelete; -use super::ClearDocuments; -use crate::error::InternalError; -use crate::facet::FacetType; -use crate::heed_codec::facet::FieldDocIdFacetCodec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::index::Hnsw; -use crate::{ - ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32, -}; - -pub struct DeleteDocuments<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - external_documents_ids: ExternalDocumentsIds<'static>, - to_delete_docids: RoaringBitmap, - strategy: DeletionStrategy, -} - -/// Result of a [`DeleteDocuments`] operation. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DocumentDeletionResult { - pub deleted_documents: u64, - pub remaining_documents: u64, -} - -/// Strategy for deleting documents. -/// -/// - Soft-deleted documents are simply marked as deleted without being actually removed from the DB. -/// - Hard-deleted documents are definitely suppressed from the DB. -/// -/// Soft-deleted documents trade disk space for runtime performance. -/// -/// Note that any of these variants can be used at any given moment for any indexation in a database. -/// For instance, you can use an [`AlwaysSoft`] followed by an [`AlwaysHard`] option without issue. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] -pub enum DeletionStrategy { - #[default] - /// Definitely suppress documents according to the number or size of soft-deleted documents - Dynamic, - /// Never definitely suppress documents - AlwaysSoft, - /// Always definitely suppress documents - AlwaysHard, -} - -impl std::fmt::Display for DeletionStrategy { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - DeletionStrategy::Dynamic => write!(f, "dynamic"), - DeletionStrategy::AlwaysSoft => write!(f, "always_soft"), - DeletionStrategy::AlwaysHard => write!(f, "always_hard"), - } - } -} - -/// Result of a [`DeleteDocuments`] operation, used for internal purposes. -/// -/// It is a superset of the [`DocumentDeletionResult`] structure, giving -/// additional information about the algorithm used to delete the documents.
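For context, the removed builder was driven roughly as follows, a sketch assembled from the methods of this deleted file (the external id is hypothetical): let mut builder = DeleteDocuments::new(&mut wtxn, &index)?; builder.strategy(DeletionStrategy::AlwaysHard); builder.delete_external_id("doc-42"); let DocumentDeletionResult { deleted_documents, remaining_documents } = builder.execute()?;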
-#[derive(Debug)] -pub(crate) struct DetailedDocumentDeletionResult { - pub deleted_documents: u64, - pub remaining_documents: u64, -} - -impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> Result> { - let external_documents_ids = index.external_documents_ids(wtxn)?.into_static(); - - Ok(DeleteDocuments { - wtxn, - index, - external_documents_ids, - to_delete_docids: RoaringBitmap::new(), - strategy: Default::default(), - }) - } - - pub fn strategy(&mut self, strategy: DeletionStrategy) { - self.strategy = strategy; - } - - pub fn delete_document(&mut self, docid: u32) { - self.to_delete_docids.insert(docid); - } - - pub fn delete_documents(&mut self, docids: &RoaringBitmap) { - self.to_delete_docids |= docids; - } - - pub fn delete_external_id(&mut self, external_id: &str) -> Option { - let docid = self.external_documents_ids.get(external_id)?; - self.delete_document(docid); - Some(docid) - } - - pub fn execute(self) -> Result { - let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } = - self.execute_inner()?; - - Ok(DocumentDeletionResult { deleted_documents, remaining_documents }) - } - - pub(crate) fn execute_inner(mut self) -> Result { - puffin::profile_function!(); - - self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; - - // We retrieve the current documents ids that are in the database. - let mut documents_ids = self.index.documents_ids(self.wtxn)?; - let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?; - let current_documents_ids_len = documents_ids.len(); - - // We can and must stop removing documents in a database that is empty. - if documents_ids.is_empty() { - // but if there was still documents to delete we clear the database entirely - if !soft_deleted_docids.is_empty() { - ClearDocuments::new(self.wtxn, self.index).execute()?; - } - return Ok(DetailedDocumentDeletionResult { - deleted_documents: 0, - remaining_documents: 0, - }); - } - - // We remove the documents ids that we want to delete - // from the documents in the database and write them back. - documents_ids -= &self.to_delete_docids; - self.index.put_documents_ids(self.wtxn, &documents_ids)?; - - // We can execute a ClearDocuments operation when the number of documents - // to delete is exactly the number of documents in the database. 
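Equivalently, as a sketch built from the APIs above, asking to delete every document falls back to a full clear: let all_ids = index.documents_ids(&wtxn)?; let mut builder = DeleteDocuments::new(&mut wtxn, &index)?; builder.delete_documents(&all_ids); builder.execute()?; // internally takes the ClearDocuments path shown just below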
- if current_documents_ids_len == self.to_delete_docids.len() { - let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?; - return Ok(DetailedDocumentDeletionResult { - deleted_documents: current_documents_ids_len, - remaining_documents, - }); - } - - let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let mut field_distribution = self.index.field_distribution(self.wtxn)?; - - // we update the field distribution - for docid in self.to_delete_docids.iter() { - let key = BEU32::new(docid); - let document = - self.index.documents.get(self.wtxn, &key)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: "documents", key: None }, - )?; - for (fid, _value) in document.iter() { - let field_name = - fields_ids_map.name(fid).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: fid, - process: "delete documents", - })?; - if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) - { - match entry.get().checked_sub(1) { - Some(0) | None => entry.remove(), - Some(count) => entry.insert(count), - }; - } - } - } - - self.index.put_field_distribution(self.wtxn, &field_distribution)?; - - soft_deleted_docids |= &self.to_delete_docids; - - // We always soft-delete the documents, even if they will be permanently - // deleted immediately after. - self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; - - // decide for a hard or soft deletion depending on the strategy - let soft_deletion = match self.strategy { - DeletionStrategy::Dynamic => { - // decide to keep the soft deleted in the DB for now if they meet 2 criteria: - // 1. There is less than a fixed rate of 50% of soft-deleted to actual documents, *and* - // 2. Soft-deleted occupy an average of less than a fixed size on disk - - let size_used = self.index.used_size()?; - let nb_documents = self.index.number_of_documents(self.wtxn)?; - let nb_soft_deleted = soft_deleted_docids.len(); - - (nb_soft_deleted < nb_documents) && { - const SOFT_DELETED_SIZE_BYTE_THRESHOLD: u64 = 1_073_741_824; // 1GiB - - // nb_documents + nb_soft_deleted !=0 because if nb_documents is 0 we short-circuit earlier, and then we moved the documents to delete - // from the documents_docids to the soft_deleted_docids. 
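A worked example with assumed numbers: with size_used = 10 GiB, nb_documents = 1_000_000 and nb_soft_deleted = 400_000, estimated_document_size = 10 GiB / 1_400_000 ≈ 7.5 KiB, so estimated_size_used_by_soft_deleted ≈ 400_000 × 7.5 KiB ≈ 2.9 GiB, which exceeds the 1 GiB threshold below and forces a hard deletion even though 400_000 < 1_000_000.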
- let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); - let estimated_size_used_by_soft_deleted = - estimated_document_size * nb_soft_deleted; - estimated_size_used_by_soft_deleted < SOFT_DELETED_SIZE_BYTE_THRESHOLD - } - } - DeletionStrategy::AlwaysSoft => true, - DeletionStrategy::AlwaysHard => false, - }; - - if soft_deletion { - // Keep the soft-deleted in the DB - return Ok(DetailedDocumentDeletionResult { - deleted_documents: self.to_delete_docids.len(), - remaining_documents: documents_ids.len(), - }); - } - - self.to_delete_docids = soft_deleted_docids; - - let Index { - env: _env, - main: _main, - word_docids, - exact_word_docids, - word_prefix_docids, - exact_word_prefix_docids, - word_pair_proximity_docids, - field_id_word_count_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, - word_position_docids, - word_prefix_position_docids, - word_fid_docids, - word_prefix_fid_docids, - facet_id_f64_docids: _, - facet_id_string_docids: _, - facet_id_normalized_string_strings: _, - facet_id_string_fst: _, - field_id_docid_facet_f64s: _, - field_id_docid_facet_strings: _, - script_language_docids, - facet_id_exists_docids, - facet_id_is_null_docids, - facet_id_is_empty_docids, - vector_id_docid, - documents, - } = self.index; - // Remove from the documents database - for docid in &self.to_delete_docids { - documents.delete(self.wtxn, &BEU32::new(docid))?; - } - // We acquire the current external documents ids map... - // Note that its soft-deleted document ids field will be equal to the `to_delete_docids` - let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - // We then remove the soft-deleted docids from it - new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; - // and write it back to the main database. - let new_external_documents_ids = new_external_documents_ids.into_static(); - self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; - - let mut words_to_keep = BTreeSet::default(); - let mut words_to_delete = BTreeSet::default(); - // We iterate over the words and delete the documents ids - // from the word docids database. - remove_from_word_docids( - self.wtxn, - word_docids, - &self.to_delete_docids, - &mut words_to_keep, - &mut words_to_delete, - )?; - remove_from_word_docids( - self.wtxn, - exact_word_docids, - &self.to_delete_docids, - &mut words_to_keep, - &mut words_to_delete, - )?; - - // We construct an FST set that contains the words to delete from the words FST. - let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?; - - let new_words_fst = { - // We retrieve the current words FST from the database. - let words_fst = self.index.words_fst(self.wtxn)?; - let difference = words_fst.op().add(&words_to_delete).difference(); - - // We stream the new external ids that does no more contains the to-delete external ids. - let mut new_words_fst_builder = fst::SetBuilder::memory(); - new_words_fst_builder.extend_stream(difference.into_stream())?; - - // We create an words FST set from the above builder. - new_words_fst_builder.into_set() - }; - - // We write the new words FST into the main database. 
- self.index.put_words_fst(self.wtxn, &new_words_fst)?; - - let prefixes_to_delete = - remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.to_delete_docids)?; - - let exact_prefix_to_delete = remove_from_word_prefix_docids( - self.wtxn, - exact_word_prefix_docids, - &self.to_delete_docids, - )?; - - let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); - - // We compute the new prefix FST and write it only if there is a change. - if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() { - let new_words_prefixes_fst = { - // We retrieve the current words prefixes FST from the database. - let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?; - let difference = - words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference(); - - // We stream the new external ids that does no more contains the to-delete external ids. - let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory(); - new_words_prefixes_fst_builder.extend_stream(difference.into_stream())?; - - // We create an words FST set from the above builder. - new_words_prefixes_fst_builder.into_set() - }; - - // We write the new words prefixes FST into the main database. - self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?; - } - - for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { - // We delete the documents ids from the word prefix pair proximity database docids - // and remove the empty pairs too. - Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?; - } - Self::delete_from_db( - word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_position_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - - // Remove the documents ids from the field id word count database. - Self::delete_from_db( - field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - - if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { - let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; - - let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree - .iter() - .filter(|&point| self.to_delete_docids.contains(point.data.0)) - .cloned() - .map(|point| (point, point.data.0)) - .unzip(); - points_to_remove.iter().for_each(|point| { - rtree.remove(point); - }); - geo_faceted_doc_ids -= docids_to_remove; - - self.index.put_geo_rtree(self.wtxn, &rtree)?; - self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; - } - - for facet_type in [FacetType::Number, FacetType::String] { - let mut affected_facet_values = HashMap::new(); - for field_id in self.index.faceted_fields_ids(self.wtxn)? 
{ - // Remove docids from the number faceted documents ids - let mut docids = - self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?; - docids -= &self.to_delete_docids; - self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; - - let facet_values = remove_docids_from_field_id_docid_facet_value( - self.index, - self.wtxn, - facet_type, - field_id, - &self.to_delete_docids, - )?; - if !facet_values.is_empty() { - affected_facet_values.insert(field_id, facet_values); - } - } - FacetsDelete::new( - self.index, - facet_type, - affected_facet_values, - &self.to_delete_docids, - ) - .execute(self.wtxn)?; - } - - // Remove the documents ids from the script language database. - Self::delete_from_db( - script_language_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_docids( - self.wtxn, - facet_id_exists_docids, - &self.to_delete_docids, - )?; - - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_docids( - self.wtxn, - facet_id_is_null_docids, - &self.to_delete_docids, - )?; - - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_docids( - self.wtxn, - facet_id_is_empty_docids, - &self.to_delete_docids, - )?; - - // An ugly and slow way to remove the vectors from the HNSW - // It basically reconstructs the HNSW from scratch without editing the current one. - if let Some(current_hnsw) = self.index.vector_hnsw(self.wtxn)? { - let mut points = Vec::new(); - let mut docids = Vec::new(); - for result in vector_id_docid.iter(self.wtxn)? { - let (vector_id, docid) = result?; - if !self.to_delete_docids.contains(docid.get()) { - let pid = PointId::from(vector_id.get()); - let vector = current_hnsw[pid].clone(); - points.push(vector); - docids.push(docid); - } - } - - let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points); - - vector_id_docid.clear(self.wtxn)?; - for (pid, docid) in pids.into_iter().zip(docids) { - vector_id_docid.put(self.wtxn, &BEU32::new(pid.into_inner()), &docid)?; - } - self.index.put_vector_hnsw(self.wtxn, &new_hnsw)?; - } - - self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; - - Ok(DetailedDocumentDeletionResult { - deleted_documents: self.to_delete_docids.len(), - remaining_documents: documents_ids.len(), - }) - } - - fn delete_from_db( - mut iter: RwIter, C>, - to_delete_docids: &RoaringBitmap, - ) -> Result<()> - where - C: for<'a> BytesDecode<'a, DItem = RoaringBitmap> - + for<'a> BytesEncode<'a, EItem = RoaringBitmap>, - { - puffin::profile_function!(); - - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? }; - } - } - Ok(()) - } -} - -fn remove_from_word_prefix_docids( - txn: &mut heed::RwTxn, - db: &Database, - to_remove: &RoaringBitmap, -) -> Result>> { - puffin::profile_function!(); - - let mut prefixes_to_delete = fst::SetBuilder::memory(); - - // We iterate over the word prefix docids database and remove the deleted documents ids - // from every docids lists. 
We register the empty prefixes in an fst Set for futur deletion. - let mut iter = db.iter_mut(txn)?; - while let Some(result) = iter.next() { - let (prefix, mut docids) = result?; - let prefix = prefix.to_owned(); - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - prefixes_to_delete.insert(prefix)?; - } else if docids.len() != previous_len { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&prefix, &docids)? }; - } - } - - Ok(prefixes_to_delete.into_set()) -} - -fn remove_from_word_docids( - txn: &mut heed::RwTxn, - db: &heed::Database, - to_remove: &RoaringBitmap, - words_to_keep: &mut BTreeSet, - words_to_remove: &mut BTreeSet, -) -> Result<()> { - puffin::profile_function!(); - - // We create an iterator to be able to get the content and delete the word docids. - // It's faster to acquire a cursor to get and delete or put, as we avoid traversing - // the LMDB B-Tree two times but only once. - let mut iter = db.iter_mut(txn)?; - while let Some((key, mut docids)) = iter.next().transpose()? { - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - words_to_remove.insert(key.to_owned()); - } else { - words_to_keep.insert(key.to_owned()); - if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } - } - - Ok(()) -} - -fn remove_docids_from_field_id_docid_facet_value( - index: &Index, - wtxn: &mut heed::RwTxn, - facet_type: FacetType, - field_id: FieldId, - to_remove: &RoaringBitmap, -) -> heed::Result>> { - puffin::profile_function!(); - - let db = match facet_type { - FacetType::String => { - index.field_id_docid_facet_strings.remap_types::() - } - FacetType::Number => { - index.field_id_docid_facet_f64s.remap_types::() - } - }; - let mut all_affected_facet_values = HashSet::default(); - let mut iter = db - .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? - .remap_key_type::>(); - - while let Some(result) = iter.next() { - let ((_, docid, facet_value), _) = result?; - if to_remove.contains(docid) { - if !all_affected_facet_values.contains(facet_value) { - all_affected_facet_values.insert(facet_value.to_owned()); - } - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } - } - - Ok(all_affected_facet_values) -} - -fn remove_docids_from_facet_id_docids<'a, C>( - wtxn: &'a mut heed::RwTxn, - db: &heed::Database, - to_remove: &RoaringBitmap, -) -> heed::Result<()> -where - C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, -{ - puffin::profile_function!(); - - let mut iter = db.remap_key_type::().iter_mut(wtxn)?; - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? 
}; - } - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use big_s::S; - use heed::RwTxn; - use maplit::hashset; - - use super::*; - use crate::index::tests::TempIndex; - use crate::{db_snap, Filter, Search}; - - fn delete_documents<'t>( - wtxn: &mut RwTxn<'t, '_>, - index: &'t Index, - external_ids: &[&str], - strategy: DeletionStrategy, - ) -> Vec { - let external_document_ids = index.external_documents_ids(wtxn).unwrap(); - let ids_to_delete: Vec = external_ids - .iter() - .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) - .collect(); - - // Delete some documents. - let mut builder = DeleteDocuments::new(wtxn, index).unwrap(); - builder.strategy(strategy); - external_ids.iter().for_each(|id| { - builder.delete_external_id(id); - }); - builder.execute().unwrap(); - - ids_to_delete - } - - fn delete_documents_with_numbers_as_primary_key_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, - { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, - { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } - ]), - ) - .unwrap(); - - // delete those documents, ids are synchronous therefore 0, 1, and 2. - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_document(0); - builder.delete_document(1); - builder.delete_document(2); - builder.strategy(deletion_strategy); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - // All these snapshots should be empty since the database was cleared - db_snap!(index, documents_ids, deletion_strategy); - db_snap!(index, word_docids, deletion_strategy); - db_snap!(index, word_pair_proximity_docids, deletion_strategy); - db_snap!(index, facet_id_exists_docids, deletion_strategy); - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - - let rtxn = index.read_txn().unwrap(); - - assert!(index.field_distribution(&rtxn).unwrap().is_empty()); - } - - #[test] - fn delete_documents_with_numbers_as_primary_key() { - delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysHard); - delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysSoft); - } - - fn delete_documents_with_strange_primary_key_(strategy: DeletionStrategy) { - let index = TempIndex::new(); - - index - .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) - .unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "mysuperid": 0, "name": "kevin" }, - { "mysuperid": 1, "name": "kevina" }, - { "mysuperid": 2, "name": "benoit" } - ]), - ) - .unwrap(); - wtxn.commit().unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - - // Delete not all of the documents but some of them. 
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_external_id("0"); - builder.delete_external_id("1"); - builder.strategy(strategy); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, strategy); - db_snap!(index, word_docids, strategy); - db_snap!(index, word_pair_proximity_docids, strategy); - db_snap!(index, soft_deleted_documents_ids, strategy); - } - - #[test] - fn delete_documents_with_strange_primary_key() { - delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysHard); - delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysSoft); - } - - fn filtered_placeholder_search_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label"), S("label2") }); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { "docid": "1_37", "label": ["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"], deletion_strategy); - - // Placeholder search with filter - let filter = Filter::from_str("label = sign").unwrap().unwrap(); - let results = index.search(&wtxn).filter(filter).execute().unwrap(); - assert!(results.documents_ids.is_empty()); - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - db_snap!(index, word_docids, deletion_strategy); - db_snap!(index, facet_id_f64_docids, deletion_strategy); - db_snap!(index, word_pair_proximity_docids, deletion_strategy); - db_snap!(index, facet_id_exists_docids, deletion_strategy); - db_snap!(index, facet_id_string_docids, deletion_strategy); - } - - #[test] - fn filtered_placeholder_search_should_not_return_deleted_documents() { - filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysHard, - ); - filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysSoft, - ); - } - - fn placeholder_search_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - let index = TempIndex::new(); - - let mut wtxn = 
index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { "docid": "1_37", "label": ["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"], deletion_strategy); - - // Placeholder search - let results = index.search(&wtxn).execute().unwrap(); - assert!(!results.documents_ids.is_empty()); - for id in results.documents_ids.iter() { - assert!( - !deleted_internal_ids.contains(id), - "The document {} was supposed to be deleted", - id - ); - } - - wtxn.commit().unwrap(); - } - - #[test] - fn placeholder_search_should_not_return_deleted_documents() { - placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn search_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { "docid": "1_37", "label": ["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": 
["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy); - - // search for abstract - let results = index.search(&wtxn).query("abstract").execute().unwrap(); - assert!(!results.documents_ids.is_empty()); - for id in results.documents_ids.iter() { - assert!( - !deleted_internal_ids.contains(id), - "The document {} was supposed to be deleted", - id - ); - } - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - } - - #[test] - fn search_should_not_return_deleted_documents() { - search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn geo_filtered_placeholder_search_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("id")); - settings.set_filterable_fields(hashset!(S("_geo"))); - settings.set_sortable_fields(hashset!(S("_geo"))); - }) - .unwrap(); - - index.add_documents_using_wtxn(&mut wtxn, documents!([ - { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, - { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, - { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, - { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, - { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } }, - { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } }, - { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } }, - { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } }, - { "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } }, - { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } }, - { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } }, - { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } }, - { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } }, - { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } }, - { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } }, - { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } }, - { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } }, - { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, - { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, - { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } } - ])).unwrap(); - - let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &external_ids_to_delete, deletion_strategy); - - // Placeholder search with geo filter - let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); - let results = index.search(&wtxn).filter(filter).execute().unwrap(); - assert!(!results.documents_ids.is_empty()); - for id in results.documents_ids.iter() { - assert!( 
- !deleted_internal_ids.contains(id), - "The document {} was supposed to be deleted", - id - ); - } - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - db_snap!(index, facet_id_f64_docids, deletion_strategy); - db_snap!(index, facet_id_string_docids, deletion_strategy); - } - - #[test] - fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { - geo_filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysHard, - ); - geo_filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysSoft, - ); - } - - fn get_documents_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { "docid": "1_37", "label": ["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - let deleted_external_ids = ["1_7", "1_52"]; - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &deleted_external_ids, deletion_strategy); - - // list all documents - let results = index.all_documents(&wtxn).unwrap(); - for result in results { - let (id, _) = result.unwrap(); - assert!( - !deleted_internal_ids.contains(&id), - "The document {} was supposed to be deleted", - id - ); - } - - // list internal document ids - let results = index.documents_ids(&wtxn).unwrap(); - for id in results { - assert!( - !deleted_internal_ids.contains(&id), - "The document {} was supposed to be deleted", - id - ); - } - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - - // get internal docids from deleted external document ids - let results = index.external_documents_ids(&rtxn).unwrap(); - for id in deleted_external_ids { - assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id); - } - drop(rtxn); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - } - - #[test] - fn get_documents_should_not_return_deleted_documents() { - get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn 
stats_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index.add_documents_using_wtxn(&mut wtxn, documents!([ - { "docid": "1_4", "label": ["sign"]}, - { "docid": "1_5", "label": ["letter"]}, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, - { "docid": "1_36", "label": ["drawing","painting","pattern"]}, - { "docid": "1_37", "label": ["art","drawing","outdoor"]}, - { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"}, - { "docid": "1_39", "label": ["abstract"]}, - { "docid": "1_40", "label": ["cartoon"]}, - { "docid": "1_41", "label": ["art","drawing"]}, - { "docid": "1_42", "label": ["art","pattern"]}, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, - { "docid": "1_44", "label": ["drawing"], "number": 44i32}, - { "docid": "1_45", "label": ["art"]}, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, - { "docid": "1_47", "label": ["abstract","pattern"]}, - { "docid": "1_52", "label": ["abstract","cartoon"]}, - { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, - { "docid": "1_58", "label": ["abstract","art","cartoon"]}, - { "docid": "1_68", "label": ["design"]}, - { "docid": "1_69", "label": ["geometry"]} - ])).unwrap(); - - delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy); - - // count internal documents - let results = index.number_of_documents(&wtxn).unwrap(); - assert_eq!(18, results); - - // count field distribution - let results = index.field_distribution(&wtxn).unwrap(); - assert_eq!(Some(&18), results.get("label")); - assert_eq!(Some(&1), results.get("title")); - assert_eq!(Some(&2), results.get("number")); - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - } - - #[test] - fn stats_should_not_return_deleted_documents() { - stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn stored_detected_script_and_language_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - use charabia::{Language, Script}; - let index = TempIndex::new(); - let mut wtxn = index.write_txn().unwrap(); - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, - { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, - { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" 
}, - { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" }, - { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" }, - { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" }, - ])) - .unwrap(); - - let key_cmn = (Script::Cj, Language::Cmn); - let cj_cmn_docs = - index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default(); - let mut expected_cj_cmn_docids = RoaringBitmap::new(); - expected_cj_cmn_docids.push(1); - expected_cj_cmn_docids.push(5); - assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); - - delete_documents(&mut wtxn, &index, &["1"], deletion_strategy); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let cj_cmn_docs = - index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default(); - let mut expected_cj_cmn_docids = RoaringBitmap::new(); - expected_cj_cmn_docids.push(5); - assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); - } - - #[test] - fn stored_detected_script_and_language_should_not_return_deleted_documents() { - stored_detected_script_and_language_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysHard, - ); - stored_detected_script_and_language_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysSoft, - ); - } - - #[test] - fn delete_words_exact_attributes() { - let index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key(S("id")); - settings.set_searchable_fields(vec![S("text"), S("exact")]); - settings.set_exact_attributes(vec![S("exact")].into_iter().collect()); - }) - .unwrap(); - - index - .add_documents(documents!([ - { "id": 0, "text": "hello" }, - { "id": 1, "exact": "hello"} - ])) - .unwrap(); - db_snap!(index, word_docids, 1, @r###" - hello [0, ] - "###); - db_snap!(index, exact_word_docids, 1, @r###" - hello [1, ] - "###); - db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); - - let mut wtxn = index.write_txn().unwrap(); - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &["1"], DeletionStrategy::AlwaysHard); - wtxn.commit().unwrap(); - - db_snap!(index, word_docids, 2, @r###" - hello [0, ] - "###); - db_snap!(index, exact_word_docids, 2, @""); - db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); - - insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]"); - let txn = index.read_txn().unwrap(); - let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap(); - insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###); - - let mut s = Search::new(&txn, &index); - s.query("hello"); - let crate::SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); - } -} diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index a3f0c8f71..3bd4cf5f5 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,10 +1,9 @@ -use std::borrow::Cow; use std::fs::File; use std::io::BufReader; use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use heed::types::Bytes; +use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn}; use roaring::RoaringBitmap; use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; @@ -12,18 +11,16 @@ use crate::facet::FacetType; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; -use crate::heed_codec::ByteSliceRefCodec; +use crate::heed_codec::BytesRefCodec; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; -use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result}; /// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases /// by rebuilding the database "from scratch". /// /// First, the new elements are inserted into the level 0 of the database. Then, the /// higher levels are cleared and recomputed from the content of level 0. -/// -/// Finally, the `faceted_documents_ids` value in the main database of `Index` -/// is updated to contain the new set of faceted documents. pub struct FacetsUpdateBulk<'i> { index: &'i Index, group_size: u8, min_level_size: u8, facet_type: FacetType, field_ids: Vec<FieldId>, // None if level 0 does not need to be updated - new_data: Option<grenad::Reader<BufReader<File>>>, + delta_data: Option<grenad::Reader<BufReader<File>>>, } impl<'i> FacetsUpdateBulk<'i> { @@ -39,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> { index: &'i Index, field_ids: Vec<FieldId>, facet_type: FacetType, - new_data: grenad::Reader<BufReader<File>>, + delta_data: grenad::Reader<BufReader<File>>, group_size: u8, min_level_size: u8, ) -> FacetsUpdateBulk<'i> { @@ -49,7 +46,7 @@ impl<'i> FacetsUpdateBulk<'i> { group_size, min_level_size, facet_type, - new_data: Some(new_data), + delta_data: Some(delta_data), } } @@ -64,29 +61,26 @@ impl<'i> FacetsUpdateBulk<'i> { group_size: FACET_GROUP_SIZE, min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, - new_data: None, + delta_data: None, } } #[logging_timer::time("FacetsUpdateBulk::{}")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { - let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; + let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self; let db = match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), + FacetType::String => { + index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>() + } FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>() + index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>() } }; - let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; + let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size }; - inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { - index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; - Ok(()) - })?; + inner.update(wtxn, &field_ids)?; Ok(()) } @@ -94,32 +88,25 @@ impl<'i> FacetsUpdateBulk<'i> { /// Implementation of `FacetsUpdateBulk` that is independent of milli's
`Index` type pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> { - pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, - pub new_data: Option<grenad::Reader<R>>, + pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, + pub delta_data: Option<grenad::Reader<R>>, pub group_size: u8, pub min_level_size: u8, } impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { - pub fn update( - mut self, - wtxn: &mut RwTxn, - field_ids: &[u16], - mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>, - ) -> Result<()> { + pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> { self.update_level0(wtxn)?; for &field_id in field_ids.iter() { self.clear_levels(wtxn, field_id)?; } for &field_id in field_ids.iter() { - let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?; - - handle_all_docids(wtxn, field_id, all_docids)?; + let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?; for level_reader in level_readers { let mut cursor = level_reader.into_cursor()?; while let Some((k, v)) = cursor.move_on_next()? { - self.db.remap_types::<ByteSlice, ByteSlice>().put(wtxn, k, v)?; + self.db.remap_types::<Bytes, Bytes>().put(wtxn, k, v)?; } } } @@ -133,35 +120,48 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { self.db.delete_range(wtxn, &range).map(drop)?; Ok(()) } + fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { - let new_data = match self.new_data.take() { + let delta_data = match self.delta_data.take() { Some(x) => x, None => return Ok(()), }; if self.db.is_empty(wtxn)? { let mut buffer = Vec::new(); - let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>(); - let mut cursor = new_data.into_cursor()?; + let mut database = self.db.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>(); + let mut cursor = delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if !valid_lmdb_key(key) { continue; } + let value = KvReaderDelAdd::new(value); + + // DB is empty, it is safe to ignore Del operations + let Some(value) = value.get(DelAdd::Addition) else { + continue; + }; + buffer.clear(); // the group size for level 0 buffer.push(1); // then we extend the buffer with the docids bitmap buffer.extend_from_slice(value); - unsafe { database.append(key, &buffer)? }; + unsafe { + database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, &buffer)? + }; } } else { let mut buffer = Vec::new(); - let database = self.db.remap_types::<ByteSlice, ByteSlice>(); + let database = self.db.remap_types::<Bytes, Bytes>(); - let mut cursor = new_data.into_cursor()?; + let mut cursor = delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if !valid_lmdb_key(key) { continue; } + + let value = KvReaderDelAdd::new(value); + // the value is a CboRoaringBitmap, but I still need to prepend the // group size for level 0 (= 1) to it buffer.clear(); @@ -169,17 +169,27 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { // then we extend the buffer with the docids bitmap match database.get(wtxn, key)? { Some(prev_value) => { + // prev_value is the group size for level 0, followed by the previous bitmap. let old_bitmap = &prev_value[1..]; - CboRoaringBitmapCodec::merge_into( - &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], - &mut buffer, - )?; + CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?; } None => { + // it is safe to ignore the del in that case.
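+ // (a Del for a key that is absent from the database would have removed nothing anyway, so only the Add side matters)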
+ let Some(value) = value.get(DelAdd::Addition) else { + // won't put the key in DB as the value would be empty + continue; + }; + buffer.extend_from_slice(value); } }; - database.put(wtxn, key, &buffer)?; + let new_bitmap = &buffer[1..]; + // if the new bitmap is empty, let's remove it + if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 { + database.delete(wtxn, key)?; + } else { + database.put(wtxn, key, &buffer)?; + } } } Ok(()) @@ -188,16 +198,10 @@ impl FacetsUpdateBulkInner { &self, field_id: FieldId, txn: &RoTxn, - ) -> Result<(Vec>>, RoaringBitmap)> { - let mut all_docids = RoaringBitmap::new(); - let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { - for bitmap in bitmaps { - all_docids |= bitmap; - } - Ok(()) - })?; + ) -> Result>>> { + let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?; - Ok((subwriters, all_docids)) + Ok(subwriters) } #[allow(clippy::type_complexity)] fn read_level_0<'t>( @@ -217,9 +221,9 @@ impl FacetsUpdateBulkInner { let level_0_iter = self .db - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, level_0_prefix.as_slice())? - .remap_types::, FacetGroupValueCodec>(); + .remap_types::() + .prefix_iter(rtxn, level_0_prefix.as_slice())? + .remap_types::, FacetGroupValueCodec>(); let mut left_bound: &[u8] = &[]; let mut first_iteration_for_new_group = true; @@ -305,11 +309,11 @@ impl FacetsUpdateBulkInner { bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { let key = FacetGroupKey { field_id, level, left_bound }; - let key = FacetGroupKeyCodec::::bytes_encode(&key) - .ok_or(Error::Encoding)?; + let key = FacetGroupKeyCodec::::bytes_encode(&key) + .map_err(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = - FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; + FacetGroupValueCodec::bytes_encode(&value).map_err(Error::Encoding)?; cur_writer.insert(key, value)?; cur_writer_len += 1; } @@ -334,10 +338,10 @@ impl FacetsUpdateBulkInner { bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { let key = FacetGroupKey { field_id, level, left_bound }; - let key = FacetGroupKeyCodec::::bytes_encode(&key) - .ok_or(Error::Encoding)?; + let key = FacetGroupKeyCodec::::bytes_encode(&key) + .map_err(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; - let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; + let value = FacetGroupValueCodec::bytes_encode(&value).map_err(Error::Encoding)?; cur_writer.insert(key, value)?; cur_writer_len += 1; } @@ -491,7 +495,6 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); - db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); } #[test] diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs deleted file mode 100644 index 883abc8ca..000000000 --- a/milli/src/update/facet/delete.rs +++ /dev/null @@ -1,360 +0,0 @@ -use std::collections::{HashMap, HashSet}; - -use heed::RwTxn; -use log::debug; -use roaring::RoaringBitmap; -use time::OffsetDateTime; - -use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; -use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; -use crate::heed_codec::ByteSliceRefCodec; -use crate::update::{FacetsUpdateBulk, 
FacetsUpdateIncrementalInner}; -use crate::{FieldId, Index, Result}; - -/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases. -/// -/// Depending on the number of removed elements and the existing size of the database, we use either -/// a bulk delete method or an incremental delete method. -pub struct FacetsDelete<'i, 'b> { - index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, - facet_type: FacetType, - affected_facet_values: HashMap>>, - docids_to_delete: &'b RoaringBitmap, - group_size: u8, - max_group_size: u8, - min_level_size: u8, -} -impl<'i, 'b> FacetsDelete<'i, 'b> { - pub fn new( - index: &'i Index, - facet_type: FacetType, - affected_facet_values: HashMap>>, - docids_to_delete: &'b RoaringBitmap, - ) -> Self { - let database = match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::>(), - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }; - Self { - index, - database, - facet_type, - affected_facet_values, - docids_to_delete, - group_size: FACET_GROUP_SIZE, - max_group_size: FACET_MAX_GROUP_SIZE, - min_level_size: FACET_MIN_LEVEL_SIZE, - } - } - - pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - - for (field_id, affected_facet_values) in self.affected_facet_values { - // This is an incorrect condition, since we assume that the length of the database is equal - // to the number of facet values for the given field_id. It means that in some cases, we might - // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could - // really be a performance problem is when we fully delete a large ratio of all facet values for - // each field id. This would almost never happen. Still, to be overly cautious, I have added a - // 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance - // penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead. - if affected_facet_values.len() >= (self.database.len(wtxn)? 
/ 150) { - // Bulk delete - let mut modified = false; - - for facet_value in affected_facet_values { - let key = - FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() }; - let mut old = self.database.get(wtxn, &key)?.unwrap(); - let previous_len = old.bitmap.len(); - old.bitmap -= self.docids_to_delete; - if old.bitmap.is_empty() { - modified = true; - self.database.delete(wtxn, &key)?; - } else if old.bitmap.len() != previous_len { - modified = true; - self.database.put(wtxn, &key, &old)?; - } - } - if modified { - let builder = FacetsUpdateBulk::new_not_updating_level_0( - self.index, - vec![field_id], - self.facet_type, - ); - builder.execute(wtxn)?; - } - } else { - // Incremental - let inc = FacetsUpdateIncrementalInner { - db: self.database, - group_size: self.group_size, - min_level_size: self.min_level_size, - max_group_size: self.max_group_size, - }; - for facet_value in affected_facet_values { - inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?; - } - } - } - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use std::iter::FromIterator; - - use big_s::S; - use maplit::hashset; - use rand::seq::SliceRandom; - use rand::SeedableRng; - use roaring::RoaringBitmap; - - use crate::db_snap; - use crate::documents::documents_batch_reader_from_objects; - use crate::index::tests::TempIndex; - use crate::update::facet::test_helpers::ordered_string; - use crate::update::{DeleteDocuments, DeletionStrategy}; - - #[test] - fn delete_mixed_incremental_and_bulk() { - // The point of this test is to create an index populated with documents - // containing different filterable attributes. Then, we delete a bunch of documents - // such that a mix of the incremental and bulk indexer is used (depending on the field id) - let index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index - .update_settings(|settings| { - settings.set_filterable_fields( - hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! 
{ - { - "id": i, - "label": i / 10, - "colour": i / 100, - "timestamp": i / 2, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576"); - db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf"); - - let mut wtxn = index.env.write_txn().unwrap(); - - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_documents(&RoaringBitmap::from_iter(0..100)); - // by deleting the first 100 documents, we expect that: - // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) - // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 - // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 - // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 - // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6"); - db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56"); - } - - // Same test as above but working with string values for the facets - #[test] - fn delete_mixed_incremental_and_bulk_string() { - // The point of this test is to create an index populated with documents - // containing different filterable attributes. Then, we delete a bunch of documents - // such that a mix of the incremental and bulk indexer is used (depending on the field id) - let index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index - .update_settings(|settings| { - settings.set_filterable_fields( - hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! 
{ - { - "id": i, - "label": ordered_string(i / 10), - "colour": ordered_string(i / 100), - "timestamp": ordered_string(i / 2), - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) - db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); - db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); - - let mut wtxn = index.env.write_txn().unwrap(); - - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_documents(&RoaringBitmap::from_iter(0..100)); - // by deleting the first 100 documents, we expect that: - // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) - // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 - // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 - // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 - // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc"); - db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f"); - } - - #[test] - fn delete_almost_all_incrementally_string() { - let index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index - .update_settings(|settings| { - settings.set_filterable_fields( - hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! 
{ - { - "id": i, - "label": ordered_string(i / 10), - "colour": ordered_string(i / 100), - "timestamp": ordered_string(i / 2), - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) - db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); - db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - - let mut docids_to_delete = (0..1000).collect::>(); - docids_to_delete.shuffle(&mut rng); - for docid in docids_to_delete.into_iter().take(990) { - let mut wtxn = index.env.write_txn().unwrap(); - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_documents(&RoaringBitmap::from_iter([docid])); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - } - - db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d"); - db_snap!(index, string_faceted_documents_ids, 2, @r###" - 0 [] - 1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] - 2 [292, 324, 358, 381, 493, 839, 852, ] - 3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] - "###); - } -} - -#[allow(unused)] -#[cfg(test)] -mod comparison_bench { - use std::iter::once; - - use rand::Rng; - use roaring::RoaringBitmap; - - use crate::heed_codec::facet::OrderedF64Codec; - use crate::update::facet::test_helpers::FacetIndex; - - // This is a simple test to get an intuition on the relative speed - // of the incremental vs. bulk indexer. - // - // The benchmark shows the worst-case scenario for the incremental indexer, since - // each facet value contains only one document ID. - // - // In that scenario, it appears that the incremental indexer is about 70 times slower than the - // bulk indexer. 
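That ~70x measurement is what the sizing rule in `FacetsDelete::execute` above encodes: it switches to the bulk indexer as soon as the number of affected facet values reaches `database_len / 150`, i.e. the measured 70x worst case doubled as a safety margin. A minimal sketch of that decision rule; the 2_000-entry database size below is an assumption chosen only to reproduce the `(= 13)` cutoff quoted in the removed tests above:

```rust
/// Sketch of the (removed) FacetsDelete sizing heuristic: prefer the bulk
/// indexer once the affected facet values reach 1/150th of the database,
/// i.e. twice the measured ~70x incremental worst case as a safety margin.
fn use_bulk_indexer(affected_facet_values: u64, database_len: u64) -> bool {
    affected_facet_values >= database_len / 150
}

fn main() {
    // Assumed database size of roughly 2_000 entries, which yields the
    // cutoff of 13 mentioned in the removed `delete_mixed_*` tests above.
    let database_len = 2_000;
    assert!(use_bulk_indexer(100, database_len)); // "id": bulk
    assert!(!use_bulk_indexer(10, database_len)); // "label": incremental
    assert!(!use_bulk_indexer(1, database_len)); // "colour": incremental
    assert!(use_bulk_indexer(50, database_len)); // "timestamp": bulk
}
```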
- // #[test] - fn benchmark_facet_indexing_delete() { - let mut r = rand::thread_rng(); - - for i in 1..=20 { - let size = 50_000 * i; - let index = FacetIndex::::new(4, 8, 5); - - let mut txn = index.env.write_txn().unwrap(); - let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); - for i in 0..size { - // field id = 0, left_bound = i, docids = [i] - elements.push(((0, i as f64), once(i).collect())); - } - let timer = std::time::Instant::now(); - index.bulk_insert(&mut txn, &[0], elements.iter()); - let time_spent = timer.elapsed().as_millis(); - println!("bulk {size} : {time_spent}ms"); - - txn.commit().unwrap(); - - for nbr_doc in [1, 100, 1000, 10_000] { - let mut txn = index.env.write_txn().unwrap(); - let timer = std::time::Instant::now(); - // - // delete one document - // - for _ in 0..nbr_doc { - let deleted_u32 = r.gen::() % size; - let deleted_f64 = deleted_u32 as f64; - index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32) - } - let time_spent = timer.elapsed().as_millis(); - println!(" delete {nbr_doc} : {time_spent}ms"); - txn.abort().unwrap(); - } - } - } -} diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 743c0b038..9d8ee08f4 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,19 +1,20 @@ -use std::collections::HashMap; use std::fs::File; use std::io::BufReader; -use heed::types::{ByteSlice, DecodeIgnore}; +use heed::types::{Bytes, DecodeIgnore}; use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use obkv::KvReader; use roaring::RoaringBitmap; use crate::facet::FacetType; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; -use crate::heed_codec::ByteSliceRefCodec; +use crate::heed_codec::BytesRefCodec; use crate::search::facet::get_highest_level; +use crate::update::del_add::DelAdd; use crate::update::index_documents::valid_lmdb_key; -use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use crate::{CboRoaringBitmapCodec, Index, Result}; enum InsertionResult { InPlace, @@ -28,72 +29,76 @@ enum DeletionResult { /// Algorithm to incrementally insert and delete elements into the /// `facet_id_(string/f64)_docids` databases. -/// -/// Rhe `faceted_documents_ids` value in the main database of `Index` -/// is also updated to contain the new set of faceted documents.
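The rewrite below drops exactly that bookkeeping: `FacetsUpdateIncremental` no longer maintains `faceted_documents_ids`, and its `execute` now consumes del/add deltas, applying the `DelAdd::Deletion` bitmap before the `DelAdd::Addition` one for each facet entry. A minimal sketch of those semantics against a toy level-0 map (`Level0` and `apply_delta` are illustrative stand-ins, not milli types):

```rust
use std::collections::BTreeMap;

use roaring::RoaringBitmap;

/// Toy level-0 facet database mapping a facet value to its docids; a
/// stand-in for the LMDB `facet_id_(string|f64)_docids` databases.
type Level0 = BTreeMap<String, RoaringBitmap>;

/// Applies one delta entry the way the new `execute` does: the deletion
/// side first, then the addition side, dropping keys whose bitmap empties.
fn apply_delta(
    db: &mut Level0,
    facet_value: &str,
    docids_to_delete: Option<&RoaringBitmap>,
    docids_to_add: Option<&RoaringBitmap>,
) {
    if let Some(del) = docids_to_delete {
        if let Some(bitmap) = db.get_mut(facet_value) {
            *bitmap -= del;
            if bitmap.is_empty() {
                db.remove(facet_value);
            }
        }
    }
    if let Some(add) = docids_to_add {
        *db.entry(facet_value.to_string()).or_default() |= add;
    }
}

fn main() {
    let mut db = Level0::new();
    apply_delta(&mut db, "blue", None, Some(&(0..3).collect()));
    apply_delta(&mut db, "blue", Some(&(0..3).collect()), None);
    assert!(!db.contains_key("blue")); // an emptied entry is removed, not kept
}
```

Applying deletions before additions means a document id present on both sides of a delta survives, which is the behaviour an in-place document update needs.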
-pub struct FacetsUpdateIncremental<'i> { - index: &'i Index, +pub struct FacetsUpdateIncremental { inner: FacetsUpdateIncrementalInner, - facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, } -impl<'i> FacetsUpdateIncremental<'i> { +impl FacetsUpdateIncremental { pub fn new( - index: &'i Index, + index: &Index, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, group_size: u8, min_level_size: u8, max_group_size: u8, ) -> Self { FacetsUpdateIncremental { - index, inner: FacetsUpdateIncrementalInner { db: match facet_type { FacetType::String => index .facet_id_string_docids - .remap_key_type::>(), + .remap_key_type::>(), FacetType::Number => index .facet_id_f64_docids - .remap_key_type::>(), + .remap_key_type::>(), }, group_size, max_group_size, min_level_size, }, - facet_type, - new_data, + delta_data, } } - pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { - let mut new_faceted_docids = HashMap::::default(); - - let mut cursor = self.new_data.into_cursor()?; + pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> { + let mut cursor = self.delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if !valid_lmdb_key(key) { continue; } - let key = FacetGroupKeyCodec::::bytes_decode(key) - .ok_or(heed::Error::Encoding)?; - let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; - *new_faceted_docids.entry(key.field_id).or_default() |= docids; + let key = FacetGroupKeyCodec::::bytes_decode(key) + .map_err(heed::Error::Encoding)?; + let value = KvReader::new(value); + + let docids_to_delete = value + .get(DelAdd::Deletion) + .map(CboRoaringBitmapCodec::bytes_decode) + .map(|o| o.map_err(heed::Error::Encoding)); + + let docids_to_add = value + .get(DelAdd::Addition) + .map(CboRoaringBitmapCodec::bytes_decode) + .map(|o| o.map_err(heed::Error::Encoding)); + + if let Some(docids_to_delete) = docids_to_delete { + let docids_to_delete = docids_to_delete?; + self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?; + } + + if let Some(docids_to_add) = docids_to_add { + let docids_to_add = docids_to_add?; + self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?; + } } - for (field_id, new_docids) in new_faceted_docids { - let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; - docids |= new_docids; - self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; - } Ok(()) } } /// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type pub struct FacetsUpdateIncrementalInner { - pub db: heed::Database, FacetGroupValueCodec>, + pub db: heed::Database, FacetGroupValueCodec>, pub group_size: u8, pub min_level_size: u8, pub max_group_size: u8, @@ -129,15 +134,14 @@ impl FacetsUpdateIncrementalInner { prefix.extend_from_slice(&field_id.to_be_bytes()); prefix.push(level); - let mut iter = - self.db.as_polymorph().prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( - txn, - prefix.as_slice(), - )?; + let mut iter = self + .db + .remap_types::() + .prefix_iter(txn, prefix.as_slice())?; let (key_bytes, value) = iter.next().unwrap()?; Ok(( - FacetGroupKeyCodec::::bytes_decode(key_bytes) - .ok_or(Error::Encoding)? + FacetGroupKeyCodec::::bytes_decode(key_bytes) + .map_err(Error::Encoding)? 
.into_owned(), value, )) @@ -172,10 +176,8 @@ impl FacetsUpdateIncrementalInner { level0_prefix.extend_from_slice(&field_id.to_be_bytes()); level0_prefix.push(0); - let mut iter = self - .db - .as_polymorph() - .prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, &level0_prefix)?; + let mut iter = + self.db.remap_types::().prefix_iter(txn, &level0_prefix)?; if iter.next().is_none() { drop(iter); @@ -377,11 +379,8 @@ impl FacetsUpdateIncrementalInner { highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); highest_level_prefix.push(highest_level); - let size_highest_level = self - .db - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)? - .count(); + let size_highest_level = + self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?.count(); if size_highest_level < self.group_size as usize * self.min_level_size as usize { return Ok(()); @@ -389,8 +388,8 @@ impl FacetsUpdateIncrementalInner { let mut groups_iter = self .db - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &highest_level_prefix)?; + .remap_types::() + .prefix_iter(txn, &highest_level_prefix)?; let nbr_new_groups = size_highest_level / self.group_size as usize; let nbr_leftover_elements = size_highest_level % self.group_size as usize; @@ -401,8 +400,8 @@ impl FacetsUpdateIncrementalInner { let mut values = RoaringBitmap::new(); for _ in 0..group_size { let (key_bytes, value_i) = groups_iter.next().unwrap()?; - let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) - .ok_or(Error::Encoding)?; + let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) + .map_err(Error::Encoding)?; if first_key.is_none() { first_key = Some(key_i); @@ -424,8 +423,8 @@ impl FacetsUpdateIncrementalInner { let mut values = RoaringBitmap::new(); for _ in 0..nbr_leftover_elements { let (key_bytes, value_i) = groups_iter.next().unwrap()?; - let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) - .ok_or(Error::Encoding)?; + let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) + .map_err(Error::Encoding)?; if first_key.is_none() { first_key = Some(key_i); @@ -592,23 +591,21 @@ impl FacetsUpdateIncrementalInner { if highest_level == 0 || self .db - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)? + .remap_types::() + .prefix_iter(txn, &highest_level_prefix)? .count() >= self.min_level_size as usize { return Ok(()); } let mut to_delete = vec![]; - let mut iter = self - .db - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)?; + let mut iter = + self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?; for el in iter.by_ref() { let (k, _) = el?; to_delete.push( - FacetGroupKeyCodec::::bytes_decode(k) - .ok_or(Error::Encoding)? + FacetGroupKeyCodec::::bytes_decode(k) + .map_err(Error::Encoding)? 
.into_owned(), ); } @@ -1116,7 +1113,7 @@ mod fuzz { #[no_coverage] fn compare_with_trivial_database(tempdir: Rc, operations: &[Operation]) { - let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten + let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten let mut txn = index.env.write_txn().unwrap(); let mut trivial_db = TrivialDatabase::>::default(); @@ -1162,16 +1159,13 @@ mod fuzz { let level0iter = index .content .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( - &mut txn, - &field_id.to_be_bytes(), - ) + .prefix_iter::<_, Bytes, FacetGroupValueCodec>(&mut txn, &field_id.to_be_bytes()) .unwrap(); for ((key, values), group) in values_field_id.iter().zip(level0iter) { let (group_key, group_values) = group.unwrap(); let group_key = - FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); assert_eq!(key, &group_key.left_bound); assert_eq!(values, &group_values.bitmap); } @@ -1181,13 +1175,13 @@ mod fuzz { let level0iter = index .content .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) + .prefix_iter::<_, Bytes, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) .unwrap(); for ((key, values), group) in values_field_id.iter().zip(level0iter) { let (group_key, group_values) = group.unwrap(); let group_key = - FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); assert_eq!(key, &group_key.left_bound); assert_eq!(values, &group_values.bitmap); } diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index bbd25f91e..ad8a838c8 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -14,7 +14,7 @@ The databases must be able to return results for queries such as: The algorithms that implement these queries are found in the `src/search/facet` folder. To make these queries fast to compute, the database adopts a tree structure: -```ignore +```text ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ ┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │ │Level 2│ │ │ │ │ @@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`. 
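Each node of a level therefore stores the smallest facet value it covers (its left bound) together with the number of entries it spans in the level below. A lookup descends from the highest level by picking, inside a level, the last group whose left bound does not exceed the searched key. Here is a hedged, self-contained sketch of that per-level step; the `GroupNode` type is illustrative only, not the actual `FacetGroupKey`/`FacetGroupValue` pair:

```rust
/// One node of a facet level, as in the diagram above: the smallest facet
/// value covered by the group, and how many entries below it spans.
struct GroupNode<'a> {
    left_bound: &'a str,
    size: u8,
}

/// One descent step: inside a level sorted by `left_bound`, the group that
/// can contain `key` is the last one whose left bound is <= `key`.
fn group_containing<'a>(level: &'a [GroupNode<'a>], key: &str) -> Option<&'a GroupNode<'a>> {
    match level.binary_search_by(|node| node.left_bound.cmp(key)) {
        Ok(i) => Some(&level[i]),      // exact left-bound match
        Err(0) => None,                // key sorts before the whole level
        Err(i) => Some(&level[i - 1]), // covered by the preceding group
    }
}

fn main() {
    // Level 2 from the diagram: "ab" (2), "gaf" (2), "woz" (1).
    let level2 = [
        GroupNode { left_bound: "ab", size: 2 },
        GroupNode { left_bound: "gaf", size: 2 },
        GroupNode { left_bound: "woz", size: 1 },
    ];
    let node = group_containing(&level2, "d").unwrap();
    assert_eq!(node.left_bound, "ab"); // "d" falls within `ab .. gaf`
    assert_eq!(node.size, 2); // so the search continues in its 2 children
}
```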
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a [`FacetGroupValue`], which have the following format: -```ignore +```text FacetGroupKey: - field id : u16 - level : u8 @@ -83,7 +83,7 @@ use std::iter::FromIterator; use charabia::normalizer::{Normalize, NormalizerOption}; use grenad::{CompressionType, SortAlgorithm}; -use heed::types::{ByteSlice, DecodeIgnore, SerdeJson}; +use heed::types::{Bytes, DecodeIgnore, SerdeJson}; use heed::BytesEncode; use log::debug; use time::OffsetDateTime; @@ -92,13 +92,12 @@ use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; -use crate::heed_codec::ByteSliceRefCodec; +use crate::heed_codec::BytesRefCodec; use crate::update::index_documents::create_sorter; use crate::update::merge_btreeset_string; -use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH}; +use crate::{BEU16StrCodec, Index, Result, MAX_FACET_VALUE_LENGTH}; pub mod bulk; -pub mod delete; pub mod incremental; /// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. @@ -107,9 +106,9 @@ pub mod incremental; /// a bulk update method or an incremental update method. pub struct FacetsUpdate<'i> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, + database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, group_size: u8, max_group_size: u8, min_level_size: u8, @@ -118,14 +117,14 @@ impl<'i> FacetsUpdate<'i> { pub fn new( index: &'i Index, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, ) -> Self { let database = match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::>(), + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; Self { @@ -135,26 +134,26 @@ impl<'i> FacetsUpdate<'i> { max_group_size: FACET_MAX_GROUP_SIZE, min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, - new_data, + delta_data, } } pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { - if self.new_data.is_empty() { + if self.delta_data.is_empty() { return Ok(()); } debug!("Computing and writing the facet values levels docids into LMDB on disk..."); self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // See self::comparison_bench::benchmark_facet_indexing - if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { + if self.delta_data.len() >= (self.database.len(wtxn)? 
/ 50) { let field_ids = self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); let bulk_update = FacetsUpdateBulk::new( self.index, field_ids, self.facet_type, - self.new_data, + self.delta_data, self.group_size, self.min_level_size, ); @@ -163,7 +162,7 @@ impl<'i> FacetsUpdate<'i> { let incremental_update = FacetsUpdateIncremental::new( self.index, self.facet_type, - self.new_data, + self.delta_data, self.group_size, self.min_level_size, self.max_group_size, @@ -208,8 +207,8 @@ impl<'i> FacetsUpdate<'i> { } let set = BTreeSet::from_iter(std::iter::once(left_bound)); let key = (field_id, normalized_facet.as_ref()); - let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?; - let val = SerdeJson::bytes_encode(&set).ok_or(heed::Error::Encoding)?; + let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; + let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; sorter.insert(key, val)?; } } @@ -218,10 +217,11 @@ impl<'i> FacetsUpdate<'i> { // as the grenad sorter already merged them for us. let mut merger_iter = sorter.into_stream_merger_iter()?; while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? { - self.index - .facet_id_normalized_string_strings - .remap_types::() - .put(wtxn, key_bytes, btreeset_bytes)?; + self.index.facet_id_normalized_string_strings.remap_types::().put( + wtxn, + key_bytes, + btreeset_bytes, + )?; } // We compute one FST by string facet @@ -253,7 +253,7 @@ impl<'i> FacetsUpdate<'i> { // We write those FSTs in LMDB now for (field_id, fst) in text_fsts { - self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?; + self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?; } Ok(()) @@ -268,7 +268,7 @@ pub(crate) mod test_helpers { use std::marker::PhantomData; use std::rc::Rc; - use heed::types::ByteSlice; + use heed::types::Bytes; use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; use roaring::RoaringBitmap; @@ -276,9 +276,10 @@ pub(crate) mod test_helpers { use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; - use crate::heed_codec::ByteSliceRefCodec; + use crate::heed_codec::BytesRefCodec; use crate::search::facet::get_highest_level; use crate::snapshot_tests::display_bitmap; + use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::FacetsUpdateIncrementalInner; use crate::CboRoaringBitmapCodec; @@ -306,7 +307,7 @@ pub(crate) mod test_helpers { BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, { pub env: Env, - pub content: heed::Database, FacetGroupValueCodec>, + pub content: heed::Database, FacetGroupValueCodec>, pub group_size: Cell, pub min_level_size: Cell, pub max_group_size: Cell, @@ -454,21 +455,23 @@ pub(crate) mod test_helpers { let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned(); let key: FacetGroupKey<&[u8]> = FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; - let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let mut inner_writer = KvWriterDelAdd::memory(); let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap(); - writer.insert(&key, &value).unwrap(); + inner_writer.insert(DelAdd::Addition, value).unwrap(); + writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap(); } writer.finish().unwrap(); let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); let update = FacetsUpdateBulkInner { db: self.content, - 
new_data: Some(reader), + delta_data: Some(reader), group_size: self.group_size.get(), min_level_size: self.min_level_size.get(), }; - update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); + update.update(wtxn, field_ids).unwrap(); } pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { @@ -484,12 +487,12 @@ pub(crate) mod test_helpers { let iter = self .content - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &level_no_prefix) + .remap_types::() + .prefix_iter(txn, &level_no_prefix) .unwrap(); for el in iter { let (key, value) = el.unwrap(); - let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); let mut prefix_start_below = vec![]; prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); @@ -499,14 +502,11 @@ pub(crate) mod test_helpers { let start_below = { let mut start_below_iter = self .content - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( - txn, - &prefix_start_below, - ) + .remap_types::() + .prefix_iter(txn, &prefix_start_below) .unwrap(); let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); - FacetGroupKeyCodec::::bytes_decode(key_bytes).unwrap() + FacetGroupKeyCodec::::bytes_decode(key_bytes).unwrap() }; assert!(value.size > 0); @@ -556,101 +556,6 @@ pub(crate) mod test_helpers { } } -#[cfg(test)] -mod tests { - use big_s::S; - use maplit::hashset; - - use crate::db_snap; - use crate::documents::documents_batch_reader_from_objects; - use crate::index::tests::TempIndex; - use crate::update::DeletionStrategy; - - #[test] - fn replace_all_identical_soft_deletion_then_hard_deletion() { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("size") }); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! { - { - "id": i, - "size": i % 250, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); - db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9"); - db_snap!(index, soft_deleted_documents_ids, "initial", @"[]"); - - let mut documents = vec![]; - for i in 0..999 { - documents.push( - serde_json::json! { - { - "id": i, - "size": i % 250, - "other": 0, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); - db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06"); - db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); - - // Then replace the last document while disabling soft_deletion - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; - let mut documents = vec![]; - for i in 999..1000 { - documents.push( - serde_json::json! 
{ - { - "id": i, - "size": i % 250, - "other": 0, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); - db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028"); - db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]"); - } -} - #[allow(unused)] #[cfg(test)] mod comparison_bench { @@ -705,7 +610,7 @@ mod comparison_bench { } let time_spent = timer.elapsed().as_millis(); println!(" add {nbr_doc} : {time_spent}ms"); - txn.abort().unwrap(); + txn.abort(); } } } diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 22b16f253..03eb3f4de 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -1,20 +1,17 @@ +use std::fmt; use std::io::{BufWriter, Read, Seek}; use std::result::Result as StdResult; -use std::{fmt, iter}; use serde::{Deserialize, Serialize}; use serde_json::Value; -use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; +use crate::documents::{ + DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader, + EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY, +}; use crate::error::{GeoError, InternalError, UserError}; use crate::update::index_documents::{obkv_to_object, writer_into_reader}; -use crate::{FieldId, Index, Object, Result}; - -/// The symbol used to define levels in a nested primary key. -const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; - -/// The default primary that is used when not specified. -const DEFAULT_PRIMARY_KEY: &str = "id"; +use crate::{FieldId, Index, Result}; /// This function validates and enriches the documents by checking that: /// - we can infer a primary key, @@ -41,14 +38,12 @@ pub fn enrich_documents_batch( // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. let primary_key = match index.primary_key(rtxn)? { - Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { - PrimaryKey::nested(primary_key) - } - Some(primary_key) => match documents_batch_index.id(primary_key) { - Some(id) => PrimaryKey::flat(primary_key, id), - None if autogenerate_docids => { - PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) - } + Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) { + Some(primary_key) => primary_key, + None if autogenerate_docids => PrimaryKey::Flat { + name: primary_key, + field_id: documents_batch_index.insert(primary_key), + }, None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { @@ -76,14 +71,14 @@ pub fn enrich_documents_batch( }); match guesses.as_slice() { - [] if autogenerate_docids => PrimaryKey::flat( - DEFAULT_PRIMARY_KEY, - documents_batch_index.insert(DEFAULT_PRIMARY_KEY), - ), + [] if autogenerate_docids => PrimaryKey::Flat { + name: DEFAULT_PRIMARY_KEY, + field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY), + }, [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), [(field_id, name)] => { log::info!("Primary key was not specified in index.
Inferred to '{name}'"); - PrimaryKey::flat(name, *field_id) + PrimaryKey::Flat { name, field_id: *field_id } } multiple => { return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { @@ -156,92 +151,24 @@ fn fetch_or_generate_document_id( uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], count: u32, ) -> Result> { - match primary_key { - PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { - match document.get(primary_key_id) { - Some(document_id_bytes) => { - let document_id = serde_json::from_slice(document_id_bytes) - .map_err(InternalError::SerdeJson)?; - match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), - Err(user_error) => Ok(Err(user_error)), - } - } - None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); - Ok(Ok(DocumentId::generated(uuid.to_string(), count))) - } - None => Ok(Err(UserError::MissingDocumentId { - primary_key: primary_key.to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })), - } + Ok(match primary_key.document_id(document, documents_batch_index)? { + Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }), + Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error), + Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => { + let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); + Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count }) } - nested @ PrimaryKey::Nested { .. } => { - let mut matching_documents_ids = Vec::new(); - for (first_level_name, right) in nested.possible_level_names() { - if let Some(field_id) = documents_batch_index.id(first_level_name) { - if let Some(value_bytes) = document.get(field_id) { - let object = serde_json::from_slice(value_bytes) - .map_err(InternalError::SerdeJson)?; - fetch_matching_values(object, right, &mut matching_documents_ids); - - if matching_documents_ids.len() >= 2 { - return Ok(Err(UserError::TooManyDocumentIds { - primary_key: nested.name().to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })); - } - } - } - } - - match matching_documents_ids.pop() { - Some(document_id) => match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), - Err(user_error) => Ok(Err(user_error)), - }, - None => Ok(Err(UserError::MissingDocumentId { - primary_key: nested.name().to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })), - } + Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(document, documents_batch_index)?, + }), + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(document, documents_batch_index)?, + }) } - } -} - -/// A type that represent the type of primary key that has been set -/// for this index, a classic flat one or a nested one. 
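The whole `PrimaryKey` type removed below now lives in `crate::documents` (see the `PrimaryKey` and `DocumentIdExtractionError` imports at the top of this file). The one non-obvious part of it is the nested-key expansion: for a dotted key, every split point yields a candidate (top-level field, remaining path) pair, ending with the whole key treated as a flat field. A standalone sketch of the same logic as the removed `possible_level_names`:

```rust
/// Mirrors the removed `possible_level_names`: for a nested primary key,
/// list every (first level name, remaining path) candidate, finishing with
/// the full key treated as a flat field. '.' is the level separator.
fn possible_level_names(name: &str) -> Vec<(&str, &str)> {
    let mut names: Vec<(&str, &str)> = name
        .match_indices('.')
        .map(|(i, _)| (&name[..i], &name[i + 1..]))
        .collect();
    names.push((name, ""));
    names
}

fn main() {
    assert_eq!(
        possible_level_names("doc.meta.id"),
        vec![("doc", "meta.id"), ("doc.meta", "id"), ("doc.meta.id", "")],
    );
}
```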
-#[derive(Debug, Clone, Copy)] -enum PrimaryKey<'a> { - Flat { name: &'a str, field_id: FieldId }, - Nested { name: &'a str }, -} - -impl PrimaryKey<'_> { - fn flat(name: &str, field_id: FieldId) -> PrimaryKey { - PrimaryKey::Flat { name, field_id } - } - - fn nested(name: &str) -> PrimaryKey { - PrimaryKey::Nested { name } - } - - fn name(&self) -> &str { - match self { - PrimaryKey::Flat { name, .. } => name, - PrimaryKey::Nested { name } => name, - } - } - - /// Returns an `Iterator` that gives all the possible fields names the primary key - /// can have depending of the first level name and deepnes of the objects. - fn possible_level_names(&self) -> impl Iterator + '_ { - let name = self.name(); - name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) - .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) - .chain(iter::once((name, ""))) - } + }) } /// A type that represents a document id that has been retrieved from a document or auto-generated. @@ -255,14 +182,6 @@ pub enum DocumentId { } impl DocumentId { - fn retrieved(value: String) -> DocumentId { - DocumentId::Retrieved { value } - } - - fn generated(value: String, document_nth: u32) -> DocumentId { - DocumentId::Generated { value, document_nth } - } - fn debug(&self) -> String { format!("{:?}", self) } @@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId { } } -fn starts_with(selector: &str, key: &str) -> bool { - selector.strip_prefix(key).map_or(false, |tail| { - tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) - }) -} - -pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { - match value { - Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), - otherwise => output.push(otherwise), - } -} - -pub fn fetch_matching_values_in_object( - object: Object, - selector: &str, - base_key: &str, - output: &mut Vec, -) { - for (key, value) in object { - let base_key = if base_key.is_empty() { - key.to_string() - } else { - format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) - }; - - if starts_with(selector, &base_key) { - match value { - Value::Object(object) => { - fetch_matching_values_in_object(object, selector, &base_key, output) - } - value => output.push(value), - } - } - } -} - -pub fn validate_document_id(document_id: &str) -> Option<&str> { - if !document_id.is_empty() - && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) - { - Some(document_id) - } else { - None - } -} - -/// Parses a Json encoded document id and validate it, returning a user error when it is one. -pub fn validate_document_id_value(document_id: Value) -> Result> { - match document_id { - Value::String(string) => match validate_document_id(&string) { - Some(s) if s.len() == string.len() => Ok(Ok(string)), - Some(s) => Ok(Ok(s.to_string())), - None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), - }, - Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), - content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), - } -} - /// Try to extract an `f64` from a JSON `Value` and return the `Value` /// in the `Err` variant if it failed. 
pub fn extract_finite_float_from_value(value: Value) -> StdResult { diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 643d16354..a6bbf939a 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -5,18 +5,16 @@ use std::io::BufReader; use std::{io, mem, str}; use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; -use obkv::KvReader; +use obkv::{KvReader, KvWriterU16}; use roaring::RoaringBitmap; use serde_json::Value; -use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; +use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; -use crate::update::index_documents::MergeFn; -use crate::{ - absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, -}; +use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; +use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; -pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>; +pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>; /// Extracts the word and the positions where this word appears and /// prefixes it by the document id. @@ -32,25 +30,162 @@ pub fn extract_docid_word_positions( allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: Option, -) -> Result<(RoaringBitmap, grenad::Reader>, ScriptLanguageDocidsMap)> { +) -> Result<(grenad::Reader>, ScriptLanguageDocidsMap)> { puffin::profile_function!(); let max_positions_per_attributes = max_positions_per_attributes .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); + // initialize destination values. let mut documents_ids = RoaringBitmap::new(); let mut script_language_docids = HashMap::new(); let mut docid_word_positions_sorter = create_sorter( grenad::SortAlgorithm::Stable, - concat_u32s_array, + keep_latest_obkv, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); - let mut buffers = Buffers::default(); + // initialize buffers. + let mut del_buffers = Buffers::default(); + let mut add_buffers = Buffers::default(); + let mut key_buffer = Vec::new(); + let mut value_buffer = Vec::new(); + + // initialize tokenizer. + let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None); + let tokenizer = builder.build(); + + // iterate over documents. + let mut cursor = obkv_documents.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let document_id = key + .try_into() + .map(u32::from_be_bytes) + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + let obkv = KvReader::::new(value); + + // if the searchable fields didn't change, skip the searchable indexing for this document. + if !searchable_fields_changed(&KvReader::::new(value), searchable_fields) { + continue; + } + + documents_ids.push(document_id); + + // Update key buffer prefix. + key_buffer.clear(); + key_buffer.extend_from_slice(&document_id.to_be_bytes()); + + // Tokenize deletions and additions in 2 different threads.
+ let (del, add): (Result<_>, Result<_>) = rayon::join( + || { + // deletions + lang_safe_tokens_from_document( + &obkv, + searchable_fields, + &tokenizer, + stop_words, + allowed_separators, + dictionary, + max_positions_per_attributes, + DelAdd::Deletion, + &mut del_buffers, + ) + }, + || { + // additions + lang_safe_tokens_from_document( + &obkv, + searchable_fields, + &tokenizer, + stop_words, + allowed_separators, + dictionary, + max_positions_per_attributes, + DelAdd::Addition, + &mut add_buffers, + ) + }, + ); + + let (del_obkv, del_script_language_word_count) = del?; + let (add_obkv, add_script_language_word_count) = add?; + + // merge deletions and additions. + // transforming two KV> into one KV>> + value_buffer.clear(); + del_add_from_two_obkvs( + KvReader::::new(del_obkv), + KvReader::::new(add_obkv), + &mut value_buffer, + )?; + + // write each KV> into the sorter, field by field. + let obkv = KvReader::::new(&value_buffer); + for (field_id, value) in obkv.iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&field_id.to_be_bytes()); + docid_word_positions_sorter.insert(&key_buffer, value)?; + } + + // update script_language_docids deletions. + for (script, languages_frequency) in del_script_language_word_count { + for (language, _) in languages_frequency { + let entry = script_language_docids + .entry((script, language)) + .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); + entry.0.push(document_id); + } + } + + // update script_language_docids additions. + for (script, languages_frequency) in add_script_language_word_count { + for (language, _) in languages_frequency { + let entry = script_language_docids + .entry((script, language)) + .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); + entry.1.push(document_id); + } + } + } + + // the returned sorter is serialized as: key: (DocId, FieldId), value: KV>. + sorter_into_reader(docid_word_positions_sorter, indexer) + .map(|reader| (reader, script_language_docids)) +} + +/// Check if any searchable fields of a document changed. +fn searchable_fields_changed( + obkv: &KvReader, + searchable_fields: &Option>, +) -> bool { + for (field_id, field_bytes) in obkv.iter() { + if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + let del_add = KvReaderDelAdd::new(field_bytes); + match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { + // if both fields are None, check the next field. + (None, None) => (), + // if both contains a value and values are the same, check the next field. + (Some(del), Some(add)) if del == add => (), + // otherwise the fields are different, return true. + _otherwise => return true, + } + } + } + + false +} + +/// Factorize tokenizer building. +fn tokenizer_builder<'a>( + stop_words: Option<&'a fst::Set<&[u8]>>, + allowed_separators: Option<&'a [&str]>, + dictionary: Option<&'a [&str]>, + script_language: Option<&'a HashMap>>, +) -> TokenizerBuilder<'a, &'a [u8]> { let mut tokenizer_builder = TokenizerBuilder::new(); if let Some(stop_words) = stop_words { tokenizer_builder.stop_words(stop_words); @@ -61,130 +196,146 @@ pub fn extract_docid_word_positions( if let Some(separators) = allowed_separators { tokenizer_builder.separators(separators); } - let tokenizer = tokenizer_builder.build(); - let mut cursor = obkv_documents.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? 
{ - let document_id = key - .try_into() - .map(u32::from_be_bytes) - .map_err(|_| SerializationError::InvalidNumberSerialization)?; - let obkv = KvReader::::new(value); + if let Some(script_language) = script_language { + tokenizer_builder.allow_list(script_language); + } - documents_ids.push(document_id); - buffers.key_buffer.clear(); - buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes()); + tokenizer_builder +} - let mut script_language_word_count = HashMap::new(); +/// Extract words mapped with their positions in a document, +/// ensuring no Language detection mistakes were made. +#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct +fn lang_safe_tokens_from_document<'a>( + obkv: &KvReader, + searchable_fields: &Option>, + tokenizer: &Tokenizer, + stop_words: Option<&fst::Set<&[u8]>>, + allowed_separators: Option<&[&str]>, + dictionary: Option<&[&str]>, + max_positions_per_attributes: u32, + del_add: DelAdd, + buffers: &'a mut Buffers, +) -> Result<(&'a [u8], HashMap>)> { + let mut script_language_word_count = HashMap::new(); - extract_tokens_from_document( - &obkv, - searchable_fields, - &tokenizer, - max_positions_per_attributes, - &mut buffers, - &mut script_language_word_count, - &mut docid_word_positions_sorter, - )?; + tokens_from_document( + obkv, + searchable_fields, + tokenizer, + max_positions_per_attributes, + del_add, + buffers, + &mut script_language_word_count, + )?; - // if we detect a potetial mistake in the language detection, - // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. - // context: https://github.com/meilisearch/meilisearch/issues/3565 - if script_language_word_count - .values() - .map(Vec::as_slice) - .any(potential_language_detection_error) - { - // build an allow list with the most frequent detected languages in the document. - let script_language: HashMap<_, _> = - script_language_word_count.iter().filter_map(most_frequent_languages).collect(); + // if we detect a potential mistake in the language detection, + // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. + // context: https://github.com/meilisearch/meilisearch/issues/3565 + if script_language_word_count + .values() + .map(Vec::as_slice) + .any(potential_language_detection_error) + { + // build an allow list with the most frequent detected languages in the document. + let script_language: HashMap<_, _> = + script_language_word_count.iter().filter_map(most_frequent_languages).collect(); - // if the allow list is empty, meaning that no Language is considered frequent, - // then we don't rerun the extraction. - if !script_language.is_empty() { - // build a new temporary tokenizer including the allow list. - let mut tokenizer_builder = TokenizerBuilder::new(); - if let Some(stop_words) = stop_words { - tokenizer_builder.stop_words(stop_words); - } - tokenizer_builder.allow_list(&script_language); - let tokenizer = tokenizer_builder.build(); + // if the allow list is empty, meaning that no Language is considered frequent, + // then we don't rerun the extraction. + if !script_language.is_empty() { + // build a new temporary tokenizer including the allow list. + let mut builder = tokenizer_builder( + stop_words, + allowed_separators, + dictionary, + Some(&script_language), + ); + let tokenizer = builder.build(); - script_language_word_count.clear(); + script_language_word_count.clear(); - // rerun the extraction.
- extract_tokens_from_document( - &obkv, - searchable_fields, - &tokenizer, - max_positions_per_attributes, - &mut buffers, - &mut script_language_word_count, - &mut docid_word_positions_sorter, - )?; - } - } - - for (script, languages_frequency) in script_language_word_count { - for (language, _) in languages_frequency { - let entry = script_language_docids - .entry((script, language)) - .or_insert_with(RoaringBitmap::new); - entry.push(document_id); - } + // rerun the extraction. + tokens_from_document( + obkv, + searchable_fields, + &tokenizer, + max_positions_per_attributes, + del_add, + buffers, + &mut script_language_word_count, + )?; } } - sorter_into_reader(docid_word_positions_sorter, indexer) - .map(|reader| (documents_ids, reader, script_language_docids)) + // returns a (KV>, HashMap>) + Ok((&buffers.obkv_buffer, script_language_word_count)) } -fn extract_tokens_from_document( +/// Extract words mapped with their positions in a document. +fn tokens_from_document<'a>( obkv: &KvReader, searchable_fields: &Option>, tokenizer: &Tokenizer, max_positions_per_attributes: u32, - buffers: &mut Buffers, + del_add: DelAdd, + buffers: &'a mut Buffers, script_language_word_count: &mut HashMap>, - docid_word_positions_sorter: &mut grenad::Sorter, -) -> Result<()> { +) -> Result<&'a [u8]> { + buffers.obkv_buffer.clear(); + let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { + // if field is searchable. if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { - let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - buffers.field_buffer.clear(); - if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { - let tokens = process_tokens(tokenizer.tokenize(field)) - .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); + // extract deletion or addition only. + if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { + // parse json. + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - for (index, token) in tokens { - // if a language has been detected for the token, we update the counter. - if let Some(language) = token.language { - let script = token.script; - let entry = - script_language_word_count.entry(script).or_insert_with(Vec::new); - match entry.iter_mut().find(|(l, _)| *l == language) { - Some((_, n)) => *n += 1, - None => entry.push((language, 1)), + // prepare writing destination. + buffers.obkv_positions_buffer.clear(); + let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer); + + // convert json into a single string. + buffers.field_buffer.clear(); + if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { + // create an iterator of tokens with their positions. + let tokens = process_tokens(tokenizer.tokenize(field)) + .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); + + for (index, token) in tokens { + // if a language has been detected for the token, we update the counter. + if let Some(language) = token.language { + let script = token.script; + let entry = script_language_word_count.entry(script).or_default(); + match entry.iter_mut().find(|(l, _)| *l == language) { + Some((_, n)) => *n += 1, + None => entry.push((language, 1)), + } + } + + // keep a word only if it is not empty and fits in an LMDB key.
+                        // keep a word only if it is not empty and fits in an LMDB key.
+                        let token = token.lemma().trim();
+                        if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                            let position: u16 = index
+                                .try_into()
+                                .map_err(|_| SerializationError::InvalidNumberSerialization)?;
+                            writer.insert(position, token.as_bytes())?;
                        }
                    }
-                    let token = token.lemma().trim();
-                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                        buffers.key_buffer.truncate(mem::size_of::<DocumentId>());
-                        buffers.key_buffer.extend_from_slice(token.as_bytes());
-                        let position: u16 = index
-                            .try_into()
-                            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-                        let position = absolute_from_relative_position(field_id, position);
-                        docid_word_positions_sorter
-                            .insert(&buffers.key_buffer, position.to_ne_bytes())?;
-                    }
+                    // write positions into document.
+                    let positions = writer.into_inner()?;
+                    document_writer.insert(field_id, positions)?;
                }
            }
        }
    }
-    Ok(())
+    // returns a KV<FieldId, KV<u16, String>>
+    Ok(document_writer.into_inner().map(|v| v.as_slice())?)
 }
 /// Transform a JSON value into a string that can be indexed.
@@ -287,10 +438,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)
 #[derive(Default)]
 struct Buffers {
-    // the key buffer is the concatenation of the internal document id with the field id.
-    // The buffer has to be completelly cleared between documents,
-    // and the field id part must be cleared between each field.
-    key_buffer: Vec<u8>,
     // the field buffer for each fields desserialization, and must be cleared between each field.
     field_buffer: String,
+    // buffer used to store the value data containing an obkv.
+    obkv_buffer: Vec<u8>,
+    // buffer used to store the value data containing an obkv of tokens with their positions.
+    obkv_positions_buffer: Vec<u8>,
}
diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
index d557e0b6c..f860aacba 100644
--- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
@@ -4,11 +4,12 @@ use std::io::{self, BufReader};
 use heed::{BytesDecode, BytesEncode};
 use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
 };
 use crate::heed_codec::facet::{
     FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
 };
+use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
 use crate::Result;
 /// Extracts the facet number and the documents ids where this facet number appear.
 ///
@@ -17,7 +18,7 @@ use crate::Result;
 /// documents ids from the given chunk of docid facet number positions.
 #[logging_timer::time]
 pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
-    docid_fid_facet_number: grenad::Reader<R>,
+    fid_docid_facet_number: grenad::Reader<R>,
     indexer: GrenadParameters,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
@@ -26,21 +27,30 @@ pub fn extract_facet_number_docids(
     let mut facet_number_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
         max_memory,
     );
-    let mut cursor = docid_fid_facet_number.into_cursor()?;
-    while let Some((key_bytes, _)) = cursor.move_on_next()? {
+    let mut buffer = Vec::new();
+    let mut cursor = fid_docid_facet_number.into_cursor()?;
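The sorter above is now created with `merge_deladd_cbo_roaring_bitmaps` instead of `merge_cbo_roaring_bitmaps`. The merge function itself is outside this diff; based on its name and its call sites, a reasonable mental model is that it unions all Deletion sides of a key's values together and all Addition sides together. A rough editor's sketch of that behaviour (the real helper operates on serialized obkvs):

```rust
use roaring::RoaringBitmap;

// Each value for a key contributes an optional bitmap per side; merging a key
// unions the Deletion sides together and the Addition sides together.
fn merge_sides(
    values: &[(Option<RoaringBitmap>, Option<RoaringBitmap>)],
) -> (RoaringBitmap, RoaringBitmap) {
    let mut del = RoaringBitmap::new();
    let mut add = RoaringBitmap::new();
    for (d, a) in values {
        if let Some(d) = d { del |= d; }
        if let Some(a) = a { add |= a; }
    }
    (del, add)
}
```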
+    while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
        let (field_id, document_id, number) =
            FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
        let key = FacetGroupKey { field_id, level: 0, left_bound: number };
        let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
-        facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
+
+        buffer.clear();
+        let mut obkv = KvWriterDelAdd::new(&mut buffer);
+        for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
+            obkv.insert(deladd_key, document_id.to_ne_bytes())?;
+        }
+        obkv.finish()?;
+
+        facet_number_docids_sorter.insert(key_bytes, &buffer)?;
    }
    sorter_into_reader(facet_number_docids_sorter, indexer)
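Note the pattern the facet-number loop above introduces and the facet-string loop in the next file repeats: the input value is only inspected for which DelAdd sides it contains, and the document id is written back under those same sides. A minimal editor's sketch of that mirroring (simplified types, not the patch's code):

```rust
use milli::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};

// Copy the docid under every side present in the source value, so a
// deletion-only entry yields a deletion-only docid, and likewise for additions.
fn mirror_sides(source: &[u8], docid: u32, buffer: &mut Vec<u8>) -> std::io::Result<()> {
    buffer.clear();
    let mut obkv = KvWriterDelAdd::new(buffer);
    for (side, _) in KvReaderDelAdd::new(source).iter() {
        obkv.insert(side, docid.to_ne_bytes())?;
    }
    obkv.finish()
}
```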
diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
index b1b27449e..2ade776c3 100644
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -1,13 +1,15 @@
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io::BufReader;
+use std::{io, str};
 use heed::BytesEncode;
 use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
 use crate::heed_codec::StrRefCodec;
-use crate::update::index_documents::merge_cbo_roaring_bitmaps;
-use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
+use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
+use crate::{FieldId, Result};
 /// Extracts the facet string and the documents ids where this facet string appear.
 ///
@@ -24,15 +26,16 @@ pub fn extract_facet_string_docids(
     let mut facet_string_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
         max_memory,
     );
+    let mut buffer = Vec::new();
     let mut cursor = docid_fid_facet_string.into_cursor()?;
-    while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
+    while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
        let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
        let field_id = FieldId::from_be_bytes(field_id_bytes);
@@ -40,21 +43,17 @@ pub fn extract_facet_string_docids(
            try_split_array_at::<_, 4>(bytes).unwrap();
        let document_id = u32::from_be_bytes(document_id_bytes);
-        let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?;
-
-        let normalised_truncated_value: String;
-        if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
-            normalised_truncated_value = normalised_value
-                .char_indices()
-                .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
-                .map(|(_, c)| c)
-                .collect();
-            normalised_value = normalised_truncated_value.as_str();
-        }
-        let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
+        let normalized_value = str::from_utf8(normalized_value_bytes)?;
+        let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
        let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
-        // document id is encoded in native-endian because of the CBO roaring bitmap codec
-        facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
+
+        buffer.clear();
+        let mut obkv = KvWriterDelAdd::new(&mut buffer);
+        for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
+            obkv.insert(deladd_key, document_id.to_ne_bytes())?;
+        }
+        obkv.finish()?;
+        facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
    }
    sorter_into_reader(facet_string_docids_sorter, indexer)
diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index 42c355323..b7de1e621 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -1,24 +1,34 @@
+use std::borrow::Cow;
 use std::collections::{BTreeMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::{self, BufReader};
 use std::mem::size_of;
+use std::result::Result as StdResult;
-use heed::zerocopy::AsBytes;
+use bytemuck::bytes_of;
+use grenad::Sorter;
 use heed::BytesEncode;
+use itertools::EitherOrBoth;
+use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::{from_slice, Value};
+use FilterableValues::{Empty, Null, Values};
 use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
 use crate::error::InternalError;
 use crate::facet::value_encoding::f64_into_bytes;
+use crate::update::del_add::{DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{create_writer, writer_into_reader};
-use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
+use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
+
+/// The length of the elements that are always in the buffer when inserting new values.
+const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
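`TRUNCATE_SIZE` above is the length of the `[field_id | document_id]` prefix that every facet key in this file starts with; the extraction loop truncates its key buffers back to this prefix between values. An editor's sketch of the layout (the helper name is illustrative):

```rust
use std::mem::size_of;

// [field_id (2 bytes, big-endian) | document_id (4 bytes) | facet value bytes...]
fn key_prefix(field_id: u16, docid_bytes: &[u8; 4]) -> Vec<u8> {
    let mut key = Vec::with_capacity(size_of::<u16>() + size_of::<u32>());
    key.extend_from_slice(&field_id.to_be_bytes());
    key.extend_from_slice(docid_bytes);
    key
}
```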
 /// The extracted facet values stored in grenad files by type.
pub struct ExtractedFacetValues {
-    pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
-    pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
+    pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
+    pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
    pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
    pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
    pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
@@ -58,71 +68,150 @@ pub fn extract_fid_docid_facet_values(
        max_memory.map(|m| m / 2),
    );
-    let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
-    let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
-    let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
+    // The tuples represent the Del and Add sides of a bitmap
+    let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
+    let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
+    let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
+
+    // We create two buffers for mutable ref issues with closures.
+    let mut numbers_key_buffer = Vec::new();
+    let mut strings_key_buffer = Vec::new();
-    let mut key_buffer = Vec::new();
    let mut cursor = obkv_documents.into_cursor()?;
    while let Some((docid_bytes, value)) = cursor.move_on_next()? {
        let obkv = obkv::KvReader::new(value);
        for (field_id, field_bytes) in obkv.iter() {
            if faceted_fields.contains(&field_id) {
-                key_buffer.clear();
+                numbers_key_buffer.clear();
+                strings_key_buffer.clear();
                // Set key to the field_id
                // Note: this encoding is consistent with FieldIdCodec
-                key_buffer.extend_from_slice(&field_id.to_be_bytes());
+                numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
+                strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
-                // Here, we know already that the document must be added to the “field id exists” database
                let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
-                let document = BEU32::from(document).get();
-
-                facet_exists_docids.entry(field_id).or_default().insert(document);
+                let document = DocumentId::from_be_bytes(document);
                // For the other extraction tasks, prefix the key with the field_id and the document_id
-                key_buffer.extend_from_slice(docid_bytes);
+                numbers_key_buffer.extend_from_slice(docid_bytes);
+                strings_key_buffer.extend_from_slice(docid_bytes);
-                let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
+                let del_add_obkv = obkv::KvReader::new(field_bytes);
+                let del_value = match del_add_obkv.get(DelAdd::Deletion) {
+                    Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
+                    None => None,
+                };
+                let add_value = match del_add_obkv.get(DelAdd::Addition) {
+                    Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
+                    None => None,
+                };
-                match extract_facet_values(
-                    &value,
-                    geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
-                ) {
-                    FilterableValues::Null => {
-                        facet_is_null_docids.entry(field_id).or_default().insert(document);
-                    }
-                    FilterableValues::Empty => {
-                        facet_is_empty_docids.entry(field_id).or_default().insert(document);
-                    }
-                    FilterableValues::Values { numbers, strings } => {
-                        // insert facet numbers in sorter
-                        for number in numbers {
-                            key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
-                            if let Some(value_bytes) = f64_into_bytes(number) {
-                                key_buffer.extend_from_slice(&value_bytes);
-                                key_buffer.extend_from_slice(&number.to_be_bytes());
+                // We insert the document id on the Del and the Add side if the field exists.
+ let (ref mut del_exists, ref mut add_exists) = + facet_exists_docids.entry(field_id).or_default(); + let (ref mut del_is_null, ref mut add_is_null) = + facet_is_null_docids.entry(field_id).or_default(); + let (ref mut del_is_empty, ref mut add_is_empty) = + facet_is_empty_docids.entry(field_id).or_default(); - fid_docid_facet_numbers_sorter - .insert(&key_buffer, ().as_bytes())?; - } + if del_value.is_some() { + del_exists.insert(document); + } + if add_value.is_some() { + add_exists.insert(document); + } + + let geo_support = + geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng); + let del_filterable_values = + del_value.map(|value| extract_facet_values(&value, geo_support)); + let add_filterable_values = + add_value.map(|value| extract_facet_values(&value, geo_support)); + + // Those closures are just here to simplify things a bit. + let mut insert_numbers_diff = |del_numbers, add_numbers| { + insert_numbers_diff( + &mut fid_docid_facet_numbers_sorter, + &mut numbers_key_buffer, + del_numbers, + add_numbers, + ) + }; + let mut insert_strings_diff = |del_strings, add_strings| { + insert_strings_diff( + &mut fid_docid_facet_strings_sorter, + &mut strings_key_buffer, + del_strings, + add_strings, + ) + }; + + match (del_filterable_values, add_filterable_values) { + (None, None) => (), + (Some(del_filterable_values), None) => match del_filterable_values { + Null => { + del_is_null.insert(document); } - - // insert normalized and original facet string in sorter - for (normalized, original) in - strings.into_iter().filter(|(n, _)| !n.is_empty()) - { - let normalized_truncated_value: String = normalized - .char_indices() - .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); - - key_buffer.truncate(size_of::() + size_of::()); - key_buffer.extend_from_slice(normalized_truncated_value.as_bytes()); - fid_docid_facet_strings_sorter - .insert(&key_buffer, original.as_bytes())?; + Empty => { + del_is_empty.insert(document); + } + Values { numbers, strings } => { + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + }, + (None, Some(add_filterable_values)) => match add_filterable_values { + Null => { + add_is_null.insert(document); + } + Empty => { + add_is_empty.insert(document); + } + Values { numbers, strings } => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + } + }, + (Some(del_filterable_values), Some(add_filterable_values)) => { + match (del_filterable_values, add_filterable_values) { + (Null, Null) | (Empty, Empty) => (), + (Null, Empty) => { + del_is_null.insert(document); + add_is_empty.insert(document); + } + (Empty, Null) => { + del_is_empty.insert(document); + add_is_null.insert(document); + } + (Null, Values { numbers, strings }) => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + del_is_null.insert(document); + } + (Empty, Values { numbers, strings }) => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + del_is_empty.insert(document); + } + (Values { numbers, strings }, Null) => { + add_is_null.insert(document); + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + (Values { numbers, strings }, Empty) => { + add_is_empty.insert(document); + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + ( + Values { numbers: del_numbers, strings: del_strings }, + Values { numbers: add_numbers, strings: add_strings }, + ) => { 
+                            insert_numbers_diff(del_numbers, add_numbers)?;
+                            insert_strings_diff(del_strings, add_strings)?;
+                        }
                    }
                }
@@ -130,14 +219,15 @@ pub fn extract_fid_docid_facet_values(
        }
    }
+    let mut buffer = Vec::new();
    let mut facet_exists_docids_writer = create_writer(
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        tempfile::tempfile()?,
    );
-    for (fid, bitmap) in facet_exists_docids.into_iter() {
-        let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
-        facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
+    for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
+        deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
+        facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
    }
    let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
@@ -146,9 +236,9 @@ pub fn extract_fid_docid_facet_values(
        indexer.chunk_compression_level,
        tempfile::tempfile()?,
    );
-    for (fid, bitmap) in facet_is_null_docids.into_iter() {
-        let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
-        facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
+    for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
+        deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
+        facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
    }
    let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
@@ -157,21 +247,156 @@ pub fn extract_fid_docid_facet_values(
        indexer.chunk_compression_level,
        tempfile::tempfile()?,
    );
-    for (fid, bitmap) in facet_is_empty_docids.into_iter() {
-        let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
-        facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
+    for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
+        deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
+        facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
    }
    let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
    Ok(ExtractedFacetValues {
-        docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
-        docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
+        fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
+        fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
        fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
        fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
        fid_facet_exists_docids_chunk: facet_exists_docids_reader,
    })
}
+/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
+fn deladd_obkv_cbo_roaring_bitmaps(
+    buffer: &mut Vec<u8>,
+    del_bitmap: &RoaringBitmap,
+    add_bitmap: &RoaringBitmap,
+) -> io::Result<()> {
+    buffer.clear();
+    let mut obkv = KvWriterDelAdd::new(buffer);
+    let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
+    let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
+    obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
+    obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
+    obkv.finish()
+}
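`deladd_obkv_cbo_roaring_bitmaps` above packs two CBO roaring bitmaps under the two DelAdd keys. A hedged sketch of the reverse operation, assuming `CboRoaringBitmapCodec` is reachable at the crate root as in the imports above, and that its `BytesDecode` impl returns a `Result` as the `bytes_decode(...).map_err(...)` calls elsewhere in this diff suggest:

```rust
use heed::BytesDecode;
use milli::update::del_add::{DelAdd, KvReaderDelAdd};
use milli::CboRoaringBitmapCodec; // assumed public re-export
use roaring::RoaringBitmap;

// Decode the (Deletion, Addition) bitmaps back out of one serialized value.
fn decode_sides(value: &[u8]) -> (Option<RoaringBitmap>, Option<RoaringBitmap>) {
    let obkv = KvReaderDelAdd::new(value);
    let del = obkv.get(DelAdd::Deletion).and_then(|b| CboRoaringBitmapCodec::bytes_decode(b).ok());
    let add = obkv.get(DelAdd::Addition).and_then(|b| CboRoaringBitmapCodec::bytes_decode(b).ok());
    (del, add)
}
```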
+/// Truncates a string to the biggest valid LMDB key size.
+fn truncate_string(s: String) -> String {
+    s.char_indices()
+        .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
+        .map(|(_, c)| c)
+        .collect()
+}
+
+/// Computes the diff between both Del and Add numbers and
+/// only inserts the parts that differ in the sorter.
+fn insert_numbers_diff<MF>(
+    fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
+    key_buffer: &mut Vec<u8>,
+    mut del_numbers: Vec<f64>,
+    mut add_numbers: Vec<f64>,
+) -> Result<()>
+where
+    MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
+{
+    // We sort and dedup the float numbers
+    del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
+    add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
+    del_numbers.dedup_by_key(|f| OrderedFloat(*f));
+    add_numbers.dedup_by_key(|f| OrderedFloat(*f));
+
+    let merged_numbers_iter = itertools::merge_join_by(
+        del_numbers.into_iter().map(OrderedFloat),
+        add_numbers.into_iter().map(OrderedFloat),
+        |del, add| del.cmp(add),
+    );
+
+    // insert facet numbers in sorter
+    for eob in merged_numbers_iter {
+        key_buffer.truncate(TRUNCATE_SIZE);
+        match eob {
+            EitherOrBoth::Both(_, _) => (), // no need to touch anything
+            EitherOrBoth::Left(OrderedFloat(number)) => {
+                if let Some(value_bytes) = f64_into_bytes(number) {
+                    key_buffer.extend_from_slice(&value_bytes);
+                    key_buffer.extend_from_slice(&number.to_be_bytes());
+
+                    // We insert only the Del part of the Obkv to inform
+                    // that we only want to remove all those numbers.
+                    let mut obkv = KvWriterDelAdd::memory();
+                    obkv.insert(DelAdd::Deletion, bytes_of(&()))?;
+                    let bytes = obkv.into_inner()?;
+                    fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
+                }
+            }
+            EitherOrBoth::Right(OrderedFloat(number)) => {
+                if let Some(value_bytes) = f64_into_bytes(number) {
+                    key_buffer.extend_from_slice(&value_bytes);
+                    key_buffer.extend_from_slice(&number.to_be_bytes());
+
+                    // We insert only the Add part of the Obkv to inform
+                    // that we only want to add all those numbers.
+                    let mut obkv = KvWriterDelAdd::memory();
+                    obkv.insert(DelAdd::Addition, bytes_of(&()))?;
+                    let bytes = obkv.into_inner()?;
+                    fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
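`insert_numbers_diff` above and `insert_strings_diff` below share one diff pattern: sort and dedup both sides, then let `itertools::merge_join_by` classify each element as removed (`Left`), added (`Right`), or unchanged (`Both`, which is skipped). A self-contained editor's sketch of the classification step:

```rust
use itertools::{merge_join_by, EitherOrBoth};

// Split two sorted, deduplicated sequences into what was removed and what was added.
fn classify(del: &[u32], add: &[u32]) -> (Vec<u32>, Vec<u32>) {
    let mut removed = Vec::new();
    let mut added = Vec::new();
    for eob in merge_join_by(del.iter(), add.iter(), |d, a| d.cmp(a)) {
        match eob {
            EitherOrBoth::Both(_, _) => (), // present on both sides: nothing to do
            EitherOrBoth::Left(d) => removed.push(*d),
            EitherOrBoth::Right(a) => added.push(*a),
        }
    }
    (removed, added)
}
```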
+/// Computes the diff between both Del and Add strings and
+/// only inserts the parts that differ in the sorter.
+fn insert_strings_diff<MF>(
+    fid_docid_facet_strings_sorter: &mut Sorter<MF>,
+    key_buffer: &mut Vec<u8>,
+    mut del_strings: Vec<(String, String)>,
+    mut add_strings: Vec<(String, String)>,
+) -> Result<()>
+where
+    MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
+{
+    // We sort and dedup the normalized and original strings
+    del_strings.sort_unstable();
+    add_strings.sort_unstable();
+    del_strings.dedup();
+    add_strings.dedup();
+
+    let merged_strings_iter = itertools::merge_join_by(
+        del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
+        add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
+        |del, add| del.cmp(add),
+    );
+
+    // insert normalized and original facet string in sorter
+    for eob in merged_strings_iter {
+        key_buffer.truncate(TRUNCATE_SIZE);
+        match eob {
+            EitherOrBoth::Both(_, _) => (), // no need to touch anything
+            EitherOrBoth::Left((normalized, original)) => {
+                let truncated = truncate_string(normalized);
+                key_buffer.extend_from_slice(truncated.as_bytes());
+
+                let mut obkv = KvWriterDelAdd::memory();
+                obkv.insert(DelAdd::Deletion, original)?;
+                let bytes = obkv.into_inner()?;
+                fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
+            }
+            EitherOrBoth::Right((normalized, original)) => {
+                let truncated = truncate_string(normalized);
+                key_buffer.extend_from_slice(truncated.as_bytes());
+
+                let mut obkv = KvWriterDelAdd::memory();
+                obkv.insert(DelAdd::Addition, original)?;
+                let bytes = obkv.into_inner()?;
+                fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
 /// Represent what a document field contains.
 enum FilterableValues {
     /// Corresponds to the JSON `null` value.
@@ -182,6 +407,7 @@ enum FilterableValues {
     Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
 }
+/// Extracts the facet values of a JSON field.
 fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
     fn inner_extract_facet_values(
         value: &Value,
diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
index 92564b4cd..182d0c5d8 100644
--- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@@ -1,16 +1,18 @@
-use std::collections::HashMap;
 use std::fs::File;
 use std::io::{self, BufReader};
-use grenad::Sorter;
+use obkv::KvReaderU16;
 use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
-    try_split_array_at, GrenadParameters, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
+    GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::Result;
+
+const MAX_COUNTED_WORDS: usize = 30;
 /// Extracts the field id word count and the documents ids where
 /// this field id with this amount of words appear.
@@ -28,70 +30,62 @@ pub fn extract_fid_word_count_docids(
     let mut fid_word_count_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
         max_memory,
     );
-    // This map is assumed to not consume a lot of memory.
-    let mut document_fid_wordcount = HashMap::new();
-    let mut current_document_id = None;
-
+    let mut key_buffer = Vec::new();
+    let mut value_buffer = Vec::new();
    let mut cursor = docid_word_positions.into_cursor()?;
    while let Some((key, value)) = cursor.move_on_next()? {
-        let (document_id_bytes, _word_bytes) = try_split_array_at(key)
+        let (document_id_bytes, fid_bytes) = try_split_array_at(key)
            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
        let document_id = u32::from_be_bytes(document_id_bytes);
-        let curr_document_id = *current_document_id.get_or_insert(document_id);
-        if curr_document_id != document_id {
-            drain_document_fid_wordcount_into_sorter(
-                &mut fid_word_count_docids_sorter,
-                &mut document_fid_wordcount,
-                curr_document_id,
-            )?;
-            current_document_id = Some(document_id);
+        let del_add_reader = KvReaderDelAdd::new(value);
+        let deletion = del_add_reader
+            // get deleted words
+            .get(DelAdd::Deletion)
+            // count deleted words
+            .map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count())
+            // keep the count if under or equal to MAX_COUNTED_WORDS
+            .filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
+        let addition = del_add_reader
+            // get added words
+            .get(DelAdd::Addition)
+            // count added words
+            .map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count())
+            // keep the count if under or equal to MAX_COUNTED_WORDS
+            .filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
+
+        if deletion != addition {
+            // Insert the deleted word count in the sorter if it exists.
+            if let Some(word_count) = deletion {
+                value_buffer.clear();
+                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                key_buffer.clear();
+                key_buffer.extend_from_slice(fid_bytes);
+                key_buffer.push(word_count as u8);
+                fid_word_count_docids_sorter
+                    .insert(&key_buffer, value_writer.into_inner().unwrap())?;
+            }
+            // Insert the added word count in the sorter if it exists.
+            if let Some(word_count) = addition {
+                value_buffer.clear();
+                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                key_buffer.clear();
+                key_buffer.extend_from_slice(fid_bytes);
+                key_buffer.push(word_count as u8);
+                fid_word_count_docids_sorter
+                    .insert(&key_buffer, value_writer.into_inner().unwrap())?;
+            }
+        }
        }
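A note on the counting trick used just above: taking `MAX_COUNTED_WORDS + 1` entries before counting means a field over the cap produces a count that fails the `filter`, without ever iterating the whole positions obkv. An editor's sketch of the same logic in isolation:

```rust
// Count the words of a field, bailing out as soon as the cap is exceeded.
fn capped_word_count(positions_obkv: &[u8]) -> Option<usize> {
    const MAX_COUNTED_WORDS: usize = 30;
    let count = obkv::KvReaderU16::new(positions_obkv)
        .iter()
        .take(MAX_COUNTED_WORDS + 1)
        .count();
    // `None` stands for "too many words for this field to be counted".
    (count <= MAX_COUNTED_WORDS).then_some(count)
}
```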
-
-        for position in read_u32_ne_bytes(value) {
-            let (field_id, _) = relative_from_absolute_position(position);
-
-            let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
-            *value += 1;
-        }
-    }
-
-    if let Some(document_id) = current_document_id {
-        // We must make sure that don't lose the current document field id
-        // word count map if we break because we reached the end of the chunk.
-        drain_document_fid_wordcount_into_sorter(
-            &mut fid_word_count_docids_sorter,
-            &mut document_fid_wordcount,
-            document_id,
-        )?;
-    }
    sorter_into_reader(fid_word_count_docids_sorter, indexer)
}
-
-fn drain_document_fid_wordcount_into_sorter(
-    fid_word_count_docids_sorter: &mut Sorter<MergeFn>,
-    document_fid_wordcount: &mut HashMap<FieldId, u32>,
-    document_id: DocumentId,
-) -> Result<()> {
-    let mut key_buffer = Vec::new();
-
-    for (fid, count) in document_fid_wordcount.drain() {
-        if count <= 30 {
-            key_buffer.clear();
-            key_buffer.extend_from_slice(&fid.to_be_bytes());
-            key_buffer.push(count as u8);
-
-            fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
-        }
-    }
-
-    Ok(())
-}
diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs
index 285a4bdba..5ee7967d2 100644
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -6,6 +6,7 @@ use serde_json::Value;
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::GeoError;
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::extract_finite_float_from_value;
 use crate::{FieldId, InternalError, Result};
@@ -30,39 +31,71 @@ pub fn extract_geo_points(
    let mut cursor = obkv_documents.into_cursor()?;
    while let Some((docid_bytes, value)) = cursor.move_on_next()? {
        let obkv = obkv::KvReader::new(value);
-        // since we only needs the primary key when we throw an error we create this getter to
-        // lazily get it when needed
+        // since we only need the primary key when we throw an error
+        // we create this getter to lazily get it when needed
        let document_id = || -> Value {
            let document_id = obkv.get(primary_key_id).unwrap();
            serde_json::from_slice(document_id).unwrap()
        };
        // first we get the two fields
-        let lat = obkv.get(lat_fid);
-        let lng = obkv.get(lng_fid);
+        match (obkv.get(lat_fid), obkv.get(lng_fid)) {
+            (Some(lat), Some(lng)) => {
+                let deladd_lat_obkv = KvReaderDelAdd::new(lat);
+                let deladd_lng_obkv = KvReaderDelAdd::new(lng);
-        if let Some((lat, lng)) = lat.zip(lng) {
-            // then we extract the values
-            let lat = extract_finite_float_from_value(
-                serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
-            )
-            .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
+                // then we extract the values
+                let del_lat_lng = deladd_lat_obkv
+                    .get(DelAdd::Deletion)
+                    .zip(deladd_lng_obkv.get(DelAdd::Deletion))
+                    .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
+                    .transpose()?;
+                let add_lat_lng = deladd_lat_obkv
+                    .get(DelAdd::Addition)
+                    .zip(deladd_lng_obkv.get(DelAdd::Addition))
+                    .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
+                    .transpose()?;
-            let lng = extract_finite_float_from_value(
-                serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
-            )
-            .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
-
-            #[allow(clippy::drop_non_drop)]
-            let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
-            writer.insert(docid_bytes, bytes)?;
-        } else if lat.is_none() && lng.is_some() {
-            return Err(GeoError::MissingLatitude { document_id: document_id() })?;
-        } else if lat.is_some() && lng.is_none() {
-            return Err(GeoError::MissingLongitude { document_id: document_id() })?;
+                if del_lat_lng != add_lat_lng {
+                    let mut obkv = KvWriterDelAdd::memory();
+                    if let Some([lat, lng]) = del_lat_lng {
+                        #[allow(clippy::drop_non_drop)]
+                        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
+                        obkv.insert(DelAdd::Deletion, bytes)?;
+                    }
+                    if let Some([lat, lng]) = add_lat_lng {
+                        #[allow(clippy::drop_non_drop)]
+                        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
+                        obkv.insert(DelAdd::Addition, bytes)?;
+                    }
+                    let bytes = obkv.into_inner()?;
+                    writer.insert(docid_bytes, bytes)?;
+                }
+            }
+            (None, Some(_)) => {
+                return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
+            }
+            (Some(_), None) => {
+                return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
+            }
+            (None, None) => (),
        }
-        // else => the _geo object was `null`, there is nothing to do
    }
    writer_into_reader(writer)
}
+
+/// Extract the finite floats lat and lng from two byte slices.
+fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
+    let lat = extract_finite_float_from_value(
+        serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
+    )
+    .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
+
+    let lng = extract_finite_float_from_value(
+        serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
+    )
+    .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
+
+    Ok([lat, lng])
+}
diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs
index 863bc07c3..317a9aec3 100644
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -1,13 +1,24 @@
+use std::cmp::Ordering;
 use std::convert::TryFrom;
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io::{self, BufReader, BufWriter};
+use std::mem::size_of;
+use std::str::from_utf8;
 use bytemuck::cast_slice;
+use grenad::Writer;
+use itertools::EitherOrBoth;
+use ordered_float::OrderedFloat;
 use serde_json::{from_slice, Value};
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::UserError;
-use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::index_documents::helpers::try_split_at;
+use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};
+
+/// The length of the elements that are always in the buffer when inserting new values.
+const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
 /// Extracts the embedding vector contained in each document under the `_vectors` field.
 ///
@@ -16,7 +27,6 @@ use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
 pub fn extract_vector_points<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
-    primary_key_id: FieldId,
     vectors_fid: FieldId,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
@@ -27,43 +37,112 @@ pub fn extract_vector_points(
        tempfile::tempfile()?,
    );
+    let mut key_buffer = Vec::new();
    let mut cursor = obkv_documents.into_cursor()?;
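The loop that follows destructures keys that are now `(internal docid, external docid)` pairs. An editor's sketch of that split, assuming the internal id is the 4-byte big-endian prefix, as the `u32::from_be_bytes` calls elsewhere in this diff suggest:

```rust
// [internal docid (4 bytes, big-endian) | external docid (UTF-8 bytes)]
fn split_key(key: &[u8]) -> (u32, &str) {
    let (docid_bytes, external_bytes) = key.split_at(std::mem::size_of::<u32>());
    let docid = u32::from_be_bytes(docid_bytes.try_into().unwrap());
    (docid, std::str::from_utf8(external_bytes).unwrap())
}
```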
-    while let Some((docid_bytes, value)) = cursor.move_on_next()? {
+    while let Some((key, value)) = cursor.move_on_next()? {
+        // this must always be serialized as (docid, external_docid);
+        let (docid_bytes, external_id_bytes) =
+            try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
+        debug_assert!(from_utf8(external_id_bytes).is_ok());
+
        let obkv = obkv::KvReader::new(value);
+        key_buffer.clear();
+        key_buffer.extend_from_slice(docid_bytes);
        // since we only needs the primary key when we throw an error we create this getter to
        // lazily get it when needed
-        let document_id = || -> Value {
-            let document_id = obkv.get(primary_key_id).unwrap();
-            from_slice(document_id).unwrap()
-        };
+        let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
        // first we retrieve the _vectors field
-        if let Some(vectors) = obkv.get(vectors_fid) {
-            // extract the vectors
-            let vectors = match from_slice(vectors) {
-                Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors),
-                Err(_) => {
-                    return Err(UserError::InvalidVectorsType {
-                        document_id: document_id(),
-                        value: from_slice(vectors).map_err(InternalError::SerdeJson)?,
-                    }
-                    .into())
-                }
-            };
+        if let Some(value) = obkv.get(vectors_fid) {
+            let vectors_obkv = KvReaderDelAdd::new(value);
-            if let Some(vectors) = vectors {
-                for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
-                    let index = u16::try_from(i).unwrap();
-                    let mut key = docid_bytes.to_vec();
-                    key.extend_from_slice(&index.to_be_bytes());
-                    let bytes = cast_slice(&vector);
-                    writer.insert(key, bytes)?;
-                }
-            }
+            // then we extract the values
+            let del_vectors = vectors_obkv
+                .get(DelAdd::Deletion)
+                .map(|vectors| extract_vectors(vectors, document_id))
+                .transpose()?
+                .flatten();
+            let add_vectors = vectors_obkv
+                .get(DelAdd::Addition)
+                .map(|vectors| extract_vectors(vectors, document_id))
+                .transpose()?
+                .flatten();
+
+            // and we finally push the unique vectors into the writer
+            push_vectors_diff(
+                &mut writer,
+                &mut key_buffer,
+                del_vectors.unwrap_or_default(),
+                add_vectors.unwrap_or_default(),
+            )?;
        }
-        // else => the `_vectors` object was `null`, there is nothing to do
    }
    writer_into_reader(writer)
}
+
+/// Computes the diff between both Del and Add vectors and
+/// only inserts the parts that differ in the writer.
+fn push_vectors_diff(
+    writer: &mut Writer<BufWriter<File>>,
+    key_buffer: &mut Vec<u8>,
+    mut del_vectors: Vec<Vec<f32>>,
+    mut add_vectors: Vec<Vec<f32>>,
+) -> Result<()> {
+    // We sort and dedup the vectors
+    del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
+    add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
+    del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
+    add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
+
+    let merged_vectors_iter =
+        itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
+
+    // insert vectors into the writer
+    for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
+        // Generate the key by extending the unique index to it.
+        key_buffer.truncate(TRUNCATE_SIZE);
+        let index = u16::try_from(i).unwrap();
+        key_buffer.extend_from_slice(&index.to_be_bytes());
+
+        match eob {
+            EitherOrBoth::Both(_, _) => (), // no need to touch anything
+            EitherOrBoth::Left(vector) => {
+                // We insert only the Del part of the Obkv to inform
+                // that we only want to remove all those vectors.
+                let mut obkv = KvWriterDelAdd::memory();
+                obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
+                let bytes = obkv.into_inner()?;
+                writer.insert(&key_buffer, bytes)?;
+            }
+            EitherOrBoth::Right(vector) => {
+                // We insert only the Add part of the Obkv to inform
+                // that we only want to add all those vectors.
+                let mut obkv = KvWriterDelAdd::memory();
+                obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
+                let bytes = obkv.into_inner()?;
+                writer.insert(&key_buffer, bytes)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Compares two vectors by using the OrderedFloat helper.
+fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
+    a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
+}
+
+/// Extracts the vectors from a JSON value.
+fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Option<Vec<Vec<f32>>>> {
+    match from_slice(value) {
+        Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)),
+        Err(_) => Err(UserError::InvalidVectorsType {
+            document_id: document_id(),
+            value: from_slice(value).map_err(InternalError::SerdeJson)?,
+        }
+        .into()),
+    }
+}
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index f211f7023..66092821f 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -1,18 +1,20 @@
-use std::collections::HashSet;
+use std::collections::{BTreeSet, HashSet};
 use std::fs::File;
 use std::io::{self, BufReader};
-use std::iter::FromIterator;
-use roaring::RoaringBitmap;
+use heed::BytesDecode;
+use obkv::KvReaderU16;
 use super::helpers::{
-    create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
-    try_split_array_at, GrenadParameters,
+    create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
+    try_split_array_at, writer_into_reader, GrenadParameters,
 };
 use crate::error::SerializationError;
+use crate::heed_codec::StrBEU16Codec;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::update::index_documents::helpers::read_u32_ne_bytes;
-use crate::{relative_from_absolute_position, FieldId, Result};
+use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::MergeFn;
+use crate::{DocumentId, FieldId, Result};
 /// Extracts the word and the documents ids where this word appear.
 ///
@@ -26,65 +28,152 @@ pub fn extract_word_docids(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
     exact_attributes: &HashSet<FieldId>,
-) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
+) -> Result<(
+    grenad::Reader<BufReader<File>>,
+    grenad::Reader<BufReader<File>>,
+    grenad::Reader<BufReader<File>>,
+)> {
     puffin::profile_function!();
     let max_memory = indexer.max_memory_by_thread();
-    let mut word_docids_sorter = create_sorter(
+    let mut word_fid_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory.map(|x| x / 2),
+        max_memory.map(|x| x / 3),
+    );
+    let mut key_buffer = Vec::new();
+    let mut del_words = BTreeSet::new();
+    let mut add_words = BTreeSet::new();
+    let mut cursor = docid_word_positions.into_cursor()?;
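Two key layouts meet in the loop below: the input keys are `[docid (4 bytes) | fid (2 bytes)]`, and the keys produced by the `words_into_sorter` helper further down are `[word | 0x00 | fid (big-endian)]`, which is what `StrBEU16Codec` decodes back into `(&str, u16)`. An editor's sketch of the produced layout:

```rust
// Build a word_fid_docids key: the word bytes, a NUL separator, then the
// field id in big-endian, matching what StrBEU16Codec expects to decode.
fn word_fid_key(word: &[u8], fid: u16) -> Vec<u8> {
    let mut key = Vec::with_capacity(word.len() + 1 + std::mem::size_of::<u16>());
    key.extend_from_slice(word);
    key.push(0);
    key.extend_from_slice(&fid.to_be_bytes());
    key
}
```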
+    while let Some((key, value)) = cursor.move_on_next()? {
+        let (document_id_bytes, fid_bytes) = try_split_array_at(key)
+            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
+        let (fid_bytes, _) = try_split_array_at(fid_bytes)
+            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
+        let document_id = u32::from_be_bytes(document_id_bytes);
+        let fid = u16::from_be_bytes(fid_bytes);
+
+        let del_add_reader = KvReaderDelAdd::new(value);
+        // extract all unique words to remove.
+        if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
+            for (_pos, word) in KvReaderU16::new(deletion).iter() {
+                del_words.insert(word.to_vec());
+            }
+        }
+
+        // extract all unique additional words.
+        if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
+            for (_pos, word) in KvReaderU16::new(addition).iter() {
+                add_words.insert(word.to_vec());
+            }
+        }
+
+        words_into_sorter(
+            document_id,
+            fid,
+            &mut key_buffer,
+            &del_words,
+            &add_words,
+            &mut word_fid_docids_sorter,
+        )?;
+
+        del_words.clear();
+        add_words.clear();
+    }
+
+    let mut word_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Unstable,
+        merge_deladd_cbo_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|x| x / 3),
    );
    let mut exact_word_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Unstable,
-        merge_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
-        max_memory.map(|x| x / 2),
+        max_memory.map(|x| x / 3),
    );
-    let mut value_buffer = Vec::new();
-    let mut cursor = docid_word_positions.into_cursor()?;
-    while let Some((key, positions)) = cursor.move_on_next()? {
-        let (document_id_bytes, word_bytes) = try_split_array_at(key)
-            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
-        let document_id = u32::from_be_bytes(document_id_bytes);
+    let mut word_fid_docids_writer = create_writer(
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        tempfile::tempfile()?,
+    );
-        let bitmap = RoaringBitmap::from_iter(Some(document_id));
-        serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
+    let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
+    // TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
+    while let Some((key, value)) = iter.next()? {
+        // only keep the value if there is a change to apply in the DB.
+        if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
+            word_fid_docids_writer.insert(key, value)?;
+        }
-        // If there are no exact attributes, we do not need to iterate over positions.
-        if exact_attributes.is_empty() {
-            word_docids_sorter.insert(word_bytes, &value_buffer)?;
+        let (word, fid) = StrBEU16Codec::bytes_decode(key)
+            .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
+
+        // every word contained in an attribute set to exact must be pushed into the exact_words list.
+        if exact_attributes.contains(&fid) {
+            exact_word_docids_sorter.insert(word.as_bytes(), value)?;
        } else {
-            let mut added_to_exact = false;
-            let mut added_to_word_docids = false;
-            for position in read_u32_ne_bytes(positions) {
-                // as soon as we know that this word had been to both readers, we don't need to
-                // iterate over the positions.
-                if added_to_exact && added_to_word_docids {
-                    break;
-                }
-                let (fid, _) = relative_from_absolute_position(position);
-                if exact_attributes.contains(&fid) && !added_to_exact {
-                    exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
-                    added_to_exact = true;
-                } else if !added_to_word_docids {
-                    word_docids_sorter.insert(word_bytes, &value_buffer)?;
-                    added_to_word_docids = true;
-                }
-            }
+            word_docids_sorter.insert(word.as_bytes(), value)?;
        }
    }
    Ok((
        sorter_into_reader(word_docids_sorter, indexer)?,
        sorter_into_reader(exact_word_docids_sorter, indexer)?,
+        writer_into_reader(word_fid_docids_writer)?,
    ))
}
+
+fn words_into_sorter(
+    document_id: DocumentId,
+    fid: FieldId,
+    key_buffer: &mut Vec<u8>,
+    del_words: &BTreeSet<Vec<u8>>,
+    add_words: &BTreeSet<Vec<u8>>,
+    word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
+) -> Result<()> {
+    puffin::profile_function!();
+
+    use itertools::merge_join_by;
+    use itertools::EitherOrBoth::{Both, Left, Right};
+
+    let mut buffer = Vec::new();
+    for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
+        buffer.clear();
+        let mut value_writer = KvWriterDelAdd::new(&mut buffer);
+        let word_bytes = match eob {
+            Left(word_bytes) => {
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                word_bytes
+            }
+            Right(word_bytes) => {
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                word_bytes
+            }
+            Both(word_bytes, _) => {
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                word_bytes
+            }
+        };
+
+        key_buffer.clear();
+        key_buffer.extend_from_slice(word_bytes);
+        key_buffer.push(0);
+        key_buffer.extend_from_slice(&fid.to_be_bytes());
+        word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
+    }
+
+    Ok(())
+}
diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
deleted file mode 100644
index 09f571038..000000000
--- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-use std::fs::File;
-use std::io::{self, BufReader};
-
-use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
-    try_split_array_at, GrenadParameters,
-};
-use crate::error::SerializationError;
-use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{relative_from_absolute_position, DocumentId, Result};
-
-/// Extracts the word, field id, and the documents ids where this word appear at this field id.
-#[logging_timer::time]
-pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
-    docid_word_positions: grenad::Reader<R>,
-    indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
-    puffin::profile_function!();
-
-    let max_memory = indexer.max_memory_by_thread();
-
-    let mut word_fid_docids_sorter = create_sorter(
-        grenad::SortAlgorithm::Unstable,
-        merge_cbo_roaring_bitmaps,
-        indexer.chunk_compression_type,
-        indexer.chunk_compression_level,
-        indexer.max_nb_chunks,
-        max_memory,
-    );
-
-    let mut key_buffer = Vec::new();
-    let mut cursor = docid_word_positions.into_cursor()?;
-    while let Some((key, value)) = cursor.move_on_next()?
{ - let (document_id_bytes, word_bytes) = try_split_array_at(key) - .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - let document_id = DocumentId::from_be_bytes(document_id_bytes); - - for position in read_u32_ne_bytes(value) { - key_buffer.clear(); - key_buffer.extend_from_slice(word_bytes); - key_buffer.push(0); - let (fid, _) = relative_from_absolute_position(position); - key_buffer.extend_from_slice(&fid.to_be_bytes()); - word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; - } - } - - let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; - - Ok(word_fid_docids_reader) -} diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 9ddd5ff4c..b8a377247 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -1,16 +1,18 @@ -use std::cmp::Ordering; -use std::collections::{BinaryHeap, HashMap}; +use std::collections::{BTreeMap, VecDeque}; use std::fs::File; use std::io::BufReader; -use std::{cmp, io, mem, str, vec}; +use std::{cmp, io}; + +use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, MergeFn, + create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at, + writer_into_reader, GrenadParameters, MergeFn, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::proximity::{positions_proximity, MAX_DISTANCE}; +use crate::proximity::{index_proximity, MAX_DISTANCE}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::{DocumentId, Result}; /// Extracts the best proximity between pairs of words and the documents ids where this pair appear. @@ -26,58 +28,137 @@ pub fn extract_word_pair_proximity_docids( let max_memory = indexer.max_memory_by_thread(); - let mut word_pair_proximity_docids_sorter = create_sorter( - grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory.map(|m| m / 2), - ); + let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE) + .map(|_| { + create_sorter( + grenad::SortAlgorithm::Unstable, + merge_deladd_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / MAX_DISTANCE as usize), + ) + }) + .collect(); - // This map is assumed to not consume a lot of memory. - let mut document_word_positions_heap = BinaryHeap::new(); + let mut del_word_positions: VecDeque<(String, u16)> = + VecDeque::with_capacity(MAX_DISTANCE as usize); + let mut add_word_positions: VecDeque<(String, u16)> = + VecDeque::with_capacity(MAX_DISTANCE as usize); + let mut del_word_pair_proximity = BTreeMap::new(); + let mut add_word_pair_proximity = BTreeMap::new(); let mut current_document_id = None; let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ - let (document_id_bytes, word_bytes) = try_split_array_at(key) + let (document_id_bytes, _fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - let word = str::from_utf8(word_bytes)?; - let curr_document_id = *current_document_id.get_or_insert(document_id); - if curr_document_id != document_id { - let document_word_positions_heap = mem::take(&mut document_word_positions_heap); + // if we change document, we fill the sorter + if current_document_id.map_or(false, |id| id != document_id) { + puffin::profile_scope!("Document into sorter"); + document_word_positions_into_sorter( - curr_document_id, - document_word_positions_heap, - &mut word_pair_proximity_docids_sorter, + current_document_id.unwrap(), + &del_word_pair_proximity, + &add_word_pair_proximity, + &mut word_pair_proximity_docids_sorters, )?; - current_document_id = Some(document_id); + del_word_pair_proximity.clear(); + add_word_pair_proximity.clear(); } - let word = word.to_string(); - let mut positions: Vec<_> = read_u32_ne_bytes(value).collect(); - positions.sort_unstable(); - let mut iter = positions.into_iter(); - if let Some(position) = iter.next() { - document_word_positions_heap.push(PeekedWordPosition { word, position, iter }); - } + current_document_id = Some(document_id); + + let (del, add): (Result<_>, Result<_>) = rayon::join( + || { + // deletions + if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) { + for (position, word) in KvReaderU16::new(deletion).iter() { + // drain the proximity window until the head word is considered close to the word we are inserting. + while del_word_positions.get(0).map_or(false, |(_w, p)| { + index_proximity(*p as u32, position as u32) >= MAX_DISTANCE + }) { + word_positions_into_word_pair_proximity( + &mut del_word_positions, + &mut del_word_pair_proximity, + )?; + } + + // insert the new word. + let word = std::str::from_utf8(word)?; + del_word_positions.push_back((word.to_string(), position)); + } + + while !del_word_positions.is_empty() { + word_positions_into_word_pair_proximity( + &mut del_word_positions, + &mut del_word_pair_proximity, + )?; + } + } + + Ok(()) + }, + || { + // additions + if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) { + for (position, word) in KvReaderU16::new(addition).iter() { + // drain the proximity window until the head word is considered close to the word we are inserting. + while add_word_positions.get(0).map_or(false, |(_w, p)| { + index_proximity(*p as u32, position as u32) >= MAX_DISTANCE + }) { + word_positions_into_word_pair_proximity( + &mut add_word_positions, + &mut add_word_pair_proximity, + )?; + } + + // insert the new word. + let word = std::str::from_utf8(word)?; + add_word_positions.push_back((word.to_string(), position)); + } + + while !add_word_positions.is_empty() { + word_positions_into_word_pair_proximity( + &mut add_word_positions, + &mut add_word_pair_proximity, + )?; + } + } + + Ok(()) + }, + ); + + del?; + add?; } if let Some(document_id) = current_document_id { - // We must make sure that don't lose the current document field id - // word count map if we break because we reached the end of the chunk. 
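An editorial aside on the `VecDeque`-based windows introduced above, which replace the old heap: words are pushed at the back, and whenever the head is too far from the incoming position the head is popped and paired with everything still in the window, keeping the smallest proximity per ordered pair. A standalone sketch of that drain step (mirroring `word_positions_into_word_pair_proximity` further down):

```rust
use std::collections::{BTreeMap, VecDeque};

// Pop the head word and record its proximity to every word left in the window.
fn drain_head(
    window: &mut VecDeque<(String, u16)>,
    pairs: &mut BTreeMap<(String, String), u8>,
    proximity: impl Fn(u32, u32) -> u32,
    max_distance: u32,
) {
    if let Some((head_word, head_pos)) = window.pop_front() {
        for (word, pos) in window.iter() {
            let prox = proximity(head_pos as u32, *pos as u32);
            if prox > 0 && prox < max_distance {
                pairs
                    .entry((head_word.clone(), word.clone()))
                    .and_modify(|p| *p = (*p).min(prox as u8))
                    .or_insert(prox as u8);
            }
        }
    }
}
```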
- let document_word_positions_heap = mem::take(&mut document_word_positions_heap); + puffin::profile_scope!("Final document into sorter"); document_word_positions_into_sorter( document_id, - document_word_positions_heap, - &mut word_pair_proximity_docids_sorter, + &del_word_pair_proximity, + &add_word_pair_proximity, + &mut word_pair_proximity_docids_sorters, )?; } + { + puffin::profile_scope!("sorter_into_reader"); + let mut writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); - sorter_into_reader(word_pair_proximity_docids_sorter, indexer) + for sorter in word_pair_proximity_docids_sorters { + sorter.write_into_stream_writer(&mut writer)?; + } + + writer_into_reader(writer) + } } /// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive. @@ -86,96 +167,66 @@ pub fn extract_word_pair_proximity_docids( /// close to each other. fn document_word_positions_into_sorter( document_id: DocumentId, - mut word_positions_heap: BinaryHeap>>, - word_pair_proximity_docids_sorter: &mut grenad::Sorter, + del_word_pair_proximity: &BTreeMap<(String, String), u8>, + add_word_pair_proximity: &BTreeMap<(String, String), u8>, + word_pair_proximity_docids_sorters: &mut [grenad::Sorter], ) -> Result<()> { - let mut word_pair_proximity = HashMap::new(); - let mut ordered_peeked_word_positions = Vec::new(); - while !word_positions_heap.is_empty() { - while let Some(peeked_word_position) = word_positions_heap.pop() { - ordered_peeked_word_positions.push(peeked_word_position); - if ordered_peeked_word_positions.len() == 7 { - break; - } - } - - if let Some((head, tail)) = ordered_peeked_word_positions.split_first() { - for PeekedWordPosition { word, position, .. } in tail { - let prox = positions_proximity(head.position, *position); - if prox > 0 && prox < MAX_DISTANCE { - word_pair_proximity - .entry((head.word.clone(), word.clone())) - .and_modify(|p| { - *p = cmp::min(*p, prox); - }) - .or_insert(prox); - } - } - - // Push the tail in the heap. - let tail_iter = ordered_peeked_word_positions.drain(1..); - word_positions_heap.extend(tail_iter); - - // Advance the head and push it in the heap. 
- if let Some(mut head) = ordered_peeked_word_positions.pop() { - if let Some(next_position) = head.iter.next() { - let prox = positions_proximity(head.position, next_position); - - if prox > 0 && prox < MAX_DISTANCE { - word_pair_proximity - .entry((head.word.clone(), head.word.clone())) - .and_modify(|p| { - *p = cmp::min(*p, prox); - }) - .or_insert(prox); - } - - word_positions_heap.push(PeekedWordPosition { - word: head.word, - position: next_position, - iter: head.iter, - }); - } - } - } - } + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + let mut buffer = Vec::new(); let mut key_buffer = Vec::new(); - for ((w1, w2), prox) in word_pair_proximity { + for eob in + merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| { + d.cmp(a) + }) + { + buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut buffer); + let ((w1, w2), prox) = match eob { + Left(key_value) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + key_value + } + Right(key_value) => { + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key_value + } + Both(key_value, _) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key_value + } + }; + key_buffer.clear(); - key_buffer.push(prox as u8); + key_buffer.push(*prox); key_buffer.extend_from_slice(w1.as_bytes()); key_buffer.push(0); key_buffer.extend_from_slice(w2.as_bytes()); - word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + word_pair_proximity_docids_sorters[*prox as usize - 1] + .insert(&key_buffer, value_writer.into_inner().unwrap())?; } Ok(()) } -struct PeekedWordPosition { - word: String, - position: u32, - iter: I, -} - -impl Ord for PeekedWordPosition { - fn cmp(&self, other: &Self) -> Ordering { - self.position.cmp(&other.position).reverse() - } -} - -impl PartialOrd for PeekedWordPosition { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Eq for PeekedWordPosition {} - -impl PartialEq for PeekedWordPosition { - fn eq(&self, other: &Self) -> bool { - self.position == other.position +fn word_positions_into_word_pair_proximity( + word_positions: &mut VecDeque<(String, u16)>, + word_pair_proximity: &mut BTreeMap<(String, String), u8>, +) -> Result<()> { + let (head_word, head_position) = word_positions.pop_front().unwrap(); + for (word, position) in word_positions.iter() { + let prox = index_proximity(head_position as u32, *position as u32) as u8; + if prox > 0 && prox < MAX_DISTANCE as u8 { + word_pair_proximity + .entry((head_word.clone(), word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + } } + Ok(()) } diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 94139ddf8..89b77d140 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -1,13 +1,18 @@ +use std::collections::BTreeSet; use std::fs::File; use std::io::{self, BufReader}; +use obkv::KvReaderU16; + use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, + create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, + 
GrenadParameters, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::MergeFn; +use crate::{bucketed_position, DocumentId, Result}; /// Extracts the word positions and the documents ids where this word appear. /// @@ -24,32 +29,111 @@ pub fn extract_word_position_docids( let mut word_position_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); + let mut del_word_positions: BTreeSet<(u16, Vec)> = BTreeSet::new(); + let mut add_word_positions: BTreeSet<(u16, Vec)> = BTreeSet::new(); + let mut current_document_id: Option = None; let mut key_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { - let (document_id_bytes, word_bytes) = try_split_array_at(key) + let (document_id_bytes, _fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = DocumentId::from_be_bytes(document_id_bytes); - for position in read_u32_ne_bytes(value) { - key_buffer.clear(); - key_buffer.extend_from_slice(word_bytes); - key_buffer.push(0); - let (_, position) = relative_from_absolute_position(position); - let position = bucketed_position(position); - key_buffer.extend_from_slice(&position.to_be_bytes()); - word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + if current_document_id.map_or(false, |id| document_id != id) { + words_position_into_sorter( + current_document_id.unwrap(), + &mut key_buffer, + &del_word_positions, + &add_word_positions, + &mut word_position_docids_sorter, + )?; + del_word_positions.clear(); + add_word_positions.clear(); + } + + current_document_id = Some(document_id); + + let del_add_reader = KvReaderDelAdd::new(value); + // extract all unique words to remove. + if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { + for (position, word_bytes) in KvReaderU16::new(deletion).iter() { + let position = bucketed_position(position); + del_word_positions.insert((position, word_bytes.to_vec())); + } + } + + // extract all unique additional words. 
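+        // (mirrors the deletion branch above: positions are bucketed here too, so
+        // the del and add sets are keyed consistently)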
+        if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
+            for (position, word_bytes) in KvReaderU16::new(addition).iter() {
+                let position = bucketed_position(position);
+                add_word_positions.insert((position, word_bytes.to_vec()));
+            }
+        }
     }
 
+    if let Some(document_id) = current_document_id {
+        words_position_into_sorter(
+            document_id,
+            &mut key_buffer,
+            &del_word_positions,
+            &add_word_positions,
+            &mut word_position_docids_sorter,
+        )?;
+    }
+
+    // TODO remove noop DelAdd OBKV
     let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
 
     Ok(word_position_docids_reader)
 }
+
+fn words_position_into_sorter(
+    document_id: DocumentId,
+    key_buffer: &mut Vec<u8>,
+    del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
+    add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
+    word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
+) -> Result<()> {
+    puffin::profile_function!();
+
+    use itertools::merge_join_by;
+    use itertools::EitherOrBoth::{Both, Left, Right};
+
+    let mut buffer = Vec::new();
+    for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
+    {
+        buffer.clear();
+        let mut value_writer = KvWriterDelAdd::new(&mut buffer);
+        let (position, word_bytes) = match eob {
+            Left(key) => {
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                key
+            }
+            Right(key) => {
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                key
+            }
+            Both(key, _) => {
+                // both values need to be kept because they will be used by other extractors.
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                key
+            }
+        };
+
+        key_buffer.clear();
+        key_buffer.extend_from_slice(word_bytes);
+        key_buffer.push(0);
+        key_buffer.extend_from_slice(&position.to_be_bytes());
+        word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
+    }
+
+    Ok(())
+}
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index f44eac8f5..57f349894 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -6,7 +6,6 @@ mod extract_fid_word_count_docids;
 mod extract_geo_points;
 mod extract_vector_points;
 mod extract_word_docids;
-mod extract_word_fid_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;
 
@@ -26,14 +25,14 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
 use self::extract_geo_points::extract_geo_points;
 use self::extract_vector_points::extract_vector_points;
 use self::extract_word_docids::extract_word_docids;
-use self::extract_word_fid_docids::extract_word_fid_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
 use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{
-    as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
-    GrenadParameters, MergeFn, MergeableReader,
+    as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
+    MergeFn, MergeableReader,
 };
 use super::{helpers, TypedChunk};
+use crate::proximity::ProximityPrecision;
 use crate::{FieldId, Result};
 
 /// Extract data for each databases from obkv documents in parallel.
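The extractors above all reconcile their deletion and addition sides with the same
three-way merge: itertools::merge_join_by walks both ordered collections in key
order, and the EitherOrBoth arms decide which sides of the DelAdd value receive the
document id. A minimal, self-contained sketch of that pattern, with hypothetical
del/add sets and println! standing in for the real KvWriterDelAdd values:

    use std::collections::BTreeSet;

    use itertools::merge_join_by;
    use itertools::EitherOrBoth::{Both, Left, Right};

    fn main() {
        let del: BTreeSet<&str> = ["apple", "kiwi"].into();
        let add: BTreeSet<&str> = ["kiwi", "pear"].into();

        // Walk both ordered sets at once; each key shows up exactly once.
        for eob in merge_join_by(del.iter(), add.iter(), |d, a| d.cmp(a)) {
            match eob {
                // only deleted: the entry carries a Deletion side only.
                Left(key) => println!("{key}: Deletion"),
                // only added: the entry carries an Addition side only.
                Right(key) => println!("{key}: Addition"),
                // deleted and re-added: keep both sides.
                Both(key, _) => println!("{key}: Deletion + Addition"),
            }
        }
        // prints: apple: Deletion / kiwi: Deletion + Addition / pear: Addition
    }

Keeping both sides in the Both arm is what lets the final bitmap merge remove and
re-insert the same document id instead of dropping it.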
@@ -54,6 +53,7 @@ pub(crate) fn data_from_obkv_documents( dictionary: Option<&[&str]>, max_positions_per_attributes: Option, exact_attributes: HashSet, + proximity_precision: ProximityPrecision, ) -> Result<()> { puffin::profile_function!(); @@ -65,7 +65,6 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), vectors_field_id, - primary_key_id, ) }) .collect::>()?; @@ -94,9 +93,9 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, ( - docid_fid_facet_numbers_chunks, + fid_docid_facet_numbers_chunks, ( - docid_fid_facet_strings_chunks, + fid_docid_facet_strings_chunks, ( facet_is_null_docids_chunks, (facet_is_empty_docids_chunks, facet_exists_docids_chunks), @@ -110,7 +109,7 @@ pub(crate) fn data_from_obkv_documents( let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { debug!("merge {} database", "facet-id-exists-docids"); - match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { Ok(reader) => { let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); } @@ -126,7 +125,7 @@ pub(crate) fn data_from_obkv_documents( let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { debug!("merge {} database", "facet-id-is-null-docids"); - match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { Ok(reader) => { let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); } @@ -142,7 +141,7 @@ pub(crate) fn data_from_obkv_documents( let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { debug!("merge {} database", "facet-id-is-empty-docids"); - match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { Ok(reader) => { let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); } @@ -153,39 +152,48 @@ pub(crate) fn data_from_obkv_documents( }); } - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks.clone(), - indexer, - lmdb_writer_sx.clone(), - extract_word_pair_proximity_docids, - merge_cbo_roaring_bitmaps, - TypedChunk::WordPairProximityDocids, - "word-pair-proximity-docids", - ); + if proximity_precision == ProximityPrecision::WordScale { + spawn_extraction_task::<_, _, Vec>>>( + docid_word_positions_chunks.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_word_pair_proximity_docids, + merge_deladd_cbo_roaring_bitmaps, + TypedChunk::WordPairProximityDocids, + "word-pair-proximity-docids", + ); + } spawn_extraction_task::<_, _, Vec>>>( docid_word_positions_chunks.clone(), indexer, lmdb_writer_sx.clone(), extract_fid_word_count_docids, - merge_cbo_roaring_bitmaps, - TypedChunk::FieldIdWordcountDocids, + merge_deladd_cbo_roaring_bitmaps, + TypedChunk::FieldIdWordCountDocids, "field-id-wordcount-docids", ); spawn_extraction_task::< _, _, - Vec<(grenad::Reader>, grenad::Reader>)>, + Vec<( + grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + )>, >( docid_word_positions_chunks.clone(), indexer, lmdb_writer_sx.clone(), move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), - merge_roaring_bitmaps, - |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, + merge_deladd_cbo_roaring_bitmaps, + |(word_docids_reader, 
exact_word_docids_reader, word_fid_docids_reader)| { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } }, "word-docids", ); @@ -195,36 +203,27 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_word_position_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", ); - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks, - indexer, - lmdb_writer_sx.clone(), - extract_word_fid_docids, - merge_cbo_roaring_bitmaps, - TypedChunk::WordFidDocids, - "word-fid-docids", - ); spawn_extraction_task::<_, _, Vec>>>( - docid_fid_facet_strings_chunks, + fid_docid_facet_strings_chunks, indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::FieldIdFacetStringDocids, "field-id-facet-string-docids", ); spawn_extraction_task::<_, _, Vec>>>( - docid_fid_facet_numbers_chunks, + fid_docid_facet_numbers_chunks, indexer, lmdb_writer_sx, extract_facet_number_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); @@ -278,7 +277,6 @@ fn send_original_documents_data( indexer: GrenadParameters, lmdb_writer_sx: Sender>, vectors_field_id: Option, - primary_key_id: FieldId, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -287,12 +285,7 @@ fn send_original_documents_data( let documents_chunk_cloned = original_documents_chunk.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); rayon::spawn(move || { - let result = extract_vector_points( - documents_chunk_cloned, - indexer, - primary_key_id, - vectors_field_id, - ); + let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id); let _ = match result { Ok(vector_points) => { lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) @@ -356,10 +349,10 @@ fn send_and_extract_flattened_documents_data( }); } - let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = + let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { - let (documents_ids, docid_word_positions_chunk, script_language_pair) = + let (docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions( flattened_documents_chunk.clone(), indexer, @@ -370,9 +363,6 @@ fn send_and_extract_flattened_documents_data( max_positions_per_attributes, )?; - // send documents_ids to DB writer - let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))); - // send docid_word_positions_chunk to DB writer let docid_word_positions_chunk = unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? }; @@ -384,8 +374,8 @@ fn send_and_extract_flattened_documents_data( }, || { let ExtractedFacetValues { - docid_fid_facet_numbers_chunk, - docid_fid_facet_strings_chunk, + fid_docid_facet_numbers_chunk, + fid_docid_facet_strings_chunk, fid_facet_is_null_docids_chunk, fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk, @@ -396,26 +386,26 @@ fn send_and_extract_flattened_documents_data( geo_fields_ids, )?; - // send docid_fid_facet_numbers_chunk to DB writer - let docid_fid_facet_numbers_chunk = - unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? 
}; + // send fid_docid_facet_numbers_chunk to DB writer + let fid_docid_facet_numbers_chunk = + unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? }; let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers( - docid_fid_facet_numbers_chunk.clone(), + fid_docid_facet_numbers_chunk.clone(), ))); - // send docid_fid_facet_strings_chunk to DB writer - let docid_fid_facet_strings_chunk = - unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? }; + // send fid_docid_facet_strings_chunk to DB writer + let fid_docid_facet_strings_chunk = + unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? }; let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( - docid_fid_facet_strings_chunk.clone(), + fid_docid_facet_strings_chunk.clone(), ))); Ok(( - docid_fid_facet_numbers_chunk, + fid_docid_facet_numbers_chunk, ( - docid_fid_facet_strings_chunk, + fid_docid_facet_strings_chunk, ( fid_facet_is_null_docids_chunk, (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), @@ -425,5 +415,5 @@ fn send_and_extract_flattened_documents_data( }, ); - Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) + Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?)) } diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 582bf2a5b..e1b27baa2 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -1,14 +1,12 @@ use std::borrow::Cow; use std::fs::File; use std::io::{self, BufReader, BufWriter, Seek}; -use std::time::Instant; use grenad::{CompressionType, Sorter}; -use heed::types::ByteSlice; -use log::debug; +use heed::types::Bytes; use super::{ClonableMmap, MergeFn}; -use crate::error::InternalError; +use crate::update::index_documents::valid_lmdb_key; use crate::Result; pub type CursorClonableMmap = io::Cursor; @@ -47,6 +45,7 @@ pub fn create_sorter( builder.allow_realloc(false); } builder.sort_algorithm(sort_algorithm); + builder.sort_in_parallel(true); builder.build() } @@ -54,6 +53,7 @@ pub fn sorter_into_reader( sorter: grenad::Sorter, indexer: GrenadParameters, ) -> Result>> { + puffin::profile_function!(); let mut writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -115,6 +115,32 @@ impl MergeableReader for Vec<(grenad::Reader>, grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + )> +{ + type Output = ( + grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + ); + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut m1 = MergerBuilder::new(merge_fn); + let mut m2 = MergerBuilder::new(merge_fn); + let mut m3 = MergerBuilder::new(merge_fn); + for (r1, r2, r3) in self.into_iter() { + m1.push(r1)?; + m2.push(r2)?; + m3.push(r3)?; + } + Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?)) + } +} + struct MergerBuilder(grenad::MergerBuilder); impl MergerBuilder { @@ -195,11 +221,13 @@ pub fn grenad_obkv_into_chunks( ); while let Some((document_id, obkv)) = cursor.move_on_next()? 
{ - obkv_documents.insert(document_id, obkv)?; - current_chunk_size += document_id.len() as u64 + obkv.len() as u64; + if !obkv.is_empty() { + obkv_documents.insert(document_id, obkv)?; + current_chunk_size += document_id.len() as u64 + obkv.len() as u64; - if current_chunk_size >= documents_chunk_size as u64 { - return writer_into_reader(obkv_documents).map(Some); + if current_chunk_size >= documents_chunk_size as u64 { + return writer_into_reader(obkv_documents).map(Some); + } } } @@ -210,45 +238,46 @@ pub fn grenad_obkv_into_chunks( Ok(std::iter::from_fn(move || transposer().transpose())) } -pub fn sorter_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, +/// Write provided sorter in database using serialize_value function. +/// merge_values function is used if an entry already exist in the database. +pub fn write_sorter_into_database( sorter: Sorter, - merge: MergeFn, -) -> Result<()> { + database: &heed::Database, + wtxn: &mut heed::RwTxn, + index_is_empty: bool, + serialize_value: FS, + merge_values: FM, +) -> Result<()> +where + FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, + FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, +{ puffin::profile_function!(); - debug!("Writing MTBL sorter..."); - let before = Instant::now(); + + let mut buffer = Vec::new(); + let database = database.remap_types::(); let mut merger_iter = sorter.into_stream_merger_iter()?; - if database.is_empty(wtxn)? { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = merger_iter.next()? { - // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; - } - } else { - while let Some((k, v)) = merger_iter.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).map_err(|_| { - // TODO just wrap this error? - InternalError::IndexingMergingKeys { process: "get-put-merge" } - })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? }; + while let Some((key, value)) = merger_iter.next()? { + if valid_lmdb_key(key) { + buffer.clear(); + let value = if index_is_empty { + Some(serialize_value(value, &mut buffer)?) + } else { + match database.get(wtxn, key)? 
{ + Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, + None => Some(serialize_value(value, &mut buffer)?), } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + }; + match value { + Some(value) => database.put(wtxn, key, value)?, + None => { + database.delete(wtxn, key)?; } } } } - debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 5d111067a..d355ead68 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -6,22 +6,12 @@ use std::result::Result as StdResult; use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::transform::Operation; use crate::Result; pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result>; -pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - let capacity = values.iter().map(|v| v.len()).sum::(); - let mut output = Vec::with_capacity(capacity); - values.iter().for_each(|integers| output.extend_from_slice(integers)); - Ok(Cow::Owned(output)) - } -} - pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec) -> io::Result<()> { buffer.clear(); buffer.reserve(bitmap.serialized_size()); @@ -75,57 +65,123 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result) { +pub fn merge_two_del_add_obkvs( + base: obkv::KvReaderU16, + update: obkv::KvReaderU16, + merge_additions: bool, + buffer: &mut Vec, +) { use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; buffer.clear(); let mut writer = obkv::KvWriter::new(buffer); + let mut value_buffer = Vec::new(); for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { match eob { - Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), + Left((k, v)) => { + if merge_additions { + writer.insert(k, v).unwrap() + } else { + // If merge_additions is false, recreate an obkv keeping the deletions only. + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + let base_reader = KvReaderDelAdd::new(v); + + if let Some(deletion) = base_reader.get(DelAdd::Deletion) { + value_writer.insert(DelAdd::Deletion, deletion).unwrap(); + value_writer.finish().unwrap(); + writer.insert(k, &value_buffer).unwrap() + } + } + } + Right((k, v)) => writer.insert(k, v).unwrap(), + Both((k, base), (_, update)) => { + // merge deletions and additions. + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + let base_reader = KvReaderDelAdd::new(base); + let update_reader = KvReaderDelAdd::new(update); + + // keep newest deletion. + if let Some(deletion) = update_reader + .get(DelAdd::Deletion) + .or_else(|| base_reader.get(DelAdd::Deletion)) + { + value_writer.insert(DelAdd::Deletion, deletion).unwrap(); + } + + // keep base addition only if merge_additions is true. + let base_addition = + merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten(); + // keep newest addition. 
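+                // e.g. with hypothetical values: base = { Del: d0, Add: a0 } merged
+                // with update = { Add: a1 } gives { Del: d0, Add: a1 } here, since
+                // the newest addition wins when merge_additions is true.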
+ // TODO use or_else + if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) { + value_writer.insert(DelAdd::Addition, addition).unwrap(); + } + + value_writer.finish().unwrap(); + writer.insert(k, &value_buffer).unwrap() + } } } writer.finish().unwrap(); } -/// Merge all the obks in the order we see them. -pub fn merge_obkvs_and_operations<'a>( +/// Merge all the obkvs from the newest to the oldest. +fn inner_merge_del_add_obkvs<'a>( + obkvs: &[Cow<'a, [u8]>], + merge_additions: bool, +) -> Result> { + // pop the newest operation from the list. + let (newest, obkvs) = obkvs.split_last().unwrap(); + // keep the operation type for the returned value. + let newest_operation_type = newest[0]; + + // treat the newest obkv as the starting point of the merge. + let mut acc_operation_type = newest_operation_type; + let mut acc = newest[1..].to_vec(); + let mut buffer = Vec::new(); + // reverse iter from the most recent to the oldest. + for current in obkvs.iter().rev() { + // if in the previous iteration there was a complete deletion, + // stop the merge process. + if acc_operation_type == Operation::Deletion as u8 { + break; + } + + let newest = obkv::KvReader::new(&acc); + let oldest = obkv::KvReader::new(¤t[1..]); + merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer); + + // we want the result of the merge into our accumulator. + std::mem::swap(&mut acc, &mut buffer); + acc_operation_type = current[0]; + } + + acc.insert(0, newest_operation_type); + Ok(Cow::from(acc)) +} + +/// Merge all the obkvs from the newest to the oldest. +pub fn obkvs_merge_additions_and_deletions<'a>( _key: &[u8], obkvs: &[Cow<'a, [u8]>], ) -> Result> { - // [add, add, delete, add, add] - // we can ignore everything that happened before the last delete. - let starting_position = - obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0); - - // [add, add, delete] - // if the last operation was a deletion then we simply return the deletion - if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8 - { - return Ok(obkvs[obkvs.len() - 1].clone()); - } - let mut buffer = Vec::new(); - - // (add, add, delete) [add, add] - // in the other case, no deletion will be encountered during the merge - let mut ret = - obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| { - let first = obkv::KvReader::new(&acc); - let second = obkv::KvReader::new(¤t[1..]); - merge_two_obkvs(first, second, &mut buffer); - - // we want the result of the merge into our accumulator - std::mem::swap(&mut acc, &mut buffer); - acc - }); - - ret.insert(0, Operation::Addition as u8); - Ok(Cow::from(ret)) + inner_merge_del_add_obkvs(obkvs, true) } +/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions. +pub fn obkvs_keep_last_addition_merge_deletions<'a>( + _key: &[u8], + obkvs: &[Cow<'a, [u8]>], +) -> Result> { + inner_merge_del_add_obkvs(obkvs, false) +} + +/// Do a union of all the CboRoaringBitmaps in the values. pub fn merge_cbo_roaring_bitmaps<'a>( _key: &[u8], values: &[Cow<'a, [u8]>], @@ -138,3 +194,52 @@ pub fn merge_cbo_roaring_bitmaps<'a>( Ok(Cow::from(vec)) } } + +/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv +/// separately and outputs a new DelAdd with both unions. 
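+///
+/// E.g. with hypothetical bitmaps written as sets, merging the values
+/// `[{ Del: {1, 2} }, { Add: {2, 3} }]` outputs `{ Del: {1, 2}, Add: {2, 3} }`.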
+pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
+    _key: &[u8],
+    values: &[Cow<'a, [u8]>],
+) -> Result<Cow<'a, [u8]>> {
+    if values.len() == 1 {
+        Ok(values[0].clone())
+    } else {
+        // Retrieve the bitmaps from both sides
+        let mut del_bitmaps_bytes = Vec::new();
+        let mut add_bitmaps_bytes = Vec::new();
+        for value in values {
+            let obkv = KvReaderDelAdd::new(value);
+            if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
+                del_bitmaps_bytes.push(bitmap_bytes);
+            }
+            if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
+                add_bitmaps_bytes.push(bitmap_bytes);
+            }
+        }
+
+        let mut output_deladd_obkv = KvWriterDelAdd::memory();
+        let mut buffer = Vec::new();
+        CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
+        output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
+        buffer.clear();
+        CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
+        output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
+        output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
+    }
+}
+
+/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
+///
+/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
+/// the second one is the CboRoaringBitmap to merge into.
+pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
+    deladd_obkv: &[u8],
+    previous: &[u8],
+    buffer: &'a mut Vec<u8>,
+) -> Result<Option<&'a [u8]>> {
+    Ok(CboRoaringBitmapCodec::merge_deladd_into(
+        KvReaderDelAdd::new(deladd_obkv),
+        previous,
+        buffer,
+    )?)
+}
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index c403f9e3d..52638d6f6 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -9,13 +9,14 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
-    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader,
+    merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
     GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
-    concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
-    merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps,
-    serialize_roaring_bitmap, MergeFn,
+    keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+    merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
+    obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn,
 };
 
 use crate::MAX_WORD_LENGTH;
@@ -44,10 +45,6 @@ where
     Some((head, tail))
 }
 
-pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
-    bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
-}
-
 /// Converts an fst Stream into an HashSet of Strings.
pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet> where diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b3e7e203e..f825cad1c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,11 +20,13 @@ use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; -pub use self::enrich::{extract_finite_float_from_value, DocumentId}; +pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, - fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, + fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader, + ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; @@ -32,13 +34,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, - WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, + IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; -use crate::{Index, Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Index, Result}; static MERGED_DATABASE_COUNT: usize = 7; -static PREFIX_DATABASE_COUNT: usize = 5; +static PREFIX_DATABASE_COUNT: usize = 4; static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -67,8 +68,8 @@ impl Default for IndexDocumentsMethod { } } -pub struct IndexDocuments<'t, 'u, 'i, 'a, FP, FA> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, +pub struct IndexDocuments<'t, 'i, 'a, FP, FA> { + wtxn: &'t mut heed::RwTxn<'i>, index: &'i Index, config: IndexDocumentsConfig, indexer_config: &'a IndexerConfig, @@ -86,23 +87,22 @@ pub struct IndexDocumentsConfig { pub words_positions_level_group_size: Option, pub words_positions_min_level_size: Option, pub update_method: IndexDocumentsMethod, - pub deletion_strategy: DeletionStrategy, pub autogenerate_docids: bool, } -impl<'t, 'u, 'i, 'a, FP, FA> IndexDocuments<'t, 'u, 'i, 'a, FP, FA> +impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA> where FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, + wtxn: &'t mut heed::RwTxn<'i>, index: &'i Index, indexer_config: &'a IndexerConfig, config: IndexDocumentsConfig, progress: FP, should_abort: FA, - ) -> Result> { + ) -> Result> { let transform = Some(Transform::new( wtxn, index, @@ -178,6 +178,7 @@ where // Early return when there is no document to add if to_delete.is_empty() { + // Maintains Invariant: remove documents actually always returns Ok for the inner result return Ok((self, Ok(0))); } @@ -190,14 +191,48 @@ where self.deleted_documents += deleted_documents; + // Maintains Invariant: remove documents 
actually always returns Ok for the inner result Ok((self, Ok(deleted_documents))) } + /// Removes documents from db using their internal document ids. + /// + /// # Warning + /// + /// This function is dangerous and will only work correctly if: + /// + /// - All the passed ids currently exist in the database + /// - No batching using the standards `remove_documents` and `add_documents` took place + /// + /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function. + pub fn remove_documents_from_db_no_batch( + mut self, + to_delete: &RoaringBitmap, + ) -> Result<(Self, u64)> { + puffin::profile_function!(); + + // Early return when there is no document to add + if to_delete.is_empty() { + return Ok((self, 0)); + } + + let deleted_documents = self + .transform + .as_mut() + .expect("Invalid document deletion state") + .remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)? + as u64; + + self.deleted_documents += deleted_documents; + + Ok((self, deleted_documents)) + } + #[logging_timer::time("IndexDocuments::{}")] pub fn execute(mut self) -> Result { puffin::profile_function!(); - if self.added_documents == 0 { + if self.added_documents == 0 && self.deleted_documents == 0 { let number_of_documents = self.index.number_of_documents(self.wtxn)?; return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); } @@ -241,9 +276,6 @@ where primary_key, fields_ids_map, field_distribution, - new_external_documents_ids, - new_documents_ids, - replaced_documents_ids, documents_count, original_documents, flattened_documents, @@ -320,6 +352,7 @@ where let dictionary: Option> = dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; + let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default(); let pool_params = GrenadParameters { chunk_compression_type: self.indexer_config.chunk_compression_type, @@ -360,6 +393,7 @@ where dictionary.as_deref(), max_positions_per_attributes, exact_attributes, + proximity_precision, ) }); @@ -367,29 +401,12 @@ where let _ = lmdb_writer_sx.send(Err(e)); } - // needs to be droped to avoid channel waiting lock. + // needs to be dropped to avoid channel waiting lock. drop(lmdb_writer_sx) }); - // We delete the documents that this document addition replaces. This way we are - // able to simply insert all the documents even if they already exist in the database. - if !replaced_documents_ids.is_empty() { - let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?; - deletion_builder.strategy(self.config.deletion_strategy); - debug!("documents to delete {:?}", replaced_documents_ids); - deletion_builder.delete_documents(&replaced_documents_ids); - let deleted_documents_result = deletion_builder.execute_inner()?; - debug!("{} documents actually deleted", deleted_documents_result.deleted_documents); - } - - let index_documents_ids = self.index.documents_ids(self.wtxn)?; - let index_is_empty = index_documents_ids.is_empty(); + let index_is_empty = self.index.number_of_documents(self.wtxn)? 
== 0; let mut final_documents_ids = RoaringBitmap::new(); - let mut word_pair_proximity_docids = None; - let mut word_position_docids = None; - let mut word_fid_docids = None; - let mut word_docids = None; - let mut exact_word_docids = None; let mut databases_seen = 0; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -397,35 +414,40 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); + let mut word_position_docids = None; + let mut word_fid_docids = None; + let mut word_docids = None; + let mut exact_word_docids = None; + for result in lmdb_writer_rx { if (self.should_abort)() { return Err(Error::InternalError(InternalError::AbortedIndexation)); } let typed_chunk = match result? { - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => { let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; word_docids = Some(cloneable_chunk); let cloneable_chunk = unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; exact_word_docids = Some(cloneable_chunk); - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } - } - TypedChunk::WordPairProximityDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; - word_pair_proximity_docids = Some(cloneable_chunk); - TypedChunk::WordPairProximityDocids(chunk) + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; + word_fid_docids = Some(cloneable_chunk); + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } } TypedChunk::WordPositionDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; word_position_docids = Some(cloneable_chunk); TypedChunk::WordPositionDocids(chunk) } - TypedChunk::WordFidDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; - word_fid_docids = Some(cloneable_chunk); - TypedChunk::WordFidDocids(chunk) - } otherwise => otherwise, }; @@ -457,25 +479,16 @@ where // We write the primary key field id into the main database self.index.put_primary_key(self.wtxn, &primary_key)?; - - // We write the external documents ids into the main database. - let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - external_documents_ids.insert_ids(&new_external_documents_ids)?; - let external_documents_ids = external_documents_ids.into_static(); - self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; - - let all_documents_ids = index_documents_ids | new_documents_ids; - self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; + let number_of_documents = self.index.number_of_documents(self.wtxn)?; self.execute_prefix_databases( word_docids, exact_word_docids, - word_pair_proximity_docids, word_position_docids, word_fid_docids, )?; - Ok(all_documents_ids.len()) + Ok(number_of_documents) } #[logging_timer::time("IndexDocuments::{}")] @@ -483,7 +496,6 @@ where self, word_docids: Option>, exact_word_docids: Option>, - word_pair_proximity_docids: Option>, word_position_docids: Option>, word_fid_docids: Option>, ) -> Result<()> @@ -604,32 +616,6 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { - // Run the word prefix pair proximity docids update operation. 
- PrefixWordPairsProximityDocids::new( - self.wtxn, - self.index, - self.indexer_config.chunk_compression_type, - self.indexer_config.chunk_compression_level, - ) - .execute( - word_pair_proximity_docids, - &new_prefix_fst_words, - &common_prefix_fst_words, - &del_prefix_fst_words, - )?; - } - - if (self.should_abort)() { - return Err(Error::InternalError(InternalError::AbortedIndexation)); - } - - databases_seen += 1; - (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen, - total_databases: TOTAL_POSTING_DATABASE_COUNT, - }); - if let Some(word_position_docids) = word_position_docids { // Run the words prefix position docids update operation. let mut builder = WordPrefixIntegerDocids::new( @@ -687,8 +673,8 @@ where fn execute_word_prefix_docids( txn: &mut heed::RwTxn, reader: grenad::Reader>, - word_docids_db: Database, - word_prefix_docids_db: Database, + word_docids_db: Database, + word_prefix_docids_db: Database, indexer_config: &IndexerConfig, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], @@ -709,14 +695,15 @@ fn execute_word_prefix_docids( #[cfg(test)] mod tests { use big_s::S; + use fst::IntoStreamer; + use heed::RwTxn; use maplit::hashset; use super::*; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; use crate::search::TermsMatchingStrategy; - use crate::update::DeleteDocuments; - use crate::{db_snap, BEU16}; + use crate::{db_snap, Filter, Search}; #[test] fn simple_document_replacement() { @@ -807,11 +794,10 @@ mod tests { assert_eq!(count, 1); // Check that we get only one document from the database. - // Since the document has been deleted and re-inserted, its internal docid has been incremented to 1 - let docs = index.documents(&rtxn, Some(1)).unwrap(); + let docs = index.documents(&rtxn, Some(0)).unwrap(); assert_eq!(docs.len(), 1); let (id, doc) = docs[0]; - assert_eq!(id, 1); + assert_eq!(id, 0); // Check that this document is equal to the last one sent. let mut doc_iter = doc.iter(); @@ -872,7 +858,7 @@ mod tests { assert_eq!(count, 3); // the document 0 has been deleted and reinserted with the id 3 - let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap(); + let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap(); let kevin_position = docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); assert_eq!(kevin_position, 2); @@ -1018,7 +1004,6 @@ mod tests { assert_eq!(count, 6); db_snap!(index, word_docids, "updated"); - db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]"); drop(rtxn); } @@ -1121,17 +1106,15 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ])) .unwrap(); - let mut wtxn = index.write_txn().unwrap(); - assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); // Delete not all of the documents but some of them. 
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_external_id("30"); - builder.execute().unwrap(); + index.delete_document("30"); - let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); - assert!(external_documents_ids.get("30").is_none()); - wtxn.commit().unwrap(); + let txn = index.read_txn().unwrap(); + assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId")); + + let external_documents_ids = index.external_documents_ids(); + assert!(external_documents_ids.get(&txn, "30").unwrap().is_none()); index .add_documents(documents!([ @@ -1140,8 +1123,8 @@ mod tests { .unwrap(); let wtxn = index.write_txn().unwrap(); - let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); - assert!(external_documents_ids.get("30").is_some()); + let external_documents_ids = index.external_documents_ids(); + assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some()); wtxn.commit().unwrap(); index @@ -1435,8 +1418,10 @@ mod tests { index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap(); let rtxn = index.read_txn().unwrap(); - let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); - assert!(external_documents_ids.get("1").is_some()); + let all_documents_count = index.all_documents(&rtxn).unwrap().count(); + assert_eq!(all_documents_count, 1); + let external_documents_ids = index.external_documents_ids(); + assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some()); } #[test] @@ -1490,12 +1475,6 @@ mod tests { 3 2 second second 3 3 third third "###); - db_snap!(index, string_faceted_documents_ids, @r###" - 0 [] - 1 [] - 2 [] - 3 [0, 1, 2, 3, ] - "###); let rtxn = index.read_txn().unwrap(); @@ -1519,12 +1498,6 @@ mod tests { db_snap!(index, facet_id_string_docids, @""); db_snap!(index, field_id_docid_facet_strings, @""); - db_snap!(index, string_faceted_documents_ids, @r###" - 0 [] - 1 [] - 2 [] - 3 [0, 1, 2, 3, ] - "###); let rtxn = index.read_txn().unwrap(); @@ -1551,12 +1524,6 @@ mod tests { 3 2 second second 3 3 third third "###); - db_snap!(index, string_faceted_documents_ids, @r###" - 0 [] - 1 [] - 2 [] - 3 [0, 1, 2, 3, ] - "###); let rtxn = index.read_txn().unwrap(); @@ -1719,7 +1686,7 @@ mod tests { let wtxn = index.read_txn().unwrap(); - let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map(); + let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap(); let ids = map.values().collect::>(); assert_eq!(ids.len(), map.len()); @@ -1778,14 +1745,11 @@ mod tests { let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); let bitmap_colour = - index.facet_id_exists_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap(); + index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6, 7]); - let bitmap_colour_green = index - .facet_id_exists_docids - .get(&rtxn, &BEU16::new(colour_green_id)) - .unwrap() - .unwrap(); + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); }; @@ -1883,21 +1847,15 @@ mod tests { index.fields_ids_map(&rtxn).unwrap().id("colour.green.blue").unwrap(); let bitmap_null_colour = - index.facet_id_is_null_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap(); + index.facet_id_is_null_docids.get(&rtxn, &colour_id).unwrap().unwrap(); assert_eq!(bitmap_null_colour.into_iter().collect::>(), 
vec![0]); - let bitmap_colour_green = index - .facet_id_is_null_docids - .get(&rtxn, &BEU16::new(colour_green_id)) - .unwrap() - .unwrap(); + let bitmap_colour_green = + index.facet_id_is_null_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![2]); - let bitmap_colour_blue = index - .facet_id_is_null_docids - .get(&rtxn, &BEU16::new(colour_blue_id)) - .unwrap() - .unwrap(); + let bitmap_colour_blue = + index.facet_id_is_null_docids.get(&rtxn, &colour_blue_id).unwrap().unwrap(); assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![3]); }; @@ -1952,21 +1910,15 @@ mod tests { let tags_blue_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green.blue").unwrap(); let bitmap_empty_tags = - index.facet_id_is_empty_docids.get(&rtxn, &BEU16::new(tags_id)).unwrap().unwrap(); + index.facet_id_is_empty_docids.get(&rtxn, &tags_id).unwrap().unwrap(); assert_eq!(bitmap_empty_tags.into_iter().collect::>(), vec![2, 6, 9]); - let bitmap_tags_green = index - .facet_id_is_empty_docids - .get(&rtxn, &BEU16::new(tags_green_id)) - .unwrap() - .unwrap(); + let bitmap_tags_green = + index.facet_id_is_empty_docids.get(&rtxn, &tags_green_id).unwrap().unwrap(); assert_eq!(bitmap_tags_green.into_iter().collect::>(), vec![8]); - let bitmap_tags_blue = index - .facet_id_is_empty_docids - .get(&rtxn, &BEU16::new(tags_blue_id)) - .unwrap() - .unwrap(); + let bitmap_tags_blue = + index.facet_id_is_empty_docids.get(&rtxn, &tags_blue_id).unwrap().unwrap(); assert_eq!(bitmap_tags_blue.into_iter().collect::>(), vec![12]); }; @@ -2531,17 +2483,8 @@ mod tests { db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4"); db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); - let mut wtxn = index.write_txn().unwrap(); - // Delete not all of the documents but some of them. - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_external_id("0"); - builder.delete_external_id("3"); - let result = builder.execute().unwrap(); - println!("{result:?}"); - - wtxn.commit().unwrap(); + index.delete_documents(vec!["0".into(), "3".into()]); db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); @@ -2596,8 +2539,7 @@ mod tests { ), ] */ - let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; + let index = TempIndex::new(); // START OF BATCH @@ -2637,8 +2579,7 @@ mod tests { {"id":1,"doggo":"bernese"} "###); db_snap!(index, external_documents_ids, @r###" - soft: - hard: + docids: 1 0 "###); @@ -2683,13 +2624,10 @@ mod tests { "###); db_snap!(index, external_documents_ids, @r###" - soft: - hard: + docids: 0 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); - // BATCH 3 println!("--- ENTERING BATCH 3"); @@ -2731,4 +2669,537 @@ mod tests { let res = index.search(&rtxn).execute().unwrap(); index.documents(&rtxn, res.documents_ids).unwrap(); } + + fn delete_documents<'t>( + wtxn: &mut RwTxn<'t>, + index: &'t TempIndex, + external_ids: &[&str], + ) -> Vec { + let external_document_ids = index.external_documents_ids(); + let ids_to_delete: Vec = external_ids + .iter() + .map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap()) + .collect(); + + // Delete some documents. 
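+        // (the external ids were resolved into `ids_to_delete` above, before this
+        // call drops the external id -> internal docid mapping)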
+ index.delete_documents_using_wtxn( + wtxn, + external_ids.iter().map(ToString::to_string).collect(), + ); + + ids_to_delete + } + + #[test] + fn delete_documents_with_numbers_as_primary_key() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, + { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, + { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } + ]), + ) + .unwrap(); + + // delete those documents, ids are synchronous therefore 0, 1, and 2. + index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]); + + wtxn.commit().unwrap(); + + // All these snapshots should be empty since the database was cleared + db_snap!(index, documents_ids); + db_snap!(index, word_docids); + db_snap!(index, word_pair_proximity_docids); + db_snap!(index, facet_id_exists_docids); + + let rtxn = index.read_txn().unwrap(); + + assert!(index.field_distribution(&rtxn).unwrap().is_empty()); + } + + #[test] + fn delete_documents_with_strange_primary_key() { + let index = TempIndex::new(); + + index + .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) + .unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "mysuperid": 0, "name": "kevin" }, + { "mysuperid": 1, "name": "kevina" }, + { "mysuperid": 2, "name": "benoit" } + ]), + ) + .unwrap(); + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete not all of the documents but some of them. + index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]); + + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids); + db_snap!(index, word_docids); + db_snap!(index, word_pair_proximity_docids); + } + + #[test] + fn filtered_placeholder_search_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + settings.set_filterable_fields(hashset! 
{ S("label"), S("label2") }); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]); + + // Placeholder search with filter + let filter = Filter::from_str("label = sign").unwrap().unwrap(); + let results = index.search(&wtxn).filter(filter).execute().unwrap(); + assert!(results.documents_ids.is_empty()); + + wtxn.commit().unwrap(); + + db_snap!(index, word_docids); + db_snap!(index, facet_id_f64_docids); + db_snap!(index, word_pair_proximity_docids); + db_snap!(index, facet_id_exists_docids); + db_snap!(index, facet_id_string_docids); + } + + #[test] + fn placeholder_search_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let 
deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]); + + // Placeholder search + let results = index.search(&wtxn).execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + } + + #[test] + fn search_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + + // search for abstract + let results = index.search(&wtxn).query("abstract").execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + } + + #[test] + fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("id")); + settings.set_filterable_fields(hashset!(S("_geo"))); + settings.set_sortable_fields(hashset!(S("_geo"))); + }) + .unwrap(); + + index.add_documents_using_wtxn(&mut wtxn, documents!([ + { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, + { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, + { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, + { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, + { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } }, + { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } }, + { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } }, + { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } }, + { "id": "9", "city": "Tournai", "_geo": { 
"lat": 50.6053, "lng": 3.3758 } }, + { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } }, + { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } }, + { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } }, + { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } }, + { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } }, + { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } }, + { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } }, + { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } }, + { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, + { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, + { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } } + ])).unwrap(); + + let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete); + + // Placeholder search with geo filter + let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); + let results = index.search(&wtxn).filter(filter).execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + + db_snap!(index, facet_id_f64_docids); + db_snap!(index, facet_id_string_docids); + } + + #[test] + fn get_documents_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_external_ids = ["1_7", "1_52"]; + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids); + + // list all documents + let results = index.all_documents(&wtxn).unwrap(); + for result in results { + let (id, _) = result.unwrap(); + assert!( + !deleted_internal_ids.contains(&id), + "The 
document {} was supposed to be deleted", + id + ); + } + + // list internal document ids + let results = index.documents_ids(&wtxn).unwrap(); + for id in results { + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + } + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // get internal docids from deleted external document ids + let results = index.external_documents_ids(); + for id in deleted_external_ids { + assert!( + results.get(&rtxn, id).unwrap().is_none(), + "The document {} was supposed to be deleted", + id + ); + } + drop(rtxn); + } + + #[test] + fn stats_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index.add_documents_using_wtxn(&mut wtxn, documents!([ + { "docid": "1_4", "label": ["sign"]}, + { "docid": "1_5", "label": ["letter"]}, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, + { "docid": "1_36", "label": ["drawing","painting","pattern"]}, + { "docid": "1_37", "label": ["art","drawing","outdoor"]}, + { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"}, + { "docid": "1_39", "label": ["abstract"]}, + { "docid": "1_40", "label": ["cartoon"]}, + { "docid": "1_41", "label": ["art","drawing"]}, + { "docid": "1_42", "label": ["art","pattern"]}, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, + { "docid": "1_44", "label": ["drawing"], "number": 44i32}, + { "docid": "1_45", "label": ["art"]}, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, + { "docid": "1_47", "label": ["abstract","pattern"]}, + { "docid": "1_52", "label": ["abstract","cartoon"]}, + { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, + { "docid": "1_58", "label": ["abstract","art","cartoon"]}, + { "docid": "1_68", "label": ["design"]}, + { "docid": "1_69", "label": ["geometry"]} + ])).unwrap(); + + delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + + // count internal documents + let results = index.number_of_documents(&wtxn).unwrap(); + assert_eq!(18, results); + + // count field distribution + let results = index.field_distribution(&wtxn).unwrap(); + assert_eq!(Some(&18), results.get("label")); + assert_eq!(Some(&1), results.get("title")); + assert_eq!(Some(&2), results.get("number")); + + wtxn.commit().unwrap(); + } + + #[test] + fn stored_detected_script_and_language_should_not_return_deleted_documents() { + use charabia::{Language, Script}; + let index = TempIndex::new(); + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, + { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" 
}, + { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" }, + { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" }, + { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" }, + ])) + .unwrap(); + + let key_cmn = (Script::Cj, Language::Cmn); + let cj_cmn_docs = + index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default(); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(1); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + + delete_documents(&mut wtxn, &index, &["1"]); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let cj_cmn_docs = + index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default(); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + } + + #[test] + fn delete_words_exact_attributes() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key(S("id")); + settings.set_searchable_fields(vec![S("text"), S("exact")]); + settings.set_exact_attributes(vec![S("exact")].into_iter().collect()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "id": 0, "text": "hello" }, + { "id": 1, "exact": "hello"} + ])) + .unwrap(); + db_snap!(index, word_docids, 1, @r###" + hello [0, ] + "###); + db_snap!(index, exact_word_docids, 1, @r###" + hello [1, ] + "###); + db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); + + let mut wtxn = index.write_txn().unwrap(); + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]); + wtxn.commit().unwrap(); + + db_snap!(index, word_docids, 2, @r###" + hello [0, ] + "###); + db_snap!(index, exact_word_docids, 2, @""); + db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); + + insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]"); + let txn = index.read_txn().unwrap(); + let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap(); + insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###); + + let mut s = Search::new(&txn, &index); + s.query("hello"); + let crate::SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + } } diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap new file mode 100644 index 000000000..8b27dcb0d --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +[] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap new file mode 100644 index 000000000..8a9805f8d --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +[2, ] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap new file mode 100644 index 000000000..bb2f64873 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +benoit [2, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ 
b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap similarity index 66% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap rename to milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap index 7481b11c4..ed120bf02 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/delete_documents.rs +source: milli/src/update/index_documents/mod.rs --- 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] 2 [21, ] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap new file mode 100644 index 000000000..deeddff0d --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +2 0 2.2 1 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap similarity index 91% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap rename to milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap index ab1d2175f..2d0b98623 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/delete_documents.rs +source: milli/src/update/index_documents/mod.rs --- 1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] 1 0 aquarium 1 [5, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap similarity index 95% rename from 
milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap rename to milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap index f8d64e001..73503f098 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/delete_documents.rs +source: milli/src/update/index_documents/mod.rs --- 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ] 2 [21, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap similarity index 95% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap rename to milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap index 36add107b..022e9f5b1 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/delete_documents.rs +source: milli/src/update/index_documents/mod.rs --- 1 1 36 [3, ] 1 1 37 [4, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap similarity index 93% rename from milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap rename to milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap index 18a9d9309..c45c350e7 100644 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/delete_documents.rs +source: milli/src/update/index_documents/mod.rs --- 3 0 48.9021 1 [19, ] 3 0 49.9314 1 [17, ] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ 
b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap index b0ef38b93..80dbce9e8 100644 --- a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap @@ -1,60 +1,56 @@ --- source: milli/src/update/index_documents/mod.rs --- -0 [1, 7, ] +0 [1, ] 1 [2, ] -10 [1, 7, ] -12 [0, 8, ] +10 [1, ] +12 [0, ] 1344 [3, ] -1813 [8, ] -2 [0, 8, ] +1813 [0, ] +2 [0, ] 23 [5, ] 25 [2, ] -3 [0, 8, ] +3 [0, ] 35 [5, ] -4 [4, 6, ] -42 [0, 5, 8, ] -456 [1, 7, ] -5 [0, 8, ] +4 [4, ] +42 [0, 5, ] +456 [1, ] +5 [0, ] 99 [2, ] adams [5, ] -adventure [1, 7, ] +adventure [1, ] alice [2, ] -and [0, 4, 6, 8, ] -antoine [1, 7, ] -austen [8, ] -austin [0, ] -blood [4, 6, ] +and [0, 4, ] +antoine [1, ] +austen [0, ] +blood [4, ] carroll [2, ] -de [1, 7, ] +de [1, ] douglas [5, ] -exupery [1, 7, ] -fantasy [2, 3, 4, 6, ] +exupery [1, ] +fantasy [2, 3, 4, ] galaxy [5, ] guide [5, ] -half [4, 6, ] -harry [4, 6, ] +half [4, ] +harry [4, ] hitchhiker [5, ] hobbit [3, ] in [2, ] -j [3, 4, 6, 8, ] -jane [0, ] -k [4, 6, ] -le [1, ] +j [0, 3, 4, ] +k [4, ] lewis [2, ] -little [7, ] -petit [1, ] -potter [4, 6, ] -prejudice [0, 8, ] -pride [0, 8, ] -prince [1, 4, 7, ] -princess [6, ] +little [1, ] +potter [4, ] +prejudice [0, ] +pride [0, ] +prince [1, ] +princess [4, ] r [3, ] -romance [0, 8, ] -rowling [4, 6, ] +romance [0, ] +rowling [4, ] s [5, ] -saint [1, 7, ] -the [3, 4, 5, 6, 7, ] +saint [1, ] +the [1, 3, 4, 5, ] to [5, ] tolkien [3, ] wonderland [2, ] diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f0e3bbbf0..ab8e27edb 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; -use std::collections::hash_map::Entry; +use std::collections::btree_map::Entry as BEntry; +use std::collections::hash_map::Entry as HEntry; use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; @@ -7,30 +8,28 @@ use std::io::{Read, Seek}; use fxhash::FxHashMap; use heed::RoTxn; use itertools::Itertools; -use obkv::{KvReader, KvWriter}; +use obkv::{KvReader, KvReaderU16, KvWriter}; use roaring::RoaringBitmap; use serde_json::Value; use smartstring::SmartString; use super::helpers::{ - create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn, + create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions, + obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn, }; use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; +use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; +use crate::update::index_documents::GrenadParameters; use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; -use crate::{ - FieldDistribution, FieldId, 
FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, -}; +use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, pub field_distribution: FieldDistribution, - pub new_external_documents_ids: fst::Map>, - pub new_documents_ids: RoaringBitmap, - pub replaced_documents_ids: RoaringBitmap, pub documents_count: usize, pub original_documents: File, pub flattened_documents: File, @@ -106,8 +105,8 @@ impl<'a, 'i> Transform<'a, 'i> { // We must choose the appropriate merge function for when two or more documents // with the same user id must be merged or fully replaced in the same batch. let merge_function = match index_documents_method { - IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv, - IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations, + IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions, + IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions, }; // We initialize the sorter with the user indexing settings. @@ -130,17 +129,13 @@ impl<'a, 'i> Transform<'a, 'i> { indexer_settings.max_memory.map(|mem| mem / 2), ); let documents_ids = index.documents_ids(wtxn)?; - let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?; Ok(Transform { index, fields_ids_map: index.fields_ids_map(wtxn)?, indexer_settings, autogenerate_docids, - available_documents_ids: AvailableDocumentsIds::from_documents_ids( - &documents_ids, - &soft_deleted_documents_ids, - ), + available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), original_sorter, flattened_sorter, index_documents_method, @@ -151,6 +146,7 @@ impl<'a, 'i> Transform<'a, 'i> { }) } + #[logging_timer::time] pub fn read_documents( &mut self, reader: EnrichedDocumentsBatchReader, @@ -163,8 +159,10 @@ impl<'a, 'i> Transform<'a, 'i> { FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { + puffin::profile_function!(); + let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); - let external_documents_ids = self.index.external_documents_ids(wtxn)?; + let external_documents_ids = self.index.external_documents_ids(); let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; let primary_key = cursor.primary_key().to_string(); @@ -172,7 +170,8 @@ impl<'a, 'i> Transform<'a, 'i> { self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let mut obkv_buffer = Vec::new(); - let mut document_sorter_buffer = Vec::new(); + let mut document_sorter_value_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); let mut documents_count = 0; let mut docid_buffer: Vec = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); @@ -213,29 +212,30 @@ impl<'a, 'i> Transform<'a, 'i> { field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2)); // Build the new obkv document. - let mut writer = obkv::KvWriter::new(&mut obkv_buffer); + let mut writer = KvWriter::new(&mut obkv_buffer); for (k, v) in field_buffer_cache.iter() { writer.insert(*k, v)?; } let mut original_docid = None; - let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { - Entry::Occupied(entry) => *entry.get() as u32, - Entry::Vacant(entry) => { - // If the document was already in the db we mark it as a replaced document. - // It'll be deleted later. 
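// A minimal, self-contained sketch of how the two merge policies chosen earlier in this
// hunk differ (ReplaceDocuments vs. UpdateDocuments). This is an editor's illustration
// using std types only: `Version`, `merge_additions` and `keep_last_addition` are
// illustrative names, not the milli helpers, and the deletion side that the real
// obkvs_* functions carry along (so old index entries can be purged) is omitted here.
use std::collections::BTreeMap;

enum Version {
    Addition(BTreeMap<&'static str, &'static str>),
    Deletion,
}

// UpdateDocuments-style: additions accumulate field by field, a deletion resets the state.
fn merge_additions(versions: &[Version]) -> Option<BTreeMap<&'static str, &'static str>> {
    let mut acc: Option<BTreeMap<&'static str, &'static str>> = None;
    for version in versions {
        match version {
            Version::Deletion => acc = None,
            Version::Addition(fields) => {
                acc.get_or_insert_with(BTreeMap::new).extend(fields.clone());
            }
        }
    }
    acc
}

// ReplaceDocuments-style: only the last version matters, earlier content is dropped.
fn keep_last_addition(versions: &[Version]) -> Option<BTreeMap<&'static str, &'static str>> {
    match versions.last() {
        Some(Version::Addition(fields)) => Some(fields.clone()),
        _ => None,
    }
}

fn main() {
    let updates = vec![
        Version::Addition(BTreeMap::from([("title", "old title")])),
        Version::Addition(BTreeMap::from([("genre", "sf")])),
    ];
    assert_eq!(merge_additions(&updates).unwrap().len(), 2); // fields merged
    assert_eq!(keep_last_addition(&updates).unwrap().len(), 1); // last payload only

    let replaced = vec![
        Version::Addition(BTreeMap::from([("title", "old title")])),
        Version::Deletion,
        Version::Addition(BTreeMap::from([("genre", "sf")])),
    ];
    assert_eq!(merge_additions(&replaced), Some(BTreeMap::from([("genre", "sf")])));
}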
- if let Some(docid) = external_documents_ids.get(entry.key()) { - // If it was already in the list of replaced documents it means it was deleted - // by the remove_document method. We should starts as if it never existed. - if self.replaced_documents_ids.insert(docid) { - original_docid = Some(docid); + HEntry::Occupied(entry) => *entry.get() as u32, + HEntry::Vacant(entry) => { + let docid = match external_documents_ids.get(wtxn, entry.key())? { + Some(docid) => { + // If it was already in the list of replaced documents it means it was deleted + // by the remove_document method. We should start as if it never existed. + if self.replaced_documents_ids.insert(docid) { + original_docid = Some(docid); + } + + docid } - } - let docid = self - .available_documents_ids - .next() - .ok_or(UserError::DocumentLimitReached)?; + None => self + .available_documents_ids + .next() + .ok_or(UserError::DocumentLimitReached)?, + }; entry.insert(docid as u64); docid } @@ -243,11 +243,11 @@ let mut skip_insertion = false; if let Some(original_docid) = original_docid { - let original_key = BEU32::new(original_docid); + let original_key = original_docid; let base_obkv = self .index .documents - .remap_data_type::() + .remap_data_type::() .get(wtxn, &original_key)? .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, @@ -263,47 +263,68 @@ skip_insertion = true; } else { // we associate the base document with the new key, everything will get merged later. - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(base_obkv); - self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; - match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? { - Some(flattened_obkv) => { - // we recreate our buffer with the flattened documents - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(&flattened_obkv); - self.flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)? } - None => self - .flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)?, + let deladd_operation = match self.index_documents_method { + IndexDocumentsMethod::UpdateDocuments => { + DelAddOperation::DeletionAndAddition } + IndexDocumentsMethod::ReplaceDocuments => DelAddOperation::Deletion, + }; + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(base_obkv), + deladd_operation, + &mut document_sorter_value_buffer, + )?; + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; + let base_obkv = KvReader::new(base_obkv); + if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)?
{ + // we recreate our buffer with the flattened documents + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(&flattened_obkv), + deladd_operation, + &mut document_sorter_value_buffer, + )?; } + self.flattened_sorter + .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; } } if !skip_insertion { self.new_documents_ids.insert(docid); - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(&obkv_buffer); + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(&obkv_buffer), + DelAddOperation::Addition, + &mut document_sorter_value_buffer, + )?; // We use the extracted/generated user id as the key for this document. - self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; - match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { - Some(flattened_obkv) => { - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(&flattened_obkv); - self.flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)? - } - None => self - .flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)?, + let flattened_obkv = KvReader::new(&obkv_buffer); + if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(&obkv), + DelAddOperation::Addition, + &mut document_sorter_value_buffer, + )? } + self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; } documents_count += 1; @@ -338,6 +359,7 @@ impl<'a, 'i> Transform<'a, 'i> { /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db, /// it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids. /// - If the document to remove was not present in either the db or the transform we do nothing. + #[logging_timer::time] pub fn remove_documents( &mut self, mut to_remove: Vec, @@ -347,54 +369,176 @@ impl<'a, 'i> Transform<'a, 'i> { where FA: Fn() -> bool + Sync, { + puffin::profile_function!(); + // there may be duplicates in the documents to remove. to_remove.sort_unstable(); to_remove.dedup(); - let external_documents_ids = self.index.external_documents_ids(wtxn)?; + let external_documents_ids = self.index.external_documents_ids(); let mut documents_deleted = 0; + let mut document_sorter_value_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); for to_remove in to_remove { if should_abort() { return Err(Error::InternalError(InternalError::AbortedIndexation)); } - match self.new_external_documents_ids_builder.entry((*to_remove).into()) { - // if the document was added in a previous iteration of the transform we make it as deleted in the sorters. 
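// A minimal, self-contained sketch of the del/add encoding that `into_del_add_obkv`
// performs in the hunks above. This is an editor's illustration with std types only:
// real obkv payloads are flat, ordered byte tables, modeled here as a map from field id
// to a (deletion, addition) pair; `into_del_add` is an illustrative name.
use std::collections::BTreeMap;

#[derive(Clone, Copy)]
enum DelAddOperation {
    Deletion,
    Addition,
    DeletionAndAddition,
}

// Wrap every field of a plain document into a (deletion, addition) pair so that a later
// merge step can compute, field by field, what must leave and what must enter the index.
fn into_del_add(
    document: &BTreeMap<u16, Vec<u8>>,
    operation: DelAddOperation,
) -> BTreeMap<u16, (Option<Vec<u8>>, Option<Vec<u8>>)> {
    document
        .iter()
        .map(|(&field_id, value)| {
            let del = matches!(
                operation,
                DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition
            )
            .then(|| value.clone());
            let add = matches!(
                operation,
                DelAddOperation::Addition | DelAddOperation::DeletionAndAddition
            )
            .then(|| value.clone());
            (field_id, (del, add))
        })
        .collect()
}

fn main() {
    let document = BTreeMap::from([(0u16, b"hello".to_vec())]);
    let obkv = into_del_add(&document, DelAddOperation::DeletionAndAddition);
    // Both sides are populated: a later merge can cancel them out field by field.
    assert_eq!(obkv[&0], (Some(b"hello".to_vec()), Some(b"hello".to_vec())));
}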
- Entry::Occupied(entry) => { - let doc_id = *entry.get() as u32; - self.original_sorter - .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?; - self.flattened_sorter - .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?; + // Check if the document has been added in the current indexing process. + let deleted_from_current = + match self.new_external_documents_ids_builder.entry((*to_remove).into()) { + // if the document was added in a previous iteration of the transform we mark it as deleted in the sorters. + HEntry::Occupied(entry) => { + let docid = *entry.get() as u32; + // Key is the concatenation of the internal docid and the external one. + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes()); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + obkv::KvWriterU16::new(&mut document_sorter_value_buffer).finish().unwrap(); + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; + self.flattened_sorter + .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; - // we must NOT update the list of replaced_documents_ids - // Either: - // 1. It's already in it and there is nothing to do - // 2. It wasn't in it because the document was created by a previous batch and since - // we're removing it there is nothing to do. - self.new_documents_ids.remove(doc_id); - entry.remove_entry(); - } - Entry::Vacant(entry) => { - // If the document was already in the db we mark it as a `to_delete` document. - // It'll be deleted later. We don't need to push anything to the sorters. - if let Some(docid) = external_documents_ids.get(entry.key()) { - self.replaced_documents_ids.insert(docid); - } else { - // if the document is nowehere to be found, there is nothing to do and we must NOT - // increment the count of documents_deleted - continue; + // we must NOT update the list of replaced_documents_ids + // Either: + // 1. It's already in it and there is nothing to do + // 2. It wasn't in it because the document was created by a previous batch and since + // we're removing it there is nothing to do. + self.new_documents_ids.remove(docid); + entry.remove_entry(); + true } + HEntry::Vacant(_) => false, }; + + // If the document was already in the db we mark it as a `to_delete` document. + // Then we push the document into the sorters in deletion mode. + let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? { + Some(docid) => { + self.remove_document_from_db( + docid, + to_remove, + wtxn, + &mut document_sorter_key_buffer, + &mut document_sorter_value_buffer, + )?; + true } + None => false, }; + // increase the counter only if the document existed somewhere before. + if deleted_from_current || deleted_from_db { + documents_deleted += 1; + } + } + + Ok(documents_deleted) + } + + /// Removes documents from the db using their internal document ids. + /// + /// # Warning + /// + /// This function is dangerous and will only work correctly if: + /// + /// - All the passed ids currently exist in the database + /// - No batching using the standard `remove_documents` and `add_documents` took place + /// + /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
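// A minimal, self-contained sketch of the sorter-key layout described in the comments
// above: 4 big-endian bytes of internal docid followed by the external id bytes. This is
// an editor's illustration; `build_key` and `split_key` are hypothetical names, not the
// milli helpers. Big-endian encoding makes lexicographic byte order match numeric docid
// order, so the stable sorters iterate documents in internal-id order.
fn build_key(internal_id: u32, external_id: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(4 + external_id.len());
    key.extend_from_slice(&internal_id.to_be_bytes());
    key.extend_from_slice(external_id.as_bytes());
    key
}

fn split_key(key: &[u8]) -> Option<(u32, &str)> {
    if key.len() < 4 {
        return None;
    }
    let (id_bytes, external) = key.split_at(4);
    let internal_id = u32::from_be_bytes(id_bytes.try_into().ok()?);
    Some((internal_id, std::str::from_utf8(external).ok()?))
}

fn main() {
    let key = build_key(42, "1_72");
    assert_eq!(split_key(&key), Some((42, "1_72")));
}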
+ #[logging_timer::time] + pub fn remove_documents_from_db_no_batch( + &mut self, + to_remove: &RoaringBitmap, + wtxn: &mut heed::RwTxn, + should_abort: FA, + ) -> Result + where + FA: Fn() -> bool + Sync, + { + puffin::profile_function!(); + + let mut documents_deleted = 0; + let mut document_sorter_value_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); + let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?; + + for (internal_docid, external_docid) in to_remove.iter().zip(external_ids) { + let external_docid = external_docid?; + if should_abort() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + self.remove_document_from_db( + internal_docid, + external_docid, + wtxn, + &mut document_sorter_key_buffer, + &mut document_sorter_value_buffer, + )?; + documents_deleted += 1; } Ok(documents_deleted) } + fn remove_document_from_db( + &mut self, + internal_docid: u32, + external_docid: String, + txn: &heed::RoTxn, + document_sorter_key_buffer: &mut Vec, + document_sorter_value_buffer: &mut Vec, + ) -> Result<()> { + self.replaced_documents_ids.insert(internal_docid); + + // fetch the obkv document + let original_key = internal_docid; + let base_obkv = self + .index + .documents + .remap_data_type::() + .get(txn, &original_key)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + })?; + + // Key is the concatenation of the internal docid and the external one. + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&internal_docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes()); + // push it as a deletion into the original_sorter + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(base_obkv), + DelAddOperation::Deletion, + document_sorter_value_buffer, + )?; + self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; + + // flatten it and push it as a deletion into the flattened_sorter + let flattened_obkv = KvReader::new(base_obkv); + if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { + // we recreate our buffer with the flattened documents + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(&obkv), + DelAddOperation::Deletion, + document_sorter_value_buffer, + )?; + } + self.flattened_sorter + .insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?; + Ok(()) + } + // Flatten a document from the fields ids map contained in self and insert the new // created fields. Returns `None` if the document doesn't need to be flattened. fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result>> { @@ -514,42 +658,10 @@ Ok(()) } - fn remove_deleted_documents_from_field_distribution( - &self, - rtxn: &RoTxn, - field_distribution: &mut FieldDistribution, - ) -> Result<()> { - for deleted_docid in self.replaced_documents_ids.iter() { - let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or( - InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, - )?; - - for (key, _) in obkv.iter() { - let name = - self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Computing field distribution in transform.", - })?; - // We checked that the document was in the db earlier.
If we can't find it it means - // there is an inconsistency between the field distribution and the field id map. - let field = - field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Accessing field distribution in transform.", - })?; - *field -= 1; - if *field == 0 { - // since we were able to get the field right before it's safe to unwrap here - field_distribution.remove(name).unwrap(); - } - } - } - Ok(()) - } - /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. + #[logging_timer::time] pub(crate) fn output_from_sorter( self, wtxn: &mut heed::RwTxn, @@ -581,17 +693,13 @@ impl<'a, 'i> Transform<'a, 'i> { // 2. Add all the new documents to the field distribution let mut field_distribution = self.index.field_distribution(wtxn)?; - self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?; - // Here we are going to do the document count + field distribution + `write_into_stream_writer` let mut iter = self.original_sorter.into_stream_merger_iter()?; // used only for the callback let mut documents_count = 0; while let Some((key, val)) = iter.next()? { - if val[0] == Operation::Deletion as u8 { - continue; - } + // skip first byte corresponding to the operation type (Deletion or Addition). let val = &val[1..]; // send a callback to show at which step we are @@ -601,16 +709,51 @@ impl<'a, 'i> Transform<'a, 'i> { total_documents: self.documents_count, }); - // We increment all the field of the current document in the field distribution. - let obkv = KvReader::new(val); - - for (key, _) in obkv.iter() { - let name = - self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Computing field distribution in transform.", - })?; - *field_distribution.entry(name.to_string()).or_insert(0) += 1; + for (key, value) in KvReader::new(val) { + let reader = KvReaderDelAdd::new(value); + match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { + (None, None) => {} + (None, Some(_)) => { + // New field + let name = self.fields_ids_map.name(key).ok_or( + FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + }, + )?; + *field_distribution.entry(name.to_string()).or_insert(0) += 1; + } + (Some(_), None) => { + // Field removed + let name = self.fields_ids_map.name(key).ok_or( + FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + }, + )?; + match field_distribution.entry(name.to_string()) { + BEntry::Vacant(_) => { /* Bug? trying to remove a non-existing field */ + } + BEntry::Occupied(mut entry) => { + // attempt to remove one + match entry.get_mut().checked_sub(1) { + Some(0) => { + entry.remove(); + } + Some(new_val) => { + *entry.get_mut() = new_val; + } + None => { + unreachable!("Attempting to remove a field that wasn't in the field distribution") + } + } + } + } + } + (Some(_), Some(_)) => { + // Value change, no field distribution change + } + } } writer.insert(key, val)?; } @@ -631,9 +774,7 @@ impl<'a, 'i> Transform<'a, 'i> { // We get rids of the `Operation` byte and skip the deleted documents as well. let mut iter = self.flattened_sorter.into_stream_merger_iter()?; while let Some((key, val)) = iter.next()? 
{ - if val[0] == Operation::Deletion as u8 { - continue; - } + // skip first byte corresponding to the operation type (Deletion or Addition). let val = &val[1..]; writer.insert(key, val)?; } @@ -649,15 +790,11 @@ impl<'a, 'i> Transform<'a, 'i> { new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { fst_new_external_documents_ids_builder.insert(key, value) })?; - let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map(); Ok(TransformOutput { primary_key, fields_ids_map: self.fields_ids_map, field_distribution, - new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(), - new_documents_ids: self.new_documents_ids, - replaced_documents_ids: self.replaced_documents_ids, documents_count: self.documents_count, original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, flattened_documents: flattened_documents @@ -672,7 +809,7 @@ impl<'a, 'i> Transform<'a, 'i> { // TODO this can be done in parallel by using the rayon `ThreadPool`. pub fn prepare_for_documents_reindexing( self, - wtxn: &mut heed::RwTxn<'i, '_>, + wtxn: &mut heed::RwTxn<'i>, old_fields_ids_map: FieldsIdsMap, mut new_fields_ids_map: FieldsIdsMap, ) -> Result { @@ -687,37 +824,40 @@ impl<'a, 'i> Transform<'a, 'i> { .to_string(); let field_distribution = self.index.field_distribution(wtxn)?; - // Delete the soft deleted document ids from the maps inside the external_document_ids structure - let new_external_documents_ids = { - let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; - external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; - // This call should be free and can't fail since the previous method merged both fsts. - external_documents_ids.into_static().to_fst()?.into_owned() - }; - let documents_ids = self.index.documents_ids(wtxn)?; let documents_count = documents_ids.len() as usize; - // We create a final writer to write the new documents in order from the sorter. - let mut original_writer = create_writer( + // We initialize the sorter with the user indexing settings. + let mut original_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, - tempfile::tempfile()?, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), ); - // We create a final writer to write the new documents in order from the sorter. - let mut flattened_writer = create_writer( + // We initialize the sorter with the user indexing settings. + let mut flattened_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, - tempfile::tempfile()?, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), ); let mut obkv_buffer = Vec::new(); - for result in self.index.all_documents(wtxn)? { - let (docid, obkv) = result?; + let mut document_sorter_key_buffer = Vec::new(); + let mut document_sorter_value_buffer = Vec::new(); + for result in self.index.external_documents_ids().iter(wtxn)? 
{ + let (external_id, docid) = result?; + let obkv = self.index.documents.get(wtxn, &docid)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, + )?; obkv_buffer.clear(); - let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); + let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer); // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. for (id, name) in new_fields_ids_map.iter() { @@ -727,7 +867,17 @@ impl<'a, 'i> Transform<'a, 'i> { } let buffer = obkv_writer.into_inner()?; - original_writer.insert(docid.to_be_bytes(), &buffer)?; + + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + document_sorter_value_buffer.clear(); + into_del_add_obkv( + KvReaderU16::new(buffer), + DelAddOperation::Addition, + &mut document_sorter_value_buffer, + )?; + original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; // Once we have the document. We're going to flatten it // and insert it in the flattened sorter. @@ -762,29 +912,34 @@ impl<'a, 'i> Transform<'a, 'i> { let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; writer.insert(fid, &value)?; } - flattened_writer.insert(docid.to_be_bytes(), &buffer)?; + document_sorter_value_buffer.clear(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Addition, + &mut document_sorter_value_buffer, + )?; + flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; } - // Once we have written all the documents, we extract - // the file and reset the seek to be able to read it again. - let mut original_documents = original_writer.into_inner()?; - original_documents.rewind()?; + let grenad_params = GrenadParameters { + chunk_compression_type: self.indexer_settings.chunk_compression_type, + chunk_compression_level: self.indexer_settings.chunk_compression_level, + max_memory: self.indexer_settings.max_memory, + max_nb_chunks: self.indexer_settings.max_nb_chunks, // default value, may be chosen. + }; - let mut flattened_documents = flattened_writer.into_inner()?; - flattened_documents.rewind()?; + // Once we have written all the documents, we merge everything into a Reader. 
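// A minimal, self-contained sketch of the field-distribution bookkeeping from the
// `output_from_sorter` hunk further up (editor's illustration, std types only;
// `apply_to_distribution` is an illustrative name): an addition bumps a field's counter,
// a deletion decrements it and drops the entry at zero, and a del+add pair (a value
// change) leaves the distribution untouched, mirroring the four match arms in the diff.
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;

fn apply_to_distribution(
    distribution: &mut BTreeMap<String, u64>,
    field: &str,
    deleted: bool,
    added: bool,
) {
    match (deleted, added) {
        // No payload at all, or a del+add value change: the field count does not move.
        (false, false) | (true, true) => {}
        // The field appears in the new version only: one more document contains it.
        (false, true) => *distribution.entry(field.to_string()).or_insert(0) += 1,
        // The field disappears: decrement, and drop the entry once it reaches zero.
        (true, false) => {
            if let Entry::Occupied(mut entry) = distribution.entry(field.to_string()) {
                match entry.get().checked_sub(1) {
                    Some(0) | None => {
                        entry.remove();
                    }
                    Some(new_count) => *entry.get_mut() = new_count,
                }
            }
        }
    }
}

fn main() {
    let mut distribution = BTreeMap::from([("label".to_string(), 2u64)]);
    apply_to_distribution(&mut distribution, "label", true, false); // one document dropped it
    apply_to_distribution(&mut distribution, "label", true, false); // the last one dropped it
    assert!(distribution.get("label").is_none());
}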
+ let original_documents = sorter_into_reader(original_sorter, grenad_params)?; + + let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; let output = TransformOutput { primary_key, fields_ids_map: new_fields_ids_map, field_distribution, - new_external_documents_ids, - new_documents_ids: documents_ids, - replaced_documents_ids: RoaringBitmap::default(), documents_count, - original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, - flattened_documents: flattened_documents - .into_inner() - .map_err(|err| err.into_error())?, + original_documents: original_documents.into_inner().into_inner(), + flattened_documents: flattened_documents.into_inner().into_inner(), }; let new_facets = output.compute_real_facets(wtxn, self.index)?; @@ -828,38 +983,111 @@ mod test { #[test] fn merge_obkvs() { - let mut doc_0 = Vec::new(); - let mut kv_writer = KvWriter::new(&mut doc_0); + let mut additive_doc_0 = Vec::new(); + let mut deletive_doc_0 = Vec::new(); + let mut del_add_doc_0 = Vec::new(); + let mut kv_writer = KvWriter::memory(); kv_writer.insert(0_u8, [0]).unwrap(); - kv_writer.finish().unwrap(); - doc_0.insert(0, Operation::Addition as u8); - - let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap(); - assert_eq!(*ret, doc_0); - - let ret = merge_obkvs_and_operations( - &[], - &[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())], + let buffer = kv_writer.into_inner().unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Addition, + &mut additive_doc_0, ) .unwrap(); - assert_eq!(*ret, doc_0); - - let ret = merge_obkvs_and_operations( - &[], - &[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())], + additive_doc_0.insert(0, Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Deletion, + &mut deletive_doc_0, ) .unwrap(); - assert_eq!(*ret, [Operation::Deletion as u8]); + deletive_doc_0.insert(0, Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::DeletionAndAddition, + &mut del_add_doc_0, + ) + .unwrap(); + del_add_doc_0.insert(0, Operation::Addition as u8); - let ret = merge_obkvs_and_operations( + let mut additive_doc_1 = Vec::new(); + let mut kv_writer = KvWriter::memory(); + kv_writer.insert(1_u8, [1]).unwrap(); + let buffer = kv_writer.into_inner().unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Addition, + &mut additive_doc_1, + ) + .unwrap(); + additive_doc_1.insert(0, Operation::Addition as u8); + + let mut additive_doc_0_1 = Vec::new(); + let mut kv_writer = KvWriter::memory(); + kv_writer.insert(0_u8, [0]).unwrap(); + kv_writer.insert(1_u8, [1]).unwrap(); + let buffer = kv_writer.into_inner().unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Addition, + &mut additive_doc_0_1, + ) + .unwrap(); + additive_doc_0_1.insert(0, Operation::Addition as u8); + + let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())]) + .unwrap(); + assert_eq!(*ret, additive_doc_0); + + let ret = obkvs_merge_additions_and_deletions( + &[], + &[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())], + ) + .unwrap(); + assert_eq!(*ret, del_add_doc_0); + + let ret = obkvs_merge_additions_and_deletions( + &[], + &[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())], + ) + .unwrap(); + assert_eq!(*ret, deletive_doc_0); + + let ret = 
obkvs_merge_additions_and_deletions( &[], &[ - Cow::from([Operation::Addition as u8, 1].as_slice()), - Cow::from([Operation::Deletion as u8].as_slice()), - Cow::from(doc_0.as_slice()), + Cow::from(additive_doc_1.as_slice()), + Cow::from(deletive_doc_0.as_slice()), + Cow::from(additive_doc_0.as_slice()), ], ) .unwrap(); - assert_eq!(*ret, doc_0); + assert_eq!(*ret, del_add_doc_0); + + let ret = obkvs_merge_additions_and_deletions( + &[], + &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], + ) + .unwrap(); + assert_eq!(*ret, additive_doc_0_1); + + let ret = obkvs_keep_last_addition_merge_deletions( + &[], + &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], + ) + .unwrap(); + assert_eq!(*ret, additive_doc_0); + + let ret = obkvs_keep_last_addition_merge_deletions( + &[], + &[ + Cow::from(deletive_doc_0.as_slice()), + Cow::from(additive_doc_1.as_slice()), + Cow::from(additive_doc_0.as_slice()), + ], + ) + .unwrap(); + assert_eq!(*ret, del_add_doc_0); } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 5895a69c5..49e36b87e 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,5 +1,4 @@ -use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; @@ -7,34 +6,40 @@ use std::io::{self, BufReader}; use bytemuck::allocation::pod_collect_to_vec; use charabia::{Language, Script}; use grenad::MergerBuilder; -use heed::types::ByteSlice; -use heed::RwTxn; +use heed::types::Bytes; +use heed::{PutFlags, RwTxn}; +use log::error; +use obkv::{KvReader, KvWriter}; +use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::helpers::{ - self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, + self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, + valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; use crate::distance::NDotProductPoint; use crate::error::UserError; +use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; +use crate::index::db_name::DOCUMENTS; use crate::index::Hnsw; +use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; -use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32}; +use crate::{lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError}; pub(crate) enum TypedChunk { FieldIdDocidFacetStrings(grenad::Reader), FieldIdDocidFacetNumbers(grenad::Reader), Documents(grenad::Reader), - FieldIdWordcountDocids(grenad::Reader>), - NewDocumentsIds(RoaringBitmap), + FieldIdWordCountDocids(grenad::Reader>), WordDocids { word_docids_reader: grenad::Reader>, exact_word_docids_reader: grenad::Reader>, + word_fid_docids_reader: grenad::Reader>, }, WordPositionDocids(grenad::Reader>), - WordFidDocids(grenad::Reader>), WordPairProximityDocids(grenad::Reader>), FieldIdFacetStringDocids(grenad::Reader>), FieldIdFacetNumberDocids(grenad::Reader>), @@ -43,7 +48,7 @@ pub(crate) enum TypedChunk { FieldIdFacetIsEmptyDocids(grenad::Reader>), GeoPoints(grenad::Reader>), VectorPoints(grenad::Reader>), - ScriptLanguageDocids(HashMap<(Script, 
Language), RoaringBitmap>), + ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } impl TypedChunk { @@ -58,23 +63,22 @@ impl TypedChunk { TypedChunk::Documents(grenad) => { format!("Documents {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::FieldIdWordcountDocids(grenad) => { + TypedChunk::FieldIdWordCountDocids(grenad) => { format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::NewDocumentsIds(grenad) => { - format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!( - "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}", + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => format!( + "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}", word_docids_reader.len(), - exact_word_docids_reader.len() + exact_word_docids_reader.len(), + word_fid_docids_reader.len() ), TypedChunk::WordPositionDocids(grenad) => { format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::WordFidDocids(grenad) => { - format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len()) - } TypedChunk::WordPairProximityDocids(grenad) => { format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) } @@ -99,8 +103,8 @@ impl TypedChunk { TypedChunk::VectorPoints(grenad) => { format!("VectorPoints {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::ScriptLanguageDocids(grenad) => { - format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len()) + TypedChunk::ScriptLanguageDocids(sl_map) => { + format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len()) } } } @@ -119,34 +123,75 @@ pub(crate) fn write_typed_chunk_into_index( let mut is_merged_database = false; match typed_chunk { TypedChunk::Documents(obkv_documents_iter) => { + let mut operations: Vec = Default::default(); + + let mut docids = index.documents_ids(wtxn)?; let mut cursor = obkv_documents_iter.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - index.documents.remap_types::().put(wtxn, key, value)?; + while let Some((key, reader)) = cursor.move_on_next()? 
@@ -119,34 +123,75 @@ pub(crate) fn write_typed_chunk_into_index( let mut is_merged_database = false; match typed_chunk { TypedChunk::Documents(obkv_documents_iter) => { + let mut operations: Vec<DocumentOperation> = Default::default(); + + let mut docids = index.documents_ids(wtxn)?; let mut cursor = obkv_documents_iter.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?; + while let Some((key, reader)) = cursor.move_on_next()? { + let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); + let reader: KvReader<FieldId> = KvReader::new(reader); + + let (document_id_bytes, external_id_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?; + let docid = DocumentId::from_be_bytes(document_id_bytes); + let external_id = std::str::from_utf8(external_id_bytes)?; + + for (field_id, value) in reader.iter() { + let del_add_reader = KvReaderDelAdd::new(value); + + if let Some(addition) = del_add_reader.get(DelAdd::Addition) { + writer.insert(field_id, addition)?; + } + } + + let db = index.documents.remap_data_type::<Bytes>(); + + if !writer.is_empty() { + db.put(wtxn, &docid, &writer.into_inner().unwrap())?; + operations.push(DocumentOperation { + external_id: external_id.to_string(), + internal_id: docid, + kind: DocumentOperationKind::Create, + }); + docids.insert(docid); + } else { + db.delete(wtxn, &docid)?; + operations.push(DocumentOperation { + external_id: external_id.to_string(), + internal_id: docid, + kind: DocumentOperationKind::Delete, + }); + docids.remove(docid); + } } + let external_documents_docids = index.external_documents_ids(); + external_documents_docids.apply(wtxn, operations)?; + index.put_documents_ids(wtxn, &docids)?; } - TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => { + TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { append_entries_into_database( fid_word_count_docids_iter, &index.field_id_word_count_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::NewDocumentsIds(documents_ids) => { - return Ok((documents_ids, is_merged_database)) - } - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => { let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; append_entries_into_database( word_docids_iter.clone(), &index.word_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; @@ -155,8 +200,18 @@ pub(crate) fn write_typed_chunk_into_index( &index.exact_word_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + )?; + + let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; + append_entries_into_database( + word_fid_docids_iter, + &index.word_fid_docids, + wtxn, + index_is_empty, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; // create fst from word docids @@ -177,19 +232,8 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_position_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, - )?; - is_merged_database = true; - } - TypedChunk::WordFidDocids(word_fid_docids_iter) => { - append_entries_into_database( - word_fid_docids_iter, - &index.word_fid_docids, - wtxn, - index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; }
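Every append_entries_into_database call above now pairs deladd_serialize_add_side with merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap; semantically the merge is `(previous - deletions) | additions` on the posting bitmaps, as the ScriptLanguageDocids arm further down spells out literally. A minimal sketch with the roaring crate (the function name is illustrative; the real helper operates on CBO-serialized bytes):

use roaring::RoaringBitmap;

// Sketch: apply a del/add pair to the bitmap already stored for a key.
fn apply_del_add(
    previous: &RoaringBitmap,
    deletions: &RoaringBitmap,
    additions: &RoaringBitmap,
) -> RoaringBitmap {
    (previous - deletions) | additions
}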
@@ -209,8 +253,8 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_exists_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -220,8 +264,8 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_is_null_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -231,8 +275,8 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_is_empty_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -242,28 +286,48 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_pair_proximity_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } TypedChunk::FieldIdDocidFacetNumbers(fid_docid_facet_number) => { let index_fid_docid_facet_numbers = - index.field_id_docid_facet_f64s.remap_types::<ByteSlice, ByteSlice>(); + index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>(); let mut cursor = fid_docid_facet_number.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { + let reader = KvReaderDelAdd::new(value); if valid_lmdb_key(key) { - index_fid_docid_facet_numbers.put(wtxn, key, value)?; + match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { + (None, None) => {} + (None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?, + (Some(_), None) => { + index_fid_docid_facet_numbers.delete(wtxn, key)?; + } + (Some(_), Some(new)) => { + index_fid_docid_facet_numbers.put(wtxn, key, new)? + } + } } } }
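The facet arms use the same four-way (Deletion, Addition) decision throughout: any Addition wins, a lone Deletion removes the key, and (None, None) does nothing. A sketch of that table in isolation (names are illustrative):

// Sketch: what the (deletion, addition) sides mean for a key-value database entry.
enum DbAction<'a> {
    Nothing,
    Put(&'a [u8]),
    Delete,
}

fn decide<'a>(deletion: Option<&'a [u8]>, addition: Option<&'a [u8]>) -> DbAction<'a> {
    match (deletion, addition) {
        (None, None) => DbAction::Nothing,
        // An addition always wins, whether or not a deletion is also present.
        (_, Some(new)) => DbAction::Put(new),
        // A deletion alone removes the entry.
        (Some(_), None) => DbAction::Delete,
    }
}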
TypedChunk::FieldIdDocidFacetStrings(fid_docid_facet_string) => { let index_fid_docid_facet_strings = - index.field_id_docid_facet_strings.remap_types::<ByteSlice, ByteSlice>(); + index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>(); let mut cursor = fid_docid_facet_string.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { + let reader = KvReaderDelAdd::new(value); if valid_lmdb_key(key) { - index_fid_docid_facet_strings.put(wtxn, key, value)?; + match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { + (None, None) => {} + (None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?, + (Some(_), None) => { + index_fid_docid_facet_strings.delete(wtxn, key)?; + } + (Some(_), Some(new)) => { + index_fid_docid_facet_strings.put(wtxn, key, new)? + } + } } } } @@ -276,85 +340,113 @@ // convert the key back to a u32 (4 bytes) let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - // convert the latitude and longitude back to a f64 (8 bytes) - let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap(); - let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); - let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; - let xyz_point = lat_lng_to_xyz(&point); - - rtree.insert(GeoPoint::new(xyz_point, (docid, point))); - geo_faceted_docids.insert(docid); + let deladd_obkv = KvReaderDelAdd::new(value); + if let Some(value) = deladd_obkv.get(DelAdd::Deletion) { + let geopoint = extract_geo_point(value, docid); + rtree.remove(&geopoint); + geo_faceted_docids.remove(docid); + } + if let Some(value) = deladd_obkv.get(DelAdd::Addition) { + let geopoint = extract_geo_point(value, docid); + rtree.insert(geopoint); + geo_faceted_docids.insert(docid); + } } index.put_geo_rtree(wtxn, &rtree)?; index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; }
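The GeoPoints arm relies on extract_geo_point, added further down in this diff, which decodes the 16-byte payload as two native-endian f64s (latitude then longitude). A std-only sketch of that decoding:

// Sketch: decode [lat: f64 | lng: f64] from a 16-byte native-endian buffer.
use std::convert::TryInto;

fn decode_lat_lng(value: &[u8]) -> Option<[f64; 2]> {
    let lat: [u8; 8] = value.get(..8)?.try_into().ok()?;
    let lng: [u8; 8] = value.get(8..16)?.try_into().ok()?;
    Some([f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)])
}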
TypedChunk::VectorPoints(vector_points) => { - let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? { - Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(), - None => Default::default(), - }; - - // Convert the PointIds into DocumentIds - let mut docids = Vec::new(); - for pid in pids { - let docid = - index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap(); - docids.push(docid.get()); + let mut vectors_set = HashSet::new(); + // We extract and store the previous vectors + if let Some(hnsw) = index.vector_hnsw(wtxn)? { + for (pid, point) in hnsw.iter() { + let pid_key = pid.into_inner(); + let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap(); + let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect(); + vectors_set.insert((docid, vector)); + } } - let mut expected_dimensions = points.get(0).map(|p| p.len()); let mut cursor = vector_points.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { // convert the key back to a u32 (4 bytes) let (left, _index) = try_split_array_at(key).unwrap(); let docid = DocumentId::from_be_bytes(left); - // convert the vector back to a Vec<f32> - let vector: Vec<f32> = pod_collect_to_vec(value); - // TODO Inform the user about the document that has a wrong `_vectors` - let found = vector.len(); - let expected = *expected_dimensions.get_or_insert(found); - if expected != found { - return Err(UserError::InvalidVectorDimensions { expected, found })?; + let vector_deladd_obkv = KvReaderDelAdd::new(value); + if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { + // convert the vector back to a Vec<f32> + let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); + let key = (docid, vector); + if !vectors_set.remove(&key) { + error!("Unable to delete the vector: {:?}", key.1); + } + } + if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { + // convert the vector back to a Vec<f32> + let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); + vectors_set.insert((docid, vector)); } - - points.push(NDotProductPoint::new(vector)); - docids.push(docid); } - assert_eq!(docids.len(), points.len()); + // Extract the most common vector dimension + let expected_dimension_size = { + let mut dims = HashMap::new(); + vectors_set.iter().for_each(|(_, v)| *dims.entry(v.len()).or_insert(0) += 1); + dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len) + }; + + // Ensure that the vector lengths are correct and + // prepare the vectors before inserting them in the HNSW. + let mut points = Vec::new(); + let mut docids = Vec::new(); + for (docid, vector) in vectors_set { + if expected_dimension_size.map_or(false, |expected| expected != vector.len()) { + return Err(UserError::InvalidVectorDimensions { + expected: expected_dimension_size.unwrap_or(vector.len()), + found: vector.len(), + } + .into()); + } else { + let vector = vector.into_iter().map(OrderedFloat::into_inner).collect(); + points.push(NDotProductPoint::new(vector)); + docids.push(docid); + } + } let hnsw_length = points.len(); let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points); + assert_eq!(docids.len(), pids.len()); + + // Store the vectors in the point-docid relation database index.vector_id_docid.clear(wtxn)?; for (docid, pid) in docids.into_iter().zip(pids) { - index.vector_id_docid.put( - wtxn, - &BEU32::new(pid.into_inner()), - &BEU32::new(docid), - )?; + index.vector_id_docid.put(wtxn, &pid.into_inner(), &docid)?; } log::debug!("There are {} entries in the HNSW so far", hnsw_length); index.put_vector_hnsw(wtxn, &new_hnsw)?; }
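The expected_dimension_size above is the mode of all vector lengths: vectors are counted per length and the most frequent length becomes the reference dimension that every vector must match. A std-only sketch of that computation:

use std::collections::HashMap;

// Sketch: pick the most frequent vector length among the stored vectors.
fn most_common_dimension(lengths: impl IntoIterator<Item = usize>) -> Option<usize> {
    let mut counts: HashMap<usize, usize> = HashMap::new();
    for len in lengths {
        *counts.entry(len).or_insert(0) += 1;
    }
    counts.into_iter().max_by_key(|&(_, count)| count).map(|(len, _)| len)
}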
- TypedChunk::ScriptLanguageDocids(hash_pair) => { - let mut buffer = Vec::new(); - for (key, value) in hash_pair { - buffer.clear(); + TypedChunk::ScriptLanguageDocids(sl_map) => { + for (key, (deletion, addition)) in sl_map { + let mut db_key_exists = false; let final_value = match index.script_language_docids.get(wtxn, &key)? { Some(db_values) => { - let mut db_value_buffer = Vec::new(); - serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?; - let mut new_value_buffer = Vec::new(); - serialize_roaring_bitmap(&value, &mut new_value_buffer)?; - merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?; - RoaringBitmap::deserialize_from(&buffer[..])? + db_key_exists = true; + (db_values - deletion) | addition } - None => value, + None => addition, }; - index.script_language_docids.put(wtxn, &key, &final_value)?; + + if final_value.is_empty() { + // If the database entry exists, delete it. + if db_key_exists { + index.script_language_docids.delete(wtxn, &key)?; + } + } else { + index.script_language_docids.put(wtxn, &key, &final_value)?; + } } } } @@ -362,6 +454,15 @@ Ok((RoaringBitmap::new(), is_merged_database)) } +/// Converts the latitude and longitude back to an xyz GeoPoint. +fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { + let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap(); + let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); + let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; + let xyz_point = lat_lng_to_xyz(&point); + GeoPoint::new(xyz_point, (docid, point)) +} + fn merge_word_docids_reader_into_fst( word_docids_iter: grenad::Reader<BufReader<File>>, exact_word_docids_iter: grenad::Reader<BufReader<File>>, @@ -379,24 +480,6 @@ fn merge_word_docids_reader_into_fst( Ok(builder.into_set()) } -fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> { - let new_value = RoaringBitmap::deserialize_from(new_value)?; - let db_value = RoaringBitmap::deserialize_from(db_value)?; - let value = new_value | db_value; - Ok(serialize_roaring_bitmap(&value, buffer)?) -} - -fn merge_cbo_roaring_bitmaps( - new_value: &[u8], - db_value: &[u8], - buffer: &mut Vec<u8>, -) -> Result<()> { - Ok(CboRoaringBitmapCodec::merge_into( - &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], - buffer, - )?) -} - /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. fn write_entries_into_database<R, K, V, FS, FM>( @@ -410,29 +493,31 @@ where R: io::Read + io::Seek, FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, - FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>, + FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>, { puffin::profile_function!(format!("number of entries: {}", data.len())); let mut buffer = Vec::new(); - let database = database.remap_types::<ByteSlice, ByteSlice>(); + let database = database.remap_types::<Bytes, Bytes>(); let mut cursor = data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if valid_lmdb_key(key) { buffer.clear(); let value = if index_is_empty { - serialize_value(value, &mut buffer)? + Some(serialize_value(value, &mut buffer)?) } else { match database.get(wtxn, key)? { - Some(prev_value) => { - merge_values(value, prev_value, &mut buffer)?; - &buffer[..] - } - None => serialize_value(value, &mut buffer)?, + Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, + None => Some(serialize_value(value, &mut buffer)?), } }; - database.put(wtxn, key, value)?; + match value { + Some(value) => database.put(wtxn, key, value)?, + None => { + database.delete(wtxn, key)?; + } + } } }
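With the new FM bound, a merge function may return Ok(None) to request deletion of the key instead of a rewrite, and write_entries_into_database above routes Some into put and None into delete. A sketch of that routing, with a BTreeMap standing in for the LMDB database:

use std::collections::BTreeMap;

// Sketch: Some(bytes) => upsert the entry, None => remove it.
fn apply_merge_result(db: &mut BTreeMap<Vec<u8>, Vec<u8>>, key: &[u8], merged: Option<&[u8]>) {
    match merged {
        Some(value) => {
            db.insert(key.to_vec(), value.to_vec());
        }
        None => {
            db.remove(key);
        }
    }
}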
@@ -454,7 +539,8 @@ fn append_entries_into_database<R, K, V, FS, FM>( where R: io::Read + io::Seek, FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, - FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>, + FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>, + K: for<'a> heed::BytesDecode<'a>, { puffin::profile_function!(format!("number of entries: {}", data.len())); @@ -470,14 +556,23 @@ where } let mut buffer = Vec::new(); - let mut database = database.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>(); + let mut database = database.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>(); let mut cursor = data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if valid_lmdb_key(key) { + debug_assert!( + K::bytes_decode(key).is_ok(), + "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}", + key.len(), + &key + ); buffer.clear(); let value = serialize_value(value, &mut buffer)?; - unsafe { database.append(key, value)? }; + unsafe { + // safety: We do not keep a reference to anything that lives inside the database + database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, value)? + }; } }
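append_entries_into_database only takes the LMDB APPEND fast path because grenad readers yield keys in ascending order, and the new debug_assert additionally verifies each key still decodes with the database's key codec. A std-only sketch of the ordering precondition (BTreeMap again stands in for LMDB):

use std::collections::BTreeMap;

// Sketch: appending is only sound if every new key sorts strictly after the last one.
fn append_sorted(
    db: &mut BTreeMap<Vec<u8>, Vec<u8>>,
    key: &[u8],
    value: &[u8],
) -> Result<(), String> {
    if let Some((last, _)) = db.iter().next_back() {
        if last.as_slice() >= key {
            return Err(format!("key {:x?} does not sort after the current last key", key));
        }
    }
    db.insert(key.to_vec(), value.to_vec());
    Ok(())
}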
diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 9982957e5..eb2b6e69a 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -1,6 +1,5 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; -pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDeletionResult}; pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ @@ -9,10 +8,6 @@ pub use self::index_documents::{ MergeFn, }; pub use self::indexer_config::IndexerConfig; -pub use self::prefix_word_pairs::{ - PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, - MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, -}; pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; @@ -21,11 +16,10 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; -mod delete_documents; +pub(crate) mod del_add; pub(crate) mod facet; mod index_documents; mod indexer_config; -mod prefix_word_pairs; mod settings; mod update_step; mod word_prefix_docids; diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs deleted file mode 100644 index e3135d546..000000000 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ /dev/null @@ -1,579 +0,0 @@ -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::{BufReader, BufWriter}; - -use grenad::CompressionType; -use heed::types::ByteSlice; - -use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; -use crate::{Index, Result}; - -mod prefix_word; -mod word_prefix; - -pub use prefix_word::index_prefix_word_database; -pub use word_prefix::index_word_prefix_database; - -pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4; -pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2; - -pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - max_proximity: u8, - max_prefix_length: usize, - chunk_compression_type: CompressionType, - chunk_compression_level: Option<u32>, -} -impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - chunk_compression_type: CompressionType, - chunk_compression_level: Option<u32>, - ) -> Self { - Self { - wtxn, - index, - max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, - max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, - chunk_compression_type, - chunk_compression_level, - } - } - - #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute<'a>( - self, - new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, - new_prefix_fst_words: &'a [String], - common_prefix_fst_words: &[&'a [String]], - del_prefix_fst_words: &HashSet<Vec<u8>>, - ) -> Result<()> { - puffin::profile_function!(); - - index_word_prefix_database( - self.wtxn, - self.index.word_pair_proximity_docids, - self.index.word_prefix_pair_proximity_docids, - self.max_proximity, - self.max_prefix_length, - new_word_pair_proximity_docids.clone(), - new_prefix_fst_words, - common_prefix_fst_words, - del_prefix_fst_words, - self.chunk_compression_type, - self.chunk_compression_level, - )?; - - index_prefix_word_database( - self.wtxn, - self.index.word_pair_proximity_docids, - self.index.prefix_word_pair_proximity_docids, - self.max_proximity, - self.max_prefix_length, - new_word_pair_proximity_docids, - new_prefix_fst_words, - common_prefix_fst_words, - del_prefix_fst_words, - self.chunk_compression_type, - self.chunk_compression_level, - )?; - - Ok(()) - } -} - -// This is adapted from `sorter_into_lmdb_database` -pub fn insert_into_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - new_key: &[u8], - new_value: &[u8], -) -> Result<()> { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; - match iter.next().transpose()? { - Some((key, old_val)) if new_key == key => { - let val = - merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) - .map_err(|_| { - // TODO just wrap this error? - crate::error::InternalError::IndexingMergingKeys { - process: "get-put-merge", - } - })?; - // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour - unsafe { iter.put_current(new_key, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; - } - } - Ok(()) -} - -// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, -// but it uses `append` if the database is empty, and it assumes that the values in the -// writer don't conflict with values in the database. -pub fn write_into_lmdb_database_without_merging( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - writer: grenad::Writer<BufWriter<std::fs::File>>, -) -> Result<()> { - let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?; - let reader = grenad::Reader::new(BufReader::new(file))?; - if database.is_empty(wtxn)? { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - // safety: the key comes from the grenad reader, not the database - unsafe { out_iter.append(k, v)? }; - } - } else { - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - Ok(()) -}
- -#[cfg(test)] -mod tests { - use std::io::Cursor; - use std::iter::FromIterator; - - use roaring::RoaringBitmap; - - use crate::db_snap; - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::update::{DeleteDocuments, DeletionStrategy, IndexDocumentsMethod}; - - fn documents_with_enough_different_words_for_prefixes( - prefixes: &[&str], - start_id: usize, - ) -> Vec<crate::Object> { - let mut documents = Vec::new(); - let mut id = start_id; - for prefix in prefixes { - for i in 0..50 { - documents.push( - serde_json::json!({ - "id": id, - "text": format!("{prefix}{i:x}"), - }) - .as_object() - .unwrap() - .clone(), - ); - id += 1; - } - } - documents - } - - #[test] - fn add_new_documents() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": "9000", - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": "9001", - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100); - documents.push( - serde_json::json!({ - "id": "9002", - "text": "At an extraordinary house" - }) - .as_object() - .unwrap() - .clone(), - ); - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_pair_proximity_docids, "update"); - db_snap!(index, word_prefix_pair_proximity_docids, "update"); - db_snap!(index, prefix_word_pair_proximity_docids, "update"); - } - #[test] - fn batch_bug_3043() { - // https://github.com/meilisearch/meilisearch/issues/3043 - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({
- "text": "x y" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "text": "x a y" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_pair_proximity_docids); - db_snap!(index, word_prefix_pair_proximity_docids); - db_snap!(index, prefix_word_pair_proximity_docids); - } - - #[test] - fn hard_delete_and_reupdate() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysHard); - delete.delete_documents(&RoaringBitmap::from_iter([50])); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, "first_delete"); - db_snap!(index, word_docids, "first_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysHard); - delete.delete_documents(&RoaringBitmap::from_iter(0..50)); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, "second_delete"); - db_snap!(index, word_docids, "second_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "reupdate"); - db_snap!(index, word_docids, "reupdate"); - db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); - db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); - } - - #[test] - fn soft_delete_and_reupdate() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - - index - 
.update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysSoft); - delete.delete_documents(&RoaringBitmap::from_iter([50])); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, "first_delete"); - db_snap!(index, word_docids, "first_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysSoft); - - delete.delete_documents(&RoaringBitmap::from_iter(0..50)); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, "second_delete"); - db_snap!(index, word_docids, "second_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "reupdate"); - db_snap!(index, word_docids, "reupdate"); - db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); - db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); - } - - #[test] - fn replace_soft_deletion() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - 
let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "replaced"); - db_snap!(index, word_docids, "replaced"); - db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); - db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); - db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]"); - } - - #[test] - fn replace_hard_deletion() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; - index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "replaced"); - db_snap!(index, word_docids, "replaced"); - db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); - db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); - db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]"); - } -} diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs deleted file mode 
100644 index 1ec66d010..000000000 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ /dev/null @@ -1,182 +0,0 @@ -use std::borrow::Cow; -use std::collections::{BTreeMap, HashSet}; - -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; - -use crate::update::index_documents::{create_writer, CursorClonableMmap}; -use crate::update::prefix_word_pairs::{ - insert_into_database, write_into_lmdb_database_without_merging, -}; -use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; - -#[allow(clippy::too_many_arguments)] -#[logging_timer::time] -pub fn index_prefix_word_database( - wtxn: &mut heed::RwTxn, - word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, - prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, - max_proximity: u8, - max_prefix_length: usize, - new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, - new_prefix_fst_words: &[String], - common_prefix_fst_words: &[&[String]], - del_prefix_fst_words: &HashSet<Vec<u8>>, - chunk_compression_type: CompressionType, - chunk_compression_level: Option<u32>, -) -> Result<()> { - puffin::profile_function!(); - - let max_proximity = max_proximity - 1; - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - let common_prefixes: Vec<_> = common_prefix_fst_words - .iter() - .flat_map(|s| s.iter()) - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length) - .collect(); - - for proximity in 1..max_proximity { - for prefix in common_prefixes.iter() { - let mut prefix_key = vec![proximity]; - prefix_key.extend_from_slice(prefix.as_bytes()); - let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; - // This is the core of the algorithm - execute_on_word_pairs_and_prefixes( - proximity, - prefix.as_bytes(), - // the next two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.next()? { - let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) - .ok_or(heed::Error::Decoding)?; - Ok(Some((word2, value))) - } else { - Ok(None) - } - }, - // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) - |key, value| { - insert_into_database( - wtxn, - *prefix_word_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - }
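The deleted helper scans the word-pair database through an LMDB prefix iteration whose bound is one proximity byte followed by the prefix's bytes, exactly as prefix_key is built above. A std-only sketch of that key layout:

// Sketch: the scan bound used to range over all `(proximity, word1, word2)` entries
// where word1 starts with `prefix` at the given proximity.
fn prefix_scan_key(proximity: u8, prefix: &str) -> Vec<u8> {
    let mut key = vec![proximity];
    key.extend_from_slice(prefix.as_bytes());
    key
}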
- - // Now we do the same thing with the new prefixes and all word pairs in the DB - let new_prefixes: Vec<_> = new_prefix_fst_words - .iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length) - .collect(); - - // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) - // element in an intermediary grenad - let mut writer = - create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); - - for proximity in 1..max_proximity { - for prefix in new_prefixes.iter() { - let mut prefix_key = vec![proximity]; - prefix_key.extend_from_slice(prefix.as_bytes()); - let mut db_iter = word_pair_proximity_docids - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? - .remap_key_type::<UncheckedU8StrStrCodec>(); - execute_on_word_pairs_and_prefixes( - proximity, - prefix.as_bytes(), - &mut db_iter, - |db_iter| { - db_iter - .next() - .transpose() - .map(|x| x.map(|((_, _, word2), value)| (word2, value))) - .map_err(|e| e.into()) - }, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - } - } - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - wtxn, - *prefix_word_pair_proximity_docids.as_polymorph(), - writer, - )?; - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. - if !del_prefix_fst_words.is_empty() { - let mut iter = - prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?; - while let Some(((_, prefix, _), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(prefix.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; - } - } - } - - Ok(()) -} - -/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. -/// -/// Its arguments are: -/// - an iterator over the words following the given `prefix` with the given `proximity` -/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements -fn execute_on_word_pairs_and_prefixes<I>( - proximity: u8, - prefix: &[u8], - iter: &mut I, - mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, -) -> Result<()> { - let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default(); - - // Memory usage check: - // The content of the loop will be called for each `word2` that follows a word beginning - // with `prefix` with the given proximity. - // In practice, I don't think the batch can ever get too big. - while let Some((word2, docids)) = next_word2_and_docids(iter)?
{ - let entry = batch.entry(word2.to_owned()).or_default(); - entry.push(Cow::Owned(docids.to_owned())); - } - - let mut key_buffer = Vec::with_capacity(512); - key_buffer.push(proximity); - key_buffer.extend_from_slice(prefix); - key_buffer.push(0); - - let mut value_buffer = Vec::with_capacity(65_536); - - for (word2, docids) in batch { - key_buffer.truncate(prefix.len() + 2); - value_buffer.clear(); - - key_buffer.extend_from_slice(&word2); - let data = if docids.len() > 1 { - CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?; - value_buffer.as_slice() - } else { - &docids[0] - }; - insert(key_buffer.as_slice(), data)?; - } - Ok(()) -} diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 6609786a3..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,20 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [101, ] -1 a amazing [100, ] -1 a an [100, ] -1 a and [100, ] -1 a beautiful [100, ] -1 b house [100, ] -1 b rings [101, ] -1 be house [100, ] -1 be rings [101, ] -2 a am [101, ] -2 a amazing [100, ] -2 a and [100, ] -2 a beautiful [100, ] -2 a house [100, ] -2 b at [101, ] -2 be at [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 52b29e136..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,23 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [101, ] -1 amazing a [100, ] -1 an a [100, ] -1 and b [100, ] -1 and be [100, ] -1 at a [100, ] -1 rings a [101, ] -1 the b [101, ] -1 the be [101, ] -2 amazing b [100, ] -2 amazing be [100, ] -2 an a [100, ] -2 at a [100, 101, ] -2 bell a [101, ] -3 an b [100, ] -3 an be [100, ] -3 at a [100, ] -3 rings a [101, ] -3 the a [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 7644c433d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,29 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [101, ] -1 a amazing [100, ] -1 a an [100, 202, ] -1 a and [100, ] -1 a beautiful [100, ] -1 a extraordinary [202, ] -1 am and [100, ] -1 an amazing [100, ] -1 an beautiful [100, ] -1 an extraordinary [202, ] -1 b house [100, ] -1 b rings [101, ] -1 be house [100, ] -1 be rings [101, ] -2 a am [101, ] -2 a amazing [100, ] -2 a and [100, ] -2 a beautiful [100, ] -2 a extraordinary [202, ] -2 a house [100, 202, ] -2 am beautiful [100, ] -2 an and [100, ] -2 an house [100, 202, ] -2 b at [101, ] -2 be at [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap deleted file mode 100644 index 1b56974c2..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap +++ /dev/null @@ -1,33 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 am [101, ] -1 amazing and [100, ] -1 an amazing [100, ] -1 an extraordinary [202, ] -1 and beautiful [100, ] -1 at 5 [101, ] -1 at an [100, 202, ] -1 beautiful house [100, ] -1 bell rings [101, ] -1 extraordinary house [202, ] -1 rings at [101, ] -1 the bell [101, ] -2 amazing beautiful [100, ] -2 an and [100, ] -2 an house [202, ] -2 and house [100, ] -2 at am [101, ] -2 at amazing [100, ] -2 at extraordinary [202, ] -2 bell at [101, ] -2 rings 5 [101, ] -2 the rings [101, ] -3 amazing house [100, ] -3 an beautiful [100, ] -3 at and [100, ] -3 at house [202, ] -3 bell 5 [101, ] -3 rings am [101, ] -3 the at [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 008a4b21d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,31 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [101, ] -1 5 am [101, ] -1 amazing a [100, ] -1 amazing an [100, ] -1 an a [100, ] -1 an am [100, ] -1 and b [100, ] -1 and be [100, ] -1 at a [100, 202, ] -1 at an [100, 202, ] -1 rings a [101, ] -1 the b [101, ] -1 the be [101, ] -2 amazing b [100, ] -2 amazing be [100, ] -2 an a [100, ] -2 an an [100, ] -2 at a [100, 101, ] -2 at am [100, 101, ] -2 bell a [101, ] -3 an b [100, ] -3 an be [100, ] -3 at a [100, ] -3 at an [100, ] -3 rings a [101, ] -3 rings am [101, ] -3 the a [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index d212999bb..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap deleted file mode 100644 index 816895dcf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap +++ /dev/null @@ -1,8 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a y [51, ] -1 x a [51, ] -1 x y [50, ] -2 x y [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 03530a2f1..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a y [51, ] -1 x y [50, ] -2 x y [51, ] - diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap deleted file mode 100644 index 39e9fbe65..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 61987fd4a..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -2 a am [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap deleted file mode 100644 index 1caf1a9a3..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap +++ /dev/null @@ -1,60 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -at [51, ] -bell [51, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 618a0b076..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 rings a [51, ] -2 at a [51, ] -2 bell a [51, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 at a [50, ] -1 rings a [51, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap deleted file mode 100644 index 39e9fbe65..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 267a1c01d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 b rings [51, ] -2 b at [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap deleted file mode 100644 index e5336d58c..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap +++ /dev/null @@ -1,60 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -am [51, ] -at [51, ] -b0 [0, ] -b1 [1, ] -b10 [16, ] -b11 [17, ] -b12 [18, ] -b13 [19, ] -b14 [20, ] -b15 [21, ] -b16 [22, ] -b17 [23, ] -b18 [24, ] -b19 [25, ] -b1a [26, ] -b1b [27, ] -b1c [28, ] -b1d [29, ] -b1e [30, ] -b1f [31, ] -b2 [2, ] -b20 [32, ] -b21 [33, ] -b22 [34, ] -b23 [35, ] -b24 [36, ] -b25 [37, ] -b26 [38, ] -b27 [39, ] -b28 [40, ] -b29 [41, ] -b2a [42, ] -b2b [43, ] -b2c [44, ] -b2d [45, ] -b2e [46, ] -b2f [47, ] -b3 [3, ] -b30 [48, ] -b31 [49, ] -b4 [4, ] -b5 [5, ] -b6 [6, ] -b7 [7, ] -b8 [8, ] -b9 [9, ] -ba [10, ] -bb [11, ] -bc [12, ] -bd [13, ] -be [14, ] -bell [51, ] -bf [15, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 4cdf756ac..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 the b [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap deleted file mode 100644 index 4dca775e6..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 61987fd4a..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -2 a am [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap deleted file mode 100644 index 7949d464e..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -am [51, ] -at [51, ] -bell [51, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 618a0b076..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 rings a [51, ] -2 at a [51, ] -2 bell a [51, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 78b6a3885..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a amazing [50, ] -1 a an [50, ] -1 a house [50, ] -2 a amazing [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap deleted file mode 100644 index 8c7809973..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap +++ /dev/null @@ -1,61 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -amazing [50, ] -an [50, ] -at [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 65d8b806b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 an a [50, ] -1 at a [50, ] -2 at a [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap deleted file mode 100644 index 775d41a3d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 54c9e4b9b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 b rings [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap deleted file mode 100644 index f86fdcb8b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap +++ /dev/null @@ -1,61 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -amazing [50, ] -an [50, ] -at [50, ] -b0 [52, ] -b1 [53, ] -b10 [68, ] -b11 [69, ] -b12 [70, ] -b13 [71, ] -b14 [72, ] -b15 [73, ] -b16 [74, ] -b17 [75, ] -b18 [76, ] -b19 [77, ] -b1a [78, ] -b1b [79, ] -b1c [80, ] -b1d [81, ] -b1e [82, ] -b1f [83, ] -b2 [54, ] -b20 [84, ] -b21 [85, ] -b22 [86, ] -b23 [87, ] -b24 [88, ] -b25 [89, ] -b26 [90, ] -b27 [91, ] -b28 [92, ] -b29 [93, ] -b2a [94, ] -b2b [95, ] -b2c [96, ] -b2d [97, ] -b2e [98, ] -b2f [99, ] -b3 [55, ] -b30 [100, ] -b31 [101, ] -b4 [56, ] -b5 [57, ] -b6 [58, ] -b7 [59, ] -b8 [60, ] -b9 [61, ] -ba [62, ] -bb [63, ] -bc [64, ] -bd [65, ] -be [66, ] -bell [51, ] -bf [67, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 4cdf756ac..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 the b [51, ] - diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 78b6a3885..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a amazing [50, ] -1 a an [50, ] -1 a house [50, ] -2 a amazing [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap deleted file mode 100644 index 8c7809973..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap +++ /dev/null @@ -1,61 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -amazing [50, ] -an [50, ] -at [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 65d8b806b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 an a [50, ] -1 at a [50, ] -2 at a [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap deleted file mode 100644 index 775d41a3d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 
75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 0241f26a5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a amazing [50, ] -1 a an [50, ] -1 a house [50, ] -1 b rings [51, ] -2 a amazing [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap deleted file mode 100644 index 6a481eeee..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5f6443e54fae188aa96d4f27fce28939 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index d20582970..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,8 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 an a [50, ] -1 at a [50, ] -1 the b [51, ] -2 at a [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap deleted file mode 100644 index 39e9fbe65..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap deleted file mode 100644 index 
6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 at a [50, ] -1 rings a [51, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 
[51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 at a [50, ] -1 rings a [51, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap deleted file mode 100644 index c8a1e54b4..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index db62b6566..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,17 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -1 b house [50, ] -1 b rings [51, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] -2 b at [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap deleted file mode 100644 index 7fd726325..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -9f4866b80177e321a33ce434992022b5 diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 2ea0d46f4..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,19 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 and b [50, ] -1 at a [50, ] -1 rings a [51, ] -1 the b [51, ] -2 amazing b [50, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 an b [50, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap deleted file mode 100644 index 4dca775e6..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,15 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1 5 a [51, ]
-1 amazing a [50, ]
-1 an a [50, ]
-1 at a [50, ]
-1 rings a [51, ]
-2 an a [50, ]
-2 at a [50, 51, ]
-2 bell a [51, ]
-3 at a [50, ]
-3 rings a [51, ]
-3 the a [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs
deleted file mode 100644
index 570adece9..000000000
--- a/milli/src/update/prefix_word_pairs/word_prefix.rs
+++ /dev/null
@@ -1,728 +0,0 @@
-/*!
-The word-prefix-pair-proximity-docids database is a database whose keys are of
-the form `(proximity, word, prefix)` and whose values are roaring bitmaps of
-the documents which contain `word` followed by another word starting with
-`prefix` at a distance of `proximity`.
-
-The prefixes present in this database are only those that correspond to many
-different words in the documents.
-
-## How is it created/updated? (simplified version)
-To compute it, we have access to (mainly) two inputs:
-
-* a list of sorted prefixes, such as:
-```text
-c
-ca
-cat
-d
-do
-dog
-```
-Note that only prefixes which correspond to more than a certain number of
-different words from the database are included in this list.
-
-* a sorted list of proximities and word pairs (the proximity is the distance
-between the two words), associated with a roaring bitmap, such as:
-```text
-1 good doggo         -> docids1: [8]
-1 good door          -> docids2: [7, 19, 20]
-1 good ghost         -> docids3: [1]
-2 good dog           -> docids4: [2, 5, 6]
-2 horror cathedral   -> docids5: [1, 2]
-```
-
-I illustrate a simplified version of the algorithm to create the
-word-prefix-pair-proximity database below:
-
-1. **Outer loop:** First, we iterate over each proximity and word pair:
-```text
-proximity: 1
-word1    : good
-word2    : doggo
-```
-2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
-in the list of sorted prefixes. We insert the key `prefix` and the value
-(`docids`) into a sorted map which we call the “batch”. For example,
-at the end of the first outer loop, we may have:
-```text
-Outer loop 1:
-------------------------------
-proximity: 1
-word1    : good
-word2    : doggo
-docids   : docids1
-
-prefixes: [d, do, dog]
-
-batch: [
-    d   -> [docids1]
-    do  -> [docids1]
-    dog -> [docids1]
-]
-```
-3. For illustration purposes, let's run through a second iteration of the outer loop:
-```text
-Outer loop 2:
-------------------------------
-proximity: 1
-word1    : good
-word2    : door
-docids   : docids2
-
-prefixes: [d, do, doo]
-
-batch: [
-    d   -> [docids1, docids2]
-    do  -> [docids1, docids2]
-    dog -> [docids1]
-    doo -> [docids2]
-]
-```
-Notice that there were some conflicts, which were resolved by merging the
-conflicting values together. Also, an additional prefix was added at the
-end of the batch.
-
-4. On the third iteration of the outer loop, we have:
-```text
-Outer loop 3:
-------------------------------
-proximity: 1
-word1    : good
-word2    : ghost
-```
-Because `word2` begins with a different letter than the previous `word2`,
-we know that all the prefixes of `word2` are greater than the prefixes of
-the previous `word2`.
-
-Therefore, we know that we can insert every element from the batch into the
-database before proceeding any further. This operation is called
-“flushing the batch”.
-
-5. Flushing the batch should also be done whenever:
-* `proximity` is different from the previous `proximity`.
-* `word1` is different from the previous `word1`.
-* `word2` starts with a different letter than the previous `word2`.
-
-6. **Flushing the batch:** to flush the batch, we iterate over its elements:
-```text
-Flushing Batch loop 1:
-------------------------------
-proximity : 1
-word1     : good
-prefix    : d
-
-docids    : [docids1, docids2]
-```
-We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using
-`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a
-roaring bitmap of all the document ids where `word1` is followed by `prefix`
-at a distance of `proximity`.
-Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids`
-into the database.
-
-7. That's it! ... except...
-
-## How is it created/updated? (continued)
-
-I lied a little bit about the input data. In reality, we get two sets of the
-inputs described above, which come from different places:
-
-* For the list of sorted prefixes, we have:
-    1. `new_prefixes`, which are all the prefixes that were not present in the
-    database before the insertion of the new documents
-
-    2. `common_prefixes`, which are the prefixes that are present both in the
-    database and in the newly added documents
-
-* For the list of word pairs and proximities, we have:
-    1. `new_word_pairs`, which is the list of word pairs and their proximities
-    present in the newly added documents
-
-    2. `word_pairs_db`, which is the list of word pairs from the database.
-    This list includes all elements in `new_word_pairs` since `new_word_pairs`
-    was added to the database prior to calling the `WordPrefix::execute`
-    function.
-
-To update the prefix database correctly, we call the algorithm described
-earlier first on (`common_prefixes`, `new_word_pairs`) and then on
-(`new_prefixes`, `word_pairs_db`). Thus:
-
-1. For all the word pairs that were already present in the DB, we insert them
-again with the `new_prefixes`. Calling the algorithm on them with the
-`common_prefixes` would not result in any new data.
-
-2. For all the new word pairs, we insert them twice: first with the
-`common_prefixes`, and then, because they are part of `word_pairs_db`, with
-the `new_prefixes`.
-
-Note, also, that since we read data from the database when iterating over
-`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-
-docids from the batch directly into the database (we would have a concurrent
-reader and writer). Therefore, when calling the algorithm on
-`(new_prefixes, word_pairs_db)`, we insert the computed
-`((proximity, word, prefix), docids)` elements into an intermediary grenad
-Writer instead of the DB. At the end of the outer loop, we finally read from
-the grenad and insert its elements into the database.
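To make the batching discipline above concrete, here is a minimal editorial sketch of the outer/inner loop. It is not the deleted implementation: it uses plain `Vec<u32>` docid lists and a linear prefix scan instead of serialized roaring bitmaps and the prefix trie, and "merging" is a simple concatenation rather than a bitmap union.

```rust
use std::collections::BTreeMap;

type DocIds = Vec<u32>; // stand-in for a serialized roaring bitmap

/// For every `((proximity, word1, word2), docids)` entry of a sorted input,
/// accumulate `prefix -> docids` in a batch, and flush the batch whenever
/// `proximity`, `word1`, or the first letter of `word2` changes.
fn word_prefix_pair_proximities(
    sorted_pairs: &[((u8, &str, &str), DocIds)],
    sorted_prefixes: &[&str],
) -> Vec<((u8, String, String), DocIds)> {
    let mut out = Vec::new();
    let mut batch: BTreeMap<String, DocIds> = BTreeMap::new();
    let mut group: Option<(u8, String, u8)> = None; // (proximity, word1, word2[0])

    for ((proximity, word1, word2), docids) in sorted_pairs {
        let key = (*proximity, word1.to_string(), word2.as_bytes()[0]);
        if group.is_some() && group.as_ref() != Some(&key) {
            // Flushing the batch: every (proximity, word1, prefix) key derived
            // from it is now final and comes out in sorted order.
            let (p, w1, _) = group.take().unwrap();
            for (prefix, ids) in std::mem::take(&mut batch) {
                out.push(((p, w1.clone(), prefix), ids));
            }
        }
        group = Some(key);
        // Inner loop: in the real code this is the prefix trie traversal.
        for prefix in sorted_prefixes.iter().filter(|p| word2.starts_with(*p)) {
            batch.entry(prefix.to_string()).or_default().extend(docids);
        }
    }
    if let Some((p, w1, _)) = group {
        for (prefix, ids) in batch {
            out.push(((p, w1, prefix), ids));
        }
    }
    out
}
```

Run on the walkthrough's example data, this yields `d -> [docids1, docids2]`, `do -> [docids1, docids2]`, `dog -> [docids1]`, and `doo -> [docids2]` for `(1, good)` at the flush triggered by `ghost`.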
-*/
-
-use std::borrow::Cow;
-use std::collections::HashSet;
-
-use grenad::CompressionType;
-use heed::types::ByteSlice;
-use heed::BytesDecode;
-use log::debug;
-
-use crate::update::index_documents::{create_writer, CursorClonableMmap};
-use crate::update::prefix_word_pairs::{
-    insert_into_database, write_into_lmdb_database_without_merging,
-};
-use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
-
-#[allow(clippy::too_many_arguments)]
-#[logging_timer::time]
-pub fn index_word_prefix_database(
-    wtxn: &mut heed::RwTxn,
-    word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    max_proximity: u8,
-    max_prefix_length: usize,
-    new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
-    new_prefix_fst_words: &[String],
-    common_prefix_fst_words: &[&[String]],
-    del_prefix_fst_words: &HashSet<Vec<u8>>,
-    chunk_compression_type: CompressionType,
-    chunk_compression_level: Option<u32>,
-) -> Result<()> {
-    puffin::profile_function!();
-    debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
-
-    // Make a prefix trie from the common prefixes that are shorter than `max_prefix_length`
-    let prefixes = PrefixTrieNode::from_sorted_prefixes(
-        common_prefix_fst_words
-            .iter()
-            .flat_map(|s| s.iter())
-            .map(|s| s.as_str())
-            .filter(|s| s.len() <= max_prefix_length),
-    );
-
-    // If the prefix trie is not empty, then we can iterate over all new
-    // word pairs to look for new (proximity, word1, common_prefix) elements
-    // to insert in the DB
-    if !prefixes.is_empty() {
-        let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
-        // This is the core of the algorithm
-        execute_on_word_pairs_and_prefixes(
-            // the first two arguments tell how to iterate over the new word pairs
-            &mut cursor,
-            |cursor| {
-                if let Some((key, value)) = cursor.move_on_next()? {
-                    let (proximity, word1, word2) =
-                        UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
-                    Ok(Some(((proximity, word1, word2), value)))
-                } else {
-                    Ok(None)
-                }
-            },
-            &prefixes,
-            max_proximity,
-            // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap)
-            |key, value| {
-                insert_into_database(
-                    wtxn,
-                    *word_prefix_pair_proximity_docids.as_polymorph(),
-                    key,
-                    value,
-                )
-            },
-        )?;
-    }
-
-    // Now we do the same thing with the new prefixes and all word pairs in the DB
-
-    let prefixes = PrefixTrieNode::from_sorted_prefixes(
-        new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length),
-    );
-
-    if !prefixes.is_empty() {
-        let mut db_iter = word_pair_proximity_docids
-            .remap_key_type::<UncheckedU8StrStrCodec>()
-            .remap_data_type::<ByteSlice>()
-            .iter(wtxn)?;
-
-        // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix)
-        // element in an intermediary grenad
-        let mut writer =
-            create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
-
-        execute_on_word_pairs_and_prefixes(
-            &mut db_iter,
-            |db_iter| db_iter.next().transpose().map_err(|e| e.into()),
-            &prefixes,
-            max_proximity,
-            |key, value| writer.insert(key, value).map_err(|e| e.into()),
-        )?;
-        drop(db_iter);
-
-        // and then we write the grenad into the DB
-        // Since the grenad contains only new prefixes, we know in advance that none
-        // of its elements already exist in the DB, thus there is no need to specify
-        // how to merge conflicting elements
-        write_into_lmdb_database_without_merging(
-            wtxn,
-            *word_prefix_pair_proximity_docids.as_polymorph(),
-            writer,
-        )?;
-    }
-
-    // All of the word prefix pairs in the database whose `word2` prefix is
-    // contained in the `del_prefix_fst_words` set must be removed as well.
-    if !del_prefix_fst_words.is_empty() {
-        let mut iter =
-            word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
-        while let Some(((_, _, prefix), _)) = iter.next().transpose()? {
-            if del_prefix_fst_words.contains(prefix.as_bytes()) {
-                // Delete this entry, as the `word2` prefix is no longer in the words prefix FST.
-                unsafe { iter.del_current()? };
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
-///
-/// Its main arguments are:
-/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements
-/// 2. a prefix trie
-/// 3. a closure describing how to handle each newly computed (proximity, word1, prefix) element
-///
-/// For more information about what this function does, read the module documentation.
-fn execute_on_word_pairs_and_prefixes<I>(
-    iter: &mut I,
-    mut next_word_pair_proximity: impl for<'a> FnMut(
-        &'a mut I,
-    ) -> Result<
-        Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>,
-    >,
-    prefixes: &PrefixTrieNode,
-    max_proximity: u8,
-    mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
-) -> Result<()> {
-    let mut batch = PrefixAndProximityBatch::default();
-    let mut prev_word2_start = 0;
-
-    // Optimisation: the index within the root node of the prefix trie from which to start the search
-    let mut prefix_search_start = PrefixTrieNodeSearchStart(0);
-
-    // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter
-    let mut empty_prefixes = false;
-
-    let mut prefix_buffer = Vec::with_capacity(8);
-    let mut merge_buffer = Vec::with_capacity(65_536);
-
-    while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? {
-        // stop indexing if the proximity is over the threshold
-        if proximity > max_proximity {
-            break;
-        };
-        let word2_start_different_than_prev = word2[0] != prev_word2_start;
-        // If there were no potential prefixes for the previous word2 based on its first letter,
-        // and the current word2 starts with the same letter, then there are also no potential
-        // prefixes for the current word2, and we can skip to the next iteration.
-        if empty_prefixes && !word2_start_different_than_prev {
-            continue;
-        }
-
-        // If the proximity is different from the previous one, OR
-        // if word1 is different from the previous word1, OR
-        // if the start of word2 is different from the previous start of word2,
-        // THEN we'll need to flush the batch.
-        let prox_different_than_prev = proximity != batch.proximity;
-        let word1_different_than_prev = word1 != batch.word1;
-        if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
-        {
-            batch.flush(&mut merge_buffer, &mut insert)?;
-            batch.proximity = proximity;
-            // don't forget to reset the value of batch.word1 and prev_word2_start
-            if word1_different_than_prev {
-                batch.word1.clear();
-                batch.word1.extend_from_slice(word1);
-            }
-            if word2_start_different_than_prev {
-                prev_word2_start = word2[0];
-            }
-            prefix_search_start.0 = 0;
-            // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
-            empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
-        }
-
-        if !empty_prefixes {
-            // All conditions are satisfied, we can now insert each new prefix of word2 into the batch
-            prefix_buffer.clear();
-            prefixes.for_each_prefix_of(
-                word2,
-                &mut prefix_buffer,
-                &prefix_search_start,
-                |prefix_buffer| {
-                    batch.insert(prefix_buffer, data.to_vec());
-                },
-            );
-        }
-    }
-    batch.flush(&mut merge_buffer, &mut insert)?;
-    Ok(())
-}
-/**
-A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps).
-The keys are sorted, and conflicts are resolved by merging the vectors of bitstrings together.
-
-It is used to ensure that all ((proximity, word1, prefix), docids) elements are inserted into the database efficiently and in sorted order.
-
-The batch is flushed as often as possible, i.e. as soon as we are sure that every (proximity, word1, prefix) key derived from its content
-can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments:
-- key   : (proximity, word1, prefix) as bytes
-- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes
-*/
-#[derive(Default)]
-struct PrefixAndProximityBatch {
-    proximity: u8,
-    word1: Vec<u8>,
-    #[allow(clippy::type_complexity)]
-    batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>,
-}
-
-impl PrefixAndProximityBatch {
-    /// Insert the new key and value into the batch.
-    ///
-    /// The key must either already exist in the batch or be greater than all existing keys.
-    fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
-        match self.batch.iter_mut().find(|el| el.0 == new_key) {
-            Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)),
-            None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])),
-        }
-    }
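As a reading aid (editorial addition, not part of the deleted file): the `flush` method below serialises each batch entry under a `U8StrStrCodec`-style key laid out as `[proximity][word1]\0[prefix]`. A hypothetical standalone encoder for that layout would be:

```rust
// Hypothetical helper showing the key layout that `flush` builds into its
// reusable `buffer`: one proximity byte, then word1, a NUL separator, and
// the prefix. The real code reuses a single buffer instead of allocating.
fn encode_prefix_key(proximity: u8, word1: &[u8], prefix: &[u8]) -> Vec<u8> {
    let mut key = Vec::with_capacity(1 + word1.len() + 1 + prefix.len());
    key.push(proximity);
    key.extend_from_slice(word1);
    key.push(0);
    key.extend_from_slice(prefix);
    key
}
```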
-    /// Empties the batch, calling `insert` on each of its elements.
-    ///
-    /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap.
-    fn flush(
-        &mut self,
-        merge_buffer: &mut Vec<u8>,
-        insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
-    ) -> Result<()> {
-        let PrefixAndProximityBatch { proximity, word1, batch } = self;
-        if batch.is_empty() {
-            return Ok(());
-        }
-        merge_buffer.clear();
-
-        let mut buffer = Vec::with_capacity(word1.len() + 1 + 6);
-        buffer.push(*proximity);
-        buffer.extend_from_slice(word1);
-        buffer.push(0);
-
-        for (key, mergeable_data) in batch.drain(..) {
-            buffer.truncate(1 + word1.len() + 1);
-            buffer.extend_from_slice(key.as_slice());
-
-            let data = if mergeable_data.len() > 1 {
-                CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?;
-                merge_buffer.as_slice()
-            } else {
-                &mergeable_data[0]
-            };
-            insert(buffer.as_slice(), data)?;
-            merge_buffer.clear();
-        }
-
-        Ok(())
-    }
-}
-
-/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
-within a set.
-
-## Structure
-The trie is made of nodes composed of:
-1. a byte character (e.g. 'a')
-2. whether the node is an end node or not
-3. a list of children nodes, sorted by their byte character
-
-For example, the trie that stores the strings `[ac, ae, ar, cei, cel, ch, r, rel, ri]`
-is drawn below. Nodes with a double border are "end nodes".
-
-┌──────────────────────┐ ┌──────────────────────┐ ╔══════════════════════╗
-│          a           │ │          c           │ ║          r           ║
-└──────────────────────┘ └──────────────────────┘ ╚══════════════════════╝
-╔══════╗╔══════╗╔══════╗ ┌─────────┐ ╔═════════╗  ┌─────────┐ ╔══════════╗
-║  c   ║║  e   ║║  r   ║ │    e    │ ║    h    ║  │    e    │ ║    i     ║
-╚══════╝╚══════╝╚══════╝ └─────────┘ ╚═════════╝  └─────────┘ ╚══════════╝
-                          ╔═══╗ ╔═══╗                ╔═══╗
-                          ║ i ║ ║ l ║                ║ l ║
-                          ╚═══╝ ╚═══╝                ╚═══╝
-*/
-#[derive(Default, Debug)]
-struct PrefixTrieNode {
-    children: Vec<(PrefixTrieNode, u8)>,
-    is_end_node: bool,
-}
-
-#[derive(Debug)]
-struct PrefixTrieNodeSearchStart(usize);
-
-impl PrefixTrieNode {
-    fn is_empty(&self) -> bool {
-        self.children.is_empty()
-    }
-
-    /// Returns false if the trie does not contain a prefix of the given word.
-    /// Returns true if the trie *may* contain a prefix of the given word.
-    ///
-    /// Moves the search start to the first node equal to the first letter of the word,
-    /// or to 0 otherwise.
-    fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool {
-        let byte = word[0];
-        if self.children[search_start.0].1 == byte {
-            true
-        } else {
-            match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) {
-                Ok(position) => {
-                    search_start.0 += position;
-                    true
-                }
-                Err(_) => {
-                    search_start.0 = 0;
-                    false
-                }
-            }
-        }
-    }
-
-    fn from_sorted_prefixes<'a>(prefixes: impl Iterator<Item = &'a str>) -> Self {
-        let mut node = PrefixTrieNode::default();
-        for prefix in prefixes {
-            node.insert_sorted_prefix(prefix.as_bytes().iter());
-        }
-        node
-    }
-    fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter<u8>) {
-        if let Some(&c) = prefix.next() {
-            if let Some((node, byte)) = self.children.last_mut() {
-                if *byte == c {
-                    node.insert_sorted_prefix(prefix);
-                    return;
-                }
-            }
-            let mut new_node = PrefixTrieNode::default();
-            new_node.insert_sorted_prefix(prefix);
-            self.children.push((new_node, c));
-        } else {
-            self.is_end_node = true;
-        }
-    }
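For orientation (editorial sketch, not part of the deleted file): `for_each_prefix_of` below is an optimised trie walk; a naive equivalent over a sorted prefix list would be:

```rust
// Naive reference version: keep every listed prefix that the word starts
// with. The trie walk below computes the same set while only visiting the
// branch of the trie matching the word's first byte.
fn naive_prefixes_of<'a>(word: &str, sorted_prefixes: &[&'a str]) -> Vec<&'a str> {
    sorted_prefixes.iter().copied().filter(|prefix| word.starts_with(prefix)).collect()
}

// e.g. naive_prefixes_of("arbres", &["arb", "arbre", "cat", "catto"]) == ["arb", "arbre"]
```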
-    /// Call the given closure on each prefix of the word contained in the prefix trie.
-    ///
-    /// The search starts from the given `search_start`.
-    fn for_each_prefix_of(
-        &self,
-        word: &[u8],
-        buffer: &mut Vec<u8>,
-        search_start: &PrefixTrieNodeSearchStart,
-        mut do_fn: impl FnMut(&mut Vec<u8>),
-    ) {
-        let first_byte = word[0];
-        let mut cur_node = self;
-        buffer.push(first_byte);
-        if let Some((child_node, c)) =
-            cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte)
-        {
-            if *c == first_byte {
-                cur_node = child_node;
-                if cur_node.is_end_node {
-                    do_fn(buffer);
-                }
-                for &byte in &word[1..] {
-                    buffer.push(byte);
-                    if let Some((child_node, c)) =
-                        cur_node.children.iter().find(|(_, c)| *c >= byte)
-                    {
-                        if *c == byte {
-                            cur_node = child_node;
-                            if cur_node.is_end_node {
-                                do_fn(buffer);
-                            }
-                        } else {
-                            break;
-                        }
-                    } else {
-                        break;
-                    }
-                }
-            }
-        }
-    }
-}
-#[cfg(test)]
-mod tests {
-    use roaring::RoaringBitmap;
-
-    use super::*;
-    use crate::{CboRoaringBitmapCodec, U8StrStrCodec};
-
-    fn check_prefixes(
-        trie: &PrefixTrieNode,
-        search_start: &PrefixTrieNodeSearchStart,
-        word: &str,
-        expected_prefixes: &[&str],
-    ) {
-        let mut actual_prefixes = vec![];
-        trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| {
-            let s = String::from_utf8(x.to_owned()).unwrap();
-            actual_prefixes.push(s);
-        });
-        assert_eq!(actual_prefixes, expected_prefixes);
-    }
-
-    #[test]
-    fn test_trie() {
-        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
-            "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au",
-            "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c",
-            "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com",
-            "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des",
-            "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f",
-            "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi",
-            "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i",
-            "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka",
-            "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar",
-            "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni",
-            "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi",
-            "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res",
-            "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si",
-            "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t",
-            "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve",
-            "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z",
-        ]));
-
-        let mut search_start = PrefixTrieNodeSearchStart(0);
-
-        let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start);
-        assert!(!is_empty);
-        assert_eq!(search_start.0, 2);
-
-        check_prefixes(&trie, &search_start, "affair", &["a"]);
-        check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]);
-
-        let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start);
-        assert!(!is_empty);
-        assert_eq!(trie.children[search_start.0].1, b'u');
-
-        check_prefixes(&trie, &search_start, "unique", &["u", "un"]);
-
-        // NOTE: this should fail, because the search start is already beyond 'a'
-        let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start);
-        assert!(!is_empty);
-        // search start is reset
-        assert_eq!(search_start.0, 0);
-
-        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
-            "arb", "arbre", "cat", "catto",
-        ]));
-        check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]);
-        check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]);
-    }
-
-    #[test]
-    fn test_execute_on_word_pairs_and_prefixes() {
-        let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
-            "arb", "arbre", "cat", "catto",
-        ]));
-
-        let mut serialised_bitmap123 = vec![];
-        let mut bitmap123 = RoaringBitmap::new();
-        bitmap123.insert(1);
-        bitmap123.insert(2);
-        bitmap123.insert(3);
-        CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123);
-
-        let mut serialised_bitmap456 = vec![];
-        let mut bitmap456 = RoaringBitmap::new();
-        bitmap456.insert(4);
-        bitmap456.insert(5);
-        bitmap456.insert(6);
-        CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456);
-
-        let mut serialised_bitmap789 = vec![];
-        let mut bitmap789 = RoaringBitmap::new();
-        bitmap789.insert(7);
-        bitmap789.insert(8);
-        bitmap789.insert(9);
-        CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789);
-
-        let mut serialised_bitmap_ranges = vec![];
-        let mut bitmap_ranges = RoaringBitmap::new();
-        bitmap_ranges.insert_range(63_000..65_000);
-        bitmap_ranges.insert_range(123_000..128_000);
-        CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
-
-        let word_pairs = [
-            ((1, "healthy", "arbres"), &serialised_bitmap123),
-            ((1, "healthy", "boat"), &serialised_bitmap123),
-            ((1, "healthy", "ca"), &serialised_bitmap123),
-            ((1, "healthy", "cats"), &serialised_bitmap456),
-            ((1, "healthy", "cattos"), &serialised_bitmap123),
-            ((1, "jittery", "cat"), &serialised_bitmap123),
-            ((1, "jittery", "cata"), &serialised_bitmap456),
-            ((1, "jittery", "catb"), &serialised_bitmap789),
-            ((1, "jittery", "catc"), &serialised_bitmap_ranges),
-            ((2, "healthy", "arbre"), &serialised_bitmap123),
-            ((2, "healthy", "arbres"), &serialised_bitmap456),
-            ((2, "healthy", "cats"), &serialised_bitmap789),
-            ((2, "healthy", "cattos"), &serialised_bitmap_ranges),
-            ((3, "healthy", "arbre"), &serialised_bitmap456),
-            ((3, "healthy", "arbres"), &serialised_bitmap789),
-        ];
-
-        let expected_result = [
-            ((1, "healthy", "arb"), bitmap123.clone()),
-            ((1, "healthy", "arbre"), bitmap123.clone()),
-            ((1, "healthy", "cat"), &bitmap456 | &bitmap123),
-            ((1, "healthy", "catto"), bitmap123.clone()),
-            ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
-            ((2, "healthy", "arb"), &bitmap123 | &bitmap456),
-            ((2, "healthy", "arbre"), &bitmap123 | &bitmap456),
-            ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges),
-            ((2, "healthy", "catto"), bitmap_ranges.clone()),
-        ];
-
-        let mut result = vec![];
-
-        let mut iter =
-            IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| {
-                ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice())
-            });
-        execute_on_word_pairs_and_prefixes(
-            &mut iter,
-            |iter| Ok(iter.next()),
-            &prefixes,
-            2,
-            |k, v| {
-                let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
-                let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
-                result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
-                Ok(())
-            },
-        )
-        .unwrap();
-
-        for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) {
-            let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x;
-            let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y;
-
-            assert_eq!(actual_word1, expected_word1);
-            assert_eq!(actual_prefix, expected_prefix);
-            assert_eq!(actual_proximity, expected_proximity);
-            assert_eq!(actual_bitmap, expected_bitmap);
-        }
-    }
-}
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index c2c0e9084..712e595e9 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -12,6 +12,7 @@ use super::IndexerConfig;
 use crate::criterion::Criterion;
 use crate::error::UserError;
 use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
+use crate::proximity::ProximityPrecision;
 use crate::update::index_documents::IndexDocumentsMethod;
 use crate::update::{IndexDocuments, UpdateIndexingStep};
 use crate::{FieldsIdsMap, Index, OrderBy, Result};
@@ -100,8 +101,8 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> {
     }
 }
 
-pub struct Settings<'a, 't, 'u, 'i> {
-    wtxn: &'t mut heed::RwTxn<'i, 'u>,
+pub struct Settings<'a, 't, 'i> {
+    wtxn: &'t mut heed::RwTxn<'i>,
     index: &'i Index,
 
     indexer_config: &'a IndexerConfig,
@@ -127,14 +128,15 @@ pub struct Settings<'a, 't, 'u, 'i> {
     max_values_per_facet: Setting<usize>,
     sort_facet_values_by: Setting<HashMap<String, OrderBy>>,
     pagination_max_total_hits: Setting<usize>,
+    proximity_precision: Setting<ProximityPrecision>,
 }
 
-impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
+impl<'a, 't, 'i> Settings<'a, 't, 'i> {
     pub fn new(
-        wtxn: &'t mut heed::RwTxn<'i, 'u>,
+        wtxn: &'t mut heed::RwTxn<'i>,
         index: &'i Index,
         indexer_config: &'a IndexerConfig,
-    ) -> Settings<'a, 't, 'u, 'i> {
+    ) -> Settings<'a, 't, 'i> {
         Settings {
             wtxn,
             index,
@@ -158,6 +160,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             max_values_per_facet: Setting::NotSet,
             sort_facet_values_by: Setting::NotSet,
             pagination_max_total_hits: Setting::NotSet,
+            proximity_precision: Setting::NotSet,
             indexer_config,
         }
     }
@@ -332,6 +335,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         self.pagination_max_total_hits = Setting::Reset;
     }
 
+    pub fn set_proximity_precision(&mut self, value: ProximityPrecision) {
+        self.proximity_precision = Setting::Set(value);
+    }
+
+    pub fn reset_proximity_precision(&mut self) {
+        self.proximity_precision = Setting::Reset;
+    }
+
     fn reindex<FP, FA>(
         &mut self,
         progress_callback: &FP,
@@ -822,7 +833,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_max_values_per_facet(&mut self) -> Result<()> {
         match self.max_values_per_facet {
             Setting::Set(max) => {
-                self.index.put_max_values_per_facet(self.wtxn, max)?;
+                self.index.put_max_values_per_facet(self.wtxn, max as u64)?;
             }
             Setting::Reset => {
                 self.index.delete_max_values_per_facet(self.wtxn)?;
@@ -850,7 +861,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_pagination_max_total_hits(&mut self) -> Result<()> {
         match self.pagination_max_total_hits {
             Setting::Set(max) => {
-                self.index.put_pagination_max_total_hits(self.wtxn, max)?;
+                self.index.put_pagination_max_total_hits(self.wtxn, max as u64)?;
             }
             Setting::Reset => {
                 self.index.delete_pagination_max_total_hits(self.wtxn)?;
@@ -861,6 +872,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         Ok(())
     }
 
+    fn update_proximity_precision(&mut self) -> Result<bool> {
+        let changed = match self.proximity_precision {
+            Setting::Set(new) => {
+                let old = self.index.proximity_precision(self.wtxn)?;
+                if old == Some(new) {
+                    false
+                } else {
+                    self.index.put_proximity_precision(self.wtxn, new)?;
+                    true
+                }
+            }
+            Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?,
+            Setting::NotSet => false,
+        };
+
+        Ok(changed)
+    }
+
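A hypothetical caller-side sketch of the new setting (editorial; it mirrors the `set_proximity_precision` method and `execute` entry point shown in this diff, with `index`, `config`, and the `ProximityPrecision::ByAttribute` variant assumed from context):

```rust
// Editorial sketch: switching an index to attribute-level proximity
// precision through the Settings builder added in this diff.
let mut wtxn = index.write_txn()?;
let mut settings = Settings::new(&mut wtxn, &index, &config);
settings.set_proximity_precision(ProximityPrecision::ByAttribute);
// Triggers a reindex when update_proximity_precision reports a change.
settings.execute(|_step| (), || false)?;
wtxn.commit()?;
```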
     pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
     where
         FP: Fn(UpdateIndexingStep) + Sync,
@@ -897,6 +926,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         let synonyms_updated = self.update_synonyms()?;
         let searchable_updated = self.update_searchable()?;
         let exact_attributes_updated = self.update_exact_attributes()?;
+        let proximity_precision = self.update_proximity_precision()?;
 
         if stop_words_updated
             || non_separator_tokens_updated
@@ -906,6 +936,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             || synonyms_updated
             || searchable_updated
             || exact_attributes_updated
+            || proximity_precision
         {
             self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?;
         }
@@ -917,13 +948,13 @@
 #[cfg(test)]
 mod tests {
     use big_s::S;
-    use heed::types::ByteSlice;
+    use heed::types::Bytes;
     use maplit::{btreemap, btreeset, hashset};
 
     use super::*;
     use crate::error::Error;
     use crate::index::tests::TempIndex;
-    use crate::update::{ClearDocuments, DeleteDocuments};
+    use crate::update::ClearDocuments;
     use crate::{Criterion, Filter, SearchResult};
 
     #[test]
@@ -1130,7 +1161,7 @@ mod tests {
         }
         let count = index
             .facet_id_f64_docids
-            .remap_key_type::<ByteSlice>()
+            .remap_key_type::<Bytes>()
             // The faceted field id is 1u16
             .prefix_iter(&rtxn, &[0, 1, 0])
             .unwrap()
@@ -1151,7 +1182,7 @@ mod tests {
         // Only count the field_id 0 and level 0 facet values.
         let count = index
             .facet_id_f64_docids
-            .remap_key_type::<ByteSlice>()
+            .remap_key_type::<Bytes>()
             .prefix_iter(&rtxn, &[0, 1, 0])
             .unwrap()
             .count();
@@ -1565,7 +1596,7 @@ mod tests {
             })
             .unwrap_err();
         assert!(matches!(error, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_))));
-        wtxn.abort().unwrap();
+        wtxn.abort();
 
         // But if we clear the database...
         let mut wtxn = index.write_txn().unwrap();
@@ -1731,6 +1762,7 @@
             max_values_per_facet,
             sort_facet_values_by,
             pagination_max_total_hits,
+            proximity_precision,
         } = settings;
         assert!(matches!(searchable_fields, Setting::NotSet));
         assert!(matches!(displayed_fields, Setting::NotSet));
@@ -1752,6 +1784,7 @@
         assert!(matches!(max_values_per_facet, Setting::NotSet));
         assert!(matches!(sort_facet_values_by, Setting::NotSet));
         assert!(matches!(pagination_max_total_hits, Setting::NotSet));
+        assert!(matches!(proximity_precision, Setting::NotSet));
         })
         .unwrap();
     }
@@ -1768,13 +1801,9 @@ mod tests {
         }
         index.add_documents(documents!
{ docs }).unwrap(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - (0..5).for_each(|id| { - builder.delete_external_id(&id.to_string()); - }); - builder.execute().unwrap(); + index.delete_documents((0..5).map(|id| id.to_string()).collect()); + let mut wtxn = index.write_txn().unwrap(); index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_searchable_fields(vec!["id".to_string()]); diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap deleted file mode 100644 index 6d69b2ffb..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap deleted file mode 100644 index 88d3a98aa..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -benoit [2, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap deleted file mode 100644 index 6d69b2ffb..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index 9139b7a05..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[0, 1, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap deleted file mode 100644 index 15c881e87..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -benoit [2, ] -kevin [0, ] -kevina [1, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap deleted file mode 100644 index 87856f6dc..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -2 0 2.2 1 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap deleted file mode 100644 index a7ee4348d..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] -2 [20, 21, 22, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap deleted file mode 100644 index cfa649653..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -2 0 1.2 1 [20, 22, ] -2 0 2.2 1 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap deleted file mode 100644 index 8336bd712..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap +++ /dev/null @@ -1,19 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] -1 0 aquarium 1 [5, ] -1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] -1 0 cartoon 1 [2, 7, 15, 17, ] -1 0 colorfulness 1 [13, ] -1 0 design 1 [2, 18, ] -1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] -1 0 geometry 1 [19, ] -1 0 letter 1 [1, ] -1 0 outdoor 1 [4, ] -1 0 painting 1 [3, ] -1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] -1 0 sign 1 [0, ] -2 0 design 1 [21, ] -2 0 geometry 1 [20, 22, ] - diff --git 
a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index dfac98e59..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[0, 20, 22, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap deleted file mode 100644 index 972a733e2..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap +++ /dev/null @@ -1,42 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ] -2 [20, 21, 22, ] -36 [3, ] -37 [4, ] -38 [5, ] -39 [6, ] -4 [0, ] -40 [7, ] -41 [8, ] -42 [9, ] -43 [10, ] -44 [11, ] -45 [12, ] -46 [13, ] -47 [14, ] -5 [1, ] -52 [15, ] -57 [16, ] -58 [17, ] -68 [18, ] -69 [19, ] -7 [2, ] -70 [20, ] -71 [21, ] -72 [22, ] -abstract [2, 6, 10, 13, 14, 15, 16, 17, ] -aquarium [5, ] -art [4, 5, 8, 9, 10, 12, 17, ] -cartoon [2, 7, 15, 17, ] -colorfulness [13, ] -design [2, 18, 21, ] -drawing [3, 4, 5, 8, 10, 11, 16, ] -geometry [19, 20, 22, ] -letter [1, ] -outdoor [4, ] -painting [3, ] -pattern [2, 3, 9, 10, 13, 14, 16, ] -sign [0, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap deleted file mode 100644 index 941838e34..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap +++ /dev/null @@ -1,29 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 1 2 [20, 22, ] -1 1 36 [3, ] -1 1 37 [4, ] -1 1 38 [5, ] -1 1 39 [6, ] -1 1 4 [0, ] -1 1 40 [7, ] -1 1 41 [8, ] -1 1 42 [9, ] -1 1 43 [10, ] -1 1 44 [11, ] -1 1 45 [12, ] -1 1 46 [13, ] -1 1 47 [14, ] -1 1 5 [1, ] -1 1 52 [15, ] -1 1 57 [16, ] -1 1 58 [17, ] -1 1 68 [18, ] -1 1 69 [19, ] -1 1 7 [2, ] -1 1 70 [20, ] -1 1 71 [21, ] -1 1 72 [22, ] -1 2 2 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git 
a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap deleted file mode 100644 index c909a3cd8..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap +++ /dev/null @@ -1,53 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -3 0 48.9021 1 [19, ] -3 0 49.4449 1 [18, ] -3 0 49.9314 1 [17, ] -3 0 50.1112 1 [16, ] -3 0 50.1793 1 [15, ] -3 0 50.2844 1 [14, ] -3 0 50.3518 1 [13, ] -3 0 50.4095 1 [11, ] -3 0 50.4502 1 [12, ] -3 0 50.6053 1 [8, ] -3 0 50.6224 1 [3, ] -3 0 50.6299 1 [0, ] -3 0 50.6312 1 [2, ] -3 0 50.6415 1 [1, ] -3 0 50.6552 1 [4, ] -3 0 50.6924 1 [5, ] -3 0 50.7263 1 [6, ] -3 0 50.7453 1 [7, ] -3 0 50.8466 1 [10, ] -3 0 51.0537 1 [9, ] -3 1 48.9021 4 [16, 17, 18, 19, ] -3 1 50.1793 4 [11, 13, 14, 15, ] -3 1 50.4502 4 [0, 3, 8, 12, ] -3 1 50.6312 4 [1, 2, 4, 5, ] -3 1 50.7263 4 [6, 7, 9, 10, ] -4 0 2.271 1 [17, ] -4 0 2.3708 1 [19, ] -4 0 2.7637 1 [14, ] -4 0 2.7913 1 [18, ] -4 0 2.8547 1 [16, ] -4 0 3.0569 1 [0, ] -4 0 3.1106 1 [1, 2, ] -4 0 3.1476 1 [3, ] -4 0 3.1541 1 [6, ] -4 0 3.1763 1 [5, ] -4 0 3.1897 1 [4, ] -4 0 3.2189 1 [15, ] -4 0 3.2206 1 [7, ] -4 0 3.3758 1 [8, ] -4 0 3.5326 1 [13, ] -4 0 3.6957 1 [9, ] -4 0 3.9623 1 [12, ] -4 0 4.337 1 [10, ] -4 0 4.4347 1 [11, ] -4 1 2.271 4 [14, 17, 18, 19, ] -4 1 2.8547 4 [0, 1, 2, 3, 16, ] -4 1 3.1541 4 [4, 5, 6, 15, ] -4 1 3.2206 4 [7, 8, 9, 13, ] -4 1 3.9623 3 [10, 11, 12, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index 1260b12de..000000000 --- 
a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[4, 5, 6, 11, 16, 18, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index efcd7af8c..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index efcd7af8c..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index efcd7af8c..000000000 --- 
a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[2, 15, ]
diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs
index a30254994..544bea224 100644
--- a/milli/src/update/word_prefix_docids.rs
+++ b/milli/src/update/word_prefix_docids.rs
@@ -1,31 +1,33 @@
 use std::collections::{HashMap, HashSet};
 
 use grenad::CompressionType;
-use heed::types::{ByteSlice, Str};
+use heed::types::{Bytes, Str};
 use heed::Database;
 
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
-    CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeFn,
 };
-use crate::{Result, RoaringBitmapCodec};
+use crate::{CboRoaringBitmapCodec, Result};
 
-pub struct WordPrefixDocids<'t, 'u, 'i> {
-    wtxn: &'t mut heed::RwTxn<'i, 'u>,
-    word_docids: Database<Str, RoaringBitmapCodec>,
-    word_prefix_docids: Database<Str, RoaringBitmapCodec>,
+pub struct WordPrefixDocids<'t, 'i> {
+    wtxn: &'t mut heed::RwTxn<'i>,
+    word_docids: Database<Str, CboRoaringBitmapCodec>,
+    word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
     pub(crate) chunk_compression_type: CompressionType,
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) max_nb_chunks: Option<usize>,
     pub(crate) max_memory: Option<usize>,
 }
 
-impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
+impl<'t, 'i> WordPrefixDocids<'t, 'i> {
     pub fn new(
-        wtxn: &'t mut heed::RwTxn<'i, 'u>,
-        word_docids: Database<Str, RoaringBitmapCodec>,
-        word_prefix_docids: Database<Str, RoaringBitmapCodec>,
-    ) -> WordPrefixDocids<'t, 'u, 'i> {
+        wtxn: &'t mut heed::RwTxn<'i>,
+        word_docids: Database<Str, CboRoaringBitmapCodec>,
+        word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
+    ) -> WordPrefixDocids<'t, 'i> {
         WordPrefixDocids {
             wtxn,
             word_docids,
@@ -51,7 +53,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         // and write into it at the same time, therefore we write into another file.
         let mut prefix_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_roaring_bitmaps,
+            merge_deladd_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -91,12 +93,17 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         }
 
         // We fetch the docids associated to the newly added word prefix fst only.
-        let db = self.word_docids.remap_data_type::<ByteSlice>();
+        let db = self.word_docids.remap_data_type::<Bytes>();
+        let mut buffer = Vec::new();
         for prefix in new_prefix_fst_words {
             let prefix = std::str::from_utf8(prefix.as_bytes())?;
             for result in db.prefix_iter(self.wtxn, prefix)? {
                 let (_word, data) = result?;
-                prefix_docids_sorter.insert(prefix, data)?;
+                buffer.clear();
+                let mut writer = KvWriterDelAdd::new(&mut buffer);
+                writer.insert(DelAdd::Addition, data)?;
+
+                prefix_docids_sorter.insert(prefix, writer.into_inner()?)?;
             }
         }
 
@@ -110,12 +117,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
 
         drop(iter);
 
+        let database_is_empty = self.word_prefix_docids.is_empty(self.wtxn)?;
+
         // We finally write the word prefix docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.word_prefix_docids.as_polymorph(),
+        write_sorter_into_database(
             prefix_docids_sorter,
-            merge_roaring_bitmaps,
+            &self.word_prefix_docids,
+            self.wtxn,
+            database_is_empty,
+            deladd_serialize_add_side,
+            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
 
         Ok(())
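Worth noting in the hunks above: every value inserted into prefix_docids_sorter is now wrapped in a KvWriterDelAdd envelope that carries a deletion side and an addition side, so the merge_deladd_* functions can reconcile removals and insertions in a single pass. A simplified, self-contained sketch of such an envelope; milli's real KvWriterDelAdd is an obkv writer, and the tag/length/payload byte layout below is invented purely for illustration:

// Invented layout; only the del/add idea matches milli.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum DelAdd {
    #[allow(dead_code)]
    Deletion = 0,
    Addition = 1,
}

// Append one side of the envelope to `buffer`: [tag][len: u32 LE][payload].
fn insert_side(buffer: &mut Vec<u8>, side: DelAdd, payload: &[u8]) {
    buffer.push(side as u8);
    buffer.extend_from_slice(&(payload.len() as u32).to_le_bytes());
    buffer.extend_from_slice(payload);
}

fn main() {
    let mut buffer = Vec::new();
    // The prefix-docids path only ever writes the Addition side: the docids
    // streamed out of word_docids are new data for the prefix entry.
    insert_side(&mut buffer, DelAdd::Addition, b"serialized-docids");
    assert_eq!(buffer[0], DelAdd::Addition as u8);
    assert_eq!(&buffer[5..], b"serialized-docids");
}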
diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs
index c65438928..819cc097b 100644
--- a/milli/src/update/words_prefix_integer_docids.rs
+++ b/milli/src/update/words_prefix_integer_docids.rs
@@ -2,21 +2,23 @@ use std::collections::{HashMap, HashSet};
 use std::str;
 
 use grenad::CompressionType;
-use heed::types::ByteSlice;
+use heed::types::Bytes;
 use heed::{BytesDecode, BytesEncode, Database};
 use log::debug;
 
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
-    CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeFn,
 };
 use crate::{CboRoaringBitmapCodec, Result};
 
-pub struct WordPrefixIntegerDocids<'t, 'u, 'i> {
-    wtxn: &'t mut heed::RwTxn<'i, 'u>,
+pub struct WordPrefixIntegerDocids<'t, 'i> {
+    wtxn: &'t mut heed::RwTxn<'i>,
     prefix_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
     word_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
     pub(crate) chunk_compression_type: CompressionType,
@@ -25,12 +27,12 @@ pub struct WordPrefixIntegerDocids<'t, 'u, 'i> {
     pub(crate) max_memory: Option<usize>,
 }
 
-impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
+impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
     pub fn new(
-        wtxn: &'t mut heed::RwTxn<'i, 'u>,
+        wtxn: &'t mut heed::RwTxn<'i>,
         prefix_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
         word_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
-    ) -> WordPrefixIntegerDocids<'t, 'u, 'i> {
+    ) -> WordPrefixIntegerDocids<'t, 'i> {
         WordPrefixIntegerDocids {
             wtxn,
             prefix_database,
@@ -55,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
         let mut prefix_integer_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_cbo_roaring_bitmaps,
+            merge_deladd_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -70,7 +72,8 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
         let mut current_prefixes: Option<&&[String]> = None;
         let mut prefixes_cache = HashMap::new();
         while let Some((key, data)) = new_word_integer_docids_iter.move_on_next()? {
-            let (word, pos) = StrBEU16Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
+            let (word, pos) =
+                StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?;
 
             current_prefixes = match current_prefixes.take() {
                 Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes),
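For readers unfamiliar with the StrBEU16Codec used in the decode just above: it packs a UTF-8 word followed by a big-endian u16 position into one LMDB key, so entries sort first by word and then numerically by position. A hypothetical re-implementation of that layout; milli's real codec lives in milli/src/heed_codec and differs in its error handling and separator details:

// Hypothetical stand-in for StrBEU16Codec's key layout.
fn encode(word: &str, pos: u16) -> Vec<u8> {
    let mut key = Vec::with_capacity(word.len() + 2);
    key.extend_from_slice(word.as_bytes());
    key.extend_from_slice(&pos.to_be_bytes());
    key
}

fn decode(key: &[u8]) -> Option<(&str, u16)> {
    let (word, pos) = key.split_at(key.len().checked_sub(2)?);
    let pos = u16::from_be_bytes(pos.try_into().ok()?);
    std::str::from_utf8(word).ok().map(|word| (word, pos))
}

fn main() {
    let key = encode("hel", 7);
    assert_eq!(decode(&key), Some(("hel", 7)));
    // Big-endian position bytes keep entries for one word ordered by
    // position when LMDB compares keys lexicographically.
    assert!(encode("hel", 1) < encode("hel", 256));
}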
@@ -107,7 +110,8 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
         }
 
         // We fetch the docids associated to the newly added word prefix fst only.
-        let db = self.word_database.remap_data_type::<ByteSlice>();
+        let db = self.word_database.remap_data_type::<Bytes>();
+        let mut buffer = Vec::new();
         for prefix_bytes in new_prefix_fst_words {
             let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| {
                 SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) }
@@ -115,7 +119,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
 
             // iter over all lines of the DB where the key is prefixed by the current prefix.
             let iter = db
-                .remap_key_type::<ByteSlice>()
+                .remap_key_type::<Bytes>()
                 .prefix_iter(self.wtxn, prefix_bytes.as_bytes())?
                 .remap_key_type::<StrBEU16Codec>();
             for result in iter {
@@ -123,7 +127,11 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
                 if word.starts_with(prefix) {
                     let key = (prefix, pos);
                     let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
-                    prefix_integer_docids_sorter.insert(bytes, data)?;
+
+                    buffer.clear();
+                    let mut writer = KvWriterDelAdd::new(&mut buffer);
+                    writer.insert(DelAdd::Addition, data)?;
+                    prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?;
                 }
             }
         }
@@ -143,12 +151,16 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
             drop(iter);
         }
 
+        let database_is_empty = self.prefix_database.is_empty(self.wtxn)?;
+
         // We finally write all the word prefix integer docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.prefix_database.as_polymorph(),
+        write_sorter_into_database(
             prefix_integer_docids_sorter,
-            merge_cbo_roaring_bitmaps,
+            &self.prefix_database,
+            self.wtxn,
+            database_is_empty,
+            deladd_serialize_add_side,
+            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
 
         Ok(())
@@ -159,6 +171,7 @@ fn write_prefixes_in_sorter(
     prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
     sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
+    // TODO: Merge before insertion.
     for (key, data_slices) in prefixes.drain() {
         for data in data_slices {
             if valid_lmdb_key(&key) {
diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs
index 121b45c4a..f26bf93e5 100644
--- a/milli/src/update/words_prefixes_fst.rs
+++ b/milli/src/update/words_prefixes_fst.rs
@@ -2,21 +2,19 @@ use std::iter::{repeat_with, FromIterator};
 use std::str;
 
 use fst::{SetBuilder, Streamer};
+use heed::RwTxn;
 
 use crate::{Index, Result, SmallString32};
 
-pub struct WordsPrefixesFst<'t, 'u, 'i> {
-    wtxn: &'t mut heed::RwTxn<'i, 'u>,
+pub struct WordsPrefixesFst<'t, 'i> {
+    wtxn: &'t mut RwTxn<'i>,
     index: &'i Index,
     threshold: u32,
     max_prefix_length: usize,
 }
 
-impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
-    pub fn new(
-        wtxn: &'t mut heed::RwTxn<'i, 'u>,
-        index: &'i Index,
-    ) -> WordsPrefixesFst<'t, 'u, 'i> {
+impl<'t, 'i> WordsPrefixesFst<'t, 'i> {
+    pub fn new(wtxn: &'t mut RwTxn<'i>, index: &'i Index) -> WordsPrefixesFst<'t, 'i> {
         WordsPrefixesFst { wtxn, index, threshold: 100, max_prefix_length: 4 }
     }
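The WordsPrefixesFst defaults visible above (threshold 100, max prefix length 4) describe what the structure computes: only prefixes short enough and common enough to be worth precomputing get an entry in the prefix FST. A rough standalone sketch of that selection with the fst crate; the counting strategy here is illustrative only, since the real implementation streams the words FST instead of materializing counts:

use std::collections::BTreeMap;

use fst::Set;

// Keep every prefix of at most `max_prefix_length` bytes that starts at
// least `threshold` of the given words.
fn build_prefix_set(
    words: &[&str],
    threshold: usize,
    max_prefix_length: usize,
) -> Result<Set<Vec<u8>>, fst::Error> {
    let mut counts: BTreeMap<&str, usize> = BTreeMap::new();
    for word in words {
        for len in 1..=max_prefix_length.min(word.len()) {
            if word.is_char_boundary(len) {
                *counts.entry(&word[..len]).or_insert(0) += 1;
            }
        }
    }
    // A BTreeMap iterates in lexicographic order, which is exactly the
    // insertion order an FST builder requires.
    Set::from_iter(counts.into_iter().filter(|&(_, n)| n >= threshold).map(|(p, _)| p))
}

fn main() -> Result<(), fst::Error> {
    let set = build_prefix_set(&["hello", "help", "helm", "cat"], 3, 3)?;
    assert!(set.contains("hel")); // shared by three words
    assert!(!set.contains("cat")); // too rare to be worth precomputing
    Ok(())
}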
diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs
index 1c68cfff2..9193ab762 100644
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@@ -88,9 +88,11 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
 
 pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec<String> {
     let rtxn = index.read_txn().unwrap();
-    let docid_map = index.external_documents_ids(&rtxn).unwrap();
-    let docid_map: std::collections::HashMap<_, _> =
-        EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect();
+    let docid_map = index.external_documents_ids();
+    let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS
+        .iter()
+        .map(|id| (docid_map.get(&rtxn, id).unwrap().unwrap(), id))
+        .collect();
     internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect()
 }
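The change in this last hunk reflects an API shift: external document ids are no longer snapshotted into an in-memory map, so a lookup now borrows the read transaction and returns a Result wrapping an Option, hence the double unwrap. A hypothetical mock of that shape; none of these types are milli's real ones:

// Hypothetical stand-ins; milli's real lookup goes through an LMDB database.
struct RoTxn;
struct ExternalDocumentsIds;

impl ExternalDocumentsIds {
    // The storage layer can fail (outer Result) and the external id may be
    // unknown (inner Option), which is why callers unwrap twice.
    fn get(&self, _rtxn: &RoTxn, external_id: &str) -> Result<Option<u32>, &'static str> {
        Ok(external_id.parse::<u32>().ok())
    }
}

fn main() {
    let (ids, rtxn) = (ExternalDocumentsIds, RoTxn);
    let internal_id = ids.get(&rtxn, "42").unwrap().unwrap();
    assert_eq!(internal_id, 42);
}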