diff --git a/.github/workflows/benchmarks-pr.yml b/.github/workflows/benchmarks-pr.yml index aa784296a..30baa294e 100644 --- a/.github/workflows/benchmarks-pr.yml +++ b/.github/workflows/benchmarks-pr.yml @@ -90,7 +90,8 @@ jobs: set -x export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8) export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json) - echo 'Here are your benchmarks diff 👊' >> body.txt + export bench_name=$(echo ${{ steps.command.outputs.command-arguments }}) + echo "Here are your $bench_name benchmarks diff 👊" >> body.txt echo '```' >> body.txt ./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt echo '```' >> body.txt diff --git a/.github/workflows/publish-apt-brew-pkg.yml b/.github/workflows/publish-apt-brew-pkg.yml index 452776e38..11893bae0 100644 --- a/.github/workflows/publish-apt-brew-pkg.yml +++ b/.github/workflows/publish-apt-brew-pkg.yml @@ -50,7 +50,7 @@ jobs: needs: check-version steps: - name: Create PR to Homebrew - uses: mislav/bump-homebrew-formula-action@v2 + uses: mislav/bump-homebrew-formula-action@v3 with: formula-name: meilisearch formula-path: Formula/m/meilisearch.rb diff --git a/.github/workflows/publish-docker-images.yml b/.github/workflows/publish-docker-images.yml index 051fb105d..1ee8ba4d0 100644 --- a/.github/workflows/publish-docker-images.yml +++ b/.github/workflows/publish-docker-images.yml @@ -63,7 +63,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Login to Docker Hub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} diff --git a/.github/workflows/sdks-tests.yml b/.github/workflows/sdks-tests.yml index 05cf6b91c..7b6ea74de 100644 --- a/.github/workflows/sdks-tests.yml +++ b/.github/workflows/sdks-tests.yml @@ -160,7 +160,7 @@ jobs: with: repository: meilisearch/meilisearch-js - 
name: Setup node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: cache: 'yarn' - name: Install dependencies @@ -318,7 +318,7 @@ jobs: with: repository: meilisearch/meilisearch-js-plugins - name: Setup node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: cache: yarn - name: Install dependencies diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index a44a843fe..ed9cafa79 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -43,7 +43,7 @@ jobs: toolchain: nightly override: true - name: Cache dependencies - uses: Swatinem/rust-cache@v2.6.2 + uses: Swatinem/rust-cache@v2.7.1 - name: Run cargo check without any default features uses: actions-rs/cargo@v1 with: @@ -65,7 +65,7 @@ jobs: steps: - uses: actions/checkout@v3 - name: Cache dependencies - uses: Swatinem/rust-cache@v2.6.2 + uses: Swatinem/rust-cache@v2.7.1 - name: Run cargo check without any default features uses: actions-rs/cargo@v1 with: @@ -149,7 +149,7 @@ jobs: toolchain: stable override: true - name: Cache dependencies - uses: Swatinem/rust-cache@v2.6.2 + uses: Swatinem/rust-cache@v2.7.1 - name: Run tests in debug uses: actions-rs/cargo@v1 with: @@ -168,7 +168,7 @@ jobs: override: true components: clippy - name: Cache dependencies - uses: Swatinem/rust-cache@v2.6.2 + uses: Swatinem/rust-cache@v2.7.1 - name: Run cargo clippy uses: actions-rs/cargo@v1 with: @@ -187,7 +187,7 @@ jobs: override: true components: rustfmt - name: Cache dependencies - uses: Swatinem/rust-cache@v2.6.2 + uses: Swatinem/rust-cache@v2.7.1 - name: Run cargo fmt # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. 
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate diff --git a/Cargo.lock b/Cargo.lock index 017257512..75d8463e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1731,12 +1731,13 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "grenad" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93" +checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c" dependencies = [ "bytemuck", "byteorder", + "rayon", "tempfile", ] @@ -3281,6 +3282,7 @@ dependencies = [ "logging_timer", "maplit", "md5", + "meili-snap", "memmap2", "mimalloc", "obkv", @@ -3443,9 +3445,9 @@ dependencies = [ [[package]] name = "obkv" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385" +checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080" [[package]] name = "once_cell" diff --git a/README.md b/README.md index cb9475dea..88621729d 100644 --- a/README.md +++ b/README.md @@ -25,12 +25,6 @@

⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍

---- - -### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A! - ---- - Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.

diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 9446c0b0f..65f581b93 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -6,9 +6,7 @@ use std::path::Path; use criterion::{criterion_group, criterion_main, Criterion}; use milli::heed::{EnvOpenOptions, RwTxn}; -use milli::update::{ - DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, -}; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::Index; use rand::seq::SliceRandom; use rand_chacha::rand_core::SeedableRng; @@ -266,17 +264,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { (index, document_ids_to_delete) }, move |(index, document_ids_to_delete)| { - let mut wtxn = index.write_txn().unwrap(); - - for ids in document_ids_to_delete { - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_documents(&ids); - builder.execute().unwrap(); - } - - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); + delete_documents_from_ids(index, document_ids_to_delete) }, ) }); @@ -613,17 +601,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { (index, document_ids_to_delete) }, move |(index, document_ids_to_delete)| { - let mut wtxn = index.write_txn().unwrap(); - - for ids in document_ids_to_delete { - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_documents(&ids); - builder.execute().unwrap(); - } - - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); + delete_documents_from_ids(index, document_ids_to_delete) }, ) }); @@ -875,22 +853,31 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { (index, document_ids_to_delete) }, move |(index, document_ids_to_delete)| { - let mut wtxn = index.write_txn().unwrap(); - - for ids in document_ids_to_delete { - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_documents(&ids); - 
builder.execute().unwrap(); - } - - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); + delete_documents_from_ids(index, document_ids_to_delete) }, ) }); } +fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec) { + let mut wtxn = index.write_txn().unwrap(); + + let indexer_config = IndexerConfig::default(); + for ids in document_ids_to_delete { + let config = IndexDocumentsConfig::default(); + + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false) + .unwrap(); + (builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap(); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); +} + fn indexing_movies_in_three_batches(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); @@ -1112,17 +1099,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { (index, document_ids_to_delete) }, move |(index, document_ids_to_delete)| { - let mut wtxn = index.write_txn().unwrap(); - - for ids in document_ids_to_delete { - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_documents(&ids); - builder.execute().unwrap(); - } - - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); + delete_documents_from_ids(index, document_ids_to_delete) }, ) }); @@ -1338,17 +1315,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { (index, document_ids_to_delete) }, move |(index, document_ids_to_delete)| { - let mut wtxn = index.write_txn().unwrap(); - - for ids in document_ids_to_delete { - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_documents(&ids); - builder.execute().unwrap(); - } - - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); + delete_documents_from_ids(index, document_ids_to_delete) }, ) }); diff --git a/dump/src/reader/mod.rs b/dump/src/reader/mod.rs index af02888d2..603c557d6 
100644 --- a/dump/src/reader/mod.rs +++ b/dump/src/reader/mod.rs @@ -526,12 +526,12 @@ pub(crate) mod test { assert!(indexes.is_empty()); // products - insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(products.metadata(), @r###" { "uid": "products", "primaryKey": "sku", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2022-10-09T20:27:22.688964637Z", + "updatedAt": "2022-10-09T20:27:23.951017769Z" } "###); @@ -541,12 +541,12 @@ pub(crate) mod test { meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); // movies - insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(movies.metadata(), @r###" { "uid": "movies", "primaryKey": "id", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2022-10-09T20:27:22.197788495Z", + "updatedAt": "2022-10-09T20:28:01.93111053Z" } "###); @@ -571,12 +571,12 @@ pub(crate) mod test { meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce"); // spells - insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(spells.metadata(), @r###" { "uid": "dnd_spells", "primaryKey": "index", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2022-10-09T20:27:24.242683494Z", + "updatedAt": "2022-10-09T20:27:24.312809641Z" } "###); @@ -617,12 +617,12 @@ pub(crate) mod test { assert!(indexes.is_empty()); // products - insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(products.metadata(), @r###" { "uid": "products", "primaryKey": "sku", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2023-01-30T16:25:56.595257Z", + "updatedAt": "2023-01-30T16:25:58.70348Z" } "###); @@ -632,12 
+632,12 @@ pub(crate) mod test { meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); // movies - insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(movies.metadata(), @r###" { "uid": "movies", "primaryKey": "id", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2023-01-30T16:25:56.192178Z", + "updatedAt": "2023-01-30T16:25:56.455714Z" } "###); @@ -647,12 +647,12 @@ pub(crate) mod test { meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720"); // spells - insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(spells.metadata(), @r###" { "uid": "dnd_spells", "primaryKey": "index", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2023-01-30T16:25:58.876405Z", + "updatedAt": "2023-01-30T16:25:59.079906Z" } "###); diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap deleted file mode 100644 index 92fc61d72..000000000 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap +++ /dev/null @@ -1,24 +0,0 @@ ---- -source: dump/src/reader/mod.rs -expression: spells.settings().unwrap() ---- -{ - "displayedAttributes": [ - "*" - ], - "searchableAttributes": [ - "*" - ], - "filterableAttributes": [], - "sortableAttributes": [], - "rankingRules": [ - "typo", - "words", - "proximity", - "attribute", - "exactness" - ], - "stopWords": [], - "synonyms": {}, - "distinctAttribute": null -} diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap deleted file mode 100644 index b0b54c136..000000000 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap +++ /dev/null @@ -1,38 +0,0 @@ 
---- -source: dump/src/reader/mod.rs -expression: products.settings().unwrap() ---- -{ - "displayedAttributes": [ - "*" - ], - "searchableAttributes": [ - "*" - ], - "filterableAttributes": [], - "sortableAttributes": [], - "rankingRules": [ - "typo", - "words", - "proximity", - "attribute", - "exactness" - ], - "stopWords": [], - "synonyms": { - "android": [ - "phone", - "smartphone" - ], - "iphone": [ - "phone", - "smartphone" - ], - "phone": [ - "android", - "iphone", - "smartphone" - ] - }, - "distinctAttribute": null -} diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap deleted file mode 100644 index 5c12a0438..000000000 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap +++ /dev/null @@ -1,31 +0,0 @@ ---- -source: dump/src/reader/mod.rs -expression: movies.settings().unwrap() ---- -{ - "displayedAttributes": [ - "*" - ], - "searchableAttributes": [ - "*" - ], - "filterableAttributes": [ - "genres", - "id" - ], - "sortableAttributes": [ - "genres", - "id" - ], - "rankingRules": [ - "typo", - "words", - "proximity", - "attribute", - "exactness", - "release_date:asc" - ], - "stopWords": [], - "synonyms": {}, - "distinctAttribute": null -} diff --git a/dump/src/reader/v2/mod.rs b/dump/src/reader/v2/mod.rs index 4016e6341..a0ff13a3b 100644 --- a/dump/src/reader/v2/mod.rs +++ b/dump/src/reader/v2/mod.rs @@ -46,6 +46,7 @@ pub type Checked = settings::Checked; pub type Unchecked = settings::Unchecked; pub type Task = updates::UpdateEntry; +pub type Kind = updates::UpdateMeta; // everything related to the errors pub type ResponseError = errors::ResponseError; @@ -107,8 +108,11 @@ impl V2Reader { pub fn indexes(&self) -> Result> + '_> { Ok(self.index_uuid.iter().map(|index| -> Result<_> { V2IndexReader::new( - index.uid.clone(), &self.dump.path().join("indexes").join(format!("index-{}", index.uuid)), + index, + BufReader::new( + 
File::open(self.dump.path().join("updates").join("data.jsonl"))?, + ), ) })) } @@ -143,16 +147,41 @@ pub struct V2IndexReader { } impl V2IndexReader { - pub fn new(name: String, path: &Path) -> Result { + pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader) -> Result { let meta = File::open(path.join("meta.json"))?; let meta: DumpMeta = serde_json::from_reader(meta)?; + let mut created_at = None; + let mut updated_at = None; + + for line in tasks.lines() { + let task: Task = serde_json::from_str(&line?)?; + if !(task.uuid == index_uuid.uuid && task.is_finished()) { + continue; + } + + let new_created_at = match task.update.meta() { + Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(), + _ => None, + }; + let new_updated_at = task.update.finished_at(); + // `None < Some(_)`: a task that cannot create the index must not reset `created_at`. + if new_created_at.is_some() && (created_at.is_none() || created_at > new_created_at) { + created_at = new_created_at; + } + + if updated_at.is_none() || updated_at < new_updated_at { + updated_at = new_updated_at; + } + } + + let current_time = OffsetDateTime::now_utc(); + let metadata = IndexMetadata { - uid: name, + uid: index_uuid.uid.clone(), primary_key: meta.primary_key, - // FIXME: Iterate over the whole task queue to find the creation and last update date.
- created_at: OffsetDateTime::now_utc(), - updated_at: OffsetDateTime::now_utc(), + created_at: created_at.unwrap_or(current_time), + updated_at: updated_at.unwrap_or(current_time), }; let ret = V2IndexReader { @@ -248,12 +277,12 @@ pub(crate) mod test { assert!(indexes.is_empty()); // products - insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(products.metadata(), @r###" { "uid": "products", "primaryKey": "sku", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2022-10-09T20:27:22.688964637Z", + "updatedAt": "2022-10-09T20:27:23.951017769Z" } "###); @@ -263,12 +292,12 @@ pub(crate) mod test { meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); // movies - insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(movies.metadata(), @r###" { "uid": "movies", "primaryKey": "id", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2022-10-09T20:27:22.197788495Z", + "updatedAt": "2022-10-09T20:28:01.93111053Z" } "###); @@ -293,12 +322,12 @@ pub(crate) mod test { meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce"); // spells - insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(spells.metadata(), @r###" { "uid": "dnd_spells", "primaryKey": "index", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2022-10-09T20:27:24.242683494Z", + "updatedAt": "2022-10-09T20:27:24.312809641Z" } "###); @@ -340,12 +369,12 @@ pub(crate) mod test { assert!(indexes.is_empty()); // products - insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(products.metadata(), @r###" { "uid": "products", "primaryKey": "sku", - "createdAt": 
"[now]", - "updatedAt": "[now]" + "createdAt": "2023-01-30T16:25:56.595257Z", + "updatedAt": "2023-01-30T16:25:58.70348Z" } "###); @@ -355,12 +384,12 @@ pub(crate) mod test { meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); // movies - insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(movies.metadata(), @r###" { "uid": "movies", "primaryKey": "id", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2023-01-30T16:25:56.192178Z", + "updatedAt": "2023-01-30T16:25:56.455714Z" } "###); @@ -370,12 +399,12 @@ pub(crate) mod test { meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720"); // spells - insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" + insta::assert_json_snapshot!(spells.metadata(), @r###" { "uid": "dnd_spells", "primaryKey": "index", - "createdAt": "[now]", - "updatedAt": "[now]" + "createdAt": "2023-01-30T16:25:58.876405Z", + "updatedAt": "2023-01-30T16:25:59.079906Z" } "###); diff --git a/dump/src/reader/v2/updates.rs b/dump/src/reader/v2/updates.rs index 33d88d46f..bf1227c7a 100644 --- a/dump/src/reader/v2/updates.rs +++ b/dump/src/reader/v2/updates.rs @@ -227,4 +227,14 @@ impl UpdateStatus { _ => None, } } + + pub fn finished_at(&self) -> Option { + match self { + UpdateStatus::Processing(_) => None, + UpdateStatus::Enqueued(_) => None, + UpdateStatus::Processed(u) => Some(u.processed_at), + UpdateStatus::Aborted(_) => None, + UpdateStatus::Failed(u) => Some(u.failed_at), + } + } } diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index aa93cda2a..661285325 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -24,14 +24,13 @@ use std::fs::{self, File}; use std::io::BufWriter; use dump::IndexMetadata; -use log::{debug, error, info}; +use log::{debug, error, info, 
trace}; use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::{ - DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod, - Settings as MilliSettings, + IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, }; use meilisearch_types::milli::{self, Filter, BEU32}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; @@ -44,7 +43,7 @@ use uuid::Uuid; use crate::autobatcher::{self, BatchKind}; use crate::utils::{self, swap_index_uid_in_task}; -use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId}; +use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId}; /// Represents a combination of tasks that can all be processed at the same time. /// @@ -105,12 +104,6 @@ pub(crate) enum IndexOperation { operations: Vec, tasks: Vec, }, - DocumentDeletion { - index_uid: String, - // The vec associated with each document deletion tasks. - documents: Vec>, - tasks: Vec, - }, IndexDocumentDeletionByFilter { index_uid: String, task: Task, @@ -162,7 +155,6 @@ impl Batch { } Batch::IndexOperation { op, .. } => match op { IndexOperation::DocumentOperation { tasks, .. } - | IndexOperation::DocumentDeletion { tasks, .. } | IndexOperation::Settings { tasks, .. } | IndexOperation::DocumentClear { tasks, .. } => { tasks.iter().map(|task| task.uid).collect() @@ -227,7 +219,6 @@ impl IndexOperation { pub fn index_uid(&self) -> &str { match self { IndexOperation::DocumentOperation { index_uid, .. } - | IndexOperation::DocumentDeletion { index_uid, .. } | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. } | IndexOperation::DocumentClear { index_uid, .. } | IndexOperation::Settings { index_uid, .. 
} @@ -243,9 +234,6 @@ impl fmt::Display for IndexOperation { IndexOperation::DocumentOperation { .. } => { f.write_str("IndexOperation::DocumentOperation") } - IndexOperation::DocumentDeletion { .. } => { - f.write_str("IndexOperation::DocumentDeletion") - } IndexOperation::IndexDocumentDeletionByFilter { .. } => { f.write_str("IndexOperation::IndexDocumentDeletionByFilter") } @@ -348,18 +336,27 @@ impl IndexScheduler { BatchKind::DocumentDeletion { deletion_ids } => { let tasks = self.get_existing_tasks(rtxn, deletion_ids)?; - let mut documents = Vec::new(); + let mut operations = Vec::with_capacity(tasks.len()); + let mut documents_counts = Vec::with_capacity(tasks.len()); for task in &tasks { match task.kind { KindWithContent::DocumentDeletion { ref documents_ids, .. } => { - documents.push(documents_ids.clone()) + operations.push(DocumentOperation::Delete(documents_ids.clone())); + documents_counts.push(documents_ids.len() as u64); } _ => unreachable!(), } } Ok(Some(Batch::IndexOperation { - op: IndexOperation::DocumentDeletion { index_uid, documents, tasks }, + op: IndexOperation::DocumentOperation { + index_uid, + primary_key: None, + method: IndexDocumentsMethod::ReplaceDocuments, + documents_counts, + operations, + tasks, + }, must_create_index, })) } @@ -825,6 +822,10 @@ impl IndexScheduler { // 2. dump the tasks let mut dump_tasks = dump.create_tasks_queue()?; for ret in self.all_tasks.iter(&rtxn)? { + if self.must_stop_processing.get() { + return Err(Error::AbortedTask); + } + let (_, mut t) = ret?; let status = t.status; let content_file = t.content_uuid(); @@ -845,6 +846,9 @@ impl IndexScheduler { // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. 
if let Some(content_file) = content_file { + if self.must_stop_processing.get() { + return Err(Error::AbortedTask); + } if status == Status::Enqueued { let content_file = self.file_store.get_update(content_file)?; @@ -884,6 +888,9 @@ impl IndexScheduler { // 3.1. Dump the documents for ret in index.all_documents(&rtxn)? { + if self.must_stop_processing.get() { + return Err(Error::AbortedTask); + } let (_id, doc) = ret?; let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; index_dumper.push_document(&document)?; @@ -903,6 +910,9 @@ impl IndexScheduler { "[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]" )).unwrap(); + if self.must_stop_processing.get() { + return Err(Error::AbortedTask); + } let path = self.dumps_path.join(format!("{}.dump", dump_uid)); let file = File::create(path)?; dump.persist_to(BufWriter::new(file))?; @@ -1195,7 +1205,7 @@ impl IndexScheduler { index, indexer_config, config, - |indexing_step| debug!("update: {:?}", indexing_step), + |indexing_step| trace!("update: {:?}", indexing_step), || must_stop_processing.get(), )?; @@ -1242,7 +1252,8 @@ impl IndexScheduler { let (new_builder, user_result) = builder.remove_documents(document_ids)?; builder = new_builder; - + // Uses Invariant: remove documents actually always returns Ok for the inner result + let count = user_result.unwrap(); let provided_ids = if let Some(Details::DocumentDeletion { provided_ids, .. 
}) = task.details @@ -1253,23 +1264,11 @@ impl IndexScheduler { unreachable!(); }; - match user_result { - Ok(count) => { - task.status = Status::Succeeded; - task.details = Some(Details::DocumentDeletion { - provided_ids, - deleted_documents: Some(count), - }); - } - Err(e) => { - task.status = Status::Failed; - task.details = Some(Details::DocumentDeletion { - provided_ids, - deleted_documents: Some(0), - }); - task.error = Some(milli::Error::from(e).into()); - } - } + task.status = Status::Succeeded; + task.details = Some(Details::DocumentDeletion { + provided_ids, + deleted_documents: Some(count), + }); } } } @@ -1284,31 +1283,13 @@ impl IndexScheduler { milli::update::Settings::new(index_wtxn, index, indexer_config); builder.reset_primary_key(); builder.execute( - |indexing_step| debug!("update: {:?}", indexing_step), + |indexing_step| trace!("update: {:?}", indexing_step), || must_stop_processing.clone().get(), )?; } Ok(tasks) } - IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => { - let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?; - documents.iter().flatten().for_each(|id| { - builder.delete_external_id(id); - }); - - let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?; - - for (task, documents) in tasks.iter_mut().zip(documents) { - task.status = Status::Succeeded; - task.details = Some(Details::DocumentDeletion { - provided_ids: documents.len(), - deleted_documents: Some(deleted_documents.min(documents.len() as u64)), - }); - } - - Ok(tasks) - } IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => { let filter = if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. 
} = @@ -1318,7 +1299,13 @@ impl IndexScheduler { } else { unreachable!() }; - let deleted_documents = delete_document_by_filter(index_wtxn, filter, index); + let deleted_documents = delete_document_by_filter( + index_wtxn, + filter, + self.index_mapper.indexer_config(), + self.must_stop_processing.clone(), + index, + ); let original_filter = if let Some(Details::DocumentDeletionByFilter { original_filter, deleted_documents: _, @@ -1552,6 +1539,8 @@ impl IndexScheduler { fn delete_document_by_filter<'a>( wtxn: &mut RwTxn<'a, '_>, filter: &serde_json::Value, + indexer_config: &IndexerConfig, + must_stop_processing: MustStopProcessing, index: &'a Index, ) -> Result { let filter = Filter::from_json(filter)?; @@ -1562,9 +1551,26 @@ fn delete_document_by_filter<'a>( } e => e.into(), })?; - let mut delete_operation = DeleteDocuments::new(wtxn, index)?; - delete_operation.delete_documents(&candidates); - delete_operation.execute().map(|result| result.deleted_documents)? + + let config = IndexDocumentsConfig { + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }; + + let mut builder = milli::update::IndexDocuments::new( + wtxn, + index, + indexer_config, + config, + |indexing_step| debug!("update: {:?}", indexing_step), + || must_stop_processing.get(), + )?; + + let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?; + builder = new_builder; + + let _ = builder.execute()?; + count } else { 0 }) diff --git a/index-scheduler/src/error.rs b/index-scheduler/src/error.rs index ddc6960f7..bbe526460 100644 --- a/index-scheduler/src/error.rs +++ b/index-scheduler/src/error.rs @@ -108,6 +108,8 @@ pub enum Error { TaskDeletionWithEmptyQuery, #[error("Query parameters to filter the tasks to cancel are missing. 
Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")] TaskCancelationWithEmptyQuery, + #[error("Aborted task")] + AbortedTask, #[error(transparent)] Dump(#[from] dump::Error), @@ -175,6 +177,7 @@ impl Error { | Error::TaskNotFound(_) | Error::TaskDeletionWithEmptyQuery | Error::TaskCancelationWithEmptyQuery + | Error::AbortedTask | Error::Dump(_) | Error::Heed(_) | Error::Milli(_) @@ -236,6 +239,9 @@ impl ErrorCode for Error { Error::TaskDatabaseUpdate(_) => Code::Internal, Error::CreateBatch(_) => Code::Internal, + // This one should never be seen by the end user + Error::AbortedTask => Code::Internal, + #[cfg(test)] Error::PlannedFailure => Code::Internal, } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 95902aa15..896c06c99 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1183,7 +1183,8 @@ impl IndexScheduler { // If we have an abortion error we must stop the tick here and re-schedule tasks. 
Err(Error::Milli(milli::Error::InternalError( milli::InternalError::AbortedIndexation, - ))) => { + ))) + | Err(Error::AbortedTask) => { #[cfg(test)] self.breakpoint(Breakpoint::AbortedIndexation); wtxn.abort().map_err(Error::HeedTransaction)?; @@ -4339,4 +4340,26 @@ mod tests { } "###); } + + #[test] + fn cancel_processing_dump() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None }; + let dump_cancellation = KindWithContent::TaskCancelation { + query: "cancel dump".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }; + let _ = index_scheduler.register(dump_creation).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register"); + handle.advance_till([Start, BatchCreated, InsideProcessBatch]); + + let _ = index_scheduler.register(dump_cancellation).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered"); + + snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); + } } diff --git a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap new file mode 100644 index 000000000..ce0343975 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap @@ -0,0 +1,35 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }} +---------------------------------------------------------------------- +### Status: +enqueued [0,] 
+---------------------------------------------------------------------- +### Kind: +"dumpCreation" [0,] +---------------------------------------------------------------------- +### Index Tasks: +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Started At: +---------------------------------------------------------------------- +### Finished At: +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap new file mode 100644 index 000000000..f3d7b363f --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap @@ -0,0 +1,45 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: canceled, canceled_by: 1, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }} +1 {uid: 1, status: succeeded, details: { matched_tasks: 1, canceled_tasks: Some(0), original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [1,] +canceled [0,] +---------------------------------------------------------------------- +### Kind: +"taskCancelation" [1,] +"dumpCreation" [0,] 
+---------------------------------------------------------------------- +### Index Tasks: +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: +1 [0,] + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap new file mode 100644 index 000000000..72ae58e00 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap @@ -0,0 +1,38 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[0,] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }} +1 {uid: 1, status: enqueued, details: { matched_tasks: 1, canceled_tasks: None, original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }} +---------------------------------------------------------------------- +### Status: +enqueued [0,1,] +---------------------------------------------------------------------- +### Kind: +"taskCancelation" [1,] +"dumpCreation" [0,] +---------------------------------------------------------------------- +### Index Tasks: 
+---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +---------------------------------------------------------------------- +### Finished At: +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 4b6711601..afe9c5189 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -324,7 +324,6 @@ impl ErrorCode for milli::Error { UserError::SerdeJson(_) | UserError::InvalidLmdbOpenOptions | UserError::DocumentLimitReached - | UserError::AccessingSoftDeletedDocument { .. } | UserError::UnknownInternalDocumentId { .. 
} => Code::Internal, UserError::InvalidStoreFile => Code::InvalidStoreFile, UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice, diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 603d8ff86..16c08c6c2 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -362,7 +362,7 @@ fn import_dump( update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }, - |indexing_step| log::debug!("update: {:?}", indexing_step), + |indexing_step| log::trace!("update: {:?}", indexing_step), || false, )?; diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 2afc1b5fb..b6950ae6e 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -612,8 +612,8 @@ fn retrieve_document>( let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); let internal_id = index - .external_documents_ids(&txn)? - .get(doc_id.as_bytes()) + .external_documents_ids() + .get(&txn, doc_id)? 
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; let document = index diff --git a/meilisearch/tests/documents/delete_documents.rs b/meilisearch/tests/documents/delete_documents.rs index b3f04aea0..5a15e95ff 100644 --- a/meilisearch/tests/documents/delete_documents.rs +++ b/meilisearch/tests/documents/delete_documents.rs @@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() { "canceledBy": null, "details": { "providedIds": 0, - "deletedDocuments": 4, + "deletedDocuments": 2, "originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]" }, "error": null, diff --git a/milli/Cargo.toml b/milli/Cargo.toml index cf5fe9726..1d8517e73 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -26,8 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" -grenad = { version = "0.4.4", default-features = false, features = [ - "tempfile", +grenad = { version = "0.4.5", default-features = false, features = [ + "rayon", "tempfile" ] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [ "lmdb", "read-txn-no-tls" @@ -79,6 +79,7 @@ big_s = "1.0.2" insta = "1.29.0" maplit = "1.0.2" md5 = "0.7.0" +meili-snap = { path = "../meili-snap" } rand = { version = "0.8.5", features = ["small_rng"] } [features] diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 7c037b3bf..a874ac17e 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,5 +1,6 @@ mod builder; mod enriched; +mod primary_key; mod reader; mod serde_impl; @@ -11,6 +12,7 @@ use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; use obkv::KvReader; +pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY}; pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, 
DocumentsBatchReader}; use serde::{Deserialize, Serialize}; @@ -87,6 +89,12 @@ impl DocumentsBatchIndex { } } +impl FieldIdMapper for DocumentsBatchIndex { + fn id(&self, name: &str) -> Option { + self.id(name) + } +} + #[derive(Debug, thiserror::Error)] pub enum Error { #[error("Error parsing number {value:?} at line {line}: {error}")] diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs new file mode 100644 index 000000000..16a95c21f --- /dev/null +++ b/milli/src/documents/primary_key.rs @@ -0,0 +1,172 @@ +use std::iter; +use std::result::Result as StdResult; + +use serde_json::Value; + +use crate::{FieldId, InternalError, Object, Result, UserError}; + +/// The symbol used to define levels in a nested primary key. +const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; + +/// The default primary that is used when not specified. +pub const DEFAULT_PRIMARY_KEY: &str = "id"; + +/// Trait for objects that can map the name of a field to its [`FieldId`]. +pub trait FieldIdMapper { + /// Attempts to map the passed name to its [`FieldId`]. + /// + /// `None` if the field with this name was not found. + fn id(&self, name: &str) -> Option; +} + +/// A type that represent the type of primary key that has been set +/// for this index, a classic flat one or a nested one. +#[derive(Debug, Clone, Copy)] +pub enum PrimaryKey<'a> { + Flat { name: &'a str, field_id: FieldId }, + Nested { name: &'a str }, +} + +pub enum DocumentIdExtractionError { + InvalidDocumentId(UserError), + MissingDocumentId, + TooManyDocumentIds(usize), +} + +impl<'a> PrimaryKey<'a> { + pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option { + Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) { + Self::Nested { name: path } + } else { + let field_id = fields.id(path)?; + Self::Flat { name: path, field_id } + }) + } + + pub fn name(&self) -> &str { + match self { + PrimaryKey::Flat { name, .. 
} => name, + PrimaryKey::Nested { name } => name, + } + } + + pub fn document_id( + &self, + document: &obkv::KvReader, + fields: &impl FieldIdMapper, + ) -> Result> { + match self { + PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) { + Some(document_id_bytes) => { + let document_id = serde_json::from_slice(document_id_bytes) + .map_err(InternalError::SerdeJson)?; + match validate_document_id_value(document_id)? { + Ok(document_id) => Ok(Ok(document_id)), + Err(user_error) => { + Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error))) + } + } + } + None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), + }, + nested @ PrimaryKey::Nested { .. } => { + let mut matching_documents_ids = Vec::new(); + for (first_level_name, right) in nested.possible_level_names() { + if let Some(field_id) = fields.id(first_level_name) { + if let Some(value_bytes) = document.get(field_id) { + let object = serde_json::from_slice(value_bytes) + .map_err(InternalError::SerdeJson)?; + fetch_matching_values(object, right, &mut matching_documents_ids); + + if matching_documents_ids.len() >= 2 { + return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds( + matching_documents_ids.len(), + ))); + } + } + } + } + + match matching_documents_ids.pop() { + Some(document_id) => match validate_document_id_value(document_id)? { + Ok(document_id) => Ok(Ok(document_id)), + Err(user_error) => { + Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error))) + } + }, + None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), + } + } + } + } + + /// Returns an `Iterator` that gives all the possible fields names the primary key + /// can have depending of the first level name and depth of the objects. 
+ pub fn possible_level_names(&self) -> impl Iterator + '_ { + let name = self.name(); + name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) + .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) + .chain(iter::once((name, ""))) + } +} + +fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { + match value { + Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), + otherwise => output.push(otherwise), + } +} + +fn fetch_matching_values_in_object( + object: Object, + selector: &str, + base_key: &str, + output: &mut Vec, +) { + for (key, value) in object { + let base_key = if base_key.is_empty() { + key.to_string() + } else { + format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) + }; + + if starts_with(selector, &base_key) { + match value { + Value::Object(object) => { + fetch_matching_values_in_object(object, selector, &base_key, output) + } + value => output.push(value), + } + } + } +} + +fn starts_with(selector: &str, key: &str) -> bool { + selector.strip_prefix(key).map_or(false, |tail| { + tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) + }) +} + +// FIXME: move to a DocumentId struct + +fn validate_document_id(document_id: &str) -> Option<&str> { + if !document_id.is_empty() + && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) + { + Some(document_id) + } else { + None + } +} + +pub fn validate_document_id_value(document_id: Value) -> Result> { + match document_id { + Value::String(string) => match validate_document_id(&string) { + Some(s) if s.len() == string.len() => Ok(Ok(string)), + Some(s) => Ok(Ok(s.to_string())), + None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), + }, + Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), + content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), + } +} diff --git a/milli/src/error.rs 
b/milli/src/error.rs index e9e1fddd3..b249f2977 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry { #[derive(Error, Debug)] pub enum UserError { - #[error("A soft deleted internal document id have been used: `{document_id}`.")] - AccessingSoftDeletedDocument { document_id: DocumentId }, #[error("A document cannot contain more than 65,535 fields.")] AttributeLimitReached, #[error(transparent)] diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 36b147336..ec419446c 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -1,159 +1,75 @@ -use std::borrow::Cow; use std::collections::HashMap; -use std::convert::TryInto; -use std::{fmt, str}; -use fst::map::IndexedValue; -use fst::{IntoStreamer, Streamer}; -use roaring::RoaringBitmap; +use heed::types::{OwnedType, Str}; +use heed::{Database, RoIter, RoTxn, RwTxn}; -const DELETED_ID: u64 = u64::MAX; +use crate::{DocumentId, BEU32}; -pub struct ExternalDocumentsIds<'a> { - pub(crate) hard: fst::Map>, - pub(crate) soft: fst::Map>, - soft_deleted_docids: RoaringBitmap, +pub enum DocumentOperationKind { + Create, + Delete, } -impl<'a> ExternalDocumentsIds<'a> { - pub fn new( - hard: fst::Map>, - soft: fst::Map>, - soft_deleted_docids: RoaringBitmap, - ) -> ExternalDocumentsIds<'a> { - ExternalDocumentsIds { hard, soft, soft_deleted_docids } - } +pub struct DocumentOperation { + pub external_id: String, + pub internal_id: DocumentId, + pub kind: DocumentOperationKind, +} - pub fn into_static(self) -> ExternalDocumentsIds<'static> { - ExternalDocumentsIds { - hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), - soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), - soft_deleted_docids: self.soft_deleted_docids, - } +pub struct ExternalDocumentsIds(Database>); + +impl ExternalDocumentsIds { + pub fn new(db: Database>) -> ExternalDocumentsIds { + 
ExternalDocumentsIds(db) } /// Returns `true` if hard and soft external documents lists are empty. - pub fn is_empty(&self) -> bool { - self.hard.is_empty() && self.soft.is_empty() + pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result { + self.0.is_empty(rtxn).map_err(Into::into) } - pub fn get>(&self, external_id: A) -> Option { - let external_id = external_id.as_ref(); - match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { - Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => { - Some(id.try_into().unwrap()) - } - _otherwise => None, - } - } - - /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they - /// don't contain any soft deleted document id. - pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> { - let mut new_hard_builder = fst::MapBuilder::memory(); - - let union_op = self.hard.op().add(&self.soft).r#union(); - let mut iter = union_op.into_stream(); - while let Some((external_id, docids)) = iter.next() { - // prefer selecting the ids from soft, always - let id = indexed_last_value(docids).unwrap(); - if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) { - new_hard_builder.insert(external_id, id)?; - } - } - drop(iter); - - // Delete soft map completely - self.soft = fst::Map::default().map_data(Cow::Owned)?; - // We save the new map as the new hard map. - self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; - - Ok(()) - } - - pub fn insert_ids>(&mut self, other: &fst::Map) -> fst::Result<()> { - let union_op = self.soft.op().add(other).r#union(); - - let mut new_soft_builder = fst::MapBuilder::memory(); - let mut iter = union_op.into_stream(); - while let Some((external_id, marked_docids)) = iter.next() { - let id = indexed_last_value(marked_docids).unwrap(); - new_soft_builder.insert(external_id, id)?; - } - - drop(iter); - - // We save the new map as the new soft map. 
- self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?; - self.merge_soft_into_hard() + pub fn get>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result> { + Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get())) } /// An helper function to debug this type, returns an `HashMap` of both, /// soft and hard fst maps, combined. - pub fn to_hash_map(&self) -> HashMap { - let mut map = HashMap::new(); - - let union_op = self.hard.op().add(&self.soft).r#union(); - let mut iter = union_op.into_stream(); - while let Some((external_id, marked_docids)) = iter.next() { - let id = indexed_last_value(marked_docids).unwrap(); - if id != DELETED_ID { - let external_id = str::from_utf8(external_id).unwrap(); - map.insert(external_id.to_owned(), id.try_into().unwrap()); - } + pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result> { + let mut map = HashMap::default(); + for result in self.0.iter(rtxn)? { + let (external, internal) = result?; + map.insert(external.to_owned(), internal.get()); } - - map + Ok(map) } - /// Return an fst of the combined hard and soft deleted ID. - pub fn to_fst<'b>(&'b self) -> fst::Result>>> { - if self.soft.is_empty() { - return Ok(Cow::Borrowed(&self.hard)); - } - let union_op = self.hard.op().add(&self.soft).r#union(); - - let mut iter = union_op.into_stream(); - let mut new_hard_builder = fst::MapBuilder::memory(); - while let Some((external_id, marked_docids)) = iter.next() { - let value = indexed_last_value(marked_docids).unwrap(); - if value != DELETED_ID { - new_hard_builder.insert(external_id, value)?; + /// Applies the list of operations passed as argument, modifying the current external to internal id mapping. + /// + /// If the list contains multiple operations on the same external id, then the result is unspecified. 
+ /// + /// # Panics + /// + /// - If attempting to delete a document that doesn't exist + /// - If attempting to create a document that already exists + pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec) -> heed::Result<()> { + for DocumentOperation { external_id, internal_id, kind } in operations { + match kind { + DocumentOperationKind::Create => { + self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?; + } + DocumentOperationKind::Delete => { + if !self.0.delete(wtxn, &external_id)? { + panic!("Attempting to delete a non-existing document") + } + } } } - drop(iter); - - Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?)) - } - - fn merge_soft_into_hard(&mut self) -> fst::Result<()> { - if self.soft.len() >= self.hard.len() / 2 { - self.hard = self.to_fst()?.into_owned(); - self.soft = fst::Map::default().map_data(Cow::Owned)?; - } - Ok(()) } -} -impl fmt::Debug for ExternalDocumentsIds<'_> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish() + /// Returns an iterator over all the external ids. + pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { + self.0.iter(rtxn) } } - -impl Default for ExternalDocumentsIds<'static> { - fn default() -> Self { - ExternalDocumentsIds { - hard: fst::Map::default().map_data(Cow::Owned).unwrap(), - soft: fst::Map::default().map_data(Cow::Owned).unwrap(), - soft_deleted_docids: RoaringBitmap::new(), - } - } -} - -/// Returns the value of the `IndexedValue` with the highest _index_. 
-fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option { - indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value) -} diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 810ff755b..9c1c87f82 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -81,6 +81,12 @@ impl Default for FieldsIdsMap { } } +impl crate::documents::FieldIdMapper for FieldsIdsMap { + fn id(&self, name: &str) -> Option { + self.id(name) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index bf76287d8..f635e55af 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use roaring::RoaringBitmap; use crate::heed_codec::BytesDecodeOwned; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; /// This is the limit where using a byteorder became less size efficient /// than using a direct roaring encoding, it is also the point where we are able @@ -60,12 +61,16 @@ impl CboRoaringBitmapCodec { /// if the merged values length is under the threshold, values are directly /// serialized in the buffer else a RoaringBitmap is created from the /// values and is serialized in the buffer. 
- pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { + pub fn merge_into(slices: I, buffer: &mut Vec) -> io::Result<()> + where + I: IntoIterator, + A: AsRef<[u8]>, + { let mut roaring = RoaringBitmap::new(); let mut vec = Vec::new(); for bytes in slices { - if bytes.len() <= THRESHOLD * size_of::() { + if bytes.as_ref().len() <= THRESHOLD * size_of::() { let mut reader = bytes.as_ref(); while let Ok(integer) = reader.read_u32::() { vec.push(integer); @@ -85,7 +90,7 @@ impl CboRoaringBitmapCodec { } } else { // We can unwrap safely because the vector is sorted upper. - let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap(); + let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap(); roaring.serialize_into(buffer)?; } } else { @@ -95,6 +100,33 @@ impl CboRoaringBitmapCodec { Ok(()) } + + /// Merges a DelAdd delta into a CboRoaringBitmap. + pub fn merge_deladd_into<'a>( + deladd: KvReaderDelAdd<'_>, + previous: &[u8], + buffer: &'a mut Vec, + ) -> io::Result> { + // Deserialize the bitmap that is already there + let mut previous = Self::deserialize_from(previous)?; + + // Remove integers we no more want in the previous bitmap + if let Some(value) = deladd.get(DelAdd::Deletion) { + previous -= Self::deserialize_from(value)?; + } + + // Insert the new integers we want in the previous bitmap + if let Some(value) = deladd.get(DelAdd::Addition) { + previous |= Self::deserialize_from(value)?; + } + + if previous.is_empty() { + return Ok(None); + } + + Self::serialize_into(&previous, buffer); + Ok(Some(&buffer[..])) + } } impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { diff --git a/milli/src/index.rs b/milli/src/index.rs index d563f852b..f8cceb0ef 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fs::File; -use std::mem::size_of; use std::path::Path; use charabia::{Language, Script}; @@ -13,8 +12,8 @@ use 
rstar::RTree; use time::OffsetDateTime; use crate::distance::NDotProductPoint; +use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; -use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, @@ -42,7 +41,6 @@ pub mod main_key { pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; - pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids"; pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; @@ -54,17 +52,13 @@ pub mod main_key { /// It is concatenated with a big-endian encoded number (non-human readable). /// e.g. vector-hnsw0x0032. pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw"; - pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; - pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; - pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const STOP_WORDS_KEY: &str = "stop-words"; pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens"; pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens"; pub const DICTIONARY_KEY: &str = "dictionary"; - pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; pub const SYNONYMS_KEY: &str = "synonyms"; pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms"; pub const WORDS_FST_KEY: &str = "words-fst"; @@ -87,10 +81,9 @@ pub mod db_name { pub const 
EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids"; + pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; - pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; - pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids"; pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; @@ -118,24 +111,23 @@ pub struct Index { /// Contains many different types (e.g. the fields ids map). pub(crate) main: PolyDatabase, + /// Maps the external documents ids with the internal document id. + pub external_documents_ids: Database>, + /// A word and all the documents ids containing the word. - pub word_docids: Database, + pub word_docids: Database, /// A word and all the documents ids containing the word, from attributes for which typos are not allowed. - pub exact_word_docids: Database, + pub exact_word_docids: Database, /// A prefix of word and all the documents ids containing this prefix. - pub word_prefix_docids: Database, + pub word_prefix_docids: Database, /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. - pub exact_word_prefix_docids: Database, + pub exact_word_prefix_docids: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, - /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. 
- pub word_prefix_pair_proximity_docids: Database, - /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears. - pub prefix_word_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. pub word_position_docids: Database, @@ -189,13 +181,15 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(25); + options.max_dbs(24); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; let mut wtxn = env.write_txn()?; let main = env.create_poly_database(&mut wtxn, Some(MAIN))?; let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?; + let external_documents_ids = + env.create_database(&mut wtxn, Some(EXTERNAL_DOCUMENTS_IDS))?; let exact_word_docids = env.create_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?; let exact_word_prefix_docids = @@ -204,10 +198,6 @@ impl Index { env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let script_language_docids = env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?; - let word_prefix_pair_proximity_docids = - env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; - let prefix_word_pair_proximity_docids = - env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?; let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?; let field_id_word_count_docids = @@ -241,14 +231,13 @@ impl Index { Ok(Index { env, main, + external_documents_ids, word_docids, exact_word_docids, word_prefix_docids, exact_word_prefix_docids, word_pair_proximity_docids, script_language_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, word_position_docids, word_fid_docids, word_prefix_position_docids, @@ -372,29 +361,6 @@ impl Index { 
Ok(count.unwrap_or_default()) } - /* deleted documents ids */ - - /// Writes the soft deleted documents ids. - pub(crate) fn put_soft_deleted_documents_ids( - &self, - wtxn: &mut RwTxn, - docids: &RoaringBitmap, - ) -> heed::Result<()> { - self.main.put::<_, Str, RoaringBitmapCodec>( - wtxn, - main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY, - docids, - ) - } - - /// Returns the soft deleted documents ids. - pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self - .main - .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)? - .unwrap_or_default()) - } - /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. @@ -415,45 +381,10 @@ impl Index { /* external documents ids */ - /// Writes the external documents ids and internal ids (i.e. `u32`). - pub(crate) fn put_external_documents_ids( - &self, - wtxn: &mut RwTxn, - external_documents_ids: &ExternalDocumentsIds<'_>, - ) -> heed::Result<()> { - let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids; - let hard = hard.as_fst().as_bytes(); - let soft = soft.as_fst().as_bytes(); - self.main.put::<_, Str, ByteSlice>( - wtxn, - main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, - hard, - )?; - self.main.put::<_, Str, ByteSlice>( - wtxn, - main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, - soft, - )?; - Ok(()) - } - /// Returns the external documents ids map which associate the external ids /// with the internal ids (i.e. `u32`). 
- pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result> { - let hard = - self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; - let soft = - self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; - let hard = match hard { - Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, - None => fst::Map::default().map_data(Cow::Owned)?, - }; - let soft = match soft { - Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?, - None => fst::Map::default().map_data(Cow::Owned)?, - }; - let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?; - Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids)) + pub fn external_documents_ids(&self) -> ExternalDocumentsIds { + ExternalDocumentsIds::new(self.external_documents_ids) } /* fields ids map */ @@ -926,44 +857,6 @@ impl Index { /* faceted documents ids */ - /// Writes the documents ids that are faceted under this field id for the given facet type. - pub fn put_faceted_documents_ids( - &self, - wtxn: &mut RwTxn, - field_id: FieldId, - facet_type: FacetType, - docids: &RoaringBitmap, - ) -> heed::Result<()> { - let key = match facet_type { - FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, - FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, - }; - let mut buffer = vec![0u8; key.len() + size_of::()]; - buffer[..key.len()].copy_from_slice(key.as_bytes()); - buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); - self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) - } - - /// Retrieve all the documents ids that are faceted under this field id for the given facet type. 
- pub fn faceted_documents_ids( - &self, - rtxn: &RoTxn, - field_id: FieldId, - facet_type: FacetType, - ) -> heed::Result { - let key = match facet_type { - FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, - FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, - }; - let mut buffer = vec![0u8; key.len() + size_of::()]; - buffer[..key.len()].copy_from_slice(key.as_bytes()); - buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); - match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { - Some(docids) => Ok(docids), - None => Ok(RoaringBitmap::new()), - } - } - /// Retrieve all the documents which contain this field id set as null pub fn null_faceted_documents_ids( &self, @@ -1246,12 +1139,7 @@ impl Index { rtxn: &'t RoTxn, ids: impl IntoIterator + 'a, ) -> Result)>> + 'a> { - let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; - Ok(ids.into_iter().map(move |id| { - if soft_deleted_documents.contains(id) { - return Err(UserError::AccessingSoftDeletedDocument { document_id: id })?; - } let kv = self .documents .get(rtxn, &BEU32::new(id))? @@ -1277,6 +1165,36 @@ impl Index { self.iter_documents(rtxn, self.documents_ids(rtxn)?) } + pub fn external_id_of<'a, 't: 'a>( + &'a self, + rtxn: &'t RoTxn, + ids: impl IntoIterator + 'a, + ) -> Result> + 'a> { + let fields = self.fields_ids_map(rtxn)?; + + // uses precondition "never called on an empty index" + let primary_key = self.primary_key(rtxn)?.ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + })?; + let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| { + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldName { + field_name: primary_key.to_owned(), + process: "external_id_of", + }) + })?; + Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> { + let (_docid, obkv) = entry?; + match primary_key.document_id(&obkv, &fields)? 
{ + Ok(document_id) => Ok(document_id), + Err(_) => Err(InternalError::DocumentsError( + crate::documents::Error::InvalidDocumentFormat, + ) + .into()), + } + })) + } + pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { FacetDistribution::new(rtxn, self) } @@ -1477,14 +1395,10 @@ impl Index { rtxn: &RoTxn, key: &(Script, Language), ) -> heed::Result> { - let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; - let doc_ids = self.script_language_docids.get(rtxn, key)?; - Ok(doc_ids.map(|ids| ids - soft_deleted_documents)) + self.script_language_docids.get(rtxn, key) } pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result>> { - let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; - let mut script_language: HashMap> = HashMap::new(); let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new(); let mut total = 0; @@ -1492,7 +1406,7 @@ impl Index { let ((script, language), docids) = sl?; // keep only Languages that contains at least 1 document. 
- let remaining_documents_count = (docids - &soft_deleted_documents).len(); + let remaining_documents_count = docids.len(); total += remaining_documents_count; if remaining_documents_count > 0 { script_language_doc_count.push((script, language, remaining_documents_count)); @@ -1528,8 +1442,7 @@ pub(crate) mod tests { use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig, - IndexDocumentsMethod, IndexerConfig, Settings, + self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult}; @@ -1619,16 +1532,36 @@ pub(crate) mod tests { Ok(()) } - pub fn delete_document(&self, external_document_id: &str) { + pub fn delete_documents_using_wtxn<'t>( + &'t self, + wtxn: &mut RwTxn<'t, '_>, + external_document_ids: Vec, + ) { + let builder = IndexDocuments::new( + wtxn, + self, + &self.indexer_config, + self.index_documents_config.clone(), + |_| (), + || false, + ) + .unwrap(); + let (builder, user_error) = builder.remove_documents(external_document_ids).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + } + + pub fn delete_documents(&self, external_document_ids: Vec) { let mut wtxn = self.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, self).unwrap(); - delete.strategy(self.index_documents_config.deletion_strategy); + self.delete_documents_using_wtxn(&mut wtxn, external_document_ids); - delete.delete_external_id(external_document_id); - delete.execute().unwrap(); wtxn.commit().unwrap(); } + + pub fn delete_document(&self, external_document_id: &str) { + self.delete_documents(vec![external_document_id.to_string()]) + } } #[test] @@ -1942,9 +1875,7 @@ pub(crate) mod tests { use big_s::S; use maplit::hashset; - let mut index = TempIndex::new(); - 
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; - let index = index; + let index = TempIndex::new(); index .update_settings(|settings| { @@ -1963,14 +1894,12 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 0 0 1 1 2 2 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); db_snap!(index, facet_id_f64_docids, 1, @r###" 1 0 0 1 [0, ] 1 0 1 1 [1, ] @@ -1986,44 +1915,37 @@ pub(crate) mod tests { } index.add_documents(documents!(docs)).unwrap(); - db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 0 4 - 1 5 - 2 6 + docids: + 0 0 + 1 1 + 2 2 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[0, 1, 2, ]"); db_snap!(index, facet_id_f64_docids, 2, @r###" - 1 0 0 1 [0, ] - 1 0 1 1 [1, 4, ] - 1 0 2 1 [2, 5, ] - 1 0 3 1 [3, 6, ] + 1 0 1 1 [0, ] + 1 0 2 1 [1, ] + 1 0 3 1 [2, 3, ] "###); index .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }])) .unwrap(); - db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - 3 7 - hard: - 0 4 - 1 5 - 2 6 + docids: + 0 0 + 1 1 + 2 2 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[0, 1, 2, 3, ]"); db_snap!(index, facet_id_f64_docids, 3, @r###" - 1 0 0 1 [0, ] - 1 0 1 1 [1, 4, ] - 1 0 2 1 [2, 5, ] - 1 0 3 1 [3, 6, ] - 1 0 4 1 [7, ] + 1 0 1 1 [0, ] + 1 0 2 1 [1, ] + 1 0 3 1 [2, ] + 1 0 4 1 [3, ] "###); index @@ -2032,300 +1954,30 @@ pub(crate) mod tests { }) .unwrap(); - db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - hard: - 0 4 - 1 5 - 2 6 - 3 7 + docids: + 0 0 + 1 1 + 2 2 + 3 3 
"###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); db_snap!(index, facet_id_f64_docids, 3, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [7, ] + 0 0 0 1 [0, ] + 0 0 1 1 [1, ] + 0 0 2 1 [2, ] + 0 0 3 1 [3, ] + 1 0 1 1 [0, ] + 1 0 2 1 [1, ] + 1 0 3 1 [2, ] + 1 0 4 1 [3, ] "###); } - #[test] - fn replace_documents_in_batches_external_ids_and_soft_deletion_check() { - use big_s::S; - use maplit::hashset; - - let mut index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("doggo") }); - }) - .unwrap(); - - let add_documents = |index: &TempIndex, docs: Vec>| { - let mut wtxn = index.write_txn().unwrap(); - let mut builder = IndexDocuments::new( - &mut wtxn, - index, - &index.indexer_config, - index.index_documents_config.clone(), - |_| (), - || false, - ) - .unwrap(); - for docs in docs { - (builder, _) = builder.add_documents(documents!(docs)).unwrap(); - } - builder.execute().unwrap(); - wtxn.commit().unwrap(); - }; - // First Batch - { - let mut docs1 = vec![]; - for i in 0..4 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1]); - - db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 0 - 1 1 - 2 2 - 3 3 - "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [0, ] - 1 0 1 1 [1, ] - 1 0 2 1 [2, ] - 1 0 3 1 [3, ] - "###); - } - // Second Batch: replace the documents with soft-deletion - { - index.index_documents_config.deletion_strategy = - crate::update::DeletionStrategy::AlwaysSoft; - let mut docs1 = vec![]; - for i in 0..3 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i+1 } - )); - } - let mut docs2 = vec![]; - for i in 0..3 { - docs2.push(serde_json::json!( - { "id": i, 
"doggo": i } - )); - } - add_documents(&index, vec![docs1, docs2]); - - db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, ]"); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [0, 4, ] - 1 0 1 1 [1, 5, ] - 1 0 2 1 [2, 6, ] - 1 0 3 1 [3, ] - "###); - } - let rtxn = index.read_txn().unwrap(); - let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(3), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [4]).unwrap()[0]; - - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(0), - "doggo": Number(0), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [5]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(1), - "doggo": Number(1), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [6]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(2), - "doggo": Number(2), - } - "###); - drop(rtxn); - // Third Batch: replace the documents with soft-deletion again - { - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; - let mut docs1 = vec![]; - for i in 0..3 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i+1 } - )); - } - let mut docs2 = vec![]; - for i in 0..4 { - docs2.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1, docs2]); - - db_snap!(index, documents_ids, @"[3, 7, 8, 9, ]"); - 
db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 7 - 1 8 - 2 9 - 3 3 - "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, 4, 5, 6, ]"); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [0, 4, 7, ] - 1 0 1 1 [1, 5, 8, ] - 1 0 2 1 [2, 6, 9, ] - 1 0 3 1 [3, ] - "###); - } - let rtxn = index.read_txn().unwrap(); - let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(3), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [7]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(0), - "doggo": Number(0), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [8]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(1), - "doggo": Number(1), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [9]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(2), - "doggo": Number(2), - } - "###); - drop(rtxn); - - // Fourth Batch: replace the documents without soft-deletion - { - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; - let mut docs1 = vec![]; - for i in 0..3 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i+2 } - )); - } - let mut docs2 = vec![]; - for i in 0..1 { - docs2.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1, docs2]); - - db_snap!(index, documents_ids, @"[3, 10, 11, 12, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 10 - 1 11 - 2 12 - 3 3 - "###); - 
db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [10, ] - 1 0 3 1 [3, 11, ] - 1 0 4 1 [12, ] - "###); - - let rtxn = index.read_txn().unwrap(); - let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(3), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [10]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(0), - "doggo": Number(0), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [11]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(1), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [12]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(2), - "doggo": Number(4), - } - "###); - drop(rtxn); - } - } - #[test] fn bug_3021_first() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; index @@ -2343,23 +1995,18 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 34 1 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); index.delete_document("34"); db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 34 1 + docids: 38 0 "###); - db_snap!(index, 
soft_deleted_documents_ids, 2, @"[1, ]"); index .update_settings(|s| { @@ -2371,11 +2018,9 @@ pub(crate) mod tests { // do not contain any entry for previously soft-deleted document ids db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - hard: + docids: 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); // So that this document addition works correctly now. // It would be wrongly interpreted as a replacement before @@ -2383,24 +2028,19 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, 4, @r###" - soft: - hard: + docids: 34 1 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); // We do the test again, but deleting the document with id 0 instead of id 1 now index.delete_document("38"); db_snap!(index, documents_ids, @"[1, ]"); db_snap!(index, external_documents_ids, 5, @r###" - soft: - hard: + docids: 34 1 - 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 5, @"[0, ]"); index .update_settings(|s| { @@ -2410,11 +2050,9 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[1, ]"); db_snap!(index, external_documents_ids, 6, @r###" - soft: - hard: + docids: 34 1 "###); - db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); // And adding lots of documents afterwards instead of just one. // These extra subtests don't add much, but it's better than nothing. 
@@ -2422,8 +2060,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); db_snap!(index, external_documents_ids, 7, @r###" - soft: - hard: + docids: 34 1 38 0 39 2 @@ -2431,14 +2068,38 @@ pub(crate) mod tests { 41 3 42 5 "###); - db_snap!(index, soft_deleted_documents_ids, 7, @"[]"); + } + + #[test] + fn simple_delete() { + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + index + .add_documents(documents!([ + { "id": 30 }, + { "id": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + docids: + 30 0 + 34 1"###); + + index.delete_document("34"); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + docids: + 30 0 + "###); } #[test] fn bug_3021_second() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index @@ -2456,23 +2117,18 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 30 0 34 1 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); index.delete_document("34"); db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: + docids: 30 0 - 34 1 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); index .update_settings(|s| { @@ -2484,11 +2140,9 @@ pub(crate) mod tests { // do not contain any entry for previously soft-deleted document ids db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - hard: + docids: 30 0 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); // So that when we add a new document 
index.add_documents(documents!({ "primary_key": 35, "b": 2 })).unwrap(); @@ -2497,12 +2151,10 @@ pub(crate) mod tests { // The external documents ids don't have several external ids pointing to the same // internal document id db_snap!(index, external_documents_ids, 4, @r###" - soft: - hard: + docids: 30 0 35 1 "###); - db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); // And when we add 34 again, we don't replace document 35 index.add_documents(documents!({ "primary_key": 34, "a": 1 })).unwrap(); @@ -2510,13 +2162,11 @@ pub(crate) mod tests { // And document 35 still exists, is not deleted db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, 5, @r###" - soft: - hard: + docids: 30 0 34 2 35 1 "###); - db_snap!(index, soft_deleted_documents_ids, 5, @"[]"); let rtxn = index.read_txn().unwrap(); let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0]; @@ -2548,8 +2198,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); db_snap!(index, external_documents_ids, 6, @r###" - soft: - hard: + docids: 30 0 34 2 35 1 @@ -2557,14 +2206,12 @@ pub(crate) mod tests { 38 4 39 5 "###); - db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); } #[test] fn bug_3021_third() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index @@ -2583,38 +2230,29 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 3 0 4 1 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); index.delete_document("3"); db_snap!(index, documents_ids, @"[1, 2, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 3 0 + docids: 4 1 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]"); - - 
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap(); - db_snap!(index, documents_ids, @"[2, 3, ]"); + db_snap!(index, documents_ids, @"[1, 2, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 4 3 + docids: + 4 1 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); index .add_documents(documents!([ @@ -2622,15 +2260,13 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: + docids: 3 0 - 4 3 + 4 1 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); } #[test] @@ -2638,7 +2274,6 @@ pub(crate) mod tests { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index .update_settings(|settings| { @@ -2655,12 +2290,10 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: + docids: 11 0 4 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); index .add_documents(documents!([ @@ -2669,31 +2302,23 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: - 1 3 + docids: + 1 2 11 0 - 4 2 + 4 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[1, ]"); - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysHard); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_documents(Default::default()); - db_snap!(index, 
documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: - 1 3 + docids: + 1 2 11 0 - 4 2 + 4 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); index .add_documents(documents!([ @@ -2702,15 +2327,13 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, documents_ids, @"[0, 1, 4, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: - 1 4 + docids: + 1 2 11 0 4 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]"); let rtxn = index.read_txn().unwrap(); let search = Search::new(&rtxn, &index); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 892401c08..0197639e4 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec; /// The documents returned by the iterator are grouped by the facet values that /// determined their rank. 
For example, given the documents: /// -/// ```ignore +/// ```text /// 0: { "colour": ["blue", "green"] } /// 1: { "colour": ["blue", "red"] } /// 2: { "colour": ["orange", "red"] } @@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec; /// ``` /// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator /// over the following elements: -/// ```ignore +/// ```text /// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue" /// [3] // same for "green" /// [2] // same for "orange" diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index c4cdb37e6..dbd9538a5 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -223,12 +223,9 @@ impl<'a> Filter<'a> { impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time - let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; - // and finally we delete all the soft_deleted_documents, again, only once at the very end self.inner_evaluate(rtxn, index, &filterable_fields) - .map(|result| result - soft_deleted_documents) } fn evaluate_operator( diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index e0a2ba3cf..2c670658d 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -12,7 +12,7 @@ use super::Word; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; use crate::{ - CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, + CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, }; /// A cache storing pointers to values in the LMDB databases. 
@@ -25,7 +25,7 @@ pub struct DatabaseCache<'ctx> { pub word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>>, pub word_prefix_pair_proximity_docids: - FxHashMap<(u8, Interned, Interned), Option>>, + FxHashMap<(u8, Interned, Interned), Option>, pub prefix_word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>>, pub word_docids: FxHashMap, Option>>, @@ -168,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> { merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, word, self.word_interner.get(word).as_str(), @@ -182,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> { &mut self, word: Interned, ) -> Result> { - DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, word, self.word_interner.get(word).as_str(), @@ -230,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> { merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, prefix, self.word_interner.get(prefix).as_str(), @@ -244,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> { &mut self, prefix: Interned, ) -> Result> { - DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, prefix, self.word_interner.get(prefix).as_str(), @@ -297,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> { prefix2: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - (proximity, word1, prefix2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(prefix2).as_str(), - ), - &mut self.db_cache.word_prefix_pair_proximity_docids, - self.index.word_prefix_pair_proximity_docids.remap_data_type::(), - ) + let docids = match self + .db_cache + .word_prefix_pair_proximity_docids + 
.entry((proximity, word1, prefix2)) + { + Entry::Occupied(docids) => docids.get().clone(), + Entry::Vacant(entry) => { + // compute docids using prefix iter and store the result in the cache. + let key = U8StrStrCodec::bytes_encode(&( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(prefix2).as_str(), + )) + .unwrap() + .into_owned(); + let mut prefix_docids = RoaringBitmap::new(); + let remap_key_type = self + .index + .word_pair_proximity_docids + .remap_key_type::() + .prefix_iter(self.txn, &key)?; + for result in remap_key_type { + let (_, docids) = result?; + + prefix_docids |= docids; + } + entry.insert(Some(prefix_docids.clone())); + Some(prefix_docids) + } + }; + Ok(docids) } + pub fn get_db_prefix_word_pair_proximity_docids( &mut self, left_prefix: Interned, right: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - (proximity, left_prefix, right), - &( - proximity, - self.word_interner.get(left_prefix).as_str(), - self.word_interner.get(right).as_str(), - ), - &mut self.db_cache.prefix_word_pair_proximity_docids, - self.index.prefix_word_pair_proximity_docids.remap_data_type::(), - ) + // only accept exact matches on reverted positions + self.get_db_word_pair_proximity_docids(left_prefix, right, proximity) } pub fn get_db_word_fid_docids( diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index 4d340ae1c..2d181a537 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -371,7 +371,7 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best s"); let SearchResult { documents_ids, document_scores, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -379,13 +379,13 @@ fn test_proximity_prefix_db() { insta::assert_debug_snapshot!(texts, @r###" [ "\"this is the best summer meal\"", - "\"summer best\"", "\"this is the best meal of summer\"", - "\"summer x best\"", "\"this is the best meal I have ever had in such a beautiful summer day\"", "\"this is the best cooked meal of the summer\"", "\"this is the best meal of the summer\"", "\"summer x y best\"", + "\"summer x best\"", + "\"summer best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", ] "###); @@ -423,17 +423,17 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best win"); let SearchResult { documents_ids, document_scores, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ + "\"this is the best winter meal\"", + "\"this is the best meal of winter\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", "\"this is the best cooked meal of the winter\"", "\"this is the best meal of the winter\"", - "\"this is the best meal of winter\"", - "\"this is the best winter meal\"", "\"winter x y best\"", "\"winter x best\"", "\"winter best\"", @@ -471,20 +471,20 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best wi"); let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"this is the best winter meal\"", - "\"winter best\"", "\"this is the best meal of winter\"", - "\"winter x best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", "\"this is the best cooked meal of the winter\"", "\"this is the best meal of the winter\"", "\"winter x y best\"", + "\"winter x best\"", + "\"winter best\"", ] "###); } diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap index 8f3b964c1..efcfef7f1 100644 --- 
a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" }, ), ], - [ - Proximity( - Rank { - rank: 3, - max_rank: 4, - }, - ), - ], [ Proximity( Rank { @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 2, + rank: 1, + max_rank: 4, + }, + ), + ], + [ + Proximity( + Rank { + rank: 1, max_rank: 4, }, ), diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap index 1ee6bfc91..242bc3424 100644 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" }, ), ], - [ - Proximity( - Rank { - rank: 3, - max_rank: 4, - }, - ), - ], [ Proximity( Rank { @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 2, + rank: 1, + max_rank: 4, + }, + ), + ], + [ + Proximity( + Rank { + rank: 1, max_rank: 4, }, ), diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap index 5129f1b3b..efcfef7f1 100644 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap @@ -6,7 +6,7 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 1, + rank: 4, max_rank: 4, }, ), @@ 
-14,7 +14,7 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 1, + rank: 2, max_rank: 4, }, ), diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs index aa6aa971f..8fdf52d44 100644 --- a/milli/src/search/new/tests/sort.rs +++ b/milli/src/search/new/tests/sort.rs @@ -13,6 +13,7 @@ This module tests the `sort` ranking rule: use big_s::S; use maplit::hashset; +use meili_snap::insta; use crate::index::tests::TempIndex; use crate::search::new::tests::collect_field_values; diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 158f515b8..28c4cb45c 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -4,9 +4,8 @@ use std::path::Path; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; -use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index}; +use crate::{make_db_snap_from_iter, obkv_to_json, Index}; #[track_caller] pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) { @@ -98,7 +97,6 @@ Create a snapshot test of the given database. 
- `facet_id_string_docids` - `documents_ids` - `stop_words` - - `soft_deleted_documents_ids` - `field_distribution` - `fields_ids_map` - `geo_faceted_documents_ids` @@ -221,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String { &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) }) } -pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { - make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( - (proximity, word1, prefix), - b, - )| { - &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) - }) -} -pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String { - make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |( - (proximity, prefix, word2), - b, - )| { - &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b)) - }) -} pub fn snap_word_position_docids(index: &Index) -> String { make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) @@ -308,12 +290,6 @@ pub fn snap_stop_words(index: &Index) -> String { let snap = format!("{stop_words:?}"); snap } -pub fn snap_soft_deleted_documents_ids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); - - display_bitmap(&soft_deleted_documents_ids) -} pub fn snap_field_distributions(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let mut snap = String::new(); @@ -340,50 +316,21 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { } pub fn snap_external_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); - let ExternalDocumentsIds { soft, hard, .. 
} = index.external_documents_ids(&rtxn).unwrap(); + let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap(); + // ensure fixed order (not guaranteed by hashmap) + let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect(); + external_ids.sort_by(|(l, _), (r, _)| l.cmp(r)); let mut snap = String::new(); - writeln!(&mut snap, "soft:").unwrap(); - let stream_soft = soft.stream(); - let soft_external_ids = stream_soft.into_str_vec().unwrap(); - for (key, id) in soft_external_ids { - writeln!(&mut snap, "{key:<24} {id}").unwrap(); - } - writeln!(&mut snap, "hard:").unwrap(); - let stream_hard = hard.stream(); - let hard_external_ids = stream_hard.into_str_vec().unwrap(); - for (key, id) in hard_external_ids { + writeln!(&mut snap, "docids:").unwrap(); + for (key, id) in external_ids { writeln!(&mut snap, "{key:<24} {id}").unwrap(); } snap } -pub fn snap_number_faceted_documents_ids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let mut snap = String::new(); - for field_id in fields_ids_map.ids() { - let number_faceted_documents_ids = - index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap(); - writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) - .unwrap(); - } - snap -} -pub fn snap_string_faceted_documents_ids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let mut snap = String::new(); - for field_id in fields_ids_map.ids() { - let string_faceted_documents_ids = - index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap(); - writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) - .unwrap(); - } - snap -} pub fn snap_words_fst(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let words_fst = index.words_fst(&rtxn).unwrap(); @@ -516,9 +463,6 @@ 
macro_rules! full_snap_of_db { ($index:ident, stop_words) => {{ $crate::snapshot_tests::snap_stop_words(&$index) }}; - ($index:ident, soft_deleted_documents_ids) => {{ - $crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index) - }}; ($index:ident, field_distribution) => {{ $crate::snapshot_tests::snap_field_distributions(&$index) }}; @@ -531,12 +475,6 @@ macro_rules! full_snap_of_db { ($index:ident, external_documents_ids) => {{ $crate::snapshot_tests::snap_external_documents_ids(&$index) }}; - ($index:ident, number_faceted_documents_ids) => {{ - $crate::snapshot_tests::snap_number_faceted_documents_ids(&$index) - }}; - ($index:ident, string_faceted_documents_ids) => {{ - $crate::snapshot_tests::snap_string_faceted_documents_ids(&$index) - }}; ($index:ident, words_fst) => {{ $crate::snapshot_tests::snap_words_fst(&$index) }}; diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs index 784bee5a7..f460693ba 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_documents_ids.rs @@ -8,16 +8,11 @@ pub struct AvailableDocumentsIds { } impl AvailableDocumentsIds { - pub fn from_documents_ids( - docids: &RoaringBitmap, - soft_deleted_docids: &RoaringBitmap, - ) -> AvailableDocumentsIds { - let used_docids = docids | soft_deleted_docids; - - match used_docids.max() { + pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds { + match docids.max() { Some(last_id) => { let mut available = RoaringBitmap::from_iter(0..last_id); - available -= used_docids; + available -= docids; let iter = match last_id.checked_add(1) { Some(id) => id..=u32::max_value(), @@ -50,7 +45,7 @@ mod tests { #[test] fn empty() { let base = RoaringBitmap::new(); - let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); + let left = AvailableDocumentsIds::from_documents_ids(&base); let right = 0..=u32::max_value(); left.zip(right).take(500).for_each(|(l, r)| 
assert_eq!(l, r)); } @@ -63,28 +58,8 @@ mod tests { base.insert(100); base.insert(405); - let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); + let left = AvailableDocumentsIds::from_documents_ids(&base); let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } - - #[test] - fn soft_deleted() { - let mut base = RoaringBitmap::new(); - base.insert(0); - base.insert(10); - base.insert(100); - base.insert(405); - - let mut soft_deleted = RoaringBitmap::new(); - soft_deleted.insert(1); - soft_deleted.insert(11); - soft_deleted.insert(101); - soft_deleted.insert(406); - - let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted); - let right = - (0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n)); - left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); - } } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ab42fd854..afe0191b1 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,8 +1,7 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::facet::FacetType; -use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; +use crate::{FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -21,13 +20,12 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { let Index { env: _env, main: _main, + external_documents_ids, word_docids, exact_word_docids, word_prefix_docids, exact_word_prefix_docids, word_pair_proximity_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, word_position_docids, word_fid_docids, field_id_word_count_docids, @@ -51,43 +49,23 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We retrieve the number of documents ids that we are deleting. 
let number_of_documents = self.index.number_of_documents(self.wtxn)?; - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; - self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &empty_roaring)?; - self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; self.index.delete_vector_hnsw(self.wtxn)?; - // We clean all the faceted documents ids. - for field_id in faceted_fields { - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::Number, - &empty_roaring, - )?; - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::String, - &empty_roaring, - )?; - } - // Clear the other databases. 
+ external_documents_ids.clear(self.wtxn)?; word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; exact_word_prefix_docids.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; - word_prefix_pair_proximity_docids.clear(self.wtxn)?; - prefix_word_pair_proximity_docids.clear(self.wtxn)?; word_position_docids.clear(self.wtxn)?; word_fid_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; @@ -140,7 +118,7 @@ mod tests { assert!(index.words_fst(&rtxn).unwrap().is_empty()); assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); - assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); + assert!(index.external_documents_ids().is_empty(&rtxn).unwrap()); assert!(index.documents_ids(&rtxn).unwrap().is_empty()); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); assert!(index.geo_rtree(&rtxn).unwrap().is_none()); @@ -150,7 +128,6 @@ mod tests { assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); - assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs new file mode 100644 index 000000000..794beb5df --- /dev/null +++ b/milli/src/update/del_add.rs @@ -0,0 +1,125 @@ +use obkv::Key; + +pub type KvWriterDelAdd = obkv::KvWriter; +pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>; + +/// DelAdd defines the new value to add in the database and old value to delete from the database. +/// +/// Its used in an OBKV to be serialized in grenad files. 
+#[repr(u8)] +#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)] +pub enum DelAdd { + Deletion = 0, + Addition = 1, +} + +impl Key for DelAdd { + const BYTES_SIZE: usize = std::mem::size_of::(); + type BYTES = [u8; Self::BYTES_SIZE]; + + fn to_be_bytes(&self) -> Self::BYTES { + u8::to_be_bytes(*self as u8) + } + + fn from_be_bytes(array: Self::BYTES) -> Self { + match u8::from_be_bytes(array) { + 0 => Self::Deletion, + 1 => Self::Addition, + otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise), + } + } +} + +/// Creates a Kv> from Kv +/// +/// Deletion: put all the values under DelAdd::Deletion +/// Addition: put all the values under DelAdd::Addition, +/// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition, +pub fn into_del_add_obkv( + reader: obkv::KvReader, + operation: DelAddOperation, + buffer: &mut Vec, +) -> Result<(), std::io::Error> { + let mut writer = obkv::KvWriter::new(buffer); + let mut value_buffer = Vec::new(); + for (key, value) in reader.iter() { + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) { + value_writer.insert(DelAdd::Deletion, value)?; + } + if matches!(operation, DelAddOperation::Addition | DelAddOperation::DeletionAndAddition) { + value_writer.insert(DelAdd::Addition, value)?; + } + value_writer.finish()?; + writer.insert(key, &value_buffer)?; + } + + writer.finish() +} + +/// Enum controlling the side of the DelAdd obkv in which the provided value will be written. 
+#[derive(Debug, Clone, Copy)] +pub enum DelAddOperation { + Deletion, + Addition, + DeletionAndAddition, +} + +/// Creates a Kv> from two Kv +/// +/// putting each deletion obkv's keys under an DelAdd::Deletion +/// and putting each addition obkv's keys under an DelAdd::Addition +pub fn del_add_from_two_obkvs( + deletion: obkv::KvReader, + addition: obkv::KvReader, + buffer: &mut Vec, +) -> Result<(), std::io::Error> { + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + let mut writer = obkv::KvWriter::new(buffer); + let mut value_buffer = Vec::new(); + + for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) { + value_buffer.clear(); + match eob { + Left((k, v)) => { + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Deletion, v).unwrap(); + writer.insert(k, value_writer.into_inner()?).unwrap(); + } + Right((k, v)) => { + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Addition, v).unwrap(); + writer.insert(k, value_writer.into_inner()?).unwrap(); + } + Both((k, deletion), (_, addition)) => { + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Deletion, deletion).unwrap(); + value_writer.insert(DelAdd::Addition, addition).unwrap(); + writer.insert(k, value_writer.into_inner()?).unwrap(); + } + } + } + + writer.finish() +} + +pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool { + del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition) +} + +/// A function that extracts and returns the Add side of a DelAdd obkv. +/// This is useful when there are no previous value in the database and +/// therefore we don't need to do a diff with what's already there. +/// +/// If there is no Add side we currently write an empty buffer +/// which is a valid CboRoaringBitmap. 
+#[allow(clippy::ptr_arg)] // required to avoid signature mismatch +pub fn deladd_serialize_add_side<'a>( + obkv: &'a [u8], + _buffer: &mut Vec, +) -> crate::Result<&'a [u8]> { + Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) +} diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs deleted file mode 100644 index 164ad0c7e..000000000 --- a/milli/src/update/delete_documents.rs +++ /dev/null @@ -1,1255 +0,0 @@ -use std::collections::btree_map::Entry; -use std::collections::{BTreeSet, HashMap, HashSet}; - -use fst::IntoStreamer; -use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice}; -use heed::{BytesDecode, BytesEncode, Database, RwIter}; -use instant_distance::PointId; -use roaring::RoaringBitmap; -use serde::{Deserialize, Serialize}; -use time::OffsetDateTime; - -use super::facet::delete::FacetsDelete; -use super::ClearDocuments; -use crate::error::InternalError; -use crate::facet::FacetType; -use crate::heed_codec::facet::FieldDocIdFacetCodec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::index::Hnsw; -use crate::{ - ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32, -}; - -pub struct DeleteDocuments<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - external_documents_ids: ExternalDocumentsIds<'static>, - to_delete_docids: RoaringBitmap, - strategy: DeletionStrategy, -} - -/// Result of a [`DeleteDocuments`] operation. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DocumentDeletionResult { - pub deleted_documents: u64, - pub remaining_documents: u64, -} - -/// Strategy for deleting documents. -/// -/// - Soft-deleted documents are simply marked as deleted without being actually removed from DB. -/// - Hard-deleted documents are definitely suppressed from the DB. -/// -/// Soft-deleted documents trade disk space for runtime performance. 
-/// -/// Note that any of these variants can be used at any given moment for any indexation in a database. -/// For instance, you can use an [`AlwaysSoft`] followed by an [`AlwaysHard`] option without issue. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] -pub enum DeletionStrategy { - #[default] - /// Definitely suppress documents according to the number or size of soft-deleted documents - Dynamic, - /// Never definitely suppress documents - AlwaysSoft, - /// Always definitely suppress documents - AlwaysHard, -} - -impl std::fmt::Display for DeletionStrategy { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - DeletionStrategy::Dynamic => write!(f, "dynamic"), - DeletionStrategy::AlwaysSoft => write!(f, "always_soft"), - DeletionStrategy::AlwaysHard => write!(f, "always_hard"), - } - } -} - -/// Result of a [`DeleteDocuments`] operation, used for internal purposes. -/// -/// It is a superset of the [`DocumentDeletionResult`] structure, giving -/// additional information about the algorithm used to delete the documents. 
-#[derive(Debug)] -pub(crate) struct DetailedDocumentDeletionResult { - pub deleted_documents: u64, - pub remaining_documents: u64, -} - -impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> Result> { - let external_documents_ids = index.external_documents_ids(wtxn)?.into_static(); - - Ok(DeleteDocuments { - wtxn, - index, - external_documents_ids, - to_delete_docids: RoaringBitmap::new(), - strategy: Default::default(), - }) - } - - pub fn strategy(&mut self, strategy: DeletionStrategy) { - self.strategy = strategy; - } - - pub fn delete_document(&mut self, docid: u32) { - self.to_delete_docids.insert(docid); - } - - pub fn delete_documents(&mut self, docids: &RoaringBitmap) { - self.to_delete_docids |= docids; - } - - pub fn delete_external_id(&mut self, external_id: &str) -> Option { - let docid = self.external_documents_ids.get(external_id)?; - self.delete_document(docid); - Some(docid) - } - - pub fn execute(self) -> Result { - let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } = - self.execute_inner()?; - - Ok(DocumentDeletionResult { deleted_documents, remaining_documents }) - } - - pub(crate) fn execute_inner(mut self) -> Result { - puffin::profile_function!(); - - self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; - - // We retrieve the current documents ids that are in the database. - let mut documents_ids = self.index.documents_ids(self.wtxn)?; - let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?; - let current_documents_ids_len = documents_ids.len(); - - // We can and must stop removing documents in a database that is empty. 
- if documents_ids.is_empty() { - // but if there was still documents to delete we clear the database entirely - if !soft_deleted_docids.is_empty() { - ClearDocuments::new(self.wtxn, self.index).execute()?; - } - return Ok(DetailedDocumentDeletionResult { - deleted_documents: 0, - remaining_documents: 0, - }); - } - - // We remove the documents ids that we want to delete - // from the documents in the database and write them back. - documents_ids -= &self.to_delete_docids; - self.index.put_documents_ids(self.wtxn, &documents_ids)?; - - // We can execute a ClearDocuments operation when the number of documents - // to delete is exactly the number of documents in the database. - if current_documents_ids_len == self.to_delete_docids.len() { - let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?; - return Ok(DetailedDocumentDeletionResult { - deleted_documents: current_documents_ids_len, - remaining_documents, - }); - } - - let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let mut field_distribution = self.index.field_distribution(self.wtxn)?; - - // we update the field distribution - for docid in self.to_delete_docids.iter() { - let key = BEU32::new(docid); - let document = - self.index.documents.get(self.wtxn, &key)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: "documents", key: None }, - )?; - for (fid, _value) in document.iter() { - let field_name = - fields_ids_map.name(fid).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: fid, - process: "delete documents", - })?; - if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) - { - match entry.get().checked_sub(1) { - Some(0) | None => entry.remove(), - Some(count) => entry.insert(count), - }; - } - } - } - - self.index.put_field_distribution(self.wtxn, &field_distribution)?; - - soft_deleted_docids |= &self.to_delete_docids; - - // We always soft-delete the documents, even if they will be permanently - // deleted immediately after. 
- self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; - - // decide for a hard or soft deletion depending on the strategy - let soft_deletion = match self.strategy { - DeletionStrategy::Dynamic => { - // decide to keep the soft deleted in the DB for now if they meet 2 criteria: - // 1. There is less than a fixed rate of 50% of soft-deleted to actual documents, *and* - // 2. Soft-deleted occupy an average of less than a fixed size on disk - - let size_used = self.index.used_size()?; - let nb_documents = self.index.number_of_documents(self.wtxn)?; - let nb_soft_deleted = soft_deleted_docids.len(); - - (nb_soft_deleted < nb_documents) && { - const SOFT_DELETED_SIZE_BYTE_THRESHOLD: u64 = 1_073_741_824; // 1GiB - - // nb_documents + nb_soft_deleted !=0 because if nb_documents is 0 we short-circuit earlier, and then we moved the documents to delete - // from the documents_docids to the soft_deleted_docids. - let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); - let estimated_size_used_by_soft_deleted = - estimated_document_size * nb_soft_deleted; - estimated_size_used_by_soft_deleted < SOFT_DELETED_SIZE_BYTE_THRESHOLD - } - } - DeletionStrategy::AlwaysSoft => true, - DeletionStrategy::AlwaysHard => false, - }; - - if soft_deletion { - // Keep the soft-deleted in the DB - return Ok(DetailedDocumentDeletionResult { - deleted_documents: self.to_delete_docids.len(), - remaining_documents: documents_ids.len(), - }); - } - - self.to_delete_docids = soft_deleted_docids; - - let Index { - env: _env, - main: _main, - word_docids, - exact_word_docids, - word_prefix_docids, - exact_word_prefix_docids, - word_pair_proximity_docids, - field_id_word_count_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, - word_position_docids, - word_prefix_position_docids, - word_fid_docids, - word_prefix_fid_docids, - facet_id_f64_docids: _, - facet_id_string_docids: _, - facet_id_normalized_string_strings: _, - 
facet_id_string_fst: _, - field_id_docid_facet_f64s: _, - field_id_docid_facet_strings: _, - script_language_docids, - facet_id_exists_docids, - facet_id_is_null_docids, - facet_id_is_empty_docids, - vector_id_docid, - documents, - } = self.index; - // Remove from the documents database - for docid in &self.to_delete_docids { - documents.delete(self.wtxn, &BEU32::new(docid))?; - } - // We acquire the current external documents ids map... - // Note that its soft-deleted document ids field will be equal to the `to_delete_docids` - let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - // We then remove the soft-deleted docids from it - new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; - // and write it back to the main database. - let new_external_documents_ids = new_external_documents_ids.into_static(); - self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; - - let mut words_to_keep = BTreeSet::default(); - let mut words_to_delete = BTreeSet::default(); - // We iterate over the words and delete the documents ids - // from the word docids database. - remove_from_word_docids( - self.wtxn, - word_docids, - &self.to_delete_docids, - &mut words_to_keep, - &mut words_to_delete, - )?; - remove_from_word_docids( - self.wtxn, - exact_word_docids, - &self.to_delete_docids, - &mut words_to_keep, - &mut words_to_delete, - )?; - - // We construct an FST set that contains the words to delete from the words FST. - let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?; - - let new_words_fst = { - // We retrieve the current words FST from the database. - let words_fst = self.index.words_fst(self.wtxn)?; - let difference = words_fst.op().add(&words_to_delete).difference(); - - // We stream the new external ids that does no more contains the to-delete external ids. 
- let mut new_words_fst_builder = fst::SetBuilder::memory(); - new_words_fst_builder.extend_stream(difference.into_stream())?; - - // We create an words FST set from the above builder. - new_words_fst_builder.into_set() - }; - - // We write the new words FST into the main database. - self.index.put_words_fst(self.wtxn, &new_words_fst)?; - - let prefixes_to_delete = - remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.to_delete_docids)?; - - let exact_prefix_to_delete = remove_from_word_prefix_docids( - self.wtxn, - exact_word_prefix_docids, - &self.to_delete_docids, - )?; - - let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); - - // We compute the new prefix FST and write it only if there is a change. - if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() { - let new_words_prefixes_fst = { - // We retrieve the current words prefixes FST from the database. - let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?; - let difference = - words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference(); - - // We stream the new external ids that does no more contains the to-delete external ids. - let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory(); - new_words_prefixes_fst_builder.extend_stream(difference.into_stream())?; - - // We create an words FST set from the above builder. - new_words_prefixes_fst_builder.into_set() - }; - - // We write the new words prefixes FST into the main database. - self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?; - } - - for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { - // We delete the documents ids from the word prefix pair proximity database docids - // and remove the empty pairs too. 
- Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?; - } - Self::delete_from_db( - word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_position_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - - // Remove the documents ids from the field id word count database. - Self::delete_from_db( - field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - - if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { - let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; - - let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree - .iter() - .filter(|&point| self.to_delete_docids.contains(point.data.0)) - .cloned() - .map(|point| (point, point.data.0)) - .unzip(); - points_to_remove.iter().for_each(|point| { - rtree.remove(point); - }); - geo_faceted_doc_ids -= docids_to_remove; - - self.index.put_geo_rtree(self.wtxn, &rtree)?; - self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; - } - - for facet_type in [FacetType::Number, FacetType::String] { - let mut affected_facet_values = HashMap::new(); - for field_id in self.index.faceted_fields_ids(self.wtxn)? 
{ - // Remove docids from the number faceted documents ids - let mut docids = - self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?; - docids -= &self.to_delete_docids; - self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; - - let facet_values = remove_docids_from_field_id_docid_facet_value( - self.index, - self.wtxn, - facet_type, - field_id, - &self.to_delete_docids, - )?; - if !facet_values.is_empty() { - affected_facet_values.insert(field_id, facet_values); - } - } - FacetsDelete::new( - self.index, - facet_type, - affected_facet_values, - &self.to_delete_docids, - ) - .execute(self.wtxn)?; - } - - // Remove the documents ids from the script language database. - Self::delete_from_db( - script_language_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_docids( - self.wtxn, - facet_id_exists_docids, - &self.to_delete_docids, - )?; - - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_docids( - self.wtxn, - facet_id_is_null_docids, - &self.to_delete_docids, - )?; - - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_docids( - self.wtxn, - facet_id_is_empty_docids, - &self.to_delete_docids, - )?; - - // An ugly and slow way to remove the vectors from the HNSW - // It basically reconstructs the HNSW from scratch without editing the current one. - if let Some(current_hnsw) = self.index.vector_hnsw(self.wtxn)? { - let mut points = Vec::new(); - let mut docids = Vec::new(); - for result in vector_id_docid.iter(self.wtxn)? 
{ - let (vector_id, docid) = result?; - if !self.to_delete_docids.contains(docid.get()) { - let pid = PointId::from(vector_id.get()); - let vector = current_hnsw[pid].clone(); - points.push(vector); - docids.push(docid); - } - } - - let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points); - - vector_id_docid.clear(self.wtxn)?; - for (pid, docid) in pids.into_iter().zip(docids) { - vector_id_docid.put(self.wtxn, &BEU32::new(pid.into_inner()), &docid)?; - } - self.index.put_vector_hnsw(self.wtxn, &new_hnsw)?; - } - - self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; - - Ok(DetailedDocumentDeletionResult { - deleted_documents: self.to_delete_docids.len(), - remaining_documents: documents_ids.len(), - }) - } - - fn delete_from_db( - mut iter: RwIter, C>, - to_delete_docids: &RoaringBitmap, - ) -> Result<()> - where - C: for<'a> BytesDecode<'a, DItem = RoaringBitmap> - + for<'a> BytesEncode<'a, EItem = RoaringBitmap>, - { - puffin::profile_function!(); - - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? }; - } - } - Ok(()) - } -} - -fn remove_from_word_prefix_docids( - txn: &mut heed::RwTxn, - db: &Database, - to_remove: &RoaringBitmap, -) -> Result>> { - puffin::profile_function!(); - - let mut prefixes_to_delete = fst::SetBuilder::memory(); - - // We iterate over the word prefix docids database and remove the deleted documents ids - // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. 
- let mut iter = db.iter_mut(txn)?; - while let Some(result) = iter.next() { - let (prefix, mut docids) = result?; - let prefix = prefix.to_owned(); - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - prefixes_to_delete.insert(prefix)?; - } else if docids.len() != previous_len { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&prefix, &docids)? }; - } - } - - Ok(prefixes_to_delete.into_set()) -} - -fn remove_from_word_docids( - txn: &mut heed::RwTxn, - db: &heed::Database, - to_remove: &RoaringBitmap, - words_to_keep: &mut BTreeSet, - words_to_remove: &mut BTreeSet, -) -> Result<()> { - puffin::profile_function!(); - - // We create an iterator to be able to get the content and delete the word docids. - // It's faster to acquire a cursor to get and delete or put, as we avoid traversing - // the LMDB B-Tree two times but only once. - let mut iter = db.iter_mut(txn)?; - while let Some((key, mut docids)) = iter.next().transpose()? { - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - words_to_remove.insert(key.to_owned()); - } else { - words_to_keep.insert(key.to_owned()); - if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? 
}; - } - } - } - - Ok(()) -} - -fn remove_docids_from_field_id_docid_facet_value( - index: &Index, - wtxn: &mut heed::RwTxn, - facet_type: FacetType, - field_id: FieldId, - to_remove: &RoaringBitmap, -) -> heed::Result>> { - puffin::profile_function!(); - - let db = match facet_type { - FacetType::String => { - index.field_id_docid_facet_strings.remap_types::() - } - FacetType::Number => { - index.field_id_docid_facet_f64s.remap_types::() - } - }; - let mut all_affected_facet_values = HashSet::default(); - let mut iter = db - .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? - .remap_key_type::>(); - - while let Some(result) = iter.next() { - let ((_, docid, facet_value), _) = result?; - if to_remove.contains(docid) { - if !all_affected_facet_values.contains(facet_value) { - all_affected_facet_values.insert(facet_value.to_owned()); - } - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } - } - - Ok(all_affected_facet_values) -} - -fn remove_docids_from_facet_id_docids<'a, C>( - wtxn: &'a mut heed::RwTxn, - db: &heed::Database, - to_remove: &RoaringBitmap, -) -> heed::Result<()> -where - C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, -{ - puffin::profile_function!(); - - let mut iter = db.remap_key_type::().iter_mut(wtxn)?; - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? 
}; - } - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use big_s::S; - use heed::RwTxn; - use maplit::hashset; - - use super::*; - use crate::index::tests::TempIndex; - use crate::{db_snap, Filter, Search}; - - fn delete_documents<'t>( - wtxn: &mut RwTxn<'t, '_>, - index: &'t Index, - external_ids: &[&str], - strategy: DeletionStrategy, - ) -> Vec { - let external_document_ids = index.external_documents_ids(wtxn).unwrap(); - let ids_to_delete: Vec = external_ids - .iter() - .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) - .collect(); - - // Delete some documents. - let mut builder = DeleteDocuments::new(wtxn, index).unwrap(); - builder.strategy(strategy); - external_ids.iter().for_each(|id| { - builder.delete_external_id(id); - }); - builder.execute().unwrap(); - - ids_to_delete - } - - fn delete_documents_with_numbers_as_primary_key_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, - { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, - { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } - ]), - ) - .unwrap(); - - // delete those documents, ids are synchronous therefore 0, 1, and 2. 
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_document(0); - builder.delete_document(1); - builder.delete_document(2); - builder.strategy(deletion_strategy); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - // All these snapshots should be empty since the database was cleared - db_snap!(index, documents_ids, deletion_strategy); - db_snap!(index, word_docids, deletion_strategy); - db_snap!(index, word_pair_proximity_docids, deletion_strategy); - db_snap!(index, facet_id_exists_docids, deletion_strategy); - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - - let rtxn = index.read_txn().unwrap(); - - assert!(index.field_distribution(&rtxn).unwrap().is_empty()); - } - - #[test] - fn delete_documents_with_numbers_as_primary_key() { - delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysHard); - delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysSoft); - } - - fn delete_documents_with_strange_primary_key_(strategy: DeletionStrategy) { - let index = TempIndex::new(); - - index - .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) - .unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "mysuperid": 0, "name": "kevin" }, - { "mysuperid": 1, "name": "kevina" }, - { "mysuperid": 2, "name": "benoit" } - ]), - ) - .unwrap(); - wtxn.commit().unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - - // Delete not all of the documents but some of them. 
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_external_id("0"); - builder.delete_external_id("1"); - builder.strategy(strategy); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, strategy); - db_snap!(index, word_docids, strategy); - db_snap!(index, word_pair_proximity_docids, strategy); - db_snap!(index, soft_deleted_documents_ids, strategy); - } - - #[test] - fn delete_documents_with_strange_primary_key() { - delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysHard); - delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysSoft); - } - - fn filtered_placeholder_search_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label"), S("label2") }); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { "docid": "1_37", "label": ["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { 
"docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"], deletion_strategy); - - // Placeholder search with filter - let filter = Filter::from_str("label = sign").unwrap().unwrap(); - let results = index.search(&wtxn).filter(filter).execute().unwrap(); - assert!(results.documents_ids.is_empty()); - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - db_snap!(index, word_docids, deletion_strategy); - db_snap!(index, facet_id_f64_docids, deletion_strategy); - db_snap!(index, word_pair_proximity_docids, deletion_strategy); - db_snap!(index, facet_id_exists_docids, deletion_strategy); - db_snap!(index, facet_id_string_docids, deletion_strategy); - } - - #[test] - fn filtered_placeholder_search_should_not_return_deleted_documents() { - filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysHard, - ); - filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysSoft, - ); - } - - fn placeholder_search_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { 
"docid": "1_37", "label": ["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"], deletion_strategy); - - // Placeholder search - let results = index.search(&wtxn).execute().unwrap(); - assert!(!results.documents_ids.is_empty()); - for id in results.documents_ids.iter() { - assert!( - !deleted_internal_ids.contains(id), - "The document {} was supposed to be deleted", - id - ); - } - - wtxn.commit().unwrap(); - } - - #[test] - fn placeholder_search_should_not_return_deleted_documents() { - placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn search_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - 
.unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { "docid": "1_37", "label": ["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy); - - // search for abstract - let results = index.search(&wtxn).query("abstract").execute().unwrap(); - assert!(!results.documents_ids.is_empty()); - for id in results.documents_ids.iter() { - assert!( - !deleted_internal_ids.contains(id), - "The document {} was supposed to be deleted", - id - ); - } - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - } - - #[test] - fn search_should_not_return_deleted_documents() { - 
search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn geo_filtered_placeholder_search_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("id")); - settings.set_filterable_fields(hashset!(S("_geo"))); - settings.set_sortable_fields(hashset!(S("_geo"))); - }) - .unwrap(); - - index.add_documents_using_wtxn(&mut wtxn, documents!([ - { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, - { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, - { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, - { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, - { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } }, - { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } }, - { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } }, - { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } }, - { "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } }, - { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } }, - { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } }, - { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } }, - { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } }, - { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } }, - { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } }, - { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } }, - { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 
2.8547 } }, - { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, - { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, - { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } } - ])).unwrap(); - - let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &external_ids_to_delete, deletion_strategy); - - // Placeholder search with geo filter - let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); - let results = index.search(&wtxn).filter(filter).execute().unwrap(); - assert!(!results.documents_ids.is_empty()); - for id in results.documents_ids.iter() { - assert!( - !deleted_internal_ids.contains(id), - "The document {} was supposed to be deleted", - id - ); - } - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - db_snap!(index, facet_id_f64_docids, deletion_strategy); - db_snap!(index, facet_id_string_docids, deletion_strategy); - } - - #[test] - fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { - geo_filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysHard, - ); - geo_filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysSoft, - ); - } - - fn get_documents_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { "docid": "1_37", "label": 
["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - let deleted_external_ids = ["1_7", "1_52"]; - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &deleted_external_ids, deletion_strategy); - - // list all documents - let results = index.all_documents(&wtxn).unwrap(); - for result in results { - let (id, _) = result.unwrap(); - assert!( - !deleted_internal_ids.contains(&id), - "The document {} was supposed to be deleted", - id - ); - } - - // list internal document ids - let results = index.documents_ids(&wtxn).unwrap(); - for id in results { - assert!( - !deleted_internal_ids.contains(&id), - "The document {} was supposed to be deleted", - id - ); - } - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - - // get internal docids from deleted external document ids - let results = index.external_documents_ids(&rtxn).unwrap(); - for id in deleted_external_ids { - assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id); - } - drop(rtxn); - - 
db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - } - - #[test] - fn get_documents_should_not_return_deleted_documents() { - get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn stats_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index.add_documents_using_wtxn(&mut wtxn, documents!([ - { "docid": "1_4", "label": ["sign"]}, - { "docid": "1_5", "label": ["letter"]}, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, - { "docid": "1_36", "label": ["drawing","painting","pattern"]}, - { "docid": "1_37", "label": ["art","drawing","outdoor"]}, - { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"}, - { "docid": "1_39", "label": ["abstract"]}, - { "docid": "1_40", "label": ["cartoon"]}, - { "docid": "1_41", "label": ["art","drawing"]}, - { "docid": "1_42", "label": ["art","pattern"]}, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, - { "docid": "1_44", "label": ["drawing"], "number": 44i32}, - { "docid": "1_45", "label": ["art"]}, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, - { "docid": "1_47", "label": ["abstract","pattern"]}, - { "docid": "1_52", "label": ["abstract","cartoon"]}, - { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, - { "docid": "1_58", "label": ["abstract","art","cartoon"]}, - { "docid": "1_68", "label": ["design"]}, - { "docid": "1_69", "label": ["geometry"]} - ])).unwrap(); - - delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy); - - // count internal documents - let results = index.number_of_documents(&wtxn).unwrap(); 
- assert_eq!(18, results); - - // count field distribution - let results = index.field_distribution(&wtxn).unwrap(); - assert_eq!(Some(&18), results.get("label")); - assert_eq!(Some(&1), results.get("title")); - assert_eq!(Some(&2), results.get("number")); - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - } - - #[test] - fn stats_should_not_return_deleted_documents() { - stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn stored_detected_script_and_language_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - use charabia::{Language, Script}; - let index = TempIndex::new(); - let mut wtxn = index.write_txn().unwrap(); - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, - { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, - { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" 
}, - { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" }, - { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" }, - { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" }, - ])) - .unwrap(); - - let key_cmn = (Script::Cj, Language::Cmn); - let cj_cmn_docs = - index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default(); - let mut expected_cj_cmn_docids = RoaringBitmap::new(); - expected_cj_cmn_docids.push(1); - expected_cj_cmn_docids.push(5); - assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); - - delete_documents(&mut wtxn, &index, &["1"], deletion_strategy); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let cj_cmn_docs = - index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default(); - let mut expected_cj_cmn_docids = RoaringBitmap::new(); - expected_cj_cmn_docids.push(5); - assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); - } - - #[test] - fn stored_detected_script_and_language_should_not_return_deleted_documents() { - stored_detected_script_and_language_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysHard, - ); - stored_detected_script_and_language_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysSoft, - ); - } - - #[test] - fn delete_words_exact_attributes() { - let index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key(S("id")); - settings.set_searchable_fields(vec![S("text"), S("exact")]); - settings.set_exact_attributes(vec![S("exact")].into_iter().collect()); - }) - .unwrap(); - - index - .add_documents(documents!([ - { "id": 0, "text": "hello" }, - { "id": 1, "exact": "hello"} - ])) - .unwrap(); - db_snap!(index, word_docids, 1, @r###" - hello [0, ] - "###); - db_snap!(index, exact_word_docids, 1, @r###" - hello [1, ] - "###); - db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); - - let mut wtxn = index.write_txn().unwrap(); - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, 
&["1"], DeletionStrategy::AlwaysHard); - wtxn.commit().unwrap(); - - db_snap!(index, word_docids, 2, @r###" - hello [0, ] - "###); - db_snap!(index, exact_word_docids, 2, @""); - db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); - - insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]"); - let txn = index.read_txn().unwrap(); - let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap(); - insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###); - - let mut s = Search::new(&txn, &index); - s.query("hello"); - let crate::SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); - } -} diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index a3f0c8f71..5626a4aae 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,10 +1,9 @@ -use std::borrow::Cow; use std::fs::File; use std::io::BufReader; use grenad::CompressionType; use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn}; use roaring::RoaringBitmap; use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; @@ -13,17 +12,15 @@ use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::heed_codec::ByteSliceRefCodec; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; -use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result}; /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases /// by rebuilding the database "from scratch". /// /// First, the new elements are inserted into the level 0 of the database. 
Then, the /// higher levels are cleared and recomputed from the content of level 0. -/// -/// Finally, the `faceted_documents_ids` value in the main database of `Index` -/// is updated to contain the new set of faceted documents. pub struct FacetsUpdateBulk<'i> { index: &'i Index, group_size: u8, @@ -31,7 +28,7 @@ pub struct FacetsUpdateBulk<'i> { facet_type: FacetType, field_ids: Vec, // None if level 0 does not need to be updated - new_data: Option>>, + delta_data: Option>>, } impl<'i> FacetsUpdateBulk<'i> { @@ -39,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> { index: &'i Index, field_ids: Vec, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, group_size: u8, min_level_size: u8, ) -> FacetsUpdateBulk<'i> { @@ -49,7 +46,7 @@ impl<'i> FacetsUpdateBulk<'i> { group_size, min_level_size, facet_type, - new_data: Some(new_data), + delta_data: Some(delta_data), } } @@ -64,13 +61,13 @@ impl<'i> FacetsUpdateBulk<'i> { group_size: FACET_GROUP_SIZE, min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, - new_data: None, + delta_data: None, } } #[logging_timer::time("FacetsUpdateBulk::{}")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { - let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; + let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self; let db = match facet_type { FacetType::String => index @@ -81,12 +78,9 @@ impl<'i> FacetsUpdateBulk<'i> { } }; - let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; + let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size }; - inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { - index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; - Ok(()) - })?; + inner.update(wtxn, &field_ids)?; Ok(()) } @@ -95,26 +89,19 @@ impl<'i> FacetsUpdateBulk<'i> { /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type pub(crate) struct 
FacetsUpdateBulkInner { pub db: heed::Database, FacetGroupValueCodec>, - pub new_data: Option>, + pub delta_data: Option>, pub group_size: u8, pub min_level_size: u8, } impl FacetsUpdateBulkInner { - pub fn update( - mut self, - wtxn: &mut RwTxn, - field_ids: &[u16], - mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>, - ) -> Result<()> { + pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> { self.update_level0(wtxn)?; for &field_id in field_ids.iter() { self.clear_levels(wtxn, field_id)?; } for &field_id in field_ids.iter() { - let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?; - - handle_all_docids(wtxn, field_id, all_docids)?; + let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?; for level_reader in level_readers { let mut cursor = level_reader.into_cursor()?; @@ -133,19 +120,27 @@ impl FacetsUpdateBulkInner { self.db.delete_range(wtxn, &range).map(drop)?; Ok(()) } + fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { - let new_data = match self.new_data.take() { + let delta_data = match self.delta_data.take() { Some(x) => x, None => return Ok(()), }; if self.db.is_empty(wtxn)? { let mut buffer = Vec::new(); let mut database = self.db.iter_mut(wtxn)?.remap_types::(); - let mut cursor = new_data.into_cursor()?; + let mut cursor = delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if !valid_lmdb_key(key) { continue; } + let value = KvReaderDelAdd::new(value); + + // DB is empty, it is safe to ignore Del operations + let Some(value) = value.get(DelAdd::Addition) else { + continue; + }; + buffer.clear(); // the group size for level 0 buffer.push(1); @@ -157,11 +152,14 @@ impl FacetsUpdateBulkInner { let mut buffer = Vec::new(); let database = self.db.remap_types::(); - let mut cursor = new_data.into_cursor()?; + let mut cursor = delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ if !valid_lmdb_key(key) { continue; } + + let value = KvReaderDelAdd::new(value); + // the value is a CboRoaringBitmap, but I still need to prepend the // group size for level 0 (= 1) to it buffer.clear(); @@ -169,17 +167,27 @@ impl FacetsUpdateBulkInner { // then we extend the buffer with the docids bitmap match database.get(wtxn, key)? { Some(prev_value) => { + // prev_value is the group size for level 0, followed by the previous bitmap. let old_bitmap = &prev_value[1..]; - CboRoaringBitmapCodec::merge_into( - &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], - &mut buffer, - )?; + CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?; } None => { + // it is safe to ignore the del in that case. + let Some(value) = value.get(DelAdd::Addition) else { + // won't put the key in DB as the value would be empty + continue; + }; + buffer.extend_from_slice(value); } }; - database.put(wtxn, key, &buffer)?; + let new_bitmap = &buffer[1..]; + // if the new bitmap is empty, let's remove it + if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 { + database.delete(wtxn, key)?; + } else { + database.put(wtxn, key, &buffer)?; + } } } Ok(()) @@ -188,16 +196,10 @@ impl FacetsUpdateBulkInner { &self, field_id: FieldId, txn: &RoTxn, - ) -> Result<(Vec>>, RoaringBitmap)> { - let mut all_docids = RoaringBitmap::new(); - let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { - for bitmap in bitmaps { - all_docids |= bitmap; - } - Ok(()) - })?; + ) -> Result>>> { + let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?; - Ok((subwriters, all_docids)) + Ok(subwriters) } #[allow(clippy::type_complexity)] fn read_level_0<'t>( @@ -491,7 +493,6 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); - db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); } 
#[test] diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs deleted file mode 100644 index 883abc8ca..000000000 --- a/milli/src/update/facet/delete.rs +++ /dev/null @@ -1,360 +0,0 @@ -use std::collections::{HashMap, HashSet}; - -use heed::RwTxn; -use log::debug; -use roaring::RoaringBitmap; -use time::OffsetDateTime; - -use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; -use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; -use crate::heed_codec::ByteSliceRefCodec; -use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}; -use crate::{FieldId, Index, Result}; - -/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases. -/// -/// Depending on the number of removed elements and the existing size of the database, we use either -/// a bulk delete method or an incremental delete method. -pub struct FacetsDelete<'i, 'b> { - index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, - facet_type: FacetType, - affected_facet_values: HashMap>>, - docids_to_delete: &'b RoaringBitmap, - group_size: u8, - max_group_size: u8, - min_level_size: u8, -} -impl<'i, 'b> FacetsDelete<'i, 'b> { - pub fn new( - index: &'i Index, - facet_type: FacetType, - affected_facet_values: HashMap>>, - docids_to_delete: &'b RoaringBitmap, - ) -> Self { - let database = match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::>(), - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }; - Self { - index, - database, - facet_type, - affected_facet_values, - docids_to_delete, - group_size: FACET_GROUP_SIZE, - max_group_size: FACET_MAX_GROUP_SIZE, - min_level_size: FACET_MIN_LEVEL_SIZE, - } - } - - pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - 
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - - for (field_id, affected_facet_values) in self.affected_facet_values { - // This is an incorrect condition, since we assume that the length of the database is equal - // to the number of facet values for the given field_id. It means that in some cases, we might - // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could - // really be a performance problem is when we fully delete a large ratio of all facet values for - // each field id. This would almost never happen. Still, to be overly cautious, I have added a - // 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance - // penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead. - if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) { - // Bulk delete - let mut modified = false; - - for facet_value in affected_facet_values { - let key = - FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() }; - let mut old = self.database.get(wtxn, &key)?.unwrap(); - let previous_len = old.bitmap.len(); - old.bitmap -= self.docids_to_delete; - if old.bitmap.is_empty() { - modified = true; - self.database.delete(wtxn, &key)?; - } else if old.bitmap.len() != previous_len { - modified = true; - self.database.put(wtxn, &key, &old)?; - } - } - if modified { - let builder = FacetsUpdateBulk::new_not_updating_level_0( - self.index, - vec![field_id], - self.facet_type, - ); - builder.execute(wtxn)?; - } - } else { - // Incremental - let inc = FacetsUpdateIncrementalInner { - db: self.database, - group_size: self.group_size, - min_level_size: self.min_level_size, - max_group_size: self.max_group_size, - }; - for facet_value in affected_facet_values { - inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?; - } - } - } - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use std::iter::FromIterator; - - use 
big_s::S; - use maplit::hashset; - use rand::seq::SliceRandom; - use rand::SeedableRng; - use roaring::RoaringBitmap; - - use crate::db_snap; - use crate::documents::documents_batch_reader_from_objects; - use crate::index::tests::TempIndex; - use crate::update::facet::test_helpers::ordered_string; - use crate::update::{DeleteDocuments, DeletionStrategy}; - - #[test] - fn delete_mixed_incremental_and_bulk() { - // The point of this test is to create an index populated with documents - // containing different filterable attributes. Then, we delete a bunch of documents - // such that a mix of the incremental and bulk indexer is used (depending on the field id) - let index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index - .update_settings(|settings| { - settings.set_filterable_fields( - hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! { - { - "id": i, - "label": i / 10, - "colour": i / 100, - "timestamp": i / 2, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576"); - db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf"); - - let mut wtxn = index.env.write_txn().unwrap(); - - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_documents(&RoaringBitmap::from_iter(0..100)); - // by deleting the first 100 documents, we expect that: - // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) - // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 - // - the "colour" part will also be updated incrementally, since 
#affected_values = 1 which is < 13 - // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 - // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6"); - db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56"); - } - - // Same test as above but working with string values for the facets - #[test] - fn delete_mixed_incremental_and_bulk_string() { - // The point of this test is to create an index populated with documents - // containing different filterable attributes. Then, we delete a bunch of documents - // such that a mix of the incremental and bulk indexer is used (depending on the field id) - let index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index - .update_settings(|settings| { - settings.set_filterable_fields( - hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! 
{ - { - "id": i, - "label": ordered_string(i / 10), - "colour": ordered_string(i / 100), - "timestamp": ordered_string(i / 2), - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) - db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); - db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); - - let mut wtxn = index.env.write_txn().unwrap(); - - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_documents(&RoaringBitmap::from_iter(0..100)); - // by deleting the first 100 documents, we expect that: - // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) - // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 - // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 - // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 - // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc"); - db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f"); - } - - #[test] - fn delete_almost_all_incrementally_string() { - let index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index - .update_settings(|settings| { - settings.set_filterable_fields( - hashset! 
{ S("id"), S("label"), S("timestamp"), S("colour") }, - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! { - { - "id": i, - "label": ordered_string(i / 10), - "colour": ordered_string(i / 100), - "timestamp": ordered_string(i / 2), - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) - db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); - db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - - let mut docids_to_delete = (0..1000).collect::>(); - docids_to_delete.shuffle(&mut rng); - for docid in docids_to_delete.into_iter().take(990) { - let mut wtxn = index.env.write_txn().unwrap(); - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_documents(&RoaringBitmap::from_iter([docid])); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - } - - db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d"); - db_snap!(index, string_faceted_documents_ids, 2, @r###" - 0 [] - 1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] - 2 [292, 324, 358, 381, 493, 839, 852, ] - 3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] - "###); - } -} - -#[allow(unused)] -#[cfg(test)] -mod comparison_bench { - use std::iter::once; - - use rand::Rng; - use roaring::RoaringBitmap; - - use crate::heed_codec::facet::OrderedF64Codec; - use crate::update::facet::test_helpers::FacetIndex; - - // This is a simple test to get an intuition on the relative speed - // of the incremental vs. bulk indexer. 
- // - // The benchmark shows the worst-case scenario for the incremental indexer, since - // each facet value contains only one document ID. - // - // In that scenario, it appears that the incremental indexer is about 70 times slower than the - // bulk indexer. - // #[test] - fn benchmark_facet_indexing_delete() { - let mut r = rand::thread_rng(); - - for i in 1..=20 { - let size = 50_000 * i; - let index = FacetIndex::::new(4, 8, 5); - - let mut txn = index.env.write_txn().unwrap(); - let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); - for i in 0..size { - // field id = 0, left_bound = i, docids = [i] - elements.push(((0, i as f64), once(i).collect())); - } - let timer = std::time::Instant::now(); - index.bulk_insert(&mut txn, &[0], elements.iter()); - let time_spent = timer.elapsed().as_millis(); - println!("bulk {size} : {time_spent}ms"); - - txn.commit().unwrap(); - - for nbr_doc in [1, 100, 1000, 10_000] { - let mut txn = index.env.write_txn().unwrap(); - let timer = std::time::Instant::now(); - // - // delete one document - // - for _ in 0..nbr_doc { - let deleted_u32 = r.gen::() % size; - let deleted_f64 = deleted_u32 as f64; - index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32) - } - let time_spent = timer.elapsed().as_millis(); - println!(" delete {nbr_doc} : {time_spent}ms"); - txn.abort().unwrap(); - } - } - } -} diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 743c0b038..e241c499c 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,9 +1,9 @@ -use std::collections::HashMap; use std::fs::File; use std::io::BufReader; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use obkv::KvReader; use roaring::RoaringBitmap; use crate::facet::FacetType; @@ -12,8 +12,9 @@ use crate::heed_codec::facet::{ }; use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; +use 
crate::update::del_add::DelAdd; use crate::update::index_documents::valid_lmdb_key; -use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use crate::{CboRoaringBitmapCodec, Index, Result}; enum InsertionResult { InPlace, @@ -28,27 +29,21 @@ enum DeletionResult { /// Algorithm to incrementally insert and delete elememts into the /// `facet_id_(string/f64)_docids` databases. -/// -/// Rhe `faceted_documents_ids` value in the main database of `Index` -/// is also updated to contain the new set of faceted documents. -pub struct FacetsUpdateIncremental<'i> { - index: &'i Index, +pub struct FacetsUpdateIncremental { inner: FacetsUpdateIncrementalInner, - facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, } -impl<'i> FacetsUpdateIncremental<'i> { +impl FacetsUpdateIncremental { pub fn new( - index: &'i Index, + index: &Index, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, group_size: u8, min_level_size: u8, max_group_size: u8, ) -> Self { FacetsUpdateIncremental { - index, inner: FacetsUpdateIncrementalInner { db: match facet_type { FacetType::String => index @@ -62,31 +57,41 @@ impl<'i> FacetsUpdateIncremental<'i> { max_group_size, min_level_size, }, - facet_type, - new_data, + delta_data, } } - pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { - let mut new_faceted_docids = HashMap::::default(); - - let mut cursor = self.new_data.into_cursor()?; + pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> { + let mut cursor = self.delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ if !valid_lmdb_key(key) { continue; } let key = FacetGroupKeyCodec::::bytes_decode(key) .ok_or(heed::Error::Encoding)?; - let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; - *new_faceted_docids.entry(key.field_id).or_default() |= docids; + let value = KvReader::new(value); + + let docids_to_delete = value + .get(DelAdd::Deletion) + .map(CboRoaringBitmapCodec::bytes_decode) + .map(|o| o.ok_or(heed::Error::Encoding)); + + let docids_to_add = value + .get(DelAdd::Addition) + .map(CboRoaringBitmapCodec::bytes_decode) + .map(|o| o.ok_or(heed::Error::Encoding)); + + if let Some(docids_to_delete) = docids_to_delete { + let docids_to_delete = docids_to_delete?; + self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?; + } + + if let Some(docids_to_add) = docids_to_add { + let docids_to_add = docids_to_add?; + self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?; + } } - for (field_id, new_docids) in new_faceted_docids { - let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; - docids |= new_docids; - self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; - } Ok(()) } } diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index bbd25f91e..52fea0f5f 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -14,7 +14,7 @@ The databases must be able to return results for queries such as: The algorithms that implement these queries are found in the `src/search/facet` folder. To make these queries fast to compute, the database adopts a tree structure: -```ignore +```text ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ ┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │ │Level 2│ │ │ │ │ @@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`. 
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a [`FacetGroupValue`], which have the following format: -```ignore +```text FacetGroupKey: - field id : u16 - level : u8 @@ -98,7 +98,6 @@ use crate::update::merge_btreeset_string; use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH}; pub mod bulk; -pub mod delete; pub mod incremental; /// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. @@ -109,7 +108,7 @@ pub struct FacetsUpdate<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, group_size: u8, max_group_size: u8, min_level_size: u8, @@ -118,7 +117,7 @@ impl<'i> FacetsUpdate<'i> { pub fn new( index: &'i Index, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, ) -> Self { let database = match facet_type { FacetType::String => index @@ -135,26 +134,26 @@ impl<'i> FacetsUpdate<'i> { max_group_size: FACET_MAX_GROUP_SIZE, min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, - new_data, + delta_data, } } pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { - if self.new_data.is_empty() { + if self.delta_data.is_empty() { return Ok(()); } debug!("Computing and writing the facet values levels docids into LMDB on disk..."); self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // See self::comparison_bench::benchmark_facet_indexing - if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { + if self.delta_data.len() >= (self.database.len(wtxn)? 
as u64 / 50) { let field_ids = self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); let bulk_update = FacetsUpdateBulk::new( self.index, field_ids, self.facet_type, - self.new_data, + self.delta_data, self.group_size, self.min_level_size, ); @@ -163,7 +162,7 @@ impl<'i> FacetsUpdate<'i> { let incremental_update = FacetsUpdateIncremental::new( self.index, self.facet_type, - self.new_data, + self.delta_data, self.group_size, self.min_level_size, self.max_group_size, @@ -279,6 +278,7 @@ pub(crate) mod test_helpers { use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; use crate::snapshot_tests::display_bitmap; + use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::FacetsUpdateIncrementalInner; use crate::CboRoaringBitmapCodec; @@ -455,20 +455,22 @@ pub(crate) mod test_helpers { let key: FacetGroupKey<&[u8]> = FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let mut inner_writer = KvWriterDelAdd::memory(); let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap(); - writer.insert(&key, &value).unwrap(); + inner_writer.insert(DelAdd::Addition, value).unwrap(); + writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap(); } writer.finish().unwrap(); let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); let update = FacetsUpdateBulkInner { db: self.content, - new_data: Some(reader), + delta_data: Some(reader), group_size: self.group_size.get(), min_level_size: self.min_level_size.get(), }; - update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); + update.update(wtxn, field_ids).unwrap(); } pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { @@ -556,101 +558,6 @@ pub(crate) mod test_helpers { } } -#[cfg(test)] -mod tests { - use big_s::S; - use maplit::hashset; - - use crate::db_snap; - use crate::documents::documents_batch_reader_from_objects; - use 
crate::index::tests::TempIndex; - use crate::update::DeletionStrategy; - - #[test] - fn replace_all_identical_soft_deletion_then_hard_deletion() { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("size") }); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! { - { - "id": i, - "size": i % 250, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); - db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9"); - db_snap!(index, soft_deleted_documents_ids, "initial", @"[]"); - - let mut documents = vec![]; - for i in 0..999 { - documents.push( - serde_json::json! { - { - "id": i, - "size": i % 250, - "other": 0, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); - db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06"); - db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); - - // Then replace the last document while disabling soft_deletion - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; - let mut documents = vec![]; - for i in 999..1000 { - documents.push( - serde_json::json! 
{ - { - "id": i, - "size": i % 250, - "other": 0, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); - db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028"); - db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]"); - } -} - #[allow(unused)] #[cfg(test)] mod comparison_bench { diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 22b16f253..03eb3f4de 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -1,20 +1,17 @@ +use std::fmt; use std::io::{BufWriter, Read, Seek}; use std::result::Result as StdResult; -use std::{fmt, iter}; use serde::{Deserialize, Serialize}; use serde_json::Value; -use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; +use crate::documents::{ + DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader, + EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY, +}; use crate::error::{GeoError, InternalError, UserError}; use crate::update::index_documents::{obkv_to_object, writer_into_reader}; -use crate::{FieldId, Index, Object, Result}; - -/// The symbol used to define levels in a nested primary key. -const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; - -/// The default primary that is used when not specified. -const DEFAULT_PRIMARY_KEY: &str = "id"; +use crate::{FieldId, Index, Result}; /// This function validates and enrich the documents by checking that: /// - we can infer a primary key, @@ -41,14 +38,12 @@ pub fn enrich_documents_batch( // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. 
let primary_key = match index.primary_key(rtxn)? { - Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { - PrimaryKey::nested(primary_key) - } - Some(primary_key) => match documents_batch_index.id(primary_key) { - Some(id) => PrimaryKey::flat(primary_key, id), - None if autogenerate_docids => { - PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) - } + Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) { + Some(primary_key) => primary_key, + None if autogenerate_docids => PrimaryKey::Flat { + name: primary_key, + field_id: documents_batch_index.insert(primary_key), + }, None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { @@ -76,14 +71,14 @@ pub fn enrich_documents_batch( }); match guesses.as_slice() { - [] if autogenerate_docids => PrimaryKey::flat( - DEFAULT_PRIMARY_KEY, - documents_batch_index.insert(DEFAULT_PRIMARY_KEY), - ), + [] if autogenerate_docids => PrimaryKey::Flat { + name: DEFAULT_PRIMARY_KEY, + field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY), + }, [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), [(field_id, name)] => { log::info!("Primary key was not specified in index. Inferred to '{name}'"); - PrimaryKey::flat(name, *field_id) + PrimaryKey::Flat { name, field_id: *field_id } } multiple => { return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { @@ -156,92 +151,24 @@ fn fetch_or_generate_document_id( uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], count: u32, ) -> Result> { - match primary_key { - PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { - match document.get(primary_key_id) { - Some(document_id_bytes) => { - let document_id = serde_json::from_slice(document_id_bytes) - .map_err(InternalError::SerdeJson)?; - match validate_document_id_value(document_id)? 
{ - Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), - Err(user_error) => Ok(Err(user_error)), - } - } - None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); - Ok(Ok(DocumentId::generated(uuid.to_string(), count))) - } - None => Ok(Err(UserError::MissingDocumentId { - primary_key: primary_key.to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })), - } + Ok(match primary_key.document_id(document, documents_batch_index)? { + Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }), + Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error), + Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => { + let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); + Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count }) } - nested @ PrimaryKey::Nested { .. } => { - let mut matching_documents_ids = Vec::new(); - for (first_level_name, right) in nested.possible_level_names() { - if let Some(field_id) = documents_batch_index.id(first_level_name) { - if let Some(value_bytes) = document.get(field_id) { - let object = serde_json::from_slice(value_bytes) - .map_err(InternalError::SerdeJson)?; - fetch_matching_values(object, right, &mut matching_documents_ids); - - if matching_documents_ids.len() >= 2 { - return Ok(Err(UserError::TooManyDocumentIds { - primary_key: nested.name().to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })); - } - } - } - } - - match matching_documents_ids.pop() { - Some(document_id) => match validate_document_id_value(document_id)? 
{ - Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), - Err(user_error) => Ok(Err(user_error)), - }, - None => Ok(Err(UserError::MissingDocumentId { - primary_key: nested.name().to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })), - } + Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(document, documents_batch_index)?, + }), + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(document, documents_batch_index)?, + }) } - } -} - -/// A type that represent the type of primary key that has been set -/// for this index, a classic flat one or a nested one. -#[derive(Debug, Clone, Copy)] -enum PrimaryKey<'a> { - Flat { name: &'a str, field_id: FieldId }, - Nested { name: &'a str }, -} - -impl PrimaryKey<'_> { - fn flat(name: &str, field_id: FieldId) -> PrimaryKey { - PrimaryKey::Flat { name, field_id } - } - - fn nested(name: &str) -> PrimaryKey { - PrimaryKey::Nested { name } - } - - fn name(&self) -> &str { - match self { - PrimaryKey::Flat { name, .. } => name, - PrimaryKey::Nested { name } => name, - } - } - - /// Returns an `Iterator` that gives all the possible fields names the primary key - /// can have depending of the first level name and deepnes of the objects. - fn possible_level_names(&self) -> impl Iterator + '_ { - let name = self.name(); - name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) - .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) - .chain(iter::once((name, ""))) - } + }) } /// A type that represents a document id that has been retrieved from a document or auto-generated. 
@@ -255,14 +182,6 @@ pub enum DocumentId { } impl DocumentId { - fn retrieved(value: String) -> DocumentId { - DocumentId::Retrieved { value } - } - - fn generated(value: String, document_nth: u32) -> DocumentId { - DocumentId::Generated { value, document_nth } - } - fn debug(&self) -> String { format!("{:?}", self) } @@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId { } } -fn starts_with(selector: &str, key: &str) -> bool { - selector.strip_prefix(key).map_or(false, |tail| { - tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) - }) -} - -pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { - match value { - Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), - otherwise => output.push(otherwise), - } -} - -pub fn fetch_matching_values_in_object( - object: Object, - selector: &str, - base_key: &str, - output: &mut Vec, -) { - for (key, value) in object { - let base_key = if base_key.is_empty() { - key.to_string() - } else { - format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) - }; - - if starts_with(selector, &base_key) { - match value { - Value::Object(object) => { - fetch_matching_values_in_object(object, selector, &base_key, output) - } - value => output.push(value), - } - } - } -} - -pub fn validate_document_id(document_id: &str) -> Option<&str> { - if !document_id.is_empty() - && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) - { - Some(document_id) - } else { - None - } -} - -/// Parses a Json encoded document id and validate it, returning a user error when it is one. 
-pub fn validate_document_id_value(document_id: Value) -> Result> { - match document_id { - Value::String(string) => match validate_document_id(&string) { - Some(s) if s.len() == string.len() => Ok(Ok(string)), - Some(s) => Ok(Ok(s.to_string())), - None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), - }, - Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), - content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), - } -} - /// Try to extract an `f64` from a JSON `Value` and return the `Value` /// in the `Err` variant if it failed. pub fn extract_finite_float_from_value(value: Value) -> StdResult { diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 643d16354..303b64271 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -5,18 +5,16 @@ use std::io::BufReader; use std::{io, mem, str}; use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; -use obkv::KvReader; +use obkv::{KvReader, KvWriterU16}; use roaring::RoaringBitmap; use serde_json::Value; -use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; +use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; -use crate::update::index_documents::MergeFn; -use crate::{ - absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, -}; +use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; +use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; -pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>; +pub type ScriptLanguageDocidsMap = HashMap<(Script, 
Language), (RoaringBitmap, RoaringBitmap)>; /// Extracts the word and positions where this word appear and /// prefixes it by the document id. @@ -32,25 +30,162 @@ pub fn extract_docid_word_positions( allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: Option, -) -> Result<(RoaringBitmap, grenad::Reader>, ScriptLanguageDocidsMap)> { +) -> Result<(grenad::Reader>, ScriptLanguageDocidsMap)> { puffin::profile_function!(); let max_positions_per_attributes = max_positions_per_attributes .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); + // initialize destination values. let mut documents_ids = RoaringBitmap::new(); let mut script_language_docids = HashMap::new(); let mut docid_word_positions_sorter = create_sorter( grenad::SortAlgorithm::Stable, - concat_u32s_array, + keep_latest_obkv, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); - let mut buffers = Buffers::default(); + // initialize buffers. + let mut del_buffers = Buffers::default(); + let mut add_buffers = Buffers::default(); + let mut key_buffer = Vec::new(); + let mut value_buffer = Vec::new(); + + // initialize tokenizer. + let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None); + let tokenizer = builder.build(); + + // iterate over documents. + let mut cursor = obkv_documents.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let document_id = key + .try_into() + .map(u32::from_be_bytes) + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + let obkv = KvReader::::new(value); + + // if the searchable fields didn't change, skip the searchable indexing for this document. + if !searchable_fields_changed(&KvReader::::new(value), searchable_fields) { + continue; + } + + documents_ids.push(document_id); + + // Update key buffer prefix. 
+ key_buffer.clear(); + key_buffer.extend_from_slice(&document_id.to_be_bytes()); + + // Tokenize deletions and additions in 2 different threads. + let (del, add): (Result<_>, Result<_>) = rayon::join( + || { + // deletions + lang_safe_tokens_from_document( + &obkv, + searchable_fields, + &tokenizer, + stop_words, + allowed_separators, + dictionary, + max_positions_per_attributes, + DelAdd::Deletion, + &mut del_buffers, + ) + }, + || { + // additions + lang_safe_tokens_from_document( + &obkv, + searchable_fields, + &tokenizer, + stop_words, + allowed_separators, + dictionary, + max_positions_per_attributes, + DelAdd::Addition, + &mut add_buffers, + ) + }, + ); + + let (del_obkv, del_script_language_word_count) = del?; + let (add_obkv, add_script_language_word_count) = add?; + + // merge deletions and additions. + // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>> + value_buffer.clear(); + del_add_from_two_obkvs( + KvReader::::new(del_obkv), + KvReader::::new(add_obkv), + &mut value_buffer, + )?; + + // write each KV<DelAdd, KV<u16, String>> into the sorter, field by field. + let obkv = KvReader::::new(&value_buffer); + for (field_id, value) in obkv.iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&field_id.to_be_bytes()); + docid_word_positions_sorter.insert(&key_buffer, value)?; + } + + // update script_language_docids deletions. + for (script, languages_frequency) in del_script_language_word_count { + for (language, _) in languages_frequency { + let entry = script_language_docids + .entry((script, language)) + .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); + entry.0.push(document_id); + } + } + + // update script_language_docids additions.
+ for (script, languages_frequency) in add_script_language_word_count { + for (language, _) in languages_frequency { + let entry = script_language_docids + .entry((script, language)) + .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); + entry.1.push(document_id); + } + } + } + + // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>. + sorter_into_reader(docid_word_positions_sorter, indexer) + .map(|reader| (reader, script_language_docids)) +} + +/// Check if any searchable fields of a document changed. +fn searchable_fields_changed( + obkv: &KvReader, + searchable_fields: &Option>, +) -> bool { + for (field_id, field_bytes) in obkv.iter() { + if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + let del_add = KvReaderDelAdd::new(field_bytes); + match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { + // if both fields are None, check the next field. + (None, None) => (), + // if both contain a value and values are the same, check the next field. + (Some(del), Some(add)) if del == add => (), + // otherwise the fields are different, return true. + _otherwise => return true, + } + } + } + + false +} + +/// Factorize tokenizer building. +fn tokenizer_builder<'a>( + stop_words: Option<&'a fst::Set<&[u8]>>, + allowed_separators: Option<&'a [&str]>, + dictionary: Option<&'a [&str]>, + script_language: Option<&'a HashMap>>, +) -> TokenizerBuilder<'a, &'a [u8]> { let mut tokenizer_builder = TokenizerBuilder::new(); if let Some(stop_words) = stop_words { tokenizer_builder.stop_words(stop_words); @@ -61,130 +196,147 @@ pub fn extract_docid_word_positions( if let Some(separators) = allowed_separators { tokenizer_builder.separators(separators); } - let tokenizer = tokenizer_builder.build(); - let mut cursor = obkv_documents.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()?
{ - let document_id = key - .try_into() - .map(u32::from_be_bytes) - .map_err(|_| SerializationError::InvalidNumberSerialization)?; - let obkv = KvReader::::new(value); + if let Some(script_language) = script_language { + tokenizer_builder.allow_list(script_language); + } - documents_ids.push(document_id); - buffers.key_buffer.clear(); - buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes()); + tokenizer_builder +} - let mut script_language_word_count = HashMap::new(); +/// Extract words mapped with their positions of a document, +/// ensuring no Language detection mistakes was made. +#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct +fn lang_safe_tokens_from_document<'a>( + obkv: &KvReader, + searchable_fields: &Option>, + tokenizer: &Tokenizer, + stop_words: Option<&fst::Set<&[u8]>>, + allowed_separators: Option<&[&str]>, + dictionary: Option<&[&str]>, + max_positions_per_attributes: u32, + del_add: DelAdd, + buffers: &'a mut Buffers, +) -> Result<(&'a [u8], HashMap>)> { + let mut script_language_word_count = HashMap::new(); - extract_tokens_from_document( - &obkv, - searchable_fields, - &tokenizer, - max_positions_per_attributes, - &mut buffers, - &mut script_language_word_count, - &mut docid_word_positions_sorter, - )?; + tokens_from_document( + obkv, + searchable_fields, + tokenizer, + max_positions_per_attributes, + del_add, + buffers, + &mut script_language_word_count, + )?; - // if we detect a potetial mistake in the language detection, - // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. - // context: https://github.com/meilisearch/meilisearch/issues/3565 - if script_language_word_count - .values() - .map(Vec::as_slice) - .any(potential_language_detection_error) - { - // build an allow list with the most frequent detected languages in the document. 
- let script_language: HashMap<_, _> = - script_language_word_count.iter().filter_map(most_frequent_languages).collect(); + // if we detect a potential mistake in the language detection, + // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. + // context: https://github.com/meilisearch/meilisearch/issues/3565 + if script_language_word_count + .values() + .map(Vec::as_slice) + .any(potential_language_detection_error) + { + // build an allow list with the most frequent detected languages in the document. + let script_language: HashMap<_, _> = + script_language_word_count.iter().filter_map(most_frequent_languages).collect(); - // if the allow list is empty, meaning that no Language is considered frequent, - // then we don't rerun the extraction. - if !script_language.is_empty() { - // build a new temporary tokenizer including the allow list. - let mut tokenizer_builder = TokenizerBuilder::new(); - if let Some(stop_words) = stop_words { - tokenizer_builder.stop_words(stop_words); - } - tokenizer_builder.allow_list(&script_language); - let tokenizer = tokenizer_builder.build(); + // if the allow list is empty, meaning that no Language is considered frequent, + // then we don't rerun the extraction. + if !script_language.is_empty() { + // build a new temporary tokenizer including the allow list. + let mut builder = tokenizer_builder( + stop_words, + allowed_separators, + dictionary, + Some(&script_language), + ); + let tokenizer = builder.build(); - script_language_word_count.clear(); + script_language_word_count.clear(); - // rerun the extraction.
- extract_tokens_from_document( - &obkv, - searchable_fields, - &tokenizer, - max_positions_per_attributes, - &mut buffers, - &mut script_language_word_count, - &mut docid_word_positions_sorter, - )?; - } - } - - for (script, languages_frequency) in script_language_word_count { - for (language, _) in languages_frequency { - let entry = script_language_docids - .entry((script, language)) - .or_insert_with(RoaringBitmap::new); - entry.push(document_id); - } + // rerun the extraction. + tokens_from_document( + obkv, + searchable_fields, + &tokenizer, + max_positions_per_attributes, + del_add, + buffers, + &mut script_language_word_count, + )?; } } - sorter_into_reader(docid_word_positions_sorter, indexer) - .map(|reader| (documents_ids, reader, script_language_docids)) + // returns a (KV>, HashMap>) + Ok((&buffers.obkv_buffer, script_language_word_count)) } -fn extract_tokens_from_document( +/// Extract words mapped with their positions of a document. +fn tokens_from_document<'a>( obkv: &KvReader, searchable_fields: &Option>, tokenizer: &Tokenizer, max_positions_per_attributes: u32, - buffers: &mut Buffers, + del_add: DelAdd, + buffers: &'a mut Buffers, script_language_word_count: &mut HashMap>, - docid_word_positions_sorter: &mut grenad::Sorter, -) -> Result<()> { +) -> Result<&'a [u8]> { + buffers.obkv_buffer.clear(); + let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { + // if field is searchable. if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { - let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - buffers.field_buffer.clear(); - if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { - let tokens = process_tokens(tokenizer.tokenize(field)) - .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); + // extract deletion or addition only. 
+ if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { + // parse json. + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - for (index, token) in tokens { - // if a language has been detected for the token, we update the counter. - if let Some(language) = token.language { - let script = token.script; - let entry = - script_language_word_count.entry(script).or_insert_with(Vec::new); - match entry.iter_mut().find(|(l, _)| *l == language) { - Some((_, n)) => *n += 1, - None => entry.push((language, 1)), + // prepare writing destination. + buffers.obkv_positions_buffer.clear(); + let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer); + + // convert json into a unique string. + buffers.field_buffer.clear(); + if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { + // create an iterator of token with their positions. + let tokens = process_tokens(tokenizer.tokenize(field)) + .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); + + for (index, token) in tokens { + // if a language has been detected for the token, we update the counter. + if let Some(language) = token.language { + let script = token.script; + let entry = + script_language_word_count.entry(script).or_insert_with(Vec::new); + match entry.iter_mut().find(|(l, _)| *l == language) { + Some((_, n)) => *n += 1, + None => entry.push((language, 1)), + } + } + + // keep a word only if it is not empty and fit in a LMDB key. 
+ let token = token.lemma().trim(); + if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { + let position: u16 = index + .try_into() + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + writer.insert(position, token.as_bytes())?; } } - let token = token.lemma().trim(); - if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { - buffers.key_buffer.truncate(mem::size_of::()); - buffers.key_buffer.extend_from_slice(token.as_bytes()); - let position: u16 = index - .try_into() - .map_err(|_| SerializationError::InvalidNumberSerialization)?; - let position = absolute_from_relative_position(field_id, position); - docid_word_positions_sorter - .insert(&buffers.key_buffer, position.to_ne_bytes())?; - } + // write positions into document. + let positions = writer.into_inner()?; + document_writer.insert(field_id, positions)?; } } } } - Ok(()) + // returns a KV> + Ok(document_writer.into_inner().map(|v| v.as_slice())?) } /// Transform a JSON value into a string that can be indexed. @@ -287,10 +439,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize) #[derive(Default)] struct Buffers { - // the key buffer is the concatenation of the internal document id with the field id. - // The buffer has to be completelly cleared between documents, - // and the field id part must be cleared between each field. - key_buffer: Vec, // the field buffer for each fields desserialization, and must be cleared between each field. field_buffer: String, + // buffer used to store the value data containing an obkv. + obkv_buffer: Vec, + // buffer used to store the value data containing an obkv of tokens with their positions. 
+ obkv_positions_buffer: Vec, } diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index d557e0b6c..f860aacba 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -4,11 +4,12 @@ use std::io::{self, BufReader}; use heed::{BytesDecode, BytesEncode}; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, + create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, }; +use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. @@ -17,7 +18,7 @@ use crate::Result; /// documents ids from the given chunk of docid facet number positions. #[logging_timer::time] pub fn extract_facet_number_docids( - docid_fid_facet_number: grenad::Reader, + fid_docid_facet_number: grenad::Reader, indexer: GrenadParameters, ) -> Result>> { puffin::profile_function!(); @@ -26,21 +27,30 @@ pub fn extract_facet_number_docids( let mut facet_number_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); - let mut cursor = docid_fid_facet_number.into_cursor()?; - while let Some((key_bytes, _)) = cursor.move_on_next()? { + let mut buffer = Vec::new(); + let mut cursor = fid_docid_facet_number.into_cursor()?; + while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? 
{ let (field_id, document_id, number) = FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); let key = FacetGroupKey { field_id, level: 0, left_bound: number }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); - facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() { + obkv.insert(deladd_key, document_id.to_ne_bytes())?; + } + obkv.finish()?; + + facet_number_docids_sorter.insert(key_bytes, &buffer)?; } sorter_into_reader(facet_number_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index b1b27449e..2ade776c3 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -1,13 +1,15 @@ use std::fs::File; -use std::io::{self, BufReader}; +use std::io::BufReader; +use std::{io, str}; use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::StrRefCodec; -use crate::update::index_documents::merge_cbo_roaring_bitmaps; -use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; +use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps; +use crate::{FieldId, Result}; /// Extracts the facet string and the documents ids where this facet string appear. 
/// @@ -24,15 +26,16 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); + let mut buffer = Vec::new(); let mut cursor = docid_fid_facet_string.into_cursor()?; - while let Some((key, _original_value_bytes)) = cursor.move_on_next()? { + while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); @@ -40,21 +43,17 @@ pub fn extract_facet_string_docids( try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); - let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?; - - let normalised_truncated_value: String; - if normalised_value.len() > MAX_FACET_VALUE_LENGTH { - normalised_truncated_value = normalised_value - .char_indices() - .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); - normalised_value = normalised_truncated_value.as_str(); - } - let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; + let normalized_value = str::from_utf8(normalized_value_bytes)?; + let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); - // document id is encoded in native-endian because of the CBO roaring bitmap codec - facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; + + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() { + obkv.insert(deladd_key, document_id.to_ne_bytes())?; + } + obkv.finish()?; + facet_string_docids_sorter.insert(&key_bytes, &buffer)?; } 
sorter_into_reader(facet_string_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 42c355323..3fcec3e79 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,24 +1,36 @@ +use std::borrow::Cow; use std::collections::{BTreeMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; use std::mem::size_of; +use std::result::Result as StdResult; +use grenad::Sorter; use heed::zerocopy::AsBytes; use heed::BytesEncode; +use itertools::EitherOrBoth; +use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::{from_slice, Value}; +use FilterableValues::{Empty, Null, Values}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; +use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{create_writer, writer_into_reader}; -use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; +use crate::{ + CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH, +}; + +/// The length of the elements that are always in the buffer when inserting new values. +const TRUNCATE_SIZE: usize = size_of::() + size_of::(); /// The extracted facet values stored in grenad files by type. 
pub struct ExtractedFacetValues { - pub docid_fid_facet_numbers_chunk: grenad::Reader>, - pub docid_fid_facet_strings_chunk: grenad::Reader>, + pub fid_docid_facet_numbers_chunk: grenad::Reader>, + pub fid_docid_facet_strings_chunk: grenad::Reader>, pub fid_facet_is_null_docids_chunk: grenad::Reader>, pub fid_facet_is_empty_docids_chunk: grenad::Reader>, pub fid_facet_exists_docids_chunk: grenad::Reader>, @@ -58,71 +70,150 @@ pub fn extract_fid_docid_facet_values( max_memory.map(|m| m / 2), ); - let mut facet_exists_docids = BTreeMap::::new(); - let mut facet_is_null_docids = BTreeMap::::new(); - let mut facet_is_empty_docids = BTreeMap::::new(); + // The tuples represents the Del and Add side for a bitmap + let mut facet_exists_docids = BTreeMap::::new(); + let mut facet_is_null_docids = BTreeMap::::new(); + let mut facet_is_empty_docids = BTreeMap::::new(); + + // We create two buffers for mutable ref issues with closures. + let mut numbers_key_buffer = Vec::new(); + let mut strings_key_buffer = Vec::new(); - let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? 
{ let obkv = obkv::KvReader::new(value); for (field_id, field_bytes) in obkv.iter() { if faceted_fields.contains(&field_id) { - key_buffer.clear(); + numbers_key_buffer.clear(); + strings_key_buffer.clear(); // Set key to the field_id // Note: this encoding is consistent with FieldIdCodec - key_buffer.extend_from_slice(&field_id.to_be_bytes()); + numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes()); + strings_key_buffer.extend_from_slice(&field_id.to_be_bytes()); - // Here, we know already that the document must be added to the “field id exists” database let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); let document = BEU32::from(document).get(); - facet_exists_docids.entry(field_id).or_default().insert(document); - // For the other extraction tasks, prefix the key with the field_id and the document_id - key_buffer.extend_from_slice(docid_bytes); + numbers_key_buffer.extend_from_slice(docid_bytes); + strings_key_buffer.extend_from_slice(docid_bytes); - let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + let del_add_obkv = obkv::KvReader::new(field_bytes); + let del_value = match del_add_obkv.get(DelAdd::Deletion) { + Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), + None => None, + }; + let add_value = match del_add_obkv.get(DelAdd::Addition) { + Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), + None => None, + }; - match extract_facet_values( - &value, - geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng), - ) { - FilterableValues::Null => { - facet_is_null_docids.entry(field_id).or_default().insert(document); - } - FilterableValues::Empty => { - facet_is_empty_docids.entry(field_id).or_default().insert(document); - } - FilterableValues::Values { numbers, strings } => { - // insert facet numbers in sorter - for number in numbers { - key_buffer.truncate(size_of::() + size_of::()); - if let Some(value_bytes) = f64_into_bytes(number) { - 
key_buffer.extend_from_slice(&value_bytes); - key_buffer.extend_from_slice(&number.to_be_bytes()); + // We insert the document id on the Del and the Add side if the field exists. + let (ref mut del_exists, ref mut add_exists) = + facet_exists_docids.entry(field_id).or_default(); + let (ref mut del_is_null, ref mut add_is_null) = + facet_is_null_docids.entry(field_id).or_default(); + let (ref mut del_is_empty, ref mut add_is_empty) = + facet_is_empty_docids.entry(field_id).or_default(); - fid_docid_facet_numbers_sorter - .insert(&key_buffer, ().as_bytes())?; - } + if del_value.is_some() { + del_exists.insert(document); + } + if add_value.is_some() { + add_exists.insert(document); + } + + let geo_support = + geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng); + let del_filterable_values = + del_value.map(|value| extract_facet_values(&value, geo_support)); + let add_filterable_values = + add_value.map(|value| extract_facet_values(&value, geo_support)); + + // Those closures are just here to simplify things a bit. 
+ let mut insert_numbers_diff = |del_numbers, add_numbers| { + insert_numbers_diff( + &mut fid_docid_facet_numbers_sorter, + &mut numbers_key_buffer, + del_numbers, + add_numbers, + ) + }; + let mut insert_strings_diff = |del_strings, add_strings| { + insert_strings_diff( + &mut fid_docid_facet_strings_sorter, + &mut strings_key_buffer, + del_strings, + add_strings, + ) + }; + + match (del_filterable_values, add_filterable_values) { + (None, None) => (), + (Some(del_filterable_values), None) => match del_filterable_values { + Null => { + del_is_null.insert(document); } - - // insert normalized and original facet string in sorter - for (normalized, original) in - strings.into_iter().filter(|(n, _)| !n.is_empty()) - { - let normalized_truncated_value: String = normalized - .char_indices() - .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); - - key_buffer.truncate(size_of::() + size_of::()); - key_buffer.extend_from_slice(normalized_truncated_value.as_bytes()); - fid_docid_facet_strings_sorter - .insert(&key_buffer, original.as_bytes())?; + Empty => { + del_is_empty.insert(document); + } + Values { numbers, strings } => { + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + }, + (None, Some(add_filterable_values)) => match add_filterable_values { + Null => { + add_is_null.insert(document); + } + Empty => { + add_is_empty.insert(document); + } + Values { numbers, strings } => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + } + }, + (Some(del_filterable_values), Some(add_filterable_values)) => { + match (del_filterable_values, add_filterable_values) { + (Null, Null) | (Empty, Empty) => (), + (Null, Empty) => { + del_is_null.insert(document); + add_is_empty.insert(document); + } + (Empty, Null) => { + del_is_empty.insert(document); + add_is_null.insert(document); + } + (Null, Values { numbers, strings }) => { + insert_numbers_diff(vec![], numbers)?; + 
insert_strings_diff(vec![], strings)?; + del_is_null.insert(document); + } + (Empty, Values { numbers, strings }) => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + del_is_empty.insert(document); + } + (Values { numbers, strings }, Null) => { + add_is_null.insert(document); + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + (Values { numbers, strings }, Empty) => { + add_is_empty.insert(document); + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + ( + Values { numbers: del_numbers, strings: del_strings }, + Values { numbers: add_numbers, strings: add_strings }, + ) => { + insert_numbers_diff(del_numbers, add_numbers)?; + insert_strings_diff(del_strings, add_strings)?; + } } } } @@ -130,14 +221,15 @@ pub fn extract_fid_docid_facet_values( } } + let mut buffer = Vec::new(); let mut facet_exists_docids_writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, tempfile::tempfile()?, ); - for (fid, bitmap) in facet_exists_docids.into_iter() { - let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); - facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() { + deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; + facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?; } let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; @@ -146,9 +238,9 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_level, tempfile::tempfile()?, ); - for (fid, bitmap) in facet_is_null_docids.into_iter() { - let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); - facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() { + deladd_obkv_cbo_roaring_bitmaps(&mut buffer, 
&del_bitmap, &add_bitmap)?; + facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?; } let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; @@ -157,21 +249,156 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_level, tempfile::tempfile()?, ); - for (fid, bitmap) in facet_is_empty_docids.into_iter() { - let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); - facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() { + deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; + facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?; } let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?; Ok(ExtractedFacetValues { - docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, - docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, + fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, fid_facet_is_null_docids_chunk: facet_is_null_docids_reader, fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader, fid_facet_exists_docids_chunk: facet_exists_docids_reader, }) } +/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps. 
+fn deladd_obkv_cbo_roaring_bitmaps( + buffer: &mut Vec, + del_bitmap: &RoaringBitmap, + add_bitmap: &RoaringBitmap, +) -> io::Result<()> { + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(buffer); + let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap(); + let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap(); + obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?; + obkv.insert(DelAdd::Addition, add_bitmap_bytes)?; + obkv.finish() +} + +/// Truncates a string to the biggest valid LMDB key size. +fn truncate_string(s: String) -> String { + s.char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect() +} + +/// Computes the diff between both Del and Add numbers and +/// only inserts the parts that differ in the sorter. +fn insert_numbers_diff( + fid_docid_facet_numbers_sorter: &mut Sorter, + key_buffer: &mut Vec, + mut del_numbers: Vec, + mut add_numbers: Vec, +) -> Result<()> +where + MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, Error>, +{ + // We sort and dedup the float numbers + del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); + add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); + del_numbers.dedup_by_key(|f| OrderedFloat(*f)); + add_numbers.dedup_by_key(|f| OrderedFloat(*f)); + + let merged_numbers_iter = itertools::merge_join_by( + del_numbers.into_iter().map(OrderedFloat), + add_numbers.into_iter().map(OrderedFloat), + |del, add| del.cmp(add), + ); + + // insert facet numbers in sorter + for eob in merged_numbers_iter { + key_buffer.truncate(TRUNCATE_SIZE); + match eob { + EitherOrBoth::Both(_, _) => (), // no need to touch anything + EitherOrBoth::Left(OrderedFloat(number)) => { + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); + + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those numbers. 
+ let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, ().as_bytes())?; + let bytes = obkv.into_inner()?; + fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?; + } + } + EitherOrBoth::Right(OrderedFloat(number)) => { + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); + + // We insert only the Add part of the Obkv to inform + // that we only want to add all those numbers. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, ().as_bytes())?; + let bytes = obkv.into_inner()?; + fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?; + } + } + } + } + + Ok(()) +} + +/// Computes the diff between both Del and Add strings and +/// only inserts the parts that differ in the sorter. +fn insert_strings_diff( + fid_docid_facet_strings_sorter: &mut Sorter, + key_buffer: &mut Vec, + mut del_strings: Vec<(String, String)>, + mut add_strings: Vec<(String, String)>, +) -> Result<()> +where + MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, Error>, +{ + // We sort and dedup the normalized and original strings + del_strings.sort_unstable(); + add_strings.sort_unstable(); + del_strings.dedup(); + add_strings.dedup(); + + let merged_strings_iter = itertools::merge_join_by( + del_strings.into_iter().filter(|(n, _)| !n.is_empty()), + add_strings.into_iter().filter(|(n, _)| !n.is_empty()), + |del, add| del.cmp(add), + ); + + // insert normalized and original facet string in sorter + for eob in merged_strings_iter { + key_buffer.truncate(TRUNCATE_SIZE); + match eob { + EitherOrBoth::Both(_, _) => (), // no need to touch anything + EitherOrBoth::Left((normalized, original)) => { + let truncated = truncate_string(normalized); + key_buffer.extend_from_slice(truncated.as_bytes()); + + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, original)?; + let bytes = obkv.into_inner()?; + 
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; + } + EitherOrBoth::Right((normalized, original)) => { + let truncated = truncate_string(normalized); + key_buffer.extend_from_slice(truncated.as_bytes()); + + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, original)?; + let bytes = obkv.into_inner()?; + fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; + } + } + } + + Ok(()) +} + /// Represent what a document field contains. enum FilterableValues { /// Corresponds to the JSON `null` value. @@ -182,6 +409,7 @@ enum FilterableValues { Values { numbers: Vec, strings: Vec<(String, String)> }, } +/// Extracts the facet values of a JSON field. fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues { fn inner_extract_facet_values( value: &Value, diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 92564b4cd..182d0c5d8 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -1,16 +1,18 @@ -use std::collections::HashMap; use std::fs::File; use std::io::{self, BufReader}; -use grenad::Sorter; +use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, MergeFn, + create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, + GrenadParameters, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{relative_from_absolute_position, DocumentId, FieldId, Result}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::Result; + +const MAX_COUNTED_WORDS: usize = 30; /// Extracts the field id word count and the documents ids where /// this field id with this amount of words appear. 
@@ -28,70 +30,62 @@ pub fn extract_fid_word_count_docids( let mut fid_word_count_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); - // This map is assumed to not consume a lot of memory. - let mut document_fid_wordcount = HashMap::new(); - let mut current_document_id = None; - + let mut key_buffer = Vec::new(); + let mut value_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { - let (document_id_bytes, _word_bytes) = try_split_array_at(key) + let (document_id_bytes, fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - let curr_document_id = *current_document_id.get_or_insert(document_id); - if curr_document_id != document_id { - drain_document_fid_wordcount_into_sorter( - &mut fid_word_count_docids_sorter, - &mut document_fid_wordcount, - curr_document_id, - )?; - current_document_id = Some(document_id); + let del_add_reader = KvReaderDelAdd::new(value); + let deletion = del_add_reader + // get deleted words + .get(DelAdd::Deletion) + // count deleted words + .map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count()) + // keep the count if under or equal to MAX_COUNTED_WORDS + .filter(|&word_count| word_count <= MAX_COUNTED_WORDS); + let addition = del_add_reader + // get added words + .get(DelAdd::Addition) + // count added words + .map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count()) + // keep the count if under or equal to MAX_COUNTED_WORDS + .filter(|&word_count| word_count <= MAX_COUNTED_WORDS); + + if deletion != addition { + // Insert deleted word count in sorter if exist. 
+ if let Some(word_count) = deletion { + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + key_buffer.clear(); + key_buffer.extend_from_slice(fid_bytes); + key_buffer.push(word_count as u8); + fid_word_count_docids_sorter + .insert(&key_buffer, value_writer.into_inner().unwrap())?; + } + // Insert added word count in sorter if exist. + if let Some(word_count) = addition { + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key_buffer.clear(); + key_buffer.extend_from_slice(fid_bytes); + key_buffer.push(word_count as u8); + fid_word_count_docids_sorter + .insert(&key_buffer, value_writer.into_inner().unwrap())?; + } } - - for position in read_u32_ne_bytes(value) { - let (field_id, _) = relative_from_absolute_position(position); - - let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); - *value += 1; - } - } - - if let Some(document_id) = current_document_id { - // We must make sure that don't lose the current document field id - // word count map if we break because we reached the end of the chunk. 
- drain_document_fid_wordcount_into_sorter( - &mut fid_word_count_docids_sorter, - &mut document_fid_wordcount, - document_id, - )?; } sorter_into_reader(fid_word_count_docids_sorter, indexer) } - -fn drain_document_fid_wordcount_into_sorter( - fid_word_count_docids_sorter: &mut Sorter, - document_fid_wordcount: &mut HashMap, - document_id: DocumentId, -) -> Result<()> { - let mut key_buffer = Vec::new(); - - for (fid, count) in document_fid_wordcount.drain() { - if count <= 30 { - key_buffer.clear(); - key_buffer.extend_from_slice(&fid.to_be_bytes()); - key_buffer.push(count as u8); - - fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; - } - } - - Ok(()) -} diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 285a4bdba..5ee7967d2 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -6,6 +6,7 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::GeoError; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::extract_finite_float_from_value; use crate::{FieldId, InternalError, Result}; @@ -30,39 +31,71 @@ pub fn extract_geo_points( let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? 
{ let obkv = obkv::KvReader::new(value); - // since we only needs the primary key when we throw an error we create this getter to - // lazily get it when needed + // since we only need the primary key when we throw an error + // we create this getter to lazily get it when needed let document_id = || -> Value { let document_id = obkv.get(primary_key_id).unwrap(); serde_json::from_slice(document_id).unwrap() }; // first we get the two fields - let lat = obkv.get(lat_fid); - let lng = obkv.get(lng_fid); + match (obkv.get(lat_fid), obkv.get(lng_fid)) { + (Some(lat), Some(lng)) => { + let deladd_lat_obkv = KvReaderDelAdd::new(lat); + let deladd_lng_obkv = KvReaderDelAdd::new(lng); - if let Some((lat, lng)) = lat.zip(lng) { - // then we extract the values - let lat = extract_finite_float_from_value( - serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, - ) - .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; + // then we extract the values + let del_lat_lng = deladd_lat_obkv + .get(DelAdd::Deletion) + .zip(deladd_lng_obkv.get(DelAdd::Deletion)) + .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id)) + .transpose()?; + let add_lat_lng = deladd_lat_obkv + .get(DelAdd::Addition) + .zip(deladd_lng_obkv.get(DelAdd::Addition)) + .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id)) + .transpose()?; - let lng = extract_finite_float_from_value( - serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, - ) - .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; - - #[allow(clippy::drop_non_drop)] - let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; - writer.insert(docid_bytes, bytes)?; - } else if lat.is_none() && lng.is_some() { - return Err(GeoError::MissingLatitude { document_id: document_id() })?; - } else if lat.is_some() && lng.is_none() { - return Err(GeoError::MissingLongitude { document_id: document_id() })?; + if del_lat_lng != add_lat_lng { + let mut obkv 
= KvWriterDelAdd::memory(); + if let Some([lat, lng]) = del_lat_lng { + #[allow(clippy::drop_non_drop)] + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; + obkv.insert(DelAdd::Deletion, bytes)?; + } + if let Some([lat, lng]) = add_lat_lng { + #[allow(clippy::drop_non_drop)] + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; + obkv.insert(DelAdd::Addition, bytes)?; + } + let bytes = obkv.into_inner()?; + writer.insert(docid_bytes, bytes)?; + } + } + (None, Some(_)) => { + return Err(GeoError::MissingLatitude { document_id: document_id() }.into()) + } + (Some(_), None) => { + return Err(GeoError::MissingLongitude { document_id: document_id() }.into()) + } + (None, None) => (), } - // else => the _geo object was `null`, there is nothing to do } writer_into_reader(writer) } + +/// Extract the finite floats lat and lng from two bytes slices. +fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> { + let lat = extract_finite_float_from_value( + serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, + ) + .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; + + let lng = extract_finite_float_from_value( + serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, + ) + .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; + + Ok([lat, lng]) +} diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 863bc07c3..317a9aec3 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -1,13 +1,24 @@ +use std::cmp::Ordering; use std::convert::TryFrom; use std::fs::File; -use std::io::{self, BufReader}; +use std::io::{self, BufReader, BufWriter}; +use std::mem::size_of; +use std::str::from_utf8; use bytemuck::cast_slice; +use 
grenad::Writer; +use itertools::EitherOrBoth; +use ordered_float::OrderedFloat; use serde_json::{from_slice, Value}; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::UserError; -use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::index_documents::helpers::try_split_at; +use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors}; + +/// The length of the elements that are always in the buffer when inserting new values. +const TRUNCATE_SIZE: usize = size_of::(); /// Extracts the embedding vector contained in each document under the `_vectors` field. /// @@ -16,7 +27,6 @@ use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors}; pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, - primary_key_id: FieldId, vectors_fid: FieldId, ) -> Result>> { puffin::profile_function!(); @@ -27,43 +37,112 @@ pub fn extract_vector_points( tempfile::tempfile()?, ); + let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; - while let Some((docid_bytes, value)) = cursor.move_on_next()? { + while let Some((key, value)) = cursor.move_on_next()? 
{ + // this must always be serialized as (docid, external_docid); + let (docid_bytes, external_id_bytes) = + try_split_at(key, std::mem::size_of::()).unwrap(); + debug_assert!(from_utf8(external_id_bytes).is_ok()); + let obkv = obkv::KvReader::new(value); + key_buffer.clear(); + key_buffer.extend_from_slice(docid_bytes); // since we only needs the primary key when we throw an error we create this getter to // lazily get it when needed - let document_id = || -> Value { - let document_id = obkv.get(primary_key_id).unwrap(); - from_slice(document_id).unwrap() - }; + let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; // first we retrieve the _vectors field - if let Some(vectors) = obkv.get(vectors_fid) { - // extract the vectors - let vectors = match from_slice(vectors) { - Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors), - Err(_) => { - return Err(UserError::InvalidVectorsType { - document_id: document_id(), - value: from_slice(vectors).map_err(InternalError::SerdeJson)?, - } - .into()) - } - }; + if let Some(value) = obkv.get(vectors_fid) { + let vectors_obkv = KvReaderDelAdd::new(value); - if let Some(vectors) = vectors { - for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) { - let index = u16::try_from(i).unwrap(); - let mut key = docid_bytes.to_vec(); - key.extend_from_slice(&index.to_be_bytes()); - let bytes = cast_slice(&vector); - writer.insert(key, bytes)?; - } - } + // then we extract the values + let del_vectors = vectors_obkv + .get(DelAdd::Deletion) + .map(|vectors| extract_vectors(vectors, document_id)) + .transpose()? + .flatten(); + let add_vectors = vectors_obkv + .get(DelAdd::Addition) + .map(|vectors| extract_vectors(vectors, document_id)) + .transpose()? 
+ .flatten(); + + // and we finally push the unique vectors into the writer + push_vectors_diff( + &mut writer, + &mut key_buffer, + del_vectors.unwrap_or_default(), + add_vectors.unwrap_or_default(), + )?; } - // else => the `_vectors` object was `null`, there is nothing to do } writer_into_reader(writer) } + +/// Computes the diff between both Del and Add vectors and +/// only inserts the parts that differ in the writer. +fn push_vectors_diff( + writer: &mut Writer>, + key_buffer: &mut Vec, + mut del_vectors: Vec>, + mut add_vectors: Vec>, +) -> Result<()> { + // We sort and dedup the vectors + del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); + add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); + del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); + add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); + + let merged_vectors_iter = + itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); + + // insert vectors into the writer + for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { + // Generate the key by extending the unique index to it. + key_buffer.truncate(TRUNCATE_SIZE); + let index = u16::try_from(i).unwrap(); + key_buffer.extend_from_slice(&index.to_be_bytes()); + + match eob { + EitherOrBoth::Both(_, _) => (), // no need to touch anything + EitherOrBoth::Left(vector) => { + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those vectors. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + writer.insert(&key_buffer, bytes)?; + } + EitherOrBoth::Right(vector) => { + // We insert only the Add part of the Obkv to inform + // that we only want to add all those vectors. 
+ let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + writer.insert(&key_buffer, bytes)?; + } + } + } + + Ok(()) +} + +/// Compares two vectors by using the OrderedFloat helper. +fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering { + a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat)) +} + +/// Extracts the vectors from a JSON value. +fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result>>> { + match from_slice(value) { + Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)), + Err(_) => Err(UserError::InvalidVectorsType { + document_id: document_id(), + value: from_slice(value).map_err(InternalError::SerdeJson)?, + } + .into()), + } +} diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index f211f7023..f278012c7 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -1,18 +1,20 @@ -use std::collections::HashSet; +use std::collections::{BTreeSet, HashSet}; use std::fs::File; use std::io::{self, BufReader}; -use std::iter::FromIterator; -use roaring::RoaringBitmap; +use heed::BytesDecode; +use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, - try_split_array_at, GrenadParameters, + create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, + try_split_array_at, writer_into_reader, GrenadParameters, }; use crate::error::SerializationError; +use crate::heed_codec::StrBEU16Codec; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::update::index_documents::helpers::read_u32_ne_bytes; -use crate::{relative_from_absolute_position, FieldId, Result}; +use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, 
KvWriterDelAdd}; +use crate::update::MergeFn; +use crate::{DocumentId, FieldId, Result}; /// Extracts the word and the documents ids where this word appear. /// @@ -26,65 +28,152 @@ pub fn extract_word_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, exact_attributes: &HashSet, -) -> Result<(grenad::Reader>, grenad::Reader>)> { +) -> Result<( + grenad::Reader>, + grenad::Reader>, + grenad::Reader>, +)> { puffin::profile_function!(); let max_memory = indexer.max_memory_by_thread(); - let mut word_docids_sorter = create_sorter( + let mut word_fid_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|x| x / 2), + max_memory.map(|x| x / 3), + ); + let mut key_buffer = Vec::new(); + let mut del_words = BTreeSet::new(); + let mut add_words = BTreeSet::new(); + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let (document_id_bytes, fid_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let (fid_bytes, _) = try_split_array_at(fid_bytes) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = u32::from_be_bytes(document_id_bytes); + let fid = u16::from_be_bytes(fid_bytes); + + let del_add_reader = KvReaderDelAdd::new(value); + // extract all unique words to remove. + if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { + for (_pos, word) in KvReaderU16::new(deletion).iter() { + del_words.insert(word.to_vec()); + } + } + + // extract all unique additional words. 
+ if let Some(addition) = del_add_reader.get(DelAdd::Addition) { + for (_pos, word) in KvReaderU16::new(addition).iter() { + add_words.insert(word.to_vec()); + } + } + + words_into_sorter( + document_id, + fid, + &mut key_buffer, + &del_words, + &add_words, + &mut word_fid_docids_sorter, + )?; + + del_words.clear(); + add_words.clear(); + } + + let mut word_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_deladd_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|x| x / 3), ); let mut exact_word_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|x| x / 2), + max_memory.map(|x| x / 3), ); - let mut value_buffer = Vec::new(); - let mut cursor = docid_word_positions.into_cursor()?; - while let Some((key, positions)) = cursor.move_on_next()? { - let (document_id_bytes, word_bytes) = try_split_array_at(key) + let mut word_fid_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?; + // TODO: replace sorters by writers by accumulating values into a buffer before inserting them. + while let Some((key, value)) = iter.next()? { + // only keep the value if there is a change to apply in the DB. 
+ if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) { + word_fid_docids_writer.insert(key, value)?; + } + + let (word, fid) = StrBEU16Codec::bytes_decode(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - let document_id = u32::from_be_bytes(document_id_bytes); - let bitmap = RoaringBitmap::from_iter(Some(document_id)); - serialize_roaring_bitmap(&bitmap, &mut value_buffer)?; - - // If there are no exact attributes, we do not need to iterate over positions. - if exact_attributes.is_empty() { - word_docids_sorter.insert(word_bytes, &value_buffer)?; + // every words contained in an attribute set to exact must be pushed in the exact_words list. + if exact_attributes.contains(&fid) { + exact_word_docids_sorter.insert(word.as_bytes(), value)?; } else { - let mut added_to_exact = false; - let mut added_to_word_docids = false; - for position in read_u32_ne_bytes(positions) { - // as soon as we know that this word had been to both readers, we don't need to - // iterate over the positions. 
- if added_to_exact && added_to_word_docids { - break; - } - let (fid, _) = relative_from_absolute_position(position); - if exact_attributes.contains(&fid) && !added_to_exact { - exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; - added_to_exact = true; - } else if !added_to_word_docids { - word_docids_sorter.insert(word_bytes, &value_buffer)?; - added_to_word_docids = true; - } - } + word_docids_sorter.insert(word.as_bytes(), value)?; } } Ok(( sorter_into_reader(word_docids_sorter, indexer)?, sorter_into_reader(exact_word_docids_sorter, indexer)?, + writer_into_reader(word_fid_docids_writer)?, )) } + +fn words_into_sorter( + document_id: DocumentId, + fid: FieldId, + key_buffer: &mut Vec, + del_words: &BTreeSet>, + add_words: &BTreeSet>, + word_fid_docids_sorter: &mut grenad::Sorter, +) -> Result<()> { + puffin::profile_function!(); + + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + let mut buffer = Vec::new(); + for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) { + buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut buffer); + let word_bytes = match eob { + Left(word_bytes) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + word_bytes + } + Right(word_bytes) => { + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + word_bytes + } + Both(word_bytes, _) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + word_bytes + } + }; + + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + key_buffer.push(0); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?; + } + + Ok(()) +} diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs 
deleted file mode 100644 index 09f571038..000000000 --- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs +++ /dev/null @@ -1,51 +0,0 @@ -use std::fs::File; -use std::io::{self, BufReader}; - -use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, -}; -use crate::error::SerializationError; -use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{relative_from_absolute_position, DocumentId, Result}; - -/// Extracts the word, field id, and the documents ids where this word appear at this field id. -#[logging_timer::time] -pub fn extract_word_fid_docids( - docid_word_positions: grenad::Reader, - indexer: GrenadParameters, -) -> Result>> { - puffin::profile_function!(); - - let max_memory = indexer.max_memory_by_thread(); - - let mut word_fid_docids_sorter = create_sorter( - grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ); - - let mut key_buffer = Vec::new(); - let mut cursor = docid_word_positions.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? 
{ - let (document_id_bytes, word_bytes) = try_split_array_at(key) - .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - let document_id = DocumentId::from_be_bytes(document_id_bytes); - - for position in read_u32_ne_bytes(value) { - key_buffer.clear(); - key_buffer.extend_from_slice(word_bytes); - key_buffer.push(0); - let (fid, _) = relative_from_absolute_position(position); - key_buffer.extend_from_slice(&fid.to_be_bytes()); - word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; - } - } - - let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; - - Ok(word_fid_docids_reader) -} diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 9ddd5ff4c..b8a377247 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -1,16 +1,18 @@ -use std::cmp::Ordering; -use std::collections::{BinaryHeap, HashMap}; +use std::collections::{BTreeMap, VecDeque}; use std::fs::File; use std::io::BufReader; -use std::{cmp, io, mem, str, vec}; +use std::{cmp, io}; + +use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, MergeFn, + create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at, + writer_into_reader, GrenadParameters, MergeFn, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::proximity::{positions_proximity, MAX_DISTANCE}; +use crate::proximity::{index_proximity, MAX_DISTANCE}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::{DocumentId, Result}; /// Extracts the best proximity between pairs of words and the documents ids where this pair appear. 
@@ -26,58 +28,137 @@ pub fn extract_word_pair_proximity_docids( let max_memory = indexer.max_memory_by_thread(); - let mut word_pair_proximity_docids_sorter = create_sorter( - grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory.map(|m| m / 2), - ); + let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE) + .map(|_| { + create_sorter( + grenad::SortAlgorithm::Unstable, + merge_deladd_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / MAX_DISTANCE as usize), + ) + }) + .collect(); - // This map is assumed to not consume a lot of memory. - let mut document_word_positions_heap = BinaryHeap::new(); + let mut del_word_positions: VecDeque<(String, u16)> = + VecDeque::with_capacity(MAX_DISTANCE as usize); + let mut add_word_positions: VecDeque<(String, u16)> = + VecDeque::with_capacity(MAX_DISTANCE as usize); + let mut del_word_pair_proximity = BTreeMap::new(); + let mut add_word_pair_proximity = BTreeMap::new(); let mut current_document_id = None; let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ - let (document_id_bytes, word_bytes) = try_split_array_at(key) + let (document_id_bytes, _fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - let word = str::from_utf8(word_bytes)?; - let curr_document_id = *current_document_id.get_or_insert(document_id); - if curr_document_id != document_id { - let document_word_positions_heap = mem::take(&mut document_word_positions_heap); + // if we change document, we fill the sorter + if current_document_id.map_or(false, |id| id != document_id) { + puffin::profile_scope!("Document into sorter"); + document_word_positions_into_sorter( - curr_document_id, - document_word_positions_heap, - &mut word_pair_proximity_docids_sorter, + current_document_id.unwrap(), + &del_word_pair_proximity, + &add_word_pair_proximity, + &mut word_pair_proximity_docids_sorters, )?; - current_document_id = Some(document_id); + del_word_pair_proximity.clear(); + add_word_pair_proximity.clear(); } - let word = word.to_string(); - let mut positions: Vec<_> = read_u32_ne_bytes(value).collect(); - positions.sort_unstable(); - let mut iter = positions.into_iter(); - if let Some(position) = iter.next() { - document_word_positions_heap.push(PeekedWordPosition { word, position, iter }); - } + current_document_id = Some(document_id); + + let (del, add): (Result<_>, Result<_>) = rayon::join( + || { + // deletions + if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) { + for (position, word) in KvReaderU16::new(deletion).iter() { + // drain the proximity window until the head word is considered close to the word we are inserting. + while del_word_positions.get(0).map_or(false, |(_w, p)| { + index_proximity(*p as u32, position as u32) >= MAX_DISTANCE + }) { + word_positions_into_word_pair_proximity( + &mut del_word_positions, + &mut del_word_pair_proximity, + )?; + } + + // insert the new word. 
+ let word = std::str::from_utf8(word)?; + del_word_positions.push_back((word.to_string(), position)); + } + + while !del_word_positions.is_empty() { + word_positions_into_word_pair_proximity( + &mut del_word_positions, + &mut del_word_pair_proximity, + )?; + } + } + + Ok(()) + }, + || { + // additions + if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) { + for (position, word) in KvReaderU16::new(addition).iter() { + // drain the proximity window until the head word is considered close to the word we are inserting. + while add_word_positions.get(0).map_or(false, |(_w, p)| { + index_proximity(*p as u32, position as u32) >= MAX_DISTANCE + }) { + word_positions_into_word_pair_proximity( + &mut add_word_positions, + &mut add_word_pair_proximity, + )?; + } + + // insert the new word. + let word = std::str::from_utf8(word)?; + add_word_positions.push_back((word.to_string(), position)); + } + + while !add_word_positions.is_empty() { + word_positions_into_word_pair_proximity( + &mut add_word_positions, + &mut add_word_pair_proximity, + )?; + } + } + + Ok(()) + }, + ); + + del?; + add?; } if let Some(document_id) = current_document_id { - // We must make sure that don't lose the current document field id - // word count map if we break because we reached the end of the chunk. 
- let document_word_positions_heap = mem::take(&mut document_word_positions_heap); + puffin::profile_scope!("Final document into sorter"); document_word_positions_into_sorter( document_id, - document_word_positions_heap, - &mut word_pair_proximity_docids_sorter, + &del_word_pair_proximity, + &add_word_pair_proximity, + &mut word_pair_proximity_docids_sorters, )?; } + { + puffin::profile_scope!("sorter_into_reader"); + let mut writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); - sorter_into_reader(word_pair_proximity_docids_sorter, indexer) + for sorter in word_pair_proximity_docids_sorters { + sorter.write_into_stream_writer(&mut writer)?; + } + + writer_into_reader(writer) + } } /// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive. @@ -86,96 +167,66 @@ pub fn extract_word_pair_proximity_docids( /// close to each other. fn document_word_positions_into_sorter( document_id: DocumentId, - mut word_positions_heap: BinaryHeap>>, - word_pair_proximity_docids_sorter: &mut grenad::Sorter, + del_word_pair_proximity: &BTreeMap<(String, String), u8>, + add_word_pair_proximity: &BTreeMap<(String, String), u8>, + word_pair_proximity_docids_sorters: &mut [grenad::Sorter], ) -> Result<()> { - let mut word_pair_proximity = HashMap::new(); - let mut ordered_peeked_word_positions = Vec::new(); - while !word_positions_heap.is_empty() { - while let Some(peeked_word_position) = word_positions_heap.pop() { - ordered_peeked_word_positions.push(peeked_word_position); - if ordered_peeked_word_positions.len() == 7 { - break; - } - } - - if let Some((head, tail)) = ordered_peeked_word_positions.split_first() { - for PeekedWordPosition { word, position, .. 
} in tail { - let prox = positions_proximity(head.position, *position); - if prox > 0 && prox < MAX_DISTANCE { - word_pair_proximity - .entry((head.word.clone(), word.clone())) - .and_modify(|p| { - *p = cmp::min(*p, prox); - }) - .or_insert(prox); - } - } - - // Push the tail in the heap. - let tail_iter = ordered_peeked_word_positions.drain(1..); - word_positions_heap.extend(tail_iter); - - // Advance the head and push it in the heap. - if let Some(mut head) = ordered_peeked_word_positions.pop() { - if let Some(next_position) = head.iter.next() { - let prox = positions_proximity(head.position, next_position); - - if prox > 0 && prox < MAX_DISTANCE { - word_pair_proximity - .entry((head.word.clone(), head.word.clone())) - .and_modify(|p| { - *p = cmp::min(*p, prox); - }) - .or_insert(prox); - } - - word_positions_heap.push(PeekedWordPosition { - word: head.word, - position: next_position, - iter: head.iter, - }); - } - } - } - } + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + let mut buffer = Vec::new(); let mut key_buffer = Vec::new(); - for ((w1, w2), prox) in word_pair_proximity { + for eob in + merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| { + d.cmp(a) + }) + { + buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut buffer); + let ((w1, w2), prox) = match eob { + Left(key_value) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + key_value + } + Right(key_value) => { + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key_value + } + Both(key_value, _) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key_value + } + }; + key_buffer.clear(); - key_buffer.push(prox as u8); + key_buffer.push(*prox); key_buffer.extend_from_slice(w1.as_bytes()); key_buffer.push(0); key_buffer.extend_from_slice(w2.as_bytes()); 
- word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + word_pair_proximity_docids_sorters[*prox as usize - 1] + .insert(&key_buffer, value_writer.into_inner().unwrap())?; } Ok(()) } -struct PeekedWordPosition { - word: String, - position: u32, - iter: I, -} - -impl Ord for PeekedWordPosition { - fn cmp(&self, other: &Self) -> Ordering { - self.position.cmp(&other.position).reverse() - } -} - -impl PartialOrd for PeekedWordPosition { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Eq for PeekedWordPosition {} - -impl PartialEq for PeekedWordPosition { - fn eq(&self, other: &Self) -> bool { - self.position == other.position +fn word_positions_into_word_pair_proximity( + word_positions: &mut VecDeque<(String, u16)>, + word_pair_proximity: &mut BTreeMap<(String, String), u8>, +) -> Result<()> { + let (head_word, head_position) = word_positions.pop_front().unwrap(); + for (word, position) in word_positions.iter() { + let prox = index_proximity(head_position as u32, *position as u32) as u8; + if prox > 0 && prox < MAX_DISTANCE as u8 { + word_pair_proximity + .entry((head_word.clone(), word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + } } + Ok(()) } diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 94139ddf8..89b77d140 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -1,13 +1,18 @@ +use std::collections::BTreeSet; use std::fs::File; use std::io::{self, BufReader}; +use obkv::KvReaderU16; + use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, + create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, + 
GrenadParameters, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::MergeFn; +use crate::{bucketed_position, DocumentId, Result}; /// Extracts the word positions and the documents ids where this word appear. /// @@ -24,32 +29,111 @@ pub fn extract_word_position_docids( let mut word_position_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); + let mut del_word_positions: BTreeSet<(u16, Vec)> = BTreeSet::new(); + let mut add_word_positions: BTreeSet<(u16, Vec)> = BTreeSet::new(); + let mut current_document_id: Option = None; let mut key_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ - let (document_id_bytes, word_bytes) = try_split_array_at(key) + let (document_id_bytes, _fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = DocumentId::from_be_bytes(document_id_bytes); - for position in read_u32_ne_bytes(value) { - key_buffer.clear(); - key_buffer.extend_from_slice(word_bytes); - key_buffer.push(0); - let (_, position) = relative_from_absolute_position(position); - let position = bucketed_position(position); - key_buffer.extend_from_slice(&position.to_be_bytes()); - word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + if current_document_id.map_or(false, |id| document_id != id) { + words_position_into_sorter( + current_document_id.unwrap(), + &mut key_buffer, + &del_word_positions, + &add_word_positions, + &mut word_position_docids_sorter, + )?; + del_word_positions.clear(); + add_word_positions.clear(); + } + + current_document_id = Some(document_id); + + let del_add_reader = KvReaderDelAdd::new(value); + // extract all unique words to remove. + if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { + for (position, word_bytes) in KvReaderU16::new(deletion).iter() { + let position = bucketed_position(position); + del_word_positions.insert((position, word_bytes.to_vec())); + } + } + + // extract all unique additional words. 
+ if let Some(addition) = del_add_reader.get(DelAdd::Addition) { + for (position, word_bytes) in KvReaderU16::new(addition).iter() { + let position = bucketed_position(position); + add_word_positions.insert((position, word_bytes.to_vec())); + } } } + if let Some(document_id) = current_document_id { + words_position_into_sorter( + document_id, + &mut key_buffer, + &del_word_positions, + &add_word_positions, + &mut word_position_docids_sorter, + )?; + } + + // TODO remove noop DelAdd OBKV let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?; Ok(word_position_docids_reader) } + +fn words_position_into_sorter( + document_id: DocumentId, + key_buffer: &mut Vec, + del_word_positions: &BTreeSet<(u16, Vec)>, + add_word_positions: &BTreeSet<(u16, Vec)>, + word_position_docids_sorter: &mut grenad::Sorter, +) -> Result<()> { + puffin::profile_function!(); + + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + let mut buffer = Vec::new(); + for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a)) + { + buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut buffer); + let (position, word_bytes) = match eob { + Left(key) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + key + } + Right(key) => { + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key + } + Both(key, _) => { + // both values needs to be kept because it will be used in other extractors. 
+ value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key + } + }; + + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + key_buffer.push(0); + key_buffer.extend_from_slice(&position.to_be_bytes()); + word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?; + } + + Ok(()) +} diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index f44eac8f5..91f3e1c62 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -6,7 +6,6 @@ mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_vector_points; mod extract_word_docids; -mod extract_word_fid_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; @@ -26,12 +25,11 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_vector_points::extract_vector_points; use self::extract_word_docids::extract_word_docids; -use self::extract_word_fid_docids::extract_word_fid_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ - as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, - GrenadParameters, MergeFn, MergeableReader, + as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, + MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ -65,7 +63,6 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), vectors_field_id, - primary_key_id, ) }) .collect::>()?; @@ -94,9 +91,9 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, ( - docid_fid_facet_numbers_chunks, + 
fid_docid_facet_numbers_chunks, ( - docid_fid_facet_strings_chunks, + fid_docid_facet_strings_chunks, ( facet_is_null_docids_chunks, (facet_is_empty_docids_chunks, facet_exists_docids_chunks), @@ -110,7 +107,7 @@ pub(crate) fn data_from_obkv_documents( let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { debug!("merge {} database", "facet-id-exists-docids"); - match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { Ok(reader) => { let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); } @@ -126,7 +123,7 @@ pub(crate) fn data_from_obkv_documents( let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { debug!("merge {} database", "facet-id-is-null-docids"); - match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { Ok(reader) => { let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); } @@ -142,7 +139,7 @@ pub(crate) fn data_from_obkv_documents( let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { debug!("merge {} database", "facet-id-is-empty-docids"); - match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { Ok(reader) => { let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); } @@ -158,7 +155,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_word_pair_proximity_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::WordPairProximityDocids, "word-pair-proximity-docids", ); @@ -168,24 +165,31 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_fid_word_count_docids, - merge_cbo_roaring_bitmaps, - 
TypedChunk::FieldIdWordcountDocids, + merge_deladd_cbo_roaring_bitmaps, + TypedChunk::FieldIdWordCountDocids, "field-id-wordcount-docids", ); spawn_extraction_task::< _, _, - Vec<(grenad::Reader>, grenad::Reader>)>, + Vec<( + grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + )>, >( docid_word_positions_chunks.clone(), indexer, lmdb_writer_sx.clone(), move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), - merge_roaring_bitmaps, - |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, + merge_deladd_cbo_roaring_bitmaps, + |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } }, "word-docids", ); @@ -195,36 +199,27 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_word_position_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", ); - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks, - indexer, - lmdb_writer_sx.clone(), - extract_word_fid_docids, - merge_cbo_roaring_bitmaps, - TypedChunk::WordFidDocids, - "word-fid-docids", - ); spawn_extraction_task::<_, _, Vec>>>( - docid_fid_facet_strings_chunks, + fid_docid_facet_strings_chunks, indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::FieldIdFacetStringDocids, "field-id-facet-string-docids", ); spawn_extraction_task::<_, _, Vec>>>( - docid_fid_facet_numbers_chunks, + fid_docid_facet_numbers_chunks, indexer, lmdb_writer_sx, extract_facet_number_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); @@ -278,7 +273,6 @@ fn send_original_documents_data( indexer: GrenadParameters, 
lmdb_writer_sx: Sender>, vectors_field_id: Option, - primary_key_id: FieldId, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -287,12 +281,7 @@ fn send_original_documents_data( let documents_chunk_cloned = original_documents_chunk.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); rayon::spawn(move || { - let result = extract_vector_points( - documents_chunk_cloned, - indexer, - primary_key_id, - vectors_field_id, - ); + let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id); let _ = match result { Ok(vector_points) => { lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) @@ -356,10 +345,10 @@ fn send_and_extract_flattened_documents_data( }); } - let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = + let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { - let (documents_ids, docid_word_positions_chunk, script_language_pair) = + let (docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions( flattened_documents_chunk.clone(), indexer, @@ -370,9 +359,6 @@ fn send_and_extract_flattened_documents_data( max_positions_per_attributes, )?; - // send documents_ids to DB writer - let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))); - // send docid_word_positions_chunk to DB writer let docid_word_positions_chunk = unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? 
}; @@ -384,8 +370,8 @@ fn send_and_extract_flattened_documents_data( }, || { let ExtractedFacetValues { - docid_fid_facet_numbers_chunk, - docid_fid_facet_strings_chunk, + fid_docid_facet_numbers_chunk, + fid_docid_facet_strings_chunk, fid_facet_is_null_docids_chunk, fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk, @@ -396,26 +382,26 @@ fn send_and_extract_flattened_documents_data( geo_fields_ids, )?; - // send docid_fid_facet_numbers_chunk to DB writer - let docid_fid_facet_numbers_chunk = - unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? }; + // send fid_docid_facet_numbers_chunk to DB writer + let fid_docid_facet_numbers_chunk = + unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? }; let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers( - docid_fid_facet_numbers_chunk.clone(), + fid_docid_facet_numbers_chunk.clone(), ))); - // send docid_fid_facet_strings_chunk to DB writer - let docid_fid_facet_strings_chunk = - unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? }; + // send fid_docid_facet_strings_chunk to DB writer + let fid_docid_facet_strings_chunk = + unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? 
}; let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( - docid_fid_facet_strings_chunk.clone(), + fid_docid_facet_strings_chunk.clone(), ))); Ok(( - docid_fid_facet_numbers_chunk, + fid_docid_facet_numbers_chunk, ( - docid_fid_facet_strings_chunk, + fid_docid_facet_strings_chunk, ( fid_facet_is_null_docids_chunk, (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), @@ -425,5 +411,5 @@ fn send_and_extract_flattened_documents_data( }, ); - Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) + Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?)) } diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 582bf2a5b..061cbe5a0 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -1,14 +1,12 @@ use std::borrow::Cow; use std::fs::File; use std::io::{self, BufReader, BufWriter, Seek}; -use std::time::Instant; use grenad::{CompressionType, Sorter}; use heed::types::ByteSlice; -use log::debug; use super::{ClonableMmap, MergeFn}; -use crate::error::InternalError; +use crate::update::index_documents::valid_lmdb_key; use crate::Result; pub type CursorClonableMmap = io::Cursor; @@ -47,6 +45,7 @@ pub fn create_sorter( builder.allow_realloc(false); } builder.sort_algorithm(sort_algorithm); + builder.sort_in_parallel(true); builder.build() } @@ -54,6 +53,7 @@ pub fn sorter_into_reader( sorter: grenad::Sorter, indexer: GrenadParameters, ) -> Result>> { + puffin::profile_function!(); let mut writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -115,6 +115,32 @@ impl MergeableReader for Vec<(grenad::Reader>, grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + )> +{ + type Output = ( + grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + ); + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + 
let mut m1 = MergerBuilder::new(merge_fn); + let mut m2 = MergerBuilder::new(merge_fn); + let mut m3 = MergerBuilder::new(merge_fn); + for (r1, r2, r3) in self.into_iter() { + m1.push(r1)?; + m2.push(r2)?; + m3.push(r3)?; + } + Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?)) + } +} + struct MergerBuilder(grenad::MergerBuilder); impl MergerBuilder { @@ -195,11 +221,13 @@ pub fn grenad_obkv_into_chunks( ); while let Some((document_id, obkv)) = cursor.move_on_next()? { - obkv_documents.insert(document_id, obkv)?; - current_chunk_size += document_id.len() as u64 + obkv.len() as u64; + if !obkv.is_empty() { + obkv_documents.insert(document_id, obkv)?; + current_chunk_size += document_id.len() as u64 + obkv.len() as u64; - if current_chunk_size >= documents_chunk_size as u64 { - return writer_into_reader(obkv_documents).map(Some); + if current_chunk_size >= documents_chunk_size as u64 { + return writer_into_reader(obkv_documents).map(Some); + } } } @@ -210,45 +238,46 @@ pub fn grenad_obkv_into_chunks( Ok(std::iter::from_fn(move || transposer().transpose())) } -pub fn sorter_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, +/// Write provided sorter in database using serialize_value function. +/// merge_values function is used if an entry already exist in the database. +pub fn write_sorter_into_database( sorter: Sorter, - merge: MergeFn, -) -> Result<()> { + database: &heed::Database, + wtxn: &mut heed::RwTxn, + index_is_empty: bool, + serialize_value: FS, + merge_values: FM, +) -> Result<()> +where + FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, + FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, +{ puffin::profile_function!(); - debug!("Writing MTBL sorter..."); - let before = Instant::now(); + + let mut buffer = Vec::new(); + let database = database.remap_types::(); let mut merger_iter = sorter.into_stream_merger_iter()?; - if database.is_empty(wtxn)? 
{ - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = merger_iter.next()? { - // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; - } - } else { - while let Some((k, v)) = merger_iter.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).map_err(|_| { - // TODO just wrap this error? - InternalError::IndexingMergingKeys { process: "get-put-merge" } - })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? }; + while let Some((key, value)) = merger_iter.next()? { + if valid_lmdb_key(key) { + buffer.clear(); + let value = if index_is_empty { + Some(serialize_value(value, &mut buffer)?) + } else { + match database.get(wtxn, key)? { + Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, + None => Some(serialize_value(value, &mut buffer)?), } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + }; + match value { + Some(value) => database.put(wtxn, key, value)?, + None => { + database.delete(wtxn, key)?; } } } } - debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 5d111067a..d355ead68 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -6,22 +6,12 @@ use std::result::Result as StdResult; use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::transform::Operation; use crate::Result; pub type MergeFn = for<'a> 
fn(&[u8], &[Cow<'a, [u8]>]) -> Result>; -pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - let capacity = values.iter().map(|v| v.len()).sum::(); - let mut output = Vec::with_capacity(capacity); - values.iter().for_each(|integers| output.extend_from_slice(integers)); - Ok(Cow::Owned(output)) - } -} - pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec) -> io::Result<()> { buffer.clear(); buffer.reserve(bitmap.serialized_size()); @@ -75,57 +65,123 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result) { +pub fn merge_two_del_add_obkvs( + base: obkv::KvReaderU16, + update: obkv::KvReaderU16, + merge_additions: bool, + buffer: &mut Vec, +) { use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; buffer.clear(); let mut writer = obkv::KvWriter::new(buffer); + let mut value_buffer = Vec::new(); for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { match eob { - Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), + Left((k, v)) => { + if merge_additions { + writer.insert(k, v).unwrap() + } else { + // If merge_additions is false, recreate an obkv keeping the deletions only. + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + let base_reader = KvReaderDelAdd::new(v); + + if let Some(deletion) = base_reader.get(DelAdd::Deletion) { + value_writer.insert(DelAdd::Deletion, deletion).unwrap(); + value_writer.finish().unwrap(); + writer.insert(k, &value_buffer).unwrap() + } + } + } + Right((k, v)) => writer.insert(k, v).unwrap(), + Both((k, base), (_, update)) => { + // merge deletions and additions. + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + let base_reader = KvReaderDelAdd::new(base); + let update_reader = KvReaderDelAdd::new(update); + + // keep newest deletion. 
+ if let Some(deletion) = update_reader + .get(DelAdd::Deletion) + .or_else(|| base_reader.get(DelAdd::Deletion)) + { + value_writer.insert(DelAdd::Deletion, deletion).unwrap(); + } + + // keep base addition only if merge_additions is true. + let base_addition = + merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten(); + // keep newest addition. + // TODO use or_else + if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) { + value_writer.insert(DelAdd::Addition, addition).unwrap(); + } + + value_writer.finish().unwrap(); + writer.insert(k, &value_buffer).unwrap() + } } } writer.finish().unwrap(); } -/// Merge all the obks in the order we see them. -pub fn merge_obkvs_and_operations<'a>( +/// Merge all the obkvs from the newest to the oldest. +fn inner_merge_del_add_obkvs<'a>( + obkvs: &[Cow<'a, [u8]>], + merge_additions: bool, +) -> Result<Cow<'a, [u8]>> { + // pop the newest operation from the list. + let (newest, obkvs) = obkvs.split_last().unwrap(); + // keep the operation type for the returned value. + let newest_operation_type = newest[0]; + + // treat the newest obkv as the starting point of the merge. + let mut acc_operation_type = newest_operation_type; + let mut acc = newest[1..].to_vec(); + let mut buffer = Vec::new(); + // reverse iter from the most recent to the oldest. + for current in obkvs.iter().rev() { + // if in the previous iteration there was a complete deletion, + // stop the merge process. + if acc_operation_type == Operation::Deletion as u8 { + break; + } + + let newest = obkv::KvReader::new(&acc); + let oldest = obkv::KvReader::new(&current[1..]); + merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer); + + // we want the result of the merge into our accumulator. + std::mem::swap(&mut acc, &mut buffer); + acc_operation_type = current[0]; + } + + acc.insert(0, newest_operation_type); + Ok(Cow::from(acc)) +} + +/// Merge all the obkvs from the newest to the oldest. 
+pub fn obkvs_merge_additions_and_deletions<'a>( _key: &[u8], obkvs: &[Cow<'a, [u8]>], ) -> Result> { - // [add, add, delete, add, add] - // we can ignore everything that happened before the last delete. - let starting_position = - obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0); - - // [add, add, delete] - // if the last operation was a deletion then we simply return the deletion - if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8 - { - return Ok(obkvs[obkvs.len() - 1].clone()); - } - let mut buffer = Vec::new(); - - // (add, add, delete) [add, add] - // in the other case, no deletion will be encountered during the merge - let mut ret = - obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| { - let first = obkv::KvReader::new(&acc); - let second = obkv::KvReader::new(¤t[1..]); - merge_two_obkvs(first, second, &mut buffer); - - // we want the result of the merge into our accumulator - std::mem::swap(&mut acc, &mut buffer); - acc - }); - - ret.insert(0, Operation::Addition as u8); - Ok(Cow::from(ret)) + inner_merge_del_add_obkvs(obkvs, true) } +/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions. +pub fn obkvs_keep_last_addition_merge_deletions<'a>( + _key: &[u8], + obkvs: &[Cow<'a, [u8]>], +) -> Result> { + inner_merge_del_add_obkvs(obkvs, false) +} + +/// Do a union of all the CboRoaringBitmaps in the values. pub fn merge_cbo_roaring_bitmaps<'a>( _key: &[u8], values: &[Cow<'a, [u8]>], @@ -138,3 +194,52 @@ pub fn merge_cbo_roaring_bitmaps<'a>( Ok(Cow::from(vec)) } } + +/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv +/// separately and outputs a new DelAdd with both unions. 
+pub fn merge_deladd_cbo_roaring_bitmaps<'a>( + _key: &[u8], + values: &[Cow<'a, [u8]>], +) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + // Retrieve the bitmaps from both sides + let mut del_bitmaps_bytes = Vec::new(); + let mut add_bitmaps_bytes = Vec::new(); + for value in values { + let obkv = KvReaderDelAdd::new(value); + if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) { + del_bitmaps_bytes.push(bitmap_bytes); + } + if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) { + add_bitmaps_bytes.push(bitmap_bytes); + } + } + + let mut output_deladd_obkv = KvWriterDelAdd::memory(); + let mut buffer = Vec::new(); + CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?; + output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?; + buffer.clear(); + CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?; + output_deladd_obkv.insert(DelAdd::Addition, &buffer)?; + output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) + } +} + +/// A function that merges a DelAdd of bitmao into an already existing bitmap. +/// +/// The first argument is the DelAdd obkv of CboRoaringBitmaps and +/// the second one is the CboRoaringBitmap to merge into. +pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>( + deladd_obkv: &[u8], + previous: &[u8], + buffer: &'a mut Vec, +) -> Result> { + Ok(CboRoaringBitmapCodec::merge_deladd_into( + KvReaderDelAdd::new(deladd_obkv), + previous, + buffer, + )?) 
+} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index c403f9e3d..52638d6f6 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -9,13 +9,14 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, - merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader, + merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader, GrenadParameters, MergeableReader, }; pub use merge_functions::{ - concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string, - merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, - serialize_roaring_bitmap, MergeFn, + keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, + obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn, }; use crate::MAX_WORD_LENGTH; @@ -44,10 +45,6 @@ where Some((head, tail)) } -pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator + '_ { - bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) -} - /// Converts an fst Stream into an HashSet of Strings. 
pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet> where diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b3e7e203e..113114681 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,11 +20,13 @@ use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; -pub use self::enrich::{extract_finite_float_from_value, DocumentId}; +pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, - fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, + fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader, + ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; @@ -32,13 +34,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, - WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, + IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; -use crate::{Index, Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Index, Result}; static MERGED_DATABASE_COUNT: usize = 7; -static PREFIX_DATABASE_COUNT: usize = 5; +static PREFIX_DATABASE_COUNT: usize = 
4; static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -86,7 +87,6 @@ pub struct IndexDocumentsConfig { pub words_positions_level_group_size: Option, pub words_positions_min_level_size: Option, pub update_method: IndexDocumentsMethod, - pub deletion_strategy: DeletionStrategy, pub autogenerate_docids: bool, } @@ -178,6 +178,7 @@ where // Early return when there is no document to add if to_delete.is_empty() { + // Maintains Invariant: remove documents actually always returns Ok for the inner result return Ok((self, Ok(0))); } @@ -190,14 +191,48 @@ where self.deleted_documents += deleted_documents; + // Maintains Invariant: remove documents actually always returns Ok for the inner result Ok((self, Ok(deleted_documents))) } + /// Removes documents from db using their internal document ids. + /// + /// # Warning + /// + /// This function is dangerous and will only work correctly if: + /// + /// - All the passed ids currently exist in the database + /// - No batching using the standards `remove_documents` and `add_documents` took place + /// + /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function. + pub fn remove_documents_from_db_no_batch( + mut self, + to_delete: &RoaringBitmap, + ) -> Result<(Self, u64)> { + puffin::profile_function!(); + + // Early return when there is no document to add + if to_delete.is_empty() { + return Ok((self, 0)); + } + + let deleted_documents = self + .transform + .as_mut() + .expect("Invalid document deletion state") + .remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)? 
+ as u64; + + self.deleted_documents += deleted_documents; + + Ok((self, deleted_documents)) + } + #[logging_timer::time("IndexDocuments::{}")] pub fn execute(mut self) -> Result { puffin::profile_function!(); - if self.added_documents == 0 { + if self.added_documents == 0 && self.deleted_documents == 0 { let number_of_documents = self.index.number_of_documents(self.wtxn)?; return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); } @@ -241,9 +276,6 @@ where primary_key, fields_ids_map, field_distribution, - new_external_documents_ids, - new_documents_ids, - replaced_documents_ids, documents_count, original_documents, flattened_documents, @@ -367,29 +399,12 @@ where let _ = lmdb_writer_sx.send(Err(e)); } - // needs to be droped to avoid channel waiting lock. + // needs to be dropped to avoid channel waiting lock. drop(lmdb_writer_sx) }); - // We delete the documents that this document addition replaces. This way we are - // able to simply insert all the documents even if they already exist in the database. - if !replaced_documents_ids.is_empty() { - let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?; - deletion_builder.strategy(self.config.deletion_strategy); - debug!("documents to delete {:?}", replaced_documents_ids); - deletion_builder.delete_documents(&replaced_documents_ids); - let deleted_documents_result = deletion_builder.execute_inner()?; - debug!("{} documents actually deleted", deleted_documents_result.deleted_documents); - } - - let index_documents_ids = self.index.documents_ids(self.wtxn)?; - let index_is_empty = index_documents_ids.is_empty(); + let index_is_empty = self.index.number_of_documents(self.wtxn)? 
== 0; let mut final_documents_ids = RoaringBitmap::new(); - let mut word_pair_proximity_docids = None; - let mut word_position_docids = None; - let mut word_fid_docids = None; - let mut word_docids = None; - let mut exact_word_docids = None; let mut databases_seen = 0; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -397,35 +412,40 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); + let mut word_position_docids = None; + let mut word_fid_docids = None; + let mut word_docids = None; + let mut exact_word_docids = None; + for result in lmdb_writer_rx { if (self.should_abort)() { return Err(Error::InternalError(InternalError::AbortedIndexation)); } let typed_chunk = match result? { - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => { let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; word_docids = Some(cloneable_chunk); let cloneable_chunk = unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; exact_word_docids = Some(cloneable_chunk); - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } - } - TypedChunk::WordPairProximityDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; - word_pair_proximity_docids = Some(cloneable_chunk); - TypedChunk::WordPairProximityDocids(chunk) + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; + word_fid_docids = Some(cloneable_chunk); + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } } TypedChunk::WordPositionDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; word_position_docids = Some(cloneable_chunk); TypedChunk::WordPositionDocids(chunk) } - TypedChunk::WordFidDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? 
}; - word_fid_docids = Some(cloneable_chunk); - TypedChunk::WordFidDocids(chunk) - } otherwise => otherwise, }; @@ -457,25 +477,16 @@ where // We write the primary key field id into the main database self.index.put_primary_key(self.wtxn, &primary_key)?; - - // We write the external documents ids into the main database. - let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - external_documents_ids.insert_ids(&new_external_documents_ids)?; - let external_documents_ids = external_documents_ids.into_static(); - self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; - - let all_documents_ids = index_documents_ids | new_documents_ids; - self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; + let number_of_documents = self.index.number_of_documents(self.wtxn)?; self.execute_prefix_databases( word_docids, exact_word_docids, - word_pair_proximity_docids, word_position_docids, word_fid_docids, )?; - Ok(all_documents_ids.len()) + Ok(number_of_documents) } #[logging_timer::time("IndexDocuments::{}")] @@ -483,7 +494,6 @@ where self, word_docids: Option>, exact_word_docids: Option>, - word_pair_proximity_docids: Option>, word_position_docids: Option>, word_fid_docids: Option>, ) -> Result<()> @@ -604,32 +614,6 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { - // Run the word prefix pair proximity docids update operation. 
- PrefixWordPairsProximityDocids::new( - self.wtxn, - self.index, - self.indexer_config.chunk_compression_type, - self.indexer_config.chunk_compression_level, - ) - .execute( - word_pair_proximity_docids, - &new_prefix_fst_words, - &common_prefix_fst_words, - &del_prefix_fst_words, - )?; - } - - if (self.should_abort)() { - return Err(Error::InternalError(InternalError::AbortedIndexation)); - } - - databases_seen += 1; - (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen, - total_databases: TOTAL_POSTING_DATABASE_COUNT, - }); - if let Some(word_position_docids) = word_position_docids { // Run the words prefix position docids update operation. let mut builder = WordPrefixIntegerDocids::new( @@ -687,8 +671,8 @@ where fn execute_word_prefix_docids( txn: &mut heed::RwTxn, reader: grenad::Reader>, - word_docids_db: Database, - word_prefix_docids_db: Database, + word_docids_db: Database, + word_prefix_docids_db: Database, indexer_config: &IndexerConfig, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], @@ -709,14 +693,15 @@ fn execute_word_prefix_docids( #[cfg(test)] mod tests { use big_s::S; + use fst::IntoStreamer; + use heed::RwTxn; use maplit::hashset; use super::*; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; use crate::search::TermsMatchingStrategy; - use crate::update::DeleteDocuments; - use crate::{db_snap, BEU16}; + use crate::{db_snap, Filter, Search, BEU16}; #[test] fn simple_document_replacement() { @@ -807,11 +792,10 @@ mod tests { assert_eq!(count, 1); // Check that we get only one document from the database. 
- // Since the document has been deleted and re-inserted, its internal docid has been incremented to 1 - let docs = index.documents(&rtxn, Some(1)).unwrap(); + let docs = index.documents(&rtxn, Some(0)).unwrap(); assert_eq!(docs.len(), 1); let (id, doc) = docs[0]; - assert_eq!(id, 1); + assert_eq!(id, 0); // Check that this document is equal to the last one sent. let mut doc_iter = doc.iter(); @@ -872,7 +856,7 @@ mod tests { assert_eq!(count, 3); // the document 0 has been deleted and reinserted with the id 3 - let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap(); + let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap(); let kevin_position = docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); assert_eq!(kevin_position, 2); @@ -1018,7 +1002,6 @@ mod tests { assert_eq!(count, 6); db_snap!(index, word_docids, "updated"); - db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]"); drop(rtxn); } @@ -1121,17 +1104,15 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ])) .unwrap(); - let mut wtxn = index.write_txn().unwrap(); - assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); // Delete not all of the documents but some of them. 
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_external_id("30"); - builder.execute().unwrap(); + index.delete_document("30"); - let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); - assert!(external_documents_ids.get("30").is_none()); - wtxn.commit().unwrap(); + let txn = index.read_txn().unwrap(); + assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId")); + + let external_documents_ids = index.external_documents_ids(); + assert!(external_documents_ids.get(&txn, "30").unwrap().is_none()); index .add_documents(documents!([ @@ -1140,8 +1121,8 @@ mod tests { .unwrap(); let wtxn = index.write_txn().unwrap(); - let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); - assert!(external_documents_ids.get("30").is_some()); + let external_documents_ids = index.external_documents_ids(); + assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some()); wtxn.commit().unwrap(); index @@ -1435,8 +1416,10 @@ mod tests { index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap(); let rtxn = index.read_txn().unwrap(); - let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); - assert!(external_documents_ids.get("1").is_some()); + let all_documents_count = index.all_documents(&rtxn).unwrap().count(); + assert_eq!(all_documents_count, 1); + let external_documents_ids = index.external_documents_ids(); + assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some()); } #[test] @@ -1490,12 +1473,6 @@ mod tests { 3 2 second second 3 3 third third "###); - db_snap!(index, string_faceted_documents_ids, @r###" - 0 [] - 1 [] - 2 [] - 3 [0, 1, 2, 3, ] - "###); let rtxn = index.read_txn().unwrap(); @@ -1519,12 +1496,6 @@ mod tests { db_snap!(index, facet_id_string_docids, @""); db_snap!(index, field_id_docid_facet_strings, @""); - db_snap!(index, string_faceted_documents_ids, @r###" - 0 [] - 1 [] - 2 [] - 3 [0, 1, 2, 3, ] - "###); let rtxn = 
index.read_txn().unwrap(); @@ -1551,12 +1522,6 @@ mod tests { 3 2 second second 3 3 third third "###); - db_snap!(index, string_faceted_documents_ids, @r###" - 0 [] - 1 [] - 2 [] - 3 [0, 1, 2, 3, ] - "###); let rtxn = index.read_txn().unwrap(); @@ -1719,7 +1684,7 @@ mod tests { let wtxn = index.read_txn().unwrap(); - let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map(); + let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap(); let ids = map.values().collect::>(); assert_eq!(ids.len(), map.len()); @@ -2531,17 +2496,8 @@ mod tests { db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4"); db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); - let mut wtxn = index.write_txn().unwrap(); - // Delete not all of the documents but some of them. - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_external_id("0"); - builder.delete_external_id("3"); - let result = builder.execute().unwrap(); - println!("{result:?}"); - - wtxn.commit().unwrap(); + index.delete_documents(vec!["0".into(), "3".into()]); db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); @@ -2596,8 +2552,7 @@ mod tests { ), ] */ - let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; + let index = TempIndex::new(); // START OF BATCH @@ -2637,8 +2592,7 @@ mod tests { {"id":1,"doggo":"bernese"} "###); db_snap!(index, external_documents_ids, @r###" - soft: - hard: + docids: 1 0 "###); @@ -2683,13 +2637,10 @@ mod tests { "###); db_snap!(index, external_documents_ids, @r###" - soft: - hard: + docids: 0 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); - // BATCH 3 println!("--- ENTERING BATCH 3"); @@ -2731,4 +2682,537 @@ mod tests { let res = index.search(&rtxn).execute().unwrap(); 
index.documents(&rtxn, res.documents_ids).unwrap(); } + + fn delete_documents<'t>( + wtxn: &mut RwTxn<'t, '_>, + index: &'t TempIndex, + external_ids: &[&str], + ) -> Vec { + let external_document_ids = index.external_documents_ids(); + let ids_to_delete: Vec = external_ids + .iter() + .map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap()) + .collect(); + + // Delete some documents. + index.delete_documents_using_wtxn( + wtxn, + external_ids.iter().map(ToString::to_string).collect(), + ); + + ids_to_delete + } + + #[test] + fn delete_documents_with_numbers_as_primary_key() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, + { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, + { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } + ]), + ) + .unwrap(); + + // delete those documents, ids are synchronous therefore 0, 1, and 2. 
+ index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]); + + wtxn.commit().unwrap(); + + // All these snapshots should be empty since the database was cleared + db_snap!(index, documents_ids); + db_snap!(index, word_docids); + db_snap!(index, word_pair_proximity_docids); + db_snap!(index, facet_id_exists_docids); + + let rtxn = index.read_txn().unwrap(); + + assert!(index.field_distribution(&rtxn).unwrap().is_empty()); + } + + #[test] + fn delete_documents_with_strange_primary_key() { + let index = TempIndex::new(); + + index + .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) + .unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "mysuperid": 0, "name": "kevin" }, + { "mysuperid": 1, "name": "kevina" }, + { "mysuperid": 2, "name": "benoit" } + ]), + ) + .unwrap(); + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete not all of the documents but some of them. + index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]); + + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids); + db_snap!(index, word_docids); + db_snap!(index, word_pair_proximity_docids); + } + + #[test] + fn filtered_placeholder_search_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + settings.set_filterable_fields(hashset! 
{ S("label"), S("label2") }); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]); + + // Placeholder search with filter + let filter = Filter::from_str("label = sign").unwrap().unwrap(); + let results = index.search(&wtxn).filter(filter).execute().unwrap(); + assert!(results.documents_ids.is_empty()); + + wtxn.commit().unwrap(); + + db_snap!(index, word_docids); + db_snap!(index, facet_id_f64_docids); + db_snap!(index, word_pair_proximity_docids); + db_snap!(index, facet_id_exists_docids); + db_snap!(index, facet_id_string_docids); + } + + #[test] + fn placeholder_search_should_not_return_deleted_documents() { + let 
index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]); + + // Placeholder search + let results = index.search(&wtxn).execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + } + + #[test] + fn search_should_not_return_deleted_documents() { + let 
index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + + // search for abstract + let results = index.search(&wtxn).query("abstract").execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + } + + #[test] + fn 
geo_filtered_placeholder_search_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("id")); + settings.set_filterable_fields(hashset!(S("_geo"))); + settings.set_sortable_fields(hashset!(S("_geo"))); + }) + .unwrap(); + + index.add_documents_using_wtxn(&mut wtxn, documents!([ + { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, + { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, + { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, + { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, + { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } }, + { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } }, + { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } }, + { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } }, + { "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } }, + { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } }, + { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } }, + { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } }, + { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } }, + { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } }, + { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } }, + { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } }, + { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } }, + { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, + { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, + { "id": "20", "city": "Paris", 
"_geo": { "lat": 48.9021, "lng": 2.3708 } } + ])).unwrap(); + + let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete); + + // Placeholder search with geo filter + let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); + let results = index.search(&wtxn).filter(filter).execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + + db_snap!(index, facet_id_f64_docids); + db_snap!(index, facet_id_string_docids); + } + + #[test] + fn get_documents_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": 
["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_external_ids = ["1_7", "1_52"]; + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids); + + // list all documents + let results = index.all_documents(&wtxn).unwrap(); + for result in results { + let (id, _) = result.unwrap(); + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + } + + // list internal document ids + let results = index.documents_ids(&wtxn).unwrap(); + for id in results { + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + } + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // get internal docids from deleted external document ids + let results = index.external_documents_ids(); + for id in deleted_external_ids { + assert!( + results.get(&rtxn, id).unwrap().is_none(), + "The document {} was supposed to be deleted", + id + ); + } + drop(rtxn); + } + + #[test] + fn stats_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index.add_documents_using_wtxn(&mut wtxn, documents!([ + { "docid": "1_4", "label": ["sign"]}, + { "docid": "1_5", "label": ["letter"]}, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, + { "docid": "1_36", "label": ["drawing","painting","pattern"]}, + { "docid": "1_37", "label": ["art","drawing","outdoor"]}, + { "docid": "1_38", "label": 
["aquarium","art","drawing"], "title": "Nemo"}, + { "docid": "1_39", "label": ["abstract"]}, + { "docid": "1_40", "label": ["cartoon"]}, + { "docid": "1_41", "label": ["art","drawing"]}, + { "docid": "1_42", "label": ["art","pattern"]}, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, + { "docid": "1_44", "label": ["drawing"], "number": 44i32}, + { "docid": "1_45", "label": ["art"]}, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, + { "docid": "1_47", "label": ["abstract","pattern"]}, + { "docid": "1_52", "label": ["abstract","cartoon"]}, + { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, + { "docid": "1_58", "label": ["abstract","art","cartoon"]}, + { "docid": "1_68", "label": ["design"]}, + { "docid": "1_69", "label": ["geometry"]} + ])).unwrap(); + + delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + + // count internal documents + let results = index.number_of_documents(&wtxn).unwrap(); + assert_eq!(18, results); + + // count field distribution + let results = index.field_distribution(&wtxn).unwrap(); + assert_eq!(Some(&18), results.get("label")); + assert_eq!(Some(&1), results.get("title")); + assert_eq!(Some(&2), results.get("number")); + + wtxn.commit().unwrap(); + } + + #[test] + fn stored_detected_script_and_language_should_not_return_deleted_documents() { + use charabia::{Language, Script}; + let index = TempIndex::new(); + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, + { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" 
}, + { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" }, + { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" }, + { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" }, + ])) + .unwrap(); + + let key_cmn = (Script::Cj, Language::Cmn); + let cj_cmn_docs = + index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default(); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(1); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + + delete_documents(&mut wtxn, &index, &["1"]); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let cj_cmn_docs = + index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default(); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + } + + #[test] + fn delete_words_exact_attributes() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key(S("id")); + settings.set_searchable_fields(vec![S("text"), S("exact")]); + settings.set_exact_attributes(vec![S("exact")].into_iter().collect()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "id": 0, "text": "hello" }, + { "id": 1, "exact": "hello"} + ])) + .unwrap(); + db_snap!(index, word_docids, 1, @r###" + hello [0, ] + "###); + db_snap!(index, exact_word_docids, 1, @r###" + hello [1, ] + "###); + db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); + + let mut wtxn = index.write_txn().unwrap(); + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]); + wtxn.commit().unwrap(); + + db_snap!(index, word_docids, 2, @r###" + hello [0, ] + "###); + db_snap!(index, exact_word_docids, 2, @""); + db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); + + insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]"); + let txn = 
index.read_txn().unwrap(); + let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap(); + insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###); + + let mut s = Search::new(&txn, &index); + s.query("hello"); + let crate::SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + } } diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap new file mode 100644 index 000000000..8b27dcb0d --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +[] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git 
a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap new file mode 100644 index 000000000..8a9805f8d --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +[2, ] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap new file mode 100644 index 000000000..bb2f64873 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +benoit [2, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ 
+--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap similarity index 66% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap rename to milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap index 7481b11c4..ed120bf02 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/delete_documents.rs +source: milli/src/update/index_documents/mod.rs --- 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] 2 [21, ] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap new file mode 100644 index 000000000..deeddff0d --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +2 0 2.2 1 [21, ] + diff --git 
a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap similarity index 91% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap rename to milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap index ab1d2175f..2d0b98623 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/delete_documents.rs +source: milli/src/update/index_documents/mod.rs --- 1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] 1 0 aquarium 1 [5, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap similarity index 95% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap rename to milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap index f8d64e001..73503f098 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap +++ 
b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/delete_documents.rs +source: milli/src/update/index_documents/mod.rs --- 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ] 2 [21, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap similarity index 95% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap rename to milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap index 36add107b..022e9f5b1 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/delete_documents.rs +source: milli/src/update/index_documents/mod.rs --- 1 1 36 [3, ] 1 1 37 [4, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap similarity index 93% rename from milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap rename to 
milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap index 18a9d9309..c45c350e7 100644 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/delete_documents.rs +source: milli/src/update/index_documents/mod.rs --- 3 0 48.9021 1 [19, ] 3 0 49.9314 1 [17, ] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap index b0ef38b93..80dbce9e8 100644 --- a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap @@ -1,60 +1,56 @@ --- source: milli/src/update/index_documents/mod.rs --- -0 [1, 7, ] +0 [1, ] 1 [2, ] -10 [1, 7, ] -12 [0, 8, ] +10 [1, ] +12 [0, ] 1344 [3, ] -1813 [8, ] -2 [0, 8, ] +1813 [0, ] +2 [0, ] 23 [5, ] 25 [2, ] -3 [0, 8, ] +3 [0, ] 35 [5, ] -4 [4, 6, ] -42 [0, 5, 8, ] -456 [1, 7, ] 
-5 [0, 8, ] +4 [4, ] +42 [0, 5, ] +456 [1, ] +5 [0, ] 99 [2, ] adams [5, ] -adventure [1, 7, ] +adventure [1, ] alice [2, ] -and [0, 4, 6, 8, ] -antoine [1, 7, ] -austen [8, ] -austin [0, ] -blood [4, 6, ] +and [0, 4, ] +antoine [1, ] +austen [0, ] +blood [4, ] carroll [2, ] -de [1, 7, ] +de [1, ] douglas [5, ] -exupery [1, 7, ] -fantasy [2, 3, 4, 6, ] +exupery [1, ] +fantasy [2, 3, 4, ] galaxy [5, ] guide [5, ] -half [4, 6, ] -harry [4, 6, ] +half [4, ] +harry [4, ] hitchhiker [5, ] hobbit [3, ] in [2, ] -j [3, 4, 6, 8, ] -jane [0, ] -k [4, 6, ] -le [1, ] +j [0, 3, 4, ] +k [4, ] lewis [2, ] -little [7, ] -petit [1, ] -potter [4, 6, ] -prejudice [0, 8, ] -pride [0, 8, ] -prince [1, 4, 7, ] -princess [6, ] +little [1, ] +potter [4, ] +prejudice [0, ] +pride [0, ] +prince [1, ] +princess [4, ] r [3, ] -romance [0, 8, ] -rowling [4, 6, ] +romance [0, ] +rowling [4, ] s [5, ] -saint [1, 7, ] -the [3, 4, 5, 6, 7, ] +saint [1, ] +the [1, 3, 4, 5, ] to [5, ] tolkien [3, ] wonderland [2, ] diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f0e3bbbf0..323bc3da7 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; -use std::collections::hash_map::Entry; +use std::collections::btree_map::Entry as BEntry; +use std::collections::hash_map::Entry as HEntry; use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; @@ -7,18 +8,21 @@ use std::io::{Read, Seek}; use fxhash::FxHashMap; use heed::RoTxn; use itertools::Itertools; -use obkv::{KvReader, KvWriter}; +use obkv::{KvReader, KvReaderU16, KvWriter}; use roaring::RoaringBitmap; use serde_json::Value; use smartstring::SmartString; use super::helpers::{ - create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn, + create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions, + 
obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn, }; use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; +use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; +use crate::update::index_documents::GrenadParameters; use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, @@ -28,9 +32,6 @@ pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, pub field_distribution: FieldDistribution, - pub new_external_documents_ids: fst::Map>, - pub new_documents_ids: RoaringBitmap, - pub replaced_documents_ids: RoaringBitmap, pub documents_count: usize, pub original_documents: File, pub flattened_documents: File, @@ -106,8 +107,8 @@ impl<'a, 'i> Transform<'a, 'i> { // We must choose the appropriate merge function for when two or more documents // with the same user id must be merged or fully replaced in the same batch. let merge_function = match index_documents_method { - IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv, - IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations, + IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions, + IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions, }; // We initialize the sorter with the user indexing settings. 
@@ -130,17 +131,13 @@ impl<'a, 'i> Transform<'a, 'i> { indexer_settings.max_memory.map(|mem| mem / 2), ); let documents_ids = index.documents_ids(wtxn)?; - let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?; Ok(Transform { index, fields_ids_map: index.fields_ids_map(wtxn)?, indexer_settings, autogenerate_docids, - available_documents_ids: AvailableDocumentsIds::from_documents_ids( - &documents_ids, - &soft_deleted_documents_ids, - ), + available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), original_sorter, flattened_sorter, index_documents_method, @@ -151,6 +148,7 @@ impl<'a, 'i> Transform<'a, 'i> { }) } + #[logging_timer::time] pub fn read_documents( &mut self, reader: EnrichedDocumentsBatchReader, @@ -163,8 +161,10 @@ impl<'a, 'i> Transform<'a, 'i> { FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { + puffin::profile_function!(); + let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); - let external_documents_ids = self.index.external_documents_ids(wtxn)?; + let external_documents_ids = self.index.external_documents_ids(); let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; let primary_key = cursor.primary_key().to_string(); @@ -172,7 +172,8 @@ impl<'a, 'i> Transform<'a, 'i> { self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let mut obkv_buffer = Vec::new(); - let mut document_sorter_buffer = Vec::new(); + let mut document_sorter_value_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); let mut documents_count = 0; let mut docid_buffer: Vec = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); @@ -213,29 +214,30 @@ impl<'a, 'i> Transform<'a, 'i> { field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2)); // Build the new obkv document. 
- let mut writer = obkv::KvWriter::new(&mut obkv_buffer); + let mut writer = KvWriter::new(&mut obkv_buffer); for (k, v) in field_buffer_cache.iter() { writer.insert(*k, v)?; } let mut original_docid = None; - let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { - Entry::Occupied(entry) => *entry.get() as u32, - Entry::Vacant(entry) => { - // If the document was already in the db we mark it as a replaced document. - // It'll be deleted later. - if let Some(docid) = external_documents_ids.get(entry.key()) { - // If it was already in the list of replaced documents it means it was deleted - // by the remove_document method. We should starts as if it never existed. - if self.replaced_documents_ids.insert(docid) { - original_docid = Some(docid); + HEntry::Occupied(entry) => *entry.get() as u32, + HEntry::Vacant(entry) => { + let docid = match external_documents_ids.get(wtxn, entry.key())? { + Some(docid) => { + // If it was already in the list of replaced documents it means it was deleted + // by the remove_document method. We should starts as if it never existed. + if self.replaced_documents_ids.insert(docid) { + original_docid = Some(docid); + } + + docid } - } - let docid = self - .available_documents_ids - .next() - .ok_or(UserError::DocumentLimitReached)?; + None => self + .available_documents_ids + .next() + .ok_or(UserError::DocumentLimitReached)?, + }; entry.insert(docid as u64); docid } @@ -263,47 +265,68 @@ impl<'a, 'i> Transform<'a, 'i> { skip_insertion = true; } else { // we associate the base document with the new key, everything will get merged later. - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(base_obkv); - self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; - match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? 
{ - Some(flattened_obkv) => { - // we recreate our buffer with the flattened documents - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(&flattened_obkv); - self.flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)? + let deladd_operation = match self.index_documents_method { + IndexDocumentsMethod::UpdateDocuments => { + DelAddOperation::DeletionAndAddition } - None => self - .flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)?, + IndexDocumentsMethod::ReplaceDocuments => DelAddOperation::Deletion, + }; + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(base_obkv), + deladd_operation, + &mut document_sorter_value_buffer, + )?; + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; + let base_obkv = KvReader::new(base_obkv); + if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? 
{ + // we recreate our buffer with the flattened documents + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(&flattened_obkv), + deladd_operation, + &mut document_sorter_value_buffer, + )?; } + self.flattened_sorter + .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; } } if !skip_insertion { self.new_documents_ids.insert(docid); - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(&obkv_buffer); + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(&obkv_buffer), + DelAddOperation::Addition, + &mut document_sorter_value_buffer, + )?; // We use the extracted/generated user id as the key for this document. - self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; - match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { - Some(flattened_obkv) => { - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(&flattened_obkv); - self.flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)? - } - None => self - .flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)?, + let flattened_obkv = KvReader::new(&obkv_buffer); + if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? 
{ + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(&obkv), + DelAddOperation::Addition, + &mut document_sorter_value_buffer, + )? } + self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; } documents_count += 1; @@ -338,6 +361,7 @@ impl<'a, 'i> Transform<'a, 'i> { /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db, /// it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids. /// - If the document to remove was not present in either the db or the transform we do nothing. + #[logging_timer::time] pub fn remove_documents( &mut self, mut to_remove: Vec, @@ -347,54 +371,176 @@ impl<'a, 'i> Transform<'a, 'i> { where FA: Fn() -> bool + Sync, { + puffin::profile_function!(); + // there may be duplicates in the documents to remove. to_remove.sort_unstable(); to_remove.dedup(); - let external_documents_ids = self.index.external_documents_ids(wtxn)?; + let external_documents_ids = self.index.external_documents_ids(); let mut documents_deleted = 0; + let mut document_sorter_value_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); for to_remove in to_remove { if should_abort() { return Err(Error::InternalError(InternalError::AbortedIndexation)); } - match self.new_external_documents_ids_builder.entry((*to_remove).into()) { - // if the document was added in a previous iteration of the transform we make it as deleted in the sorters. - Entry::Occupied(entry) => { - let doc_id = *entry.get() as u32; - self.original_sorter - .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?; - self.flattened_sorter - .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?; + // Check if the document has been added in the current indexing process. 
+ let deleted_from_current = + match self.new_external_documents_ids_builder.entry((*to_remove).into()) { + // if the document was added in a previous iteration of the transform we make it as deleted in the sorters. + HEntry::Occupied(entry) => { + let docid = *entry.get() as u32; + // Key is the concatenation of the internal docid and the external one. + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes()); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + obkv::KvWriterU16::new(&mut document_sorter_value_buffer).finish().unwrap(); + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; + self.flattened_sorter + .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; - // we must NOT update the list of replaced_documents_ids - // Either: - // 1. It's already in it and there is nothing to do - // 2. It wasn't in it because the document was created by a previous batch and since - // we're removing it there is nothing to do. - self.new_documents_ids.remove(doc_id); - entry.remove_entry(); - } - Entry::Vacant(entry) => { - // If the document was already in the db we mark it as a `to_delete` document. - // It'll be deleted later. We don't need to push anything to the sorters. - if let Some(docid) = external_documents_ids.get(entry.key()) { - self.replaced_documents_ids.insert(docid); - } else { - // if the document is nowehere to be found, there is nothing to do and we must NOT - // increment the count of documents_deleted - continue; + // we must NOT update the list of replaced_documents_ids + // Either: + // 1. It's already in it and there is nothing to do + // 2. It wasn't in it because the document was created by a previous batch and since + // we're removing it there is nothing to do. 
+ self.new_documents_ids.remove(docid); + entry.remove_entry(); + true } + HEntry::Vacant(_) => false, + }; + + // If the document was already in the db we mark it as a `to_delete` document. + // Then we push the document in sorters in deletion mode. + let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? { + Some(docid) => { + self.remove_document_from_db( + docid, + to_remove, + wtxn, + &mut document_sorter_key_buffer, + &mut document_sorter_value_buffer, + )?; + true } + None => false, }; + // increase counter only if the document existed somewhere before. + if deleted_from_current || deleted_from_db { + documents_deleted += 1; + } + } + + Ok(documents_deleted) + } + + /// Removes documents from db using their internal document ids. + /// + /// # Warning + /// + /// This function is dangerous and will only work correctly if: + /// + /// - All the passed ids currently exist in the database + /// - No batching using the standards `remove_documents` and `add_documents` took place + /// + /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function. 
+ #[logging_timer::time] + pub fn remove_documents_from_db_no_batch( + &mut self, + to_remove: &RoaringBitmap, + wtxn: &mut heed::RwTxn, + should_abort: FA, + ) -> Result + where + FA: Fn() -> bool + Sync, + { + puffin::profile_function!(); + + let mut documents_deleted = 0; + let mut document_sorter_value_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); + let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?; + + for (internal_docid, external_docid) in to_remove.iter().zip(external_ids) { + let external_docid = external_docid?; + if should_abort() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + self.remove_document_from_db( + internal_docid, + external_docid, + wtxn, + &mut document_sorter_key_buffer, + &mut document_sorter_value_buffer, + )?; + documents_deleted += 1; } Ok(documents_deleted) } + fn remove_document_from_db( + &mut self, + internal_docid: u32, + external_docid: String, + txn: &heed::RoTxn, + document_sorter_key_buffer: &mut Vec, + document_sorter_value_buffer: &mut Vec, + ) -> Result<()> { + self.replaced_documents_ids.insert(internal_docid); + + // fetch the obkv document + let original_key = BEU32::new(internal_docid); + let base_obkv = self + .index + .documents + .remap_data_type::() + .get(txn, &original_key)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + })?; + + // Key is the concatenation of the internal docid and the external one. 
+ document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&internal_docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes()); + // push it as to delete in the original_sorter + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(base_obkv), + DelAddOperation::Deletion, + document_sorter_value_buffer, + )?; + self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; + + // flatten it and push it as to delete in the flattened_sorter + let flattened_obkv = KvReader::new(base_obkv); + if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { + // we recreate our buffer with the flattened documents + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(&obkv), + DelAddOperation::Deletion, + document_sorter_value_buffer, + )?; + } + self.flattened_sorter + .insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?; + Ok(()) + } + // Flatten a document from the fields ids map contained in self and insert the new // created fields. Returns `None` if the document doesn't need to be flattened. 
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result>> { @@ -514,42 +660,10 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(()) } - fn remove_deleted_documents_from_field_distribution( - &self, - rtxn: &RoTxn, - field_distribution: &mut FieldDistribution, - ) -> Result<()> { - for deleted_docid in self.replaced_documents_ids.iter() { - let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or( - InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, - )?; - - for (key, _) in obkv.iter() { - let name = - self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Computing field distribution in transform.", - })?; - // We checked that the document was in the db earlier. If we can't find it it means - // there is an inconsistency between the field distribution and the field id map. - let field = - field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Accessing field distribution in transform.", - })?; - *field -= 1; - if *field == 0 { - // since we were able to get the field right before it's safe to unwrap here - field_distribution.remove(name).unwrap(); - } - } - } - Ok(()) - } - /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. + #[logging_timer::time] pub(crate) fn output_from_sorter( self, wtxn: &mut heed::RwTxn, @@ -581,17 +695,13 @@ impl<'a, 'i> Transform<'a, 'i> { // 2. 
Add all the new documents to the field distribution let mut field_distribution = self.index.field_distribution(wtxn)?; - self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?; - // Here we are going to do the document count + field distribution + `write_into_stream_writer` let mut iter = self.original_sorter.into_stream_merger_iter()?; // used only for the callback let mut documents_count = 0; while let Some((key, val)) = iter.next()? { - if val[0] == Operation::Deletion as u8 { - continue; - } + // skip first byte corresponding to the operation type (Deletion or Addition). let val = &val[1..]; // send a callback to show at which step we are @@ -601,16 +711,51 @@ impl<'a, 'i> Transform<'a, 'i> { total_documents: self.documents_count, }); - // We increment all the field of the current document in the field distribution. - let obkv = KvReader::new(val); - - for (key, _) in obkv.iter() { - let name = - self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Computing field distribution in transform.", - })?; - *field_distribution.entry(name.to_string()).or_insert(0) += 1; + for (key, value) in KvReader::new(val) { + let reader = KvReaderDelAdd::new(value); + match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { + (None, None) => {} + (None, Some(_)) => { + // New field + let name = self.fields_ids_map.name(key).ok_or( + FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + }, + )?; + *field_distribution.entry(name.to_string()).or_insert(0) += 1; + } + (Some(_), None) => { + // Field removed + let name = self.fields_ids_map.name(key).ok_or( + FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + }, + )?; + match field_distribution.entry(name.to_string()) { + BEntry::Vacant(_) => { /* Bug? 
trying to remove a non-existing field */ + } + BEntry::Occupied(mut entry) => { + // attempt to remove one + match entry.get_mut().checked_sub(1) { + Some(0) => { + entry.remove(); + } + Some(new_val) => { + *entry.get_mut() = new_val; + } + None => { + unreachable!("Attempting to remove a field that wasn't in the field distribution") + } + } + } + } + } + (Some(_), Some(_)) => { + // Value change, no field distribution change + } + } } writer.insert(key, val)?; } @@ -631,9 +776,7 @@ impl<'a, 'i> Transform<'a, 'i> { // We get rids of the `Operation` byte and skip the deleted documents as well. let mut iter = self.flattened_sorter.into_stream_merger_iter()?; while let Some((key, val)) = iter.next()? { - if val[0] == Operation::Deletion as u8 { - continue; - } + // skip first byte corresponding to the operation type (Deletion or Addition). let val = &val[1..]; writer.insert(key, val)?; } @@ -649,15 +792,11 @@ impl<'a, 'i> Transform<'a, 'i> { new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { fst_new_external_documents_ids_builder.insert(key, value) })?; - let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map(); Ok(TransformOutput { primary_key, fields_ids_map: self.fields_ids_map, field_distribution, - new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(), - new_documents_ids: self.new_documents_ids, - replaced_documents_ids: self.replaced_documents_ids, documents_count: self.documents_count, original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, flattened_documents: flattened_documents @@ -687,37 +826,41 @@ impl<'a, 'i> Transform<'a, 'i> { .to_string(); let field_distribution = self.index.field_distribution(wtxn)?; - // Delete the soft deleted document ids from the maps inside the external_document_ids structure - let new_external_documents_ids = { - let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; - 
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; - // This call should be free and can't fail since the previous method merged both fsts. - external_documents_ids.into_static().to_fst()?.into_owned() - }; - let documents_ids = self.index.documents_ids(wtxn)?; let documents_count = documents_ids.len() as usize; - // We create a final writer to write the new documents in order from the sorter. - let mut original_writer = create_writer( + // We initialize the sorter with the user indexing settings. + let mut original_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, - tempfile::tempfile()?, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), ); - // We create a final writer to write the new documents in order from the sorter. - let mut flattened_writer = create_writer( + // We initialize the sorter with the user indexing settings. + let mut flattened_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, - tempfile::tempfile()?, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), ); let mut obkv_buffer = Vec::new(); - for result in self.index.all_documents(wtxn)? { - let (docid, obkv) = result?; + let mut document_sorter_key_buffer = Vec::new(); + let mut document_sorter_value_buffer = Vec::new(); + for result in self.index.external_documents_ids().iter(wtxn)? 
{ + let (external_id, docid) = result?; + let obkv = self.index.documents.get(wtxn, &docid)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, + )?; + let docid = docid.get(); obkv_buffer.clear(); - let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); + let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer); // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. for (id, name) in new_fields_ids_map.iter() { @@ -727,7 +870,17 @@ impl<'a, 'i> Transform<'a, 'i> { } let buffer = obkv_writer.into_inner()?; - original_writer.insert(docid.to_be_bytes(), &buffer)?; + + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + document_sorter_value_buffer.clear(); + into_del_add_obkv( + KvReaderU16::new(buffer), + DelAddOperation::Addition, + &mut document_sorter_value_buffer, + )?; + original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; // Once we have the document. We're going to flatten it // and insert it in the flattened sorter. @@ -762,29 +915,34 @@ impl<'a, 'i> Transform<'a, 'i> { let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; writer.insert(fid, &value)?; } - flattened_writer.insert(docid.to_be_bytes(), &buffer)?; + document_sorter_value_buffer.clear(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Addition, + &mut document_sorter_value_buffer, + )?; + flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; } - // Once we have written all the documents, we extract - // the file and reset the seek to be able to read it again. 
- let mut original_documents = original_writer.into_inner()?; - original_documents.rewind()?; + let grenad_params = GrenadParameters { + chunk_compression_type: self.indexer_settings.chunk_compression_type, + chunk_compression_level: self.indexer_settings.chunk_compression_level, + max_memory: self.indexer_settings.max_memory, + max_nb_chunks: self.indexer_settings.max_nb_chunks, // default value, may be chosen. + }; - let mut flattened_documents = flattened_writer.into_inner()?; - flattened_documents.rewind()?; + // Once we have written all the documents, we merge everything into a Reader. + let original_documents = sorter_into_reader(original_sorter, grenad_params)?; + + let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; let output = TransformOutput { primary_key, fields_ids_map: new_fields_ids_map, field_distribution, - new_external_documents_ids, - new_documents_ids: documents_ids, - replaced_documents_ids: RoaringBitmap::default(), documents_count, - original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, - flattened_documents: flattened_documents - .into_inner() - .map_err(|err| err.into_error())?, + original_documents: original_documents.into_inner().into_inner(), + flattened_documents: flattened_documents.into_inner().into_inner(), }; let new_facets = output.compute_real_facets(wtxn, self.index)?; @@ -828,38 +986,111 @@ mod test { #[test] fn merge_obkvs() { - let mut doc_0 = Vec::new(); - let mut kv_writer = KvWriter::new(&mut doc_0); + let mut additive_doc_0 = Vec::new(); + let mut deletive_doc_0 = Vec::new(); + let mut del_add_doc_0 = Vec::new(); + let mut kv_writer = KvWriter::memory(); kv_writer.insert(0_u8, [0]).unwrap(); - kv_writer.finish().unwrap(); - doc_0.insert(0, Operation::Addition as u8); - - let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap(); - assert_eq!(*ret, doc_0); - - let ret = merge_obkvs_and_operations( - &[], - &[Cow::from([Operation::Deletion 
as u8].as_slice()), Cow::from(doc_0.as_slice())], + let buffer = kv_writer.into_inner().unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Addition, + &mut additive_doc_0, ) .unwrap(); - assert_eq!(*ret, doc_0); - - let ret = merge_obkvs_and_operations( - &[], - &[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())], + additive_doc_0.insert(0, Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Deletion, + &mut deletive_doc_0, ) .unwrap(); - assert_eq!(*ret, [Operation::Deletion as u8]); + deletive_doc_0.insert(0, Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::DeletionAndAddition, + &mut del_add_doc_0, + ) + .unwrap(); + del_add_doc_0.insert(0, Operation::Addition as u8); - let ret = merge_obkvs_and_operations( + let mut additive_doc_1 = Vec::new(); + let mut kv_writer = KvWriter::memory(); + kv_writer.insert(1_u8, [1]).unwrap(); + let buffer = kv_writer.into_inner().unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Addition, + &mut additive_doc_1, + ) + .unwrap(); + additive_doc_1.insert(0, Operation::Addition as u8); + + let mut additive_doc_0_1 = Vec::new(); + let mut kv_writer = KvWriter::memory(); + kv_writer.insert(0_u8, [0]).unwrap(); + kv_writer.insert(1_u8, [1]).unwrap(); + let buffer = kv_writer.into_inner().unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Addition, + &mut additive_doc_0_1, + ) + .unwrap(); + additive_doc_0_1.insert(0, Operation::Addition as u8); + + let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())]) + .unwrap(); + assert_eq!(*ret, additive_doc_0); + + let ret = obkvs_merge_additions_and_deletions( + &[], + &[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())], + ) + .unwrap(); + assert_eq!(*ret, del_add_doc_0); + + let ret = obkvs_merge_additions_and_deletions( + &[], 
+ &[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())], + ) + .unwrap(); + assert_eq!(*ret, deletive_doc_0); + + let ret = obkvs_merge_additions_and_deletions( &[], &[ - Cow::from([Operation::Addition as u8, 1].as_slice()), - Cow::from([Operation::Deletion as u8].as_slice()), - Cow::from(doc_0.as_slice()), + Cow::from(additive_doc_1.as_slice()), + Cow::from(deletive_doc_0.as_slice()), + Cow::from(additive_doc_0.as_slice()), ], ) .unwrap(); - assert_eq!(*ret, doc_0); + assert_eq!(*ret, del_add_doc_0); + + let ret = obkvs_merge_additions_and_deletions( + &[], + &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], + ) + .unwrap(); + assert_eq!(*ret, additive_doc_0_1); + + let ret = obkvs_keep_last_addition_merge_deletions( + &[], + &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], + ) + .unwrap(); + assert_eq!(*ret, additive_doc_0); + + let ret = obkvs_keep_last_addition_merge_deletions( + &[], + &[ + Cow::from(deletive_doc_0.as_slice()), + Cow::from(additive_doc_1.as_slice()), + Cow::from(additive_doc_0.as_slice()), + ], + ) + .unwrap(); + assert_eq!(*ret, del_add_doc_0); } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 5895a69c5..4f9f0ef6f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,5 +1,4 @@ -use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; @@ -9,32 +8,40 @@ use charabia::{Language, Script}; use grenad::MergerBuilder; use heed::types::ByteSlice; use heed::RwTxn; +use log::error; +use obkv::{KvReader, KvWriter}; +use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::helpers::{ - self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, + self, 
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, + valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; use crate::distance::NDotProductPoint; use crate::error::UserError; +use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; +use crate::index::db_name::DOCUMENTS; use crate::index::Hnsw; +use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; -use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32}; +use crate::{ + lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError, BEU32, +}; pub(crate) enum TypedChunk { FieldIdDocidFacetStrings(grenad::Reader), FieldIdDocidFacetNumbers(grenad::Reader), Documents(grenad::Reader), - FieldIdWordcountDocids(grenad::Reader>), - NewDocumentsIds(RoaringBitmap), + FieldIdWordCountDocids(grenad::Reader>), WordDocids { word_docids_reader: grenad::Reader>, exact_word_docids_reader: grenad::Reader>, + word_fid_docids_reader: grenad::Reader>, }, WordPositionDocids(grenad::Reader>), - WordFidDocids(grenad::Reader>), WordPairProximityDocids(grenad::Reader>), FieldIdFacetStringDocids(grenad::Reader>), FieldIdFacetNumberDocids(grenad::Reader>), @@ -43,7 +50,7 @@ pub(crate) enum TypedChunk { FieldIdFacetIsEmptyDocids(grenad::Reader>), GeoPoints(grenad::Reader>), VectorPoints(grenad::Reader>), - ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), + ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } impl TypedChunk { @@ -58,23 +65,22 @@ impl TypedChunk { TypedChunk::Documents(grenad) => { format!("Documents {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::FieldIdWordcountDocids(grenad) => { + TypedChunk::FieldIdWordCountDocids(grenad) => { 
format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::NewDocumentsIds(grenad) => { - format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len()) - } - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!( - "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}", + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => format!( + "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}", word_docids_reader.len(), - exact_word_docids_reader.len() + exact_word_docids_reader.len(), + word_fid_docids_reader.len() ), TypedChunk::WordPositionDocids(grenad) => { format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::WordFidDocids(grenad) => { - format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len()) - } TypedChunk::WordPairProximityDocids(grenad) => { format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) } @@ -99,8 +105,8 @@ impl TypedChunk { TypedChunk::VectorPoints(grenad) => { format!("VectorPoints {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::ScriptLanguageDocids(grenad) => { - format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len()) + TypedChunk::ScriptLanguageDocids(sl_map) => { + format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len()) } } } @@ -119,34 +125,75 @@ pub(crate) fn write_typed_chunk_into_index( let mut is_merged_database = false; match typed_chunk { TypedChunk::Documents(obkv_documents_iter) => { + let mut operations: Vec = Default::default(); + + let mut docids = index.documents_ids(wtxn)?; let mut cursor = obkv_documents_iter.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - index.documents.remap_types::().put(wtxn, key, value)?; + while let Some((key, reader)) = cursor.move_on_next()? 
{ + let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); + let reader: KvReader = KvReader::new(reader); + + let (document_id_bytes, external_id_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?; + let docid = DocumentId::from_be_bytes(document_id_bytes); + let external_id = std::str::from_utf8(external_id_bytes)?; + + for (field_id, value) in reader.iter() { + let del_add_reader = KvReaderDelAdd::new(value); + + if let Some(addition) = del_add_reader.get(DelAdd::Addition) { + writer.insert(field_id, addition)?; + } + } + + let db = index.documents.remap_data_type::(); + + if !writer.is_empty() { + db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?; + operations.push(DocumentOperation { + external_id: external_id.to_string(), + internal_id: docid, + kind: DocumentOperationKind::Create, + }); + docids.insert(docid); + } else { + db.delete(wtxn, &BEU32::new(docid))?; + operations.push(DocumentOperation { + external_id: external_id.to_string(), + internal_id: docid, + kind: DocumentOperationKind::Delete, + }); + docids.remove(docid); + } } + let external_documents_docids = index.external_documents_ids(); + external_documents_docids.apply(wtxn, operations)?; + index.put_documents_ids(wtxn, &docids)?; } - TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => { + TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { append_entries_into_database( fid_word_count_docids_iter, &index.field_id_word_count_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::NewDocumentsIds(documents_ids) => { - return Ok((documents_ids, is_merged_database)) - } - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + 
word_fid_docids_reader, + } => { let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; append_entries_into_database( word_docids_iter.clone(), &index.word_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; @@ -155,8 +202,18 @@ pub(crate) fn write_typed_chunk_into_index( &index.exact_word_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + )?; + + let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; + append_entries_into_database( + word_fid_docids_iter, + &index.word_fid_docids, + wtxn, + index_is_empty, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; // create fst from word docids @@ -177,19 +234,8 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_position_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, - )?; - is_merged_database = true; - } - TypedChunk::WordFidDocids(word_fid_docids_iter) => { - append_entries_into_database( - word_fid_docids_iter, - &index.word_fid_docids, - wtxn, - index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -209,8 +255,8 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_exists_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -220,8 +266,8 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_is_null_docids, wtxn, 
index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -231,8 +277,8 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_is_empty_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -242,8 +288,8 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_pair_proximity_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -252,8 +298,18 @@ pub(crate) fn write_typed_chunk_into_index( index.field_id_docid_facet_f64s.remap_types::(); let mut cursor = fid_docid_facet_number.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { + let reader = KvReaderDelAdd::new(value); if valid_lmdb_key(key) { - index_fid_docid_facet_numbers.put(wtxn, key, value)?; + match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { + (None, None) => {} + (None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?, + (Some(_), None) => { + index_fid_docid_facet_numbers.delete(wtxn, key)?; + } + (Some(_), Some(new)) => { + index_fid_docid_facet_numbers.put(wtxn, key, new)? + } + } } } } @@ -262,8 +318,18 @@ pub(crate) fn write_typed_chunk_into_index( index.field_id_docid_facet_strings.remap_types::(); let mut cursor = fid_docid_facet_string.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ + let reader = KvReaderDelAdd::new(value); if valid_lmdb_key(key) { - index_fid_docid_facet_strings.put(wtxn, key, value)?; + match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { + (None, None) => {} + (None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?, + (Some(_), None) => { + index_fid_docid_facet_strings.delete(wtxn, key)?; + } + (Some(_), Some(new)) => { + index_fid_docid_facet_strings.put(wtxn, key, new)? + } + } } } } @@ -276,57 +342,86 @@ pub(crate) fn write_typed_chunk_into_index( // convert the key back to a u32 (4 bytes) let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - // convert the latitude and longitude back to a f64 (8 bytes) - let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); - let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); - let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; - let xyz_point = lat_lng_to_xyz(&point); - - rtree.insert(GeoPoint::new(xyz_point, (docid, point))); - geo_faceted_docids.insert(docid); + let deladd_obkv = KvReaderDelAdd::new(value); + if let Some(value) = deladd_obkv.get(DelAdd::Deletion) { + let geopoint = extract_geo_point(value, docid); + rtree.remove(&geopoint); + geo_faceted_docids.remove(docid); + } + if let Some(value) = deladd_obkv.get(DelAdd::Addition) { + let geopoint = extract_geo_point(value, docid); + rtree.insert(geopoint); + geo_faceted_docids.insert(docid); + } } index.put_geo_rtree(wtxn, &rtree)?; index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; } TypedChunk::VectorPoints(vector_points) => { - let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? 
{ - Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(), - None => Default::default(), - }; - - // Convert the PointIds into DocumentIds - let mut docids = Vec::new(); - for pid in pids { - let docid = - index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap(); - docids.push(docid.get()); + let mut vectors_set = HashSet::new(); + // We extract and store the previous vectors + if let Some(hnsw) = index.vector_hnsw(wtxn)? { + for (pid, point) in hnsw.iter() { + let pid_key = BEU32::new(pid.into_inner()); + let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap().get(); + let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect(); + vectors_set.insert((docid, vector)); + } } - let mut expected_dimensions = points.get(0).map(|p| p.len()); let mut cursor = vector_points.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { // convert the key back to a u32 (4 bytes) let (left, _index) = try_split_array_at(key).unwrap(); let docid = DocumentId::from_be_bytes(left); - // convert the vector back to a Vec - let vector: Vec = pod_collect_to_vec(value); - // TODO Inform the user about the document that has a wrong `_vectors` - let found = vector.len(); - let expected = *expected_dimensions.get_or_insert(found); - if expected != found { - return Err(UserError::InvalidVectorDimensions { expected, found })?; + let vector_deladd_obkv = KvReaderDelAdd::new(value); + if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { + // convert the vector back to a Vec + let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); + let key = (docid, vector); + if !vectors_set.remove(&key) { + error!("Unable to delete the vector: {:?}", key.1); + } + } + if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { + // convert the vector back to a Vec + let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); + vectors_set.insert((docid, vector)); } - - 
points.push(NDotProductPoint::new(vector)); - docids.push(docid); } - assert_eq!(docids.len(), points.len()); + // Extract the most common vector dimension + let expected_dimension_size = { + let mut dims = HashMap::new(); + vectors_set.iter().for_each(|(_, v)| *dims.entry(v.len()).or_insert(0) += 1); + dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len) + }; + + // Ensure that the vector lengths are correct and + // prepare the vectors before inserting them in the HNSW. + let mut points = Vec::new(); + let mut docids = Vec::new(); + for (docid, vector) in vectors_set { + if expected_dimension_size.map_or(false, |expected| expected != vector.len()) { + return Err(UserError::InvalidVectorDimensions { + expected: expected_dimension_size.unwrap_or(vector.len()), + found: vector.len(), + } + .into()); + } else { + let vector = vector.into_iter().map(OrderedFloat::into_inner).collect(); + points.push(NDotProductPoint::new(vector)); + docids.push(docid); + } + } let hnsw_length = points.len(); let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points); + assert_eq!(docids.len(), pids.len()); + + // Store the vectors in the point-docid relation database index.vector_id_docid.clear(wtxn)?; for (docid, pid) in docids.into_iter().zip(pids) { index.vector_id_docid.put( @@ -339,22 +434,25 @@ pub(crate) fn write_typed_chunk_into_index( log::debug!("There are {} entries in the HNSW so far", hnsw_length); index.put_vector_hnsw(wtxn, &new_hnsw)?; } - TypedChunk::ScriptLanguageDocids(hash_pair) => { - let mut buffer = Vec::new(); - for (key, value) in hash_pair { - buffer.clear(); + TypedChunk::ScriptLanguageDocids(sl_map) => { + for (key, (deletion, addition)) in sl_map { + let mut db_key_exists = false; let final_value = match index.script_language_docids.get(wtxn, &key)? 
{ Some(db_values) => { - let mut db_value_buffer = Vec::new(); - serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?; - let mut new_value_buffer = Vec::new(); - serialize_roaring_bitmap(&value, &mut new_value_buffer)?; - merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?; - RoaringBitmap::deserialize_from(&buffer[..])? + db_key_exists = true; + (db_values - deletion) | addition } - None => value, + None => addition, }; - index.script_language_docids.put(wtxn, &key, &final_value)?; + + if final_value.is_empty() { + // If the database entry exists, delete it. + if db_key_exists { + index.script_language_docids.delete(wtxn, &key)?; + } + } else { + index.script_language_docids.put(wtxn, &key, &final_value)?; + } } } } @@ -362,6 +460,15 @@ pub(crate) fn write_typed_chunk_into_index( Ok((RoaringBitmap::new(), is_merged_database)) } +/// Converts the latitude and longitude back to an xyz GeoPoint. +fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { + let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); + let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); + let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; + let xyz_point = lat_lng_to_xyz(&point); + GeoPoint::new(xyz_point, (docid, point)) +} + fn merge_word_docids_reader_into_fst( word_docids_iter: grenad::Reader>, exact_word_docids_iter: grenad::Reader>, @@ -379,24 +486,6 @@ fn merge_word_docids_reader_into_fst( Ok(builder.into_set()) } -fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> { - let new_value = RoaringBitmap::deserialize_from(new_value)?; - let db_value = RoaringBitmap::deserialize_from(db_value)?; - let value = new_value | db_value; - Ok(serialize_roaring_bitmap(&value, buffer)?) 
-} - -fn merge_cbo_roaring_bitmaps( - new_value: &[u8], - db_value: &[u8], - buffer: &mut Vec, -) -> Result<()> { - Ok(CboRoaringBitmapCodec::merge_into( - &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], - buffer, - )?) -} - /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. fn write_entries_into_database( @@ -410,7 +499,7 @@ fn write_entries_into_database( where R: io::Read + io::Seek, FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, - FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, + FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, { puffin::profile_function!(format!("number of entries: {}", data.len())); @@ -422,17 +511,19 @@ where if valid_lmdb_key(key) { buffer.clear(); let value = if index_is_empty { - serialize_value(value, &mut buffer)? + Some(serialize_value(value, &mut buffer)?) } else { match database.get(wtxn, key)? { - Some(prev_value) => { - merge_values(value, prev_value, &mut buffer)?; - &buffer[..] - } - None => serialize_value(value, &mut buffer)?, + Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, + None => Some(serialize_value(value, &mut buffer)?), } }; - database.put(wtxn, key, value)?; + match value { + Some(value) => database.put(wtxn, key, value)?, + None => { + database.delete(wtxn, key)?; + } + } } } @@ -454,7 +545,8 @@ fn append_entries_into_database( where R: io::Read + io::Seek, FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, - FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, + FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, + K: for<'a> heed::BytesDecode<'a>, { puffin::profile_function!(format!("number of entries: {}", data.len())); @@ -475,6 +567,12 @@ where let mut cursor = data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ if valid_lmdb_key(key) { + debug_assert!( + K::bytes_decode(key).is_some(), + "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}", + key.len(), + &key + ); buffer.clear(); let value = serialize_value(value, &mut buffer)?; unsafe { database.append(key, value)? }; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 9982957e5..eb2b6e69a 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -1,6 +1,5 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; -pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDeletionResult}; pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ @@ -9,10 +8,6 @@ pub use self::index_documents::{ MergeFn, }; pub use self::indexer_config::IndexerConfig; -pub use self::prefix_word_pairs::{ - PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, - MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, -}; pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; @@ -21,11 +16,10 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; -mod delete_documents; +pub(crate) mod del_add; pub(crate) mod facet; mod index_documents; mod indexer_config; -mod prefix_word_pairs; mod settings; mod update_step; mod word_prefix_docids; diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs deleted file mode 100644 index e3135d546..000000000 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ /dev/null @@ -1,579 +0,0 @@ -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::{BufReader, BufWriter}; - -use grenad::CompressionType; -use heed::types::ByteSlice; - -use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; -use 
crate::{Index, Result}; - -mod prefix_word; -mod word_prefix; - -pub use prefix_word::index_prefix_word_database; -pub use word_prefix::index_word_prefix_database; - -pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4; -pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2; - -pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - max_proximity: u8, - max_prefix_length: usize, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, -} -impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, - ) -> Self { - Self { - wtxn, - index, - max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, - max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, - chunk_compression_type, - chunk_compression_level, - } - } - - #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute<'a>( - self, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &'a [String], - common_prefix_fst_words: &[&'a [String]], - del_prefix_fst_words: &HashSet>, - ) -> Result<()> { - puffin::profile_function!(); - - index_word_prefix_database( - self.wtxn, - self.index.word_pair_proximity_docids, - self.index.word_prefix_pair_proximity_docids, - self.max_proximity, - self.max_prefix_length, - new_word_pair_proximity_docids.clone(), - new_prefix_fst_words, - common_prefix_fst_words, - del_prefix_fst_words, - self.chunk_compression_type, - self.chunk_compression_level, - )?; - - index_prefix_word_database( - self.wtxn, - self.index.word_pair_proximity_docids, - self.index.prefix_word_pair_proximity_docids, - self.max_proximity, - self.max_prefix_length, - new_word_pair_proximity_docids, - new_prefix_fst_words, - common_prefix_fst_words, - del_prefix_fst_words, - self.chunk_compression_type, - self.chunk_compression_level, - )?; - - 
Ok(()) - } -} - -// This is adapted from `sorter_into_lmdb_database` -pub fn insert_into_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - new_key: &[u8], - new_value: &[u8], -) -> Result<()> { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; - match iter.next().transpose()? { - Some((key, old_val)) if new_key == key => { - let val = - merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) - .map_err(|_| { - // TODO just wrap this error? - crate::error::InternalError::IndexingMergingKeys { - process: "get-put-merge", - } - })?; - // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour - unsafe { iter.put_current(new_key, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; - } - } - Ok(()) -} - -// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, -// but it uses `append` if the database is empty, and it assumes that the values in the -// writer don't conflict with values in the database. -pub fn write_into_lmdb_database_without_merging( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - writer: grenad::Writer>, -) -> Result<()> { - let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?; - let reader = grenad::Reader::new(BufReader::new(file))?; - if database.is_empty(wtxn)? { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - // safety: the key comes from the grenad reader, not the database - unsafe { out_iter.append(k, v)? }; - } - } else { - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? 
{ - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - Ok(()) -} - -#[cfg(test)] -mod tests { - use std::io::Cursor; - use std::iter::FromIterator; - - use roaring::RoaringBitmap; - - use crate::db_snap; - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::update::{DeleteDocuments, DeletionStrategy, IndexDocumentsMethod}; - - fn documents_with_enough_different_words_for_prefixes( - prefixes: &[&str], - start_id: usize, - ) -> Vec { - let mut documents = Vec::new(); - let mut id = start_id; - for prefix in prefixes { - for i in 0..50 { - documents.push( - serde_json::json!({ - "id": id, - "text": format!("{prefix}{i:x}"), - }) - .as_object() - .unwrap() - .clone(), - ); - id += 1; - } - } - documents - } - - #[test] - fn add_new_documents() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": "9000", - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": "9001", - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - 
db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100); - documents.push( - serde_json::json!({ - "id": "9002", - "text": "At an extraordinary house" - }) - .as_object() - .unwrap() - .clone(), - ); - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_pair_proximity_docids, "update"); - db_snap!(index, word_prefix_pair_proximity_docids, "update"); - db_snap!(index, prefix_word_pair_proximity_docids, "update"); - } - #[test] - fn batch_bug_3043() { - // https://github.com/meilisearch/meilisearch/issues/3043 - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "text": "x y" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "text": "x a y" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_pair_proximity_docids); - db_snap!(index, word_prefix_pair_proximity_docids); - db_snap!(index, prefix_word_pair_proximity_docids); - } - - 
#[test] - fn hard_delete_and_reupdate() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysHard); - delete.delete_documents(&RoaringBitmap::from_iter([50])); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, "first_delete"); - db_snap!(index, word_docids, "first_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = 
DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysHard); - delete.delete_documents(&RoaringBitmap::from_iter(0..50)); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, "second_delete"); - db_snap!(index, word_docids, "second_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "reupdate"); - db_snap!(index, word_docids, "reupdate"); - db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); - db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); - } - - #[test] - fn soft_delete_and_reupdate() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings at 5 am" - }) - 
.as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysSoft); - delete.delete_documents(&RoaringBitmap::from_iter([50])); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, "first_delete"); - db_snap!(index, word_docids, "first_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysSoft); - - delete.delete_documents(&RoaringBitmap::from_iter(0..50)); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, "second_delete"); - db_snap!(index, word_docids, "second_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "reupdate"); - db_snap!(index, word_docids, "reupdate"); - db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); - db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); - } - - #[test] - fn replace_soft_deletion() { - let mut index = TempIndex::new(); - 
index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "replaced"); - db_snap!(index, word_docids, "replaced"); - db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); - db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); - db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]"); - } - - #[test] - fn replace_hard_deletion() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; - index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "replaced"); - db_snap!(index, word_docids, "replaced"); - db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); - db_snap!(index, 
prefix_word_pair_proximity_docids, "replaced"); - db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]"); - } -} diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs deleted file mode 100644 index 1ec66d010..000000000 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ /dev/null @@ -1,182 +0,0 @@ -use std::borrow::Cow; -use std::collections::{BTreeMap, HashSet}; - -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; - -use crate::update::index_documents::{create_writer, CursorClonableMmap}; -use crate::update::prefix_word_pairs::{ - insert_into_database, write_into_lmdb_database_without_merging, -}; -use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; - -#[allow(clippy::too_many_arguments)] -#[logging_timer::time] -pub fn index_prefix_word_database( - wtxn: &mut heed::RwTxn, - word_pair_proximity_docids: heed::Database, - prefix_word_pair_proximity_docids: heed::Database, - max_proximity: u8, - max_prefix_length: usize, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &[String], - common_prefix_fst_words: &[&[String]], - del_prefix_fst_words: &HashSet>, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, -) -> Result<()> { - puffin::profile_function!(); - - let max_proximity = max_proximity - 1; - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - let common_prefixes: Vec<_> = common_prefix_fst_words - .iter() - .flat_map(|s| s.iter()) - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length) - .collect(); - - for proximity in 1..max_proximity { - for prefix in common_prefixes.iter() { - let mut prefix_key = vec![proximity]; - prefix_key.extend_from_slice(prefix.as_bytes()); - let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; - // This is the core of the algorithm - 
execute_on_word_pairs_and_prefixes( - proximity, - prefix.as_bytes(), - // the next two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.next()? { - let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) - .ok_or(heed::Error::Decoding)?; - Ok(Some((word2, value))) - } else { - Ok(None) - } - }, - // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) - |key, value| { - insert_into_database( - wtxn, - *prefix_word_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - let new_prefixes: Vec<_> = new_prefix_fst_words - .iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length) - .collect(); - - // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) - // element in an intermediary grenad - let mut writer = - create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); - - for proximity in 1..max_proximity { - for prefix in new_prefixes.iter() { - let mut prefix_key = vec![proximity]; - prefix_key.extend_from_slice(prefix.as_bytes()); - let mut db_iter = word_pair_proximity_docids - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? 
- .remap_key_type::(); - execute_on_word_pairs_and_prefixes( - proximity, - prefix.as_bytes(), - &mut db_iter, - |db_iter| { - db_iter - .next() - .transpose() - .map(|x| x.map(|((_, _, word2), value)| (word2, value))) - .map_err(|e| e.into()) - }, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - } - } - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - wtxn, - *prefix_word_pair_proximity_docids.as_polymorph(), - writer, - )?; - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. - if !del_prefix_fst_words.is_empty() { - let mut iter = - prefix_word_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; - while let Some(((_, prefix, _), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(prefix.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; - } - } - } - - Ok(()) -} - -/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. 
-/// -/// Its arguments are: -/// - an iterator over the words following the given `prefix` with the given `proximity` -/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements -fn execute_on_word_pairs_and_prefixes( - proximity: u8, - prefix: &[u8], - iter: &mut I, - mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result>, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, -) -> Result<()> { - let mut batch: BTreeMap, Vec>> = BTreeMap::default(); - - // Memory usage check: - // The content of the loop will be called for each `word2` that follows a word beginning - // with `prefix` with the given proximity. - // In practice, I don't think the batch can ever get too big. - while let Some((word2, docids)) = next_word2_and_docids(iter)? { - let entry = batch.entry(word2.to_owned()).or_default(); - entry.push(Cow::Owned(docids.to_owned())); - } - - let mut key_buffer = Vec::with_capacity(512); - key_buffer.push(proximity); - key_buffer.extend_from_slice(prefix); - key_buffer.push(0); - - let mut value_buffer = Vec::with_capacity(65_536); - - for (word2, docids) in batch { - key_buffer.truncate(prefix.len() + 2); - value_buffer.clear(); - - key_buffer.extend_from_slice(&word2); - let data = if docids.len() > 1 { - CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?; - value_buffer.as_slice() - } else { - &docids[0] - }; - insert(key_buffer.as_slice(), data)?; - } - Ok(()) -} diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 6609786a3..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,20 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [101, 
] -1 a amazing [100, ] -1 a an [100, ] -1 a and [100, ] -1 a beautiful [100, ] -1 b house [100, ] -1 b rings [101, ] -1 be house [100, ] -1 be rings [101, ] -2 a am [101, ] -2 a amazing [100, ] -2 a and [100, ] -2 a beautiful [100, ] -2 a house [100, ] -2 b at [101, ] -2 be at [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 52b29e136..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,23 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [101, ] -1 amazing a [100, ] -1 an a [100, ] -1 and b [100, ] -1 and be [100, ] -1 at a [100, ] -1 rings a [101, ] -1 the b [101, ] -1 the be [101, ] -2 amazing b [100, ] -2 amazing be [100, ] -2 an a [100, ] -2 at a [100, 101, ] -2 bell a [101, ] -3 an b [100, ] -3 an be [100, ] -3 at a [100, ] -3 rings a [101, ] -3 the a [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 7644c433d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,29 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [101, ] -1 a amazing [100, ] -1 a an [100, 202, ] -1 a and [100, ] -1 a beautiful [100, ] -1 a extraordinary [202, ] -1 am and [100, ] -1 an amazing [100, ] -1 an beautiful [100, ] -1 an extraordinary [202, ] -1 b house [100, ] -1 b rings [101, ] -1 be house [100, ] -1 be rings [101, ] -2 a am [101, ] -2 a amazing [100, ] -2 a and 
[100, ] -2 a beautiful [100, ] -2 a extraordinary [202, ] -2 a house [100, 202, ] -2 am beautiful [100, ] -2 an and [100, ] -2 an house [100, 202, ] -2 b at [101, ] -2 be at [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap deleted file mode 100644 index 1b56974c2..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap +++ /dev/null @@ -1,33 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 am [101, ] -1 amazing and [100, ] -1 an amazing [100, ] -1 an extraordinary [202, ] -1 and beautiful [100, ] -1 at 5 [101, ] -1 at an [100, 202, ] -1 beautiful house [100, ] -1 bell rings [101, ] -1 extraordinary house [202, ] -1 rings at [101, ] -1 the bell [101, ] -2 amazing beautiful [100, ] -2 an and [100, ] -2 an house [202, ] -2 and house [100, ] -2 at am [101, ] -2 at amazing [100, ] -2 at extraordinary [202, ] -2 bell at [101, ] -2 rings 5 [101, ] -2 the rings [101, ] -3 amazing house [100, ] -3 an beautiful [100, ] -3 at and [100, ] -3 at house [202, ] -3 bell 5 [101, ] -3 rings am [101, ] -3 the at [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 008a4b21d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,31 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [101, ] -1 5 am [101, ] -1 amazing a [100, ] -1 amazing an [100, ] -1 an a [100, ] -1 an am [100, ] -1 and b [100, ] -1 and be [100, ] -1 at a [100, 202, ] -1 at an [100, 
202, ] -1 rings a [101, ] -1 the b [101, ] -1 the be [101, ] -2 amazing b [100, ] -2 amazing be [100, ] -2 an a [100, ] -2 an an [100, ] -2 at a [100, 101, ] -2 at am [100, 101, ] -2 bell a [101, ] -3 an b [100, ] -3 an be [100, ] -3 at a [100, ] -3 at an [100, ] -3 rings a [101, ] -3 rings am [101, ] -3 the a [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index d212999bb..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap deleted file mode 100644 index 816895dcf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap +++ /dev/null @@ -1,8 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a y [51, ] -1 x a [51, ] -1 x y [50, ] -2 x y [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 03530a2f1..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a y [51, ] -1 x y [50, ] -2 x y [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap deleted file mode 100644 index 39e9fbe65..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 61987fd4a..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -2 a am [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap deleted file mode 100644 index 1caf1a9a3..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap +++ /dev/null @@ -1,60 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] 
-a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -at [51, ] -bell [51, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 618a0b076..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 rings a [51, ] -2 at a [51, ] -2 bell a [51, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 at a [50, ] -1 rings a [51, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap deleted file mode 100644 index 39e9fbe65..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 267a1c01d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 b rings [51, ] -2 b at [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap deleted file mode 100644 index e5336d58c..000000000 --- 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap +++ /dev/null @@ -1,60 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -am [51, ] -at [51, ] -b0 [0, ] -b1 [1, ] -b10 [16, ] -b11 [17, ] -b12 [18, ] -b13 [19, ] -b14 [20, ] -b15 [21, ] -b16 [22, ] -b17 [23, ] -b18 [24, ] -b19 [25, ] -b1a [26, ] -b1b [27, ] -b1c [28, ] -b1d [29, ] -b1e [30, ] -b1f [31, ] -b2 [2, ] -b20 [32, ] -b21 [33, ] -b22 [34, ] -b23 [35, ] -b24 [36, ] -b25 [37, ] -b26 [38, ] -b27 [39, ] -b28 [40, ] -b29 [41, ] -b2a [42, ] -b2b [43, ] -b2c [44, ] -b2d [45, ] -b2e [46, ] -b2f [47, ] -b3 [3, ] -b30 [48, ] -b31 [49, ] -b4 [4, ] -b5 [5, ] -b6 [6, ] -b7 [7, ] -b8 [8, ] -b9 [9, ] -ba [10, ] -bb [11, ] -bc [12, ] -bd [13, ] -be [14, ] -bell [51, ] -bf [15, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 4cdf756ac..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 the b [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap deleted file mode 100644 index 4dca775e6..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[51, ] diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 61987fd4a..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -2 a am [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap deleted file mode 100644 index 7949d464e..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -am [51, ] -at [51, ] -bell [51, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 618a0b076..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 rings a [51, ] -2 at a [51, ] -2 bell a [51, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 78b6a3885..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a amazing [50, ] -1 a an [50, ] -1 a house [50, ] -2 a amazing [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap deleted file mode 100644 index 8c7809973..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap +++ /dev/null @@ -1,61 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b 
[43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -amazing [50, ] -an [50, ] -at [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 65d8b806b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 an a [50, ] -1 at a [50, ] -2 at a [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap deleted file mode 100644 index 775d41a3d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 54c9e4b9b..000000000 --- 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 b rings [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap deleted file mode 100644 index f86fdcb8b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap +++ /dev/null @@ -1,61 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -amazing [50, ] -an [50, ] -at [50, ] -b0 [52, ] -b1 [53, ] -b10 [68, ] -b11 [69, ] -b12 [70, ] -b13 [71, ] -b14 [72, ] -b15 [73, ] -b16 [74, ] -b17 [75, ] -b18 [76, ] -b19 [77, ] -b1a [78, ] -b1b [79, ] -b1c [80, ] -b1d [81, ] -b1e [82, ] -b1f [83, ] -b2 [54, ] -b20 [84, ] -b21 [85, ] -b22 [86, ] -b23 [87, ] -b24 [88, ] -b25 [89, ] -b26 [90, ] -b27 [91, ] -b28 [92, ] -b29 [93, ] -b2a [94, ] -b2b [95, ] -b2c [96, ] -b2d [97, ] -b2e [98, ] -b2f [99, ] -b3 [55, ] -b30 [100, ] -b31 [101, ] -b4 [56, ] -b5 [57, ] -b6 [58, ] -b7 [59, ] -b8 [60, ] -b9 [61, ] -ba [62, ] -bb [63, ] -bc [64, ] -bd [65, ] -be [66, ] -bell [51, ] -bf [67, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 4cdf756ac..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 the b [51, ] - diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 78b6a3885..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a amazing [50, ] -1 a an [50, ] -1 a house [50, ] -2 a amazing [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap deleted file mode 100644 index 8c7809973..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap +++ /dev/null @@ -1,61 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] 
-a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -amazing [50, ] -an [50, ] -at [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 65d8b806b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 an a [50, ] -1 at a [50, ] -2 at a [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap deleted file mode 100644 index 775d41a3d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 0241f26a5..000000000 --- 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a amazing [50, ] -1 a an [50, ] -1 a house [50, ] -1 b rings [51, ] -2 a amazing [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap deleted file mode 100644 index 6a481eeee..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5f6443e54fae188aa96d4f27fce28939 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index d20582970..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,8 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 an a [50, ] -1 at a [50, ] -1 the b [51, ] -2 at a [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap deleted file mode 100644 index 39e9fbe65..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful 
[50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 at a [50, ] -1 rings a [51, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: 
milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a 
[50, ] -1 at a [50, ] -1 rings a [51, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap deleted file mode 100644 index c8a1e54b4..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index db62b6566..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,17 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -1 b house [50, ] -1 b rings [51, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] -2 b at [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap deleted file mode 100644 index 7fd726325..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap 
+++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -9f4866b80177e321a33ce434992022b5 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 2ea0d46f4..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,19 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 and b [50, ] -1 at a [50, ] -1 rings a [51, ] -1 the b [51, ] -2 amazing b [50, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 an b [50, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap deleted file mode 100644 index 4dca775e6..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing 
[50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 at a [50, ] -1 rings a [51, 
] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs deleted file mode 100644 index 570adece9..000000000 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ /dev/null @@ -1,728 +0,0 @@ -/*! -The word-prefix-pair-proximity-docids database is a database whose keys are of -the form `(proximity, word, prefix)` and the values are roaring bitmaps of -the documents which contain `word` followed by another word starting with -`prefix` at a distance of `proximity`. - -The prefixes present in this database are only those that correspond to many -different words in the documents. - -## How is it created/updated? (simplified version) -To compute it, we have access to (mainly) two inputs: - -* a list of sorted prefixes, such as: -```text -c -ca -cat -d -do -dog -``` -Note that only prefixes which correspond to more than a certain number of -different words from the database are included in this list. - -* a sorted list of proximities and word pairs (the proximity is the distance between the two words), -associated with a roaring bitmap, such as: -```text -1 good doggo -> docids1: [8] -1 good door -> docids2: [7, 19, 20] -1 good ghost -> docids3: [1] -2 good dog -> docids4: [2, 5, 6] -2 horror cathedral -> docids5: [1, 2] -``` - -I illustrate a simplified version of the algorithm to create the word-prefix -pair-proximity database below: - -1. **Outer loop:** First, we iterate over each proximity and word pair: -```text -proximity: 1 -word1 : good -word2 : doggo -``` -2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are -in the list of sorted prefixes. And we insert the key `prefix` -and the value (`docids`) to a sorted map which we call the “batch”. 
For example, -at the end of the first outer loop, we may have: -```text -Outer loop 1: ------------------------------- -proximity: 1 -word1 : good -word2 : doggo -docids : docids1 - -prefixes: [d, do, dog] - -batch: [ - d, -> [docids1] - do -> [docids1] - dog -> [docids1] -] -``` -3. For illustration purpose, let's run through a second iteration of the outer loop: -```text -Outer loop 2: ------------------------------- -proximity: 1 -word1 : good -word2 : door -docids : docids2 - -prefixes: [d, do, doo] - -batch: [ - d -> [docids1, docids2] - do -> [docids1, docids2] - dog -> [docids1] - doo -> [docids2] -] -``` -Notice that there were some conflicts which were resolved by merging the -conflicting values together. Also, an additional prefix was added at the -end of the batch. - -4. On the third iteration of the outer loop, we have: -```text -Outer loop 3: ------------------------------- -proximity: 1 -word1 : good -word2 : ghost -``` -Because `word2` begins with a different letter than the previous `word2`, -we know that all the prefixes of `word2` are greater than the prefixes of the previous word2 - -Therefore, we know that we can insert every element from the batch into the -database before proceeding any further. This operation is called -“flushing the batch”. Flushing the batch should also be done whenever: -* `proximity` is different than the previous `proximity`. -* `word1` is different than the previous `word1`. -* `word2` starts with a different letter than the previous word2 - -6. **Flushing the batch:** to flush the batch, we iterate over its elements: -```text -Flushing Batch loop 1: ------------------------------- -proximity : 1 -word1 : good -prefix : d - -docids : [docids2, docids3] -``` -We then merge the array of `docids` (of type `Vec>`) using -`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a -roaring bitmap of all the document ids where `word1` is followed by `prefix` -at a distance of `proximity`. 
-Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids` -into the database. - -7. That's it! ... except... - -## How is it created/updated (continued) - -I lied a little bit about the input data. In reality, we get two sets of the -inputs described above, which come from different places: - -* For the list of sorted prefixes, we have: - 1. `new_prefixes`, which are all the prefixes that were not present in the - database before the insertion of the new documents - - 2. `common_prefixes` which are the prefixes that are present both in the - database and in the newly added documents - -* For the list of word pairs and proximities, we have: - 1. `new_word_pairs`, which is the list of word pairs and their proximities - present in the newly added documents - - 2. `word_pairs_db`, which is the list of word pairs from the database. - This list includes all elements in `new_word_pairs` since `new_word_pairs` - was added to the database prior to calling the `WordPrefix::execute` - function. - -To update the prefix database correctly, we call the algorithm described earlier first -on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). -Thus: - -1. For all the word pairs that were already present in the DB, we insert them -again with the `new_prefixes`. Calling the algorithm on them with the -`common_prefixes` would not result in any new data. - -2. For all the new word pairs, we insert them twice: first with the `common_prefixes`, -and then, because they are part of `word_pairs_db`, with the `new_prefixes`. - -Note, also, that since we read data from the database when iterating over -`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- -docids from the batch directly into the database (we would have a concurrent -reader and writer). 
Therefore, when calling the algorithm on -`(new_prefixes, word_pairs_db)`, we insert the computed -`((proximity, word, prefix), docids)` elements in an intermediary grenad -Writer instead of the DB. At the end of the outer loop, we finally read from -the grenad and insert its elements in the database. -*/ - -use std::borrow::Cow; -use std::collections::HashSet; - -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; - -use crate::update::index_documents::{create_writer, CursorClonableMmap}; -use crate::update::prefix_word_pairs::{ - insert_into_database, write_into_lmdb_database_without_merging, -}; -use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; - -#[allow(clippy::too_many_arguments)] -#[logging_timer::time] -pub fn index_word_prefix_database( - wtxn: &mut heed::RwTxn, - word_pair_proximity_docids: heed::Database, - word_prefix_pair_proximity_docids: heed::Database, - max_proximity: u8, - max_prefix_length: usize, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &[String], - common_prefix_fst_words: &[&[String]], - del_prefix_fst_words: &HashSet>, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, -) -> Result<()> { - puffin::profile_function!(); - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length - let prefixes = PrefixTrieNode::from_sorted_prefixes( - common_prefix_fst_words - .iter() - .flat_map(|s| s.iter()) - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length), - ); - - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (proximity, word1, common_prefix) elements - // to insert in the DB - if !prefixes.is_empty() { - let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - // This is the core of the algorithm - 
execute_on_word_pairs_and_prefixes( - // the first two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.move_on_next()? { - let (proximity, word1, word2) = - UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - Ok(Some(((proximity, word1, word2), value))) - } else { - Ok(None) - } - }, - &prefixes, - max_proximity, - // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap) - |key, value| { - insert_into_database( - wtxn, - *word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - - let prefixes = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length), - ); - - if !prefixes.is_empty() { - let mut db_iter = word_pair_proximity_docids - .remap_key_type::() - .remap_data_type::() - .iter(wtxn)?; - - // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) - // element in an intermediary grenad - let mut writer = - create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); - - execute_on_word_pairs_and_prefixes( - &mut db_iter, - |db_iter| db_iter.next().transpose().map_err(|e| e.into()), - &prefixes, - max_proximity, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - wtxn, - *word_prefix_pair_proximity_docids.as_polymorph(), - writer, - )?; - } - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` 
set must be removed as well. - if !del_prefix_fst_words.is_empty() { - let mut iter = - word_prefix_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; - while let Some(((_, _, prefix), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(prefix.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; - } - } - } - - Ok(()) -} - -/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. -/// -/// Its main arguments are: -/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements -/// 2. a prefix trie -/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements -/// -/// For more information about what this function does, read the module documentation. -fn execute_on_word_pairs_and_prefixes( - iter: &mut I, - mut next_word_pair_proximity: impl for<'a> FnMut( - &'a mut I, - ) -> Result< - Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>, - >, - prefixes: &PrefixTrieNode, - max_proximity: u8, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, -) -> Result<()> { - let mut batch = PrefixAndProximityBatch::default(); - let mut prev_word2_start = 0; - - // Optimisation: the index at the root of the prefix trie where to search for - let mut prefix_search_start = PrefixTrieNodeSearchStart(0); - - // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter - let mut empty_prefixes = false; - - let mut prefix_buffer = Vec::with_capacity(8); - let mut merge_buffer = Vec::with_capacity(65_536); - - while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? 
{ - // stop indexing if the proximity is over the threshold - if proximity > max_proximity { - break; - }; - let word2_start_different_than_prev = word2[0] != prev_word2_start; - // if there were no potential prefixes for the previous word2 based on its first letter, - // and if the current word2 starts with the same letter, then there is also no potential - // prefixes for the current word2, and we can skip to the next iteration - if empty_prefixes && !word2_start_different_than_prev { - continue; - } - - // if the proximity is different to the previous one, OR - // if word1 is different than the previous word1, OR - // if the start of word2 is different than the previous start of word2, - // THEN we'll need to flush the batch - let prox_different_than_prev = proximity != batch.proximity; - let word1_different_than_prev = word1 != batch.word1; - if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev - { - batch.flush(&mut merge_buffer, &mut insert)?; - batch.proximity = proximity; - // don't forget to reset the value of batch.word1 and prev_word2_start - if word1_different_than_prev { - batch.word1.clear(); - batch.word1.extend_from_slice(word1); - } - if word2_start_different_than_prev { - prev_word2_start = word2[0]; - } - prefix_search_start.0 = 0; - // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2 - empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); - } - - if !empty_prefixes { - // All conditions are satisfied, we can now insert each new prefix of word2 into the batch - prefix_buffer.clear(); - prefixes.for_each_prefix_of( - word2, - &mut prefix_buffer, - &prefix_search_start, - |prefix_buffer| { - batch.insert(prefix_buffer, data.to_vec()); - }, - ); - } - } - batch.flush(&mut merge_buffer, &mut insert)?; - Ok(()) -} -/** -A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps). 
-The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. - -It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently. - -The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content -can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: -- key : (proximity, word1, prefix) as bytes -- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes -*/ -#[derive(Default)] -struct PrefixAndProximityBatch { - proximity: u8, - word1: Vec, - #[allow(clippy::type_complexity)] - batch: Vec<(Vec, Vec>)>, -} - -impl PrefixAndProximityBatch { - /// Insert the new key and value into the batch - /// - /// The key must either exist in the batch or be greater than all existing keys - fn insert(&mut self, new_key: &[u8], new_value: Vec) { - match self.batch.iter_mut().find(|el| el.0 == new_key) { - Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)), - None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])), - } - } - - /// Empties the batch, calling `insert` on each element. - /// - /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap. - fn flush( - &mut self, - merge_buffer: &mut Vec, - insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, - ) -> Result<()> { - let PrefixAndProximityBatch { proximity, word1, batch } = self; - if batch.is_empty() { - return Ok(()); - } - merge_buffer.clear(); - - let mut buffer = Vec::with_capacity(word1.len() + 1 + 6); - buffer.push(*proximity); - buffer.extend_from_slice(word1); - buffer.push(0); - - for (key, mergeable_data) in batch.drain(..) 
{ - buffer.truncate(1 + word1.len() + 1); - buffer.extend_from_slice(key.as_slice()); - - let data = if mergeable_data.len() > 1 { - CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?; - merge_buffer.as_slice() - } else { - &mergeable_data[0] - }; - insert(buffer.as_slice(), data)?; - merge_buffer.clear(); - } - - Ok(()) - } -} - -/** A prefix trie. Used to iterate quickly over the prefixes of a word that are -within a set. - -## Structure -The trie is made of nodes composed of: -1. a byte character (e.g. 'a') -2. whether the node is an end node or not -3. a list of children nodes, sorted by their byte character - -For example, the trie that stores the strings `[ac, ae, ar, ch, cei, cel, ch, r, rel, ri]` -is drawn below. Nodes with a double border are "end nodes". - -┌──────────────────────┐ ┌──────────────────────┐ ╔══════════════════════╗ -│ a │ │ c │ ║ r ║ -└──────────────────────┘ └──────────────────────┘ ╚══════════════════════╝ -╔══════╗╔══════╗╔══════╗ ┌─────────┐ ╔═════════╗ ┌─────────┐ ╔══════════╗ -║ c ║║ e ║║ r ║ │ e │ ║ h ║ │ e │ ║ i ║ -╚══════╝╚══════╝╚══════╝ └─────────┘ ╚═════════╝ └─────────┘ ╚══════════╝ - ╔═══╗ ╔═══╗ ╔═══╗ - ║ i ║ ║ l ║ ║ l ║ - ╚═══╝ ╚═══╝ ╚═══╝ -*/ -#[derive(Default, Debug)] -struct PrefixTrieNode { - children: Vec<(PrefixTrieNode, u8)>, - is_end_node: bool, -} - -#[derive(Debug)] -struct PrefixTrieNodeSearchStart(usize); - -impl PrefixTrieNode { - fn is_empty(&self) -> bool { - self.children.is_empty() - } - - /// Returns false if the trie does not contain a prefix of the given word. - /// Returns true if the trie *may* contain a prefix of the given word. - /// - /// Moves the search start to the first node equal to the first letter of the word, - /// or to 0 otherwise. 
- fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { - let byte = word[0]; - if self.children[search_start.0].1 == byte { - true - } else { - match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { - Ok(position) => { - search_start.0 += position; - true - } - Err(_) => { - search_start.0 = 0; - false - } - } - } - } - - fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { - let mut node = PrefixTrieNode::default(); - for prefix in prefixes { - node.insert_sorted_prefix(prefix.as_bytes().iter()); - } - node - } - fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter) { - if let Some(&c) = prefix.next() { - if let Some((node, byte)) = self.children.last_mut() { - if *byte == c { - node.insert_sorted_prefix(prefix); - return; - } - } - let mut new_node = PrefixTrieNode::default(); - new_node.insert_sorted_prefix(prefix); - self.children.push((new_node, c)); - } else { - self.is_end_node = true; - } - } - - /// Call the given closure on each prefix of the word contained in the prefix trie. - /// - /// The search starts from the given `search_start`. - fn for_each_prefix_of( - &self, - word: &[u8], - buffer: &mut Vec, - search_start: &PrefixTrieNodeSearchStart, - mut do_fn: impl FnMut(&mut Vec), - ) { - let first_byte = word[0]; - let mut cur_node = self; - buffer.push(first_byte); - if let Some((child_node, c)) = - cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte) - { - if *c == first_byte { - cur_node = child_node; - if cur_node.is_end_node { - do_fn(buffer); - } - for &byte in &word[1..] 
{ - buffer.push(byte); - if let Some((child_node, c)) = - cur_node.children.iter().find(|(_, c)| *c >= byte) - { - if *c == byte { - cur_node = child_node; - if cur_node.is_end_node { - do_fn(buffer); - } - } else { - break; - } - } else { - break; - } - } - } - } - } -} -#[cfg(test)] -mod tests { - use roaring::RoaringBitmap; - - use super::*; - use crate::{CboRoaringBitmapCodec, U8StrStrCodec}; - - fn check_prefixes( - trie: &PrefixTrieNode, - search_start: &PrefixTrieNodeSearchStart, - word: &str, - expected_prefixes: &[&str], - ) { - let mut actual_prefixes = vec![]; - trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| { - let s = String::from_utf8(x.to_owned()).unwrap(); - actual_prefixes.push(s); - }); - assert_eq!(actual_prefixes, expected_prefixes); - } - - #[test] - fn test_trie() { - let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au", - "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c", - "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com", - "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des", - "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f", - "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi", - "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i", - "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka", - "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar", - "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni", - "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi", - "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res", - "ri", "ro", "ru", "s", "sa", 
"san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si", - "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t", - "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", - "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", - ])); - - let mut search_start = PrefixTrieNodeSearchStart(0); - - let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); - assert!(!is_empty); - assert_eq!(search_start.0, 2); - - check_prefixes(&trie, &search_start, "affair", &["a"]); - check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]); - - let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start); - assert!(!is_empty); - assert_eq!(trie.children[search_start.0].1, b'u'); - - check_prefixes(&trie, &search_start, "unique", &["u", "un"]); - - // NOTE: this should fail, because the search start is already beyong 'a' - let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start); - assert!(!is_empty); - // search start is reset - assert_eq!(search_start.0, 0); - - let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "arb", "arbre", "cat", "catto", - ])); - check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]); - check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]); - } - - #[test] - fn test_execute_on_word_pairs_and_prefixes() { - let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "arb", "arbre", "cat", "catto", - ])); - - let mut serialised_bitmap123 = vec![]; - let mut bitmap123 = RoaringBitmap::new(); - bitmap123.insert(1); - bitmap123.insert(2); - bitmap123.insert(3); - CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123); - - let mut serialised_bitmap456 = vec![]; - let mut bitmap456 = RoaringBitmap::new(); - bitmap456.insert(4); - bitmap456.insert(5); - bitmap456.insert(6); - CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut 
serialised_bitmap456); - - let mut serialised_bitmap789 = vec![]; - let mut bitmap789 = RoaringBitmap::new(); - bitmap789.insert(7); - bitmap789.insert(8); - bitmap789.insert(9); - CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789); - - let mut serialised_bitmap_ranges = vec![]; - let mut bitmap_ranges = RoaringBitmap::new(); - bitmap_ranges.insert_range(63_000..65_000); - bitmap_ranges.insert_range(123_000..128_000); - CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); - - let word_pairs = [ - ((1, "healthy", "arbres"), &serialised_bitmap123), - ((1, "healthy", "boat"), &serialised_bitmap123), - ((1, "healthy", "ca"), &serialised_bitmap123), - ((1, "healthy", "cats"), &serialised_bitmap456), - ((1, "healthy", "cattos"), &serialised_bitmap123), - ((1, "jittery", "cat"), &serialised_bitmap123), - ((1, "jittery", "cata"), &serialised_bitmap456), - ((1, "jittery", "catb"), &serialised_bitmap789), - ((1, "jittery", "catc"), &serialised_bitmap_ranges), - ((2, "healthy", "arbre"), &serialised_bitmap123), - ((2, "healthy", "arbres"), &serialised_bitmap456), - ((2, "healthy", "cats"), &serialised_bitmap789), - ((2, "healthy", "cattos"), &serialised_bitmap_ranges), - ((3, "healthy", "arbre"), &serialised_bitmap456), - ((3, "healthy", "arbres"), &serialised_bitmap789), - ]; - - let expected_result = [ - ((1, "healthy", "arb"), bitmap123.clone()), - ((1, "healthy", "arbre"), bitmap123.clone()), - ((1, "healthy", "cat"), &bitmap456 | &bitmap123), - ((1, "healthy", "catto"), bitmap123.clone()), - ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), - ((2, "healthy", "arb"), &bitmap123 | &bitmap456), - ((2, "healthy", "arbre"), &bitmap123 | &bitmap456), - ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges), - ((2, "healthy", "catto"), bitmap_ranges.clone()), - ]; - - let mut result = vec![]; - - let mut iter = - IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| { 
- ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice()) - }); - execute_on_word_pairs_and_prefixes( - &mut iter, - |iter| Ok(iter.next()), - &prefixes, - 2, - |k, v| { - let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap(); - let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); - result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap)); - Ok(()) - }, - ) - .unwrap(); - - for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { - let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x; - let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y; - - assert_eq!(actual_word1, expected_word1); - assert_eq!(actual_prefix, expected_prefix); - assert_eq!(actual_proximity, expected_proximity); - assert_eq!(actual_bitmap, expected_bitmap); - } - } -} diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c2c0e9084..fd7ffa760 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -923,7 +923,7 @@ mod tests { use super::*; use crate::error::Error; use crate::index::tests::TempIndex; - use crate::update::{ClearDocuments, DeleteDocuments}; + use crate::update::ClearDocuments; use crate::{Criterion, Filter, SearchResult}; #[test] @@ -1768,13 +1768,9 @@ mod tests { } index.add_documents(documents! 
{ docs }).unwrap(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - (0..5).for_each(|id| { - builder.delete_external_id(&id.to_string()); - }); - builder.execute().unwrap(); + index.delete_documents((0..5).map(|id| id.to_string()).collect()); + let mut wtxn = index.write_txn().unwrap(); index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_searchable_fields(vec!["id".to_string()]); diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap +++ 
/dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap deleted file mode 100644 index 6d69b2ffb..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap deleted file mode 100644 index 88d3a98aa..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -benoit [2, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap deleted file mode 
100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap deleted file mode 100644 index 6d69b2ffb..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index 9139b7a05..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[0, 1, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap deleted file mode 100644 index 15c881e87..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -benoit [2, ] -kevin [0, ] -kevina [1, ] - diff --git 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap deleted file mode 100644 index 87856f6dc..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -2 0 2.2 1 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap deleted file mode 100644 index a7ee4348d..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] -2 [20, 21, 22, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap deleted file mode 100644 index cfa649653..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -2 0 1.2 1 [20, 22, ] -2 0 2.2 1 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap deleted file mode 100644 index 8336bd712..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap +++ /dev/null @@ -1,19 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] -1 0 aquarium 1 [5, ] -1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] -1 0 cartoon 1 [2, 7, 15, 17, ] -1 0 colorfulness 1 [13, ] -1 0 design 1 [2, 18, ] -1 0 
drawing 1 [3, 4, 5, 8, 10, 11, 16, ] -1 0 geometry 1 [19, ] -1 0 letter 1 [1, ] -1 0 outdoor 1 [4, ] -1 0 painting 1 [3, ] -1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] -1 0 sign 1 [0, ] -2 0 design 1 [21, ] -2 0 geometry 1 [20, 22, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index dfac98e59..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[0, 20, 22, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap deleted file mode 100644 index 972a733e2..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap +++ /dev/null @@ -1,42 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ] -2 [20, 21, 22, ] -36 [3, ] -37 [4, ] -38 [5, ] -39 [6, ] -4 [0, ] -40 [7, ] -41 [8, ] -42 [9, ] -43 [10, ] -44 [11, ] -45 [12, ] -46 [13, ] -47 [14, ] -5 [1, ] -52 [15, ] -57 [16, ] -58 [17, ] -68 [18, ] -69 [19, ] -7 [2, ] -70 [20, ] -71 [21, ] -72 [22, ] -abstract [2, 6, 10, 13, 14, 15, 16, 17, ] -aquarium [5, ] -art [4, 5, 8, 9, 10, 12, 17, ] -cartoon [2, 7, 15, 17, ] -colorfulness [13, ] -design [2, 18, 21, ] -drawing [3, 4, 5, 8, 10, 11, 16, ] 
-geometry [19, 20, 22, ] -letter [1, ] -outdoor [4, ] -painting [3, ] -pattern [2, 3, 9, 10, 13, 14, 16, ] -sign [0, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap deleted file mode 100644 index 941838e34..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap +++ /dev/null @@ -1,29 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 1 2 [20, 22, ] -1 1 36 [3, ] -1 1 37 [4, ] -1 1 38 [5, ] -1 1 39 [6, ] -1 1 4 [0, ] -1 1 40 [7, ] -1 1 41 [8, ] -1 1 42 [9, ] -1 1 43 [10, ] -1 1 44 [11, ] -1 1 45 [12, ] -1 1 46 [13, ] -1 1 47 [14, ] -1 1 5 [1, ] -1 1 52 [15, ] -1 1 57 [16, ] -1 1 58 [17, ] -1 1 68 [18, ] -1 1 69 [19, ] -1 1 7 [2, ] -1 1 70 [20, ] -1 1 71 [21, ] -1 1 72 [22, ] -1 2 2 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap 
b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap deleted file mode 100644 index c909a3cd8..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap +++ /dev/null @@ -1,53 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -3 0 48.9021 1 [19, ] -3 0 49.4449 1 [18, ] -3 0 49.9314 1 [17, ] -3 0 50.1112 1 [16, ] -3 0 50.1793 1 [15, ] -3 0 50.2844 1 [14, ] -3 0 50.3518 1 [13, ] -3 0 50.4095 1 [11, ] -3 0 50.4502 1 [12, ] -3 0 50.6053 1 [8, ] -3 0 50.6224 1 [3, ] -3 0 50.6299 1 [0, ] -3 0 50.6312 1 [2, ] -3 0 50.6415 1 [1, ] -3 0 50.6552 1 [4, ] -3 0 50.6924 1 [5, ] -3 0 50.7263 1 [6, ] -3 0 50.7453 1 [7, ] -3 0 50.8466 1 [10, ] -3 0 51.0537 1 [9, ] -3 1 48.9021 4 [16, 17, 18, 19, ] -3 1 50.1793 4 [11, 13, 14, 15, ] -3 1 50.4502 4 [0, 3, 8, 12, ] -3 1 50.6312 4 [1, 2, 4, 5, ] -3 1 50.7263 4 [6, 7, 9, 10, ] -4 0 2.271 1 [17, ] -4 0 2.3708 1 [19, ] -4 0 2.7637 1 [14, ] -4 0 2.7913 1 [18, ] -4 0 2.8547 1 [16, ] -4 0 3.0569 1 [0, ] -4 0 3.1106 1 [1, 2, ] -4 0 3.1476 1 [3, ] -4 0 3.1541 1 [6, ] -4 0 3.1763 1 [5, ] -4 0 3.1897 1 [4, ] -4 0 3.2189 1 [15, ] -4 0 3.2206 1 [7, ] -4 0 3.3758 1 [8, ] -4 0 3.5326 1 [13, ] -4 0 3.6957 1 
[9, ] -4 0 3.9623 1 [12, ] -4 0 4.337 1 [10, ] -4 0 4.4347 1 [11, ] -4 1 2.271 4 [14, 17, 18, 19, ] -4 1 2.8547 4 [0, 1, 2, 3, 16, ] -4 1 3.1541 4 [4, 5, 6, 15, ] -4 1 3.2206 4 [7, 8, 9, 13, ] -4 1 3.9623 3 [10, 11, 12, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index 1260b12de..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[4, 5, 6, 11, 16, 18, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- 
a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index efcd7af8c..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index efcd7af8c..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, 15, ] diff --git 
a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index efcd7af8c..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, 15, ] diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index a30254994..618f451dc 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -4,16 +4,18 @@ use grenad::CompressionType; use heed::types::{ByteSlice, Str}; use heed::Database; +use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{ - create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, - CursorClonableMmap, MergeFn, + create_sorter, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, + write_sorter_into_database, CursorClonableMmap, MergeFn, }; -use crate::{Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Result}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: 
&'t mut heed::RwTxn<'i, 'u>, - word_docids: Database, - word_prefix_docids: Database, + word_docids: Database, + word_prefix_docids: Database, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) max_nb_chunks: Option, @@ -23,8 +25,8 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, - word_docids: Database, - word_prefix_docids: Database, + word_docids: Database, + word_prefix_docids: Database, ) -> WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, @@ -51,7 +53,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, self.max_nb_chunks, @@ -92,11 +94,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // We fetch the docids associated to the newly added word prefix fst only. let db = self.word_docids.remap_data_type::(); + let mut buffer = Vec::new(); for prefix in new_prefix_fst_words { let prefix = std::str::from_utf8(prefix.as_bytes())?; for result in db.prefix_iter(self.wtxn, prefix)? { let (_word, data) = result?; - prefix_docids_sorter.insert(prefix, data)?; + buffer.clear(); + let mut writer = KvWriterDelAdd::new(&mut buffer); + writer.insert(DelAdd::Addition, data)?; + + prefix_docids_sorter.insert(prefix, writer.into_inner()?)?; } } @@ -110,12 +117,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { drop(iter); + let database_is_empty = self.word_prefix_docids.is_empty(self.wtxn)?; + // We finally write the word prefix docids into the LMDB database. 
- sorter_into_lmdb_database( - self.wtxn, - *self.word_prefix_docids.as_polymorph(), + write_sorter_into_database( prefix_docids_sorter, - merge_roaring_bitmaps, + &self.word_prefix_docids, + self.wtxn, + database_is_empty, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; Ok(()) diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs index c65438928..e083f510a 100644 --- a/milli/src/update/words_prefix_integer_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -9,9 +9,11 @@ use log::debug; use crate::error::SerializationError; use crate::heed_codec::StrBEU16Codec; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; +use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, - CursorClonableMmap, MergeFn, + create_sorter, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, + write_sorter_into_database, CursorClonableMmap, MergeFn, }; use crate::{CboRoaringBitmapCodec, Result}; @@ -55,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { let mut prefix_integer_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, self.max_nb_chunks, @@ -108,6 +110,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { // We fetch the docids associated to the newly added word prefix fst only. 
let db = self.word_database.remap_data_type::(); + let mut buffer = Vec::new(); for prefix_bytes in new_prefix_fst_words { let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| { SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } @@ -123,7 +126,11 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { if word.starts_with(prefix) { let key = (prefix, pos); let bytes = StrBEU16Codec::bytes_encode(&key).unwrap(); - prefix_integer_docids_sorter.insert(bytes, data)?; + + buffer.clear(); + let mut writer = KvWriterDelAdd::new(&mut buffer); + writer.insert(DelAdd::Addition, data)?; + prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?; } } } @@ -143,12 +150,16 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { drop(iter); } + let database_is_empty = self.prefix_database.is_empty(self.wtxn)?; + // We finally write all the word prefix integer docids into the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.prefix_database.as_polymorph(), + write_sorter_into_database( prefix_integer_docids_sorter, - merge_cbo_roaring_bitmaps, + &self.prefix_database, + self.wtxn, + database_is_empty, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; Ok(()) @@ -159,6 +170,7 @@ fn write_prefixes_in_sorter( prefixes: &mut HashMap, Vec>>, sorter: &mut grenad::Sorter, ) -> Result<()> { + // TODO: Merge before insertion. 
for (key, data_slices) in prefixes.drain() { for data in data_slices { if valid_lmdb_key(&key) { diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 1c68cfff2..9193ab762 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -88,9 +88,11 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec { let rtxn = index.read_txn().unwrap(); - let docid_map = index.external_documents_ids(&rtxn).unwrap(); - let docid_map: std::collections::HashMap<_, _> = - EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect(); + let docid_map = index.external_documents_ids(); + let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS + .iter() + .map(|id| (docid_map.get(&rtxn, id).unwrap().unwrap(), id)) + .collect(); internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect() }