mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-29 16:24:26 +01:00
Merge branch 'main' into tmp-release-v1.5.0
This commit is contained in:
commit
7cb7e37ba8
3
.github/workflows/benchmarks-pr.yml
vendored
3
.github/workflows/benchmarks-pr.yml
vendored
@ -90,7 +90,8 @@ jobs:
|
|||||||
set -x
|
set -x
|
||||||
export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
|
export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
|
||||||
export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
|
export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
|
||||||
echo 'Here are your benchmarks diff 👊' >> body.txt
|
export bench_name=$(echo ${{ steps.command.outputs.command-arguments }})
|
||||||
|
echo "Here are your $bench_name benchmarks diff 👊" >> body.txt
|
||||||
echo '```' >> body.txt
|
echo '```' >> body.txt
|
||||||
./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
|
./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
|
||||||
echo '```' >> body.txt
|
echo '```' >> body.txt
|
||||||
|
2
.github/workflows/publish-apt-brew-pkg.yml
vendored
2
.github/workflows/publish-apt-brew-pkg.yml
vendored
@ -50,7 +50,7 @@ jobs:
|
|||||||
needs: check-version
|
needs: check-version
|
||||||
steps:
|
steps:
|
||||||
- name: Create PR to Homebrew
|
- name: Create PR to Homebrew
|
||||||
uses: mislav/bump-homebrew-formula-action@v2
|
uses: mislav/bump-homebrew-formula-action@v3
|
||||||
with:
|
with:
|
||||||
formula-name: meilisearch
|
formula-name: meilisearch
|
||||||
formula-path: Formula/m/meilisearch.rb
|
formula-path: Formula/m/meilisearch.rb
|
||||||
|
2
.github/workflows/publish-docker-images.yml
vendored
2
.github/workflows/publish-docker-images.yml
vendored
@ -63,7 +63,7 @@ jobs:
|
|||||||
uses: docker/setup-buildx-action@v3
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
- name: Login to Docker Hub
|
- name: Login to Docker Hub
|
||||||
uses: docker/login-action@v2
|
uses: docker/login-action@v3
|
||||||
with:
|
with:
|
||||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||||
|
4
.github/workflows/sdks-tests.yml
vendored
4
.github/workflows/sdks-tests.yml
vendored
@ -160,7 +160,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
repository: meilisearch/meilisearch-js
|
repository: meilisearch/meilisearch-js
|
||||||
- name: Setup node
|
- name: Setup node
|
||||||
uses: actions/setup-node@v3
|
uses: actions/setup-node@v4
|
||||||
with:
|
with:
|
||||||
cache: 'yarn'
|
cache: 'yarn'
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
@ -318,7 +318,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
repository: meilisearch/meilisearch-js-plugins
|
repository: meilisearch/meilisearch-js-plugins
|
||||||
- name: Setup node
|
- name: Setup node
|
||||||
uses: actions/setup-node@v3
|
uses: actions/setup-node@v4
|
||||||
with:
|
with:
|
||||||
cache: yarn
|
cache: yarn
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
|
10
.github/workflows/test-suite.yml
vendored
10
.github/workflows/test-suite.yml
vendored
@ -43,7 +43,7 @@ jobs:
|
|||||||
toolchain: nightly
|
toolchain: nightly
|
||||||
override: true
|
override: true
|
||||||
- name: Cache dependencies
|
- name: Cache dependencies
|
||||||
uses: Swatinem/rust-cache@v2.6.2
|
uses: Swatinem/rust-cache@v2.7.1
|
||||||
- name: Run cargo check without any default features
|
- name: Run cargo check without any default features
|
||||||
uses: actions-rs/cargo@v1
|
uses: actions-rs/cargo@v1
|
||||||
with:
|
with:
|
||||||
@ -65,7 +65,7 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
- name: Cache dependencies
|
- name: Cache dependencies
|
||||||
uses: Swatinem/rust-cache@v2.6.2
|
uses: Swatinem/rust-cache@v2.7.1
|
||||||
- name: Run cargo check without any default features
|
- name: Run cargo check without any default features
|
||||||
uses: actions-rs/cargo@v1
|
uses: actions-rs/cargo@v1
|
||||||
with:
|
with:
|
||||||
@ -149,7 +149,7 @@ jobs:
|
|||||||
toolchain: stable
|
toolchain: stable
|
||||||
override: true
|
override: true
|
||||||
- name: Cache dependencies
|
- name: Cache dependencies
|
||||||
uses: Swatinem/rust-cache@v2.6.2
|
uses: Swatinem/rust-cache@v2.7.1
|
||||||
- name: Run tests in debug
|
- name: Run tests in debug
|
||||||
uses: actions-rs/cargo@v1
|
uses: actions-rs/cargo@v1
|
||||||
with:
|
with:
|
||||||
@ -168,7 +168,7 @@ jobs:
|
|||||||
override: true
|
override: true
|
||||||
components: clippy
|
components: clippy
|
||||||
- name: Cache dependencies
|
- name: Cache dependencies
|
||||||
uses: Swatinem/rust-cache@v2.6.2
|
uses: Swatinem/rust-cache@v2.7.1
|
||||||
- name: Run cargo clippy
|
- name: Run cargo clippy
|
||||||
uses: actions-rs/cargo@v1
|
uses: actions-rs/cargo@v1
|
||||||
with:
|
with:
|
||||||
@ -187,7 +187,7 @@ jobs:
|
|||||||
override: true
|
override: true
|
||||||
components: rustfmt
|
components: rustfmt
|
||||||
- name: Cache dependencies
|
- name: Cache dependencies
|
||||||
uses: Swatinem/rust-cache@v2.6.2
|
uses: Swatinem/rust-cache@v2.7.1
|
||||||
- name: Run cargo fmt
|
- name: Run cargo fmt
|
||||||
# Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
|
# Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
|
||||||
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
|
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
|
||||||
|
10
Cargo.lock
generated
10
Cargo.lock
generated
@ -1731,12 +1731,13 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "grenad"
|
name = "grenad"
|
||||||
version = "0.4.4"
|
version = "0.4.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93"
|
checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytemuck",
|
"bytemuck",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
|
"rayon",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -3281,6 +3282,7 @@ dependencies = [
|
|||||||
"logging_timer",
|
"logging_timer",
|
||||||
"maplit",
|
"maplit",
|
||||||
"md5",
|
"md5",
|
||||||
|
"meili-snap",
|
||||||
"memmap2",
|
"memmap2",
|
||||||
"mimalloc",
|
"mimalloc",
|
||||||
"obkv",
|
"obkv",
|
||||||
@ -3443,9 +3445,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "obkv"
|
name = "obkv"
|
||||||
version = "0.2.0"
|
version = "0.2.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
|
checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "once_cell"
|
name = "once_cell"
|
||||||
|
@ -25,12 +25,6 @@
|
|||||||
|
|
||||||
<p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>
|
<p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A!
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.
|
Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.
|
||||||
|
|
||||||
<p align="center" name="demo">
|
<p align="center" name="demo">
|
||||||
|
@ -6,9 +6,7 @@ use std::path::Path;
|
|||||||
|
|
||||||
use criterion::{criterion_group, criterion_main, Criterion};
|
use criterion::{criterion_group, criterion_main, Criterion};
|
||||||
use milli::heed::{EnvOpenOptions, RwTxn};
|
use milli::heed::{EnvOpenOptions, RwTxn};
|
||||||
use milli::update::{
|
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||||
DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
|
|
||||||
};
|
|
||||||
use milli::Index;
|
use milli::Index;
|
||||||
use rand::seq::SliceRandom;
|
use rand::seq::SliceRandom;
|
||||||
use rand_chacha::rand_core::SeedableRng;
|
use rand_chacha::rand_core::SeedableRng;
|
||||||
@ -266,17 +264,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
|
|||||||
(index, document_ids_to_delete)
|
(index, document_ids_to_delete)
|
||||||
},
|
},
|
||||||
move |(index, document_ids_to_delete)| {
|
move |(index, document_ids_to_delete)| {
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
delete_documents_from_ids(index, document_ids_to_delete)
|
||||||
|
|
||||||
for ids in document_ids_to_delete {
|
|
||||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
|
||||||
builder.delete_documents(&ids);
|
|
||||||
builder.execute().unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
|
||||||
|
|
||||||
index.prepare_for_closing().wait();
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
@ -613,17 +601,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
|
|||||||
(index, document_ids_to_delete)
|
(index, document_ids_to_delete)
|
||||||
},
|
},
|
||||||
move |(index, document_ids_to_delete)| {
|
move |(index, document_ids_to_delete)| {
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
delete_documents_from_ids(index, document_ids_to_delete)
|
||||||
|
|
||||||
for ids in document_ids_to_delete {
|
|
||||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
|
||||||
builder.delete_documents(&ids);
|
|
||||||
builder.execute().unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
|
||||||
|
|
||||||
index.prepare_for_closing().wait();
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
@ -875,20 +853,29 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
|
|||||||
(index, document_ids_to_delete)
|
(index, document_ids_to_delete)
|
||||||
},
|
},
|
||||||
move |(index, document_ids_to_delete)| {
|
move |(index, document_ids_to_delete)| {
|
||||||
|
delete_documents_from_ids(index, document_ids_to_delete)
|
||||||
|
},
|
||||||
|
)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) {
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
let indexer_config = IndexerConfig::default();
|
||||||
for ids in document_ids_to_delete {
|
for ids in document_ids_to_delete {
|
||||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
let config = IndexDocumentsConfig::default();
|
||||||
builder.delete_documents(&ids);
|
|
||||||
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
|
||||||
|
.unwrap();
|
||||||
|
(builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
index.prepare_for_closing().wait();
|
index.prepare_for_closing().wait();
|
||||||
},
|
|
||||||
)
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn indexing_movies_in_three_batches(c: &mut Criterion) {
|
fn indexing_movies_in_three_batches(c: &mut Criterion) {
|
||||||
@ -1112,17 +1099,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
|
|||||||
(index, document_ids_to_delete)
|
(index, document_ids_to_delete)
|
||||||
},
|
},
|
||||||
move |(index, document_ids_to_delete)| {
|
move |(index, document_ids_to_delete)| {
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
delete_documents_from_ids(index, document_ids_to_delete)
|
||||||
|
|
||||||
for ids in document_ids_to_delete {
|
|
||||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
|
||||||
builder.delete_documents(&ids);
|
|
||||||
builder.execute().unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
|
||||||
|
|
||||||
index.prepare_for_closing().wait();
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
@ -1338,17 +1315,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
|
|||||||
(index, document_ids_to_delete)
|
(index, document_ids_to_delete)
|
||||||
},
|
},
|
||||||
move |(index, document_ids_to_delete)| {
|
move |(index, document_ids_to_delete)| {
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
delete_documents_from_ids(index, document_ids_to_delete)
|
||||||
|
|
||||||
for ids in document_ids_to_delete {
|
|
||||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
|
||||||
builder.delete_documents(&ids);
|
|
||||||
builder.execute().unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
|
||||||
|
|
||||||
index.prepare_for_closing().wait();
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
|
@ -526,12 +526,12 @@ pub(crate) mod test {
|
|||||||
assert!(indexes.is_empty());
|
assert!(indexes.is_empty());
|
||||||
|
|
||||||
// products
|
// products
|
||||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "products",
|
"uid": "products",
|
||||||
"primaryKey": "sku",
|
"primaryKey": "sku",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2022-10-09T20:27:22.688964637Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2022-10-09T20:27:23.951017769Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -541,12 +541,12 @@ pub(crate) mod test {
|
|||||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||||
|
|
||||||
// movies
|
// movies
|
||||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "movies",
|
"uid": "movies",
|
||||||
"primaryKey": "id",
|
"primaryKey": "id",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2022-10-09T20:27:22.197788495Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2022-10-09T20:28:01.93111053Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -571,12 +571,12 @@ pub(crate) mod test {
|
|||||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
|
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
|
||||||
|
|
||||||
// spells
|
// spells
|
||||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "dnd_spells",
|
"uid": "dnd_spells",
|
||||||
"primaryKey": "index",
|
"primaryKey": "index",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2022-10-09T20:27:24.242683494Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2022-10-09T20:27:24.312809641Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -617,12 +617,12 @@ pub(crate) mod test {
|
|||||||
assert!(indexes.is_empty());
|
assert!(indexes.is_empty());
|
||||||
|
|
||||||
// products
|
// products
|
||||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "products",
|
"uid": "products",
|
||||||
"primaryKey": "sku",
|
"primaryKey": "sku",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2023-01-30T16:25:56.595257Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2023-01-30T16:25:58.70348Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -632,12 +632,12 @@ pub(crate) mod test {
|
|||||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||||
|
|
||||||
// movies
|
// movies
|
||||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "movies",
|
"uid": "movies",
|
||||||
"primaryKey": "id",
|
"primaryKey": "id",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2023-01-30T16:25:56.192178Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2023-01-30T16:25:56.455714Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -647,12 +647,12 @@ pub(crate) mod test {
|
|||||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
|
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
|
||||||
|
|
||||||
// spells
|
// spells
|
||||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "dnd_spells",
|
"uid": "dnd_spells",
|
||||||
"primaryKey": "index",
|
"primaryKey": "index",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2023-01-30T16:25:58.876405Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2023-01-30T16:25:59.079906Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
|
@ -1,24 +0,0 @@
|
|||||||
---
|
|
||||||
source: dump/src/reader/mod.rs
|
|
||||||
expression: spells.settings().unwrap()
|
|
||||||
---
|
|
||||||
{
|
|
||||||
"displayedAttributes": [
|
|
||||||
"*"
|
|
||||||
],
|
|
||||||
"searchableAttributes": [
|
|
||||||
"*"
|
|
||||||
],
|
|
||||||
"filterableAttributes": [],
|
|
||||||
"sortableAttributes": [],
|
|
||||||
"rankingRules": [
|
|
||||||
"typo",
|
|
||||||
"words",
|
|
||||||
"proximity",
|
|
||||||
"attribute",
|
|
||||||
"exactness"
|
|
||||||
],
|
|
||||||
"stopWords": [],
|
|
||||||
"synonyms": {},
|
|
||||||
"distinctAttribute": null
|
|
||||||
}
|
|
@ -1,38 +0,0 @@
|
|||||||
---
|
|
||||||
source: dump/src/reader/mod.rs
|
|
||||||
expression: products.settings().unwrap()
|
|
||||||
---
|
|
||||||
{
|
|
||||||
"displayedAttributes": [
|
|
||||||
"*"
|
|
||||||
],
|
|
||||||
"searchableAttributes": [
|
|
||||||
"*"
|
|
||||||
],
|
|
||||||
"filterableAttributes": [],
|
|
||||||
"sortableAttributes": [],
|
|
||||||
"rankingRules": [
|
|
||||||
"typo",
|
|
||||||
"words",
|
|
||||||
"proximity",
|
|
||||||
"attribute",
|
|
||||||
"exactness"
|
|
||||||
],
|
|
||||||
"stopWords": [],
|
|
||||||
"synonyms": {
|
|
||||||
"android": [
|
|
||||||
"phone",
|
|
||||||
"smartphone"
|
|
||||||
],
|
|
||||||
"iphone": [
|
|
||||||
"phone",
|
|
||||||
"smartphone"
|
|
||||||
],
|
|
||||||
"phone": [
|
|
||||||
"android",
|
|
||||||
"iphone",
|
|
||||||
"smartphone"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"distinctAttribute": null
|
|
||||||
}
|
|
@ -1,31 +0,0 @@
|
|||||||
---
|
|
||||||
source: dump/src/reader/mod.rs
|
|
||||||
expression: movies.settings().unwrap()
|
|
||||||
---
|
|
||||||
{
|
|
||||||
"displayedAttributes": [
|
|
||||||
"*"
|
|
||||||
],
|
|
||||||
"searchableAttributes": [
|
|
||||||
"*"
|
|
||||||
],
|
|
||||||
"filterableAttributes": [
|
|
||||||
"genres",
|
|
||||||
"id"
|
|
||||||
],
|
|
||||||
"sortableAttributes": [
|
|
||||||
"genres",
|
|
||||||
"id"
|
|
||||||
],
|
|
||||||
"rankingRules": [
|
|
||||||
"typo",
|
|
||||||
"words",
|
|
||||||
"proximity",
|
|
||||||
"attribute",
|
|
||||||
"exactness",
|
|
||||||
"release_date:asc"
|
|
||||||
],
|
|
||||||
"stopWords": [],
|
|
||||||
"synonyms": {},
|
|
||||||
"distinctAttribute": null
|
|
||||||
}
|
|
@ -46,6 +46,7 @@ pub type Checked = settings::Checked;
|
|||||||
pub type Unchecked = settings::Unchecked;
|
pub type Unchecked = settings::Unchecked;
|
||||||
|
|
||||||
pub type Task = updates::UpdateEntry;
|
pub type Task = updates::UpdateEntry;
|
||||||
|
pub type Kind = updates::UpdateMeta;
|
||||||
|
|
||||||
// everything related to the errors
|
// everything related to the errors
|
||||||
pub type ResponseError = errors::ResponseError;
|
pub type ResponseError = errors::ResponseError;
|
||||||
@ -107,8 +108,11 @@ impl V2Reader {
|
|||||||
pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> {
|
pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> {
|
||||||
Ok(self.index_uuid.iter().map(|index| -> Result<_> {
|
Ok(self.index_uuid.iter().map(|index| -> Result<_> {
|
||||||
V2IndexReader::new(
|
V2IndexReader::new(
|
||||||
index.uid.clone(),
|
|
||||||
&self.dump.path().join("indexes").join(format!("index-{}", index.uuid)),
|
&self.dump.path().join("indexes").join(format!("index-{}", index.uuid)),
|
||||||
|
index,
|
||||||
|
BufReader::new(
|
||||||
|
File::open(self.dump.path().join("updates").join("data.jsonl")).unwrap(),
|
||||||
|
),
|
||||||
)
|
)
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
@ -143,16 +147,41 @@ pub struct V2IndexReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl V2IndexReader {
|
impl V2IndexReader {
|
||||||
pub fn new(name: String, path: &Path) -> Result<Self> {
|
pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader<File>) -> Result<Self> {
|
||||||
let meta = File::open(path.join("meta.json"))?;
|
let meta = File::open(path.join("meta.json"))?;
|
||||||
let meta: DumpMeta = serde_json::from_reader(meta)?;
|
let meta: DumpMeta = serde_json::from_reader(meta)?;
|
||||||
|
|
||||||
|
let mut created_at = None;
|
||||||
|
let mut updated_at = None;
|
||||||
|
|
||||||
|
for line in tasks.lines() {
|
||||||
|
let task: Task = serde_json::from_str(&line?)?;
|
||||||
|
if !(task.uuid == index_uuid.uuid && task.is_finished()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let new_created_at = match task.update.meta() {
|
||||||
|
Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(),
|
||||||
|
_ => None,
|
||||||
|
};
|
||||||
|
let new_updated_at = task.update.finished_at();
|
||||||
|
|
||||||
|
if created_at.is_none() || created_at > new_created_at {
|
||||||
|
created_at = new_created_at;
|
||||||
|
}
|
||||||
|
|
||||||
|
if updated_at.is_none() || updated_at < new_updated_at {
|
||||||
|
updated_at = new_updated_at;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let current_time = OffsetDateTime::now_utc();
|
||||||
|
|
||||||
let metadata = IndexMetadata {
|
let metadata = IndexMetadata {
|
||||||
uid: name,
|
uid: index_uuid.uid.clone(),
|
||||||
primary_key: meta.primary_key,
|
primary_key: meta.primary_key,
|
||||||
// FIXME: Iterate over the whole task queue to find the creation and last update date.
|
created_at: created_at.unwrap_or(current_time),
|
||||||
created_at: OffsetDateTime::now_utc(),
|
updated_at: updated_at.unwrap_or(current_time),
|
||||||
updated_at: OffsetDateTime::now_utc(),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let ret = V2IndexReader {
|
let ret = V2IndexReader {
|
||||||
@ -248,12 +277,12 @@ pub(crate) mod test {
|
|||||||
assert!(indexes.is_empty());
|
assert!(indexes.is_empty());
|
||||||
|
|
||||||
// products
|
// products
|
||||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "products",
|
"uid": "products",
|
||||||
"primaryKey": "sku",
|
"primaryKey": "sku",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2022-10-09T20:27:22.688964637Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2022-10-09T20:27:23.951017769Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -263,12 +292,12 @@ pub(crate) mod test {
|
|||||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||||
|
|
||||||
// movies
|
// movies
|
||||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "movies",
|
"uid": "movies",
|
||||||
"primaryKey": "id",
|
"primaryKey": "id",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2022-10-09T20:27:22.197788495Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2022-10-09T20:28:01.93111053Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -293,12 +322,12 @@ pub(crate) mod test {
|
|||||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
|
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
|
||||||
|
|
||||||
// spells
|
// spells
|
||||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "dnd_spells",
|
"uid": "dnd_spells",
|
||||||
"primaryKey": "index",
|
"primaryKey": "index",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2022-10-09T20:27:24.242683494Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2022-10-09T20:27:24.312809641Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -340,12 +369,12 @@ pub(crate) mod test {
|
|||||||
assert!(indexes.is_empty());
|
assert!(indexes.is_empty());
|
||||||
|
|
||||||
// products
|
// products
|
||||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "products",
|
"uid": "products",
|
||||||
"primaryKey": "sku",
|
"primaryKey": "sku",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2023-01-30T16:25:56.595257Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2023-01-30T16:25:58.70348Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -355,12 +384,12 @@ pub(crate) mod test {
|
|||||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||||
|
|
||||||
// movies
|
// movies
|
||||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "movies",
|
"uid": "movies",
|
||||||
"primaryKey": "id",
|
"primaryKey": "id",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2023-01-30T16:25:56.192178Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2023-01-30T16:25:56.455714Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -370,12 +399,12 @@ pub(crate) mod test {
|
|||||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
|
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
|
||||||
|
|
||||||
// spells
|
// spells
|
||||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||||
{
|
{
|
||||||
"uid": "dnd_spells",
|
"uid": "dnd_spells",
|
||||||
"primaryKey": "index",
|
"primaryKey": "index",
|
||||||
"createdAt": "[now]",
|
"createdAt": "2023-01-30T16:25:58.876405Z",
|
||||||
"updatedAt": "[now]"
|
"updatedAt": "2023-01-30T16:25:59.079906Z"
|
||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
|
@ -227,4 +227,14 @@ impl UpdateStatus {
|
|||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn finished_at(&self) -> Option<OffsetDateTime> {
|
||||||
|
match self {
|
||||||
|
UpdateStatus::Processing(_) => None,
|
||||||
|
UpdateStatus::Enqueued(_) => None,
|
||||||
|
UpdateStatus::Processed(u) => Some(u.processed_at),
|
||||||
|
UpdateStatus::Aborted(_) => None,
|
||||||
|
UpdateStatus::Failed(u) => Some(u.failed_at),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -24,14 +24,13 @@ use std::fs::{self, File};
|
|||||||
use std::io::BufWriter;
|
use std::io::BufWriter;
|
||||||
|
|
||||||
use dump::IndexMetadata;
|
use dump::IndexMetadata;
|
||||||
use log::{debug, error, info};
|
use log::{debug, error, info, trace};
|
||||||
use meilisearch_types::error::Code;
|
use meilisearch_types::error::Code;
|
||||||
use meilisearch_types::heed::{RoTxn, RwTxn};
|
use meilisearch_types::heed::{RoTxn, RwTxn};
|
||||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
||||||
use meilisearch_types::milli::heed::CompactionOption;
|
use meilisearch_types::milli::heed::CompactionOption;
|
||||||
use meilisearch_types::milli::update::{
|
use meilisearch_types::milli::update::{
|
||||||
DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
|
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
|
||||||
Settings as MilliSettings,
|
|
||||||
};
|
};
|
||||||
use meilisearch_types::milli::{self, Filter, BEU32};
|
use meilisearch_types::milli::{self, Filter, BEU32};
|
||||||
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
|
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
|
||||||
@ -44,7 +43,7 @@ use uuid::Uuid;
|
|||||||
|
|
||||||
use crate::autobatcher::{self, BatchKind};
|
use crate::autobatcher::{self, BatchKind};
|
||||||
use crate::utils::{self, swap_index_uid_in_task};
|
use crate::utils::{self, swap_index_uid_in_task};
|
||||||
use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId};
|
use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId};
|
||||||
|
|
||||||
/// Represents a combination of tasks that can all be processed at the same time.
|
/// Represents a combination of tasks that can all be processed at the same time.
|
||||||
///
|
///
|
||||||
@ -105,12 +104,6 @@ pub(crate) enum IndexOperation {
|
|||||||
operations: Vec<DocumentOperation>,
|
operations: Vec<DocumentOperation>,
|
||||||
tasks: Vec<Task>,
|
tasks: Vec<Task>,
|
||||||
},
|
},
|
||||||
DocumentDeletion {
|
|
||||||
index_uid: String,
|
|
||||||
// The vec associated with each document deletion tasks.
|
|
||||||
documents: Vec<Vec<String>>,
|
|
||||||
tasks: Vec<Task>,
|
|
||||||
},
|
|
||||||
IndexDocumentDeletionByFilter {
|
IndexDocumentDeletionByFilter {
|
||||||
index_uid: String,
|
index_uid: String,
|
||||||
task: Task,
|
task: Task,
|
||||||
@ -162,7 +155,6 @@ impl Batch {
|
|||||||
}
|
}
|
||||||
Batch::IndexOperation { op, .. } => match op {
|
Batch::IndexOperation { op, .. } => match op {
|
||||||
IndexOperation::DocumentOperation { tasks, .. }
|
IndexOperation::DocumentOperation { tasks, .. }
|
||||||
| IndexOperation::DocumentDeletion { tasks, .. }
|
|
||||||
| IndexOperation::Settings { tasks, .. }
|
| IndexOperation::Settings { tasks, .. }
|
||||||
| IndexOperation::DocumentClear { tasks, .. } => {
|
| IndexOperation::DocumentClear { tasks, .. } => {
|
||||||
tasks.iter().map(|task| task.uid).collect()
|
tasks.iter().map(|task| task.uid).collect()
|
||||||
@ -227,7 +219,6 @@ impl IndexOperation {
|
|||||||
pub fn index_uid(&self) -> &str {
|
pub fn index_uid(&self) -> &str {
|
||||||
match self {
|
match self {
|
||||||
IndexOperation::DocumentOperation { index_uid, .. }
|
IndexOperation::DocumentOperation { index_uid, .. }
|
||||||
| IndexOperation::DocumentDeletion { index_uid, .. }
|
|
||||||
| IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
|
| IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
|
||||||
| IndexOperation::DocumentClear { index_uid, .. }
|
| IndexOperation::DocumentClear { index_uid, .. }
|
||||||
| IndexOperation::Settings { index_uid, .. }
|
| IndexOperation::Settings { index_uid, .. }
|
||||||
@ -243,9 +234,6 @@ impl fmt::Display for IndexOperation {
|
|||||||
IndexOperation::DocumentOperation { .. } => {
|
IndexOperation::DocumentOperation { .. } => {
|
||||||
f.write_str("IndexOperation::DocumentOperation")
|
f.write_str("IndexOperation::DocumentOperation")
|
||||||
}
|
}
|
||||||
IndexOperation::DocumentDeletion { .. } => {
|
|
||||||
f.write_str("IndexOperation::DocumentDeletion")
|
|
||||||
}
|
|
||||||
IndexOperation::IndexDocumentDeletionByFilter { .. } => {
|
IndexOperation::IndexDocumentDeletionByFilter { .. } => {
|
||||||
f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
|
f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
|
||||||
}
|
}
|
||||||
@ -348,18 +336,27 @@ impl IndexScheduler {
|
|||||||
BatchKind::DocumentDeletion { deletion_ids } => {
|
BatchKind::DocumentDeletion { deletion_ids } => {
|
||||||
let tasks = self.get_existing_tasks(rtxn, deletion_ids)?;
|
let tasks = self.get_existing_tasks(rtxn, deletion_ids)?;
|
||||||
|
|
||||||
let mut documents = Vec::new();
|
let mut operations = Vec::with_capacity(tasks.len());
|
||||||
|
let mut documents_counts = Vec::with_capacity(tasks.len());
|
||||||
for task in &tasks {
|
for task in &tasks {
|
||||||
match task.kind {
|
match task.kind {
|
||||||
KindWithContent::DocumentDeletion { ref documents_ids, .. } => {
|
KindWithContent::DocumentDeletion { ref documents_ids, .. } => {
|
||||||
documents.push(documents_ids.clone())
|
operations.push(DocumentOperation::Delete(documents_ids.clone()));
|
||||||
|
documents_counts.push(documents_ids.len() as u64);
|
||||||
}
|
}
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Some(Batch::IndexOperation {
|
Ok(Some(Batch::IndexOperation {
|
||||||
op: IndexOperation::DocumentDeletion { index_uid, documents, tasks },
|
op: IndexOperation::DocumentOperation {
|
||||||
|
index_uid,
|
||||||
|
primary_key: None,
|
||||||
|
method: IndexDocumentsMethod::ReplaceDocuments,
|
||||||
|
documents_counts,
|
||||||
|
operations,
|
||||||
|
tasks,
|
||||||
|
},
|
||||||
must_create_index,
|
must_create_index,
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
@ -825,6 +822,10 @@ impl IndexScheduler {
|
|||||||
// 2. dump the tasks
|
// 2. dump the tasks
|
||||||
let mut dump_tasks = dump.create_tasks_queue()?;
|
let mut dump_tasks = dump.create_tasks_queue()?;
|
||||||
for ret in self.all_tasks.iter(&rtxn)? {
|
for ret in self.all_tasks.iter(&rtxn)? {
|
||||||
|
if self.must_stop_processing.get() {
|
||||||
|
return Err(Error::AbortedTask);
|
||||||
|
}
|
||||||
|
|
||||||
let (_, mut t) = ret?;
|
let (_, mut t) = ret?;
|
||||||
let status = t.status;
|
let status = t.status;
|
||||||
let content_file = t.content_uuid();
|
let content_file = t.content_uuid();
|
||||||
@ -845,6 +846,9 @@ impl IndexScheduler {
|
|||||||
|
|
||||||
// 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
|
// 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
|
||||||
if let Some(content_file) = content_file {
|
if let Some(content_file) = content_file {
|
||||||
|
if self.must_stop_processing.get() {
|
||||||
|
return Err(Error::AbortedTask);
|
||||||
|
}
|
||||||
if status == Status::Enqueued {
|
if status == Status::Enqueued {
|
||||||
let content_file = self.file_store.get_update(content_file)?;
|
let content_file = self.file_store.get_update(content_file)?;
|
||||||
|
|
||||||
@ -884,6 +888,9 @@ impl IndexScheduler {
|
|||||||
|
|
||||||
// 3.1. Dump the documents
|
// 3.1. Dump the documents
|
||||||
for ret in index.all_documents(&rtxn)? {
|
for ret in index.all_documents(&rtxn)? {
|
||||||
|
if self.must_stop_processing.get() {
|
||||||
|
return Err(Error::AbortedTask);
|
||||||
|
}
|
||||||
let (_id, doc) = ret?;
|
let (_id, doc) = ret?;
|
||||||
let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
||||||
index_dumper.push_document(&document)?;
|
index_dumper.push_document(&document)?;
|
||||||
@ -903,6 +910,9 @@ impl IndexScheduler {
|
|||||||
"[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
|
"[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
|
||||||
)).unwrap();
|
)).unwrap();
|
||||||
|
|
||||||
|
if self.must_stop_processing.get() {
|
||||||
|
return Err(Error::AbortedTask);
|
||||||
|
}
|
||||||
let path = self.dumps_path.join(format!("{}.dump", dump_uid));
|
let path = self.dumps_path.join(format!("{}.dump", dump_uid));
|
||||||
let file = File::create(path)?;
|
let file = File::create(path)?;
|
||||||
dump.persist_to(BufWriter::new(file))?;
|
dump.persist_to(BufWriter::new(file))?;
|
||||||
@ -1195,7 +1205,7 @@ impl IndexScheduler {
|
|||||||
index,
|
index,
|
||||||
indexer_config,
|
indexer_config,
|
||||||
config,
|
config,
|
||||||
|indexing_step| debug!("update: {:?}", indexing_step),
|
|indexing_step| trace!("update: {:?}", indexing_step),
|
||||||
|| must_stop_processing.get(),
|
|| must_stop_processing.get(),
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
@ -1242,7 +1252,8 @@ impl IndexScheduler {
|
|||||||
let (new_builder, user_result) =
|
let (new_builder, user_result) =
|
||||||
builder.remove_documents(document_ids)?;
|
builder.remove_documents(document_ids)?;
|
||||||
builder = new_builder;
|
builder = new_builder;
|
||||||
|
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
||||||
|
let count = user_result.unwrap();
|
||||||
let provided_ids =
|
let provided_ids =
|
||||||
if let Some(Details::DocumentDeletion { provided_ids, .. }) =
|
if let Some(Details::DocumentDeletion { provided_ids, .. }) =
|
||||||
task.details
|
task.details
|
||||||
@ -1253,24 +1264,12 @@ impl IndexScheduler {
|
|||||||
unreachable!();
|
unreachable!();
|
||||||
};
|
};
|
||||||
|
|
||||||
match user_result {
|
|
||||||
Ok(count) => {
|
|
||||||
task.status = Status::Succeeded;
|
task.status = Status::Succeeded;
|
||||||
task.details = Some(Details::DocumentDeletion {
|
task.details = Some(Details::DocumentDeletion {
|
||||||
provided_ids,
|
provided_ids,
|
||||||
deleted_documents: Some(count),
|
deleted_documents: Some(count),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
Err(e) => {
|
|
||||||
task.status = Status::Failed;
|
|
||||||
task.details = Some(Details::DocumentDeletion {
|
|
||||||
provided_ids,
|
|
||||||
deleted_documents: Some(0),
|
|
||||||
});
|
|
||||||
task.error = Some(milli::Error::from(e).into());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1284,31 +1283,13 @@ impl IndexScheduler {
|
|||||||
milli::update::Settings::new(index_wtxn, index, indexer_config);
|
milli::update::Settings::new(index_wtxn, index, indexer_config);
|
||||||
builder.reset_primary_key();
|
builder.reset_primary_key();
|
||||||
builder.execute(
|
builder.execute(
|
||||||
|indexing_step| debug!("update: {:?}", indexing_step),
|
|indexing_step| trace!("update: {:?}", indexing_step),
|
||||||
|| must_stop_processing.clone().get(),
|
|| must_stop_processing.clone().get(),
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(tasks)
|
Ok(tasks)
|
||||||
}
|
}
|
||||||
IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
|
|
||||||
let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?;
|
|
||||||
documents.iter().flatten().for_each(|id| {
|
|
||||||
builder.delete_external_id(id);
|
|
||||||
});
|
|
||||||
|
|
||||||
let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?;
|
|
||||||
|
|
||||||
for (task, documents) in tasks.iter_mut().zip(documents) {
|
|
||||||
task.status = Status::Succeeded;
|
|
||||||
task.details = Some(Details::DocumentDeletion {
|
|
||||||
provided_ids: documents.len(),
|
|
||||||
deleted_documents: Some(deleted_documents.min(documents.len() as u64)),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(tasks)
|
|
||||||
}
|
|
||||||
IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
|
IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
|
||||||
let filter =
|
let filter =
|
||||||
if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
|
if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
|
||||||
@ -1318,7 +1299,13 @@ impl IndexScheduler {
|
|||||||
} else {
|
} else {
|
||||||
unreachable!()
|
unreachable!()
|
||||||
};
|
};
|
||||||
let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
|
let deleted_documents = delete_document_by_filter(
|
||||||
|
index_wtxn,
|
||||||
|
filter,
|
||||||
|
self.index_mapper.indexer_config(),
|
||||||
|
self.must_stop_processing.clone(),
|
||||||
|
index,
|
||||||
|
);
|
||||||
let original_filter = if let Some(Details::DocumentDeletionByFilter {
|
let original_filter = if let Some(Details::DocumentDeletionByFilter {
|
||||||
original_filter,
|
original_filter,
|
||||||
deleted_documents: _,
|
deleted_documents: _,
|
||||||
@ -1552,6 +1539,8 @@ impl IndexScheduler {
|
|||||||
fn delete_document_by_filter<'a>(
|
fn delete_document_by_filter<'a>(
|
||||||
wtxn: &mut RwTxn<'a, '_>,
|
wtxn: &mut RwTxn<'a, '_>,
|
||||||
filter: &serde_json::Value,
|
filter: &serde_json::Value,
|
||||||
|
indexer_config: &IndexerConfig,
|
||||||
|
must_stop_processing: MustStopProcessing,
|
||||||
index: &'a Index,
|
index: &'a Index,
|
||||||
) -> Result<u64> {
|
) -> Result<u64> {
|
||||||
let filter = Filter::from_json(filter)?;
|
let filter = Filter::from_json(filter)?;
|
||||||
@ -1562,9 +1551,26 @@ fn delete_document_by_filter<'a>(
|
|||||||
}
|
}
|
||||||
e => e.into(),
|
e => e.into(),
|
||||||
})?;
|
})?;
|
||||||
let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
|
|
||||||
delete_operation.delete_documents(&candidates);
|
let config = IndexDocumentsConfig {
|
||||||
delete_operation.execute().map(|result| result.deleted_documents)?
|
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut builder = milli::update::IndexDocuments::new(
|
||||||
|
wtxn,
|
||||||
|
index,
|
||||||
|
indexer_config,
|
||||||
|
config,
|
||||||
|
|indexing_step| debug!("update: {:?}", indexing_step),
|
||||||
|
|| must_stop_processing.get(),
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?;
|
||||||
|
builder = new_builder;
|
||||||
|
|
||||||
|
let _ = builder.execute()?;
|
||||||
|
count
|
||||||
} else {
|
} else {
|
||||||
0
|
0
|
||||||
})
|
})
|
||||||
|
@ -108,6 +108,8 @@ pub enum Error {
|
|||||||
TaskDeletionWithEmptyQuery,
|
TaskDeletionWithEmptyQuery,
|
||||||
#[error("Query parameters to filter the tasks to cancel are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")]
|
#[error("Query parameters to filter the tasks to cancel are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")]
|
||||||
TaskCancelationWithEmptyQuery,
|
TaskCancelationWithEmptyQuery,
|
||||||
|
#[error("Aborted task")]
|
||||||
|
AbortedTask,
|
||||||
|
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Dump(#[from] dump::Error),
|
Dump(#[from] dump::Error),
|
||||||
@ -175,6 +177,7 @@ impl Error {
|
|||||||
| Error::TaskNotFound(_)
|
| Error::TaskNotFound(_)
|
||||||
| Error::TaskDeletionWithEmptyQuery
|
| Error::TaskDeletionWithEmptyQuery
|
||||||
| Error::TaskCancelationWithEmptyQuery
|
| Error::TaskCancelationWithEmptyQuery
|
||||||
|
| Error::AbortedTask
|
||||||
| Error::Dump(_)
|
| Error::Dump(_)
|
||||||
| Error::Heed(_)
|
| Error::Heed(_)
|
||||||
| Error::Milli(_)
|
| Error::Milli(_)
|
||||||
@ -236,6 +239,9 @@ impl ErrorCode for Error {
|
|||||||
Error::TaskDatabaseUpdate(_) => Code::Internal,
|
Error::TaskDatabaseUpdate(_) => Code::Internal,
|
||||||
Error::CreateBatch(_) => Code::Internal,
|
Error::CreateBatch(_) => Code::Internal,
|
||||||
|
|
||||||
|
// This one should never be seen by the end user
|
||||||
|
Error::AbortedTask => Code::Internal,
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
Error::PlannedFailure => Code::Internal,
|
Error::PlannedFailure => Code::Internal,
|
||||||
}
|
}
|
||||||
|
@ -1183,7 +1183,8 @@ impl IndexScheduler {
|
|||||||
// If we have an abortion error we must stop the tick here and re-schedule tasks.
|
// If we have an abortion error we must stop the tick here and re-schedule tasks.
|
||||||
Err(Error::Milli(milli::Error::InternalError(
|
Err(Error::Milli(milli::Error::InternalError(
|
||||||
milli::InternalError::AbortedIndexation,
|
milli::InternalError::AbortedIndexation,
|
||||||
))) => {
|
)))
|
||||||
|
| Err(Error::AbortedTask) => {
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
self.breakpoint(Breakpoint::AbortedIndexation);
|
self.breakpoint(Breakpoint::AbortedIndexation);
|
||||||
wtxn.abort().map_err(Error::HeedTransaction)?;
|
wtxn.abort().map_err(Error::HeedTransaction)?;
|
||||||
@ -4339,4 +4340,26 @@ mod tests {
|
|||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn cancel_processing_dump() {
|
||||||
|
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
||||||
|
|
||||||
|
let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None };
|
||||||
|
let dump_cancellation = KindWithContent::TaskCancelation {
|
||||||
|
query: "cancel dump".to_owned(),
|
||||||
|
tasks: RoaringBitmap::from_iter([0]),
|
||||||
|
};
|
||||||
|
let _ = index_scheduler.register(dump_creation).unwrap();
|
||||||
|
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register");
|
||||||
|
handle.advance_till([Start, BatchCreated, InsideProcessBatch]);
|
||||||
|
|
||||||
|
let _ = index_scheduler.register(dump_cancellation).unwrap();
|
||||||
|
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered");
|
||||||
|
|
||||||
|
snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation");
|
||||||
|
|
||||||
|
handle.advance_one_successful_batch();
|
||||||
|
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,35 @@
|
|||||||
|
---
|
||||||
|
source: index-scheduler/src/lib.rs
|
||||||
|
---
|
||||||
|
### Autobatching Enabled = true
|
||||||
|
### Processing Tasks:
|
||||||
|
[]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### All Tasks:
|
||||||
|
0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Status:
|
||||||
|
enqueued [0,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Kind:
|
||||||
|
"dumpCreation" [0,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Index Tasks:
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Index Mapper:
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Canceled By:
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Enqueued At:
|
||||||
|
[timestamp] [0,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Started At:
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Finished At:
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### File Store:
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
|
@ -0,0 +1,45 @@
|
|||||||
|
---
|
||||||
|
source: index-scheduler/src/lib.rs
|
||||||
|
---
|
||||||
|
### Autobatching Enabled = true
|
||||||
|
### Processing Tasks:
|
||||||
|
[]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### All Tasks:
|
||||||
|
0 {uid: 0, status: canceled, canceled_by: 1, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
|
||||||
|
1 {uid: 1, status: succeeded, details: { matched_tasks: 1, canceled_tasks: Some(0), original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Status:
|
||||||
|
enqueued []
|
||||||
|
succeeded [1,]
|
||||||
|
canceled [0,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Kind:
|
||||||
|
"taskCancelation" [1,]
|
||||||
|
"dumpCreation" [0,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Index Tasks:
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Index Mapper:
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Canceled By:
|
||||||
|
1 [0,]
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Enqueued At:
|
||||||
|
[timestamp] [0,]
|
||||||
|
[timestamp] [1,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Started At:
|
||||||
|
[timestamp] [0,]
|
||||||
|
[timestamp] [1,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Finished At:
|
||||||
|
[timestamp] [0,]
|
||||||
|
[timestamp] [1,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### File Store:
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
|
@ -0,0 +1,38 @@
|
|||||||
|
---
|
||||||
|
source: index-scheduler/src/lib.rs
|
||||||
|
---
|
||||||
|
### Autobatching Enabled = true
|
||||||
|
### Processing Tasks:
|
||||||
|
[0,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### All Tasks:
|
||||||
|
0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
|
||||||
|
1 {uid: 1, status: enqueued, details: { matched_tasks: 1, canceled_tasks: None, original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Status:
|
||||||
|
enqueued [0,1,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Kind:
|
||||||
|
"taskCancelation" [1,]
|
||||||
|
"dumpCreation" [0,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Index Tasks:
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Index Mapper:
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Canceled By:
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Enqueued At:
|
||||||
|
[timestamp] [0,]
|
||||||
|
[timestamp] [1,]
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Started At:
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### Finished At:
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
### File Store:
|
||||||
|
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
|
@ -324,7 +324,6 @@ impl ErrorCode for milli::Error {
|
|||||||
UserError::SerdeJson(_)
|
UserError::SerdeJson(_)
|
||||||
| UserError::InvalidLmdbOpenOptions
|
| UserError::InvalidLmdbOpenOptions
|
||||||
| UserError::DocumentLimitReached
|
| UserError::DocumentLimitReached
|
||||||
| UserError::AccessingSoftDeletedDocument { .. }
|
|
||||||
| UserError::UnknownInternalDocumentId { .. } => Code::Internal,
|
| UserError::UnknownInternalDocumentId { .. } => Code::Internal,
|
||||||
UserError::InvalidStoreFile => Code::InvalidStoreFile,
|
UserError::InvalidStoreFile => Code::InvalidStoreFile,
|
||||||
UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,
|
UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,
|
||||||
|
@ -362,7 +362,7 @@ fn import_dump(
|
|||||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
},
|
},
|
||||||
|indexing_step| log::debug!("update: {:?}", indexing_step),
|
|indexing_step| log::trace!("update: {:?}", indexing_step),
|
||||||
|| false,
|
|| false,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
@ -612,8 +612,8 @@ fn retrieve_document<S: AsRef<str>>(
|
|||||||
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
||||||
|
|
||||||
let internal_id = index
|
let internal_id = index
|
||||||
.external_documents_ids(&txn)?
|
.external_documents_ids()
|
||||||
.get(doc_id.as_bytes())
|
.get(&txn, doc_id)?
|
||||||
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
|
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
|
||||||
|
|
||||||
let document = index
|
let document = index
|
||||||
|
@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() {
|
|||||||
"canceledBy": null,
|
"canceledBy": null,
|
||||||
"details": {
|
"details": {
|
||||||
"providedIds": 0,
|
"providedIds": 0,
|
||||||
"deletedDocuments": 4,
|
"deletedDocuments": 2,
|
||||||
"originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]"
|
"originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]"
|
||||||
},
|
},
|
||||||
"error": null,
|
"error": null,
|
||||||
|
@ -26,8 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" }
|
|||||||
fst = "0.4.7"
|
fst = "0.4.7"
|
||||||
fxhash = "0.2.1"
|
fxhash = "0.2.1"
|
||||||
geoutils = "0.5.1"
|
geoutils = "0.5.1"
|
||||||
grenad = { version = "0.4.4", default-features = false, features = [
|
grenad = { version = "0.4.5", default-features = false, features = [
|
||||||
"tempfile",
|
"rayon", "tempfile"
|
||||||
] }
|
] }
|
||||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
|
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
|
||||||
"lmdb", "read-txn-no-tls"
|
"lmdb", "read-txn-no-tls"
|
||||||
@ -79,6 +79,7 @@ big_s = "1.0.2"
|
|||||||
insta = "1.29.0"
|
insta = "1.29.0"
|
||||||
maplit = "1.0.2"
|
maplit = "1.0.2"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
|
meili-snap = { path = "../meili-snap" }
|
||||||
rand = { version = "0.8.5", features = ["small_rng"] }
|
rand = { version = "0.8.5", features = ["small_rng"] }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
mod builder;
|
mod builder;
|
||||||
mod enriched;
|
mod enriched;
|
||||||
|
mod primary_key;
|
||||||
mod reader;
|
mod reader;
|
||||||
mod serde_impl;
|
mod serde_impl;
|
||||||
|
|
||||||
@ -11,6 +12,7 @@ use bimap::BiHashMap;
|
|||||||
pub use builder::DocumentsBatchBuilder;
|
pub use builder::DocumentsBatchBuilder;
|
||||||
pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
|
pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
|
||||||
use obkv::KvReader;
|
use obkv::KvReader;
|
||||||
|
pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY};
|
||||||
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
|
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
@ -87,6 +89,12 @@ impl DocumentsBatchIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl FieldIdMapper for DocumentsBatchIndex {
|
||||||
|
fn id(&self, name: &str) -> Option<FieldId> {
|
||||||
|
self.id(name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
pub enum Error {
|
pub enum Error {
|
||||||
#[error("Error parsing number {value:?} at line {line}: {error}")]
|
#[error("Error parsing number {value:?} at line {line}: {error}")]
|
||||||
|
172
milli/src/documents/primary_key.rs
Normal file
172
milli/src/documents/primary_key.rs
Normal file
@ -0,0 +1,172 @@
|
|||||||
|
use std::iter;
|
||||||
|
use std::result::Result as StdResult;
|
||||||
|
|
||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
|
use crate::{FieldId, InternalError, Object, Result, UserError};
|
||||||
|
|
||||||
|
/// The symbol used to define levels in a nested primary key.
|
||||||
|
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
|
||||||
|
|
||||||
|
/// The default primary key name, used when none is specified.
|
||||||
|
pub const DEFAULT_PRIMARY_KEY: &str = "id";
|
||||||
|
|
||||||
|
/// Trait for objects that can map the name of a field to its [`FieldId`].
|
||||||
|
pub trait FieldIdMapper {
|
||||||
|
/// Attempts to map the passed name to its [`FieldId`].
|
||||||
|
///
|
||||||
|
/// Returns `None` if no field with this name is known to the mapper.
|
||||||
|
fn id(&self, name: &str) -> Option<FieldId>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A type that represents the kind of primary key that has been set
|
||||||
|
/// for this index: a classic flat one or a nested one.
|
||||||
|
#[derive(Debug, Clone, Copy)]
|
||||||
|
pub enum PrimaryKey<'a> {
|
||||||
|
/// A primary key whose name contains no `.`; `field_id` is the id the field mapper returned for `name`.
Flat { name: &'a str, field_id: FieldId },
|
||||||
|
/// A primary key whose name contains at least one `.`; no field id is resolved up front.
Nested { name: &'a str },
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum DocumentIdExtractionError {
|
||||||
|
InvalidDocumentId(UserError),
|
||||||
|
MissingDocumentId,
|
||||||
|
TooManyDocumentIds(usize),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> PrimaryKey<'a> {
|
||||||
|
pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> {
|
||||||
|
Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
|
||||||
|
Self::Nested { name: path }
|
||||||
|
} else {
|
||||||
|
let field_id = fields.id(path)?;
|
||||||
|
Self::Flat { name: path, field_id }
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn name(&self) -> &str {
|
||||||
|
match self {
|
||||||
|
PrimaryKey::Flat { name, .. } => name,
|
||||||
|
PrimaryKey::Nested { name } => name,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn document_id(
|
||||||
|
&self,
|
||||||
|
document: &obkv::KvReader<FieldId>,
|
||||||
|
fields: &impl FieldIdMapper,
|
||||||
|
) -> Result<StdResult<String, DocumentIdExtractionError>> {
|
||||||
|
match self {
|
||||||
|
PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
|
||||||
|
Some(document_id_bytes) => {
|
||||||
|
let document_id = serde_json::from_slice(document_id_bytes)
|
||||||
|
.map_err(InternalError::SerdeJson)?;
|
||||||
|
match validate_document_id_value(document_id)? {
|
||||||
|
Ok(document_id) => Ok(Ok(document_id)),
|
||||||
|
Err(user_error) => {
|
||||||
|
Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
|
||||||
|
},
|
||||||
|
nested @ PrimaryKey::Nested { .. } => {
|
||||||
|
let mut matching_documents_ids = Vec::new();
|
||||||
|
for (first_level_name, right) in nested.possible_level_names() {
|
||||||
|
if let Some(field_id) = fields.id(first_level_name) {
|
||||||
|
if let Some(value_bytes) = document.get(field_id) {
|
||||||
|
let object = serde_json::from_slice(value_bytes)
|
||||||
|
.map_err(InternalError::SerdeJson)?;
|
||||||
|
fetch_matching_values(object, right, &mut matching_documents_ids);
|
||||||
|
|
||||||
|
if matching_documents_ids.len() >= 2 {
|
||||||
|
return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(
|
||||||
|
matching_documents_ids.len(),
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match matching_documents_ids.pop() {
|
||||||
|
Some(document_id) => match validate_document_id_value(document_id)? {
|
||||||
|
Ok(document_id) => Ok(Ok(document_id)),
|
||||||
|
Err(user_error) => {
|
||||||
|
Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns an `Iterator` that gives all the possible fields names the primary key
|
||||||
|
/// can have depending of the first level name and depth of the objects.
|
||||||
|
pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
|
||||||
|
let name = self.name();
|
||||||
|
name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
|
||||||
|
.map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
|
||||||
|
.chain(iter::once((name, "")))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
|
||||||
|
match value {
|
||||||
|
Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
|
||||||
|
otherwise => output.push(otherwise),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn fetch_matching_values_in_object(
|
||||||
|
object: Object,
|
||||||
|
selector: &str,
|
||||||
|
base_key: &str,
|
||||||
|
output: &mut Vec<Value>,
|
||||||
|
) {
|
||||||
|
for (key, value) in object {
|
||||||
|
let base_key = if base_key.is_empty() {
|
||||||
|
key.to_string()
|
||||||
|
} else {
|
||||||
|
format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
|
||||||
|
};
|
||||||
|
|
||||||
|
if starts_with(selector, &base_key) {
|
||||||
|
match value {
|
||||||
|
Value::Object(object) => {
|
||||||
|
fetch_matching_values_in_object(object, selector, &base_key, output)
|
||||||
|
}
|
||||||
|
value => output.push(value),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn starts_with(selector: &str, key: &str) -> bool {
|
||||||
|
selector.strip_prefix(key).map_or(false, |tail| {
|
||||||
|
tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME: move to a DocumentId struct
|
||||||
|
|
||||||
|
/// Checks that `document_id` is a well-formed document identifier.
///
/// An identifier is accepted when it is not empty and only made of ASCII
/// letters, ASCII digits, hyphens (`-`) and underscores (`_`). On success the
/// input is returned untouched, otherwise `None` is returned.
fn validate_document_id(document_id: &str) -> Option<&str> {
    if document_id.is_empty() {
        return None;
    }

    let is_allowed = |c: char| c.is_ascii_alphanumeric() || c == '-' || c == '_';
    if document_id.chars().all(is_allowed) {
        Some(document_id)
    } else {
        None
    }
}
|
||||||
|
|
||||||
|
pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
|
||||||
|
match document_id {
|
||||||
|
Value::String(string) => match validate_document_id(&string) {
|
||||||
|
Some(s) if s.len() == string.len() => Ok(Ok(string)),
|
||||||
|
Some(s) => Ok(Ok(s.to_string())),
|
||||||
|
None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
|
||||||
|
},
|
||||||
|
Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
|
||||||
|
content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
|
||||||
|
}
|
||||||
|
}
|
@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry {
|
|||||||
|
|
||||||
#[derive(Error, Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub enum UserError {
|
pub enum UserError {
|
||||||
#[error("A soft deleted internal document id have been used: `{document_id}`.")]
|
|
||||||
AccessingSoftDeletedDocument { document_id: DocumentId },
|
|
||||||
#[error("A document cannot contain more than 65,535 fields.")]
|
#[error("A document cannot contain more than 65,535 fields.")]
|
||||||
AttributeLimitReached,
|
AttributeLimitReached,
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
|
@ -1,159 +1,75 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::convert::TryInto;
|
|
||||||
use std::{fmt, str};
|
|
||||||
|
|
||||||
use fst::map::IndexedValue;
|
use heed::types::{OwnedType, Str};
|
||||||
use fst::{IntoStreamer, Streamer};
|
use heed::{Database, RoIter, RoTxn, RwTxn};
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
|
|
||||||
const DELETED_ID: u64 = u64::MAX;
|
use crate::{DocumentId, BEU32};
|
||||||
|
|
||||||
pub struct ExternalDocumentsIds<'a> {
|
pub enum DocumentOperationKind {
|
||||||
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
|
Create,
|
||||||
pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
|
Delete,
|
||||||
soft_deleted_docids: RoaringBitmap,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ExternalDocumentsIds<'a> {
|
pub struct DocumentOperation {
|
||||||
pub fn new(
|
pub external_id: String,
|
||||||
hard: fst::Map<Cow<'a, [u8]>>,
|
pub internal_id: DocumentId,
|
||||||
soft: fst::Map<Cow<'a, [u8]>>,
|
pub kind: DocumentOperationKind,
|
||||||
soft_deleted_docids: RoaringBitmap,
|
|
||||||
) -> ExternalDocumentsIds<'a> {
|
|
||||||
ExternalDocumentsIds { hard, soft, soft_deleted_docids }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn into_static(self) -> ExternalDocumentsIds<'static> {
|
pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
|
||||||
ExternalDocumentsIds {
|
|
||||||
hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
|
impl ExternalDocumentsIds {
|
||||||
soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
|
pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
|
||||||
soft_deleted_docids: self.soft_deleted_docids,
|
ExternalDocumentsIds(db)
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `true` if hard and soft external documents lists are empty.
|
/// Returns `true` if hard and soft external documents lists are empty.
|
||||||
pub fn is_empty(&self) -> bool {
|
pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> {
|
||||||
self.hard.is_empty() && self.soft.is_empty()
|
self.0.is_empty(rtxn).map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
|
pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> {
|
||||||
let external_id = external_id.as_ref();
|
Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get()))
|
||||||
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
|
|
||||||
Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
|
|
||||||
Some(id.try_into().unwrap())
|
|
||||||
}
|
|
||||||
_otherwise => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
|
|
||||||
/// don't contain any soft deleted document id.
|
|
||||||
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
|
|
||||||
let mut new_hard_builder = fst::MapBuilder::memory();
|
|
||||||
|
|
||||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
|
||||||
let mut iter = union_op.into_stream();
|
|
||||||
while let Some((external_id, docids)) = iter.next() {
|
|
||||||
// prefer selecting the ids from soft, always
|
|
||||||
let id = indexed_last_value(docids).unwrap();
|
|
||||||
if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) {
|
|
||||||
new_hard_builder.insert(external_id, id)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
// Delete soft map completely
|
|
||||||
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
|
||||||
// We save the new map as the new hard map.
|
|
||||||
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
|
|
||||||
let union_op = self.soft.op().add(other).r#union();
|
|
||||||
|
|
||||||
let mut new_soft_builder = fst::MapBuilder::memory();
|
|
||||||
let mut iter = union_op.into_stream();
|
|
||||||
while let Some((external_id, marked_docids)) = iter.next() {
|
|
||||||
let id = indexed_last_value(marked_docids).unwrap();
|
|
||||||
new_soft_builder.insert(external_id, id)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
// We save the new map as the new soft map.
|
|
||||||
self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
|
|
||||||
self.merge_soft_into_hard()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// An helper function to debug this type, returns an `HashMap` of both,
|
/// An helper function to debug this type, returns an `HashMap` of both,
|
||||||
/// soft and hard fst maps, combined.
|
/// soft and hard fst maps, combined.
|
||||||
pub fn to_hash_map(&self) -> HashMap<String, u32> {
|
pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, u32>> {
|
||||||
let mut map = HashMap::new();
|
let mut map = HashMap::default();
|
||||||
|
for result in self.0.iter(rtxn)? {
|
||||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
let (external, internal) = result?;
|
||||||
let mut iter = union_op.into_stream();
|
map.insert(external.to_owned(), internal.get());
|
||||||
while let Some((external_id, marked_docids)) = iter.next() {
|
|
||||||
let id = indexed_last_value(marked_docids).unwrap();
|
|
||||||
if id != DELETED_ID {
|
|
||||||
let external_id = str::from_utf8(external_id).unwrap();
|
|
||||||
map.insert(external_id.to_owned(), id.try_into().unwrap());
|
|
||||||
}
|
}
|
||||||
|
Ok(map)
|
||||||
}
|
}
|
||||||
|
|
||||||
map
|
/// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
|
||||||
|
///
|
||||||
|
/// If the list contains multiple operations on the same external id, then the result is unspecified.
|
||||||
|
///
|
||||||
|
/// # Panics
|
||||||
|
///
|
||||||
|
/// - If attempting to delete a document that doesn't exist
|
||||||
|
/// - If attempting to create a document that already exists
|
||||||
|
pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> {
|
||||||
|
for DocumentOperation { external_id, internal_id, kind } in operations {
|
||||||
|
match kind {
|
||||||
|
DocumentOperationKind::Create => {
|
||||||
|
self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?;
|
||||||
}
|
}
|
||||||
|
DocumentOperationKind::Delete => {
|
||||||
/// Return an fst of the combined hard and soft deleted ID.
|
if !self.0.delete(wtxn, &external_id)? {
|
||||||
pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
|
panic!("Attempting to delete a non-existing document")
|
||||||
if self.soft.is_empty() {
|
|
||||||
return Ok(Cow::Borrowed(&self.hard));
|
|
||||||
}
|
|
||||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
|
||||||
|
|
||||||
let mut iter = union_op.into_stream();
|
|
||||||
let mut new_hard_builder = fst::MapBuilder::memory();
|
|
||||||
while let Some((external_id, marked_docids)) = iter.next() {
|
|
||||||
let value = indexed_last_value(marked_docids).unwrap();
|
|
||||||
if value != DELETED_ID {
|
|
||||||
new_hard_builder.insert(external_id, value)?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
|
|
||||||
if self.soft.len() >= self.hard.len() / 2 {
|
|
||||||
self.hard = self.to_fst()?.into_owned();
|
|
||||||
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Debug for ExternalDocumentsIds<'_> {
|
/// Returns an iterator over all the external ids.
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, OwnedType<BEU32>>> {
|
||||||
f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish()
|
self.0.iter(rtxn)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for ExternalDocumentsIds<'static> {
|
|
||||||
fn default() -> Self {
|
|
||||||
ExternalDocumentsIds {
|
|
||||||
hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
|
|
||||||
soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
|
|
||||||
soft_deleted_docids: RoaringBitmap::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the value of the `IndexedValue` with the highest _index_.
|
|
||||||
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
|
|
||||||
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
|
|
||||||
}
|
|
||||||
|
@ -81,6 +81,12 @@ impl Default for FieldsIdsMap {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl crate::documents::FieldIdMapper for FieldsIdsMap {
|
||||||
|
fn id(&self, name: &str) -> Option<FieldId> {
|
||||||
|
self.id(name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::heed_codec::BytesDecodeOwned;
|
use crate::heed_codec::BytesDecodeOwned;
|
||||||
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||||
|
|
||||||
/// This is the limit where using a byteorder became less size efficient
|
/// This is the limit where using a byteorder became less size efficient
|
||||||
/// than using a direct roaring encoding, it is also the point where we are able
|
/// than using a direct roaring encoding, it is also the point where we are able
|
||||||
@ -60,12 +61,16 @@ impl CboRoaringBitmapCodec {
|
|||||||
/// if the merged values length is under the threshold, values are directly
|
/// if the merged values length is under the threshold, values are directly
|
||||||
/// serialized in the buffer else a RoaringBitmap is created from the
|
/// serialized in the buffer else a RoaringBitmap is created from the
|
||||||
/// values and is serialized in the buffer.
|
/// values and is serialized in the buffer.
|
||||||
pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
|
pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
|
||||||
|
where
|
||||||
|
I: IntoIterator<Item = A>,
|
||||||
|
A: AsRef<[u8]>,
|
||||||
|
{
|
||||||
let mut roaring = RoaringBitmap::new();
|
let mut roaring = RoaringBitmap::new();
|
||||||
let mut vec = Vec::new();
|
let mut vec = Vec::new();
|
||||||
|
|
||||||
for bytes in slices {
|
for bytes in slices {
|
||||||
if bytes.len() <= THRESHOLD * size_of::<u32>() {
|
if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
|
||||||
let mut reader = bytes.as_ref();
|
let mut reader = bytes.as_ref();
|
||||||
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
|
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
|
||||||
vec.push(integer);
|
vec.push(integer);
|
||||||
@ -85,7 +90,7 @@ impl CboRoaringBitmapCodec {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// We can unwrap safely because the vector is sorted upper.
|
// We can unwrap safely because the vector is sorted upper.
|
||||||
let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
|
let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
|
||||||
roaring.serialize_into(buffer)?;
|
roaring.serialize_into(buffer)?;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -95,6 +100,33 @@ impl CboRoaringBitmapCodec {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Merges a DelAdd delta into a CboRoaringBitmap.
|
||||||
|
pub fn merge_deladd_into<'a>(
|
||||||
|
deladd: KvReaderDelAdd<'_>,
|
||||||
|
previous: &[u8],
|
||||||
|
buffer: &'a mut Vec<u8>,
|
||||||
|
) -> io::Result<Option<&'a [u8]>> {
|
||||||
|
// Deserialize the bitmap that is already there
|
||||||
|
let mut previous = Self::deserialize_from(previous)?;
|
||||||
|
|
||||||
|
// Remove integers we no more want in the previous bitmap
|
||||||
|
if let Some(value) = deladd.get(DelAdd::Deletion) {
|
||||||
|
previous -= Self::deserialize_from(value)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert the new integers we want in the previous bitmap
|
||||||
|
if let Some(value) = deladd.get(DelAdd::Addition) {
|
||||||
|
previous |= Self::deserialize_from(value)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
if previous.is_empty() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
Self::serialize_into(&previous, buffer);
|
||||||
|
Ok(Some(&buffer[..]))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
|
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec;
|
|||||||
/// The documents returned by the iterator are grouped by the facet values that
|
/// The documents returned by the iterator are grouped by the facet values that
|
||||||
/// determined their rank. For example, given the documents:
|
/// determined their rank. For example, given the documents:
|
||||||
///
|
///
|
||||||
/// ```ignore
|
/// ```text
|
||||||
/// 0: { "colour": ["blue", "green"] }
|
/// 0: { "colour": ["blue", "green"] }
|
||||||
/// 1: { "colour": ["blue", "red"] }
|
/// 1: { "colour": ["blue", "red"] }
|
||||||
/// 2: { "colour": ["orange", "red"] }
|
/// 2: { "colour": ["orange", "red"] }
|
||||||
@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec;
|
|||||||
/// ```
|
/// ```
|
||||||
/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
|
/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
|
||||||
/// over the following elements:
|
/// over the following elements:
|
||||||
/// ```ignore
|
/// ```text
|
||||||
/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
|
/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
|
||||||
/// [3] // same for "green"
|
/// [3] // same for "green"
|
||||||
/// [2] // same for "orange"
|
/// [2] // same for "orange"
|
||||||
|
@ -223,12 +223,9 @@ impl<'a> Filter<'a> {
|
|||||||
impl<'a> Filter<'a> {
|
impl<'a> Filter<'a> {
|
||||||
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
|
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
|
||||||
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
|
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
|
||||||
let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
|
|
||||||
let filterable_fields = index.filterable_fields(rtxn)?;
|
let filterable_fields = index.filterable_fields(rtxn)?;
|
||||||
|
|
||||||
// and finally we delete all the soft_deleted_documents, again, only once at the very end
|
|
||||||
self.inner_evaluate(rtxn, index, &filterable_fields)
|
self.inner_evaluate(rtxn, index, &filterable_fields)
|
||||||
.map(|result| result - soft_deleted_documents)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn evaluate_operator(
|
fn evaluate_operator(
|
||||||
|
@ -12,7 +12,7 @@ use super::Word;
|
|||||||
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
|
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
|
||||||
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
|
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
|
||||||
use crate::{
|
use crate::{
|
||||||
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
|
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// A cache storing pointers to values in the LMDB databases.
|
/// A cache storing pointers to values in the LMDB databases.
|
||||||
@ -25,7 +25,7 @@ pub struct DatabaseCache<'ctx> {
|
|||||||
pub word_pair_proximity_docids:
|
pub word_pair_proximity_docids:
|
||||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
||||||
pub word_prefix_pair_proximity_docids:
|
pub word_prefix_pair_proximity_docids:
|
||||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>,
|
||||||
pub prefix_word_pair_proximity_docids:
|
pub prefix_word_pair_proximity_docids:
|
||||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
||||||
pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
||||||
@ -168,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
merge_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||||
self.txn,
|
self.txn,
|
||||||
word,
|
word,
|
||||||
self.word_interner.get(word).as_str(),
|
self.word_interner.get(word).as_str(),
|
||||||
@ -182,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
word: Interned<String>,
|
word: Interned<String>,
|
||||||
) -> Result<Option<RoaringBitmap>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||||
self.txn,
|
self.txn,
|
||||||
word,
|
word,
|
||||||
self.word_interner.get(word).as_str(),
|
self.word_interner.get(word).as_str(),
|
||||||
@ -230,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
merge_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||||
self.txn,
|
self.txn,
|
||||||
prefix,
|
prefix,
|
||||||
self.word_interner.get(prefix).as_str(),
|
self.word_interner.get(prefix).as_str(),
|
||||||
@ -244,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
prefix: Interned<String>,
|
prefix: Interned<String>,
|
||||||
) -> Result<Option<RoaringBitmap>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||||
self.txn,
|
self.txn,
|
||||||
prefix,
|
prefix,
|
||||||
self.word_interner.get(prefix).as_str(),
|
self.word_interner.get(prefix).as_str(),
|
||||||
@ -297,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
prefix2: Interned<String>,
|
prefix2: Interned<String>,
|
||||||
proximity: u8,
|
proximity: u8,
|
||||||
) -> Result<Option<RoaringBitmap>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
let docids = match self
|
||||||
self.txn,
|
.db_cache
|
||||||
(proximity, word1, prefix2),
|
.word_prefix_pair_proximity_docids
|
||||||
&(
|
.entry((proximity, word1, prefix2))
|
||||||
|
{
|
||||||
|
Entry::Occupied(docids) => docids.get().clone(),
|
||||||
|
Entry::Vacant(entry) => {
|
||||||
|
// compute docids using prefix iter and store the result in the cache.
|
||||||
|
let key = U8StrStrCodec::bytes_encode(&(
|
||||||
proximity,
|
proximity,
|
||||||
self.word_interner.get(word1).as_str(),
|
self.word_interner.get(word1).as_str(),
|
||||||
self.word_interner.get(prefix2).as_str(),
|
self.word_interner.get(prefix2).as_str(),
|
||||||
),
|
))
|
||||||
&mut self.db_cache.word_prefix_pair_proximity_docids,
|
.unwrap()
|
||||||
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
.into_owned();
|
||||||
)
|
let mut prefix_docids = RoaringBitmap::new();
|
||||||
|
let remap_key_type = self
|
||||||
|
.index
|
||||||
|
.word_pair_proximity_docids
|
||||||
|
.remap_key_type::<ByteSlice>()
|
||||||
|
.prefix_iter(self.txn, &key)?;
|
||||||
|
for result in remap_key_type {
|
||||||
|
let (_, docids) = result?;
|
||||||
|
|
||||||
|
prefix_docids |= docids;
|
||||||
}
|
}
|
||||||
|
entry.insert(Some(prefix_docids.clone()));
|
||||||
|
Some(prefix_docids)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Ok(docids)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn get_db_prefix_word_pair_proximity_docids(
|
pub fn get_db_prefix_word_pair_proximity_docids(
|
||||||
&mut self,
|
&mut self,
|
||||||
left_prefix: Interned<String>,
|
left_prefix: Interned<String>,
|
||||||
right: Interned<String>,
|
right: Interned<String>,
|
||||||
proximity: u8,
|
proximity: u8,
|
||||||
) -> Result<Option<RoaringBitmap>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
// only accept exact matches on reverted positions
|
||||||
self.txn,
|
self.get_db_word_pair_proximity_docids(left_prefix, right, proximity)
|
||||||
(proximity, left_prefix, right),
|
|
||||||
&(
|
|
||||||
proximity,
|
|
||||||
self.word_interner.get(left_prefix).as_str(),
|
|
||||||
self.word_interner.get(right).as_str(),
|
|
||||||
),
|
|
||||||
&mut self.db_cache.prefix_word_pair_proximity_docids,
|
|
||||||
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_db_word_fid_docids(
|
pub fn get_db_word_fid_docids(
|
||||||
|
@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
|
|||||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||||
s.query("best s");
|
s.query("best s");
|
||||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]");
|
||||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
|
||||||
@ -379,13 +379,13 @@ fn test_proximity_prefix_db() {
|
|||||||
insta::assert_debug_snapshot!(texts, @r###"
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
[
|
[
|
||||||
"\"this is the best summer meal\"",
|
"\"this is the best summer meal\"",
|
||||||
"\"summer best\"",
|
|
||||||
"\"this is the best meal of summer\"",
|
"\"this is the best meal of summer\"",
|
||||||
"\"summer x best\"",
|
|
||||||
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
||||||
"\"this is the best cooked meal of the summer\"",
|
"\"this is the best cooked meal of the summer\"",
|
||||||
"\"this is the best meal of the summer\"",
|
"\"this is the best meal of the summer\"",
|
||||||
"\"summer x y best\"",
|
"\"summer x y best\"",
|
||||||
|
"\"summer x best\"",
|
||||||
|
"\"summer best\"",
|
||||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||||
]
|
]
|
||||||
"###);
|
"###);
|
||||||
@ -423,17 +423,17 @@ fn test_proximity_prefix_db() {
|
|||||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||||
s.query("best win");
|
s.query("best win");
|
||||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
|
||||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
|
||||||
insta::assert_debug_snapshot!(texts, @r###"
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
[
|
[
|
||||||
|
"\"this is the best winter meal\"",
|
||||||
|
"\"this is the best meal of winter\"",
|
||||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||||
"\"this is the best cooked meal of the winter\"",
|
"\"this is the best cooked meal of the winter\"",
|
||||||
"\"this is the best meal of the winter\"",
|
"\"this is the best meal of the winter\"",
|
||||||
"\"this is the best meal of winter\"",
|
|
||||||
"\"this is the best winter meal\"",
|
|
||||||
"\"winter x y best\"",
|
"\"winter x y best\"",
|
||||||
"\"winter x best\"",
|
"\"winter x best\"",
|
||||||
"\"winter best\"",
|
"\"winter best\"",
|
||||||
@ -471,20 +471,20 @@ fn test_proximity_prefix_db() {
|
|||||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||||
s.query("best wi");
|
s.query("best wi");
|
||||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
|
||||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
|
||||||
insta::assert_debug_snapshot!(texts, @r###"
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
[
|
[
|
||||||
"\"this is the best winter meal\"",
|
"\"this is the best winter meal\"",
|
||||||
"\"winter best\"",
|
|
||||||
"\"this is the best meal of winter\"",
|
"\"this is the best meal of winter\"",
|
||||||
"\"winter x best\"",
|
|
||||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||||
"\"this is the best cooked meal of the winter\"",
|
"\"this is the best cooked meal of the winter\"",
|
||||||
"\"this is the best meal of the winter\"",
|
"\"this is the best meal of the winter\"",
|
||||||
"\"winter x y best\"",
|
"\"winter x y best\"",
|
||||||
|
"\"winter x best\"",
|
||||||
|
"\"winter best\"",
|
||||||
]
|
]
|
||||||
"###);
|
"###);
|
||||||
}
|
}
|
||||||
|
@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
|
|||||||
},
|
},
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
[
|
|
||||||
Proximity(
|
|
||||||
Rank {
|
|
||||||
rank: 3,
|
|
||||||
max_rank: 4,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
],
|
|
||||||
[
|
[
|
||||||
Proximity(
|
Proximity(
|
||||||
Rank {
|
Rank {
|
||||||
@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
|
|||||||
[
|
[
|
||||||
Proximity(
|
Proximity(
|
||||||
Rank {
|
Rank {
|
||||||
rank: 2,
|
rank: 1,
|
||||||
|
max_rank: 4,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
[
|
||||||
|
Proximity(
|
||||||
|
Rank {
|
||||||
|
rank: 1,
|
||||||
max_rank: 4,
|
max_rank: 4,
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
|
@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
|
|||||||
},
|
},
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
[
|
|
||||||
Proximity(
|
|
||||||
Rank {
|
|
||||||
rank: 3,
|
|
||||||
max_rank: 4,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
],
|
|
||||||
[
|
[
|
||||||
Proximity(
|
Proximity(
|
||||||
Rank {
|
Rank {
|
||||||
@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
|
|||||||
[
|
[
|
||||||
Proximity(
|
Proximity(
|
||||||
Rank {
|
Rank {
|
||||||
rank: 2,
|
rank: 1,
|
||||||
|
max_rank: 4,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
[
|
||||||
|
Proximity(
|
||||||
|
Rank {
|
||||||
|
rank: 1,
|
||||||
max_rank: 4,
|
max_rank: 4,
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
|
@ -6,7 +6,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
|||||||
[
|
[
|
||||||
Proximity(
|
Proximity(
|
||||||
Rank {
|
Rank {
|
||||||
rank: 1,
|
rank: 4,
|
||||||
max_rank: 4,
|
max_rank: 4,
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
@ -14,7 +14,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
|||||||
[
|
[
|
||||||
Proximity(
|
Proximity(
|
||||||
Rank {
|
Rank {
|
||||||
rank: 1,
|
rank: 2,
|
||||||
max_rank: 4,
|
max_rank: 4,
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
|
@ -13,6 +13,7 @@ This module tests the `sort` ranking rule:
|
|||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
use maplit::hashset;
|
use maplit::hashset;
|
||||||
|
use meili_snap::insta;
|
||||||
|
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::search::new::tests::collect_field_values;
|
use crate::search::new::tests::collect_field_values;
|
||||||
|
@ -4,9 +4,8 @@ use std::path::Path;
|
|||||||
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::facet::FacetType;
|
|
||||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
|
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
|
||||||
use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index};
|
use crate::{make_db_snap_from_iter, obkv_to_json, Index};
|
||||||
|
|
||||||
#[track_caller]
|
#[track_caller]
|
||||||
pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) {
|
pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) {
|
||||||
@ -98,7 +97,6 @@ Create a snapshot test of the given database.
|
|||||||
- `facet_id_string_docids`
|
- `facet_id_string_docids`
|
||||||
- `documents_ids`
|
- `documents_ids`
|
||||||
- `stop_words`
|
- `stop_words`
|
||||||
- `soft_deleted_documents_ids`
|
|
||||||
- `field_distribution`
|
- `field_distribution`
|
||||||
- `fields_ids_map`
|
- `fields_ids_map`
|
||||||
- `geo_faceted_documents_ids`
|
- `geo_faceted_documents_ids`
|
||||||
@ -221,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
|
|||||||
&format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
|
&format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
|
|
||||||
make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |(
|
|
||||||
(proximity, word1, prefix),
|
|
||||||
b,
|
|
||||||
)| {
|
|
||||||
&format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String {
|
|
||||||
make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |(
|
|
||||||
(proximity, prefix, word2),
|
|
||||||
b,
|
|
||||||
)| {
|
|
||||||
&format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
pub fn snap_word_position_docids(index: &Index) -> String {
|
pub fn snap_word_position_docids(index: &Index) -> String {
|
||||||
make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
|
make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
|
||||||
&format!("{word:<16} {position:<6} {}", display_bitmap(&b))
|
&format!("{word:<16} {position:<6} {}", display_bitmap(&b))
|
||||||
@ -308,12 +290,6 @@ pub fn snap_stop_words(index: &Index) -> String {
|
|||||||
let snap = format!("{stop_words:?}");
|
let snap = format!("{stop_words:?}");
|
||||||
snap
|
snap
|
||||||
}
|
}
|
||||||
pub fn snap_soft_deleted_documents_ids(index: &Index) -> String {
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap();
|
|
||||||
|
|
||||||
display_bitmap(&soft_deleted_documents_ids)
|
|
||||||
}
|
|
||||||
pub fn snap_field_distributions(index: &Index) -> String {
|
pub fn snap_field_distributions(index: &Index) -> String {
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
let mut snap = String::new();
|
let mut snap = String::new();
|
||||||
@ -340,50 +316,21 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
|
|||||||
}
|
}
|
||||||
pub fn snap_external_documents_ids(index: &Index) -> String {
|
pub fn snap_external_documents_ids(index: &Index) -> String {
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap();
|
let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap();
|
||||||
|
// ensure fixed order (not guaranteed by hashmap)
|
||||||
|
let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect();
|
||||||
|
external_ids.sort_by(|(l, _), (r, _)| l.cmp(r));
|
||||||
|
|
||||||
let mut snap = String::new();
|
let mut snap = String::new();
|
||||||
|
|
||||||
writeln!(&mut snap, "soft:").unwrap();
|
writeln!(&mut snap, "docids:").unwrap();
|
||||||
let stream_soft = soft.stream();
|
for (key, id) in external_ids {
|
||||||
let soft_external_ids = stream_soft.into_str_vec().unwrap();
|
|
||||||
for (key, id) in soft_external_ids {
|
|
||||||
writeln!(&mut snap, "{key:<24} {id}").unwrap();
|
|
||||||
}
|
|
||||||
writeln!(&mut snap, "hard:").unwrap();
|
|
||||||
let stream_hard = hard.stream();
|
|
||||||
let hard_external_ids = stream_hard.into_str_vec().unwrap();
|
|
||||||
for (key, id) in hard_external_ids {
|
|
||||||
writeln!(&mut snap, "{key:<24} {id}").unwrap();
|
writeln!(&mut snap, "{key:<24} {id}").unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
snap
|
snap
|
||||||
}
|
}
|
||||||
pub fn snap_number_faceted_documents_ids(index: &Index) -> String {
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
|
||||||
let mut snap = String::new();
|
|
||||||
for field_id in fields_ids_map.ids() {
|
|
||||||
let number_faceted_documents_ids =
|
|
||||||
index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap();
|
|
||||||
writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids))
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
snap
|
|
||||||
}
|
|
||||||
pub fn snap_string_faceted_documents_ids(index: &Index) -> String {
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
|
||||||
|
|
||||||
let mut snap = String::new();
|
|
||||||
for field_id in fields_ids_map.ids() {
|
|
||||||
let string_faceted_documents_ids =
|
|
||||||
index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap();
|
|
||||||
writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids))
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
snap
|
|
||||||
}
|
|
||||||
pub fn snap_words_fst(index: &Index) -> String {
|
pub fn snap_words_fst(index: &Index) -> String {
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
let words_fst = index.words_fst(&rtxn).unwrap();
|
let words_fst = index.words_fst(&rtxn).unwrap();
|
||||||
@ -516,9 +463,6 @@ macro_rules! full_snap_of_db {
|
|||||||
($index:ident, stop_words) => {{
|
($index:ident, stop_words) => {{
|
||||||
$crate::snapshot_tests::snap_stop_words(&$index)
|
$crate::snapshot_tests::snap_stop_words(&$index)
|
||||||
}};
|
}};
|
||||||
($index:ident, soft_deleted_documents_ids) => {{
|
|
||||||
$crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index)
|
|
||||||
}};
|
|
||||||
($index:ident, field_distribution) => {{
|
($index:ident, field_distribution) => {{
|
||||||
$crate::snapshot_tests::snap_field_distributions(&$index)
|
$crate::snapshot_tests::snap_field_distributions(&$index)
|
||||||
}};
|
}};
|
||||||
@ -531,12 +475,6 @@ macro_rules! full_snap_of_db {
|
|||||||
($index:ident, external_documents_ids) => {{
|
($index:ident, external_documents_ids) => {{
|
||||||
$crate::snapshot_tests::snap_external_documents_ids(&$index)
|
$crate::snapshot_tests::snap_external_documents_ids(&$index)
|
||||||
}};
|
}};
|
||||||
($index:ident, number_faceted_documents_ids) => {{
|
|
||||||
$crate::snapshot_tests::snap_number_faceted_documents_ids(&$index)
|
|
||||||
}};
|
|
||||||
($index:ident, string_faceted_documents_ids) => {{
|
|
||||||
$crate::snapshot_tests::snap_string_faceted_documents_ids(&$index)
|
|
||||||
}};
|
|
||||||
($index:ident, words_fst) => {{
|
($index:ident, words_fst) => {{
|
||||||
$crate::snapshot_tests::snap_words_fst(&$index)
|
$crate::snapshot_tests::snap_words_fst(&$index)
|
||||||
}};
|
}};
|
||||||
|
@ -8,16 +8,11 @@ pub struct AvailableDocumentsIds {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl AvailableDocumentsIds {
|
impl AvailableDocumentsIds {
|
||||||
pub fn from_documents_ids(
|
pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
|
||||||
docids: &RoaringBitmap,
|
match docids.max() {
|
||||||
soft_deleted_docids: &RoaringBitmap,
|
|
||||||
) -> AvailableDocumentsIds {
|
|
||||||
let used_docids = docids | soft_deleted_docids;
|
|
||||||
|
|
||||||
match used_docids.max() {
|
|
||||||
Some(last_id) => {
|
Some(last_id) => {
|
||||||
let mut available = RoaringBitmap::from_iter(0..last_id);
|
let mut available = RoaringBitmap::from_iter(0..last_id);
|
||||||
available -= used_docids;
|
available -= docids;
|
||||||
|
|
||||||
let iter = match last_id.checked_add(1) {
|
let iter = match last_id.checked_add(1) {
|
||||||
Some(id) => id..=u32::max_value(),
|
Some(id) => id..=u32::max_value(),
|
||||||
@ -50,7 +45,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn empty() {
|
fn empty() {
|
||||||
let base = RoaringBitmap::new();
|
let base = RoaringBitmap::new();
|
||||||
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
|
let left = AvailableDocumentsIds::from_documents_ids(&base);
|
||||||
let right = 0..=u32::max_value();
|
let right = 0..=u32::max_value();
|
||||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||||
}
|
}
|
||||||
@ -63,28 +58,8 @@ mod tests {
|
|||||||
base.insert(100);
|
base.insert(100);
|
||||||
base.insert(405);
|
base.insert(405);
|
||||||
|
|
||||||
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
|
let left = AvailableDocumentsIds::from_documents_ids(&base);
|
||||||
let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
|
let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
|
||||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn soft_deleted() {
|
|
||||||
let mut base = RoaringBitmap::new();
|
|
||||||
base.insert(0);
|
|
||||||
base.insert(10);
|
|
||||||
base.insert(100);
|
|
||||||
base.insert(405);
|
|
||||||
|
|
||||||
let mut soft_deleted = RoaringBitmap::new();
|
|
||||||
soft_deleted.insert(1);
|
|
||||||
soft_deleted.insert(11);
|
|
||||||
soft_deleted.insert(101);
|
|
||||||
soft_deleted.insert(406);
|
|
||||||
|
|
||||||
let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted);
|
|
||||||
let right =
|
|
||||||
(0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n));
|
|
||||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -1,8 +1,7 @@
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
use crate::facet::FacetType;
|
use crate::{FieldDistribution, Index, Result};
|
||||||
use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};
|
|
||||||
|
|
||||||
pub struct ClearDocuments<'t, 'u, 'i> {
|
pub struct ClearDocuments<'t, 'u, 'i> {
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
@ -21,13 +20,12 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
let Index {
|
let Index {
|
||||||
env: _env,
|
env: _env,
|
||||||
main: _main,
|
main: _main,
|
||||||
|
external_documents_ids,
|
||||||
word_docids,
|
word_docids,
|
||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
exact_word_prefix_docids,
|
exact_word_prefix_docids,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
|
||||||
prefix_word_pair_proximity_docids,
|
|
||||||
word_position_docids,
|
word_position_docids,
|
||||||
word_fid_docids,
|
word_fid_docids,
|
||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
@ -51,43 +49,23 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
|
|
||||||
// We retrieve the number of documents ids that we are deleting.
|
// We retrieve the number of documents ids that we are deleting.
|
||||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
|
||||||
|
|
||||||
// We clean some of the main engine datastructures.
|
// We clean some of the main engine datastructures.
|
||||||
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
|
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
|
||||||
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
|
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
|
||||||
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
|
|
||||||
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
|
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
|
||||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?;
|
|
||||||
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
||||||
self.index.delete_geo_rtree(self.wtxn)?;
|
self.index.delete_geo_rtree(self.wtxn)?;
|
||||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||||
self.index.delete_vector_hnsw(self.wtxn)?;
|
self.index.delete_vector_hnsw(self.wtxn)?;
|
||||||
|
|
||||||
// We clean all the faceted documents ids.
|
|
||||||
for field_id in faceted_fields {
|
|
||||||
self.index.put_faceted_documents_ids(
|
|
||||||
self.wtxn,
|
|
||||||
field_id,
|
|
||||||
FacetType::Number,
|
|
||||||
&empty_roaring,
|
|
||||||
)?;
|
|
||||||
self.index.put_faceted_documents_ids(
|
|
||||||
self.wtxn,
|
|
||||||
field_id,
|
|
||||||
FacetType::String,
|
|
||||||
&empty_roaring,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clear the other databases.
|
// Clear the other databases.
|
||||||
|
external_documents_ids.clear(self.wtxn)?;
|
||||||
word_docids.clear(self.wtxn)?;
|
word_docids.clear(self.wtxn)?;
|
||||||
exact_word_docids.clear(self.wtxn)?;
|
exact_word_docids.clear(self.wtxn)?;
|
||||||
word_prefix_docids.clear(self.wtxn)?;
|
word_prefix_docids.clear(self.wtxn)?;
|
||||||
exact_word_prefix_docids.clear(self.wtxn)?;
|
exact_word_prefix_docids.clear(self.wtxn)?;
|
||||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
|
||||||
prefix_word_pair_proximity_docids.clear(self.wtxn)?;
|
|
||||||
word_position_docids.clear(self.wtxn)?;
|
word_position_docids.clear(self.wtxn)?;
|
||||||
word_fid_docids.clear(self.wtxn)?;
|
word_fid_docids.clear(self.wtxn)?;
|
||||||
field_id_word_count_docids.clear(self.wtxn)?;
|
field_id_word_count_docids.clear(self.wtxn)?;
|
||||||
@ -140,7 +118,7 @@ mod tests {
|
|||||||
|
|
||||||
assert!(index.words_fst(&rtxn).unwrap().is_empty());
|
assert!(index.words_fst(&rtxn).unwrap().is_empty());
|
||||||
assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
|
assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
|
||||||
assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
|
assert!(index.external_documents_ids().is_empty(&rtxn).unwrap());
|
||||||
assert!(index.documents_ids(&rtxn).unwrap().is_empty());
|
assert!(index.documents_ids(&rtxn).unwrap().is_empty());
|
||||||
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
|
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
|
||||||
assert!(index.geo_rtree(&rtxn).unwrap().is_none());
|
assert!(index.geo_rtree(&rtxn).unwrap().is_none());
|
||||||
@ -150,7 +128,6 @@ mod tests {
|
|||||||
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
|
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
|
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
|
||||||
assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
|
assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
|
assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
|
assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
|
||||||
|
125
milli/src/update/del_add.rs
Normal file
125
milli/src/update/del_add.rs
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
use obkv::Key;
|
||||||
|
|
||||||
|
pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
|
||||||
|
pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;
|
||||||
|
|
||||||
|
/// DelAdd defines the new value to add in the database and old value to delete from the database.
|
||||||
|
///
|
||||||
|
/// Its used in an OBKV to be serialized in grenad files.
|
||||||
|
#[repr(u8)]
|
||||||
|
#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)]
|
||||||
|
pub enum DelAdd {
|
||||||
|
Deletion = 0,
|
||||||
|
Addition = 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Key for DelAdd {
|
||||||
|
const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
|
||||||
|
type BYTES = [u8; Self::BYTES_SIZE];
|
||||||
|
|
||||||
|
fn to_be_bytes(&self) -> Self::BYTES {
|
||||||
|
u8::to_be_bytes(*self as u8)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn from_be_bytes(array: Self::BYTES) -> Self {
|
||||||
|
match u8::from_be_bytes(array) {
|
||||||
|
0 => Self::Deletion,
|
||||||
|
1 => Self::Addition,
|
||||||
|
otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
|
||||||
|
///
|
||||||
|
/// Deletion: put all the values under DelAdd::Deletion
|
||||||
|
/// Addition: put all the values under DelAdd::Addition,
|
||||||
|
/// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition,
|
||||||
|
pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
|
||||||
|
reader: obkv::KvReader<K>,
|
||||||
|
operation: DelAddOperation,
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
let mut writer = obkv::KvWriter::new(buffer);
|
||||||
|
let mut value_buffer = Vec::new();
|
||||||
|
for (key, value) in reader.iter() {
|
||||||
|
value_buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) {
|
||||||
|
value_writer.insert(DelAdd::Deletion, value)?;
|
||||||
|
}
|
||||||
|
if matches!(operation, DelAddOperation::Addition | DelAddOperation::DeletionAndAddition) {
|
||||||
|
value_writer.insert(DelAdd::Addition, value)?;
|
||||||
|
}
|
||||||
|
value_writer.finish()?;
|
||||||
|
writer.insert(key, &value_buffer)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enum controlling the side of the DelAdd obkv in which the provided value will be written.
|
||||||
|
#[derive(Debug, Clone, Copy)]
|
||||||
|
pub enum DelAddOperation {
|
||||||
|
Deletion,
|
||||||
|
Addition,
|
||||||
|
DeletionAndAddition,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
|
||||||
|
///
|
||||||
|
/// putting each deletion obkv's keys under an DelAdd::Deletion
|
||||||
|
/// and putting each addition obkv's keys under an DelAdd::Addition
|
||||||
|
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
|
||||||
|
deletion: obkv::KvReader<K>,
|
||||||
|
addition: obkv::KvReader<K>,
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
use itertools::merge_join_by;
|
||||||
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
|
|
||||||
|
let mut writer = obkv::KvWriter::new(buffer);
|
||||||
|
let mut value_buffer = Vec::new();
|
||||||
|
|
||||||
|
for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||||
|
value_buffer.clear();
|
||||||
|
match eob {
|
||||||
|
Left((k, v)) => {
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
value_writer.insert(DelAdd::Deletion, v).unwrap();
|
||||||
|
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||||
|
}
|
||||||
|
Right((k, v)) => {
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
value_writer.insert(DelAdd::Addition, v).unwrap();
|
||||||
|
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||||
|
}
|
||||||
|
Both((k, deletion), (_, addition)) => {
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||||
|
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||||
|
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
|
||||||
|
del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A function that extracts and returns the Add side of a DelAdd obkv.
|
||||||
|
/// This is useful when there are no previous value in the database and
|
||||||
|
/// therefore we don't need to do a diff with what's already there.
|
||||||
|
///
|
||||||
|
/// If there is no Add side we currently write an empty buffer
|
||||||
|
/// which is a valid CboRoaringBitmap.
|
||||||
|
#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
|
||||||
|
pub fn deladd_serialize_add_side<'a>(
|
||||||
|
obkv: &'a [u8],
|
||||||
|
_buffer: &mut Vec<u8>,
|
||||||
|
) -> crate::Result<&'a [u8]> {
|
||||||
|
Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
@ -1,10 +1,9 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
|
|
||||||
use grenad::CompressionType;
|
use grenad::CompressionType;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use heed::{BytesEncode, Error, RoTxn, RwTxn};
|
use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||||
@ -13,17 +12,15 @@ use crate::heed_codec::facet::{
|
|||||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||||
};
|
};
|
||||||
use crate::heed_codec::ByteSliceRefCodec;
|
use crate::heed_codec::ByteSliceRefCodec;
|
||||||
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||||
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
|
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
|
||||||
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
|
||||||
|
|
||||||
/// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases
|
/// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases
|
||||||
/// by rebuilding the database "from scratch".
|
/// by rebuilding the database "from scratch".
|
||||||
///
|
///
|
||||||
/// First, the new elements are inserted into the level 0 of the database. Then, the
|
/// First, the new elements are inserted into the level 0 of the database. Then, the
|
||||||
/// higher levels are cleared and recomputed from the content of level 0.
|
/// higher levels are cleared and recomputed from the content of level 0.
|
||||||
///
|
|
||||||
/// Finally, the `faceted_documents_ids` value in the main database of `Index`
|
|
||||||
/// is updated to contain the new set of faceted documents.
|
|
||||||
pub struct FacetsUpdateBulk<'i> {
|
pub struct FacetsUpdateBulk<'i> {
|
||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
group_size: u8,
|
group_size: u8,
|
||||||
@ -31,7 +28,7 @@ pub struct FacetsUpdateBulk<'i> {
|
|||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
field_ids: Vec<FieldId>,
|
field_ids: Vec<FieldId>,
|
||||||
// None if level 0 does not need to be updated
|
// None if level 0 does not need to be updated
|
||||||
new_data: Option<grenad::Reader<BufReader<File>>>,
|
delta_data: Option<grenad::Reader<BufReader<File>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'i> FacetsUpdateBulk<'i> {
|
impl<'i> FacetsUpdateBulk<'i> {
|
||||||
@ -39,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
field_ids: Vec<FieldId>,
|
field_ids: Vec<FieldId>,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
new_data: grenad::Reader<BufReader<File>>,
|
delta_data: grenad::Reader<BufReader<File>>,
|
||||||
group_size: u8,
|
group_size: u8,
|
||||||
min_level_size: u8,
|
min_level_size: u8,
|
||||||
) -> FacetsUpdateBulk<'i> {
|
) -> FacetsUpdateBulk<'i> {
|
||||||
@ -49,7 +46,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
group_size,
|
group_size,
|
||||||
min_level_size,
|
min_level_size,
|
||||||
facet_type,
|
facet_type,
|
||||||
new_data: Some(new_data),
|
delta_data: Some(delta_data),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -64,13 +61,13 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
group_size: FACET_GROUP_SIZE,
|
group_size: FACET_GROUP_SIZE,
|
||||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||||
facet_type,
|
facet_type,
|
||||||
new_data: None,
|
delta_data: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[logging_timer::time("FacetsUpdateBulk::{}")]
|
#[logging_timer::time("FacetsUpdateBulk::{}")]
|
||||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||||
let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self;
|
let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
|
||||||
|
|
||||||
let db = match facet_type {
|
let db = match facet_type {
|
||||||
FacetType::String => index
|
FacetType::String => index
|
||||||
@ -81,12 +78,9 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
|
let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
|
||||||
|
|
||||||
inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
|
inner.update(wtxn, &field_ids)?;
|
||||||
index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?;
|
|
||||||
Ok(())
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -95,26 +89,19 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
||||||
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
||||||
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||||
pub new_data: Option<grenad::Reader<R>>,
|
pub delta_data: Option<grenad::Reader<R>>,
|
||||||
pub group_size: u8,
|
pub group_size: u8,
|
||||||
pub min_level_size: u8,
|
pub min_level_size: u8,
|
||||||
}
|
}
|
||||||
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||||
pub fn update(
|
pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> {
|
||||||
mut self,
|
|
||||||
wtxn: &mut RwTxn,
|
|
||||||
field_ids: &[u16],
|
|
||||||
mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>,
|
|
||||||
) -> Result<()> {
|
|
||||||
self.update_level0(wtxn)?;
|
self.update_level0(wtxn)?;
|
||||||
for &field_id in field_ids.iter() {
|
for &field_id in field_ids.iter() {
|
||||||
self.clear_levels(wtxn, field_id)?;
|
self.clear_levels(wtxn, field_id)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
for &field_id in field_ids.iter() {
|
for &field_id in field_ids.iter() {
|
||||||
let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?;
|
let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
|
||||||
|
|
||||||
handle_all_docids(wtxn, field_id, all_docids)?;
|
|
||||||
|
|
||||||
for level_reader in level_readers {
|
for level_reader in level_readers {
|
||||||
let mut cursor = level_reader.into_cursor()?;
|
let mut cursor = level_reader.into_cursor()?;
|
||||||
@ -133,19 +120,27 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
|||||||
self.db.delete_range(wtxn, &range).map(drop)?;
|
self.db.delete_range(wtxn, &range).map(drop)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
|
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
|
||||||
let new_data = match self.new_data.take() {
|
let delta_data = match self.delta_data.take() {
|
||||||
Some(x) => x,
|
Some(x) => x,
|
||||||
None => return Ok(()),
|
None => return Ok(()),
|
||||||
};
|
};
|
||||||
if self.db.is_empty(wtxn)? {
|
if self.db.is_empty(wtxn)? {
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
|
let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
|
||||||
let mut cursor = new_data.into_cursor()?;
|
let mut cursor = delta_data.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
if !valid_lmdb_key(key) {
|
if !valid_lmdb_key(key) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
let value = KvReaderDelAdd::new(value);
|
||||||
|
|
||||||
|
// DB is empty, it is safe to ignore Del operations
|
||||||
|
let Some(value) = value.get(DelAdd::Addition) else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
// the group size for level 0
|
// the group size for level 0
|
||||||
buffer.push(1);
|
buffer.push(1);
|
||||||
@ -157,11 +152,14 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
|||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
let database = self.db.remap_types::<ByteSlice, ByteSlice>();
|
let database = self.db.remap_types::<ByteSlice, ByteSlice>();
|
||||||
|
|
||||||
let mut cursor = new_data.into_cursor()?;
|
let mut cursor = delta_data.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
if !valid_lmdb_key(key) {
|
if !valid_lmdb_key(key) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let value = KvReaderDelAdd::new(value);
|
||||||
|
|
||||||
// the value is a CboRoaringBitmap, but I still need to prepend the
|
// the value is a CboRoaringBitmap, but I still need to prepend the
|
||||||
// group size for level 0 (= 1) to it
|
// group size for level 0 (= 1) to it
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
@ -169,35 +167,39 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
|||||||
// then we extend the buffer with the docids bitmap
|
// then we extend the buffer with the docids bitmap
|
||||||
match database.get(wtxn, key)? {
|
match database.get(wtxn, key)? {
|
||||||
Some(prev_value) => {
|
Some(prev_value) => {
|
||||||
|
// prev_value is the group size for level 0, followed by the previous bitmap.
|
||||||
let old_bitmap = &prev_value[1..];
|
let old_bitmap = &prev_value[1..];
|
||||||
CboRoaringBitmapCodec::merge_into(
|
CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
|
||||||
&[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)],
|
|
||||||
&mut buffer,
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
|
// it is safe to ignore the del in that case.
|
||||||
|
let Some(value) = value.get(DelAdd::Addition) else {
|
||||||
|
// won't put the key in DB as the value would be empty
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
buffer.extend_from_slice(value);
|
buffer.extend_from_slice(value);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
let new_bitmap = &buffer[1..];
|
||||||
|
// if the new bitmap is empty, let's remove it
|
||||||
|
if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
|
||||||
|
database.delete(wtxn, key)?;
|
||||||
|
} else {
|
||||||
database.put(wtxn, key, &buffer)?;
|
database.put(wtxn, key, &buffer)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
fn compute_levels_for_field_id(
|
fn compute_levels_for_field_id(
|
||||||
&self,
|
&self,
|
||||||
field_id: FieldId,
|
field_id: FieldId,
|
||||||
txn: &RoTxn,
|
txn: &RoTxn,
|
||||||
) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> {
|
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||||
let mut all_docids = RoaringBitmap::new();
|
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
|
||||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
|
|
||||||
for bitmap in bitmaps {
|
|
||||||
all_docids |= bitmap;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok((subwriters, all_docids))
|
Ok(subwriters)
|
||||||
}
|
}
|
||||||
#[allow(clippy::type_complexity)]
|
#[allow(clippy::type_complexity)]
|
||||||
fn read_level_0<'t>(
|
fn read_level_0<'t>(
|
||||||
@ -491,7 +493,6 @@ mod tests {
|
|||||||
index.add_documents(documents).unwrap();
|
index.add_documents(documents).unwrap();
|
||||||
|
|
||||||
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
|
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
|
||||||
db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -1,360 +0,0 @@
|
|||||||
use std::collections::{HashMap, HashSet};
|
|
||||||
|
|
||||||
use heed::RwTxn;
|
|
||||||
use log::debug;
|
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
use time::OffsetDateTime;
|
|
||||||
|
|
||||||
use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
|
||||||
use crate::facet::FacetType;
|
|
||||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
|
||||||
use crate::heed_codec::ByteSliceRefCodec;
|
|
||||||
use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner};
|
|
||||||
use crate::{FieldId, Index, Result};
|
|
||||||
|
|
||||||
/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
|
||||||
///
|
|
||||||
/// Depending on the number of removed elements and the existing size of the database, we use either
|
|
||||||
/// a bulk delete method or an incremental delete method.
|
|
||||||
pub struct FacetsDelete<'i, 'b> {
|
|
||||||
index: &'i Index,
|
|
||||||
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
|
||||||
facet_type: FacetType,
|
|
||||||
affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
|
|
||||||
docids_to_delete: &'b RoaringBitmap,
|
|
||||||
group_size: u8,
|
|
||||||
max_group_size: u8,
|
|
||||||
min_level_size: u8,
|
|
||||||
}
|
|
||||||
impl<'i, 'b> FacetsDelete<'i, 'b> {
|
|
||||||
pub fn new(
|
|
||||||
index: &'i Index,
|
|
||||||
facet_type: FacetType,
|
|
||||||
affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
|
|
||||||
docids_to_delete: &'b RoaringBitmap,
|
|
||||||
) -> Self {
|
|
||||||
let database = match facet_type {
|
|
||||||
FacetType::String => index
|
|
||||||
.facet_id_string_docids
|
|
||||||
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
|
|
||||||
FacetType::Number => {
|
|
||||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
Self {
|
|
||||||
index,
|
|
||||||
database,
|
|
||||||
facet_type,
|
|
||||||
affected_facet_values,
|
|
||||||
docids_to_delete,
|
|
||||||
group_size: FACET_GROUP_SIZE,
|
|
||||||
max_group_size: FACET_MAX_GROUP_SIZE,
|
|
||||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> {
|
|
||||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
|
||||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
|
||||||
|
|
||||||
for (field_id, affected_facet_values) in self.affected_facet_values {
|
|
||||||
// This is an incorrect condition, since we assume that the length of the database is equal
|
|
||||||
// to the number of facet values for the given field_id. It means that in some cases, we might
|
|
||||||
// wrongly choose the incremental indexer over the bulk indexer. But the only case where that could
|
|
||||||
// really be a performance problem is when we fully delete a large ratio of all facet values for
|
|
||||||
// each field id. This would almost never happen. Still, to be overly cautious, I have added a
|
|
||||||
// 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance
|
|
||||||
// penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead.
|
|
||||||
if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) {
|
|
||||||
// Bulk delete
|
|
||||||
let mut modified = false;
|
|
||||||
|
|
||||||
for facet_value in affected_facet_values {
|
|
||||||
let key =
|
|
||||||
FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() };
|
|
||||||
let mut old = self.database.get(wtxn, &key)?.unwrap();
|
|
||||||
let previous_len = old.bitmap.len();
|
|
||||||
old.bitmap -= self.docids_to_delete;
|
|
||||||
if old.bitmap.is_empty() {
|
|
||||||
modified = true;
|
|
||||||
self.database.delete(wtxn, &key)?;
|
|
||||||
} else if old.bitmap.len() != previous_len {
|
|
||||||
modified = true;
|
|
||||||
self.database.put(wtxn, &key, &old)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if modified {
|
|
||||||
let builder = FacetsUpdateBulk::new_not_updating_level_0(
|
|
||||||
self.index,
|
|
||||||
vec![field_id],
|
|
||||||
self.facet_type,
|
|
||||||
);
|
|
||||||
builder.execute(wtxn)?;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Incremental
|
|
||||||
let inc = FacetsUpdateIncrementalInner {
|
|
||||||
db: self.database,
|
|
||||||
group_size: self.group_size,
|
|
||||||
min_level_size: self.min_level_size,
|
|
||||||
max_group_size: self.max_group_size,
|
|
||||||
};
|
|
||||||
for facet_value in affected_facet_values {
|
|
||||||
inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use std::iter::FromIterator;
|
|
||||||
|
|
||||||
use big_s::S;
|
|
||||||
use maplit::hashset;
|
|
||||||
use rand::seq::SliceRandom;
|
|
||||||
use rand::SeedableRng;
|
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
|
|
||||||
use crate::db_snap;
|
|
||||||
use crate::documents::documents_batch_reader_from_objects;
|
|
||||||
use crate::index::tests::TempIndex;
|
|
||||||
use crate::update::facet::test_helpers::ordered_string;
|
|
||||||
use crate::update::{DeleteDocuments, DeletionStrategy};
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn delete_mixed_incremental_and_bulk() {
|
|
||||||
// The point of this test is to create an index populated with documents
|
|
||||||
// containing different filterable attributes. Then, we delete a bunch of documents
|
|
||||||
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
|
|
||||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
|
||||||
|
|
||||||
index
|
|
||||||
.update_settings(|settings| {
|
|
||||||
settings.set_filterable_fields(
|
|
||||||
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
|
|
||||||
);
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let mut documents = vec![];
|
|
||||||
for i in 0..1000 {
|
|
||||||
documents.push(
|
|
||||||
serde_json::json! {
|
|
||||||
{
|
|
||||||
"id": i,
|
|
||||||
"label": i / 10,
|
|
||||||
"colour": i / 100,
|
|
||||||
"timestamp": i / 2,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.as_object()
|
|
||||||
.unwrap()
|
|
||||||
.clone(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
|
||||||
index.add_documents(documents).unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
|
|
||||||
db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf");
|
|
||||||
|
|
||||||
let mut wtxn = index.env.write_txn().unwrap();
|
|
||||||
|
|
||||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
|
||||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
|
||||||
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
|
|
||||||
// by deleting the first 100 documents, we expect that:
|
|
||||||
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
|
|
||||||
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
|
|
||||||
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
|
|
||||||
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
|
|
||||||
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
|
|
||||||
builder.execute().unwrap();
|
|
||||||
wtxn.commit().unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
|
||||||
db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
|
|
||||||
db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Same test as above but working with string values for the facets
|
|
||||||
#[test]
|
|
||||||
fn delete_mixed_incremental_and_bulk_string() {
|
|
||||||
// The point of this test is to create an index populated with documents
|
|
||||||
// containing different filterable attributes. Then, we delete a bunch of documents
|
|
||||||
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
|
|
||||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
|
||||||
|
|
||||||
index
|
|
||||||
.update_settings(|settings| {
|
|
||||||
settings.set_filterable_fields(
|
|
||||||
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
|
|
||||||
);
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let mut documents = vec![];
|
|
||||||
for i in 0..1000 {
|
|
||||||
documents.push(
|
|
||||||
serde_json::json! {
|
|
||||||
{
|
|
||||||
"id": i,
|
|
||||||
"label": ordered_string(i / 10),
|
|
||||||
"colour": ordered_string(i / 100),
|
|
||||||
"timestamp": ordered_string(i / 2),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.as_object()
|
|
||||||
.unwrap()
|
|
||||||
.clone(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
|
||||||
index.add_documents(documents).unwrap();
|
|
||||||
|
|
||||||
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
|
|
||||||
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
|
|
||||||
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
|
|
||||||
|
|
||||||
let mut wtxn = index.env.write_txn().unwrap();
|
|
||||||
|
|
||||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
|
||||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
|
||||||
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
|
|
||||||
// by deleting the first 100 documents, we expect that:
|
|
||||||
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
|
|
||||||
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
|
|
||||||
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
|
|
||||||
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
|
|
||||||
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
|
|
||||||
builder.execute().unwrap();
|
|
||||||
wtxn.commit().unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
|
||||||
db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
|
|
||||||
db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn delete_almost_all_incrementally_string() {
|
|
||||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
|
||||||
|
|
||||||
index
|
|
||||||
.update_settings(|settings| {
|
|
||||||
settings.set_filterable_fields(
|
|
||||||
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
|
|
||||||
);
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let mut documents = vec![];
|
|
||||||
for i in 0..1000 {
|
|
||||||
documents.push(
|
|
||||||
serde_json::json! {
|
|
||||||
{
|
|
||||||
"id": i,
|
|
||||||
"label": ordered_string(i / 10),
|
|
||||||
"colour": ordered_string(i / 100),
|
|
||||||
"timestamp": ordered_string(i / 2),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.as_object()
|
|
||||||
.unwrap()
|
|
||||||
.clone(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
|
||||||
index.add_documents(documents).unwrap();
|
|
||||||
|
|
||||||
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
|
|
||||||
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
|
|
||||||
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
|
|
||||||
|
|
||||||
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
|
|
||||||
|
|
||||||
let mut docids_to_delete = (0..1000).collect::<Vec<_>>();
|
|
||||||
docids_to_delete.shuffle(&mut rng);
|
|
||||||
for docid in docids_to_delete.into_iter().take(990) {
|
|
||||||
let mut wtxn = index.env.write_txn().unwrap();
|
|
||||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
|
||||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
|
||||||
builder.delete_documents(&RoaringBitmap::from_iter([docid]));
|
|
||||||
builder.execute().unwrap();
|
|
||||||
wtxn.commit().unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
|
||||||
db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
|
|
||||||
db_snap!(index, string_faceted_documents_ids, 2, @r###"
|
|
||||||
0 []
|
|
||||||
1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
|
|
||||||
2 [292, 324, 358, 381, 493, 839, 852, ]
|
|
||||||
3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
|
|
||||||
"###);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(unused)]
|
|
||||||
#[cfg(test)]
|
|
||||||
mod comparison_bench {
|
|
||||||
use std::iter::once;
|
|
||||||
|
|
||||||
use rand::Rng;
|
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
|
|
||||||
use crate::heed_codec::facet::OrderedF64Codec;
|
|
||||||
use crate::update::facet::test_helpers::FacetIndex;
|
|
||||||
|
|
||||||
// This is a simple test to get an intuition on the relative speed
|
|
||||||
// of the incremental vs. bulk indexer.
|
|
||||||
//
|
|
||||||
// The benchmark shows the worst-case scenario for the incremental indexer, since
|
|
||||||
// each facet value contains only one document ID.
|
|
||||||
//
|
|
||||||
// In that scenario, it appears that the incremental indexer is about 70 times slower than the
|
|
||||||
// bulk indexer.
|
|
||||||
// #[test]
|
|
||||||
fn benchmark_facet_indexing_delete() {
|
|
||||||
let mut r = rand::thread_rng();
|
|
||||||
|
|
||||||
for i in 1..=20 {
|
|
||||||
let size = 50_000 * i;
|
|
||||||
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
|
|
||||||
|
|
||||||
let mut txn = index.env.write_txn().unwrap();
|
|
||||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
|
||||||
for i in 0..size {
|
|
||||||
// field id = 0, left_bound = i, docids = [i]
|
|
||||||
elements.push(((0, i as f64), once(i).collect()));
|
|
||||||
}
|
|
||||||
let timer = std::time::Instant::now();
|
|
||||||
index.bulk_insert(&mut txn, &[0], elements.iter());
|
|
||||||
let time_spent = timer.elapsed().as_millis();
|
|
||||||
println!("bulk {size} : {time_spent}ms");
|
|
||||||
|
|
||||||
txn.commit().unwrap();
|
|
||||||
|
|
||||||
for nbr_doc in [1, 100, 1000, 10_000] {
|
|
||||||
let mut txn = index.env.write_txn().unwrap();
|
|
||||||
let timer = std::time::Instant::now();
|
|
||||||
//
|
|
||||||
// delete one document
|
|
||||||
//
|
|
||||||
for _ in 0..nbr_doc {
|
|
||||||
let deleted_u32 = r.gen::<u32>() % size;
|
|
||||||
let deleted_f64 = deleted_u32 as f64;
|
|
||||||
index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32)
|
|
||||||
}
|
|
||||||
let time_spent = timer.elapsed().as_millis();
|
|
||||||
println!(" delete {nbr_doc} : {time_spent}ms");
|
|
||||||
txn.abort().unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,9 +1,9 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
|
|
||||||
use heed::types::{ByteSlice, DecodeIgnore};
|
use heed::types::{ByteSlice, DecodeIgnore};
|
||||||
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
||||||
|
use obkv::KvReader;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
@ -12,8 +12,9 @@ use crate::heed_codec::facet::{
|
|||||||
};
|
};
|
||||||
use crate::heed_codec::ByteSliceRefCodec;
|
use crate::heed_codec::ByteSliceRefCodec;
|
||||||
use crate::search::facet::get_highest_level;
|
use crate::search::facet::get_highest_level;
|
||||||
|
use crate::update::del_add::DelAdd;
|
||||||
use crate::update::index_documents::valid_lmdb_key;
|
use crate::update::index_documents::valid_lmdb_key;
|
||||||
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||||
|
|
||||||
enum InsertionResult {
|
enum InsertionResult {
|
||||||
InPlace,
|
InPlace,
|
||||||
@ -28,27 +29,21 @@ enum DeletionResult {
|
|||||||
|
|
||||||
/// Algorithm to incrementally insert and delete elememts into the
|
/// Algorithm to incrementally insert and delete elememts into the
|
||||||
/// `facet_id_(string/f64)_docids` databases.
|
/// `facet_id_(string/f64)_docids` databases.
|
||||||
///
|
pub struct FacetsUpdateIncremental {
|
||||||
/// Rhe `faceted_documents_ids` value in the main database of `Index`
|
|
||||||
/// is also updated to contain the new set of faceted documents.
|
|
||||||
pub struct FacetsUpdateIncremental<'i> {
|
|
||||||
index: &'i Index,
|
|
||||||
inner: FacetsUpdateIncrementalInner,
|
inner: FacetsUpdateIncrementalInner,
|
||||||
facet_type: FacetType,
|
delta_data: grenad::Reader<BufReader<File>>,
|
||||||
new_data: grenad::Reader<BufReader<File>>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'i> FacetsUpdateIncremental<'i> {
|
impl FacetsUpdateIncremental {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
index: &'i Index,
|
index: &Index,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
new_data: grenad::Reader<BufReader<File>>,
|
delta_data: grenad::Reader<BufReader<File>>,
|
||||||
group_size: u8,
|
group_size: u8,
|
||||||
min_level_size: u8,
|
min_level_size: u8,
|
||||||
max_group_size: u8,
|
max_group_size: u8,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
FacetsUpdateIncremental {
|
FacetsUpdateIncremental {
|
||||||
index,
|
|
||||||
inner: FacetsUpdateIncrementalInner {
|
inner: FacetsUpdateIncrementalInner {
|
||||||
db: match facet_type {
|
db: match facet_type {
|
||||||
FacetType::String => index
|
FacetType::String => index
|
||||||
@ -62,31 +57,41 @@ impl<'i> FacetsUpdateIncremental<'i> {
|
|||||||
max_group_size,
|
max_group_size,
|
||||||
min_level_size,
|
min_level_size,
|
||||||
},
|
},
|
||||||
facet_type,
|
delta_data,
|
||||||
new_data,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
|
pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
|
||||||
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
let mut cursor = self.delta_data.into_cursor()?;
|
||||||
|
|
||||||
let mut cursor = self.new_data.into_cursor()?;
|
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
if !valid_lmdb_key(key) {
|
if !valid_lmdb_key(key) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key)
|
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key)
|
||||||
.ok_or(heed::Error::Encoding)?;
|
.ok_or(heed::Error::Encoding)?;
|
||||||
let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
let value = KvReader::new(value);
|
||||||
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
|
||||||
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
let docids_to_delete = value
|
||||||
|
.get(DelAdd::Deletion)
|
||||||
|
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||||
|
.map(|o| o.ok_or(heed::Error::Encoding));
|
||||||
|
|
||||||
|
let docids_to_add = value
|
||||||
|
.get(DelAdd::Addition)
|
||||||
|
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||||
|
.map(|o| o.ok_or(heed::Error::Encoding));
|
||||||
|
|
||||||
|
if let Some(docids_to_delete) = docids_to_delete {
|
||||||
|
let docids_to_delete = docids_to_delete?;
|
||||||
|
self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (field_id, new_docids) in new_faceted_docids {
|
if let Some(docids_to_add) = docids_to_add {
|
||||||
let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
|
let docids_to_add = docids_to_add?;
|
||||||
docids |= new_docids;
|
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
|
||||||
self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?;
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -14,7 +14,7 @@ The databases must be able to return results for queries such as:
|
|||||||
The algorithms that implement these queries are found in the `src/search/facet` folder.
|
The algorithms that implement these queries are found in the `src/search/facet` folder.
|
||||||
|
|
||||||
To make these queries fast to compute, the database adopts a tree structure:
|
To make these queries fast to compute, the database adopts a tree structure:
|
||||||
```ignore
|
```text
|
||||||
┌───────────────────────────────┬───────────────────────────────┬───────────────┐
|
┌───────────────────────────────┬───────────────────────────────┬───────────────┐
|
||||||
┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │
|
┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │
|
||||||
│Level 2│ │ │ │ │
|
│Level 2│ │ │ │ │
|
||||||
@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`.
|
|||||||
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
|
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
|
||||||
[`FacetGroupValue`], which have the following format:
|
[`FacetGroupValue`], which have the following format:
|
||||||
|
|
||||||
```ignore
|
```text
|
||||||
FacetGroupKey:
|
FacetGroupKey:
|
||||||
- field id : u16
|
- field id : u16
|
||||||
- level : u8
|
- level : u8
|
||||||
@ -98,7 +98,6 @@ use crate::update::merge_btreeset_string;
|
|||||||
use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};
|
use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};
|
||||||
|
|
||||||
pub mod bulk;
|
pub mod bulk;
|
||||||
pub mod delete;
|
|
||||||
pub mod incremental;
|
pub mod incremental;
|
||||||
|
|
||||||
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
||||||
@ -109,7 +108,7 @@ pub struct FacetsUpdate<'i> {
|
|||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
new_data: grenad::Reader<BufReader<File>>,
|
delta_data: grenad::Reader<BufReader<File>>,
|
||||||
group_size: u8,
|
group_size: u8,
|
||||||
max_group_size: u8,
|
max_group_size: u8,
|
||||||
min_level_size: u8,
|
min_level_size: u8,
|
||||||
@ -118,7 +117,7 @@ impl<'i> FacetsUpdate<'i> {
|
|||||||
pub fn new(
|
pub fn new(
|
||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
new_data: grenad::Reader<BufReader<File>>,
|
delta_data: grenad::Reader<BufReader<File>>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let database = match facet_type {
|
let database = match facet_type {
|
||||||
FacetType::String => index
|
FacetType::String => index
|
||||||
@ -135,26 +134,26 @@ impl<'i> FacetsUpdate<'i> {
|
|||||||
max_group_size: FACET_MAX_GROUP_SIZE,
|
max_group_size: FACET_MAX_GROUP_SIZE,
|
||||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||||
facet_type,
|
facet_type,
|
||||||
new_data,
|
delta_data,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||||
if self.new_data.is_empty() {
|
if self.delta_data.is_empty() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||||
|
|
||||||
// See self::comparison_bench::benchmark_facet_indexing
|
// See self::comparison_bench::benchmark_facet_indexing
|
||||||
if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
|
if self.delta_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
|
||||||
let field_ids =
|
let field_ids =
|
||||||
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
|
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
|
||||||
let bulk_update = FacetsUpdateBulk::new(
|
let bulk_update = FacetsUpdateBulk::new(
|
||||||
self.index,
|
self.index,
|
||||||
field_ids,
|
field_ids,
|
||||||
self.facet_type,
|
self.facet_type,
|
||||||
self.new_data,
|
self.delta_data,
|
||||||
self.group_size,
|
self.group_size,
|
||||||
self.min_level_size,
|
self.min_level_size,
|
||||||
);
|
);
|
||||||
@ -163,7 +162,7 @@ impl<'i> FacetsUpdate<'i> {
|
|||||||
let incremental_update = FacetsUpdateIncremental::new(
|
let incremental_update = FacetsUpdateIncremental::new(
|
||||||
self.index,
|
self.index,
|
||||||
self.facet_type,
|
self.facet_type,
|
||||||
self.new_data,
|
self.delta_data,
|
||||||
self.group_size,
|
self.group_size,
|
||||||
self.min_level_size,
|
self.min_level_size,
|
||||||
self.max_group_size,
|
self.max_group_size,
|
||||||
@ -279,6 +278,7 @@ pub(crate) mod test_helpers {
|
|||||||
use crate::heed_codec::ByteSliceRefCodec;
|
use crate::heed_codec::ByteSliceRefCodec;
|
||||||
use crate::search::facet::get_highest_level;
|
use crate::search::facet::get_highest_level;
|
||||||
use crate::snapshot_tests::display_bitmap;
|
use crate::snapshot_tests::display_bitmap;
|
||||||
|
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||||
use crate::update::FacetsUpdateIncrementalInner;
|
use crate::update::FacetsUpdateIncrementalInner;
|
||||||
use crate::CboRoaringBitmapCodec;
|
use crate::CboRoaringBitmapCodec;
|
||||||
|
|
||||||
@ -455,20 +455,22 @@ pub(crate) mod test_helpers {
|
|||||||
let key: FacetGroupKey<&[u8]> =
|
let key: FacetGroupKey<&[u8]> =
|
||||||
FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
|
FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
|
||||||
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key).unwrap();
|
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key).unwrap();
|
||||||
|
let mut inner_writer = KvWriterDelAdd::memory();
|
||||||
let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
|
let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
|
||||||
writer.insert(&key, &value).unwrap();
|
inner_writer.insert(DelAdd::Addition, value).unwrap();
|
||||||
|
writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
|
||||||
}
|
}
|
||||||
writer.finish().unwrap();
|
writer.finish().unwrap();
|
||||||
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
|
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
|
||||||
|
|
||||||
let update = FacetsUpdateBulkInner {
|
let update = FacetsUpdateBulkInner {
|
||||||
db: self.content,
|
db: self.content,
|
||||||
new_data: Some(reader),
|
delta_data: Some(reader),
|
||||||
group_size: self.group_size.get(),
|
group_size: self.group_size.get(),
|
||||||
min_level_size: self.min_level_size.get(),
|
min_level_size: self.min_level_size.get(),
|
||||||
};
|
};
|
||||||
|
|
||||||
update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap();
|
update.update(wtxn, field_ids).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) {
|
pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) {
|
||||||
@ -556,101 +558,6 @@ pub(crate) mod test_helpers {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use big_s::S;
|
|
||||||
use maplit::hashset;
|
|
||||||
|
|
||||||
use crate::db_snap;
|
|
||||||
use crate::documents::documents_batch_reader_from_objects;
|
|
||||||
use crate::index::tests::TempIndex;
|
|
||||||
use crate::update::DeletionStrategy;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn replace_all_identical_soft_deletion_then_hard_deletion() {
|
|
||||||
let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
|
||||||
|
|
||||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
|
|
||||||
|
|
||||||
index
|
|
||||||
.update_settings(|settings| {
|
|
||||||
settings.set_primary_key("id".to_owned());
|
|
||||||
settings.set_filterable_fields(hashset! { S("size") });
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let mut documents = vec![];
|
|
||||||
for i in 0..1000 {
|
|
||||||
documents.push(
|
|
||||||
serde_json::json! {
|
|
||||||
{
|
|
||||||
"id": i,
|
|
||||||
"size": i % 250,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.as_object()
|
|
||||||
.unwrap()
|
|
||||||
.clone(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
|
||||||
index.add_documents(documents).unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
|
|
||||||
db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9");
|
|
||||||
db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");
|
|
||||||
|
|
||||||
let mut documents = vec![];
|
|
||||||
for i in 0..999 {
|
|
||||||
documents.push(
|
|
||||||
serde_json::json! {
|
|
||||||
{
|
|
||||||
"id": i,
|
|
||||||
"size": i % 250,
|
|
||||||
"other": 0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.as_object()
|
|
||||||
.unwrap()
|
|
||||||
.clone(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
|
||||||
index.add_documents(documents).unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
|
|
||||||
db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06");
|
|
||||||
db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");
|
|
||||||
|
|
||||||
// Then replace the last document while disabling soft_deletion
|
|
||||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
|
|
||||||
let mut documents = vec![];
|
|
||||||
for i in 999..1000 {
|
|
||||||
documents.push(
|
|
||||||
serde_json::json! {
|
|
||||||
{
|
|
||||||
"id": i,
|
|
||||||
"size": i % 250,
|
|
||||||
"other": 0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.as_object()
|
|
||||||
.unwrap()
|
|
||||||
.clone(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
|
||||||
index.add_documents(documents).unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
|
|
||||||
db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028");
|
|
||||||
db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(unused)]
|
#[allow(unused)]
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod comparison_bench {
|
mod comparison_bench {
|
||||||
|
@ -1,20 +1,17 @@
|
|||||||
|
use std::fmt;
|
||||||
use std::io::{BufWriter, Read, Seek};
|
use std::io::{BufWriter, Read, Seek};
|
||||||
use std::result::Result as StdResult;
|
use std::result::Result as StdResult;
|
||||||
use std::{fmt, iter};
|
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
|
use crate::documents::{
|
||||||
|
DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader,
|
||||||
|
EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY,
|
||||||
|
};
|
||||||
use crate::error::{GeoError, InternalError, UserError};
|
use crate::error::{GeoError, InternalError, UserError};
|
||||||
use crate::update::index_documents::{obkv_to_object, writer_into_reader};
|
use crate::update::index_documents::{obkv_to_object, writer_into_reader};
|
||||||
use crate::{FieldId, Index, Object, Result};
|
use crate::{FieldId, Index, Result};
|
||||||
|
|
||||||
/// The symbol used to define levels in a nested primary key.
|
|
||||||
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
|
|
||||||
|
|
||||||
/// The default primary that is used when not specified.
|
|
||||||
const DEFAULT_PRIMARY_KEY: &str = "id";
|
|
||||||
|
|
||||||
/// This function validates and enrich the documents by checking that:
|
/// This function validates and enrich the documents by checking that:
|
||||||
/// - we can infer a primary key,
|
/// - we can infer a primary key,
|
||||||
@ -41,14 +38,12 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
|||||||
// The primary key *field id* that has already been set for this index or the one
|
// The primary key *field id* that has already been set for this index or the one
|
||||||
// we will guess by searching for the first key that contains "id" as a substring.
|
// we will guess by searching for the first key that contains "id" as a substring.
|
||||||
let primary_key = match index.primary_key(rtxn)? {
|
let primary_key = match index.primary_key(rtxn)? {
|
||||||
Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => {
|
Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) {
|
||||||
PrimaryKey::nested(primary_key)
|
Some(primary_key) => primary_key,
|
||||||
}
|
None if autogenerate_docids => PrimaryKey::Flat {
|
||||||
Some(primary_key) => match documents_batch_index.id(primary_key) {
|
name: primary_key,
|
||||||
Some(id) => PrimaryKey::flat(primary_key, id),
|
field_id: documents_batch_index.insert(primary_key),
|
||||||
None if autogenerate_docids => {
|
},
|
||||||
PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key))
|
|
||||||
}
|
|
||||||
None => {
|
None => {
|
||||||
return match cursor.next_document()? {
|
return match cursor.next_document()? {
|
||||||
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
|
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
|
||||||
@ -76,14 +71,14 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
|||||||
});
|
});
|
||||||
|
|
||||||
match guesses.as_slice() {
|
match guesses.as_slice() {
|
||||||
[] if autogenerate_docids => PrimaryKey::flat(
|
[] if autogenerate_docids => PrimaryKey::Flat {
|
||||||
DEFAULT_PRIMARY_KEY,
|
name: DEFAULT_PRIMARY_KEY,
|
||||||
documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
|
field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
|
||||||
),
|
},
|
||||||
[] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
[] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
||||||
[(field_id, name)] => {
|
[(field_id, name)] => {
|
||||||
log::info!("Primary key was not specified in index. Inferred to '{name}'");
|
log::info!("Primary key was not specified in index. Inferred to '{name}'");
|
||||||
PrimaryKey::flat(name, *field_id)
|
PrimaryKey::Flat { name, field_id: *field_id }
|
||||||
}
|
}
|
||||||
multiple => {
|
multiple => {
|
||||||
return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
||||||
@ -156,92 +151,24 @@ fn fetch_or_generate_document_id(
|
|||||||
uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
|
uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
|
||||||
count: u32,
|
count: u32,
|
||||||
) -> Result<StdResult<DocumentId, UserError>> {
|
) -> Result<StdResult<DocumentId, UserError>> {
|
||||||
match primary_key {
|
Ok(match primary_key.document_id(document, documents_batch_index)? {
|
||||||
PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
|
Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }),
|
||||||
match document.get(primary_key_id) {
|
Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
|
||||||
Some(document_id_bytes) => {
|
Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => {
|
||||||
let document_id = serde_json::from_slice(document_id_bytes)
|
|
||||||
.map_err(InternalError::SerdeJson)?;
|
|
||||||
match validate_document_id_value(document_id)? {
|
|
||||||
Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
|
|
||||||
Err(user_error) => Ok(Err(user_error)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None if autogenerate_docids => {
|
|
||||||
let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
|
let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
|
||||||
Ok(Ok(DocumentId::generated(uuid.to_string(), count)))
|
Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count })
|
||||||
}
|
}
|
||||||
None => Ok(Err(UserError::MissingDocumentId {
|
Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId {
|
||||||
primary_key: primary_key.to_string(),
|
primary_key: primary_key.name().to_string(),
|
||||||
document: obkv_to_object(document, documents_batch_index)?,
|
document: obkv_to_object(document, documents_batch_index)?,
|
||||||
})),
|
}),
|
||||||
}
|
Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
|
||||||
}
|
Err(UserError::TooManyDocumentIds {
|
||||||
nested @ PrimaryKey::Nested { .. } => {
|
primary_key: primary_key.name().to_string(),
|
||||||
let mut matching_documents_ids = Vec::new();
|
|
||||||
for (first_level_name, right) in nested.possible_level_names() {
|
|
||||||
if let Some(field_id) = documents_batch_index.id(first_level_name) {
|
|
||||||
if let Some(value_bytes) = document.get(field_id) {
|
|
||||||
let object = serde_json::from_slice(value_bytes)
|
|
||||||
.map_err(InternalError::SerdeJson)?;
|
|
||||||
fetch_matching_values(object, right, &mut matching_documents_ids);
|
|
||||||
|
|
||||||
if matching_documents_ids.len() >= 2 {
|
|
||||||
return Ok(Err(UserError::TooManyDocumentIds {
|
|
||||||
primary_key: nested.name().to_string(),
|
|
||||||
document: obkv_to_object(document, documents_batch_index)?,
|
document: obkv_to_object(document, documents_batch_index)?,
|
||||||
}));
|
})
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
match matching_documents_ids.pop() {
|
|
||||||
Some(document_id) => match validate_document_id_value(document_id)? {
|
|
||||||
Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
|
|
||||||
Err(user_error) => Ok(Err(user_error)),
|
|
||||||
},
|
|
||||||
None => Ok(Err(UserError::MissingDocumentId {
|
|
||||||
primary_key: nested.name().to_string(),
|
|
||||||
document: obkv_to_object(document, documents_batch_index)?,
|
|
||||||
})),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A type that represent the type of primary key that has been set
|
|
||||||
/// for this index, a classic flat one or a nested one.
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
|
||||||
enum PrimaryKey<'a> {
|
|
||||||
Flat { name: &'a str, field_id: FieldId },
|
|
||||||
Nested { name: &'a str },
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PrimaryKey<'_> {
|
|
||||||
fn flat(name: &str, field_id: FieldId) -> PrimaryKey {
|
|
||||||
PrimaryKey::Flat { name, field_id }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn nested(name: &str) -> PrimaryKey {
|
|
||||||
PrimaryKey::Nested { name }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn name(&self) -> &str {
|
|
||||||
match self {
|
|
||||||
PrimaryKey::Flat { name, .. } => name,
|
|
||||||
PrimaryKey::Nested { name } => name,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns an `Iterator` that gives all the possible fields names the primary key
|
|
||||||
/// can have depending of the first level name and deepnes of the objects.
|
|
||||||
fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
|
|
||||||
let name = self.name();
|
|
||||||
name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
|
|
||||||
.map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
|
|
||||||
.chain(iter::once((name, "")))
|
|
||||||
}
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A type that represents a document id that has been retrieved from a document or auto-generated.
|
/// A type that represents a document id that has been retrieved from a document or auto-generated.
|
||||||
@ -255,14 +182,6 @@ pub enum DocumentId {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl DocumentId {
|
impl DocumentId {
|
||||||
fn retrieved(value: String) -> DocumentId {
|
|
||||||
DocumentId::Retrieved { value }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn generated(value: String, document_nth: u32) -> DocumentId {
|
|
||||||
DocumentId::Generated { value, document_nth }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn debug(&self) -> String {
|
fn debug(&self) -> String {
|
||||||
format!("{:?}", self)
|
format!("{:?}", self)
|
||||||
}
|
}
|
||||||
@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn starts_with(selector: &str, key: &str) -> bool {
|
|
||||||
selector.strip_prefix(key).map_or(false, |tail| {
|
|
||||||
tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
|
|
||||||
match value {
|
|
||||||
Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
|
|
||||||
otherwise => output.push(otherwise),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn fetch_matching_values_in_object(
|
|
||||||
object: Object,
|
|
||||||
selector: &str,
|
|
||||||
base_key: &str,
|
|
||||||
output: &mut Vec<Value>,
|
|
||||||
) {
|
|
||||||
for (key, value) in object {
|
|
||||||
let base_key = if base_key.is_empty() {
|
|
||||||
key.to_string()
|
|
||||||
} else {
|
|
||||||
format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
|
|
||||||
};
|
|
||||||
|
|
||||||
if starts_with(selector, &base_key) {
|
|
||||||
match value {
|
|
||||||
Value::Object(object) => {
|
|
||||||
fetch_matching_values_in_object(object, selector, &base_key, output)
|
|
||||||
}
|
|
||||||
value => output.push(value),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn validate_document_id(document_id: &str) -> Option<&str> {
|
|
||||||
if !document_id.is_empty()
|
|
||||||
&& document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
|
|
||||||
{
|
|
||||||
Some(document_id)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Parses a Json encoded document id and validate it, returning a user error when it is one.
|
|
||||||
pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
|
|
||||||
match document_id {
|
|
||||||
Value::String(string) => match validate_document_id(&string) {
|
|
||||||
Some(s) if s.len() == string.len() => Ok(Ok(string)),
|
|
||||||
Some(s) => Ok(Ok(s.to_string())),
|
|
||||||
None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
|
|
||||||
},
|
|
||||||
Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
|
|
||||||
content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Try to extract an `f64` from a JSON `Value` and return the `Value`
|
/// Try to extract an `f64` from a JSON `Value` and return the `Value`
|
||||||
/// in the `Err` variant if it failed.
|
/// in the `Err` variant if it failed.
|
||||||
pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
|
pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
|
||||||
|
@ -5,18 +5,16 @@ use std::io::BufReader;
|
|||||||
use std::{io, mem, str};
|
use std::{io, mem, str};
|
||||||
|
|
||||||
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||||
use obkv::KvReader;
|
use obkv::{KvReader, KvWriterU16};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
|
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
|
||||||
use crate::error::{InternalError, SerializationError};
|
use crate::error::{InternalError, SerializationError};
|
||||||
use crate::update::index_documents::MergeFn;
|
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||||
use crate::{
|
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||||
absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
|
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
|
||||||
|
|
||||||
/// Extracts the word and positions where this word appear and
|
/// Extracts the word and positions where this word appear and
|
||||||
/// prefixes it by the document id.
|
/// prefixes it by the document id.
|
||||||
@ -32,25 +30,162 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
allowed_separators: Option<&[&str]>,
|
allowed_separators: Option<&[&str]>,
|
||||||
dictionary: Option<&[&str]>,
|
dictionary: Option<&[&str]>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_positions_per_attributes = max_positions_per_attributes
|
let max_positions_per_attributes = max_positions_per_attributes
|
||||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
|
// initialize destination values.
|
||||||
let mut documents_ids = RoaringBitmap::new();
|
let mut documents_ids = RoaringBitmap::new();
|
||||||
let mut script_language_docids = HashMap::new();
|
let mut script_language_docids = HashMap::new();
|
||||||
let mut docid_word_positions_sorter = create_sorter(
|
let mut docid_word_positions_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Stable,
|
grenad::SortAlgorithm::Stable,
|
||||||
concat_u32s_array,
|
keep_latest_obkv,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut buffers = Buffers::default();
|
// initialize buffers.
|
||||||
|
let mut del_buffers = Buffers::default();
|
||||||
|
let mut add_buffers = Buffers::default();
|
||||||
|
let mut key_buffer = Vec::new();
|
||||||
|
let mut value_buffer = Vec::new();
|
||||||
|
|
||||||
|
// initialize tokenizer.
|
||||||
|
let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None);
|
||||||
|
let tokenizer = builder.build();
|
||||||
|
|
||||||
|
// iterate over documents.
|
||||||
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
let document_id = key
|
||||||
|
.try_into()
|
||||||
|
.map(u32::from_be_bytes)
|
||||||
|
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||||
|
let obkv = KvReader::<FieldId>::new(value);
|
||||||
|
|
||||||
|
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||||
|
if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
documents_ids.push(document_id);
|
||||||
|
|
||||||
|
// Update key buffer prefix.
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||||
|
|
||||||
|
// Tokenize deletions and additions in 2 diffferent threads.
|
||||||
|
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||||
|
|| {
|
||||||
|
// deletions
|
||||||
|
lang_safe_tokens_from_document(
|
||||||
|
&obkv,
|
||||||
|
searchable_fields,
|
||||||
|
&tokenizer,
|
||||||
|
stop_words,
|
||||||
|
allowed_separators,
|
||||||
|
dictionary,
|
||||||
|
max_positions_per_attributes,
|
||||||
|
DelAdd::Deletion,
|
||||||
|
&mut del_buffers,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
|| {
|
||||||
|
// additions
|
||||||
|
lang_safe_tokens_from_document(
|
||||||
|
&obkv,
|
||||||
|
searchable_fields,
|
||||||
|
&tokenizer,
|
||||||
|
stop_words,
|
||||||
|
allowed_separators,
|
||||||
|
dictionary,
|
||||||
|
max_positions_per_attributes,
|
||||||
|
DelAdd::Addition,
|
||||||
|
&mut add_buffers,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
let (del_obkv, del_script_language_word_count) = del?;
|
||||||
|
let (add_obkv, add_script_language_word_count) = add?;
|
||||||
|
|
||||||
|
// merge deletions and additions.
|
||||||
|
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||||
|
value_buffer.clear();
|
||||||
|
del_add_from_two_obkvs(
|
||||||
|
KvReader::<FieldId>::new(del_obkv),
|
||||||
|
KvReader::<FieldId>::new(add_obkv),
|
||||||
|
&mut value_buffer,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
// write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
|
||||||
|
let obkv = KvReader::<FieldId>::new(&value_buffer);
|
||||||
|
for (field_id, value) in obkv.iter() {
|
||||||
|
key_buffer.truncate(mem::size_of::<u32>());
|
||||||
|
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// update script_language_docids deletions.
|
||||||
|
for (script, languages_frequency) in del_script_language_word_count {
|
||||||
|
for (language, _) in languages_frequency {
|
||||||
|
let entry = script_language_docids
|
||||||
|
.entry((script, language))
|
||||||
|
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
||||||
|
entry.0.push(document_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// update script_language_docids additions.
|
||||||
|
for (script, languages_frequency) in add_script_language_word_count {
|
||||||
|
for (language, _) in languages_frequency {
|
||||||
|
let entry = script_language_docids
|
||||||
|
.entry((script, language))
|
||||||
|
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
||||||
|
entry.1.push(document_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
|
||||||
|
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||||
|
.map(|reader| (reader, script_language_docids))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if any searchable fields of a document changed.
|
||||||
|
fn searchable_fields_changed(
|
||||||
|
obkv: &KvReader<FieldId>,
|
||||||
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
|
) -> bool {
|
||||||
|
for (field_id, field_bytes) in obkv.iter() {
|
||||||
|
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||||
|
let del_add = KvReaderDelAdd::new(field_bytes);
|
||||||
|
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||||
|
// if both fields are None, check the next field.
|
||||||
|
(None, None) => (),
|
||||||
|
// if both contains a value and values are the same, check the next field.
|
||||||
|
(Some(del), Some(add)) if del == add => (),
|
||||||
|
// otherwise the fields are different, return true.
|
||||||
|
_otherwise => return true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Factorize tokenizer building.
|
||||||
|
fn tokenizer_builder<'a>(
|
||||||
|
stop_words: Option<&'a fst::Set<&[u8]>>,
|
||||||
|
allowed_separators: Option<&'a [&str]>,
|
||||||
|
dictionary: Option<&'a [&str]>,
|
||||||
|
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
|
||||||
|
) -> TokenizerBuilder<'a, &'a [u8]> {
|
||||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||||
if let Some(stop_words) = stop_words {
|
if let Some(stop_words) = stop_words {
|
||||||
tokenizer_builder.stop_words(stop_words);
|
tokenizer_builder.stop_words(stop_words);
|
||||||
@ -61,30 +196,38 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
if let Some(separators) = allowed_separators {
|
if let Some(separators) = allowed_separators {
|
||||||
tokenizer_builder.separators(separators);
|
tokenizer_builder.separators(separators);
|
||||||
}
|
}
|
||||||
let tokenizer = tokenizer_builder.build();
|
|
||||||
|
|
||||||
let mut cursor = obkv_documents.into_cursor()?;
|
if let Some(script_language) = script_language {
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
tokenizer_builder.allow_list(script_language);
|
||||||
let document_id = key
|
}
|
||||||
.try_into()
|
|
||||||
.map(u32::from_be_bytes)
|
|
||||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
|
||||||
let obkv = KvReader::<FieldId>::new(value);
|
|
||||||
|
|
||||||
documents_ids.push(document_id);
|
tokenizer_builder
|
||||||
buffers.key_buffer.clear();
|
}
|
||||||
buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
|
||||||
|
|
||||||
|
/// Extract words mapped with their positions of a document,
|
||||||
|
/// ensuring no Language detection mistakes was made.
|
||||||
|
#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
|
||||||
|
fn lang_safe_tokens_from_document<'a>(
|
||||||
|
obkv: &KvReader<FieldId>,
|
||||||
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
|
tokenizer: &Tokenizer,
|
||||||
|
stop_words: Option<&fst::Set<&[u8]>>,
|
||||||
|
allowed_separators: Option<&[&str]>,
|
||||||
|
dictionary: Option<&[&str]>,
|
||||||
|
max_positions_per_attributes: u32,
|
||||||
|
del_add: DelAdd,
|
||||||
|
buffers: &'a mut Buffers,
|
||||||
|
) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
|
||||||
let mut script_language_word_count = HashMap::new();
|
let mut script_language_word_count = HashMap::new();
|
||||||
|
|
||||||
extract_tokens_from_document(
|
tokens_from_document(
|
||||||
&obkv,
|
obkv,
|
||||||
searchable_fields,
|
searchable_fields,
|
||||||
&tokenizer,
|
tokenizer,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
&mut buffers,
|
del_add,
|
||||||
|
buffers,
|
||||||
&mut script_language_word_count,
|
&mut script_language_word_count,
|
||||||
&mut docid_word_positions_sorter,
|
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// if we detect a potetial mistake in the language detection,
|
// if we detect a potetial mistake in the language detection,
|
||||||
@ -103,56 +246,62 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
// then we don't rerun the extraction.
|
// then we don't rerun the extraction.
|
||||||
if !script_language.is_empty() {
|
if !script_language.is_empty() {
|
||||||
// build a new temporary tokenizer including the allow list.
|
// build a new temporary tokenizer including the allow list.
|
||||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
let mut builder = tokenizer_builder(
|
||||||
if let Some(stop_words) = stop_words {
|
stop_words,
|
||||||
tokenizer_builder.stop_words(stop_words);
|
allowed_separators,
|
||||||
}
|
dictionary,
|
||||||
tokenizer_builder.allow_list(&script_language);
|
Some(&script_language),
|
||||||
let tokenizer = tokenizer_builder.build();
|
);
|
||||||
|
let tokenizer = builder.build();
|
||||||
|
|
||||||
script_language_word_count.clear();
|
script_language_word_count.clear();
|
||||||
|
|
||||||
// rerun the extraction.
|
// rerun the extraction.
|
||||||
extract_tokens_from_document(
|
tokens_from_document(
|
||||||
&obkv,
|
obkv,
|
||||||
searchable_fields,
|
searchable_fields,
|
||||||
&tokenizer,
|
&tokenizer,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
&mut buffers,
|
del_add,
|
||||||
|
buffers,
|
||||||
&mut script_language_word_count,
|
&mut script_language_word_count,
|
||||||
&mut docid_word_positions_sorter,
|
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (script, languages_frequency) in script_language_word_count {
|
// returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
|
||||||
for (language, _) in languages_frequency {
|
Ok((&buffers.obkv_buffer, script_language_word_count))
|
||||||
let entry = script_language_docids
|
|
||||||
.entry((script, language))
|
|
||||||
.or_insert_with(RoaringBitmap::new);
|
|
||||||
entry.push(document_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
/// Extract words mapped with their positions of a document.
|
||||||
.map(|reader| (documents_ids, reader, script_language_docids))
|
fn tokens_from_document<'a>(
|
||||||
}
|
|
||||||
|
|
||||||
fn extract_tokens_from_document(
|
|
||||||
obkv: &KvReader<FieldId>,
|
obkv: &KvReader<FieldId>,
|
||||||
searchable_fields: &Option<HashSet<FieldId>>,
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
tokenizer: &Tokenizer,
|
tokenizer: &Tokenizer,
|
||||||
max_positions_per_attributes: u32,
|
max_positions_per_attributes: u32,
|
||||||
buffers: &mut Buffers,
|
del_add: DelAdd,
|
||||||
|
buffers: &'a mut Buffers,
|
||||||
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
|
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
|
||||||
docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
|
) -> Result<&'a [u8]> {
|
||||||
) -> Result<()> {
|
buffers.obkv_buffer.clear();
|
||||||
|
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||||
for (field_id, field_bytes) in obkv.iter() {
|
for (field_id, field_bytes) in obkv.iter() {
|
||||||
|
// if field is searchable.
|
||||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||||
let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
// extract deletion or addition only.
|
||||||
|
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
|
||||||
|
// parse json.
|
||||||
|
let value =
|
||||||
|
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||||
|
|
||||||
|
// prepare writing destination.
|
||||||
|
buffers.obkv_positions_buffer.clear();
|
||||||
|
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
|
||||||
|
|
||||||
|
// convert json into a unique string.
|
||||||
buffers.field_buffer.clear();
|
buffers.field_buffer.clear();
|
||||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||||
|
// create an iterator of token with their positions.
|
||||||
let tokens = process_tokens(tokenizer.tokenize(field))
|
let tokens = process_tokens(tokenizer.tokenize(field))
|
||||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||||
|
|
||||||
@ -167,24 +316,27 @@ fn extract_tokens_from_document(
|
|||||||
None => entry.push((language, 1)),
|
None => entry.push((language, 1)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// keep a word only if it is not empty and fits in an LMDB key.
|
||||||
let token = token.lemma().trim();
|
let token = token.lemma().trim();
|
||||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||||
buffers.key_buffer.truncate(mem::size_of::<u32>());
|
|
||||||
buffers.key_buffer.extend_from_slice(token.as_bytes());
|
|
||||||
|
|
||||||
let position: u16 = index
|
let position: u16 = index
|
||||||
.try_into()
|
.try_into()
|
||||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||||
let position = absolute_from_relative_position(field_id, position);
|
writer.insert(position, token.as_bytes())?;
|
||||||
docid_word_positions_sorter
|
|
||||||
.insert(&buffers.key_buffer, position.to_ne_bytes())?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// write positions into document.
|
||||||
|
let positions = writer.into_inner()?;
|
||||||
|
document_writer.insert(field_id, positions)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
// returns a KV<FieldId, KV<u16, String>>
|
||||||
|
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Transform a JSON value into a string that can be indexed.
|
/// Transform a JSON value into a string that can be indexed.
|
||||||
@ -287,10 +439,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)
|
|||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct Buffers {
|
struct Buffers {
|
||||||
// the key buffer is the concatenation of the internal document id with the field id.
|
|
||||||
// The buffer has to be completely cleared between documents,
|
|
||||||
// and the field id part must be cleared between each field.
|
|
||||||
key_buffer: Vec<u8>,
|
|
||||||
// the field buffer for each field's deserialization, and must be cleared between each field.
|
// the field buffer for each field's deserialization, and must be cleared between each field.
|
||||||
field_buffer: String,
|
field_buffer: String,
|
||||||
|
// buffer used to store the value data containing an obkv.
|
||||||
|
obkv_buffer: Vec<u8>,
|
||||||
|
// buffer used to store the value data containing an obkv of tokens with their positions.
|
||||||
|
obkv_positions_buffer: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
@ -4,11 +4,12 @@ use std::io::{self, BufReader};
|
|||||||
use heed::{BytesDecode, BytesEncode};
|
use heed::{BytesDecode, BytesEncode};
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
|
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
|
||||||
};
|
};
|
||||||
use crate::heed_codec::facet::{
|
use crate::heed_codec::facet::{
|
||||||
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
||||||
};
|
};
|
||||||
|
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
/// Extracts the facet number and the documents ids where this facet number appears.
|
/// Extracts the facet number and the documents ids where this facet number appears.
|
||||||
@ -17,7 +18,7 @@ use crate::Result;
|
|||||||
/// documents ids from the given chunk of docid facet number positions.
|
/// documents ids from the given chunk of docid facet number positions.
|
||||||
#[logging_timer::time]
|
#[logging_timer::time]
|
||||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||||
docid_fid_facet_number: grenad::Reader<R>,
|
fid_docid_facet_number: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
@ -26,21 +27,30 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let mut facet_number_docids_sorter = create_sorter(
|
let mut facet_number_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut cursor = docid_fid_facet_number.into_cursor()?;
|
let mut buffer = Vec::new();
|
||||||
while let Some((key_bytes, _)) = cursor.move_on_next()? {
|
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
||||||
|
while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
|
||||||
let (field_id, document_id, number) =
|
let (field_id, document_id, number) =
|
||||||
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
|
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
|
||||||
|
|
||||||
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
|
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
|
||||||
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
|
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
|
||||||
facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
|
|
||||||
|
buffer.clear();
|
||||||
|
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||||
|
for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
|
||||||
|
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||||
|
}
|
||||||
|
obkv.finish()?;
|
||||||
|
|
||||||
|
facet_number_docids_sorter.insert(key_bytes, &buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(facet_number_docids_sorter, indexer)
|
sorter_into_reader(facet_number_docids_sorter, indexer)
|
||||||
|
@ -1,13 +1,15 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufReader};
|
use std::io::BufReader;
|
||||||
|
use std::{io, str};
|
||||||
|
|
||||||
use heed::BytesEncode;
|
use heed::BytesEncode;
|
||||||
|
|
||||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||||
use crate::heed_codec::StrRefCodec;
|
use crate::heed_codec::StrRefCodec;
|
||||||
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
|
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
|
||||||
|
use crate::{FieldId, Result};
|
||||||
|
|
||||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||||
///
|
///
|
||||||
@ -24,15 +26,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let mut facet_string_docids_sorter = create_sorter(
|
let mut facet_string_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Stable,
|
grenad::SortAlgorithm::Stable,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||||
while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
|
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||||
|
|
||||||
@ -40,21 +43,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
|||||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?;
|
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||||
|
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||||
let normalised_truncated_value: String;
|
|
||||||
if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
|
|
||||||
normalised_truncated_value = normalised_value
|
|
||||||
.char_indices()
|
|
||||||
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
|
||||||
.map(|(_, c)| c)
|
|
||||||
.collect();
|
|
||||||
normalised_value = normalised_truncated_value.as_str();
|
|
||||||
}
|
|
||||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
|
|
||||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||||
// document id is encoded in native-endian because of the CBO roaring bitmap codec
|
|
||||||
facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
|
buffer.clear();
|
||||||
|
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||||
|
for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
|
||||||
|
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||||
|
}
|
||||||
|
obkv.finish()?;
|
||||||
|
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(facet_string_docids_sorter, indexer)
|
sorter_into_reader(facet_string_docids_sorter, indexer)
|
||||||
|
@ -1,24 +1,36 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
use std::collections::{BTreeMap, HashSet};
|
use std::collections::{BTreeMap, HashSet};
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufReader};
|
use std::io::{self, BufReader};
|
||||||
use std::mem::size_of;
|
use std::mem::size_of;
|
||||||
|
use std::result::Result as StdResult;
|
||||||
|
|
||||||
|
use grenad::Sorter;
|
||||||
use heed::zerocopy::AsBytes;
|
use heed::zerocopy::AsBytes;
|
||||||
use heed::BytesEncode;
|
use heed::BytesEncode;
|
||||||
|
use itertools::EitherOrBoth;
|
||||||
|
use ordered_float::OrderedFloat;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::{from_slice, Value};
|
use serde_json::{from_slice, Value};
|
||||||
|
use FilterableValues::{Empty, Null, Values};
|
||||||
|
|
||||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||||
use crate::error::InternalError;
|
use crate::error::InternalError;
|
||||||
use crate::facet::value_encoding::f64_into_bytes;
|
use crate::facet::value_encoding::f64_into_bytes;
|
||||||
|
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
|
use crate::{
|
||||||
|
CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// The length of the elements that are always in the buffer when inserting new values.
|
||||||
|
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
|
||||||
|
|
||||||
/// The extracted facet values stored in grenad files by type.
|
/// The extracted facet values stored in grenad files by type.
|
||||||
pub struct ExtractedFacetValues {
|
pub struct ExtractedFacetValues {
|
||||||
pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
|
pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
|
||||||
pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
|
pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
|
||||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
|
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
|
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||||
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
|
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||||
@ -58,71 +70,150 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
max_memory.map(|m| m / 2),
|
max_memory.map(|m| m / 2),
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
// The tuples represent the Del and Add sides of a bitmap
|
||||||
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||||
|
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||||
|
|
||||||
|
// We create two buffers for mutable ref issues with closures.
|
||||||
|
let mut numbers_key_buffer = Vec::new();
|
||||||
|
let mut strings_key_buffer = Vec::new();
|
||||||
|
|
||||||
let mut key_buffer = Vec::new();
|
|
||||||
let mut cursor = obkv_documents.into_cursor()?;
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||||
let obkv = obkv::KvReader::new(value);
|
let obkv = obkv::KvReader::new(value);
|
||||||
|
|
||||||
for (field_id, field_bytes) in obkv.iter() {
|
for (field_id, field_bytes) in obkv.iter() {
|
||||||
if faceted_fields.contains(&field_id) {
|
if faceted_fields.contains(&field_id) {
|
||||||
key_buffer.clear();
|
numbers_key_buffer.clear();
|
||||||
|
strings_key_buffer.clear();
|
||||||
|
|
||||||
// Set key to the field_id
|
// Set key to the field_id
|
||||||
// Note: this encoding is consistent with FieldIdCodec
|
// Note: this encoding is consistent with FieldIdCodec
|
||||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
|
||||||
// Here, we know already that the document must be added to the “field id exists” database
|
|
||||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||||
let document = BEU32::from(document).get();
|
let document = BEU32::from(document).get();
|
||||||
|
|
||||||
facet_exists_docids.entry(field_id).or_default().insert(document);
|
|
||||||
|
|
||||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||||
key_buffer.extend_from_slice(docid_bytes);
|
numbers_key_buffer.extend_from_slice(docid_bytes);
|
||||||
|
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||||
|
|
||||||
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
let del_add_obkv = obkv::KvReader::new(field_bytes);
|
||||||
|
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
|
||||||
|
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
let add_value = match del_add_obkv.get(DelAdd::Addition) {
|
||||||
|
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
match extract_facet_values(
|
// We insert the document id on the Del and the Add side if the field exists.
|
||||||
&value,
|
let (ref mut del_exists, ref mut add_exists) =
|
||||||
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
|
facet_exists_docids.entry(field_id).or_default();
|
||||||
) {
|
let (ref mut del_is_null, ref mut add_is_null) =
|
||||||
FilterableValues::Null => {
|
facet_is_null_docids.entry(field_id).or_default();
|
||||||
facet_is_null_docids.entry(field_id).or_default().insert(document);
|
let (ref mut del_is_empty, ref mut add_is_empty) =
|
||||||
}
|
facet_is_empty_docids.entry(field_id).or_default();
|
||||||
FilterableValues::Empty => {
|
|
||||||
facet_is_empty_docids.entry(field_id).or_default().insert(document);
|
|
||||||
}
|
|
||||||
FilterableValues::Values { numbers, strings } => {
|
|
||||||
// insert facet numbers in sorter
|
|
||||||
for number in numbers {
|
|
||||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
|
||||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
|
||||||
key_buffer.extend_from_slice(&value_bytes);
|
|
||||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
|
||||||
|
|
||||||
fid_docid_facet_numbers_sorter
|
if del_value.is_some() {
|
||||||
.insert(&key_buffer, ().as_bytes())?;
|
del_exists.insert(document);
|
||||||
}
|
}
|
||||||
|
if add_value.is_some() {
|
||||||
|
add_exists.insert(document);
|
||||||
}
|
}
|
||||||
|
|
||||||
// insert normalized and original facet string in sorter
|
let geo_support =
|
||||||
for (normalized, original) in
|
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||||
strings.into_iter().filter(|(n, _)| !n.is_empty())
|
let del_filterable_values =
|
||||||
{
|
del_value.map(|value| extract_facet_values(&value, geo_support));
|
||||||
let normalized_truncated_value: String = normalized
|
let add_filterable_values =
|
||||||
.char_indices()
|
add_value.map(|value| extract_facet_values(&value, geo_support));
|
||||||
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
|
||||||
.map(|(_, c)| c)
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
// Those closures are just here to simplify things a bit.
|
||||||
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
|
let mut insert_numbers_diff = |del_numbers, add_numbers| {
|
||||||
fid_docid_facet_strings_sorter
|
insert_numbers_diff(
|
||||||
.insert(&key_buffer, original.as_bytes())?;
|
&mut fid_docid_facet_numbers_sorter,
|
||||||
|
&mut numbers_key_buffer,
|
||||||
|
del_numbers,
|
||||||
|
add_numbers,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
let mut insert_strings_diff = |del_strings, add_strings| {
|
||||||
|
insert_strings_diff(
|
||||||
|
&mut fid_docid_facet_strings_sorter,
|
||||||
|
&mut strings_key_buffer,
|
||||||
|
del_strings,
|
||||||
|
add_strings,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
match (del_filterable_values, add_filterable_values) {
|
||||||
|
(None, None) => (),
|
||||||
|
(Some(del_filterable_values), None) => match del_filterable_values {
|
||||||
|
Null => {
|
||||||
|
del_is_null.insert(document);
|
||||||
|
}
|
||||||
|
Empty => {
|
||||||
|
del_is_empty.insert(document);
|
||||||
|
}
|
||||||
|
Values { numbers, strings } => {
|
||||||
|
insert_numbers_diff(numbers, vec![])?;
|
||||||
|
insert_strings_diff(strings, vec![])?;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
(None, Some(add_filterable_values)) => match add_filterable_values {
|
||||||
|
Null => {
|
||||||
|
add_is_null.insert(document);
|
||||||
|
}
|
||||||
|
Empty => {
|
||||||
|
add_is_empty.insert(document);
|
||||||
|
}
|
||||||
|
Values { numbers, strings } => {
|
||||||
|
insert_numbers_diff(vec![], numbers)?;
|
||||||
|
insert_strings_diff(vec![], strings)?;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
(Some(del_filterable_values), Some(add_filterable_values)) => {
|
||||||
|
match (del_filterable_values, add_filterable_values) {
|
||||||
|
(Null, Null) | (Empty, Empty) => (),
|
||||||
|
(Null, Empty) => {
|
||||||
|
del_is_null.insert(document);
|
||||||
|
add_is_empty.insert(document);
|
||||||
|
}
|
||||||
|
(Empty, Null) => {
|
||||||
|
del_is_empty.insert(document);
|
||||||
|
add_is_null.insert(document);
|
||||||
|
}
|
||||||
|
(Null, Values { numbers, strings }) => {
|
||||||
|
insert_numbers_diff(vec![], numbers)?;
|
||||||
|
insert_strings_diff(vec![], strings)?;
|
||||||
|
del_is_null.insert(document);
|
||||||
|
}
|
||||||
|
(Empty, Values { numbers, strings }) => {
|
||||||
|
insert_numbers_diff(vec![], numbers)?;
|
||||||
|
insert_strings_diff(vec![], strings)?;
|
||||||
|
del_is_empty.insert(document);
|
||||||
|
}
|
||||||
|
(Values { numbers, strings }, Null) => {
|
||||||
|
add_is_null.insert(document);
|
||||||
|
insert_numbers_diff(numbers, vec![])?;
|
||||||
|
insert_strings_diff(strings, vec![])?;
|
||||||
|
}
|
||||||
|
(Values { numbers, strings }, Empty) => {
|
||||||
|
add_is_empty.insert(document);
|
||||||
|
insert_numbers_diff(numbers, vec![])?;
|
||||||
|
insert_strings_diff(strings, vec![])?;
|
||||||
|
}
|
||||||
|
(
|
||||||
|
Values { numbers: del_numbers, strings: del_strings },
|
||||||
|
Values { numbers: add_numbers, strings: add_strings },
|
||||||
|
) => {
|
||||||
|
insert_numbers_diff(del_numbers, add_numbers)?;
|
||||||
|
insert_strings_diff(del_strings, add_strings)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -130,14 +221,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
let mut facet_exists_docids_writer = create_writer(
|
let mut facet_exists_docids_writer = create_writer(
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
for (fid, bitmap) in facet_exists_docids.into_iter() {
|
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
|
||||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||||
facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||||
}
|
}
|
||||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||||
|
|
||||||
@ -146,9 +238,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
for (fid, bitmap) in facet_is_null_docids.into_iter() {
|
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
|
||||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||||
}
|
}
|
||||||
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
||||||
|
|
||||||
@ -157,21 +249,156 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
|
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
|
||||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||||
}
|
}
|
||||||
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
||||||
|
|
||||||
Ok(ExtractedFacetValues {
|
Ok(ExtractedFacetValues {
|
||||||
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||||
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||||
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
||||||
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
||||||
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
|
||||||
|
fn deladd_obkv_cbo_roaring_bitmaps(
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
del_bitmap: &RoaringBitmap,
|
||||||
|
add_bitmap: &RoaringBitmap,
|
||||||
|
) -> io::Result<()> {
|
||||||
|
buffer.clear();
|
||||||
|
let mut obkv = KvWriterDelAdd::new(buffer);
|
||||||
|
let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
|
||||||
|
let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
|
||||||
|
obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
|
||||||
|
obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
|
||||||
|
obkv.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Truncates a string to the biggest valid LMDB key size.
|
||||||
|
fn truncate_string(s: String) -> String {
|
||||||
|
s.char_indices()
|
||||||
|
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
||||||
|
.map(|(_, c)| c)
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Computes the diff between both Del and Add numbers and
|
||||||
|
/// only inserts the parts that differ in the sorter.
|
||||||
|
fn insert_numbers_diff<MF>(
|
||||||
|
fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
|
||||||
|
key_buffer: &mut Vec<u8>,
|
||||||
|
mut del_numbers: Vec<f64>,
|
||||||
|
mut add_numbers: Vec<f64>,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
|
||||||
|
{
|
||||||
|
// We sort and dedup the float numbers
|
||||||
|
del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||||
|
add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||||
|
del_numbers.dedup_by_key(|f| OrderedFloat(*f));
|
||||||
|
add_numbers.dedup_by_key(|f| OrderedFloat(*f));
|
||||||
|
|
||||||
|
let merged_numbers_iter = itertools::merge_join_by(
|
||||||
|
del_numbers.into_iter().map(OrderedFloat),
|
||||||
|
add_numbers.into_iter().map(OrderedFloat),
|
||||||
|
|del, add| del.cmp(add),
|
||||||
|
);
|
||||||
|
|
||||||
|
// insert facet numbers in sorter
|
||||||
|
for eob in merged_numbers_iter {
|
||||||
|
key_buffer.truncate(TRUNCATE_SIZE);
|
||||||
|
match eob {
|
||||||
|
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||||
|
EitherOrBoth::Left(OrderedFloat(number)) => {
|
||||||
|
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||||
|
key_buffer.extend_from_slice(&value_bytes);
|
||||||
|
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||||
|
|
||||||
|
// We insert only the Del part of the Obkv to inform
|
||||||
|
// that we only want to remove all those numbers.
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Deletion, ().as_bytes())?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right(OrderedFloat(number)) => {
|
||||||
|
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||||
|
key_buffer.extend_from_slice(&value_bytes);
|
||||||
|
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||||
|
|
||||||
|
// We insert only the Add part of the Obkv to inform
|
||||||
|
// that we only want to add all those numbers.
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Addition, ().as_bytes())?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Computes the diff between both Del and Add strings and
|
||||||
|
/// only inserts the parts that differ in the sorter.
|
||||||
|
fn insert_strings_diff<MF>(
|
||||||
|
fid_docid_facet_strings_sorter: &mut Sorter<MF>,
|
||||||
|
key_buffer: &mut Vec<u8>,
|
||||||
|
mut del_strings: Vec<(String, String)>,
|
||||||
|
mut add_strings: Vec<(String, String)>,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
|
||||||
|
{
|
||||||
|
// We sort and dedup the normalized and original strings
|
||||||
|
del_strings.sort_unstable();
|
||||||
|
add_strings.sort_unstable();
|
||||||
|
del_strings.dedup();
|
||||||
|
add_strings.dedup();
|
||||||
|
|
||||||
|
let merged_strings_iter = itertools::merge_join_by(
|
||||||
|
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||||
|
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||||
|
|del, add| del.cmp(add),
|
||||||
|
);
|
||||||
|
|
||||||
|
// insert normalized and original facet string in sorter
|
||||||
|
for eob in merged_strings_iter {
|
||||||
|
key_buffer.truncate(TRUNCATE_SIZE);
|
||||||
|
match eob {
|
||||||
|
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||||
|
EitherOrBoth::Left((normalized, original)) => {
|
||||||
|
let truncated = truncate_string(normalized);
|
||||||
|
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||||
|
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Deletion, original)?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right((normalized, original)) => {
|
||||||
|
let truncated = truncate_string(normalized);
|
||||||
|
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||||
|
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Addition, original)?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Represent what a document field contains.
|
/// Represent what a document field contains.
|
||||||
enum FilterableValues {
|
enum FilterableValues {
|
||||||
/// Corresponds to the JSON `null` value.
|
/// Corresponds to the JSON `null` value.
|
||||||
@ -182,6 +409,7 @@ enum FilterableValues {
|
|||||||
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
|
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extracts the facet values of a JSON field.
|
||||||
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
|
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
|
||||||
fn inner_extract_facet_values(
|
fn inner_extract_facet_values(
|
||||||
value: &Value,
|
value: &Value,
|
||||||
|
@ -1,16 +1,18 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufReader};
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use grenad::Sorter;
|
use obkv::KvReaderU16;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||||
try_split_array_at, GrenadParameters, MergeFn,
|
GrenadParameters,
|
||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
|
use crate::Result;
|
||||||
|
|
||||||
|
const MAX_COUNTED_WORDS: usize = 30;
|
||||||
|
|
||||||
/// Extracts the field id word count and the documents ids where
|
/// Extracts the field id word count and the documents ids where
|
||||||
/// this field id with this amount of words appear.
|
/// this field id with this amount of words appear.
|
||||||
@ -28,70 +30,62 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let mut fid_word_count_docids_sorter = create_sorter(
|
let mut fid_word_count_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
// This map is assumed to not consume a lot of memory.
|
let mut key_buffer = Vec::new();
|
||||||
let mut document_fid_wordcount = HashMap::new();
|
let mut value_buffer = Vec::new();
|
||||||
let mut current_document_id = None;
|
|
||||||
|
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
|
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
let curr_document_id = *current_document_id.get_or_insert(document_id);
|
let del_add_reader = KvReaderDelAdd::new(value);
|
||||||
if curr_document_id != document_id {
|
let deletion = del_add_reader
|
||||||
drain_document_fid_wordcount_into_sorter(
|
// get deleted words
|
||||||
&mut fid_word_count_docids_sorter,
|
.get(DelAdd::Deletion)
|
||||||
&mut document_fid_wordcount,
|
// count deleted words
|
||||||
curr_document_id,
|
.map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count())
|
||||||
)?;
|
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||||
current_document_id = Some(document_id);
|
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||||
|
let addition = del_add_reader
|
||||||
|
// get added words
|
||||||
|
.get(DelAdd::Addition)
|
||||||
|
// count added words
|
||||||
|
.map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count())
|
||||||
|
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||||
|
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||||
|
|
||||||
|
if deletion != addition {
|
||||||
|
// Insert the deleted word count in the sorter if it exists.
|
||||||
|
if let Some(word_count) = deletion {
|
||||||
|
value_buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(fid_bytes);
|
||||||
|
key_buffer.push(word_count as u8);
|
||||||
|
fid_word_count_docids_sorter
|
||||||
|
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||||
}
|
}
|
||||||
|
// Insert the added word count in the sorter if it exists.
|
||||||
for position in read_u32_ne_bytes(value) {
|
if let Some(word_count) = addition {
|
||||||
let (field_id, _) = relative_from_absolute_position(position);
|
value_buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
*value += 1;
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(fid_bytes);
|
||||||
|
key_buffer.push(word_count as u8);
|
||||||
|
fid_word_count_docids_sorter
|
||||||
|
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(document_id) = current_document_id {
|
|
||||||
// We must make sure that don't lose the current document field id
|
|
||||||
// word count map if we break because we reached the end of the chunk.
|
|
||||||
drain_document_fid_wordcount_into_sorter(
|
|
||||||
&mut fid_word_count_docids_sorter,
|
|
||||||
&mut document_fid_wordcount,
|
|
||||||
document_id,
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(fid_word_count_docids_sorter, indexer)
|
sorter_into_reader(fid_word_count_docids_sorter, indexer)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn drain_document_fid_wordcount_into_sorter(
|
|
||||||
fid_word_count_docids_sorter: &mut Sorter<MergeFn>,
|
|
||||||
document_fid_wordcount: &mut HashMap<FieldId, u32>,
|
|
||||||
document_id: DocumentId,
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut key_buffer = Vec::new();
|
|
||||||
|
|
||||||
for (fid, count) in document_fid_wordcount.drain() {
|
|
||||||
if count <= 30 {
|
|
||||||
key_buffer.clear();
|
|
||||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
|
||||||
key_buffer.push(count as u8);
|
|
||||||
|
|
||||||
fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
@ -6,6 +6,7 @@ use serde_json::Value;
|
|||||||
|
|
||||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||||
use crate::error::GeoError;
|
use crate::error::GeoError;
|
||||||
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::update::index_documents::extract_finite_float_from_value;
|
use crate::update::index_documents::extract_finite_float_from_value;
|
||||||
use crate::{FieldId, InternalError, Result};
|
use crate::{FieldId, InternalError, Result};
|
||||||
|
|
||||||
@ -30,19 +31,62 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
|||||||
let mut cursor = obkv_documents.into_cursor()?;
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||||
let obkv = obkv::KvReader::new(value);
|
let obkv = obkv::KvReader::new(value);
|
||||||
// since we only needs the primary key when we throw an error we create this getter to
|
// since we only need the primary key when we throw an error
|
||||||
// lazily get it when needed
|
// we create this getter to lazily get it when needed
|
||||||
let document_id = || -> Value {
|
let document_id = || -> Value {
|
||||||
let document_id = obkv.get(primary_key_id).unwrap();
|
let document_id = obkv.get(primary_key_id).unwrap();
|
||||||
serde_json::from_slice(document_id).unwrap()
|
serde_json::from_slice(document_id).unwrap()
|
||||||
};
|
};
|
||||||
|
|
||||||
// first we get the two fields
|
// first we get the two fields
|
||||||
let lat = obkv.get(lat_fid);
|
match (obkv.get(lat_fid), obkv.get(lng_fid)) {
|
||||||
let lng = obkv.get(lng_fid);
|
(Some(lat), Some(lng)) => {
|
||||||
|
let deladd_lat_obkv = KvReaderDelAdd::new(lat);
|
||||||
|
let deladd_lng_obkv = KvReaderDelAdd::new(lng);
|
||||||
|
|
||||||
if let Some((lat, lng)) = lat.zip(lng) {
|
|
||||||
// then we extract the values
|
// then we extract the values
|
||||||
|
let del_lat_lng = deladd_lat_obkv
|
||||||
|
.get(DelAdd::Deletion)
|
||||||
|
.zip(deladd_lng_obkv.get(DelAdd::Deletion))
|
||||||
|
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
|
||||||
|
.transpose()?;
|
||||||
|
let add_lat_lng = deladd_lat_obkv
|
||||||
|
.get(DelAdd::Addition)
|
||||||
|
.zip(deladd_lng_obkv.get(DelAdd::Addition))
|
||||||
|
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
|
||||||
|
.transpose()?;
|
||||||
|
|
||||||
|
if del_lat_lng != add_lat_lng {
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
if let Some([lat, lng]) = del_lat_lng {
|
||||||
|
#[allow(clippy::drop_non_drop)]
|
||||||
|
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||||
|
obkv.insert(DelAdd::Deletion, bytes)?;
|
||||||
|
}
|
||||||
|
if let Some([lat, lng]) = add_lat_lng {
|
||||||
|
#[allow(clippy::drop_non_drop)]
|
||||||
|
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||||
|
obkv.insert(DelAdd::Addition, bytes)?;
|
||||||
|
}
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
writer.insert(docid_bytes, bytes)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(None, Some(_)) => {
|
||||||
|
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
|
||||||
|
}
|
||||||
|
(Some(_), None) => {
|
||||||
|
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
|
||||||
|
}
|
||||||
|
(None, None) => (),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writer_into_reader(writer)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the finite floats lat and lng from two bytes slices.
|
||||||
|
fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
|
||||||
let lat = extract_finite_float_from_value(
|
let lat = extract_finite_float_from_value(
|
||||||
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
|
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
|
||||||
)
|
)
|
||||||
@ -53,16 +97,5 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
|||||||
)
|
)
|
||||||
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
|
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
|
||||||
|
|
||||||
#[allow(clippy::drop_non_drop)]
|
Ok([lat, lng])
|
||||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
|
||||||
writer.insert(docid_bytes, bytes)?;
|
|
||||||
} else if lat.is_none() && lng.is_some() {
|
|
||||||
return Err(GeoError::MissingLatitude { document_id: document_id() })?;
|
|
||||||
} else if lat.is_some() && lng.is_none() {
|
|
||||||
return Err(GeoError::MissingLongitude { document_id: document_id() })?;
|
|
||||||
}
|
|
||||||
// else => the _geo object was `null`, there is nothing to do
|
|
||||||
}
|
|
||||||
|
|
||||||
writer_into_reader(writer)
|
|
||||||
}
|
}
|
||||||
|
@ -1,13 +1,24 @@
|
|||||||
|
use std::cmp::Ordering;
|
||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufReader};
|
use std::io::{self, BufReader, BufWriter};
|
||||||
|
use std::mem::size_of;
|
||||||
|
use std::str::from_utf8;
|
||||||
|
|
||||||
use bytemuck::cast_slice;
|
use bytemuck::cast_slice;
|
||||||
|
use grenad::Writer;
|
||||||
|
use itertools::EitherOrBoth;
|
||||||
|
use ordered_float::OrderedFloat;
|
||||||
use serde_json::{from_slice, Value};
|
use serde_json::{from_slice, Value};
|
||||||
|
|
||||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||||
use crate::error::UserError;
|
use crate::error::UserError;
|
||||||
use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
|
use crate::update::index_documents::helpers::try_split_at;
|
||||||
|
use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};
|
||||||
|
|
||||||
|
/// The length of the elements that are always in the buffer when inserting new values.
|
||||||
|
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||||
|
|
||||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||||
///
|
///
|
||||||
@ -16,7 +27,6 @@ use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
|
|||||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||||
obkv_documents: grenad::Reader<R>,
|
obkv_documents: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
primary_key_id: FieldId,
|
|
||||||
vectors_fid: FieldId,
|
vectors_fid: FieldId,
|
||||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
@ -27,43 +37,112 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let mut key_buffer = Vec::new();
|
||||||
let mut cursor = obkv_documents.into_cursor()?;
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
// this must always be serialized as (docid, external_docid);
|
||||||
|
let (docid_bytes, external_id_bytes) =
|
||||||
|
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
|
||||||
|
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||||
|
|
||||||
let obkv = obkv::KvReader::new(value);
|
let obkv = obkv::KvReader::new(value);
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(docid_bytes);
|
||||||
|
|
||||||
// since we only needs the primary key when we throw an error we create this getter to
|
// since we only need the external document id when we throw an error we create this getter to
|
||||||
// lazily get it when needed
|
// lazily get it when needed
|
||||||
let document_id = || -> Value {
|
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||||
let document_id = obkv.get(primary_key_id).unwrap();
|
|
||||||
from_slice(document_id).unwrap()
|
|
||||||
};
|
|
||||||
|
|
||||||
// first we retrieve the _vectors field
|
// first we retrieve the _vectors field
|
||||||
if let Some(vectors) = obkv.get(vectors_fid) {
|
if let Some(value) = obkv.get(vectors_fid) {
|
||||||
// extract the vectors
|
let vectors_obkv = KvReaderDelAdd::new(value);
|
||||||
let vectors = match from_slice(vectors) {
|
|
||||||
Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors),
|
|
||||||
Err(_) => {
|
|
||||||
return Err(UserError::InvalidVectorsType {
|
|
||||||
document_id: document_id(),
|
|
||||||
value: from_slice(vectors).map_err(InternalError::SerdeJson)?,
|
|
||||||
}
|
|
||||||
.into())
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Some(vectors) = vectors {
|
// then we extract the values
|
||||||
for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
|
let del_vectors = vectors_obkv
|
||||||
let index = u16::try_from(i).unwrap();
|
.get(DelAdd::Deletion)
|
||||||
let mut key = docid_bytes.to_vec();
|
.map(|vectors| extract_vectors(vectors, document_id))
|
||||||
key.extend_from_slice(&index.to_be_bytes());
|
.transpose()?
|
||||||
let bytes = cast_slice(&vector);
|
.flatten();
|
||||||
writer.insert(key, bytes)?;
|
let add_vectors = vectors_obkv
|
||||||
|
.get(DelAdd::Addition)
|
||||||
|
.map(|vectors| extract_vectors(vectors, document_id))
|
||||||
|
.transpose()?
|
||||||
|
.flatten();
|
||||||
|
|
||||||
|
// and we finally push the unique vectors into the writer
|
||||||
|
push_vectors_diff(
|
||||||
|
&mut writer,
|
||||||
|
&mut key_buffer,
|
||||||
|
del_vectors.unwrap_or_default(),
|
||||||
|
add_vectors.unwrap_or_default(),
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
// else => the `_vectors` object was `null`, there is nothing to do
|
|
||||||
}
|
|
||||||
|
|
||||||
writer_into_reader(writer)
|
writer_into_reader(writer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Computes the diff between both Del and Add vectors and
|
||||||
|
/// only inserts the parts that differ in the writer.
|
||||||
|
fn push_vectors_diff(
|
||||||
|
writer: &mut Writer<BufWriter<File>>,
|
||||||
|
key_buffer: &mut Vec<u8>,
|
||||||
|
mut del_vectors: Vec<Vec<f32>>,
|
||||||
|
mut add_vectors: Vec<Vec<f32>>,
|
||||||
|
) -> Result<()> {
|
||||||
|
// We sort and dedup the vectors
|
||||||
|
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||||
|
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||||
|
del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||||
|
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||||
|
|
||||||
|
let merged_vectors_iter =
|
||||||
|
itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
|
||||||
|
|
||||||
|
// insert vectors into the writer
|
||||||
|
for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
|
||||||
|
// Generate the key by extending the unique index to it.
|
||||||
|
key_buffer.truncate(TRUNCATE_SIZE);
|
||||||
|
let index = u16::try_from(i).unwrap();
|
||||||
|
key_buffer.extend_from_slice(&index.to_be_bytes());
|
||||||
|
|
||||||
|
match eob {
|
||||||
|
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||||
|
EitherOrBoth::Left(vector) => {
|
||||||
|
// We insert only the Del part of the Obkv to inform
|
||||||
|
// that we only want to remove all those vectors.
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
writer.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right(vector) => {
|
||||||
|
// We insert only the Add part of the Obkv to inform
|
||||||
|
// that we only want to add all those vectors.
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
writer.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compares two vectors by using the OrderedFloat helper.
|
||||||
|
fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
|
||||||
|
a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extracts the vectors from a JSON value.
|
||||||
|
fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Option<Vec<Vec<f32>>>> {
|
||||||
|
match from_slice(value) {
|
||||||
|
Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)),
|
||||||
|
Err(_) => Err(UserError::InvalidVectorsType {
|
||||||
|
document_id: document_id(),
|
||||||
|
value: from_slice(value).map_err(InternalError::SerdeJson)?,
|
||||||
|
}
|
||||||
|
.into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1,18 +1,20 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::{BTreeSet, HashSet};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufReader};
|
use std::io::{self, BufReader};
|
||||||
use std::iter::FromIterator;
|
|
||||||
|
|
||||||
use roaring::RoaringBitmap;
|
use heed::BytesDecode;
|
||||||
|
use obkv::KvReaderU16;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
|
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
|
||||||
try_split_array_at, GrenadParameters,
|
try_split_array_at, writer_into_reader, GrenadParameters,
|
||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
|
use crate::heed_codec::StrBEU16Codec;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::update::index_documents::helpers::read_u32_ne_bytes;
|
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::{relative_from_absolute_position, FieldId, Result};
|
use crate::update::MergeFn;
|
||||||
|
use crate::{DocumentId, FieldId, Result};
|
||||||
|
|
||||||
/// Extracts the word and the documents ids where this word appear.
|
/// Extracts the word and the documents ids where this word appear.
|
||||||
///
|
///
|
||||||
@ -26,65 +28,152 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
|||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
exact_attributes: &HashSet<FieldId>,
|
exact_attributes: &HashSet<FieldId>,
|
||||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
) -> Result<(
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
)> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_docids_sorter = create_sorter(
|
let mut word_fid_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory.map(|x| x / 2),
|
max_memory.map(|x| x / 3),
|
||||||
|
);
|
||||||
|
let mut key_buffer = Vec::new();
|
||||||
|
let mut del_words = BTreeSet::new();
|
||||||
|
let mut add_words = BTreeSet::new();
|
||||||
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||||
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
|
let (fid_bytes, _) = try_split_array_at(fid_bytes)
|
||||||
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
let fid = u16::from_be_bytes(fid_bytes);
|
||||||
|
|
||||||
|
let del_add_reader = KvReaderDelAdd::new(value);
|
||||||
|
// extract all unique words to remove.
|
||||||
|
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||||
|
for (_pos, word) in KvReaderU16::new(deletion).iter() {
|
||||||
|
del_words.insert(word.to_vec());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// extract all unique additional words.
|
||||||
|
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||||
|
for (_pos, word) in KvReaderU16::new(addition).iter() {
|
||||||
|
add_words.insert(word.to_vec());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
words_into_sorter(
|
||||||
|
document_id,
|
||||||
|
fid,
|
||||||
|
&mut key_buffer,
|
||||||
|
&del_words,
|
||||||
|
&add_words,
|
||||||
|
&mut word_fid_docids_sorter,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
del_words.clear();
|
||||||
|
add_words.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut word_docids_sorter = create_sorter(
|
||||||
|
grenad::SortAlgorithm::Unstable,
|
||||||
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory.map(|x| x / 3),
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut exact_word_docids_sorter = create_sorter(
|
let mut exact_word_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory.map(|x| x / 2),
|
max_memory.map(|x| x / 3),
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut value_buffer = Vec::new();
|
let mut word_fid_docids_writer = create_writer(
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
indexer.chunk_compression_type,
|
||||||
while let Some((key, positions)) = cursor.move_on_next()? {
|
indexer.chunk_compression_level,
|
||||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
tempfile::tempfile()?,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
|
||||||
|
// TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
|
||||||
|
while let Some((key, value)) = iter.next()? {
|
||||||
|
// only keep the value if there is a change to apply in the DB.
|
||||||
|
if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
|
||||||
|
word_fid_docids_writer.insert(key, value)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let (word, fid) = StrBEU16Codec::bytes_decode(key)
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
|
||||||
|
|
||||||
let bitmap = RoaringBitmap::from_iter(Some(document_id));
|
// every word contained in an attribute set to exact must be pushed in the exact_words list.
|
||||||
serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
|
if exact_attributes.contains(&fid) {
|
||||||
|
exact_word_docids_sorter.insert(word.as_bytes(), value)?;
|
||||||
// If there are no exact attributes, we do not need to iterate over positions.
|
|
||||||
if exact_attributes.is_empty() {
|
|
||||||
word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
|
||||||
} else {
|
} else {
|
||||||
let mut added_to_exact = false;
|
word_docids_sorter.insert(word.as_bytes(), value)?;
|
||||||
let mut added_to_word_docids = false;
|
|
||||||
for position in read_u32_ne_bytes(positions) {
|
|
||||||
// as soon as we know that this word had been to both readers, we don't need to
|
|
||||||
// iterate over the positions.
|
|
||||||
if added_to_exact && added_to_word_docids {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
let (fid, _) = relative_from_absolute_position(position);
|
|
||||||
if exact_attributes.contains(&fid) && !added_to_exact {
|
|
||||||
exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
|
||||||
added_to_exact = true;
|
|
||||||
} else if !added_to_word_docids {
|
|
||||||
word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
|
||||||
added_to_word_docids = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
sorter_into_reader(word_docids_sorter, indexer)?,
|
sorter_into_reader(word_docids_sorter, indexer)?,
|
||||||
sorter_into_reader(exact_word_docids_sorter, indexer)?,
|
sorter_into_reader(exact_word_docids_sorter, indexer)?,
|
||||||
|
writer_into_reader(word_fid_docids_writer)?,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn words_into_sorter(
|
||||||
|
document_id: DocumentId,
|
||||||
|
fid: FieldId,
|
||||||
|
key_buffer: &mut Vec<u8>,
|
||||||
|
del_words: &BTreeSet<Vec<u8>>,
|
||||||
|
add_words: &BTreeSet<Vec<u8>>,
|
||||||
|
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||||
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
|
use itertools::merge_join_by;
|
||||||
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
|
||||||
|
buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||||
|
let word_bytes = match eob {
|
||||||
|
Left(word_bytes) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
word_bytes
|
||||||
|
}
|
||||||
|
Right(word_bytes) => {
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
word_bytes
|
||||||
|
}
|
||||||
|
Both(word_bytes, _) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
word_bytes
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
|
key_buffer.push(0);
|
||||||
|
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||||
|
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
@ -1,51 +0,0 @@
|
|||||||
use std::fs::File;
|
|
||||||
use std::io::{self, BufReader};
|
|
||||||
|
|
||||||
use super::helpers::{
|
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
|
||||||
try_split_array_at, GrenadParameters,
|
|
||||||
};
|
|
||||||
use crate::error::SerializationError;
|
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
|
||||||
use crate::{relative_from_absolute_position, DocumentId, Result};
|
|
||||||
|
|
||||||
/// Extracts the word, field id, and the documents ids where this word appear at this field id.
|
|
||||||
#[logging_timer::time]
|
|
||||||
pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
|
|
||||||
docid_word_positions: grenad::Reader<R>,
|
|
||||||
indexer: GrenadParameters,
|
|
||||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
|
||||||
puffin::profile_function!();
|
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
|
||||||
|
|
||||||
let mut word_fid_docids_sorter = create_sorter(
|
|
||||||
grenad::SortAlgorithm::Unstable,
|
|
||||||
merge_cbo_roaring_bitmaps,
|
|
||||||
indexer.chunk_compression_type,
|
|
||||||
indexer.chunk_compression_level,
|
|
||||||
indexer.max_nb_chunks,
|
|
||||||
max_memory,
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut key_buffer = Vec::new();
|
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
|
||||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
|
||||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
|
||||||
|
|
||||||
for position in read_u32_ne_bytes(value) {
|
|
||||||
key_buffer.clear();
|
|
||||||
key_buffer.extend_from_slice(word_bytes);
|
|
||||||
key_buffer.push(0);
|
|
||||||
let (fid, _) = relative_from_absolute_position(position);
|
|
||||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
|
||||||
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;
|
|
||||||
|
|
||||||
Ok(word_fid_docids_reader)
|
|
||||||
}
|
|
@ -1,16 +1,18 @@
|
|||||||
use std::cmp::Ordering;
|
use std::collections::{BTreeMap, VecDeque};
|
||||||
use std::collections::{BinaryHeap, HashMap};
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
use std::{cmp, io, mem, str, vec};
|
use std::{cmp, io};
|
||||||
|
|
||||||
|
use obkv::KvReaderU16;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
|
||||||
try_split_array_at, GrenadParameters, MergeFn,
|
writer_into_reader, GrenadParameters, MergeFn,
|
||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::proximity::{positions_proximity, MAX_DISTANCE};
|
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||||
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::{DocumentId, Result};
|
use crate::{DocumentId, Result};
|
||||||
|
|
||||||
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
||||||
@ -26,58 +28,137 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_pair_proximity_docids_sorter = create_sorter(
|
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
||||||
|
.map(|_| {
|
||||||
|
create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory.map(|m| m / 2),
|
max_memory.map(|m| m / MAX_DISTANCE as usize),
|
||||||
);
|
)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
// This map is assumed to not consume a lot of memory.
|
let mut del_word_positions: VecDeque<(String, u16)> =
|
||||||
let mut document_word_positions_heap = BinaryHeap::new();
|
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||||
|
let mut add_word_positions: VecDeque<(String, u16)> =
|
||||||
|
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||||
|
let mut del_word_pair_proximity = BTreeMap::new();
|
||||||
|
let mut add_word_pair_proximity = BTreeMap::new();
|
||||||
let mut current_document_id = None;
|
let mut current_document_id = None;
|
||||||
|
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
let word = str::from_utf8(word_bytes)?;
|
|
||||||
|
|
||||||
let curr_document_id = *current_document_id.get_or_insert(document_id);
|
// if we change document, we fill the sorter
|
||||||
if curr_document_id != document_id {
|
if current_document_id.map_or(false, |id| id != document_id) {
|
||||||
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
|
puffin::profile_scope!("Document into sorter");
|
||||||
|
|
||||||
document_word_positions_into_sorter(
|
document_word_positions_into_sorter(
|
||||||
curr_document_id,
|
current_document_id.unwrap(),
|
||||||
document_word_positions_heap,
|
&del_word_pair_proximity,
|
||||||
&mut word_pair_proximity_docids_sorter,
|
&add_word_pair_proximity,
|
||||||
|
&mut word_pair_proximity_docids_sorters,
|
||||||
)?;
|
)?;
|
||||||
current_document_id = Some(document_id);
|
del_word_pair_proximity.clear();
|
||||||
|
add_word_pair_proximity.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
let word = word.to_string();
|
current_document_id = Some(document_id);
|
||||||
let mut positions: Vec<_> = read_u32_ne_bytes(value).collect();
|
|
||||||
positions.sort_unstable();
|
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||||
let mut iter = positions.into_iter();
|
|| {
|
||||||
if let Some(position) = iter.next() {
|
// deletions
|
||||||
document_word_positions_heap.push(PeekedWordPosition { word, position, iter });
|
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
|
||||||
|
for (position, word) in KvReaderU16::new(deletion).iter() {
|
||||||
|
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||||
|
while del_word_positions.get(0).map_or(false, |(_w, p)| {
|
||||||
|
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||||
|
}) {
|
||||||
|
word_positions_into_word_pair_proximity(
|
||||||
|
&mut del_word_positions,
|
||||||
|
&mut del_word_pair_proximity,
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// insert the new word.
|
||||||
|
let word = std::str::from_utf8(word)?;
|
||||||
|
del_word_positions.push_back((word.to_string(), position));
|
||||||
|
}
|
||||||
|
|
||||||
|
while !del_word_positions.is_empty() {
|
||||||
|
word_positions_into_word_pair_proximity(
|
||||||
|
&mut del_word_positions,
|
||||||
|
&mut del_word_pair_proximity,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
|| {
|
||||||
|
// additions
|
||||||
|
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||||
|
for (position, word) in KvReaderU16::new(addition).iter() {
|
||||||
|
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||||
|
while add_word_positions.get(0).map_or(false, |(_w, p)| {
|
||||||
|
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||||
|
}) {
|
||||||
|
word_positions_into_word_pair_proximity(
|
||||||
|
&mut add_word_positions,
|
||||||
|
&mut add_word_pair_proximity,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// insert the new word.
|
||||||
|
let word = std::str::from_utf8(word)?;
|
||||||
|
add_word_positions.push_back((word.to_string(), position));
|
||||||
|
}
|
||||||
|
|
||||||
|
while !add_word_positions.is_empty() {
|
||||||
|
word_positions_into_word_pair_proximity(
|
||||||
|
&mut add_word_positions,
|
||||||
|
&mut add_word_pair_proximity,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
del?;
|
||||||
|
add?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(document_id) = current_document_id {
|
if let Some(document_id) = current_document_id {
|
||||||
// We must make sure that don't lose the current document field id
|
puffin::profile_scope!("Final document into sorter");
|
||||||
// word count map if we break because we reached the end of the chunk.
|
|
||||||
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
|
|
||||||
document_word_positions_into_sorter(
|
document_word_positions_into_sorter(
|
||||||
document_id,
|
document_id,
|
||||||
document_word_positions_heap,
|
&del_word_pair_proximity,
|
||||||
&mut word_pair_proximity_docids_sorter,
|
&add_word_pair_proximity,
|
||||||
|
&mut word_pair_proximity_docids_sorters,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
puffin::profile_scope!("sorter_into_reader");
|
||||||
|
let mut writer = create_writer(
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
tempfile::tempfile()?,
|
||||||
|
);
|
||||||
|
|
||||||
sorter_into_reader(word_pair_proximity_docids_sorter, indexer)
|
for sorter in word_pair_proximity_docids_sorters {
|
||||||
|
sorter.write_into_stream_writer(&mut writer)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
writer_into_reader(writer)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
||||||
@ -86,96 +167,66 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||||||
/// close to each other.
|
/// close to each other.
|
||||||
fn document_word_positions_into_sorter(
|
fn document_word_positions_into_sorter(
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>,
|
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||||
word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||||
|
word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>],
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut word_pair_proximity = HashMap::new();
|
use itertools::merge_join_by;
|
||||||
let mut ordered_peeked_word_positions = Vec::new();
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
while !word_positions_heap.is_empty() {
|
|
||||||
while let Some(peeked_word_position) = word_positions_heap.pop() {
|
|
||||||
ordered_peeked_word_positions.push(peeked_word_position);
|
|
||||||
if ordered_peeked_word_positions.len() == 7 {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some((head, tail)) = ordered_peeked_word_positions.split_first() {
|
|
||||||
for PeekedWordPosition { word, position, .. } in tail {
|
|
||||||
let prox = positions_proximity(head.position, *position);
|
|
||||||
if prox > 0 && prox < MAX_DISTANCE {
|
|
||||||
word_pair_proximity
|
|
||||||
.entry((head.word.clone(), word.clone()))
|
|
||||||
.and_modify(|p| {
|
|
||||||
*p = cmp::min(*p, prox);
|
|
||||||
})
|
|
||||||
.or_insert(prox);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Push the tail in the heap.
|
|
||||||
let tail_iter = ordered_peeked_word_positions.drain(1..);
|
|
||||||
word_positions_heap.extend(tail_iter);
|
|
||||||
|
|
||||||
// Advance the head and push it in the heap.
|
|
||||||
if let Some(mut head) = ordered_peeked_word_positions.pop() {
|
|
||||||
if let Some(next_position) = head.iter.next() {
|
|
||||||
let prox = positions_proximity(head.position, next_position);
|
|
||||||
|
|
||||||
if prox > 0 && prox < MAX_DISTANCE {
|
|
||||||
word_pair_proximity
|
|
||||||
.entry((head.word.clone(), head.word.clone()))
|
|
||||||
.and_modify(|p| {
|
|
||||||
*p = cmp::min(*p, prox);
|
|
||||||
})
|
|
||||||
.or_insert(prox);
|
|
||||||
}
|
|
||||||
|
|
||||||
word_positions_heap.push(PeekedWordPosition {
|
|
||||||
word: head.word,
|
|
||||||
position: next_position,
|
|
||||||
iter: head.iter,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
let mut key_buffer = Vec::new();
|
let mut key_buffer = Vec::new();
|
||||||
for ((w1, w2), prox) in word_pair_proximity {
|
for eob in
|
||||||
|
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
|
||||||
|
d.cmp(a)
|
||||||
|
})
|
||||||
|
{
|
||||||
|
buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||||
|
let ((w1, w2), prox) = match eob {
|
||||||
|
Left(key_value) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key_value
|
||||||
|
}
|
||||||
|
Right(key_value) => {
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key_value
|
||||||
|
}
|
||||||
|
Both(key_value, _) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key_value
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.push(prox as u8);
|
key_buffer.push(*prox);
|
||||||
key_buffer.extend_from_slice(w1.as_bytes());
|
key_buffer.extend_from_slice(w1.as_bytes());
|
||||||
key_buffer.push(0);
|
key_buffer.push(0);
|
||||||
key_buffer.extend_from_slice(w2.as_bytes());
|
key_buffer.extend_from_slice(w2.as_bytes());
|
||||||
|
|
||||||
word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
word_pair_proximity_docids_sorters[*prox as usize - 1]
|
||||||
|
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
struct PeekedWordPosition<I> {
|
fn word_positions_into_word_pair_proximity(
|
||||||
word: String,
|
word_positions: &mut VecDeque<(String, u16)>,
|
||||||
position: u32,
|
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||||
iter: I,
|
) -> Result<()> {
|
||||||
}
|
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||||
|
for (word, position) in word_positions.iter() {
|
||||||
impl<I> Ord for PeekedWordPosition<I> {
|
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||||
self.position.cmp(&other.position).reverse()
|
word_pair_proximity
|
||||||
|
.entry((head_word.clone(), word.clone()))
|
||||||
|
.and_modify(|p| {
|
||||||
|
*p = cmp::min(*p, prox);
|
||||||
|
})
|
||||||
|
.or_insert(prox);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
impl<I> PartialOrd for PeekedWordPosition<I> {
|
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
|
||||||
Some(self.cmp(other))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<I> Eq for PeekedWordPosition<I> {}
|
|
||||||
|
|
||||||
impl<I> PartialEq for PeekedWordPosition<I> {
|
|
||||||
fn eq(&self, other: &Self) -> bool {
|
|
||||||
self.position == other.position
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -1,13 +1,18 @@
|
|||||||
|
use std::collections::BTreeSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufReader};
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
|
use obkv::KvReaderU16;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||||
try_split_array_at, GrenadParameters,
|
GrenadParameters,
|
||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
|
use crate::update::MergeFn;
|
||||||
|
use crate::{bucketed_position, DocumentId, Result};
|
||||||
|
|
||||||
/// Extracts the word positions and the documents ids where this word appear.
|
/// Extracts the word positions and the documents ids where this word appear.
|
||||||
///
|
///
|
||||||
@ -24,32 +29,111 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let mut word_position_docids_sorter = create_sorter(
|
let mut word_position_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||||
|
let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||||
|
let mut current_document_id: Option<u32> = None;
|
||||||
let mut key_buffer = Vec::new();
|
let mut key_buffer = Vec::new();
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
for position in read_u32_ne_bytes(value) {
|
if current_document_id.map_or(false, |id| document_id != id) {
|
||||||
key_buffer.clear();
|
words_position_into_sorter(
|
||||||
key_buffer.extend_from_slice(word_bytes);
|
current_document_id.unwrap(),
|
||||||
key_buffer.push(0);
|
&mut key_buffer,
|
||||||
let (_, position) = relative_from_absolute_position(position);
|
&del_word_positions,
|
||||||
|
&add_word_positions,
|
||||||
|
&mut word_position_docids_sorter,
|
||||||
|
)?;
|
||||||
|
del_word_positions.clear();
|
||||||
|
add_word_positions.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
current_document_id = Some(document_id);
|
||||||
|
|
||||||
|
let del_add_reader = KvReaderDelAdd::new(value);
|
||||||
|
// extract all unique words to remove.
|
||||||
|
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||||
|
for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
|
||||||
let position = bucketed_position(position);
|
let position = bucketed_position(position);
|
||||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
del_word_positions.insert((position, word_bytes.to_vec()));
|
||||||
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// extract all unique additional words.
|
||||||
|
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||||
|
for (position, word_bytes) in KvReaderU16::new(addition).iter() {
|
||||||
|
let position = bucketed_position(position);
|
||||||
|
add_word_positions.insert((position, word_bytes.to_vec()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(document_id) = current_document_id {
|
||||||
|
words_position_into_sorter(
|
||||||
|
document_id,
|
||||||
|
&mut key_buffer,
|
||||||
|
&del_word_positions,
|
||||||
|
&add_word_positions,
|
||||||
|
&mut word_position_docids_sorter,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO remove noop DelAdd OBKV
|
||||||
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
||||||
|
|
||||||
Ok(word_position_docids_reader)
|
Ok(word_position_docids_reader)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn words_position_into_sorter(
|
||||||
|
document_id: DocumentId,
|
||||||
|
key_buffer: &mut Vec<u8>,
|
||||||
|
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||||
|
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||||
|
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||||
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
|
use itertools::merge_join_by;
|
||||||
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
|
||||||
|
{
|
||||||
|
buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||||
|
let (position, word_bytes) = match eob {
|
||||||
|
Left(key) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key
|
||||||
|
}
|
||||||
|
Right(key) => {
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key
|
||||||
|
}
|
||||||
|
Both(key, _) => {
|
||||||
|
// both values needs to be kept because it will be used in other extractors.
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
|
key_buffer.push(0);
|
||||||
|
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
|
word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
@ -6,7 +6,6 @@ mod extract_fid_word_count_docids;
|
|||||||
mod extract_geo_points;
|
mod extract_geo_points;
|
||||||
mod extract_vector_points;
|
mod extract_vector_points;
|
||||||
mod extract_word_docids;
|
mod extract_word_docids;
|
||||||
mod extract_word_fid_docids;
|
|
||||||
mod extract_word_pair_proximity_docids;
|
mod extract_word_pair_proximity_docids;
|
||||||
mod extract_word_position_docids;
|
mod extract_word_position_docids;
|
||||||
|
|
||||||
@ -26,12 +25,11 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
|||||||
use self::extract_geo_points::extract_geo_points;
|
use self::extract_geo_points::extract_geo_points;
|
||||||
use self::extract_vector_points::extract_vector_points;
|
use self::extract_vector_points::extract_vector_points;
|
||||||
use self::extract_word_docids::extract_word_docids;
|
use self::extract_word_docids::extract_word_docids;
|
||||||
use self::extract_word_fid_docids::extract_word_fid_docids;
|
|
||||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||||
use self::extract_word_position_docids::extract_word_position_docids;
|
use self::extract_word_position_docids::extract_word_position_docids;
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
|
||||||
GrenadParameters, MergeFn, MergeableReader,
|
MergeFn, MergeableReader,
|
||||||
};
|
};
|
||||||
use super::{helpers, TypedChunk};
|
use super::{helpers, TypedChunk};
|
||||||
use crate::{FieldId, Result};
|
use crate::{FieldId, Result};
|
||||||
@ -65,7 +63,6 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
vectors_field_id,
|
vectors_field_id,
|
||||||
primary_key_id,
|
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
.collect::<Result<()>>()?;
|
.collect::<Result<()>>()?;
|
||||||
@ -94,9 +91,9 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
let (
|
let (
|
||||||
docid_word_positions_chunks,
|
docid_word_positions_chunks,
|
||||||
(
|
(
|
||||||
docid_fid_facet_numbers_chunks,
|
fid_docid_facet_numbers_chunks,
|
||||||
(
|
(
|
||||||
docid_fid_facet_strings_chunks,
|
fid_docid_facet_strings_chunks,
|
||||||
(
|
(
|
||||||
facet_is_null_docids_chunks,
|
facet_is_null_docids_chunks,
|
||||||
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
|
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
|
||||||
@ -110,7 +107,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||||
rayon::spawn(move || {
|
rayon::spawn(move || {
|
||||||
debug!("merge {} database", "facet-id-exists-docids");
|
debug!("merge {} database", "facet-id-exists-docids");
|
||||||
match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||||
Ok(reader) => {
|
Ok(reader) => {
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
|
||||||
}
|
}
|
||||||
@ -126,7 +123,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||||
rayon::spawn(move || {
|
rayon::spawn(move || {
|
||||||
debug!("merge {} database", "facet-id-is-null-docids");
|
debug!("merge {} database", "facet-id-is-null-docids");
|
||||||
match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||||
Ok(reader) => {
|
Ok(reader) => {
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
|
||||||
}
|
}
|
||||||
@ -142,7 +139,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||||
rayon::spawn(move || {
|
rayon::spawn(move || {
|
||||||
debug!("merge {} database", "facet-id-is-empty-docids");
|
debug!("merge {} database", "facet-id-is-empty-docids");
|
||||||
match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||||
Ok(reader) => {
|
Ok(reader) => {
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
|
||||||
}
|
}
|
||||||
@ -158,7 +155,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_word_pair_proximity_docids,
|
extract_word_pair_proximity_docids,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
TypedChunk::WordPairProximityDocids,
|
TypedChunk::WordPairProximityDocids,
|
||||||
"word-pair-proximity-docids",
|
"word-pair-proximity-docids",
|
||||||
);
|
);
|
||||||
@ -168,24 +165,31 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_fid_word_count_docids,
|
extract_fid_word_count_docids,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
TypedChunk::FieldIdWordcountDocids,
|
TypedChunk::FieldIdWordCountDocids,
|
||||||
"field-id-wordcount-docids",
|
"field-id-wordcount-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<
|
spawn_extraction_task::<
|
||||||
_,
|
_,
|
||||||
_,
|
_,
|
||||||
Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)>,
|
Vec<(
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
)>,
|
||||||
>(
|
>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
||||||
merge_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
|(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
|
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
|
||||||
|
TypedChunk::WordDocids {
|
||||||
word_docids_reader,
|
word_docids_reader,
|
||||||
exact_word_docids_reader,
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"word-docids",
|
"word-docids",
|
||||||
);
|
);
|
||||||
@ -195,36 +199,27 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_word_position_docids,
|
extract_word_position_docids,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
TypedChunk::WordPositionDocids,
|
TypedChunk::WordPositionDocids,
|
||||||
"word-position-docids",
|
"word-position-docids",
|
||||||
);
|
);
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
|
||||||
docid_word_positions_chunks,
|
|
||||||
indexer,
|
|
||||||
lmdb_writer_sx.clone(),
|
|
||||||
extract_word_fid_docids,
|
|
||||||
merge_cbo_roaring_bitmaps,
|
|
||||||
TypedChunk::WordFidDocids,
|
|
||||||
"word-fid-docids",
|
|
||||||
);
|
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_fid_facet_strings_chunks,
|
fid_docid_facet_strings_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_facet_string_docids,
|
extract_facet_string_docids,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
TypedChunk::FieldIdFacetStringDocids,
|
TypedChunk::FieldIdFacetStringDocids,
|
||||||
"field-id-facet-string-docids",
|
"field-id-facet-string-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_fid_facet_numbers_chunks,
|
fid_docid_facet_numbers_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx,
|
lmdb_writer_sx,
|
||||||
extract_facet_number_docids,
|
extract_facet_number_docids,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
TypedChunk::FieldIdFacetNumberDocids,
|
TypedChunk::FieldIdFacetNumberDocids,
|
||||||
"field-id-facet-number-docids",
|
"field-id-facet-number-docids",
|
||||||
);
|
);
|
||||||
@ -278,7 +273,6 @@ fn send_original_documents_data(
|
|||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
vectors_field_id: Option<FieldId>,
|
vectors_field_id: Option<FieldId>,
|
||||||
primary_key_id: FieldId,
|
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let original_documents_chunk =
|
let original_documents_chunk =
|
||||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||||
@ -287,12 +281,7 @@ fn send_original_documents_data(
|
|||||||
let documents_chunk_cloned = original_documents_chunk.clone();
|
let documents_chunk_cloned = original_documents_chunk.clone();
|
||||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||||
rayon::spawn(move || {
|
rayon::spawn(move || {
|
||||||
let result = extract_vector_points(
|
let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id);
|
||||||
documents_chunk_cloned,
|
|
||||||
indexer,
|
|
||||||
primary_key_id,
|
|
||||||
vectors_field_id,
|
|
||||||
);
|
|
||||||
let _ = match result {
|
let _ = match result {
|
||||||
Ok(vector_points) => {
|
Ok(vector_points) => {
|
||||||
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
|
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
|
||||||
@ -356,10 +345,10 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||||
rayon::join(
|
rayon::join(
|
||||||
|| {
|
|| {
|
||||||
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
|
let (docid_word_positions_chunk, script_language_pair) =
|
||||||
extract_docid_word_positions(
|
extract_docid_word_positions(
|
||||||
flattened_documents_chunk.clone(),
|
flattened_documents_chunk.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
@ -370,9 +359,6 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// send documents_ids to DB writer
|
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));
|
|
||||||
|
|
||||||
// send docid_word_positions_chunk to DB writer
|
// send docid_word_positions_chunk to DB writer
|
||||||
let docid_word_positions_chunk =
|
let docid_word_positions_chunk =
|
||||||
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
||||||
@ -384,8 +370,8 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
},
|
},
|
||||||
|| {
|
|| {
|
||||||
let ExtractedFacetValues {
|
let ExtractedFacetValues {
|
||||||
docid_fid_facet_numbers_chunk,
|
fid_docid_facet_numbers_chunk,
|
||||||
docid_fid_facet_strings_chunk,
|
fid_docid_facet_strings_chunk,
|
||||||
fid_facet_is_null_docids_chunk,
|
fid_facet_is_null_docids_chunk,
|
||||||
fid_facet_is_empty_docids_chunk,
|
fid_facet_is_empty_docids_chunk,
|
||||||
fid_facet_exists_docids_chunk,
|
fid_facet_exists_docids_chunk,
|
||||||
@ -396,26 +382,26 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
geo_fields_ids,
|
geo_fields_ids,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// send docid_fid_facet_numbers_chunk to DB writer
|
// send fid_docid_facet_numbers_chunk to DB writer
|
||||||
let docid_fid_facet_numbers_chunk =
|
let fid_docid_facet_numbers_chunk =
|
||||||
unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? };
|
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
|
||||||
|
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
||||||
docid_fid_facet_numbers_chunk.clone(),
|
fid_docid_facet_numbers_chunk.clone(),
|
||||||
)));
|
)));
|
||||||
|
|
||||||
// send docid_fid_facet_strings_chunk to DB writer
|
// send fid_docid_facet_strings_chunk to DB writer
|
||||||
let docid_fid_facet_strings_chunk =
|
let fid_docid_facet_strings_chunk =
|
||||||
unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? };
|
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
|
||||||
|
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
||||||
docid_fid_facet_strings_chunk.clone(),
|
fid_docid_facet_strings_chunk.clone(),
|
||||||
)));
|
)));
|
||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
docid_fid_facet_numbers_chunk,
|
fid_docid_facet_numbers_chunk,
|
||||||
(
|
(
|
||||||
docid_fid_facet_strings_chunk,
|
fid_docid_facet_strings_chunk,
|
||||||
(
|
(
|
||||||
fid_facet_is_null_docids_chunk,
|
fid_facet_is_null_docids_chunk,
|
||||||
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
|
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
|
||||||
@ -425,5 +411,5 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?))
|
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
|
||||||
}
|
}
|
||||||
|
@ -1,14 +1,12 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufReader, BufWriter, Seek};
|
use std::io::{self, BufReader, BufWriter, Seek};
|
||||||
use std::time::Instant;
|
|
||||||
|
|
||||||
use grenad::{CompressionType, Sorter};
|
use grenad::{CompressionType, Sorter};
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use log::debug;
|
|
||||||
|
|
||||||
use super::{ClonableMmap, MergeFn};
|
use super::{ClonableMmap, MergeFn};
|
||||||
use crate::error::InternalError;
|
use crate::update::index_documents::valid_lmdb_key;
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
|
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
|
||||||
@ -47,6 +45,7 @@ pub fn create_sorter(
|
|||||||
builder.allow_realloc(false);
|
builder.allow_realloc(false);
|
||||||
}
|
}
|
||||||
builder.sort_algorithm(sort_algorithm);
|
builder.sort_algorithm(sort_algorithm);
|
||||||
|
builder.sort_in_parallel(true);
|
||||||
builder.build()
|
builder.build()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,6 +53,7 @@ pub fn sorter_into_reader(
|
|||||||
sorter: grenad::Sorter<MergeFn>,
|
sorter: grenad::Sorter<MergeFn>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
|
puffin::profile_function!();
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
@ -115,6 +115,32 @@ impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<Bu
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl MergeableReader
|
||||||
|
for Vec<(
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
)>
|
||||||
|
{
|
||||||
|
type Output = (
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
);
|
||||||
|
|
||||||
|
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||||
|
let mut m1 = MergerBuilder::new(merge_fn);
|
||||||
|
let mut m2 = MergerBuilder::new(merge_fn);
|
||||||
|
let mut m3 = MergerBuilder::new(merge_fn);
|
||||||
|
for (r1, r2, r3) in self.into_iter() {
|
||||||
|
m1.push(r1)?;
|
||||||
|
m2.push(r2)?;
|
||||||
|
m3.push(r3)?;
|
||||||
|
}
|
||||||
|
Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
|
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
|
||||||
|
|
||||||
impl<R: io::Read + io::Seek> MergerBuilder<R> {
|
impl<R: io::Read + io::Seek> MergerBuilder<R> {
|
||||||
@ -195,6 +221,7 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
|||||||
);
|
);
|
||||||
|
|
||||||
while let Some((document_id, obkv)) = cursor.move_on_next()? {
|
while let Some((document_id, obkv)) = cursor.move_on_next()? {
|
||||||
|
if !obkv.is_empty() {
|
||||||
obkv_documents.insert(document_id, obkv)?;
|
obkv_documents.insert(document_id, obkv)?;
|
||||||
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
|
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
|
||||||
|
|
||||||
@ -202,6 +229,7 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
|||||||
return writer_into_reader(obkv_documents).map(Some);
|
return writer_into_reader(obkv_documents).map(Some);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
continue_reading = false;
|
continue_reading = false;
|
||||||
writer_into_reader(obkv_documents).map(Some)
|
writer_into_reader(obkv_documents).map(Some)
|
||||||
@ -210,45 +238,46 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
|||||||
Ok(std::iter::from_fn(move || transposer().transpose()))
|
Ok(std::iter::from_fn(move || transposer().transpose()))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn sorter_into_lmdb_database(
|
/// Write provided sorter in database using serialize_value function.
|
||||||
wtxn: &mut heed::RwTxn,
|
/// merge_values function is used if an entry already exists in the database.
|
||||||
database: heed::PolyDatabase,
|
pub fn write_sorter_into_database<K, V, FS, FM>(
|
||||||
sorter: Sorter<MergeFn>,
|
sorter: Sorter<MergeFn>,
|
||||||
merge: MergeFn,
|
database: &heed::Database<K, V>,
|
||||||
) -> Result<()> {
|
wtxn: &mut heed::RwTxn,
|
||||||
|
index_is_empty: bool,
|
||||||
|
serialize_value: FS,
|
||||||
|
merge_values: FM,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||||
|
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||||
|
{
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
debug!("Writing MTBL sorter...");
|
|
||||||
let before = Instant::now();
|
let mut buffer = Vec::new();
|
||||||
|
let database = database.remap_types::<ByteSlice, ByteSlice>();
|
||||||
|
|
||||||
let mut merger_iter = sorter.into_stream_merger_iter()?;
|
let mut merger_iter = sorter.into_stream_merger_iter()?;
|
||||||
if database.is_empty(wtxn)? {
|
while let Some((key, value)) = merger_iter.next()? {
|
||||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
if valid_lmdb_key(key) {
|
||||||
while let Some((k, v)) = merger_iter.next()? {
|
buffer.clear();
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
let value = if index_is_empty {
|
||||||
unsafe { out_iter.append(k, v)? };
|
Some(serialize_value(value, &mut buffer)?)
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
while let Some((k, v)) = merger_iter.next()? {
|
match database.get(wtxn, key)? {
|
||||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
|
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||||
match iter.next().transpose()? {
|
None => Some(serialize_value(value, &mut buffer)?),
|
||||||
Some((key, old_val)) if key == k => {
|
|
||||||
let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
|
|
||||||
let val = merge(k, &vals).map_err(|_| {
|
|
||||||
// TODO just wrap this error?
|
|
||||||
InternalError::IndexingMergingKeys { process: "get-put-merge" }
|
|
||||||
})?;
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.put_current(k, &val)? };
|
|
||||||
}
|
}
|
||||||
_ => {
|
};
|
||||||
drop(iter);
|
match value {
|
||||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
Some(value) => database.put(wtxn, key, value)?,
|
||||||
|
None => {
|
||||||
|
database.delete(wtxn, key)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,22 +6,12 @@ use std::result::Result as StdResult;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||||
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::update::index_documents::transform::Operation;
|
use crate::update::index_documents::transform::Operation;
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
|
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
|
||||||
|
|
||||||
pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
|
||||||
if values.len() == 1 {
|
|
||||||
Ok(values[0].clone())
|
|
||||||
} else {
|
|
||||||
let capacity = values.iter().map(|v| v.len()).sum::<usize>();
|
|
||||||
let mut output = Vec::with_capacity(capacity);
|
|
||||||
values.iter().for_each(|integers| output.extend_from_slice(integers));
|
|
||||||
Ok(Cow::Owned(output))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
buffer.reserve(bitmap.serialized_size());
|
buffer.reserve(bitmap.serialized_size());
|
||||||
@ -75,57 +65,123 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<
|
|||||||
Ok(obkvs.last().unwrap().clone())
|
Ok(obkvs.last().unwrap().clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
|
pub fn merge_two_del_add_obkvs(
|
||||||
|
base: obkv::KvReaderU16,
|
||||||
|
update: obkv::KvReaderU16,
|
||||||
|
merge_additions: bool,
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
) {
|
||||||
use itertools::merge_join_by;
|
use itertools::merge_join_by;
|
||||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
|
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
|
|
||||||
let mut writer = obkv::KvWriter::new(buffer);
|
let mut writer = obkv::KvWriter::new(buffer);
|
||||||
|
let mut value_buffer = Vec::new();
|
||||||
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||||
match eob {
|
match eob {
|
||||||
Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(),
|
Left((k, v)) => {
|
||||||
|
if merge_additions {
|
||||||
|
writer.insert(k, v).unwrap()
|
||||||
|
} else {
|
||||||
|
// If merge_additions is false, recreate an obkv keeping the deletions only.
|
||||||
|
value_buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
let base_reader = KvReaderDelAdd::new(v);
|
||||||
|
|
||||||
|
if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
|
||||||
|
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||||
|
value_writer.finish().unwrap();
|
||||||
|
writer.insert(k, &value_buffer).unwrap()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Right((k, v)) => writer.insert(k, v).unwrap(),
|
||||||
|
Both((k, base), (_, update)) => {
|
||||||
|
// merge deletions and additions.
|
||||||
|
value_buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
let base_reader = KvReaderDelAdd::new(base);
|
||||||
|
let update_reader = KvReaderDelAdd::new(update);
|
||||||
|
|
||||||
|
// keep newest deletion.
|
||||||
|
if let Some(deletion) = update_reader
|
||||||
|
.get(DelAdd::Deletion)
|
||||||
|
.or_else(|| base_reader.get(DelAdd::Deletion))
|
||||||
|
{
|
||||||
|
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
// keep base addition only if merge_additions is true.
|
||||||
|
let base_addition =
|
||||||
|
merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
|
||||||
|
// keep newest addition.
|
||||||
|
// TODO use or_else
|
||||||
|
if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
|
||||||
|
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
value_writer.finish().unwrap();
|
||||||
|
writer.insert(k, &value_buffer).unwrap()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
writer.finish().unwrap();
|
writer.finish().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Merge all the obks in the order we see them.
|
/// Merge all the obkvs from the newest to the oldest.
|
||||||
pub fn merge_obkvs_and_operations<'a>(
|
fn inner_merge_del_add_obkvs<'a>(
|
||||||
|
obkvs: &[Cow<'a, [u8]>],
|
||||||
|
merge_additions: bool,
|
||||||
|
) -> Result<Cow<'a, [u8]>> {
|
||||||
|
// pop the newest operation from the list.
|
||||||
|
let (newest, obkvs) = obkvs.split_last().unwrap();
|
||||||
|
// keep the operation type for the returned value.
|
||||||
|
let newest_operation_type = newest[0];
|
||||||
|
|
||||||
|
// treat the newest obkv as the starting point of the merge.
|
||||||
|
let mut acc_operation_type = newest_operation_type;
|
||||||
|
let mut acc = newest[1..].to_vec();
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
// reverse iter from the most recent to the oldest.
|
||||||
|
for current in obkvs.iter().rev() {
|
||||||
|
// if in the previous iteration there was a complete deletion,
|
||||||
|
// stop the merge process.
|
||||||
|
if acc_operation_type == Operation::Deletion as u8 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
let newest = obkv::KvReader::new(&acc);
|
||||||
|
let oldest = obkv::KvReader::new(¤t[1..]);
|
||||||
|
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
|
||||||
|
|
||||||
|
// we want the result of the merge into our accumulator.
|
||||||
|
std::mem::swap(&mut acc, &mut buffer);
|
||||||
|
acc_operation_type = current[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
acc.insert(0, newest_operation_type);
|
||||||
|
Ok(Cow::from(acc))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Merge all the obkvs from the newest to the oldest.
|
||||||
|
pub fn obkvs_merge_additions_and_deletions<'a>(
|
||||||
_key: &[u8],
|
_key: &[u8],
|
||||||
obkvs: &[Cow<'a, [u8]>],
|
obkvs: &[Cow<'a, [u8]>],
|
||||||
) -> Result<Cow<'a, [u8]>> {
|
) -> Result<Cow<'a, [u8]>> {
|
||||||
// [add, add, delete, add, add]
|
inner_merge_del_add_obkvs(obkvs, true)
|
||||||
// we can ignore everything that happened before the last delete.
|
|
||||||
let starting_position =
|
|
||||||
obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0);
|
|
||||||
|
|
||||||
// [add, add, delete]
|
|
||||||
// if the last operation was a deletion then we simply return the deletion
|
|
||||||
if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8
|
|
||||||
{
|
|
||||||
return Ok(obkvs[obkvs.len() - 1].clone());
|
|
||||||
}
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
|
|
||||||
// (add, add, delete) [add, add]
|
|
||||||
// in the other case, no deletion will be encountered during the merge
|
|
||||||
let mut ret =
|
|
||||||
obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| {
|
|
||||||
let first = obkv::KvReader::new(&acc);
|
|
||||||
let second = obkv::KvReader::new(¤t[1..]);
|
|
||||||
merge_two_obkvs(first, second, &mut buffer);
|
|
||||||
|
|
||||||
// we want the result of the merge into our accumulator
|
|
||||||
std::mem::swap(&mut acc, &mut buffer);
|
|
||||||
acc
|
|
||||||
});
|
|
||||||
|
|
||||||
ret.insert(0, Operation::Addition as u8);
|
|
||||||
Ok(Cow::from(ret))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
|
||||||
|
pub fn obkvs_keep_last_addition_merge_deletions<'a>(
|
||||||
|
_key: &[u8],
|
||||||
|
obkvs: &[Cow<'a, [u8]>],
|
||||||
|
) -> Result<Cow<'a, [u8]>> {
|
||||||
|
inner_merge_del_add_obkvs(obkvs, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Do a union of all the CboRoaringBitmaps in the values.
|
||||||
pub fn merge_cbo_roaring_bitmaps<'a>(
|
pub fn merge_cbo_roaring_bitmaps<'a>(
|
||||||
_key: &[u8],
|
_key: &[u8],
|
||||||
values: &[Cow<'a, [u8]>],
|
values: &[Cow<'a, [u8]>],
|
||||||
@ -138,3 +194,52 @@ pub fn merge_cbo_roaring_bitmaps<'a>(
|
|||||||
Ok(Cow::from(vec))
|
Ok(Cow::from(vec))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
|
||||||
|
/// separately and outputs a new DelAdd with both unions.
|
||||||
|
pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
|
||||||
|
_key: &[u8],
|
||||||
|
values: &[Cow<'a, [u8]>],
|
||||||
|
) -> Result<Cow<'a, [u8]>> {
|
||||||
|
if values.len() == 1 {
|
||||||
|
Ok(values[0].clone())
|
||||||
|
} else {
|
||||||
|
// Retrieve the bitmaps from both sides
|
||||||
|
let mut del_bitmaps_bytes = Vec::new();
|
||||||
|
let mut add_bitmaps_bytes = Vec::new();
|
||||||
|
for value in values {
|
||||||
|
let obkv = KvReaderDelAdd::new(value);
|
||||||
|
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
|
||||||
|
del_bitmaps_bytes.push(bitmap_bytes);
|
||||||
|
}
|
||||||
|
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
|
||||||
|
add_bitmaps_bytes.push(bitmap_bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
|
||||||
|
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
|
||||||
|
buffer.clear();
|
||||||
|
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
|
||||||
|
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
|
||||||
|
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
|
||||||
|
///
|
||||||
|
/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
|
||||||
|
/// the second one is the CboRoaringBitmap to merge into.
|
||||||
|
pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
|
||||||
|
deladd_obkv: &[u8],
|
||||||
|
previous: &[u8],
|
||||||
|
buffer: &'a mut Vec<u8>,
|
||||||
|
) -> Result<Option<&'a [u8]>> {
|
||||||
|
Ok(CboRoaringBitmapCodec::merge_deladd_into(
|
||||||
|
KvReaderDelAdd::new(deladd_obkv),
|
||||||
|
previous,
|
||||||
|
buffer,
|
||||||
|
)?)
|
||||||
|
}
|
||||||
|
@ -9,13 +9,14 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
|
|||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
pub use grenad_helpers::{
|
pub use grenad_helpers::{
|
||||||
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
||||||
merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader,
|
merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
|
||||||
GrenadParameters, MergeableReader,
|
GrenadParameters, MergeableReader,
|
||||||
};
|
};
|
||||||
pub use merge_functions::{
|
pub use merge_functions::{
|
||||||
concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
|
keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps,
|
||||||
merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
serialize_roaring_bitmap, MergeFn,
|
merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
|
||||||
|
obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::MAX_WORD_LENGTH;
|
use crate::MAX_WORD_LENGTH;
|
||||||
@ -44,10 +45,6 @@ where
|
|||||||
Some((head, tail))
|
Some((head, tail))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
|
|
||||||
bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Converts an fst Stream into an HashSet of Strings.
|
/// Converts an fst Stream into an HashSet of Strings.
|
||||||
pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>>
|
pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>>
|
||||||
where
|
where
|
||||||
|
@ -20,11 +20,13 @@ use slice_group_by::GroupBy;
|
|||||||
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
|
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
|
||||||
|
|
||||||
use self::enrich::enrich_documents_batch;
|
use self::enrich::enrich_documents_batch;
|
||||||
pub use self::enrich::{extract_finite_float_from_value, DocumentId};
|
pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId};
|
||||||
pub use self::helpers::{
|
pub use self::helpers::{
|
||||||
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
|
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
|
||||||
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
|
||||||
sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
|
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
|
merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader,
|
||||||
|
ClonableMmap, MergeFn,
|
||||||
};
|
};
|
||||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
||||||
pub use self::transform::{Transform, TransformOutput};
|
pub use self::transform::{Transform, TransformOutput};
|
||||||
@ -32,13 +34,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
|
|||||||
use crate::error::{Error, InternalError, UserError};
|
use crate::error::{Error, InternalError, UserError};
|
||||||
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
||||||
use crate::update::{
|
use crate::update::{
|
||||||
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
|
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
||||||
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
|
||||||
};
|
};
|
||||||
use crate::{Index, Result, RoaringBitmapCodec};
|
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||||
|
|
||||||
static MERGED_DATABASE_COUNT: usize = 7;
|
static MERGED_DATABASE_COUNT: usize = 7;
|
||||||
static PREFIX_DATABASE_COUNT: usize = 5;
|
static PREFIX_DATABASE_COUNT: usize = 4;
|
||||||
static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;
|
static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
@ -86,7 +87,6 @@ pub struct IndexDocumentsConfig {
|
|||||||
pub words_positions_level_group_size: Option<NonZeroU32>,
|
pub words_positions_level_group_size: Option<NonZeroU32>,
|
||||||
pub words_positions_min_level_size: Option<NonZeroU32>,
|
pub words_positions_min_level_size: Option<NonZeroU32>,
|
||||||
pub update_method: IndexDocumentsMethod,
|
pub update_method: IndexDocumentsMethod,
|
||||||
pub deletion_strategy: DeletionStrategy,
|
|
||||||
pub autogenerate_docids: bool,
|
pub autogenerate_docids: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -178,6 +178,7 @@ where
|
|||||||
|
|
||||||
// Early return when there is no document to add
|
// Early return when there is no document to add
|
||||||
if to_delete.is_empty() {
|
if to_delete.is_empty() {
|
||||||
|
// Maintains Invariant: remove documents actually always returns Ok for the inner result
|
||||||
return Ok((self, Ok(0)));
|
return Ok((self, Ok(0)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -190,14 +191,48 @@ where
|
|||||||
|
|
||||||
self.deleted_documents += deleted_documents;
|
self.deleted_documents += deleted_documents;
|
||||||
|
|
||||||
|
// Maintains Invariant: remove documents actually always returns Ok for the inner result
|
||||||
Ok((self, Ok(deleted_documents)))
|
Ok((self, Ok(deleted_documents)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Removes documents from db using their internal document ids.
|
||||||
|
///
|
||||||
|
/// # Warning
|
||||||
|
///
|
||||||
|
/// This function is dangerous and will only work correctly if:
|
||||||
|
///
|
||||||
|
/// - All the passed ids currently exist in the database
|
||||||
|
/// - No batching using the standard `remove_documents` and `add_documents` took place
|
||||||
|
///
|
||||||
|
/// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
|
||||||
|
pub fn remove_documents_from_db_no_batch(
|
||||||
|
mut self,
|
||||||
|
to_delete: &RoaringBitmap,
|
||||||
|
) -> Result<(Self, u64)> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
|
// Early return when there is no document to delete
|
||||||
|
if to_delete.is_empty() {
|
||||||
|
return Ok((self, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
let deleted_documents = self
|
||||||
|
.transform
|
||||||
|
.as_mut()
|
||||||
|
.expect("Invalid document deletion state")
|
||||||
|
.remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)?
|
||||||
|
as u64;
|
||||||
|
|
||||||
|
self.deleted_documents += deleted_documents;
|
||||||
|
|
||||||
|
Ok((self, deleted_documents))
|
||||||
|
}
|
||||||
|
|
||||||
#[logging_timer::time("IndexDocuments::{}")]
|
#[logging_timer::time("IndexDocuments::{}")]
|
||||||
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
|
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
if self.added_documents == 0 {
|
if self.added_documents == 0 && self.deleted_documents == 0 {
|
||||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||||
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
|
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
|
||||||
}
|
}
|
||||||
@ -241,9 +276,6 @@ where
|
|||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map,
|
fields_ids_map,
|
||||||
field_distribution,
|
field_distribution,
|
||||||
new_external_documents_ids,
|
|
||||||
new_documents_ids,
|
|
||||||
replaced_documents_ids,
|
|
||||||
documents_count,
|
documents_count,
|
||||||
original_documents,
|
original_documents,
|
||||||
flattened_documents,
|
flattened_documents,
|
||||||
@ -367,29 +399,12 @@ where
|
|||||||
let _ = lmdb_writer_sx.send(Err(e));
|
let _ = lmdb_writer_sx.send(Err(e));
|
||||||
}
|
}
|
||||||
|
|
||||||
// needs to be droped to avoid channel waiting lock.
|
// needs to be dropped to avoid channel waiting lock.
|
||||||
drop(lmdb_writer_sx)
|
drop(lmdb_writer_sx)
|
||||||
});
|
});
|
||||||
|
|
||||||
// We delete the documents that this document addition replaces. This way we are
|
let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
|
||||||
// able to simply insert all the documents even if they already exist in the database.
|
|
||||||
if !replaced_documents_ids.is_empty() {
|
|
||||||
let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?;
|
|
||||||
deletion_builder.strategy(self.config.deletion_strategy);
|
|
||||||
debug!("documents to delete {:?}", replaced_documents_ids);
|
|
||||||
deletion_builder.delete_documents(&replaced_documents_ids);
|
|
||||||
let deleted_documents_result = deletion_builder.execute_inner()?;
|
|
||||||
debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
|
|
||||||
}
|
|
||||||
|
|
||||||
let index_documents_ids = self.index.documents_ids(self.wtxn)?;
|
|
||||||
let index_is_empty = index_documents_ids.is_empty();
|
|
||||||
let mut final_documents_ids = RoaringBitmap::new();
|
let mut final_documents_ids = RoaringBitmap::new();
|
||||||
let mut word_pair_proximity_docids = None;
|
|
||||||
let mut word_position_docids = None;
|
|
||||||
let mut word_fid_docids = None;
|
|
||||||
let mut word_docids = None;
|
|
||||||
let mut exact_word_docids = None;
|
|
||||||
|
|
||||||
let mut databases_seen = 0;
|
let mut databases_seen = 0;
|
||||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||||
@ -397,35 +412,40 @@ where
|
|||||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
let mut word_position_docids = None;
|
||||||
|
let mut word_fid_docids = None;
|
||||||
|
let mut word_docids = None;
|
||||||
|
let mut exact_word_docids = None;
|
||||||
|
|
||||||
for result in lmdb_writer_rx {
|
for result in lmdb_writer_rx {
|
||||||
if (self.should_abort)() {
|
if (self.should_abort)() {
|
||||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||||
}
|
}
|
||||||
|
|
||||||
let typed_chunk = match result? {
|
let typed_chunk = match result? {
|
||||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
|
TypedChunk::WordDocids {
|
||||||
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
|
} => {
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
||||||
word_docids = Some(cloneable_chunk);
|
word_docids = Some(cloneable_chunk);
|
||||||
let cloneable_chunk =
|
let cloneable_chunk =
|
||||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
||||||
exact_word_docids = Some(cloneable_chunk);
|
exact_word_docids = Some(cloneable_chunk);
|
||||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
|
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
|
||||||
|
word_fid_docids = Some(cloneable_chunk);
|
||||||
|
TypedChunk::WordDocids {
|
||||||
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
}
|
}
|
||||||
TypedChunk::WordPairProximityDocids(chunk) => {
|
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
|
||||||
word_pair_proximity_docids = Some(cloneable_chunk);
|
|
||||||
TypedChunk::WordPairProximityDocids(chunk)
|
|
||||||
}
|
}
|
||||||
TypedChunk::WordPositionDocids(chunk) => {
|
TypedChunk::WordPositionDocids(chunk) => {
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||||
word_position_docids = Some(cloneable_chunk);
|
word_position_docids = Some(cloneable_chunk);
|
||||||
TypedChunk::WordPositionDocids(chunk)
|
TypedChunk::WordPositionDocids(chunk)
|
||||||
}
|
}
|
||||||
TypedChunk::WordFidDocids(chunk) => {
|
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
|
||||||
word_fid_docids = Some(cloneable_chunk);
|
|
||||||
TypedChunk::WordFidDocids(chunk)
|
|
||||||
}
|
|
||||||
otherwise => otherwise,
|
otherwise => otherwise,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -457,25 +477,16 @@ where
|
|||||||
|
|
||||||
// We write the primary key field id into the main database
|
// We write the primary key field id into the main database
|
||||||
self.index.put_primary_key(self.wtxn, &primary_key)?;
|
self.index.put_primary_key(self.wtxn, &primary_key)?;
|
||||||
|
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||||
// We write the external documents ids into the main database.
|
|
||||||
let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
|
|
||||||
external_documents_ids.insert_ids(&new_external_documents_ids)?;
|
|
||||||
let external_documents_ids = external_documents_ids.into_static();
|
|
||||||
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
|
|
||||||
|
|
||||||
let all_documents_ids = index_documents_ids | new_documents_ids;
|
|
||||||
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
|
|
||||||
|
|
||||||
self.execute_prefix_databases(
|
self.execute_prefix_databases(
|
||||||
word_docids,
|
word_docids,
|
||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
word_pair_proximity_docids,
|
|
||||||
word_position_docids,
|
word_position_docids,
|
||||||
word_fid_docids,
|
word_fid_docids,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
Ok(all_documents_ids.len())
|
Ok(number_of_documents)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[logging_timer::time("IndexDocuments::{}")]
|
#[logging_timer::time("IndexDocuments::{}")]
|
||||||
@ -483,7 +494,6 @@ where
|
|||||||
self,
|
self,
|
||||||
word_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
word_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||||
exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||||
word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
|
||||||
word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||||
word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||||
) -> Result<()>
|
) -> Result<()>
|
||||||
@ -604,32 +614,6 @@ where
|
|||||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||||
});
|
});
|
||||||
|
|
||||||
if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
|
|
||||||
// Run the word prefix pair proximity docids update operation.
|
|
||||||
PrefixWordPairsProximityDocids::new(
|
|
||||||
self.wtxn,
|
|
||||||
self.index,
|
|
||||||
self.indexer_config.chunk_compression_type,
|
|
||||||
self.indexer_config.chunk_compression_level,
|
|
||||||
)
|
|
||||||
.execute(
|
|
||||||
word_pair_proximity_docids,
|
|
||||||
&new_prefix_fst_words,
|
|
||||||
&common_prefix_fst_words,
|
|
||||||
&del_prefix_fst_words,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (self.should_abort)() {
|
|
||||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
|
||||||
}
|
|
||||||
|
|
||||||
databases_seen += 1;
|
|
||||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
|
||||||
databases_seen,
|
|
||||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
|
||||||
});
|
|
||||||
|
|
||||||
if let Some(word_position_docids) = word_position_docids {
|
if let Some(word_position_docids) = word_position_docids {
|
||||||
// Run the words prefix position docids update operation.
|
// Run the words prefix position docids update operation.
|
||||||
let mut builder = WordPrefixIntegerDocids::new(
|
let mut builder = WordPrefixIntegerDocids::new(
|
||||||
@ -687,8 +671,8 @@ where
|
|||||||
fn execute_word_prefix_docids(
|
fn execute_word_prefix_docids(
|
||||||
txn: &mut heed::RwTxn,
|
txn: &mut heed::RwTxn,
|
||||||
reader: grenad::Reader<Cursor<ClonableMmap>>,
|
reader: grenad::Reader<Cursor<ClonableMmap>>,
|
||||||
word_docids_db: Database<Str, RoaringBitmapCodec>,
|
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||||
word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
|
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||||
indexer_config: &IndexerConfig,
|
indexer_config: &IndexerConfig,
|
||||||
new_prefix_fst_words: &[String],
|
new_prefix_fst_words: &[String],
|
||||||
common_prefix_fst_words: &[&[String]],
|
common_prefix_fst_words: &[&[String]],
|
||||||
@ -709,14 +693,15 @@ fn execute_word_prefix_docids(
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
|
use fst::IntoStreamer;
|
||||||
|
use heed::RwTxn;
|
||||||
use maplit::hashset;
|
use maplit::hashset;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::documents::documents_batch_reader_from_objects;
|
use crate::documents::documents_batch_reader_from_objects;
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::search::TermsMatchingStrategy;
|
use crate::search::TermsMatchingStrategy;
|
||||||
use crate::update::DeleteDocuments;
|
use crate::{db_snap, Filter, Search, BEU16};
|
||||||
use crate::{db_snap, BEU16};
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn simple_document_replacement() {
|
fn simple_document_replacement() {
|
||||||
@ -807,11 +792,10 @@ mod tests {
|
|||||||
assert_eq!(count, 1);
|
assert_eq!(count, 1);
|
||||||
|
|
||||||
// Check that we get only one document from the database.
|
// Check that we get only one document from the database.
|
||||||
// Since the document has been deleted and re-inserted, its internal docid has been incremented to 1
|
let docs = index.documents(&rtxn, Some(0)).unwrap();
|
||||||
let docs = index.documents(&rtxn, Some(1)).unwrap();
|
|
||||||
assert_eq!(docs.len(), 1);
|
assert_eq!(docs.len(), 1);
|
||||||
let (id, doc) = docs[0];
|
let (id, doc) = docs[0];
|
||||||
assert_eq!(id, 1);
|
assert_eq!(id, 0);
|
||||||
|
|
||||||
// Check that this document is equal to the last one sent.
|
// Check that this document is equal to the last one sent.
|
||||||
let mut doc_iter = doc.iter();
|
let mut doc_iter = doc.iter();
|
||||||
@ -872,7 +856,7 @@ mod tests {
|
|||||||
assert_eq!(count, 3);
|
assert_eq!(count, 3);
|
||||||
|
|
||||||
// the document 0 has been deleted and reinserted with the id 3
|
// the document 0 has been deleted and reinserted with the id 3
|
||||||
let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap();
|
let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap();
|
||||||
let kevin_position =
|
let kevin_position =
|
||||||
docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
|
docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
|
||||||
assert_eq!(kevin_position, 2);
|
assert_eq!(kevin_position, 2);
|
||||||
@ -1018,7 +1002,6 @@ mod tests {
|
|||||||
assert_eq!(count, 6);
|
assert_eq!(count, 6);
|
||||||
|
|
||||||
db_snap!(index, word_docids, "updated");
|
db_snap!(index, word_docids, "updated");
|
||||||
db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]");
|
|
||||||
|
|
||||||
drop(rtxn);
|
drop(rtxn);
|
||||||
}
|
}
|
||||||
@ -1121,17 +1104,15 @@ mod tests {
|
|||||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||||
]))
|
]))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
|
||||||
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));
|
|
||||||
|
|
||||||
// Delete not all of the documents but some of them.
|
// Delete not all of the documents but some of them.
|
||||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
index.delete_document("30");
|
||||||
builder.delete_external_id("30");
|
|
||||||
builder.execute().unwrap();
|
|
||||||
|
|
||||||
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
let txn = index.read_txn().unwrap();
|
||||||
assert!(external_documents_ids.get("30").is_none());
|
assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId"));
|
||||||
wtxn.commit().unwrap();
|
|
||||||
|
let external_documents_ids = index.external_documents_ids();
|
||||||
|
assert!(external_documents_ids.get(&txn, "30").unwrap().is_none());
|
||||||
|
|
||||||
index
|
index
|
||||||
.add_documents(documents!([
|
.add_documents(documents!([
|
||||||
@ -1140,8 +1121,8 @@ mod tests {
|
|||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let wtxn = index.write_txn().unwrap();
|
let wtxn = index.write_txn().unwrap();
|
||||||
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
let external_documents_ids = index.external_documents_ids();
|
||||||
assert!(external_documents_ids.get("30").is_some());
|
assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some());
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
index
|
index
|
||||||
@ -1435,8 +1416,10 @@ mod tests {
|
|||||||
index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap();
|
index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap();
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
|
let all_documents_count = index.all_documents(&rtxn).unwrap().count();
|
||||||
assert!(external_documents_ids.get("1").is_some());
|
assert_eq!(all_documents_count, 1);
|
||||||
|
let external_documents_ids = index.external_documents_ids();
|
||||||
|
assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -1490,12 +1473,6 @@ mod tests {
|
|||||||
3 2 second second
|
3 2 second second
|
||||||
3 3 third third
|
3 3 third third
|
||||||
"###);
|
"###);
|
||||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
|
||||||
0 []
|
|
||||||
1 []
|
|
||||||
2 []
|
|
||||||
3 [0, 1, 2, 3, ]
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
@ -1519,12 +1496,6 @@ mod tests {
|
|||||||
|
|
||||||
db_snap!(index, facet_id_string_docids, @"");
|
db_snap!(index, facet_id_string_docids, @"");
|
||||||
db_snap!(index, field_id_docid_facet_strings, @"");
|
db_snap!(index, field_id_docid_facet_strings, @"");
|
||||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
|
||||||
0 []
|
|
||||||
1 []
|
|
||||||
2 []
|
|
||||||
3 [0, 1, 2, 3, ]
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
@ -1551,12 +1522,6 @@ mod tests {
|
|||||||
3 2 second second
|
3 2 second second
|
||||||
3 3 third third
|
3 3 third third
|
||||||
"###);
|
"###);
|
||||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
|
||||||
0 []
|
|
||||||
1 []
|
|
||||||
2 []
|
|
||||||
3 [0, 1, 2, 3, ]
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
@ -1719,7 +1684,7 @@ mod tests {
|
|||||||
|
|
||||||
let wtxn = index.read_txn().unwrap();
|
let wtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map();
|
let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap();
|
||||||
let ids = map.values().collect::<HashSet<_>>();
|
let ids = map.values().collect::<HashSet<_>>();
|
||||||
|
|
||||||
assert_eq!(ids.len(), map.len());
|
assert_eq!(ids.len(), map.len());
|
||||||
@ -2531,17 +2496,8 @@ mod tests {
|
|||||||
db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
|
db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
|
||||||
db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");
|
db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");
|
||||||
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
|
||||||
|
|
||||||
// Delete not all of the documents but some of them.
|
// Delete not all of the documents but some of them.
|
||||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
index.delete_documents(vec!["0".into(), "3".into()]);
|
||||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
|
||||||
builder.delete_external_id("0");
|
|
||||||
builder.delete_external_id("3");
|
|
||||||
let result = builder.execute().unwrap();
|
|
||||||
println!("{result:?}");
|
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
|
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
|
||||||
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
|
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
|
||||||
@ -2596,8 +2552,7 @@ mod tests {
|
|||||||
),
|
),
|
||||||
]
|
]
|
||||||
*/
|
*/
|
||||||
let mut index = TempIndex::new();
|
let index = TempIndex::new();
|
||||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
|
|
||||||
|
|
||||||
// START OF BATCH
|
// START OF BATCH
|
||||||
|
|
||||||
@ -2637,8 +2592,7 @@ mod tests {
|
|||||||
{"id":1,"doggo":"bernese"}
|
{"id":1,"doggo":"bernese"}
|
||||||
"###);
|
"###);
|
||||||
db_snap!(index, external_documents_ids, @r###"
|
db_snap!(index, external_documents_ids, @r###"
|
||||||
soft:
|
docids:
|
||||||
hard:
|
|
||||||
1 0
|
1 0
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
@ -2683,13 +2637,10 @@ mod tests {
|
|||||||
"###);
|
"###);
|
||||||
|
|
||||||
db_snap!(index, external_documents_ids, @r###"
|
db_snap!(index, external_documents_ids, @r###"
|
||||||
soft:
|
docids:
|
||||||
hard:
|
|
||||||
0 1
|
0 1
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
|
||||||
|
|
||||||
// BATCH 3
|
// BATCH 3
|
||||||
|
|
||||||
println!("--- ENTERING BATCH 3");
|
println!("--- ENTERING BATCH 3");
|
||||||
@ -2731,4 +2682,537 @@ mod tests {
|
|||||||
let res = index.search(&rtxn).execute().unwrap();
|
let res = index.search(&rtxn).execute().unwrap();
|
||||||
index.documents(&rtxn, res.documents_ids).unwrap();
|
index.documents(&rtxn, res.documents_ids).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn delete_documents<'t>(
|
||||||
|
wtxn: &mut RwTxn<'t, '_>,
|
||||||
|
index: &'t TempIndex,
|
||||||
|
external_ids: &[&str],
|
||||||
|
) -> Vec<u32> {
|
||||||
|
let external_document_ids = index.external_documents_ids();
|
||||||
|
let ids_to_delete: Vec<u32> = external_ids
|
||||||
|
.iter()
|
||||||
|
.map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Delete some documents.
|
||||||
|
index.delete_documents_using_wtxn(
|
||||||
|
wtxn,
|
||||||
|
external_ids.iter().map(ToString::to_string).collect(),
|
||||||
|
);
|
||||||
|
|
||||||
|
ids_to_delete
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn delete_documents_with_numbers_as_primary_key() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
index
|
||||||
|
.add_documents_using_wtxn(
|
||||||
|
&mut wtxn,
|
||||||
|
documents!([
|
||||||
|
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
|
||||||
|
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
|
||||||
|
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
|
||||||
|
]),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// delete those documents, ids are synchronous therefore 0, 1, and 2.
|
||||||
|
index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]);
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
// All these snapshots should be empty since the database was cleared
|
||||||
|
db_snap!(index, documents_ids);
|
||||||
|
db_snap!(index, word_docids);
|
||||||
|
db_snap!(index, word_pair_proximity_docids);
|
||||||
|
db_snap!(index, facet_id_exists_docids);
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn delete_documents_with_strange_primary_key() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()]))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
index
|
||||||
|
.add_documents_using_wtxn(
|
||||||
|
&mut wtxn,
|
||||||
|
documents!([
|
||||||
|
{ "mysuperid": 0, "name": "kevin" },
|
||||||
|
{ "mysuperid": 1, "name": "kevina" },
|
||||||
|
{ "mysuperid": 2, "name": "benoit" }
|
||||||
|
]),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
// Delete not all of the documents but some of them.
|
||||||
|
index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]);
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, documents_ids);
|
||||||
|
db_snap!(index, word_docids);
|
||||||
|
db_snap!(index, word_pair_proximity_docids);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn filtered_placeholder_search_should_not_return_deleted_documents() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||||
|
settings.set_primary_key(S("docid"));
|
||||||
|
settings.set_filterable_fields(hashset! { S("label"), S("label2") });
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents_using_wtxn(
|
||||||
|
&mut wtxn,
|
||||||
|
documents!([
|
||||||
|
{ "docid": "1_4", "label": ["sign"] },
|
||||||
|
{ "docid": "1_5", "label": ["letter"] },
|
||||||
|
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||||
|
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||||
|
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||||
|
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||||
|
{ "docid": "1_39", "label": ["abstract"] },
|
||||||
|
{ "docid": "1_40", "label": ["cartoon"] },
|
||||||
|
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||||
|
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||||
|
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||||
|
{ "docid": "1_44", "label": ["drawing"] },
|
||||||
|
{ "docid": "1_45", "label": ["art"] },
|
||||||
|
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||||
|
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||||
|
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||||
|
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||||
|
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||||
|
{ "docid": "1_68", "label": ["design"] },
|
||||||
|
{ "docid": "1_69", "label": ["geometry"] },
|
||||||
|
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||||
|
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||||
|
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||||
|
]),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]);
|
||||||
|
|
||||||
|
// Placeholder search with filter
|
||||||
|
let filter = Filter::from_str("label = sign").unwrap().unwrap();
|
||||||
|
let results = index.search(&wtxn).filter(filter).execute().unwrap();
|
||||||
|
assert!(results.documents_ids.is_empty());
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, word_docids);
|
||||||
|
db_snap!(index, facet_id_f64_docids);
|
||||||
|
db_snap!(index, word_pair_proximity_docids);
|
||||||
|
db_snap!(index, facet_id_exists_docids);
|
||||||
|
db_snap!(index, facet_id_string_docids);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn placeholder_search_should_not_return_deleted_documents() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
index
|
||||||
|
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||||
|
settings.set_primary_key(S("docid"));
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents_using_wtxn(
|
||||||
|
&mut wtxn,
|
||||||
|
documents!([
|
||||||
|
{ "docid": "1_4", "label": ["sign"] },
|
||||||
|
{ "docid": "1_5", "label": ["letter"] },
|
||||||
|
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||||
|
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||||
|
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||||
|
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||||
|
{ "docid": "1_39", "label": ["abstract"] },
|
||||||
|
{ "docid": "1_40", "label": ["cartoon"] },
|
||||||
|
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||||
|
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||||
|
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||||
|
{ "docid": "1_44", "label": ["drawing"] },
|
||||||
|
{ "docid": "1_45", "label": ["art"] },
|
||||||
|
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||||
|
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||||
|
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||||
|
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||||
|
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||||
|
{ "docid": "1_68", "label": ["design"] },
|
||||||
|
{ "docid": "1_69", "label": ["geometry"] },
|
||||||
|
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||||
|
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||||
|
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||||
|
]),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]);
|
||||||
|
|
||||||
|
// Placeholder search
|
||||||
|
let results = index.search(&wtxn).execute().unwrap();
|
||||||
|
assert!(!results.documents_ids.is_empty());
|
||||||
|
for id in results.documents_ids.iter() {
|
||||||
|
assert!(
|
||||||
|
!deleted_internal_ids.contains(id),
|
||||||
|
"The document {} was supposed to be deleted",
|
||||||
|
id
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn search_should_not_return_deleted_documents() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
index
|
||||||
|
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||||
|
settings.set_primary_key(S("docid"));
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents_using_wtxn(
|
||||||
|
&mut wtxn,
|
||||||
|
documents!([
|
||||||
|
{ "docid": "1_4", "label": ["sign"] },
|
||||||
|
{ "docid": "1_5", "label": ["letter"] },
|
||||||
|
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||||
|
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||||
|
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||||
|
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||||
|
{ "docid": "1_39", "label": ["abstract"] },
|
||||||
|
{ "docid": "1_40", "label": ["cartoon"] },
|
||||||
|
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||||
|
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||||
|
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||||
|
{ "docid": "1_44", "label": ["drawing"] },
|
||||||
|
{ "docid": "1_45", "label": ["art"] },
|
||||||
|
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||||
|
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||||
|
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||||
|
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||||
|
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||||
|
{ "docid": "1_68", "label": ["design"] },
|
||||||
|
{ "docid": "1_69", "label": ["geometry"] },
|
||||||
|
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||||
|
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||||
|
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||||
|
]),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
|
||||||
|
|
||||||
|
// search for abstract
|
||||||
|
let results = index.search(&wtxn).query("abstract").execute().unwrap();
|
||||||
|
assert!(!results.documents_ids.is_empty());
|
||||||
|
for id in results.documents_ids.iter() {
|
||||||
|
assert!(
|
||||||
|
!deleted_internal_ids.contains(id),
|
||||||
|
"The document {} was supposed to be deleted",
|
||||||
|
id
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
index
|
||||||
|
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||||
|
settings.set_primary_key(S("id"));
|
||||||
|
settings.set_filterable_fields(hashset!(S("_geo")));
|
||||||
|
settings.set_sortable_fields(hashset!(S("_geo")));
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index.add_documents_using_wtxn(&mut wtxn, documents!([
|
||||||
|
{ "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } },
|
||||||
|
{ "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } },
|
||||||
|
{ "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } },
|
||||||
|
{ "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
|
||||||
|
{ "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } },
|
||||||
|
{ "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } },
|
||||||
|
{ "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } },
|
||||||
|
{ "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } },
|
||||||
|
{ "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } },
|
||||||
|
{ "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } },
|
||||||
|
{ "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } },
|
||||||
|
{ "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } },
|
||||||
|
{ "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } },
|
||||||
|
{ "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } },
|
||||||
|
{ "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } },
|
||||||
|
{ "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } },
|
||||||
|
{ "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } },
|
||||||
|
{ "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } },
|
||||||
|
{ "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } },
|
||||||
|
{ "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } }
|
||||||
|
])).unwrap();
|
||||||
|
|
||||||
|
let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
|
||||||
|
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete);
|
||||||
|
|
||||||
|
// Placeholder search with geo filter
|
||||||
|
let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
|
||||||
|
let results = index.search(&wtxn).filter(filter).execute().unwrap();
|
||||||
|
assert!(!results.documents_ids.is_empty());
|
||||||
|
for id in results.documents_ids.iter() {
|
||||||
|
assert!(
|
||||||
|
!deleted_internal_ids.contains(id),
|
||||||
|
"The document {} was supposed to be deleted",
|
||||||
|
id
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, facet_id_f64_docids);
|
||||||
|
db_snap!(index, facet_id_string_docids);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verifies that once `delete_documents` has removed two documents, none of the
// ids it returned are yielded by `all_documents` or `documents_ids`, and that
// the deleted external ids no longer resolve through `external_documents_ids`.
#[test]
|
||||||
|
fn get_documents_should_not_return_deleted_documents() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
// Settings, indexing, deletion and the first two checks all run inside this
// single write transaction; it is only committed further down.
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
// Declare `docid` as the primary key before adding any document.
index
|
||||||
|
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||||
|
settings.set_primary_key(S("docid"));
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Index 23 documents: 20 carrying a `label` array of strings and 3 carrying a
// `label2` array that mixes a string with a number.
index
|
||||||
|
.add_documents_using_wtxn(
|
||||||
|
&mut wtxn,
|
||||||
|
documents!([
|
||||||
|
{ "docid": "1_4", "label": ["sign"] },
|
||||||
|
{ "docid": "1_5", "label": ["letter"] },
|
||||||
|
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||||
|
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||||
|
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||||
|
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||||
|
{ "docid": "1_39", "label": ["abstract"] },
|
||||||
|
{ "docid": "1_40", "label": ["cartoon"] },
|
||||||
|
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||||
|
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||||
|
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||||
|
{ "docid": "1_44", "label": ["drawing"] },
|
||||||
|
{ "docid": "1_45", "label": ["art"] },
|
||||||
|
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||||
|
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||||
|
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||||
|
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||||
|
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||||
|
{ "docid": "1_68", "label": ["design"] },
|
||||||
|
{ "docid": "1_69", "label": ["geometry"] },
|
||||||
|
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||||
|
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||||
|
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||||
|
]),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Delete two documents by external id. `delete_documents` is a helper defined
// outside this block; its return value is used below as the collection of
// internal ids that must no longer be reachable.
let deleted_external_ids = ["1_7", "1_52"];
|
||||||
|
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids);
|
||||||
|
|
||||||
|
// list all documents
|
||||||
|
let results = index.all_documents(&wtxn).unwrap();
|
||||||
|
for result in results {
|
||||||
|
let (id, _) = result.unwrap();
|
||||||
|
assert!(
|
||||||
|
!deleted_internal_ids.contains(&id),
|
||||||
|
"The document {} was supposed to be deleted",
|
||||||
|
id
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// list internal document ids
|
||||||
|
let results = index.documents_ids(&wtxn).unwrap();
|
||||||
|
for id in results {
|
||||||
|
assert!(
|
||||||
|
!deleted_internal_ids.contains(&id),
|
||||||
|
"The document {} was supposed to be deleted",
|
||||||
|
id
|
||||||
|
);
|
||||||
|
}
|
||||||
|
// Commit so the last check reads the committed state through a separate
// read transaction rather than through the write transaction.
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
// get internal docids from deleted external document ids
|
||||||
|
let results = index.external_documents_ids();
|
||||||
|
// Each deleted external id must now look up to `None`.
for id in deleted_external_ids {
|
||||||
|
assert!(
|
||||||
|
results.get(&rtxn, id).unwrap().is_none(),
|
||||||
|
"The document {} was supposed to be deleted",
|
||||||
|
id
|
||||||
|
);
|
||||||
|
}
|
||||||
|
drop(rtxn);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verifies that `number_of_documents` and `field_distribution` account for a
// deletion: 20 documents are indexed, 2 are deleted, and both statistics are
// read back inside the same write transaction before it is committed.
#[test]
|
||||||
|
fn stats_should_not_return_deleted_documents() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
// Declare `docid` as the primary key before adding any document.
index
|
||||||
|
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||||
|
settings.set_primary_key(S("docid"));
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// 20 documents, all with a `label` field. `title` appears on `1_7` and `1_38`;
// `number` appears on `1_43` and `1_44`.
index.add_documents_using_wtxn(&mut wtxn, documents!([
|
||||||
|
{ "docid": "1_4", "label": ["sign"]},
|
||||||
|
{ "docid": "1_5", "label": ["letter"]},
|
||||||
|
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
|
||||||
|
{ "docid": "1_36", "label": ["drawing","painting","pattern"]},
|
||||||
|
{ "docid": "1_37", "label": ["art","drawing","outdoor"]},
|
||||||
|
{ "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
|
||||||
|
{ "docid": "1_39", "label": ["abstract"]},
|
||||||
|
{ "docid": "1_40", "label": ["cartoon"]},
|
||||||
|
{ "docid": "1_41", "label": ["art","drawing"]},
|
||||||
|
{ "docid": "1_42", "label": ["art","pattern"]},
|
||||||
|
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
|
||||||
|
{ "docid": "1_44", "label": ["drawing"], "number": 44i32},
|
||||||
|
{ "docid": "1_45", "label": ["art"]},
|
||||||
|
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
|
||||||
|
{ "docid": "1_47", "label": ["abstract","pattern"]},
|
||||||
|
{ "docid": "1_52", "label": ["abstract","cartoon"]},
|
||||||
|
{ "docid": "1_57", "label": ["abstract","drawing","pattern"]},
|
||||||
|
{ "docid": "1_58", "label": ["abstract","art","cartoon"]},
|
||||||
|
{ "docid": "1_68", "label": ["design"]},
|
||||||
|
{ "docid": "1_69", "label": ["geometry"]}
|
||||||
|
])).unwrap();
|
||||||
|
|
||||||
|
// Delete `1_7` (which holds one of the two `title` fields) and `1_52`.
// The helper's return value is not needed by this test.
delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
|
||||||
|
|
||||||
|
// count internal documents
|
||||||
|
let results = index.number_of_documents(&wtxn).unwrap();
|
||||||
|
// 20 indexed minus 2 deleted.
assert_eq!(18, results);
|
||||||
|
|
||||||
|
// count field distribution
|
||||||
|
let results = index.field_distribution(&wtxn).unwrap();
|
||||||
|
// `label` was on all 20 documents, so it drops to 18. `title` loses the one
// held by `1_7`. `number` stays at 2 because neither deleted document had it.
assert_eq!(Some(&18), results.get("label"));
|
||||||
|
assert_eq!(Some(&1), results.get("title"));
|
||||||
|
assert_eq!(Some(&2), results.get("number"));
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verifies that the docids stored for a (script, language) pair stop including
// a document once that document has been deleted and the deletion committed.
#[test]
|
||||||
|
fn stored_detected_script_and_language_should_not_return_deleted_documents() {
|
||||||
|
use charabia::{Language, Script};
|
||||||
|
let index = TempIndex::new();
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
// Six documents whose titles use different scripts (Latin, Chinese, Hebrew,
// Japanese, Thai, and a Latin/Chinese mix).
index
|
||||||
|
.add_documents_using_wtxn(
|
||||||
|
&mut wtxn,
|
||||||
|
documents!([
|
||||||
|
{ "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
|
||||||
|
{ "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
|
||||||
|
{ "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
|
||||||
|
{ "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
|
||||||
|
{ "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
|
||||||
|
{ "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Before deletion, the (Cj, Cmn) entry is expected to hold internal docids
// 1 and 5 — presumably the documents with external ids "1" and "5"; the
// external-to-internal id mapping is not visible in this block.
let key_cmn = (Script::Cj, Language::Cmn);
|
||||||
|
let cj_cmn_docs =
|
||||||
|
index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
|
||||||
|
let mut expected_cj_cmn_docids = RoaringBitmap::new();
|
||||||
|
expected_cj_cmn_docids.push(1);
|
||||||
|
expected_cj_cmn_docids.push(5);
|
||||||
|
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
|
||||||
|
|
||||||
|
// Delete external id "1" and commit, then re-read the same entry through a
// read transaction.
delete_documents(&mut wtxn, &index, &["1"]);
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let cj_cmn_docs =
|
||||||
|
index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
|
||||||
|
// Only docid 5 should remain for this (script, language) pair.
let mut expected_cj_cmn_docids = RoaringBitmap::new();
|
||||||
|
expected_cj_cmn_docids.push(5);
|
||||||
|
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verifies that deleting a document whose word lives in an exact attribute
// empties `exact_word_docids`, while `word_docids`, the words FST and search
// results for the remaining document stay intact.
#[test]
|
||||||
|
fn delete_words_exact_attributes() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
// `text` and `exact` are both searchable; `exact` is additionally declared
// as an exact attribute.
index
|
||||||
|
.update_settings(|settings| {
|
||||||
|
settings.set_primary_key(S("id"));
|
||||||
|
settings.set_searchable_fields(vec![S("text"), S("exact")]);
|
||||||
|
settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Both documents contain the word "hello": document 0 in `text`, document 1
// in `exact`.
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{ "id": 0, "text": "hello" },
|
||||||
|
{ "id": 1, "exact": "hello"}
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
// Snapshot set 1 (before deletion): the word is recorded under docid 0 in
// `word_docids` and under docid 1 in `exact_word_docids`.
db_snap!(index, word_docids, 1, @r###"
|
||||||
|
hello [0, ]
|
||||||
|
"###);
|
||||||
|
db_snap!(index, exact_word_docids, 1, @r###"
|
||||||
|
hello [1, ]
|
||||||
|
"###);
|
||||||
|
db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
|
||||||
|
|
||||||
|
// Delete the document with external id "1" (the one using the exact
// attribute) and commit.
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]);
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
// Snapshot set 2 (after deletion): `exact_word_docids` is now empty, while
// `word_docids` and the `words_fst` snapshot are the same as before.
db_snap!(index, word_docids, 2, @r###"
|
||||||
|
hello [0, ]
|
||||||
|
"###);
|
||||||
|
db_snap!(index, exact_word_docids, 2, @"");
|
||||||
|
db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
|
||||||
|
|
||||||
|
// The helper reported internal id 1 as the deleted document.
insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
// The word must still be listed in the words FST.
let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
|
||||||
|
|
||||||
|
// Searching for the word must return only document 0.
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("hello");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/index_documents/mod.rs
|
||||||
|
---
|
||||||
|
[]
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/index_documents/mod.rs
|
||||||
|
---
|
||||||
|
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/index_documents/mod.rs
|
||||||
|
---
|
||||||
|
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/index_documents/mod.rs
|
||||||
|
---
|
||||||
|
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/index_documents/mod.rs
|
||||||
|
---
|
||||||
|
[2, ]
|
@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/index_documents/mod.rs
|
||||||
|
---
|
||||||
|
benoit [2, ]
|
||||||
|
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/index_documents/mod.rs
|
||||||
|
---
|
||||||
|
|
@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/delete_documents.rs
|
source: milli/src/update/index_documents/mod.rs
|
||||||
---
|
---
|
||||||
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
|
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
|
||||||
2 [21, ]
|
2 [21, ]
|
@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/index_documents/mod.rs
|
||||||
|
---
|
||||||
|
2 0 2.2 1 [21, ]
|
||||||
|
|
@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/delete_documents.rs
|
source: milli/src/update/index_documents/mod.rs
|
||||||
---
|
---
|
||||||
1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ]
|
1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ]
|
||||||
1 0 aquarium 1 [5, ]
|
1 0 aquarium 1 [5, ]
|
@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/delete_documents.rs
|
source: milli/src/update/index_documents/mod.rs
|
||||||
---
|
---
|
||||||
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
|
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
|
||||||
2 [21, ]
|
2 [21, ]
|
@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/delete_documents.rs
|
source: milli/src/update/index_documents/mod.rs
|
||||||
---
|
---
|
||||||
1 1 36 [3, ]
|
1 1 36 [3, ]
|
||||||
1 1 37 [4, ]
|
1 1 37 [4, ]
|
@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/delete_documents.rs
|
source: milli/src/update/index_documents/mod.rs
|
||||||
---
|
---
|
||||||
3 0 48.9021 1 [19, ]
|
3 0 48.9021 1 [19, ]
|
||||||
3 0 49.9314 1 [17, ]
|
3 0 49.9314 1 [17, ]
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/index_documents/mod.rs
|
||||||
|
---
|
||||||
|
|
@ -1,60 +1,56 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/index_documents/mod.rs
|
source: milli/src/update/index_documents/mod.rs
|
||||||
---
|
---
|
||||||
0 [1, 7, ]
|
0 [1, ]
|
||||||
1 [2, ]
|
1 [2, ]
|
||||||
10 [1, 7, ]
|
10 [1, ]
|
||||||
12 [0, 8, ]
|
12 [0, ]
|
||||||
1344 [3, ]
|
1344 [3, ]
|
||||||
1813 [8, ]
|
1813 [0, ]
|
||||||
2 [0, 8, ]
|
2 [0, ]
|
||||||
23 [5, ]
|
23 [5, ]
|
||||||
25 [2, ]
|
25 [2, ]
|
||||||
3 [0, 8, ]
|
3 [0, ]
|
||||||
35 [5, ]
|
35 [5, ]
|
||||||
4 [4, 6, ]
|
4 [4, ]
|
||||||
42 [0, 5, 8, ]
|
42 [0, 5, ]
|
||||||
456 [1, 7, ]
|
456 [1, ]
|
||||||
5 [0, 8, ]
|
5 [0, ]
|
||||||
99 [2, ]
|
99 [2, ]
|
||||||
adams [5, ]
|
adams [5, ]
|
||||||
adventure [1, 7, ]
|
adventure [1, ]
|
||||||
alice [2, ]
|
alice [2, ]
|
||||||
and [0, 4, 6, 8, ]
|
and [0, 4, ]
|
||||||
antoine [1, 7, ]
|
antoine [1, ]
|
||||||
austen [8, ]
|
austen [0, ]
|
||||||
austin [0, ]
|
blood [4, ]
|
||||||
blood [4, 6, ]
|
|
||||||
carroll [2, ]
|
carroll [2, ]
|
||||||
de [1, 7, ]
|
de [1, ]
|
||||||
douglas [5, ]
|
douglas [5, ]
|
||||||
exupery [1, 7, ]
|
exupery [1, ]
|
||||||
fantasy [2, 3, 4, 6, ]
|
fantasy [2, 3, 4, ]
|
||||||
galaxy [5, ]
|
galaxy [5, ]
|
||||||
guide [5, ]
|
guide [5, ]
|
||||||
half [4, 6, ]
|
half [4, ]
|
||||||
harry [4, 6, ]
|
harry [4, ]
|
||||||
hitchhiker [5, ]
|
hitchhiker [5, ]
|
||||||
hobbit [3, ]
|
hobbit [3, ]
|
||||||
in [2, ]
|
in [2, ]
|
||||||
j [3, 4, 6, 8, ]
|
j [0, 3, 4, ]
|
||||||
jane [0, ]
|
k [4, ]
|
||||||
k [4, 6, ]
|
|
||||||
le [1, ]
|
|
||||||
lewis [2, ]
|
lewis [2, ]
|
||||||
little [7, ]
|
little [1, ]
|
||||||
petit [1, ]
|
potter [4, ]
|
||||||
potter [4, 6, ]
|
prejudice [0, ]
|
||||||
prejudice [0, 8, ]
|
pride [0, ]
|
||||||
pride [0, 8, ]
|
prince [1, ]
|
||||||
prince [1, 4, 7, ]
|
princess [4, ]
|
||||||
princess [6, ]
|
|
||||||
r [3, ]
|
r [3, ]
|
||||||
romance [0, 8, ]
|
romance [0, ]
|
||||||
rowling [4, 6, ]
|
rowling [4, ]
|
||||||
s [5, ]
|
s [5, ]
|
||||||
saint [1, 7, ]
|
saint [1, ]
|
||||||
the [3, 4, 5, 6, 7, ]
|
the [1, 3, 4, 5, ]
|
||||||
to [5, ]
|
to [5, ]
|
||||||
tolkien [3, ]
|
tolkien [3, ]
|
||||||
wonderland [2, ]
|
wonderland [2, ]
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::hash_map::Entry;
|
use std::collections::btree_map::Entry as BEntry;
|
||||||
|
use std::collections::hash_map::Entry as HEntry;
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{Read, Seek};
|
use std::io::{Read, Seek};
|
||||||
@ -7,18 +8,21 @@ use std::io::{Read, Seek};
|
|||||||
use fxhash::FxHashMap;
|
use fxhash::FxHashMap;
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use obkv::{KvReader, KvWriter};
|
use obkv::{KvReader, KvReaderU16, KvWriter};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use smartstring::SmartString;
|
use smartstring::SmartString;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn,
|
create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions,
|
||||||
|
obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn,
|
||||||
};
|
};
|
||||||
use super::{IndexDocumentsMethod, IndexerConfig};
|
use super::{IndexDocumentsMethod, IndexerConfig};
|
||||||
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
||||||
use crate::error::{Error, InternalError, UserError};
|
use crate::error::{Error, InternalError, UserError};
|
||||||
use crate::index::{db_name, main_key};
|
use crate::index::{db_name, main_key};
|
||||||
|
use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
|
||||||
|
use crate::update::index_documents::GrenadParameters;
|
||||||
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
|
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
|
||||||
use crate::{
|
use crate::{
|
||||||
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
|
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
|
||||||
@ -28,9 +32,6 @@ pub struct TransformOutput {
|
|||||||
pub primary_key: String,
|
pub primary_key: String,
|
||||||
pub fields_ids_map: FieldsIdsMap,
|
pub fields_ids_map: FieldsIdsMap,
|
||||||
pub field_distribution: FieldDistribution,
|
pub field_distribution: FieldDistribution,
|
||||||
pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>,
|
|
||||||
pub new_documents_ids: RoaringBitmap,
|
|
||||||
pub replaced_documents_ids: RoaringBitmap,
|
|
||||||
pub documents_count: usize,
|
pub documents_count: usize,
|
||||||
pub original_documents: File,
|
pub original_documents: File,
|
||||||
pub flattened_documents: File,
|
pub flattened_documents: File,
|
||||||
@ -106,8 +107,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
// We must choose the appropriate merge function for when two or more documents
|
// We must choose the appropriate merge function for when two or more documents
|
||||||
// with the same user id must be merged or fully replaced in the same batch.
|
// with the same user id must be merged or fully replaced in the same batch.
|
||||||
let merge_function = match index_documents_method {
|
let merge_function = match index_documents_method {
|
||||||
IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv,
|
IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions,
|
||||||
IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations,
|
IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions,
|
||||||
};
|
};
|
||||||
|
|
||||||
// We initialize the sorter with the user indexing settings.
|
// We initialize the sorter with the user indexing settings.
|
||||||
@ -130,17 +131,13 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
indexer_settings.max_memory.map(|mem| mem / 2),
|
indexer_settings.max_memory.map(|mem| mem / 2),
|
||||||
);
|
);
|
||||||
let documents_ids = index.documents_ids(wtxn)?;
|
let documents_ids = index.documents_ids(wtxn)?;
|
||||||
let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?;
|
|
||||||
|
|
||||||
Ok(Transform {
|
Ok(Transform {
|
||||||
index,
|
index,
|
||||||
fields_ids_map: index.fields_ids_map(wtxn)?,
|
fields_ids_map: index.fields_ids_map(wtxn)?,
|
||||||
indexer_settings,
|
indexer_settings,
|
||||||
autogenerate_docids,
|
autogenerate_docids,
|
||||||
available_documents_ids: AvailableDocumentsIds::from_documents_ids(
|
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
|
||||||
&documents_ids,
|
|
||||||
&soft_deleted_documents_ids,
|
|
||||||
),
|
|
||||||
original_sorter,
|
original_sorter,
|
||||||
flattened_sorter,
|
flattened_sorter,
|
||||||
index_documents_method,
|
index_documents_method,
|
||||||
@ -151,6 +148,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[logging_timer::time]
|
||||||
pub fn read_documents<R, FP, FA>(
|
pub fn read_documents<R, FP, FA>(
|
||||||
&mut self,
|
&mut self,
|
||||||
reader: EnrichedDocumentsBatchReader<R>,
|
reader: EnrichedDocumentsBatchReader<R>,
|
||||||
@ -163,8 +161,10 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
FP: Fn(UpdateIndexingStep) + Sync,
|
FP: Fn(UpdateIndexingStep) + Sync,
|
||||||
FA: Fn() -> bool + Sync,
|
FA: Fn() -> bool + Sync,
|
||||||
{
|
{
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
|
let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
|
||||||
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
let external_documents_ids = self.index.external_documents_ids();
|
||||||
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
|
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
|
||||||
|
|
||||||
let primary_key = cursor.primary_key().to_string();
|
let primary_key = cursor.primary_key().to_string();
|
||||||
@ -172,7 +172,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
|
self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
|
||||||
|
|
||||||
let mut obkv_buffer = Vec::new();
|
let mut obkv_buffer = Vec::new();
|
||||||
let mut document_sorter_buffer = Vec::new();
|
let mut document_sorter_value_buffer = Vec::new();
|
||||||
|
let mut document_sorter_key_buffer = Vec::new();
|
||||||
let mut documents_count = 0;
|
let mut documents_count = 0;
|
||||||
let mut docid_buffer: Vec<u8> = Vec::new();
|
let mut docid_buffer: Vec<u8> = Vec::new();
|
||||||
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
|
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
|
||||||
@ -213,29 +214,30 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2));
|
field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2));
|
||||||
|
|
||||||
// Build the new obkv document.
|
// Build the new obkv document.
|
||||||
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
|
let mut writer = KvWriter::new(&mut obkv_buffer);
|
||||||
for (k, v) in field_buffer_cache.iter() {
|
for (k, v) in field_buffer_cache.iter() {
|
||||||
writer.insert(*k, v)?;
|
writer.insert(*k, v)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut original_docid = None;
|
let mut original_docid = None;
|
||||||
|
|
||||||
let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
|
let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
|
||||||
Entry::Occupied(entry) => *entry.get() as u32,
|
HEntry::Occupied(entry) => *entry.get() as u32,
|
||||||
Entry::Vacant(entry) => {
|
HEntry::Vacant(entry) => {
|
||||||
// If the document was already in the db we mark it as a replaced document.
|
let docid = match external_documents_ids.get(wtxn, entry.key())? {
|
||||||
// It'll be deleted later.
|
Some(docid) => {
|
||||||
if let Some(docid) = external_documents_ids.get(entry.key()) {
|
|
||||||
// If it was already in the list of replaced documents it means it was deleted
|
// If it was already in the list of replaced documents it means it was deleted
|
||||||
// by the remove_document method. We should starts as if it never existed.
|
// by the remove_document method. We should starts as if it never existed.
|
||||||
if self.replaced_documents_ids.insert(docid) {
|
if self.replaced_documents_ids.insert(docid) {
|
||||||
original_docid = Some(docid);
|
original_docid = Some(docid);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
docid
|
||||||
}
|
}
|
||||||
let docid = self
|
None => self
|
||||||
.available_documents_ids
|
.available_documents_ids
|
||||||
.next()
|
.next()
|
||||||
.ok_or(UserError::DocumentLimitReached)?;
|
.ok_or(UserError::DocumentLimitReached)?,
|
||||||
|
};
|
||||||
entry.insert(docid as u64);
|
entry.insert(docid as u64);
|
||||||
docid
|
docid
|
||||||
}
|
}
|
||||||
@ -263,47 +265,68 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
skip_insertion = true;
|
skip_insertion = true;
|
||||||
} else {
|
} else {
|
||||||
// we associate the base document with the new key, everything will get merged later.
|
// we associate the base document with the new key, everything will get merged later.
|
||||||
document_sorter_buffer.clear();
|
let deladd_operation = match self.index_documents_method {
|
||||||
document_sorter_buffer.push(Operation::Addition as u8);
|
IndexDocumentsMethod::UpdateDocuments => {
|
||||||
document_sorter_buffer.extend_from_slice(base_obkv);
|
DelAddOperation::DeletionAndAddition
|
||||||
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
}
|
||||||
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
|
IndexDocumentsMethod::ReplaceDocuments => DelAddOperation::Deletion,
|
||||||
Some(flattened_obkv) => {
|
};
|
||||||
|
document_sorter_key_buffer.clear();
|
||||||
|
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||||
|
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
|
||||||
|
document_sorter_value_buffer.clear();
|
||||||
|
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(base_obkv),
|
||||||
|
deladd_operation,
|
||||||
|
&mut document_sorter_value_buffer,
|
||||||
|
)?;
|
||||||
|
self.original_sorter
|
||||||
|
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||||
|
let base_obkv = KvReader::new(base_obkv);
|
||||||
|
if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? {
|
||||||
// we recreate our buffer with the flattened documents
|
// we recreate our buffer with the flattened documents
|
||||||
document_sorter_buffer.clear();
|
document_sorter_value_buffer.clear();
|
||||||
document_sorter_buffer.push(Operation::Addition as u8);
|
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||||
document_sorter_buffer.extend_from_slice(&flattened_obkv);
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&flattened_obkv),
|
||||||
|
deladd_operation,
|
||||||
|
&mut document_sorter_value_buffer,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
self.flattened_sorter
|
self.flattened_sorter
|
||||||
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
|
.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||||
}
|
|
||||||
None => self
|
|
||||||
.flattened_sorter
|
|
||||||
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !skip_insertion {
|
if !skip_insertion {
|
||||||
self.new_documents_ids.insert(docid);
|
self.new_documents_ids.insert(docid);
|
||||||
|
|
||||||
document_sorter_buffer.clear();
|
document_sorter_key_buffer.clear();
|
||||||
document_sorter_buffer.push(Operation::Addition as u8);
|
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||||
document_sorter_buffer.extend_from_slice(&obkv_buffer);
|
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
|
||||||
|
document_sorter_value_buffer.clear();
|
||||||
|
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&obkv_buffer),
|
||||||
|
DelAddOperation::Addition,
|
||||||
|
&mut document_sorter_value_buffer,
|
||||||
|
)?;
|
||||||
// We use the extracted/generated user id as the key for this document.
|
// We use the extracted/generated user id as the key for this document.
|
||||||
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
self.original_sorter
|
||||||
|
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||||
|
|
||||||
match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
|
let flattened_obkv = KvReader::new(&obkv_buffer);
|
||||||
Some(flattened_obkv) => {
|
if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
|
||||||
document_sorter_buffer.clear();
|
document_sorter_value_buffer.clear();
|
||||||
document_sorter_buffer.push(Operation::Addition as u8);
|
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||||
document_sorter_buffer.extend_from_slice(&flattened_obkv);
|
into_del_add_obkv(
|
||||||
self.flattened_sorter
|
KvReaderU16::new(&obkv),
|
||||||
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
|
DelAddOperation::Addition,
|
||||||
}
|
&mut document_sorter_value_buffer,
|
||||||
None => self
|
)?
|
||||||
.flattened_sorter
|
|
||||||
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
|
|
||||||
}
|
}
|
||||||
|
self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||||
}
|
}
|
||||||
documents_count += 1;
|
documents_count += 1;
|
||||||
|
|
||||||
@ -338,6 +361,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
/// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db,
|
/// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db,
|
||||||
/// it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids.
|
/// it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids.
|
||||||
/// - If the document to remove was not present in either the db or the transform we do nothing.
|
/// - If the document to remove was not present in either the db or the transform we do nothing.
|
||||||
|
#[logging_timer::time]
|
||||||
pub fn remove_documents<FA>(
|
pub fn remove_documents<FA>(
|
||||||
&mut self,
|
&mut self,
|
||||||
mut to_remove: Vec<String>,
|
mut to_remove: Vec<String>,
|
||||||
@ -347,54 +371,176 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
where
|
where
|
||||||
FA: Fn() -> bool + Sync,
|
FA: Fn() -> bool + Sync,
|
||||||
{
|
{
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
// there may be duplicates in the documents to remove.
|
// there may be duplicates in the documents to remove.
|
||||||
to_remove.sort_unstable();
|
to_remove.sort_unstable();
|
||||||
to_remove.dedup();
|
to_remove.dedup();
|
||||||
|
|
||||||
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
let external_documents_ids = self.index.external_documents_ids();
|
||||||
|
|
||||||
let mut documents_deleted = 0;
|
let mut documents_deleted = 0;
|
||||||
|
let mut document_sorter_value_buffer = Vec::new();
|
||||||
|
let mut document_sorter_key_buffer = Vec::new();
|
||||||
for to_remove in to_remove {
|
for to_remove in to_remove {
|
||||||
if should_abort() {
|
if should_abort() {
|
||||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if the document has been added in the current indexing process.
|
||||||
|
let deleted_from_current =
|
||||||
match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
|
match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
|
||||||
// if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
|
// if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
|
||||||
Entry::Occupied(entry) => {
|
HEntry::Occupied(entry) => {
|
||||||
let doc_id = *entry.get() as u32;
|
let docid = *entry.get() as u32;
|
||||||
|
// Key is the concatenation of the internal docid and the external one.
|
||||||
|
document_sorter_key_buffer.clear();
|
||||||
|
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||||
|
document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes());
|
||||||
|
document_sorter_value_buffer.clear();
|
||||||
|
document_sorter_value_buffer.push(Operation::Deletion as u8);
|
||||||
|
obkv::KvWriterU16::new(&mut document_sorter_value_buffer).finish().unwrap();
|
||||||
self.original_sorter
|
self.original_sorter
|
||||||
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
|
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||||
self.flattened_sorter
|
self.flattened_sorter
|
||||||
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
|
.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||||
|
|
||||||
// we must NOT update the list of replaced_documents_ids
|
// we must NOT update the list of replaced_documents_ids
|
||||||
// Either:
|
// Either:
|
||||||
// 1. It's already in it and there is nothing to do
|
// 1. It's already in it and there is nothing to do
|
||||||
// 2. It wasn't in it because the document was created by a previous batch and since
|
// 2. It wasn't in it because the document was created by a previous batch and since
|
||||||
// we're removing it there is nothing to do.
|
// we're removing it there is nothing to do.
|
||||||
self.new_documents_ids.remove(doc_id);
|
self.new_documents_ids.remove(docid);
|
||||||
entry.remove_entry();
|
entry.remove_entry();
|
||||||
|
true
|
||||||
}
|
}
|
||||||
Entry::Vacant(entry) => {
|
HEntry::Vacant(_) => false,
|
||||||
// If the document was already in the db we mark it as a `to_delete` document.
|
|
||||||
// It'll be deleted later. We don't need to push anything to the sorters.
|
|
||||||
if let Some(docid) = external_documents_ids.get(entry.key()) {
|
|
||||||
self.replaced_documents_ids.insert(docid);
|
|
||||||
} else {
|
|
||||||
// if the document is nowehere to be found, there is nothing to do and we must NOT
|
|
||||||
// increment the count of documents_deleted
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// If the document was already in the db we mark it as a `to_delete` document.
|
||||||
|
// Then we push the document in sorters in deletion mode.
|
||||||
|
let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? {
|
||||||
|
Some(docid) => {
|
||||||
|
self.remove_document_from_db(
|
||||||
|
docid,
|
||||||
|
to_remove,
|
||||||
|
wtxn,
|
||||||
|
&mut document_sorter_key_buffer,
|
||||||
|
&mut document_sorter_value_buffer,
|
||||||
|
)?;
|
||||||
|
true
|
||||||
|
}
|
||||||
|
None => false,
|
||||||
|
};
|
||||||
|
|
||||||
|
// increase counter only if the document existed somewhere before.
|
||||||
|
if deleted_from_current || deleted_from_db {
|
||||||
|
documents_deleted += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(documents_deleted)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Removes documents from db using their internal document ids.
|
||||||
|
///
|
||||||
|
/// # Warning
|
||||||
|
///
|
||||||
|
/// This function is dangerous and will only work correctly if:
|
||||||
|
///
|
||||||
|
/// - All the passed ids currently exist in the database
|
||||||
|
/// - No batching using the standards `remove_documents` and `add_documents` took place
|
||||||
|
///
|
||||||
|
/// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
|
||||||
|
#[logging_timer::time]
|
||||||
|
pub fn remove_documents_from_db_no_batch<FA>(
|
||||||
|
&mut self,
|
||||||
|
to_remove: &RoaringBitmap,
|
||||||
|
wtxn: &mut heed::RwTxn,
|
||||||
|
should_abort: FA,
|
||||||
|
) -> Result<usize>
|
||||||
|
where
|
||||||
|
FA: Fn() -> bool + Sync,
|
||||||
|
{
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
|
let mut documents_deleted = 0;
|
||||||
|
let mut document_sorter_value_buffer = Vec::new();
|
||||||
|
let mut document_sorter_key_buffer = Vec::new();
|
||||||
|
let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?;
|
||||||
|
|
||||||
|
for (internal_docid, external_docid) in to_remove.iter().zip(external_ids) {
|
||||||
|
let external_docid = external_docid?;
|
||||||
|
if should_abort() {
|
||||||
|
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||||
|
}
|
||||||
|
self.remove_document_from_db(
|
||||||
|
internal_docid,
|
||||||
|
external_docid,
|
||||||
|
wtxn,
|
||||||
|
&mut document_sorter_key_buffer,
|
||||||
|
&mut document_sorter_value_buffer,
|
||||||
|
)?;
|
||||||
|
|
||||||
documents_deleted += 1;
|
documents_deleted += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(documents_deleted)
|
Ok(documents_deleted)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn remove_document_from_db(
|
||||||
|
&mut self,
|
||||||
|
internal_docid: u32,
|
||||||
|
external_docid: String,
|
||||||
|
txn: &heed::RoTxn,
|
||||||
|
document_sorter_key_buffer: &mut Vec<u8>,
|
||||||
|
document_sorter_value_buffer: &mut Vec<u8>,
|
||||||
|
) -> Result<()> {
|
||||||
|
self.replaced_documents_ids.insert(internal_docid);
|
||||||
|
|
||||||
|
// fetch the obkv document
|
||||||
|
let original_key = BEU32::new(internal_docid);
|
||||||
|
let base_obkv = self
|
||||||
|
.index
|
||||||
|
.documents
|
||||||
|
.remap_data_type::<heed::types::ByteSlice>()
|
||||||
|
.get(txn, &original_key)?
|
||||||
|
.ok_or(InternalError::DatabaseMissingEntry {
|
||||||
|
db_name: db_name::DOCUMENTS,
|
||||||
|
key: None,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Key is the concatenation of the internal docid and the external one.
|
||||||
|
document_sorter_key_buffer.clear();
|
||||||
|
document_sorter_key_buffer.extend_from_slice(&internal_docid.to_be_bytes());
|
||||||
|
document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes());
|
||||||
|
// push it as to delete in the original_sorter
|
||||||
|
document_sorter_value_buffer.clear();
|
||||||
|
document_sorter_value_buffer.push(Operation::Deletion as u8);
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(base_obkv),
|
||||||
|
DelAddOperation::Deletion,
|
||||||
|
document_sorter_value_buffer,
|
||||||
|
)?;
|
||||||
|
self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||||
|
|
||||||
|
// flatten it and push it as to delete in the flattened_sorter
|
||||||
|
let flattened_obkv = KvReader::new(base_obkv);
|
||||||
|
if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
|
||||||
|
// we recreate our buffer with the flattened documents
|
||||||
|
document_sorter_value_buffer.clear();
|
||||||
|
document_sorter_value_buffer.push(Operation::Deletion as u8);
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&obkv),
|
||||||
|
DelAddOperation::Deletion,
|
||||||
|
document_sorter_value_buffer,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
self.flattened_sorter
|
||||||
|
.insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
// Flatten a document from the fields ids map contained in self and insert the new
|
// Flatten a document from the fields ids map contained in self and insert the new
|
||||||
// created fields. Returns `None` if the document doesn't need to be flattened.
|
// created fields. Returns `None` if the document doesn't need to be flattened.
|
||||||
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
|
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
|
||||||
@ -514,42 +660,10 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn remove_deleted_documents_from_field_distribution(
|
|
||||||
&self,
|
|
||||||
rtxn: &RoTxn,
|
|
||||||
field_distribution: &mut FieldDistribution,
|
|
||||||
) -> Result<()> {
|
|
||||||
for deleted_docid in self.replaced_documents_ids.iter() {
|
|
||||||
let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or(
|
|
||||||
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
|
|
||||||
)?;
|
|
||||||
|
|
||||||
for (key, _) in obkv.iter() {
|
|
||||||
let name =
|
|
||||||
self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
|
|
||||||
field_id: key,
|
|
||||||
process: "Computing field distribution in transform.",
|
|
||||||
})?;
|
|
||||||
// We checked that the document was in the db earlier. If we can't find it it means
|
|
||||||
// there is an inconsistency between the field distribution and the field id map.
|
|
||||||
let field =
|
|
||||||
field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId {
|
|
||||||
field_id: key,
|
|
||||||
process: "Accessing field distribution in transform.",
|
|
||||||
})?;
|
|
||||||
*field -= 1;
|
|
||||||
if *field == 0 {
|
|
||||||
// since we were able to get the field right before it's safe to unwrap here
|
|
||||||
field_distribution.remove(name).unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Generate the `TransformOutput` based on the given sorter that can be generated from any
|
/// Generate the `TransformOutput` based on the given sorter that can be generated from any
|
||||||
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
|
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
|
||||||
/// id for the user side and the value must be an obkv where keys are valid fields ids.
|
/// id for the user side and the value must be an obkv where keys are valid fields ids.
|
||||||
|
#[logging_timer::time]
|
||||||
pub(crate) fn output_from_sorter<F>(
|
pub(crate) fn output_from_sorter<F>(
|
||||||
self,
|
self,
|
||||||
wtxn: &mut heed::RwTxn,
|
wtxn: &mut heed::RwTxn,
|
||||||
@ -581,17 +695,13 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
// 2. Add all the new documents to the field distribution
|
// 2. Add all the new documents to the field distribution
|
||||||
let mut field_distribution = self.index.field_distribution(wtxn)?;
|
let mut field_distribution = self.index.field_distribution(wtxn)?;
|
||||||
|
|
||||||
self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?;
|
|
||||||
|
|
||||||
// Here we are going to do the document count + field distribution + `write_into_stream_writer`
|
// Here we are going to do the document count + field distribution + `write_into_stream_writer`
|
||||||
let mut iter = self.original_sorter.into_stream_merger_iter()?;
|
let mut iter = self.original_sorter.into_stream_merger_iter()?;
|
||||||
// used only for the callback
|
// used only for the callback
|
||||||
let mut documents_count = 0;
|
let mut documents_count = 0;
|
||||||
|
|
||||||
while let Some((key, val)) = iter.next()? {
|
while let Some((key, val)) = iter.next()? {
|
||||||
if val[0] == Operation::Deletion as u8 {
|
// skip first byte corresponding to the operation type (Deletion or Addition).
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let val = &val[1..];
|
let val = &val[1..];
|
||||||
|
|
||||||
// send a callback to show at which step we are
|
// send a callback to show at which step we are
|
||||||
@ -601,17 +711,52 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
total_documents: self.documents_count,
|
total_documents: self.documents_count,
|
||||||
});
|
});
|
||||||
|
|
||||||
// We increment all the field of the current document in the field distribution.
|
for (key, value) in KvReader::new(val) {
|
||||||
let obkv = KvReader::new(val);
|
let reader = KvReaderDelAdd::new(value);
|
||||||
|
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||||
for (key, _) in obkv.iter() {
|
(None, None) => {}
|
||||||
let name =
|
(None, Some(_)) => {
|
||||||
self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
|
// New field
|
||||||
|
let name = self.fields_ids_map.name(key).ok_or(
|
||||||
|
FieldIdMapMissingEntry::FieldId {
|
||||||
field_id: key,
|
field_id: key,
|
||||||
process: "Computing field distribution in transform.",
|
process: "Computing field distribution in transform.",
|
||||||
})?;
|
},
|
||||||
|
)?;
|
||||||
*field_distribution.entry(name.to_string()).or_insert(0) += 1;
|
*field_distribution.entry(name.to_string()).or_insert(0) += 1;
|
||||||
}
|
}
|
||||||
|
(Some(_), None) => {
|
||||||
|
// Field removed
|
||||||
|
let name = self.fields_ids_map.name(key).ok_or(
|
||||||
|
FieldIdMapMissingEntry::FieldId {
|
||||||
|
field_id: key,
|
||||||
|
process: "Computing field distribution in transform.",
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
match field_distribution.entry(name.to_string()) {
|
||||||
|
BEntry::Vacant(_) => { /* Bug? trying to remove a non-existing field */
|
||||||
|
}
|
||||||
|
BEntry::Occupied(mut entry) => {
|
||||||
|
// attempt to remove one
|
||||||
|
match entry.get_mut().checked_sub(1) {
|
||||||
|
Some(0) => {
|
||||||
|
entry.remove();
|
||||||
|
}
|
||||||
|
Some(new_val) => {
|
||||||
|
*entry.get_mut() = new_val;
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
unreachable!("Attempting to remove a field that wasn't in the field distribution")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(Some(_), Some(_)) => {
|
||||||
|
// Value change, no field distribution change
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
writer.insert(key, val)?;
|
writer.insert(key, val)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -631,9 +776,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
// We get rids of the `Operation` byte and skip the deleted documents as well.
|
// We get rids of the `Operation` byte and skip the deleted documents as well.
|
||||||
let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
|
let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
|
||||||
while let Some((key, val)) = iter.next()? {
|
while let Some((key, val)) = iter.next()? {
|
||||||
if val[0] == Operation::Deletion as u8 {
|
// skip first byte corresponding to the operation type (Deletion or Addition).
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let val = &val[1..];
|
let val = &val[1..];
|
||||||
writer.insert(key, val)?;
|
writer.insert(key, val)?;
|
||||||
}
|
}
|
||||||
@ -649,15 +792,11 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| {
|
new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| {
|
||||||
fst_new_external_documents_ids_builder.insert(key, value)
|
fst_new_external_documents_ids_builder.insert(key, value)
|
||||||
})?;
|
})?;
|
||||||
let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map();
|
|
||||||
|
|
||||||
Ok(TransformOutput {
|
Ok(TransformOutput {
|
||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map: self.fields_ids_map,
|
fields_ids_map: self.fields_ids_map,
|
||||||
field_distribution,
|
field_distribution,
|
||||||
new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(),
|
|
||||||
new_documents_ids: self.new_documents_ids,
|
|
||||||
replaced_documents_ids: self.replaced_documents_ids,
|
|
||||||
documents_count: self.documents_count,
|
documents_count: self.documents_count,
|
||||||
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
|
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
|
||||||
flattened_documents: flattened_documents
|
flattened_documents: flattened_documents
|
||||||
@ -687,37 +826,41 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
.to_string();
|
.to_string();
|
||||||
let field_distribution = self.index.field_distribution(wtxn)?;
|
let field_distribution = self.index.field_distribution(wtxn)?;
|
||||||
|
|
||||||
// Delete the soft deleted document ids from the maps inside the external_document_ids structure
|
|
||||||
let new_external_documents_ids = {
|
|
||||||
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
|
||||||
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
|
|
||||||
// This call should be free and can't fail since the previous method merged both fsts.
|
|
||||||
external_documents_ids.into_static().to_fst()?.into_owned()
|
|
||||||
};
|
|
||||||
|
|
||||||
let documents_ids = self.index.documents_ids(wtxn)?;
|
let documents_ids = self.index.documents_ids(wtxn)?;
|
||||||
let documents_count = documents_ids.len() as usize;
|
let documents_count = documents_ids.len() as usize;
|
||||||
|
|
||||||
// We create a final writer to write the new documents in order from the sorter.
|
// We initialize the sorter with the user indexing settings.
|
||||||
let mut original_writer = create_writer(
|
let mut original_sorter = create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
keep_first,
|
||||||
self.indexer_settings.chunk_compression_type,
|
self.indexer_settings.chunk_compression_type,
|
||||||
self.indexer_settings.chunk_compression_level,
|
self.indexer_settings.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
self.indexer_settings.max_nb_chunks,
|
||||||
|
self.indexer_settings.max_memory.map(|mem| mem / 2),
|
||||||
);
|
);
|
||||||
|
|
||||||
// We create a final writer to write the new documents in order from the sorter.
|
// We initialize the sorter with the user indexing settings.
|
||||||
let mut flattened_writer = create_writer(
|
let mut flattened_sorter = create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
keep_first,
|
||||||
self.indexer_settings.chunk_compression_type,
|
self.indexer_settings.chunk_compression_type,
|
||||||
self.indexer_settings.chunk_compression_level,
|
self.indexer_settings.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
self.indexer_settings.max_nb_chunks,
|
||||||
|
self.indexer_settings.max_memory.map(|mem| mem / 2),
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut obkv_buffer = Vec::new();
|
let mut obkv_buffer = Vec::new();
|
||||||
for result in self.index.all_documents(wtxn)? {
|
let mut document_sorter_key_buffer = Vec::new();
|
||||||
let (docid, obkv) = result?;
|
let mut document_sorter_value_buffer = Vec::new();
|
||||||
|
for result in self.index.external_documents_ids().iter(wtxn)? {
|
||||||
|
let (external_id, docid) = result?;
|
||||||
|
let obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
|
||||||
|
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
|
||||||
|
)?;
|
||||||
|
let docid = docid.get();
|
||||||
|
|
||||||
obkv_buffer.clear();
|
obkv_buffer.clear();
|
||||||
let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
|
let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer);
|
||||||
|
|
||||||
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
|
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
|
||||||
for (id, name) in new_fields_ids_map.iter() {
|
for (id, name) in new_fields_ids_map.iter() {
|
||||||
@ -727,7 +870,17 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let buffer = obkv_writer.into_inner()?;
|
let buffer = obkv_writer.into_inner()?;
|
||||||
original_writer.insert(docid.to_be_bytes(), &buffer)?;
|
|
||||||
|
document_sorter_key_buffer.clear();
|
||||||
|
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||||
|
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
|
||||||
|
document_sorter_value_buffer.clear();
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(buffer),
|
||||||
|
DelAddOperation::Addition,
|
||||||
|
&mut document_sorter_value_buffer,
|
||||||
|
)?;
|
||||||
|
original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||||
|
|
||||||
// Once we have the document. We're going to flatten it
|
// Once we have the document. We're going to flatten it
|
||||||
// and insert it in the flattened sorter.
|
// and insert it in the flattened sorter.
|
||||||
@ -762,29 +915,34 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||||
writer.insert(fid, &value)?;
|
writer.insert(fid, &value)?;
|
||||||
}
|
}
|
||||||
flattened_writer.insert(docid.to_be_bytes(), &buffer)?;
|
document_sorter_value_buffer.clear();
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&buffer),
|
||||||
|
DelAddOperation::Addition,
|
||||||
|
&mut document_sorter_value_buffer,
|
||||||
|
)?;
|
||||||
|
flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Once we have written all the documents, we extract
|
let grenad_params = GrenadParameters {
|
||||||
// the file and reset the seek to be able to read it again.
|
chunk_compression_type: self.indexer_settings.chunk_compression_type,
|
||||||
let mut original_documents = original_writer.into_inner()?;
|
chunk_compression_level: self.indexer_settings.chunk_compression_level,
|
||||||
original_documents.rewind()?;
|
max_memory: self.indexer_settings.max_memory,
|
||||||
|
max_nb_chunks: self.indexer_settings.max_nb_chunks, // default value, may be chosen.
|
||||||
|
};
|
||||||
|
|
||||||
let mut flattened_documents = flattened_writer.into_inner()?;
|
// Once we have written all the documents, we merge everything into a Reader.
|
||||||
flattened_documents.rewind()?;
|
let original_documents = sorter_into_reader(original_sorter, grenad_params)?;
|
||||||
|
|
||||||
|
let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?;
|
||||||
|
|
||||||
let output = TransformOutput {
|
let output = TransformOutput {
|
||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map: new_fields_ids_map,
|
fields_ids_map: new_fields_ids_map,
|
||||||
field_distribution,
|
field_distribution,
|
||||||
new_external_documents_ids,
|
|
||||||
new_documents_ids: documents_ids,
|
|
||||||
replaced_documents_ids: RoaringBitmap::default(),
|
|
||||||
documents_count,
|
documents_count,
|
||||||
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
|
original_documents: original_documents.into_inner().into_inner(),
|
||||||
flattened_documents: flattened_documents
|
flattened_documents: flattened_documents.into_inner().into_inner(),
|
||||||
.into_inner()
|
|
||||||
.map_err(|err| err.into_error())?,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let new_facets = output.compute_real_facets(wtxn, self.index)?;
|
let new_facets = output.compute_real_facets(wtxn, self.index)?;
|
||||||
@ -828,38 +986,111 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn merge_obkvs() {
|
fn merge_obkvs() {
|
||||||
let mut doc_0 = Vec::new();
|
let mut additive_doc_0 = Vec::new();
|
||||||
let mut kv_writer = KvWriter::new(&mut doc_0);
|
let mut deletive_doc_0 = Vec::new();
|
||||||
|
let mut del_add_doc_0 = Vec::new();
|
||||||
|
let mut kv_writer = KvWriter::memory();
|
||||||
kv_writer.insert(0_u8, [0]).unwrap();
|
kv_writer.insert(0_u8, [0]).unwrap();
|
||||||
kv_writer.finish().unwrap();
|
let buffer = kv_writer.into_inner().unwrap();
|
||||||
doc_0.insert(0, Operation::Addition as u8);
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&buffer),
|
||||||
let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap();
|
DelAddOperation::Addition,
|
||||||
assert_eq!(*ret, doc_0);
|
&mut additive_doc_0,
|
||||||
|
|
||||||
let ret = merge_obkvs_and_operations(
|
|
||||||
&[],
|
|
||||||
&[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())],
|
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(*ret, doc_0);
|
additive_doc_0.insert(0, Operation::Addition as u8);
|
||||||
|
into_del_add_obkv(
|
||||||
let ret = merge_obkvs_and_operations(
|
KvReaderU16::new(&buffer),
|
||||||
&[],
|
DelAddOperation::Deletion,
|
||||||
&[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())],
|
&mut deletive_doc_0,
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(*ret, [Operation::Deletion as u8]);
|
deletive_doc_0.insert(0, Operation::Deletion as u8);
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&buffer),
|
||||||
|
DelAddOperation::DeletionAndAddition,
|
||||||
|
&mut del_add_doc_0,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
del_add_doc_0.insert(0, Operation::Addition as u8);
|
||||||
|
|
||||||
let ret = merge_obkvs_and_operations(
|
let mut additive_doc_1 = Vec::new();
|
||||||
|
let mut kv_writer = KvWriter::memory();
|
||||||
|
kv_writer.insert(1_u8, [1]).unwrap();
|
||||||
|
let buffer = kv_writer.into_inner().unwrap();
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&buffer),
|
||||||
|
DelAddOperation::Addition,
|
||||||
|
&mut additive_doc_1,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
additive_doc_1.insert(0, Operation::Addition as u8);
|
||||||
|
|
||||||
|
let mut additive_doc_0_1 = Vec::new();
|
||||||
|
let mut kv_writer = KvWriter::memory();
|
||||||
|
kv_writer.insert(0_u8, [0]).unwrap();
|
||||||
|
kv_writer.insert(1_u8, [1]).unwrap();
|
||||||
|
let buffer = kv_writer.into_inner().unwrap();
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&buffer),
|
||||||
|
DelAddOperation::Addition,
|
||||||
|
&mut additive_doc_0_1,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
additive_doc_0_1.insert(0, Operation::Addition as u8);
|
||||||
|
|
||||||
|
let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(*ret, additive_doc_0);
|
||||||
|
|
||||||
|
let ret = obkvs_merge_additions_and_deletions(
|
||||||
|
&[],
|
||||||
|
&[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(*ret, del_add_doc_0);
|
||||||
|
|
||||||
|
let ret = obkvs_merge_additions_and_deletions(
|
||||||
|
&[],
|
||||||
|
&[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(*ret, deletive_doc_0);
|
||||||
|
|
||||||
|
let ret = obkvs_merge_additions_and_deletions(
|
||||||
&[],
|
&[],
|
||||||
&[
|
&[
|
||||||
Cow::from([Operation::Addition as u8, 1].as_slice()),
|
Cow::from(additive_doc_1.as_slice()),
|
||||||
Cow::from([Operation::Deletion as u8].as_slice()),
|
Cow::from(deletive_doc_0.as_slice()),
|
||||||
Cow::from(doc_0.as_slice()),
|
Cow::from(additive_doc_0.as_slice()),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(*ret, doc_0);
|
assert_eq!(*ret, del_add_doc_0);
|
||||||
|
|
||||||
|
let ret = obkvs_merge_additions_and_deletions(
|
||||||
|
&[],
|
||||||
|
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(*ret, additive_doc_0_1);
|
||||||
|
|
||||||
|
let ret = obkvs_keep_last_addition_merge_deletions(
|
||||||
|
&[],
|
||||||
|
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(*ret, additive_doc_0);
|
||||||
|
|
||||||
|
let ret = obkvs_keep_last_addition_merge_deletions(
|
||||||
|
&[],
|
||||||
|
&[
|
||||||
|
Cow::from(deletive_doc_0.as_slice()),
|
||||||
|
Cow::from(additive_doc_1.as_slice()),
|
||||||
|
Cow::from(additive_doc_0.as_slice()),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(*ret, del_add_doc_0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
use std::borrow::Cow;
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufReader};
|
use std::io::{self, BufReader};
|
||||||
@ -9,32 +8,40 @@ use charabia::{Language, Script};
|
|||||||
use grenad::MergerBuilder;
|
use grenad::MergerBuilder;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use heed::RwTxn;
|
use heed::RwTxn;
|
||||||
|
use log::error;
|
||||||
|
use obkv::{KvReader, KvWriter};
|
||||||
|
use ordered_float::OrderedFloat;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
|
self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values,
|
||||||
|
valid_lmdb_key, CursorClonableMmap,
|
||||||
};
|
};
|
||||||
use super::{ClonableMmap, MergeFn};
|
use super::{ClonableMmap, MergeFn};
|
||||||
use crate::distance::NDotProductPoint;
|
use crate::distance::NDotProductPoint;
|
||||||
use crate::error::UserError;
|
use crate::error::UserError;
|
||||||
|
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
|
use crate::index::db_name::DOCUMENTS;
|
||||||
use crate::index::Hnsw;
|
use crate::index::Hnsw;
|
||||||
|
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||||
use crate::update::facet::FacetsUpdate;
|
use crate::update::facet::FacetsUpdate;
|
||||||
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
|
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
|
||||||
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
|
use crate::{
|
||||||
|
lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError, BEU32,
|
||||||
|
};
|
||||||
|
|
||||||
pub(crate) enum TypedChunk {
|
pub(crate) enum TypedChunk {
|
||||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||||
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
||||||
Documents(grenad::Reader<CursorClonableMmap>),
|
Documents(grenad::Reader<CursorClonableMmap>),
|
||||||
FieldIdWordcountDocids(grenad::Reader<BufReader<File>>),
|
FieldIdWordCountDocids(grenad::Reader<BufReader<File>>),
|
||||||
NewDocumentsIds(RoaringBitmap),
|
|
||||||
WordDocids {
|
WordDocids {
|
||||||
word_docids_reader: grenad::Reader<BufReader<File>>,
|
word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||||
exact_word_docids_reader: grenad::Reader<BufReader<File>>,
|
exact_word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||||
|
word_fid_docids_reader: grenad::Reader<BufReader<File>>,
|
||||||
},
|
},
|
||||||
WordPositionDocids(grenad::Reader<BufReader<File>>),
|
WordPositionDocids(grenad::Reader<BufReader<File>>),
|
||||||
WordFidDocids(grenad::Reader<BufReader<File>>),
|
|
||||||
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
|
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
|
FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
|
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
|
||||||
@ -43,7 +50,7 @@ pub(crate) enum TypedChunk {
|
|||||||
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
||||||
GeoPoints(grenad::Reader<BufReader<File>>),
|
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||||
VectorPoints(grenad::Reader<BufReader<File>>),
|
VectorPoints(grenad::Reader<BufReader<File>>),
|
||||||
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TypedChunk {
|
impl TypedChunk {
|
||||||
@ -58,23 +65,22 @@ impl TypedChunk {
|
|||||||
TypedChunk::Documents(grenad) => {
|
TypedChunk::Documents(grenad) => {
|
||||||
format!("Documents {{ number_of_entries: {} }}", grenad.len())
|
format!("Documents {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdWordcountDocids(grenad) => {
|
TypedChunk::FieldIdWordCountDocids(grenad) => {
|
||||||
format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len())
|
format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::NewDocumentsIds(grenad) => {
|
TypedChunk::WordDocids {
|
||||||
format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
|
word_docids_reader,
|
||||||
}
|
exact_word_docids_reader,
|
||||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!(
|
word_fid_docids_reader,
|
||||||
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}",
|
} => format!(
|
||||||
|
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}",
|
||||||
word_docids_reader.len(),
|
word_docids_reader.len(),
|
||||||
exact_word_docids_reader.len()
|
exact_word_docids_reader.len(),
|
||||||
|
word_fid_docids_reader.len()
|
||||||
),
|
),
|
||||||
TypedChunk::WordPositionDocids(grenad) => {
|
TypedChunk::WordPositionDocids(grenad) => {
|
||||||
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
|
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::WordFidDocids(grenad) => {
|
|
||||||
format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len())
|
|
||||||
}
|
|
||||||
TypedChunk::WordPairProximityDocids(grenad) => {
|
TypedChunk::WordPairProximityDocids(grenad) => {
|
||||||
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
|
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
@ -99,8 +105,8 @@ impl TypedChunk {
|
|||||||
TypedChunk::VectorPoints(grenad) => {
|
TypedChunk::VectorPoints(grenad) => {
|
||||||
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::ScriptLanguageDocids(grenad) => {
|
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||||
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len())
|
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -119,34 +125,75 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
let mut is_merged_database = false;
|
let mut is_merged_database = false;
|
||||||
match typed_chunk {
|
match typed_chunk {
|
||||||
TypedChunk::Documents(obkv_documents_iter) => {
|
TypedChunk::Documents(obkv_documents_iter) => {
|
||||||
|
let mut operations: Vec<DocumentOperation> = Default::default();
|
||||||
|
|
||||||
|
let mut docids = index.documents_ids(wtxn)?;
|
||||||
let mut cursor = obkv_documents_iter.into_cursor()?;
|
let mut cursor = obkv_documents_iter.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, reader)) = cursor.move_on_next()? {
|
||||||
index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?;
|
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||||
|
let reader: KvReader<FieldId> = KvReader::new(reader);
|
||||||
|
|
||||||
|
let (document_id_bytes, external_id_bytes) = try_split_array_at(key)
|
||||||
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?;
|
||||||
|
let docid = DocumentId::from_be_bytes(document_id_bytes);
|
||||||
|
let external_id = std::str::from_utf8(external_id_bytes)?;
|
||||||
|
|
||||||
|
for (field_id, value) in reader.iter() {
|
||||||
|
let del_add_reader = KvReaderDelAdd::new(value);
|
||||||
|
|
||||||
|
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||||
|
writer.insert(field_id, addition)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => {
|
|
||||||
|
let db = index.documents.remap_data_type::<ByteSlice>();
|
||||||
|
|
||||||
|
if !writer.is_empty() {
|
||||||
|
db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?;
|
||||||
|
operations.push(DocumentOperation {
|
||||||
|
external_id: external_id.to_string(),
|
||||||
|
internal_id: docid,
|
||||||
|
kind: DocumentOperationKind::Create,
|
||||||
|
});
|
||||||
|
docids.insert(docid);
|
||||||
|
} else {
|
||||||
|
db.delete(wtxn, &BEU32::new(docid))?;
|
||||||
|
operations.push(DocumentOperation {
|
||||||
|
external_id: external_id.to_string(),
|
||||||
|
internal_id: docid,
|
||||||
|
kind: DocumentOperationKind::Delete,
|
||||||
|
});
|
||||||
|
docids.remove(docid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let external_documents_docids = index.external_documents_ids();
|
||||||
|
external_documents_docids.apply(wtxn, operations)?;
|
||||||
|
index.put_documents_ids(wtxn, &docids)?;
|
||||||
|
}
|
||||||
|
TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {
|
||||||
append_entries_into_database(
|
append_entries_into_database(
|
||||||
fid_word_count_docids_iter,
|
fid_word_count_docids_iter,
|
||||||
&index.field_id_word_count_docids,
|
&index.field_id_word_count_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
|value, _buffer| Ok(value),
|
deladd_serialize_add_side,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::NewDocumentsIds(documents_ids) => {
|
TypedChunk::WordDocids {
|
||||||
return Ok((documents_ids, is_merged_database))
|
word_docids_reader,
|
||||||
}
|
exact_word_docids_reader,
|
||||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
|
word_fid_docids_reader,
|
||||||
|
} => {
|
||||||
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||||
append_entries_into_database(
|
append_entries_into_database(
|
||||||
word_docids_iter.clone(),
|
word_docids_iter.clone(),
|
||||||
&index.word_docids,
|
&index.word_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
|value, _buffer| Ok(value),
|
deladd_serialize_add_side,
|
||||||
merge_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||||
@ -155,8 +202,18 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
&index.exact_word_docids,
|
&index.exact_word_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
|value, _buffer| Ok(value),
|
deladd_serialize_add_side,
|
||||||
merge_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
|
||||||
|
append_entries_into_database(
|
||||||
|
word_fid_docids_iter,
|
||||||
|
&index.word_fid_docids,
|
||||||
|
wtxn,
|
||||||
|
index_is_empty,
|
||||||
|
deladd_serialize_add_side,
|
||||||
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// create fst from word docids
|
// create fst from word docids
|
||||||
@ -177,19 +234,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
&index.word_position_docids,
|
&index.word_position_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
|value, _buffer| Ok(value),
|
deladd_serialize_add_side,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
|
||||||
is_merged_database = true;
|
|
||||||
}
|
|
||||||
TypedChunk::WordFidDocids(word_fid_docids_iter) => {
|
|
||||||
append_entries_into_database(
|
|
||||||
word_fid_docids_iter,
|
|
||||||
&index.word_fid_docids,
|
|
||||||
wtxn,
|
|
||||||
index_is_empty,
|
|
||||||
|value, _buffer| Ok(value),
|
|
||||||
merge_cbo_roaring_bitmaps,
|
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
@ -209,8 +255,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
&index.facet_id_exists_docids,
|
&index.facet_id_exists_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
|value, _buffer| Ok(value),
|
deladd_serialize_add_side,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
@ -220,8 +266,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
&index.facet_id_is_null_docids,
|
&index.facet_id_is_null_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
|value, _buffer| Ok(value),
|
deladd_serialize_add_side,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
@ -231,8 +277,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
&index.facet_id_is_empty_docids,
|
&index.facet_id_is_empty_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
|value, _buffer| Ok(value),
|
deladd_serialize_add_side,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
@ -242,8 +288,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
&index.word_pair_proximity_docids,
|
&index.word_pair_proximity_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
|value, _buffer| Ok(value),
|
deladd_serialize_add_side,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
@ -252,8 +298,18 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
index.field_id_docid_facet_f64s.remap_types::<ByteSlice, ByteSlice>();
|
index.field_id_docid_facet_f64s.remap_types::<ByteSlice, ByteSlice>();
|
||||||
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
let reader = KvReaderDelAdd::new(value);
|
||||||
if valid_lmdb_key(key) {
|
if valid_lmdb_key(key) {
|
||||||
index_fid_docid_facet_numbers.put(wtxn, key, value)?;
|
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||||
|
(None, None) => {}
|
||||||
|
(None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?,
|
||||||
|
(Some(_), None) => {
|
||||||
|
index_fid_docid_facet_numbers.delete(wtxn, key)?;
|
||||||
|
}
|
||||||
|
(Some(_), Some(new)) => {
|
||||||
|
index_fid_docid_facet_numbers.put(wtxn, key, new)?
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -262,8 +318,18 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
index.field_id_docid_facet_strings.remap_types::<ByteSlice, ByteSlice>();
|
index.field_id_docid_facet_strings.remap_types::<ByteSlice, ByteSlice>();
|
||||||
let mut cursor = fid_docid_facet_string.into_cursor()?;
|
let mut cursor = fid_docid_facet_string.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
let reader = KvReaderDelAdd::new(value);
|
||||||
if valid_lmdb_key(key) {
|
if valid_lmdb_key(key) {
|
||||||
index_fid_docid_facet_strings.put(wtxn, key, value)?;
|
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||||
|
(None, None) => {}
|
||||||
|
(None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?,
|
||||||
|
(Some(_), None) => {
|
||||||
|
index_fid_docid_facet_strings.delete(wtxn, key)?;
|
||||||
|
}
|
||||||
|
(Some(_), Some(new)) => {
|
||||||
|
index_fid_docid_facet_strings.put(wtxn, key, new)?
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -276,57 +342,86 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
// convert the key back to a u32 (4 bytes)
|
// convert the key back to a u32 (4 bytes)
|
||||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||||
|
|
||||||
// convert the latitude and longitude back to a f64 (8 bytes)
|
let deladd_obkv = KvReaderDelAdd::new(value);
|
||||||
let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
|
if let Some(value) = deladd_obkv.get(DelAdd::Deletion) {
|
||||||
let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
|
let geopoint = extract_geo_point(value, docid);
|
||||||
let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
|
rtree.remove(&geopoint);
|
||||||
let xyz_point = lat_lng_to_xyz(&point);
|
geo_faceted_docids.remove(docid);
|
||||||
|
}
|
||||||
rtree.insert(GeoPoint::new(xyz_point, (docid, point)));
|
if let Some(value) = deladd_obkv.get(DelAdd::Addition) {
|
||||||
|
let geopoint = extract_geo_point(value, docid);
|
||||||
|
rtree.insert(geopoint);
|
||||||
geo_faceted_docids.insert(docid);
|
geo_faceted_docids.insert(docid);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
index.put_geo_rtree(wtxn, &rtree)?;
|
index.put_geo_rtree(wtxn, &rtree)?;
|
||||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||||
}
|
}
|
||||||
TypedChunk::VectorPoints(vector_points) => {
|
TypedChunk::VectorPoints(vector_points) => {
|
||||||
let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? {
|
let mut vectors_set = HashSet::new();
|
||||||
Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(),
|
// We extract and store the previous vectors
|
||||||
None => Default::default(),
|
if let Some(hnsw) = index.vector_hnsw(wtxn)? {
|
||||||
};
|
for (pid, point) in hnsw.iter() {
|
||||||
|
let pid_key = BEU32::new(pid.into_inner());
|
||||||
// Convert the PointIds into DocumentIds
|
let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap().get();
|
||||||
let mut docids = Vec::new();
|
let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect();
|
||||||
for pid in pids {
|
vectors_set.insert((docid, vector));
|
||||||
let docid =
|
}
|
||||||
index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap();
|
|
||||||
docids.push(docid.get());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut expected_dimensions = points.get(0).map(|p| p.len());
|
|
||||||
let mut cursor = vector_points.into_cursor()?;
|
let mut cursor = vector_points.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
// convert the key back to a u32 (4 bytes)
|
// convert the key back to a u32 (4 bytes)
|
||||||
let (left, _index) = try_split_array_at(key).unwrap();
|
let (left, _index) = try_split_array_at(key).unwrap();
|
||||||
let docid = DocumentId::from_be_bytes(left);
|
let docid = DocumentId::from_be_bytes(left);
|
||||||
// convert the vector back to a Vec<f32>
|
|
||||||
let vector: Vec<f32> = pod_collect_to_vec(value);
|
|
||||||
|
|
||||||
// TODO Inform the user about the document that has a wrong `_vectors`
|
let vector_deladd_obkv = KvReaderDelAdd::new(value);
|
||||||
let found = vector.len();
|
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
|
||||||
let expected = *expected_dimensions.get_or_insert(found);
|
// convert the vector back to a Vec<f32>
|
||||||
if expected != found {
|
let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||||
return Err(UserError::InvalidVectorDimensions { expected, found })?;
|
let key = (docid, vector);
|
||||||
|
if !vectors_set.remove(&key) {
|
||||||
|
error!("Unable to delete the vector: {:?}", key.1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
||||||
|
// convert the vector back to a Vec<f32>
|
||||||
|
let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||||
|
vectors_set.insert((docid, vector));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extract the most common vector dimension
|
||||||
|
let expected_dimension_size = {
|
||||||
|
let mut dims = HashMap::new();
|
||||||
|
vectors_set.iter().for_each(|(_, v)| *dims.entry(v.len()).or_insert(0) += 1);
|
||||||
|
dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Ensure that the vector lengths are correct and
|
||||||
|
// prepare the vectors before inserting them in the HNSW.
|
||||||
|
let mut points = Vec::new();
|
||||||
|
let mut docids = Vec::new();
|
||||||
|
for (docid, vector) in vectors_set {
|
||||||
|
if expected_dimension_size.map_or(false, |expected| expected != vector.len()) {
|
||||||
|
return Err(UserError::InvalidVectorDimensions {
|
||||||
|
expected: expected_dimension_size.unwrap_or(vector.len()),
|
||||||
|
found: vector.len(),
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
} else {
|
||||||
|
let vector = vector.into_iter().map(OrderedFloat::into_inner).collect();
|
||||||
points.push(NDotProductPoint::new(vector));
|
points.push(NDotProductPoint::new(vector));
|
||||||
docids.push(docid);
|
docids.push(docid);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
assert_eq!(docids.len(), points.len());
|
|
||||||
|
|
||||||
let hnsw_length = points.len();
|
let hnsw_length = points.len();
|
||||||
let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
|
let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
|
||||||
|
|
||||||
|
assert_eq!(docids.len(), pids.len());
|
||||||
|
|
||||||
|
// Store the vectors in the point-docid relation database
|
||||||
index.vector_id_docid.clear(wtxn)?;
|
index.vector_id_docid.clear(wtxn)?;
|
||||||
for (docid, pid) in docids.into_iter().zip(pids) {
|
for (docid, pid) in docids.into_iter().zip(pids) {
|
||||||
index.vector_id_docid.put(
|
index.vector_id_docid.put(
|
||||||
@ -339,29 +434,41 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
|
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
|
||||||
index.put_vector_hnsw(wtxn, &new_hnsw)?;
|
index.put_vector_hnsw(wtxn, &new_hnsw)?;
|
||||||
}
|
}
|
||||||
TypedChunk::ScriptLanguageDocids(hash_pair) => {
|
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||||
let mut buffer = Vec::new();
|
for (key, (deletion, addition)) in sl_map {
|
||||||
for (key, value) in hash_pair {
|
let mut db_key_exists = false;
|
||||||
buffer.clear();
|
|
||||||
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
||||||
Some(db_values) => {
|
Some(db_values) => {
|
||||||
let mut db_value_buffer = Vec::new();
|
db_key_exists = true;
|
||||||
serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
|
(db_values - deletion) | addition
|
||||||
let mut new_value_buffer = Vec::new();
|
|
||||||
serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
|
|
||||||
merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
|
|
||||||
RoaringBitmap::deserialize_from(&buffer[..])?
|
|
||||||
}
|
}
|
||||||
None => value,
|
None => addition,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if final_value.is_empty() {
|
||||||
|
// If the database entry exists, delete it.
|
||||||
|
if db_key_exists {
|
||||||
|
index.script_language_docids.delete(wtxn, &key)?;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok((RoaringBitmap::new(), is_merged_database))
|
Ok((RoaringBitmap::new(), is_merged_database))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Converts the latitude and longitude back to an xyz GeoPoint.
|
||||||
|
fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
|
||||||
|
let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
|
||||||
|
let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
|
||||||
|
let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
|
||||||
|
let xyz_point = lat_lng_to_xyz(&point);
|
||||||
|
GeoPoint::new(xyz_point, (docid, point))
|
||||||
|
}
|
||||||
|
|
||||||
fn merge_word_docids_reader_into_fst(
|
fn merge_word_docids_reader_into_fst(
|
||||||
word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
|
word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
|
||||||
exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
|
exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
|
||||||
@ -379,24 +486,6 @@ fn merge_word_docids_reader_into_fst(
|
|||||||
Ok(builder.into_set())
|
Ok(builder.into_set())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
|
|
||||||
let new_value = RoaringBitmap::deserialize_from(new_value)?;
|
|
||||||
let db_value = RoaringBitmap::deserialize_from(db_value)?;
|
|
||||||
let value = new_value | db_value;
|
|
||||||
Ok(serialize_roaring_bitmap(&value, buffer)?)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_cbo_roaring_bitmaps(
|
|
||||||
new_value: &[u8],
|
|
||||||
db_value: &[u8],
|
|
||||||
buffer: &mut Vec<u8>,
|
|
||||||
) -> Result<()> {
|
|
||||||
Ok(CboRoaringBitmapCodec::merge_into(
|
|
||||||
&[Cow::Borrowed(db_value), Cow::Borrowed(new_value)],
|
|
||||||
buffer,
|
|
||||||
)?)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Write provided entries in database using serialize_value function.
|
/// Write provided entries in database using serialize_value function.
|
||||||
/// merge_values function is used if an entry already exist in the database.
|
/// merge_values function is used if an entry already exist in the database.
|
||||||
fn write_entries_into_database<R, K, V, FS, FM>(
|
fn write_entries_into_database<R, K, V, FS, FM>(
|
||||||
@ -410,7 +499,7 @@ fn write_entries_into_database<R, K, V, FS, FM>(
|
|||||||
where
|
where
|
||||||
R: io::Read + io::Seek,
|
R: io::Read + io::Seek,
|
||||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||||
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||||
{
|
{
|
||||||
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
||||||
|
|
||||||
@ -422,17 +511,19 @@ where
|
|||||||
if valid_lmdb_key(key) {
|
if valid_lmdb_key(key) {
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
let value = if index_is_empty {
|
let value = if index_is_empty {
|
||||||
serialize_value(value, &mut buffer)?
|
Some(serialize_value(value, &mut buffer)?)
|
||||||
} else {
|
} else {
|
||||||
match database.get(wtxn, key)? {
|
match database.get(wtxn, key)? {
|
||||||
Some(prev_value) => {
|
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||||
merge_values(value, prev_value, &mut buffer)?;
|
None => Some(serialize_value(value, &mut buffer)?),
|
||||||
&buffer[..]
|
|
||||||
}
|
|
||||||
None => serialize_value(value, &mut buffer)?,
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
database.put(wtxn, key, value)?;
|
match value {
|
||||||
|
Some(value) => database.put(wtxn, key, value)?,
|
||||||
|
None => {
|
||||||
|
database.delete(wtxn, key)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -454,7 +545,8 @@ fn append_entries_into_database<R, K, V, FS, FM>(
|
|||||||
where
|
where
|
||||||
R: io::Read + io::Seek,
|
R: io::Read + io::Seek,
|
||||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||||
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||||
|
K: for<'a> heed::BytesDecode<'a>,
|
||||||
{
|
{
|
||||||
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
||||||
|
|
||||||
@ -475,6 +567,12 @@ where
|
|||||||
let mut cursor = data.into_cursor()?;
|
let mut cursor = data.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
if valid_lmdb_key(key) {
|
if valid_lmdb_key(key) {
|
||||||
|
debug_assert!(
|
||||||
|
K::bytes_decode(key).is_some(),
|
||||||
|
"Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
|
||||||
|
key.len(),
|
||||||
|
&key
|
||||||
|
);
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
let value = serialize_value(value, &mut buffer)?;
|
let value = serialize_value(value, &mut buffer)?;
|
||||||
unsafe { database.append(key, value)? };
|
unsafe { database.append(key, value)? };
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
pub use self::available_documents_ids::AvailableDocumentsIds;
|
pub use self::available_documents_ids::AvailableDocumentsIds;
|
||||||
pub use self::clear_documents::ClearDocuments;
|
pub use self::clear_documents::ClearDocuments;
|
||||||
pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDeletionResult};
|
|
||||||
pub use self::facet::bulk::FacetsUpdateBulk;
|
pub use self::facet::bulk::FacetsUpdateBulk;
|
||||||
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
|
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
|
||||||
pub use self::index_documents::{
|
pub use self::index_documents::{
|
||||||
@ -9,10 +8,6 @@ pub use self::index_documents::{
|
|||||||
MergeFn,
|
MergeFn,
|
||||||
};
|
};
|
||||||
pub use self::indexer_config::IndexerConfig;
|
pub use self::indexer_config::IndexerConfig;
|
||||||
pub use self::prefix_word_pairs::{
|
|
||||||
PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
|
|
||||||
MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
|
|
||||||
};
|
|
||||||
pub use self::settings::{Setting, Settings};
|
pub use self::settings::{Setting, Settings};
|
||||||
pub use self::update_step::UpdateIndexingStep;
|
pub use self::update_step::UpdateIndexingStep;
|
||||||
pub use self::word_prefix_docids::WordPrefixDocids;
|
pub use self::word_prefix_docids::WordPrefixDocids;
|
||||||
@ -21,11 +16,10 @@ pub use self::words_prefixes_fst::WordsPrefixesFst;
|
|||||||
|
|
||||||
mod available_documents_ids;
|
mod available_documents_ids;
|
||||||
mod clear_documents;
|
mod clear_documents;
|
||||||
mod delete_documents;
|
pub(crate) mod del_add;
|
||||||
pub(crate) mod facet;
|
pub(crate) mod facet;
|
||||||
mod index_documents;
|
mod index_documents;
|
||||||
mod indexer_config;
|
mod indexer_config;
|
||||||
mod prefix_word_pairs;
|
|
||||||
mod settings;
|
mod settings;
|
||||||
mod update_step;
|
mod update_step;
|
||||||
mod word_prefix_docids;
|
mod word_prefix_docids;
|
||||||
|
@ -1,579 +0,0 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
use std::collections::HashSet;
|
|
||||||
use std::io::{BufReader, BufWriter};
|
|
||||||
|
|
||||||
use grenad::CompressionType;
|
|
||||||
use heed::types::ByteSlice;
|
|
||||||
|
|
||||||
use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
|
|
||||||
use crate::{Index, Result};
|
|
||||||
|
|
||||||
mod prefix_word;
|
|
||||||
mod word_prefix;
|
|
||||||
|
|
||||||
pub use prefix_word::index_prefix_word_database;
|
|
||||||
pub use word_prefix::index_word_prefix_database;
|
|
||||||
|
|
||||||
pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4;
|
|
||||||
pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2;
|
|
||||||
|
|
||||||
pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
||||||
index: &'i Index,
|
|
||||||
max_proximity: u8,
|
|
||||||
max_prefix_length: usize,
|
|
||||||
chunk_compression_type: CompressionType,
|
|
||||||
chunk_compression_level: Option<u32>,
|
|
||||||
}
|
|
||||||
impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
|
||||||
pub fn new(
|
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
||||||
index: &'i Index,
|
|
||||||
chunk_compression_type: CompressionType,
|
|
||||||
chunk_compression_level: Option<u32>,
|
|
||||||
) -> Self {
|
|
||||||
Self {
|
|
||||||
wtxn,
|
|
||||||
index,
|
|
||||||
max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
|
|
||||||
max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
|
|
||||||
chunk_compression_type,
|
|
||||||
chunk_compression_level,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
|
|
||||||
pub fn execute<'a>(
|
|
||||||
self,
|
|
||||||
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
|
||||||
new_prefix_fst_words: &'a [String],
|
|
||||||
common_prefix_fst_words: &[&'a [String]],
|
|
||||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
|
||||||
) -> Result<()> {
|
|
||||||
puffin::profile_function!();
|
|
||||||
|
|
||||||
index_word_prefix_database(
|
|
||||||
self.wtxn,
|
|
||||||
self.index.word_pair_proximity_docids,
|
|
||||||
self.index.word_prefix_pair_proximity_docids,
|
|
||||||
self.max_proximity,
|
|
||||||
self.max_prefix_length,
|
|
||||||
new_word_pair_proximity_docids.clone(),
|
|
||||||
new_prefix_fst_words,
|
|
||||||
common_prefix_fst_words,
|
|
||||||
del_prefix_fst_words,
|
|
||||||
self.chunk_compression_type,
|
|
||||||
self.chunk_compression_level,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
index_prefix_word_database(
|
|
||||||
self.wtxn,
|
|
||||||
self.index.word_pair_proximity_docids,
|
|
||||||
self.index.prefix_word_pair_proximity_docids,
|
|
||||||
self.max_proximity,
|
|
||||||
self.max_prefix_length,
|
|
||||||
new_word_pair_proximity_docids,
|
|
||||||
new_prefix_fst_words,
|
|
||||||
common_prefix_fst_words,
|
|
||||||
del_prefix_fst_words,
|
|
||||||
self.chunk_compression_type,
|
|
||||||
self.chunk_compression_level,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// This is adapted from `sorter_into_lmdb_database`
|
|
||||||
pub fn insert_into_database(
|
|
||||||
wtxn: &mut heed::RwTxn,
|
|
||||||
database: heed::PolyDatabase,
|
|
||||||
new_key: &[u8],
|
|
||||||
new_value: &[u8],
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
|
|
||||||
match iter.next().transpose()? {
|
|
||||||
Some((key, old_val)) if new_key == key => {
|
|
||||||
let val =
|
|
||||||
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
|
|
||||||
.map_err(|_| {
|
|
||||||
// TODO just wrap this error?
|
|
||||||
crate::error::InternalError::IndexingMergingKeys {
|
|
||||||
process: "get-put-merge",
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
|
|
||||||
unsafe { iter.put_current(new_key, &val)? };
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
drop(iter);
|
|
||||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
|
|
||||||
// but it uses `append` if the database is empty, and it assumes that the values in the
|
|
||||||
// writer don't conflict with values in the database.
|
|
||||||
pub fn write_into_lmdb_database_without_merging(
|
|
||||||
wtxn: &mut heed::RwTxn,
|
|
||||||
database: heed::PolyDatabase,
|
|
||||||
writer: grenad::Writer<BufWriter<std::fs::File>>,
|
|
||||||
) -> Result<()> {
|
|
||||||
let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
|
|
||||||
let reader = grenad::Reader::new(BufReader::new(file))?;
|
|
||||||
if database.is_empty(wtxn)? {
|
|
||||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
|
||||||
let mut cursor = reader.into_cursor()?;
|
|
||||||
while let Some((k, v)) = cursor.move_on_next()? {
|
|
||||||
// safety: the key comes from the grenad reader, not the database
|
|
||||||
unsafe { out_iter.append(k, v)? };
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
let mut cursor = reader.into_cursor()?;
|
|
||||||
while let Some((k, v)) = cursor.move_on_next()? {
|
|
||||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use std::io::Cursor;
    use std::iter::FromIterator;

    use roaring::RoaringBitmap;

    use crate::db_snap;
    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
    use crate::index::tests::TempIndex;
    use crate::update::{DeleteDocuments, DeletionStrategy, IndexDocumentsMethod};

    /// Turns a `serde_json::json!({ ... })` literal into a `crate::Object`.
    ///
    /// Panics if `value` is not a JSON object.
    fn json_object(value: serde_json::Value) -> crate::Object {
        value.as_object().unwrap().clone()
    }

    /// Serializes `documents` into an in-memory documents batch and returns a reader over it.
    ///
    /// Shared by every test below instead of re-declaring the same closure in each of them.
    fn batch_reader_from_documents(
        documents: Vec<crate::Object>,
    ) -> DocumentsBatchReader<Cursor<Vec<u8>>> {
        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        for document in documents {
            builder.append_json_object(&document).unwrap();
        }
        DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
    }

    /// Builds 50 documents per prefix, numbered consecutively from `start_id`.
    ///
    /// The `text` of each document is the prefix followed by a lowercase hexadecimal counter
    /// (`a0`, `a1`, ..., `a31`), so each prefix is shared by 50 distinct words. The tests set
    /// `words_prefix_threshold` to 50.
    fn documents_with_enough_different_words_for_prefixes(
        prefixes: &[&str],
        start_id: usize,
    ) -> Vec<crate::Object> {
        let mut documents = Vec::new();
        let mut id = start_id;
        for prefix in prefixes {
            for i in 0..50 {
                documents.push(json_object(serde_json::json!({
                    "id": id,
                    "text": format!("{prefix}{i:x}"),
                })));
                id += 1;
            }
        }
        documents
    }

    /// Indexes two successive batches and snapshots the proximity databases after each one.
    #[test]
    fn add_new_documents() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": "9000",
            "text": "At an amazing and beautiful house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": "9001",
            "text": "The bell rings at 5 am"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100);
        documents.push(json_object(serde_json::json!({
            "id": "9002",
            "text": "At an extraordinary house"
        })));
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, word_pair_proximity_docids, "update");
        db_snap!(index, word_prefix_pair_proximity_docids, "update");
        db_snap!(index, prefix_word_pair_proximity_docids, "update");
    }

    /// Regression test for the issue linked below.
    #[test]
    fn batch_bug_3043() {
        // https://github.com/meilisearch/meilisearch/issues/3043
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "text": "x y"
        })));
        documents.push(json_object(serde_json::json!({
            "text": "x a y"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, word_pair_proximity_docids);
        db_snap!(index, word_prefix_pair_proximity_docids);
        db_snap!(index, prefix_word_pair_proximity_docids);
    }

    /// Hard-deletes documents in two steps, then indexes a new batch, snapshotting each stage.
    #[test]
    fn hard_delete_and_reupdate() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing and beautiful house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings at 5 am"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        let mut wtxn = index.write_txn().unwrap();
        let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        delete.strategy(DeletionStrategy::AlwaysHard);
        delete.delete_documents(&RoaringBitmap::from_iter([50]));
        delete.execute().unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, documents_ids, "first_delete");
        db_snap!(index, word_docids, "first_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");

        let mut wtxn = index.write_txn().unwrap();
        let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        delete.strategy(DeletionStrategy::AlwaysHard);
        delete.delete_documents(&RoaringBitmap::from_iter(0..50));
        delete.execute().unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, documents_ids, "second_delete");
        db_snap!(index, word_docids, "second_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");

        // Index a fresh batch built around the "b" prefix after the deletions.
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "reupdate");
        db_snap!(index, word_docids, "reupdate");
        db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
        db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
    }

    /// Same scenario as `hard_delete_and_reupdate`, using `DeletionStrategy::AlwaysSoft`.
    #[test]
    fn soft_delete_and_reupdate() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing and beautiful house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings at 5 am"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        let mut wtxn = index.write_txn().unwrap();
        let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        delete.strategy(DeletionStrategy::AlwaysSoft);
        delete.delete_documents(&RoaringBitmap::from_iter([50]));
        delete.execute().unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, documents_ids, "first_delete");
        db_snap!(index, word_docids, "first_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");

        let mut wtxn = index.write_txn().unwrap();
        let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        delete.strategy(DeletionStrategy::AlwaysSoft);

        delete.delete_documents(&RoaringBitmap::from_iter(0..50));
        delete.execute().unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, documents_ids, "second_delete");
        db_snap!(index, word_docids, "second_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");

        // Index a fresh batch built around the "b" prefix after the deletions.
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "reupdate");
        db_snap!(index, word_docids, "reupdate");
        db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
        db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
    }

    /// Replaces documents 0..50 with `ReplaceDocuments` while the deletion strategy is
    /// `AlwaysSoft`, and checks the resulting databases and soft-deleted ids.
    #[test]
    fn replace_soft_deletion() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        // Same ids (0..50) as the first batch, with different text.
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "replaced");
        db_snap!(index, word_docids, "replaced");
        db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
        db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
        db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]");
    }

    /// Same scenario as `replace_soft_deletion`, using `DeletionStrategy::AlwaysHard`; no
    /// soft-deleted id is expected at the end.
    #[test]
    fn replace_hard_deletion() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        // Same ids (0..50) as the first batch, with different text.
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "replaced");
        db_snap!(index, word_docids, "replaced");
        db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
        db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
        db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]");
    }
}
|
|
@ -1,182 +0,0 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
use std::collections::{BTreeMap, HashSet};
|
|
||||||
|
|
||||||
use grenad::CompressionType;
|
|
||||||
use heed::types::ByteSlice;
|
|
||||||
use heed::BytesDecode;
|
|
||||||
use log::debug;
|
|
||||||
|
|
||||||
use crate::update::index_documents::{create_writer, CursorClonableMmap};
|
|
||||||
use crate::update::prefix_word_pairs::{
|
|
||||||
insert_into_database, write_into_lmdb_database_without_merging,
|
|
||||||
};
|
|
||||||
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
|
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
|
||||||
#[logging_timer::time]
|
|
||||||
pub fn index_prefix_word_database(
|
|
||||||
wtxn: &mut heed::RwTxn,
|
|
||||||
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
|
||||||
prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
|
||||||
max_proximity: u8,
|
|
||||||
max_prefix_length: usize,
|
|
||||||
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
|
||||||
new_prefix_fst_words: &[String],
|
|
||||||
common_prefix_fst_words: &[&[String]],
|
|
||||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
|
||||||
chunk_compression_type: CompressionType,
|
|
||||||
chunk_compression_level: Option<u32>,
|
|
||||||
) -> Result<()> {
|
|
||||||
puffin::profile_function!();
|
|
||||||
|
|
||||||
let max_proximity = max_proximity - 1;
|
|
||||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
|
||||||
|
|
||||||
let common_prefixes: Vec<_> = common_prefix_fst_words
|
|
||||||
.iter()
|
|
||||||
.flat_map(|s| s.iter())
|
|
||||||
.map(|s| s.as_str())
|
|
||||||
.filter(|s| s.len() <= max_prefix_length)
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
for proximity in 1..max_proximity {
|
|
||||||
for prefix in common_prefixes.iter() {
|
|
||||||
let mut prefix_key = vec![proximity];
|
|
||||||
prefix_key.extend_from_slice(prefix.as_bytes());
|
|
||||||
let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
|
|
||||||
// This is the core of the algorithm
|
|
||||||
execute_on_word_pairs_and_prefixes(
|
|
||||||
proximity,
|
|
||||||
prefix.as_bytes(),
|
|
||||||
// the next two arguments tell how to iterate over the new word pairs
|
|
||||||
&mut cursor,
|
|
||||||
|cursor| {
|
|
||||||
if let Some((key, value)) = cursor.next()? {
|
|
||||||
let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
|
|
||||||
.ok_or(heed::Error::Decoding)?;
|
|
||||||
Ok(Some((word2, value)))
|
|
||||||
} else {
|
|
||||||
Ok(None)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
// and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
|
|
||||||
|key, value| {
|
|
||||||
insert_into_database(
|
|
||||||
wtxn,
|
|
||||||
*prefix_word_pair_proximity_docids.as_polymorph(),
|
|
||||||
key,
|
|
||||||
value,
|
|
||||||
)
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now we do the same thing with the new prefixes and all word pairs in the DB
|
|
||||||
let new_prefixes: Vec<_> = new_prefix_fst_words
|
|
||||||
.iter()
|
|
||||||
.map(|s| s.as_str())
|
|
||||||
.filter(|s| s.len() <= max_prefix_length)
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
// Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
|
|
||||||
// element in an intermediary grenad
|
|
||||||
let mut writer =
|
|
||||||
create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
|
|
||||||
|
|
||||||
for proximity in 1..max_proximity {
|
|
||||||
for prefix in new_prefixes.iter() {
|
|
||||||
let mut prefix_key = vec![proximity];
|
|
||||||
prefix_key.extend_from_slice(prefix.as_bytes());
|
|
||||||
let mut db_iter = word_pair_proximity_docids
|
|
||||||
.as_polymorph()
|
|
||||||
.prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
|
|
||||||
.remap_key_type::<UncheckedU8StrStrCodec>();
|
|
||||||
execute_on_word_pairs_and_prefixes(
|
|
||||||
proximity,
|
|
||||||
prefix.as_bytes(),
|
|
||||||
&mut db_iter,
|
|
||||||
|db_iter| {
|
|
||||||
db_iter
|
|
||||||
.next()
|
|
||||||
.transpose()
|
|
||||||
.map(|x| x.map(|((_, _, word2), value)| (word2, value)))
|
|
||||||
.map_err(|e| e.into())
|
|
||||||
},
|
|
||||||
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
|
||||||
)?;
|
|
||||||
drop(db_iter);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// and then we write the grenad into the DB
|
|
||||||
// Since the grenad contains only new prefixes, we know in advance that none
|
|
||||||
// of its elements already exist in the DB, thus there is no need to specify
|
|
||||||
// how to merge conflicting elements
|
|
||||||
write_into_lmdb_database_without_merging(
|
|
||||||
wtxn,
|
|
||||||
*prefix_word_pair_proximity_docids.as_polymorph(),
|
|
||||||
writer,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// All of the word prefix pairs in the database that have a w2
|
|
||||||
// that is contained in the `suppr_pw` set must be removed as well.
|
|
||||||
if !del_prefix_fst_words.is_empty() {
|
|
||||||
let mut iter =
|
|
||||||
prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
|
|
||||||
while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
|
|
||||||
if del_prefix_fst_words.contains(prefix.as_bytes()) {
|
|
||||||
// Delete this entry as the w2 prefix is no more in the words prefix fst.
|
|
||||||
unsafe { iter.del_current()? };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
|
|
||||||
///
|
|
||||||
/// Its arguments are:
|
|
||||||
/// - an iterator over the words following the given `prefix` with the given `proximity`
|
|
||||||
/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements
|
|
||||||
fn execute_on_word_pairs_and_prefixes<I>(
|
|
||||||
proximity: u8,
|
|
||||||
prefix: &[u8],
|
|
||||||
iter: &mut I,
|
|
||||||
mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>,
|
|
||||||
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default();
|
|
||||||
|
|
||||||
// Memory usage check:
|
|
||||||
// The content of the loop will be called for each `word2` that follows a word beginning
|
|
||||||
// with `prefix` with the given proximity.
|
|
||||||
// In practice, I don't think the batch can ever get too big.
|
|
||||||
while let Some((word2, docids)) = next_word2_and_docids(iter)? {
|
|
||||||
let entry = batch.entry(word2.to_owned()).or_default();
|
|
||||||
entry.push(Cow::Owned(docids.to_owned()));
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut key_buffer = Vec::with_capacity(512);
|
|
||||||
key_buffer.push(proximity);
|
|
||||||
key_buffer.extend_from_slice(prefix);
|
|
||||||
key_buffer.push(0);
|
|
||||||
|
|
||||||
let mut value_buffer = Vec::with_capacity(65_536);
|
|
||||||
|
|
||||||
for (word2, docids) in batch {
|
|
||||||
key_buffer.truncate(prefix.len() + 2);
|
|
||||||
value_buffer.clear();
|
|
||||||
|
|
||||||
key_buffer.extend_from_slice(&word2);
|
|
||||||
let data = if docids.len() > 1 {
|
|
||||||
CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?;
|
|
||||||
value_buffer.as_slice()
|
|
||||||
} else {
|
|
||||||
&docids[0]
|
|
||||||
};
|
|
||||||
insert(key_buffer.as_slice(), data)?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
@ -1,20 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
1 a 5 [101, ]
|
|
||||||
1 a amazing [100, ]
|
|
||||||
1 a an [100, ]
|
|
||||||
1 a and [100, ]
|
|
||||||
1 a beautiful [100, ]
|
|
||||||
1 b house [100, ]
|
|
||||||
1 b rings [101, ]
|
|
||||||
1 be house [100, ]
|
|
||||||
1 be rings [101, ]
|
|
||||||
2 a am [101, ]
|
|
||||||
2 a amazing [100, ]
|
|
||||||
2 a and [100, ]
|
|
||||||
2 a beautiful [100, ]
|
|
||||||
2 a house [100, ]
|
|
||||||
2 b at [101, ]
|
|
||||||
2 be at [101, ]
|
|
||||||
|
|
@ -1,23 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
1 5 a [101, ]
|
|
||||||
1 amazing a [100, ]
|
|
||||||
1 an a [100, ]
|
|
||||||
1 and b [100, ]
|
|
||||||
1 and be [100, ]
|
|
||||||
1 at a [100, ]
|
|
||||||
1 rings a [101, ]
|
|
||||||
1 the b [101, ]
|
|
||||||
1 the be [101, ]
|
|
||||||
2 amazing b [100, ]
|
|
||||||
2 amazing be [100, ]
|
|
||||||
2 an a [100, ]
|
|
||||||
2 at a [100, 101, ]
|
|
||||||
2 bell a [101, ]
|
|
||||||
3 an b [100, ]
|
|
||||||
3 an be [100, ]
|
|
||||||
3 at a [100, ]
|
|
||||||
3 rings a [101, ]
|
|
||||||
3 the a [101, ]
|
|
||||||
|
|
@ -1,29 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
1 a 5 [101, ]
|
|
||||||
1 a amazing [100, ]
|
|
||||||
1 a an [100, 202, ]
|
|
||||||
1 a and [100, ]
|
|
||||||
1 a beautiful [100, ]
|
|
||||||
1 a extraordinary [202, ]
|
|
||||||
1 am and [100, ]
|
|
||||||
1 an amazing [100, ]
|
|
||||||
1 an beautiful [100, ]
|
|
||||||
1 an extraordinary [202, ]
|
|
||||||
1 b house [100, ]
|
|
||||||
1 b rings [101, ]
|
|
||||||
1 be house [100, ]
|
|
||||||
1 be rings [101, ]
|
|
||||||
2 a am [101, ]
|
|
||||||
2 a amazing [100, ]
|
|
||||||
2 a and [100, ]
|
|
||||||
2 a beautiful [100, ]
|
|
||||||
2 a extraordinary [202, ]
|
|
||||||
2 a house [100, 202, ]
|
|
||||||
2 am beautiful [100, ]
|
|
||||||
2 an and [100, ]
|
|
||||||
2 an house [100, 202, ]
|
|
||||||
2 b at [101, ]
|
|
||||||
2 be at [101, ]
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
1 5 am [101, ]
|
|
||||||
1 amazing and [100, ]
|
|
||||||
1 an amazing [100, ]
|
|
||||||
1 an extraordinary [202, ]
|
|
||||||
1 and beautiful [100, ]
|
|
||||||
1 at 5 [101, ]
|
|
||||||
1 at an [100, 202, ]
|
|
||||||
1 beautiful house [100, ]
|
|
||||||
1 bell rings [101, ]
|
|
||||||
1 extraordinary house [202, ]
|
|
||||||
1 rings at [101, ]
|
|
||||||
1 the bell [101, ]
|
|
||||||
2 amazing beautiful [100, ]
|
|
||||||
2 an and [100, ]
|
|
||||||
2 an house [202, ]
|
|
||||||
2 and house [100, ]
|
|
||||||
2 at am [101, ]
|
|
||||||
2 at amazing [100, ]
|
|
||||||
2 at extraordinary [202, ]
|
|
||||||
2 bell at [101, ]
|
|
||||||
2 rings 5 [101, ]
|
|
||||||
2 the rings [101, ]
|
|
||||||
3 amazing house [100, ]
|
|
||||||
3 an beautiful [100, ]
|
|
||||||
3 at and [100, ]
|
|
||||||
3 at house [202, ]
|
|
||||||
3 bell 5 [101, ]
|
|
||||||
3 rings am [101, ]
|
|
||||||
3 the at [101, ]
|
|
||||||
|
|
@ -1,31 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
1 5 a [101, ]
|
|
||||||
1 5 am [101, ]
|
|
||||||
1 amazing a [100, ]
|
|
||||||
1 amazing an [100, ]
|
|
||||||
1 an a [100, ]
|
|
||||||
1 an am [100, ]
|
|
||||||
1 and b [100, ]
|
|
||||||
1 and be [100, ]
|
|
||||||
1 at a [100, 202, ]
|
|
||||||
1 at an [100, 202, ]
|
|
||||||
1 rings a [101, ]
|
|
||||||
1 the b [101, ]
|
|
||||||
1 the be [101, ]
|
|
||||||
2 amazing b [100, ]
|
|
||||||
2 amazing be [100, ]
|
|
||||||
2 an a [100, ]
|
|
||||||
2 an an [100, ]
|
|
||||||
2 at a [100, 101, ]
|
|
||||||
2 at am [100, 101, ]
|
|
||||||
2 bell a [101, ]
|
|
||||||
3 an b [100, ]
|
|
||||||
3 an be [100, ]
|
|
||||||
3 at a [100, ]
|
|
||||||
3 at an [100, ]
|
|
||||||
3 rings a [101, ]
|
|
||||||
3 rings am [101, ]
|
|
||||||
3 the a [101, ]
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
1 a y [51, ]
|
|
||||||
1 x a [51, ]
|
|
||||||
1 x y [50, ]
|
|
||||||
2 x y [51, ]
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
1 a y [51, ]
|
|
||||||
1 x y [50, ]
|
|
||||||
2 x y [51, ]
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ]
|
|
@ -1,6 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
1 a 5 [51, ]
|
|
||||||
2 a am [51, ]
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
5 [51, ]
|
|
||||||
a0 [0, ]
|
|
||||||
a1 [1, ]
|
|
||||||
a10 [16, ]
|
|
||||||
a11 [17, ]
|
|
||||||
a12 [18, ]
|
|
||||||
a13 [19, ]
|
|
||||||
a14 [20, ]
|
|
||||||
a15 [21, ]
|
|
||||||
a16 [22, ]
|
|
||||||
a17 [23, ]
|
|
||||||
a18 [24, ]
|
|
||||||
a19 [25, ]
|
|
||||||
a1a [26, ]
|
|
||||||
a1b [27, ]
|
|
||||||
a1c [28, ]
|
|
||||||
a1d [29, ]
|
|
||||||
a1e [30, ]
|
|
||||||
a1f [31, ]
|
|
||||||
a2 [2, ]
|
|
||||||
a20 [32, ]
|
|
||||||
a21 [33, ]
|
|
||||||
a22 [34, ]
|
|
||||||
a23 [35, ]
|
|
||||||
a24 [36, ]
|
|
||||||
a25 [37, ]
|
|
||||||
a26 [38, ]
|
|
||||||
a27 [39, ]
|
|
||||||
a28 [40, ]
|
|
||||||
a29 [41, ]
|
|
||||||
a2a [42, ]
|
|
||||||
a2b [43, ]
|
|
||||||
a2c [44, ]
|
|
||||||
a2d [45, ]
|
|
||||||
a2e [46, ]
|
|
||||||
a2f [47, ]
|
|
||||||
a3 [3, ]
|
|
||||||
a30 [48, ]
|
|
||||||
a31 [49, ]
|
|
||||||
a4 [4, ]
|
|
||||||
a5 [5, ]
|
|
||||||
a6 [6, ]
|
|
||||||
a7 [7, ]
|
|
||||||
a8 [8, ]
|
|
||||||
a9 [9, ]
|
|
||||||
aa [10, ]
|
|
||||||
ab [11, ]
|
|
||||||
ac [12, ]
|
|
||||||
ad [13, ]
|
|
||||||
ae [14, ]
|
|
||||||
af [15, ]
|
|
||||||
am [51, ]
|
|
||||||
at [51, ]
|
|
||||||
bell [51, ]
|
|
||||||
rings [51, ]
|
|
||||||
the [51, ]
|
|
||||||
|
|
@ -1,10 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
1 5 a [51, ]
|
|
||||||
1 rings a [51, ]
|
|
||||||
2 at a [51, ]
|
|
||||||
2 bell a [51, ]
|
|
||||||
3 rings a [51, ]
|
|
||||||
3 the a [51, ]
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ]
|
|
@ -1,14 +0,0 @@
|
|||||||
---
|
|
||||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
|
||||||
---
|
|
||||||
1 a 5 [51, ]
|
|
||||||
1 a amazing [50, ]
|
|
||||||
1 a an [50, ]
|
|
||||||
1 a and [50, ]
|
|
||||||
1 a beautiful [50, ]
|
|
||||||
2 a am [51, ]
|
|
||||||
2 a amazing [50, ]
|
|
||||||
2 a and [50, ]
|
|
||||||
2 a beautiful [50, ]
|
|
||||||
2 a house [50, ]
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user