mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-22 20:50:04 +01:00
Merge branch 'main' into tmp-release-v1.5.0
This commit is contained in:
commit
7cb7e37ba8
3
.github/workflows/benchmarks-pr.yml
vendored
3
.github/workflows/benchmarks-pr.yml
vendored
@ -90,7 +90,8 @@ jobs:
|
||||
set -x
|
||||
export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
|
||||
export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
|
||||
echo 'Here are your benchmarks diff 👊' >> body.txt
|
||||
export bench_name=$(echo ${{ steps.command.outputs.command-arguments }})
|
||||
echo "Here are your $bench_name benchmarks diff 👊" >> body.txt
|
||||
echo '```' >> body.txt
|
||||
./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
|
||||
echo '```' >> body.txt
|
||||
|
2
.github/workflows/publish-apt-brew-pkg.yml
vendored
2
.github/workflows/publish-apt-brew-pkg.yml
vendored
@ -50,7 +50,7 @@ jobs:
|
||||
needs: check-version
|
||||
steps:
|
||||
- name: Create PR to Homebrew
|
||||
uses: mislav/bump-homebrew-formula-action@v2
|
||||
uses: mislav/bump-homebrew-formula-action@v3
|
||||
with:
|
||||
formula-name: meilisearch
|
||||
formula-path: Formula/m/meilisearch.rb
|
||||
|
2
.github/workflows/publish-docker-images.yml
vendored
2
.github/workflows/publish-docker-images.yml
vendored
@ -63,7 +63,7 @@ jobs:
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
4
.github/workflows/sdks-tests.yml
vendored
4
.github/workflows/sdks-tests.yml
vendored
@ -160,7 +160,7 @@ jobs:
|
||||
with:
|
||||
repository: meilisearch/meilisearch-js
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v3
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
cache: 'yarn'
|
||||
- name: Install dependencies
|
||||
@ -318,7 +318,7 @@ jobs:
|
||||
with:
|
||||
repository: meilisearch/meilisearch-js-plugins
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v3
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
cache: yarn
|
||||
- name: Install dependencies
|
||||
|
10
.github/workflows/test-suite.yml
vendored
10
.github/workflows/test-suite.yml
vendored
@ -43,7 +43,7 @@ jobs:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo check without any default features
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -65,7 +65,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo check without any default features
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -149,7 +149,7 @@ jobs:
|
||||
toolchain: stable
|
||||
override: true
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run tests in debug
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -168,7 +168,7 @@ jobs:
|
||||
override: true
|
||||
components: clippy
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo clippy
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -187,7 +187,7 @@ jobs:
|
||||
override: true
|
||||
components: rustfmt
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo fmt
|
||||
# Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
|
||||
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
|
||||
|
10
Cargo.lock
generated
10
Cargo.lock
generated
@ -1731,12 +1731,13 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
|
||||
|
||||
[[package]]
|
||||
name = "grenad"
|
||||
version = "0.4.4"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93"
|
||||
checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
"rayon",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
@ -3281,6 +3282,7 @@ dependencies = [
|
||||
"logging_timer",
|
||||
"maplit",
|
||||
"md5",
|
||||
"meili-snap",
|
||||
"memmap2",
|
||||
"mimalloc",
|
||||
"obkv",
|
||||
@ -3443,9 +3445,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "obkv"
|
||||
version = "0.2.0"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
|
||||
checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
|
@ -25,12 +25,6 @@
|
||||
|
||||
<p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>
|
||||
|
||||
---
|
||||
|
||||
### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A!
|
||||
|
||||
---
|
||||
|
||||
Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.
|
||||
|
||||
<p align="center" name="demo">
|
||||
|
@ -6,9 +6,7 @@ use std::path::Path;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use milli::heed::{EnvOpenOptions, RwTxn};
|
||||
use milli::update::{
|
||||
DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
|
||||
};
|
||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||
use milli::Index;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand_chacha::rand_core::SeedableRng;
|
||||
@ -266,17 +264,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
|
||||
(index, document_ids_to_delete)
|
||||
},
|
||||
move |(index, document_ids_to_delete)| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
for ids in document_ids_to_delete {
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_documents(&ids);
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
delete_documents_from_ids(index, document_ids_to_delete)
|
||||
},
|
||||
)
|
||||
});
|
||||
@ -613,17 +601,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
|
||||
(index, document_ids_to_delete)
|
||||
},
|
||||
move |(index, document_ids_to_delete)| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
for ids in document_ids_to_delete {
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_documents(&ids);
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
delete_documents_from_ids(index, document_ids_to_delete)
|
||||
},
|
||||
)
|
||||
});
|
||||
@ -875,22 +853,31 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
|
||||
(index, document_ids_to_delete)
|
||||
},
|
||||
move |(index, document_ids_to_delete)| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
for ids in document_ids_to_delete {
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_documents(&ids);
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
delete_documents_from_ids(index, document_ids_to_delete)
|
||||
},
|
||||
)
|
||||
});
|
||||
}
|
||||
|
||||
fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let indexer_config = IndexerConfig::default();
|
||||
for ids in document_ids_to_delete {
|
||||
let config = IndexDocumentsConfig::default();
|
||||
|
||||
let mut builder =
|
||||
IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
|
||||
.unwrap();
|
||||
(builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap();
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
}
|
||||
|
||||
fn indexing_movies_in_three_batches(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("indexing");
|
||||
group.sample_size(BENCHMARK_ITERATION);
|
||||
@ -1112,17 +1099,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
|
||||
(index, document_ids_to_delete)
|
||||
},
|
||||
move |(index, document_ids_to_delete)| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
for ids in document_ids_to_delete {
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_documents(&ids);
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
delete_documents_from_ids(index, document_ids_to_delete)
|
||||
},
|
||||
)
|
||||
});
|
||||
@ -1338,17 +1315,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
|
||||
(index, document_ids_to_delete)
|
||||
},
|
||||
move |(index, document_ids_to_delete)| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
for ids in document_ids_to_delete {
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_documents(&ids);
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
delete_documents_from_ids(index, document_ids_to_delete)
|
||||
},
|
||||
)
|
||||
});
|
||||
|
@ -526,12 +526,12 @@ pub(crate) mod test {
|
||||
assert!(indexes.is_empty());
|
||||
|
||||
// products
|
||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||
{
|
||||
"uid": "products",
|
||||
"primaryKey": "sku",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:22.688964637Z",
|
||||
"updatedAt": "2022-10-09T20:27:23.951017769Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -541,12 +541,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||
|
||||
// movies
|
||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||
{
|
||||
"uid": "movies",
|
||||
"primaryKey": "id",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:22.197788495Z",
|
||||
"updatedAt": "2022-10-09T20:28:01.93111053Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -571,12 +571,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
|
||||
|
||||
// spells
|
||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||
{
|
||||
"uid": "dnd_spells",
|
||||
"primaryKey": "index",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:24.242683494Z",
|
||||
"updatedAt": "2022-10-09T20:27:24.312809641Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -617,12 +617,12 @@ pub(crate) mod test {
|
||||
assert!(indexes.is_empty());
|
||||
|
||||
// products
|
||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||
{
|
||||
"uid": "products",
|
||||
"primaryKey": "sku",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:56.595257Z",
|
||||
"updatedAt": "2023-01-30T16:25:58.70348Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -632,12 +632,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||
|
||||
// movies
|
||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||
{
|
||||
"uid": "movies",
|
||||
"primaryKey": "id",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:56.192178Z",
|
||||
"updatedAt": "2023-01-30T16:25:56.455714Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -647,12 +647,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
|
||||
|
||||
// spells
|
||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||
{
|
||||
"uid": "dnd_spells",
|
||||
"primaryKey": "index",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:58.876405Z",
|
||||
"updatedAt": "2023-01-30T16:25:59.079906Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
|
@ -1,24 +0,0 @@
|
||||
---
|
||||
source: dump/src/reader/mod.rs
|
||||
expression: spells.settings().unwrap()
|
||||
---
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"*"
|
||||
],
|
||||
"filterableAttributes": [],
|
||||
"sortableAttributes": [],
|
||||
"rankingRules": [
|
||||
"typo",
|
||||
"words",
|
||||
"proximity",
|
||||
"attribute",
|
||||
"exactness"
|
||||
],
|
||||
"stopWords": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null
|
||||
}
|
@ -1,38 +0,0 @@
|
||||
---
|
||||
source: dump/src/reader/mod.rs
|
||||
expression: products.settings().unwrap()
|
||||
---
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"*"
|
||||
],
|
||||
"filterableAttributes": [],
|
||||
"sortableAttributes": [],
|
||||
"rankingRules": [
|
||||
"typo",
|
||||
"words",
|
||||
"proximity",
|
||||
"attribute",
|
||||
"exactness"
|
||||
],
|
||||
"stopWords": [],
|
||||
"synonyms": {
|
||||
"android": [
|
||||
"phone",
|
||||
"smartphone"
|
||||
],
|
||||
"iphone": [
|
||||
"phone",
|
||||
"smartphone"
|
||||
],
|
||||
"phone": [
|
||||
"android",
|
||||
"iphone",
|
||||
"smartphone"
|
||||
]
|
||||
},
|
||||
"distinctAttribute": null
|
||||
}
|
@ -1,31 +0,0 @@
|
||||
---
|
||||
source: dump/src/reader/mod.rs
|
||||
expression: movies.settings().unwrap()
|
||||
---
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"*"
|
||||
],
|
||||
"filterableAttributes": [
|
||||
"genres",
|
||||
"id"
|
||||
],
|
||||
"sortableAttributes": [
|
||||
"genres",
|
||||
"id"
|
||||
],
|
||||
"rankingRules": [
|
||||
"typo",
|
||||
"words",
|
||||
"proximity",
|
||||
"attribute",
|
||||
"exactness",
|
||||
"release_date:asc"
|
||||
],
|
||||
"stopWords": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null
|
||||
}
|
@ -46,6 +46,7 @@ pub type Checked = settings::Checked;
|
||||
pub type Unchecked = settings::Unchecked;
|
||||
|
||||
pub type Task = updates::UpdateEntry;
|
||||
pub type Kind = updates::UpdateMeta;
|
||||
|
||||
// everything related to the errors
|
||||
pub type ResponseError = errors::ResponseError;
|
||||
@ -107,8 +108,11 @@ impl V2Reader {
|
||||
pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> {
|
||||
Ok(self.index_uuid.iter().map(|index| -> Result<_> {
|
||||
V2IndexReader::new(
|
||||
index.uid.clone(),
|
||||
&self.dump.path().join("indexes").join(format!("index-{}", index.uuid)),
|
||||
index,
|
||||
BufReader::new(
|
||||
File::open(self.dump.path().join("updates").join("data.jsonl")).unwrap(),
|
||||
),
|
||||
)
|
||||
}))
|
||||
}
|
||||
@ -143,16 +147,41 @@ pub struct V2IndexReader {
|
||||
}
|
||||
|
||||
impl V2IndexReader {
|
||||
pub fn new(name: String, path: &Path) -> Result<Self> {
|
||||
pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader<File>) -> Result<Self> {
|
||||
let meta = File::open(path.join("meta.json"))?;
|
||||
let meta: DumpMeta = serde_json::from_reader(meta)?;
|
||||
|
||||
let mut created_at = None;
|
||||
let mut updated_at = None;
|
||||
|
||||
for line in tasks.lines() {
|
||||
let task: Task = serde_json::from_str(&line?)?;
|
||||
if !(task.uuid == index_uuid.uuid && task.is_finished()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let new_created_at = match task.update.meta() {
|
||||
Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(),
|
||||
_ => None,
|
||||
};
|
||||
let new_updated_at = task.update.finished_at();
|
||||
|
||||
if created_at.is_none() || created_at > new_created_at {
|
||||
created_at = new_created_at;
|
||||
}
|
||||
|
||||
if updated_at.is_none() || updated_at < new_updated_at {
|
||||
updated_at = new_updated_at;
|
||||
}
|
||||
}
|
||||
|
||||
let current_time = OffsetDateTime::now_utc();
|
||||
|
||||
let metadata = IndexMetadata {
|
||||
uid: name,
|
||||
uid: index_uuid.uid.clone(),
|
||||
primary_key: meta.primary_key,
|
||||
// FIXME: Iterate over the whole task queue to find the creation and last update date.
|
||||
created_at: OffsetDateTime::now_utc(),
|
||||
updated_at: OffsetDateTime::now_utc(),
|
||||
created_at: created_at.unwrap_or(current_time),
|
||||
updated_at: updated_at.unwrap_or(current_time),
|
||||
};
|
||||
|
||||
let ret = V2IndexReader {
|
||||
@ -248,12 +277,12 @@ pub(crate) mod test {
|
||||
assert!(indexes.is_empty());
|
||||
|
||||
// products
|
||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||
{
|
||||
"uid": "products",
|
||||
"primaryKey": "sku",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:22.688964637Z",
|
||||
"updatedAt": "2022-10-09T20:27:23.951017769Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -263,12 +292,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||
|
||||
// movies
|
||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||
{
|
||||
"uid": "movies",
|
||||
"primaryKey": "id",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:22.197788495Z",
|
||||
"updatedAt": "2022-10-09T20:28:01.93111053Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -293,12 +322,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
|
||||
|
||||
// spells
|
||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||
{
|
||||
"uid": "dnd_spells",
|
||||
"primaryKey": "index",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:24.242683494Z",
|
||||
"updatedAt": "2022-10-09T20:27:24.312809641Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -340,12 +369,12 @@ pub(crate) mod test {
|
||||
assert!(indexes.is_empty());
|
||||
|
||||
// products
|
||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||
{
|
||||
"uid": "products",
|
||||
"primaryKey": "sku",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:56.595257Z",
|
||||
"updatedAt": "2023-01-30T16:25:58.70348Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -355,12 +384,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||
|
||||
// movies
|
||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||
{
|
||||
"uid": "movies",
|
||||
"primaryKey": "id",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:56.192178Z",
|
||||
"updatedAt": "2023-01-30T16:25:56.455714Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -370,12 +399,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
|
||||
|
||||
// spells
|
||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||
{
|
||||
"uid": "dnd_spells",
|
||||
"primaryKey": "index",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:58.876405Z",
|
||||
"updatedAt": "2023-01-30T16:25:59.079906Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
|
@ -227,4 +227,14 @@ impl UpdateStatus {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn finished_at(&self) -> Option<OffsetDateTime> {
|
||||
match self {
|
||||
UpdateStatus::Processing(_) => None,
|
||||
UpdateStatus::Enqueued(_) => None,
|
||||
UpdateStatus::Processed(u) => Some(u.processed_at),
|
||||
UpdateStatus::Aborted(_) => None,
|
||||
UpdateStatus::Failed(u) => Some(u.failed_at),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -24,14 +24,13 @@ use std::fs::{self, File};
|
||||
use std::io::BufWriter;
|
||||
|
||||
use dump::IndexMetadata;
|
||||
use log::{debug, error, info};
|
||||
use log::{debug, error, info, trace};
|
||||
use meilisearch_types::error::Code;
|
||||
use meilisearch_types::heed::{RoTxn, RwTxn};
|
||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use meilisearch_types::milli::heed::CompactionOption;
|
||||
use meilisearch_types::milli::update::{
|
||||
DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
|
||||
Settings as MilliSettings,
|
||||
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
|
||||
};
|
||||
use meilisearch_types::milli::{self, Filter, BEU32};
|
||||
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
|
||||
@ -44,7 +43,7 @@ use uuid::Uuid;
|
||||
|
||||
use crate::autobatcher::{self, BatchKind};
|
||||
use crate::utils::{self, swap_index_uid_in_task};
|
||||
use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId};
|
||||
use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId};
|
||||
|
||||
/// Represents a combination of tasks that can all be processed at the same time.
|
||||
///
|
||||
@ -105,12 +104,6 @@ pub(crate) enum IndexOperation {
|
||||
operations: Vec<DocumentOperation>,
|
||||
tasks: Vec<Task>,
|
||||
},
|
||||
DocumentDeletion {
|
||||
index_uid: String,
|
||||
// The vec associated with each document deletion tasks.
|
||||
documents: Vec<Vec<String>>,
|
||||
tasks: Vec<Task>,
|
||||
},
|
||||
IndexDocumentDeletionByFilter {
|
||||
index_uid: String,
|
||||
task: Task,
|
||||
@ -162,7 +155,6 @@ impl Batch {
|
||||
}
|
||||
Batch::IndexOperation { op, .. } => match op {
|
||||
IndexOperation::DocumentOperation { tasks, .. }
|
||||
| IndexOperation::DocumentDeletion { tasks, .. }
|
||||
| IndexOperation::Settings { tasks, .. }
|
||||
| IndexOperation::DocumentClear { tasks, .. } => {
|
||||
tasks.iter().map(|task| task.uid).collect()
|
||||
@ -227,7 +219,6 @@ impl IndexOperation {
|
||||
pub fn index_uid(&self) -> &str {
|
||||
match self {
|
||||
IndexOperation::DocumentOperation { index_uid, .. }
|
||||
| IndexOperation::DocumentDeletion { index_uid, .. }
|
||||
| IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
|
||||
| IndexOperation::DocumentClear { index_uid, .. }
|
||||
| IndexOperation::Settings { index_uid, .. }
|
||||
@ -243,9 +234,6 @@ impl fmt::Display for IndexOperation {
|
||||
IndexOperation::DocumentOperation { .. } => {
|
||||
f.write_str("IndexOperation::DocumentOperation")
|
||||
}
|
||||
IndexOperation::DocumentDeletion { .. } => {
|
||||
f.write_str("IndexOperation::DocumentDeletion")
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { .. } => {
|
||||
f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
|
||||
}
|
||||
@ -348,18 +336,27 @@ impl IndexScheduler {
|
||||
BatchKind::DocumentDeletion { deletion_ids } => {
|
||||
let tasks = self.get_existing_tasks(rtxn, deletion_ids)?;
|
||||
|
||||
let mut documents = Vec::new();
|
||||
let mut operations = Vec::with_capacity(tasks.len());
|
||||
let mut documents_counts = Vec::with_capacity(tasks.len());
|
||||
for task in &tasks {
|
||||
match task.kind {
|
||||
KindWithContent::DocumentDeletion { ref documents_ids, .. } => {
|
||||
documents.push(documents_ids.clone())
|
||||
operations.push(DocumentOperation::Delete(documents_ids.clone()));
|
||||
documents_counts.push(documents_ids.len() as u64);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Some(Batch::IndexOperation {
|
||||
op: IndexOperation::DocumentDeletion { index_uid, documents, tasks },
|
||||
op: IndexOperation::DocumentOperation {
|
||||
index_uid,
|
||||
primary_key: None,
|
||||
method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
documents_counts,
|
||||
operations,
|
||||
tasks,
|
||||
},
|
||||
must_create_index,
|
||||
}))
|
||||
}
|
||||
@ -825,6 +822,10 @@ impl IndexScheduler {
|
||||
// 2. dump the tasks
|
||||
let mut dump_tasks = dump.create_tasks_queue()?;
|
||||
for ret in self.all_tasks.iter(&rtxn)? {
|
||||
if self.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
}
|
||||
|
||||
let (_, mut t) = ret?;
|
||||
let status = t.status;
|
||||
let content_file = t.content_uuid();
|
||||
@ -845,6 +846,9 @@ impl IndexScheduler {
|
||||
|
||||
// 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
|
||||
if let Some(content_file) = content_file {
|
||||
if self.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
}
|
||||
if status == Status::Enqueued {
|
||||
let content_file = self.file_store.get_update(content_file)?;
|
||||
|
||||
@ -884,6 +888,9 @@ impl IndexScheduler {
|
||||
|
||||
// 3.1. Dump the documents
|
||||
for ret in index.all_documents(&rtxn)? {
|
||||
if self.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
}
|
||||
let (_id, doc) = ret?;
|
||||
let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
||||
index_dumper.push_document(&document)?;
|
||||
@ -903,6 +910,9 @@ impl IndexScheduler {
|
||||
"[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
|
||||
)).unwrap();
|
||||
|
||||
if self.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
}
|
||||
let path = self.dumps_path.join(format!("{}.dump", dump_uid));
|
||||
let file = File::create(path)?;
|
||||
dump.persist_to(BufWriter::new(file))?;
|
||||
@ -1195,7 +1205,7 @@ impl IndexScheduler {
|
||||
index,
|
||||
indexer_config,
|
||||
config,
|
||||
|indexing_step| debug!("update: {:?}", indexing_step),
|
||||
|indexing_step| trace!("update: {:?}", indexing_step),
|
||||
|| must_stop_processing.get(),
|
||||
)?;
|
||||
|
||||
@ -1242,7 +1252,8 @@ impl IndexScheduler {
|
||||
let (new_builder, user_result) =
|
||||
builder.remove_documents(document_ids)?;
|
||||
builder = new_builder;
|
||||
|
||||
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
||||
let count = user_result.unwrap();
|
||||
let provided_ids =
|
||||
if let Some(Details::DocumentDeletion { provided_ids, .. }) =
|
||||
task.details
|
||||
@ -1253,23 +1264,11 @@ impl IndexScheduler {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
match user_result {
|
||||
Ok(count) => {
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentDeletion {
|
||||
provided_ids,
|
||||
deleted_documents: Some(count),
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
task.status = Status::Failed;
|
||||
task.details = Some(Details::DocumentDeletion {
|
||||
provided_ids,
|
||||
deleted_documents: Some(0),
|
||||
});
|
||||
task.error = Some(milli::Error::from(e).into());
|
||||
}
|
||||
}
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentDeletion {
|
||||
provided_ids,
|
||||
deleted_documents: Some(count),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1284,31 +1283,13 @@ impl IndexScheduler {
|
||||
milli::update::Settings::new(index_wtxn, index, indexer_config);
|
||||
builder.reset_primary_key();
|
||||
builder.execute(
|
||||
|indexing_step| debug!("update: {:?}", indexing_step),
|
||||
|indexing_step| trace!("update: {:?}", indexing_step),
|
||||
|| must_stop_processing.clone().get(),
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
|
||||
let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?;
|
||||
documents.iter().flatten().for_each(|id| {
|
||||
builder.delete_external_id(id);
|
||||
});
|
||||
|
||||
let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?;
|
||||
|
||||
for (task, documents) in tasks.iter_mut().zip(documents) {
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentDeletion {
|
||||
provided_ids: documents.len(),
|
||||
deleted_documents: Some(deleted_documents.min(documents.len() as u64)),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
|
||||
let filter =
|
||||
if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
|
||||
@ -1318,7 +1299,13 @@ impl IndexScheduler {
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
|
||||
let deleted_documents = delete_document_by_filter(
|
||||
index_wtxn,
|
||||
filter,
|
||||
self.index_mapper.indexer_config(),
|
||||
self.must_stop_processing.clone(),
|
||||
index,
|
||||
);
|
||||
let original_filter = if let Some(Details::DocumentDeletionByFilter {
|
||||
original_filter,
|
||||
deleted_documents: _,
|
||||
@ -1552,6 +1539,8 @@ impl IndexScheduler {
|
||||
fn delete_document_by_filter<'a>(
|
||||
wtxn: &mut RwTxn<'a, '_>,
|
||||
filter: &serde_json::Value,
|
||||
indexer_config: &IndexerConfig,
|
||||
must_stop_processing: MustStopProcessing,
|
||||
index: &'a Index,
|
||||
) -> Result<u64> {
|
||||
let filter = Filter::from_json(filter)?;
|
||||
@ -1562,9 +1551,26 @@ fn delete_document_by_filter<'a>(
|
||||
}
|
||||
e => e.into(),
|
||||
})?;
|
||||
let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
|
||||
delete_operation.delete_documents(&candidates);
|
||||
delete_operation.execute().map(|result| result.deleted_documents)?
|
||||
|
||||
let config = IndexDocumentsConfig {
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let mut builder = milli::update::IndexDocuments::new(
|
||||
wtxn,
|
||||
index,
|
||||
indexer_config,
|
||||
config,
|
||||
|indexing_step| debug!("update: {:?}", indexing_step),
|
||||
|| must_stop_processing.get(),
|
||||
)?;
|
||||
|
||||
let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?;
|
||||
builder = new_builder;
|
||||
|
||||
let _ = builder.execute()?;
|
||||
count
|
||||
} else {
|
||||
0
|
||||
})
|
||||
|
@ -108,6 +108,8 @@ pub enum Error {
|
||||
TaskDeletionWithEmptyQuery,
|
||||
#[error("Query parameters to filter the tasks to cancel are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")]
|
||||
TaskCancelationWithEmptyQuery,
|
||||
#[error("Aborted task")]
|
||||
AbortedTask,
|
||||
|
||||
#[error(transparent)]
|
||||
Dump(#[from] dump::Error),
|
||||
@ -175,6 +177,7 @@ impl Error {
|
||||
| Error::TaskNotFound(_)
|
||||
| Error::TaskDeletionWithEmptyQuery
|
||||
| Error::TaskCancelationWithEmptyQuery
|
||||
| Error::AbortedTask
|
||||
| Error::Dump(_)
|
||||
| Error::Heed(_)
|
||||
| Error::Milli(_)
|
||||
@ -236,6 +239,9 @@ impl ErrorCode for Error {
|
||||
Error::TaskDatabaseUpdate(_) => Code::Internal,
|
||||
Error::CreateBatch(_) => Code::Internal,
|
||||
|
||||
// This one should never be seen by the end user
|
||||
Error::AbortedTask => Code::Internal,
|
||||
|
||||
#[cfg(test)]
|
||||
Error::PlannedFailure => Code::Internal,
|
||||
}
|
||||
|
@ -1183,7 +1183,8 @@ impl IndexScheduler {
|
||||
// If we have an abortion error we must stop the tick here and re-schedule tasks.
|
||||
Err(Error::Milli(milli::Error::InternalError(
|
||||
milli::InternalError::AbortedIndexation,
|
||||
))) => {
|
||||
)))
|
||||
| Err(Error::AbortedTask) => {
|
||||
#[cfg(test)]
|
||||
self.breakpoint(Breakpoint::AbortedIndexation);
|
||||
wtxn.abort().map_err(Error::HeedTransaction)?;
|
||||
@ -4339,4 +4340,26 @@ mod tests {
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cancel_processing_dump() {
|
||||
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
||||
|
||||
let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None };
|
||||
let dump_cancellation = KindWithContent::TaskCancelation {
|
||||
query: "cancel dump".to_owned(),
|
||||
tasks: RoaringBitmap::from_iter([0]),
|
||||
};
|
||||
let _ = index_scheduler.register(dump_creation).unwrap();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register");
|
||||
handle.advance_till([Start, BatchCreated, InsideProcessBatch]);
|
||||
|
||||
let _ = index_scheduler.register(dump_cancellation).unwrap();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered");
|
||||
|
||||
snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation");
|
||||
|
||||
handle.advance_one_successful_batch();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed");
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,35 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"dumpCreation" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
@ -0,0 +1,45 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: canceled, canceled_by: 1, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
|
||||
1 {uid: 1, status: succeeded, details: { matched_tasks: 1, canceled_tasks: Some(0), original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued []
|
||||
succeeded [1,]
|
||||
canceled [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"taskCancelation" [1,]
|
||||
"dumpCreation" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
1 [0,]
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
@ -0,0 +1,38 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[0,]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
|
||||
1 {uid: 1, status: enqueued, details: { matched_tasks: 1, canceled_tasks: None, original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"taskCancelation" [1,]
|
||||
"dumpCreation" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
@ -324,7 +324,6 @@ impl ErrorCode for milli::Error {
|
||||
UserError::SerdeJson(_)
|
||||
| UserError::InvalidLmdbOpenOptions
|
||||
| UserError::DocumentLimitReached
|
||||
| UserError::AccessingSoftDeletedDocument { .. }
|
||||
| UserError::UnknownInternalDocumentId { .. } => Code::Internal,
|
||||
UserError::InvalidStoreFile => Code::InvalidStoreFile,
|
||||
UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,
|
||||
|
@ -362,7 +362,7 @@ fn import_dump(
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
..Default::default()
|
||||
},
|
||||
|indexing_step| log::debug!("update: {:?}", indexing_step),
|
||||
|indexing_step| log::trace!("update: {:?}", indexing_step),
|
||||
|| false,
|
||||
)?;
|
||||
|
||||
|
@ -612,8 +612,8 @@ fn retrieve_document<S: AsRef<str>>(
|
||||
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
||||
|
||||
let internal_id = index
|
||||
.external_documents_ids(&txn)?
|
||||
.get(doc_id.as_bytes())
|
||||
.external_documents_ids()
|
||||
.get(&txn, doc_id)?
|
||||
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
|
||||
|
||||
let document = index
|
||||
|
@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() {
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"providedIds": 0,
|
||||
"deletedDocuments": 4,
|
||||
"deletedDocuments": 2,
|
||||
"originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]"
|
||||
},
|
||||
"error": null,
|
||||
|
@ -26,8 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" }
|
||||
fst = "0.4.7"
|
||||
fxhash = "0.2.1"
|
||||
geoutils = "0.5.1"
|
||||
grenad = { version = "0.4.4", default-features = false, features = [
|
||||
"tempfile",
|
||||
grenad = { version = "0.4.5", default-features = false, features = [
|
||||
"rayon", "tempfile"
|
||||
] }
|
||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
|
||||
"lmdb", "read-txn-no-tls"
|
||||
@ -79,6 +79,7 @@ big_s = "1.0.2"
|
||||
insta = "1.29.0"
|
||||
maplit = "1.0.2"
|
||||
md5 = "0.7.0"
|
||||
meili-snap = { path = "../meili-snap" }
|
||||
rand = { version = "0.8.5", features = ["small_rng"] }
|
||||
|
||||
[features]
|
||||
|
@ -1,5 +1,6 @@
|
||||
mod builder;
|
||||
mod enriched;
|
||||
mod primary_key;
|
||||
mod reader;
|
||||
mod serde_impl;
|
||||
|
||||
@ -11,6 +12,7 @@ use bimap::BiHashMap;
|
||||
pub use builder::DocumentsBatchBuilder;
|
||||
pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
|
||||
use obkv::KvReader;
|
||||
pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY};
|
||||
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@ -87,6 +89,12 @@ impl DocumentsBatchIndex {
|
||||
}
|
||||
}
|
||||
|
||||
impl FieldIdMapper for DocumentsBatchIndex {
|
||||
fn id(&self, name: &str) -> Option<FieldId> {
|
||||
self.id(name)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum Error {
|
||||
#[error("Error parsing number {value:?} at line {line}: {error}")]
|
||||
|
172
milli/src/documents/primary_key.rs
Normal file
172
milli/src/documents/primary_key.rs
Normal file
@ -0,0 +1,172 @@
|
||||
use std::iter;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::{FieldId, InternalError, Object, Result, UserError};
|
||||
|
||||
/// The symbol used to define levels in a nested primary key.
|
||||
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
|
||||
|
||||
/// The default primary that is used when not specified.
|
||||
pub const DEFAULT_PRIMARY_KEY: &str = "id";
|
||||
|
||||
/// Trait for objects that can map the name of a field to its [`FieldId`].
|
||||
pub trait FieldIdMapper {
|
||||
/// Attempts to map the passed name to its [`FieldId`].
|
||||
///
|
||||
/// `None` if the field with this name was not found.
|
||||
fn id(&self, name: &str) -> Option<FieldId>;
|
||||
}
|
||||
|
||||
/// A type that represent the type of primary key that has been set
|
||||
/// for this index, a classic flat one or a nested one.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum PrimaryKey<'a> {
|
||||
Flat { name: &'a str, field_id: FieldId },
|
||||
Nested { name: &'a str },
|
||||
}
|
||||
|
||||
pub enum DocumentIdExtractionError {
|
||||
InvalidDocumentId(UserError),
|
||||
MissingDocumentId,
|
||||
TooManyDocumentIds(usize),
|
||||
}
|
||||
|
||||
impl<'a> PrimaryKey<'a> {
|
||||
pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> {
|
||||
Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
|
||||
Self::Nested { name: path }
|
||||
} else {
|
||||
let field_id = fields.id(path)?;
|
||||
Self::Flat { name: path, field_id }
|
||||
})
|
||||
}
|
||||
|
||||
pub fn name(&self) -> &str {
|
||||
match self {
|
||||
PrimaryKey::Flat { name, .. } => name,
|
||||
PrimaryKey::Nested { name } => name,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn document_id(
|
||||
&self,
|
||||
document: &obkv::KvReader<FieldId>,
|
||||
fields: &impl FieldIdMapper,
|
||||
) -> Result<StdResult<String, DocumentIdExtractionError>> {
|
||||
match self {
|
||||
PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
|
||||
Some(document_id_bytes) => {
|
||||
let document_id = serde_json::from_slice(document_id_bytes)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
match validate_document_id_value(document_id)? {
|
||||
Ok(document_id) => Ok(Ok(document_id)),
|
||||
Err(user_error) => {
|
||||
Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
|
||||
}
|
||||
}
|
||||
}
|
||||
None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
|
||||
},
|
||||
nested @ PrimaryKey::Nested { .. } => {
|
||||
let mut matching_documents_ids = Vec::new();
|
||||
for (first_level_name, right) in nested.possible_level_names() {
|
||||
if let Some(field_id) = fields.id(first_level_name) {
|
||||
if let Some(value_bytes) = document.get(field_id) {
|
||||
let object = serde_json::from_slice(value_bytes)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
fetch_matching_values(object, right, &mut matching_documents_ids);
|
||||
|
||||
if matching_documents_ids.len() >= 2 {
|
||||
return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(
|
||||
matching_documents_ids.len(),
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
match matching_documents_ids.pop() {
|
||||
Some(document_id) => match validate_document_id_value(document_id)? {
|
||||
Ok(document_id) => Ok(Ok(document_id)),
|
||||
Err(user_error) => {
|
||||
Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
|
||||
}
|
||||
},
|
||||
None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an `Iterator` that gives all the possible fields names the primary key
|
||||
/// can have depending of the first level name and depth of the objects.
|
||||
pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
|
||||
let name = self.name();
|
||||
name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
|
||||
.map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
|
||||
.chain(iter::once((name, "")))
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
|
||||
match value {
|
||||
Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
|
||||
otherwise => output.push(otherwise),
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch_matching_values_in_object(
|
||||
object: Object,
|
||||
selector: &str,
|
||||
base_key: &str,
|
||||
output: &mut Vec<Value>,
|
||||
) {
|
||||
for (key, value) in object {
|
||||
let base_key = if base_key.is_empty() {
|
||||
key.to_string()
|
||||
} else {
|
||||
format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
|
||||
};
|
||||
|
||||
if starts_with(selector, &base_key) {
|
||||
match value {
|
||||
Value::Object(object) => {
|
||||
fetch_matching_values_in_object(object, selector, &base_key, output)
|
||||
}
|
||||
value => output.push(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn starts_with(selector: &str, key: &str) -> bool {
|
||||
selector.strip_prefix(key).map_or(false, |tail| {
|
||||
tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
|
||||
})
|
||||
}
|
||||
|
||||
// FIXME: move to a DocumentId struct
|
||||
|
||||
fn validate_document_id(document_id: &str) -> Option<&str> {
|
||||
if !document_id.is_empty()
|
||||
&& document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
|
||||
{
|
||||
Some(document_id)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
|
||||
match document_id {
|
||||
Value::String(string) => match validate_document_id(&string) {
|
||||
Some(s) if s.len() == string.len() => Ok(Ok(string)),
|
||||
Some(s) => Ok(Ok(s.to_string())),
|
||||
None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
|
||||
},
|
||||
Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
|
||||
content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
|
||||
}
|
||||
}
|
@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry {
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum UserError {
|
||||
#[error("A soft deleted internal document id have been used: `{document_id}`.")]
|
||||
AccessingSoftDeletedDocument { document_id: DocumentId },
|
||||
#[error("A document cannot contain more than 65,535 fields.")]
|
||||
AttributeLimitReached,
|
||||
#[error(transparent)]
|
||||
|
@ -1,159 +1,75 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::{fmt, str};
|
||||
|
||||
use fst::map::IndexedValue;
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use roaring::RoaringBitmap;
|
||||
use heed::types::{OwnedType, Str};
|
||||
use heed::{Database, RoIter, RoTxn, RwTxn};
|
||||
|
||||
const DELETED_ID: u64 = u64::MAX;
|
||||
use crate::{DocumentId, BEU32};
|
||||
|
||||
pub struct ExternalDocumentsIds<'a> {
|
||||
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
|
||||
pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
|
||||
soft_deleted_docids: RoaringBitmap,
|
||||
pub enum DocumentOperationKind {
|
||||
Create,
|
||||
Delete,
|
||||
}
|
||||
|
||||
impl<'a> ExternalDocumentsIds<'a> {
|
||||
pub fn new(
|
||||
hard: fst::Map<Cow<'a, [u8]>>,
|
||||
soft: fst::Map<Cow<'a, [u8]>>,
|
||||
soft_deleted_docids: RoaringBitmap,
|
||||
) -> ExternalDocumentsIds<'a> {
|
||||
ExternalDocumentsIds { hard, soft, soft_deleted_docids }
|
||||
}
|
||||
pub struct DocumentOperation {
|
||||
pub external_id: String,
|
||||
pub internal_id: DocumentId,
|
||||
pub kind: DocumentOperationKind,
|
||||
}
|
||||
|
||||
pub fn into_static(self) -> ExternalDocumentsIds<'static> {
|
||||
ExternalDocumentsIds {
|
||||
hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
|
||||
soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
|
||||
soft_deleted_docids: self.soft_deleted_docids,
|
||||
}
|
||||
pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
|
||||
|
||||
impl ExternalDocumentsIds {
|
||||
pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
|
||||
ExternalDocumentsIds(db)
|
||||
}
|
||||
|
||||
/// Returns `true` if hard and soft external documents lists are empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.hard.is_empty() && self.soft.is_empty()
|
||||
pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> {
|
||||
self.0.is_empty(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
|
||||
let external_id = external_id.as_ref();
|
||||
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
|
||||
Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
|
||||
Some(id.try_into().unwrap())
|
||||
}
|
||||
_otherwise => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
|
||||
/// don't contain any soft deleted document id.
|
||||
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
|
||||
let mut new_hard_builder = fst::MapBuilder::memory();
|
||||
|
||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
||||
let mut iter = union_op.into_stream();
|
||||
while let Some((external_id, docids)) = iter.next() {
|
||||
// prefer selecting the ids from soft, always
|
||||
let id = indexed_last_value(docids).unwrap();
|
||||
if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) {
|
||||
new_hard_builder.insert(external_id, id)?;
|
||||
}
|
||||
}
|
||||
drop(iter);
|
||||
|
||||
// Delete soft map completely
|
||||
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
||||
// We save the new map as the new hard map.
|
||||
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
|
||||
let union_op = self.soft.op().add(other).r#union();
|
||||
|
||||
let mut new_soft_builder = fst::MapBuilder::memory();
|
||||
let mut iter = union_op.into_stream();
|
||||
while let Some((external_id, marked_docids)) = iter.next() {
|
||||
let id = indexed_last_value(marked_docids).unwrap();
|
||||
new_soft_builder.insert(external_id, id)?;
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
// We save the new map as the new soft map.
|
||||
self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
|
||||
self.merge_soft_into_hard()
|
||||
pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> {
|
||||
Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get()))
|
||||
}
|
||||
|
||||
/// An helper function to debug this type, returns an `HashMap` of both,
|
||||
/// soft and hard fst maps, combined.
|
||||
pub fn to_hash_map(&self) -> HashMap<String, u32> {
|
||||
let mut map = HashMap::new();
|
||||
|
||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
||||
let mut iter = union_op.into_stream();
|
||||
while let Some((external_id, marked_docids)) = iter.next() {
|
||||
let id = indexed_last_value(marked_docids).unwrap();
|
||||
if id != DELETED_ID {
|
||||
let external_id = str::from_utf8(external_id).unwrap();
|
||||
map.insert(external_id.to_owned(), id.try_into().unwrap());
|
||||
}
|
||||
pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, u32>> {
|
||||
let mut map = HashMap::default();
|
||||
for result in self.0.iter(rtxn)? {
|
||||
let (external, internal) = result?;
|
||||
map.insert(external.to_owned(), internal.get());
|
||||
}
|
||||
|
||||
map
|
||||
Ok(map)
|
||||
}
|
||||
|
||||
/// Return an fst of the combined hard and soft deleted ID.
|
||||
pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
|
||||
if self.soft.is_empty() {
|
||||
return Ok(Cow::Borrowed(&self.hard));
|
||||
}
|
||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
||||
|
||||
let mut iter = union_op.into_stream();
|
||||
let mut new_hard_builder = fst::MapBuilder::memory();
|
||||
while let Some((external_id, marked_docids)) = iter.next() {
|
||||
let value = indexed_last_value(marked_docids).unwrap();
|
||||
if value != DELETED_ID {
|
||||
new_hard_builder.insert(external_id, value)?;
|
||||
/// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
|
||||
///
|
||||
/// If the list contains multiple operations on the same external id, then the result is unspecified.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - If attempting to delete a document that doesn't exist
|
||||
/// - If attempting to create a document that already exists
|
||||
pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> {
|
||||
for DocumentOperation { external_id, internal_id, kind } in operations {
|
||||
match kind {
|
||||
DocumentOperationKind::Create => {
|
||||
self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?;
|
||||
}
|
||||
DocumentOperationKind::Delete => {
|
||||
if !self.0.delete(wtxn, &external_id)? {
|
||||
panic!("Attempting to delete a non-existing document")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
|
||||
}
|
||||
|
||||
fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
|
||||
if self.soft.len() >= self.hard.len() / 2 {
|
||||
self.hard = self.to_fst()?.into_owned();
|
||||
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for ExternalDocumentsIds<'_> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish()
|
||||
/// Returns an iterator over all the external ids.
|
||||
pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, OwnedType<BEU32>>> {
|
||||
self.0.iter(rtxn)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ExternalDocumentsIds<'static> {
|
||||
fn default() -> Self {
|
||||
ExternalDocumentsIds {
|
||||
hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
|
||||
soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
|
||||
soft_deleted_docids: RoaringBitmap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the value of the `IndexedValue` with the highest _index_.
|
||||
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
|
||||
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
|
||||
}
|
||||
|
@ -81,6 +81,12 @@ impl Default for FieldsIdsMap {
|
||||
}
|
||||
}
|
||||
|
||||
impl crate::documents::FieldIdMapper for FieldsIdsMap {
|
||||
fn id(&self, name: &str) -> Option<FieldId> {
|
||||
self.id(name)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
|
||||
/// This is the limit where using a byteorder became less size efficient
|
||||
/// than using a direct roaring encoding, it is also the point where we are able
|
||||
@ -60,12 +61,16 @@ impl CboRoaringBitmapCodec {
|
||||
/// if the merged values length is under the threshold, values are directly
|
||||
/// serialized in the buffer else a RoaringBitmap is created from the
|
||||
/// values and is serialized in the buffer.
|
||||
pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||
pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = A>,
|
||||
A: AsRef<[u8]>,
|
||||
{
|
||||
let mut roaring = RoaringBitmap::new();
|
||||
let mut vec = Vec::new();
|
||||
|
||||
for bytes in slices {
|
||||
if bytes.len() <= THRESHOLD * size_of::<u32>() {
|
||||
if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
|
||||
let mut reader = bytes.as_ref();
|
||||
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
|
||||
vec.push(integer);
|
||||
@ -85,7 +90,7 @@ impl CboRoaringBitmapCodec {
|
||||
}
|
||||
} else {
|
||||
// We can unwrap safely because the vector is sorted upper.
|
||||
let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
|
||||
let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
|
||||
roaring.serialize_into(buffer)?;
|
||||
}
|
||||
} else {
|
||||
@ -95,6 +100,33 @@ impl CboRoaringBitmapCodec {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Merges a DelAdd delta into a CboRoaringBitmap.
|
||||
pub fn merge_deladd_into<'a>(
|
||||
deladd: KvReaderDelAdd<'_>,
|
||||
previous: &[u8],
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> io::Result<Option<&'a [u8]>> {
|
||||
// Deserialize the bitmap that is already there
|
||||
let mut previous = Self::deserialize_from(previous)?;
|
||||
|
||||
// Remove integers we no more want in the previous bitmap
|
||||
if let Some(value) = deladd.get(DelAdd::Deletion) {
|
||||
previous -= Self::deserialize_from(value)?;
|
||||
}
|
||||
|
||||
// Insert the new integers we want in the previous bitmap
|
||||
if let Some(value) = deladd.get(DelAdd::Addition) {
|
||||
previous |= Self::deserialize_from(value)?;
|
||||
}
|
||||
|
||||
if previous.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Self::serialize_into(&previous, buffer);
|
||||
Ok(Some(&buffer[..]))
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec;
|
||||
/// The documents returned by the iterator are grouped by the facet values that
|
||||
/// determined their rank. For example, given the documents:
|
||||
///
|
||||
/// ```ignore
|
||||
/// ```text
|
||||
/// 0: { "colour": ["blue", "green"] }
|
||||
/// 1: { "colour": ["blue", "red"] }
|
||||
/// 2: { "colour": ["orange", "red"] }
|
||||
@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec;
|
||||
/// ```
|
||||
/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
|
||||
/// over the following elements:
|
||||
/// ```ignore
|
||||
/// ```text
|
||||
/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
|
||||
/// [3] // same for "green"
|
||||
/// [2] // same for "orange"
|
||||
|
@ -223,12 +223,9 @@ impl<'a> Filter<'a> {
|
||||
impl<'a> Filter<'a> {
|
||||
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
|
||||
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
|
||||
let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
|
||||
let filterable_fields = index.filterable_fields(rtxn)?;
|
||||
|
||||
// and finally we delete all the soft_deleted_documents, again, only once at the very end
|
||||
self.inner_evaluate(rtxn, index, &filterable_fields)
|
||||
.map(|result| result - soft_deleted_documents)
|
||||
}
|
||||
|
||||
fn evaluate_operator(
|
||||
|
@ -12,7 +12,7 @@ use super::Word;
|
||||
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
|
||||
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
|
||||
use crate::{
|
||||
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
|
||||
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
|
||||
};
|
||||
|
||||
/// A cache storing pointers to values in the LMDB databases.
|
||||
@ -25,7 +25,7 @@ pub struct DatabaseCache<'ctx> {
|
||||
pub word_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_prefix_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>,
|
||||
pub prefix_word_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
||||
@ -168,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
@ -182,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
&mut self,
|
||||
word: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
@ -230,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
@ -244,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
&mut self,
|
||||
prefix: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
@ -297,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
prefix2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
(proximity, word1, prefix2),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(word1).as_str(),
|
||||
self.word_interner.get(prefix2).as_str(),
|
||||
),
|
||||
&mut self.db_cache.word_prefix_pair_proximity_docids,
|
||||
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
let docids = match self
|
||||
.db_cache
|
||||
.word_prefix_pair_proximity_docids
|
||||
.entry((proximity, word1, prefix2))
|
||||
{
|
||||
Entry::Occupied(docids) => docids.get().clone(),
|
||||
Entry::Vacant(entry) => {
|
||||
// compute docids using prefix iter and store the result in the cache.
|
||||
let key = U8StrStrCodec::bytes_encode(&(
|
||||
proximity,
|
||||
self.word_interner.get(word1).as_str(),
|
||||
self.word_interner.get(prefix2).as_str(),
|
||||
))
|
||||
.unwrap()
|
||||
.into_owned();
|
||||
let mut prefix_docids = RoaringBitmap::new();
|
||||
let remap_key_type = self
|
||||
.index
|
||||
.word_pair_proximity_docids
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(self.txn, &key)?;
|
||||
for result in remap_key_type {
|
||||
let (_, docids) = result?;
|
||||
|
||||
prefix_docids |= docids;
|
||||
}
|
||||
entry.insert(Some(prefix_docids.clone()));
|
||||
Some(prefix_docids)
|
||||
}
|
||||
};
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
pub fn get_db_prefix_word_pair_proximity_docids(
|
||||
&mut self,
|
||||
left_prefix: Interned<String>,
|
||||
right: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
(proximity, left_prefix, right),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(left_prefix).as_str(),
|
||||
self.word_interner.get(right).as_str(),
|
||||
),
|
||||
&mut self.db_cache.prefix_word_pair_proximity_docids,
|
||||
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
// only accept exact matches on reverted positions
|
||||
self.get_db_word_pair_proximity_docids(left_prefix, right, proximity)
|
||||
}
|
||||
|
||||
pub fn get_db_word_fid_docids(
|
||||
|
@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best s");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
@ -379,13 +379,13 @@ fn test_proximity_prefix_db() {
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"this is the best summer meal\"",
|
||||
"\"summer best\"",
|
||||
"\"this is the best meal of summer\"",
|
||||
"\"summer x best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
||||
"\"this is the best cooked meal of the summer\"",
|
||||
"\"this is the best meal of the summer\"",
|
||||
"\"summer x y best\"",
|
||||
"\"summer x best\"",
|
||||
"\"summer best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
]
|
||||
"###);
|
||||
@ -423,17 +423,17 @@ fn test_proximity_prefix_db() {
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best win");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]");
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"this is the best winter meal\"",
|
||||
"\"this is the best meal of winter\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
"\"this is the best cooked meal of the winter\"",
|
||||
"\"this is the best meal of the winter\"",
|
||||
"\"this is the best meal of winter\"",
|
||||
"\"this is the best winter meal\"",
|
||||
"\"winter x y best\"",
|
||||
"\"winter x best\"",
|
||||
"\"winter best\"",
|
||||
@ -471,20 +471,20 @@ fn test_proximity_prefix_db() {
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best wi");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"this is the best winter meal\"",
|
||||
"\"winter best\"",
|
||||
"\"this is the best meal of winter\"",
|
||||
"\"winter x best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
"\"this is the best cooked meal of the winter\"",
|
||||
"\"this is the best meal of the winter\"",
|
||||
"\"winter x y best\"",
|
||||
"\"winter x best\"",
|
||||
"\"winter best\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 3,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
|
@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 3,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
|
@ -6,7 +6,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
@ -14,7 +14,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
|
@ -13,6 +13,7 @@ This module tests the `sort` ranking rule:
|
||||
|
||||
use big_s::S;
|
||||
use maplit::hashset;
|
||||
use meili_snap::insta;
|
||||
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::search::new::tests::collect_field_values;
|
||||
|
@ -4,9 +4,8 @@ use std::path::Path;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
|
||||
use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index};
|
||||
use crate::{make_db_snap_from_iter, obkv_to_json, Index};
|
||||
|
||||
#[track_caller]
|
||||
pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) {
|
||||
@ -98,7 +97,6 @@ Create a snapshot test of the given database.
|
||||
- `facet_id_string_docids`
|
||||
- `documents_ids`
|
||||
- `stop_words`
|
||||
- `soft_deleted_documents_ids`
|
||||
- `field_distribution`
|
||||
- `fields_ids_map`
|
||||
- `geo_faceted_documents_ids`
|
||||
@ -221,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
|
||||
&format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
|
||||
})
|
||||
}
|
||||
pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
|
||||
make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |(
|
||||
(proximity, word1, prefix),
|
||||
b,
|
||||
)| {
|
||||
&format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b))
|
||||
})
|
||||
}
|
||||
pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String {
|
||||
make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |(
|
||||
(proximity, prefix, word2),
|
||||
b,
|
||||
)| {
|
||||
&format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b))
|
||||
})
|
||||
}
|
||||
pub fn snap_word_position_docids(index: &Index) -> String {
|
||||
make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
|
||||
&format!("{word:<16} {position:<6} {}", display_bitmap(&b))
|
||||
@ -308,12 +290,6 @@ pub fn snap_stop_words(index: &Index) -> String {
|
||||
let snap = format!("{stop_words:?}");
|
||||
snap
|
||||
}
|
||||
pub fn snap_soft_deleted_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap();
|
||||
|
||||
display_bitmap(&soft_deleted_documents_ids)
|
||||
}
|
||||
pub fn snap_field_distributions(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let mut snap = String::new();
|
||||
@ -340,50 +316,21 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
|
||||
}
|
||||
pub fn snap_external_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap();
|
||||
let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap();
|
||||
// ensure fixed order (not guaranteed by hashmap)
|
||||
let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect();
|
||||
external_ids.sort_by(|(l, _), (r, _)| l.cmp(r));
|
||||
|
||||
let mut snap = String::new();
|
||||
|
||||
writeln!(&mut snap, "soft:").unwrap();
|
||||
let stream_soft = soft.stream();
|
||||
let soft_external_ids = stream_soft.into_str_vec().unwrap();
|
||||
for (key, id) in soft_external_ids {
|
||||
writeln!(&mut snap, "{key:<24} {id}").unwrap();
|
||||
}
|
||||
writeln!(&mut snap, "hard:").unwrap();
|
||||
let stream_hard = hard.stream();
|
||||
let hard_external_ids = stream_hard.into_str_vec().unwrap();
|
||||
for (key, id) in hard_external_ids {
|
||||
writeln!(&mut snap, "docids:").unwrap();
|
||||
for (key, id) in external_ids {
|
||||
writeln!(&mut snap, "{key:<24} {id}").unwrap();
|
||||
}
|
||||
|
||||
snap
|
||||
}
|
||||
pub fn snap_number_faceted_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let mut snap = String::new();
|
||||
for field_id in fields_ids_map.ids() {
|
||||
let number_faceted_documents_ids =
|
||||
index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap();
|
||||
writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids))
|
||||
.unwrap();
|
||||
}
|
||||
snap
|
||||
}
|
||||
pub fn snap_string_faceted_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
|
||||
let mut snap = String::new();
|
||||
for field_id in fields_ids_map.ids() {
|
||||
let string_faceted_documents_ids =
|
||||
index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap();
|
||||
writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids))
|
||||
.unwrap();
|
||||
}
|
||||
snap
|
||||
}
|
||||
pub fn snap_words_fst(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let words_fst = index.words_fst(&rtxn).unwrap();
|
||||
@ -516,9 +463,6 @@ macro_rules! full_snap_of_db {
|
||||
($index:ident, stop_words) => {{
|
||||
$crate::snapshot_tests::snap_stop_words(&$index)
|
||||
}};
|
||||
($index:ident, soft_deleted_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, field_distribution) => {{
|
||||
$crate::snapshot_tests::snap_field_distributions(&$index)
|
||||
}};
|
||||
@ -531,12 +475,6 @@ macro_rules! full_snap_of_db {
|
||||
($index:ident, external_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_external_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, number_faceted_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_number_faceted_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, string_faceted_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_string_faceted_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, words_fst) => {{
|
||||
$crate::snapshot_tests::snap_words_fst(&$index)
|
||||
}};
|
||||
|
@ -8,16 +8,11 @@ pub struct AvailableDocumentsIds {
|
||||
}
|
||||
|
||||
impl AvailableDocumentsIds {
|
||||
pub fn from_documents_ids(
|
||||
docids: &RoaringBitmap,
|
||||
soft_deleted_docids: &RoaringBitmap,
|
||||
) -> AvailableDocumentsIds {
|
||||
let used_docids = docids | soft_deleted_docids;
|
||||
|
||||
match used_docids.max() {
|
||||
pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
|
||||
match docids.max() {
|
||||
Some(last_id) => {
|
||||
let mut available = RoaringBitmap::from_iter(0..last_id);
|
||||
available -= used_docids;
|
||||
available -= docids;
|
||||
|
||||
let iter = match last_id.checked_add(1) {
|
||||
Some(id) => id..=u32::max_value(),
|
||||
@ -50,7 +45,7 @@ mod tests {
|
||||
#[test]
|
||||
fn empty() {
|
||||
let base = RoaringBitmap::new();
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base);
|
||||
let right = 0..=u32::max_value();
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
@ -63,28 +58,8 @@ mod tests {
|
||||
base.insert(100);
|
||||
base.insert(405);
|
||||
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base);
|
||||
let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn soft_deleted() {
|
||||
let mut base = RoaringBitmap::new();
|
||||
base.insert(0);
|
||||
base.insert(10);
|
||||
base.insert(100);
|
||||
base.insert(405);
|
||||
|
||||
let mut soft_deleted = RoaringBitmap::new();
|
||||
soft_deleted.insert(1);
|
||||
soft_deleted.insert(11);
|
||||
soft_deleted.insert(101);
|
||||
soft_deleted.insert(406);
|
||||
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted);
|
||||
let right =
|
||||
(0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n));
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,7 @@
|
||||
use roaring::RoaringBitmap;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};
|
||||
use crate::{FieldDistribution, Index, Result};
|
||||
|
||||
pub struct ClearDocuments<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
@ -21,13 +20,12 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
let Index {
|
||||
env: _env,
|
||||
main: _main,
|
||||
external_documents_ids,
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_prefix_docids,
|
||||
exact_word_prefix_docids,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
prefix_word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
word_fid_docids,
|
||||
field_id_word_count_docids,
|
||||
@ -51,43 +49,23 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
|
||||
// We retrieve the number of documents ids that we are deleting.
|
||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
||||
|
||||
// We clean some of the main engine datastructures.
|
||||
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
|
||||
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
|
||||
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
|
||||
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
|
||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?;
|
||||
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
||||
self.index.delete_geo_rtree(self.wtxn)?;
|
||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||
self.index.delete_vector_hnsw(self.wtxn)?;
|
||||
|
||||
// We clean all the faceted documents ids.
|
||||
for field_id in faceted_fields {
|
||||
self.index.put_faceted_documents_ids(
|
||||
self.wtxn,
|
||||
field_id,
|
||||
FacetType::Number,
|
||||
&empty_roaring,
|
||||
)?;
|
||||
self.index.put_faceted_documents_ids(
|
||||
self.wtxn,
|
||||
field_id,
|
||||
FacetType::String,
|
||||
&empty_roaring,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Clear the other databases.
|
||||
external_documents_ids.clear(self.wtxn)?;
|
||||
word_docids.clear(self.wtxn)?;
|
||||
exact_word_docids.clear(self.wtxn)?;
|
||||
word_prefix_docids.clear(self.wtxn)?;
|
||||
exact_word_prefix_docids.clear(self.wtxn)?;
|
||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||
prefix_word_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_position_docids.clear(self.wtxn)?;
|
||||
word_fid_docids.clear(self.wtxn)?;
|
||||
field_id_word_count_docids.clear(self.wtxn)?;
|
||||
@ -140,7 +118,7 @@ mod tests {
|
||||
|
||||
assert!(index.words_fst(&rtxn).unwrap().is_empty());
|
||||
assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
|
||||
assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
|
||||
assert!(index.external_documents_ids().is_empty(&rtxn).unwrap());
|
||||
assert!(index.documents_ids(&rtxn).unwrap().is_empty());
|
||||
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
|
||||
assert!(index.geo_rtree(&rtxn).unwrap().is_none());
|
||||
@ -150,7 +128,6 @@ mod tests {
|
||||
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
|
||||
|
125
milli/src/update/del_add.rs
Normal file
125
milli/src/update/del_add.rs
Normal file
@ -0,0 +1,125 @@
|
||||
use obkv::Key;
|
||||
|
||||
pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
|
||||
pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;
|
||||
|
||||
/// DelAdd defines the new value to add in the database and old value to delete from the database.
|
||||
///
|
||||
/// Its used in an OBKV to be serialized in grenad files.
|
||||
#[repr(u8)]
|
||||
#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)]
|
||||
pub enum DelAdd {
|
||||
Deletion = 0,
|
||||
Addition = 1,
|
||||
}
|
||||
|
||||
impl Key for DelAdd {
|
||||
const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
|
||||
type BYTES = [u8; Self::BYTES_SIZE];
|
||||
|
||||
fn to_be_bytes(&self) -> Self::BYTES {
|
||||
u8::to_be_bytes(*self as u8)
|
||||
}
|
||||
|
||||
fn from_be_bytes(array: Self::BYTES) -> Self {
|
||||
match u8::from_be_bytes(array) {
|
||||
0 => Self::Deletion,
|
||||
1 => Self::Addition,
|
||||
otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
|
||||
///
|
||||
/// Deletion: put all the values under DelAdd::Deletion
|
||||
/// Addition: put all the values under DelAdd::Addition,
|
||||
/// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition,
|
||||
pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
|
||||
reader: obkv::KvReader<K>,
|
||||
operation: DelAddOperation,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
for (key, value) in reader.iter() {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) {
|
||||
value_writer.insert(DelAdd::Deletion, value)?;
|
||||
}
|
||||
if matches!(operation, DelAddOperation::Addition | DelAddOperation::DeletionAndAddition) {
|
||||
value_writer.insert(DelAdd::Addition, value)?;
|
||||
}
|
||||
value_writer.finish()?;
|
||||
writer.insert(key, &value_buffer)?;
|
||||
}
|
||||
|
||||
writer.finish()
|
||||
}
|
||||
|
||||
/// Enum controlling the side of the DelAdd obkv in which the provided value will be written.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum DelAddOperation {
|
||||
Deletion,
|
||||
Addition,
|
||||
DeletionAndAddition,
|
||||
}
|
||||
|
||||
/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
|
||||
///
|
||||
/// putting each deletion obkv's keys under an DelAdd::Deletion
|
||||
/// and putting each addition obkv's keys under an DelAdd::Addition
|
||||
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
|
||||
deletion: obkv::KvReader<K>,
|
||||
addition: obkv::KvReader<K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
value_buffer.clear();
|
||||
match eob {
|
||||
Left((k, v)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, v).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
Right((k, v)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Addition, v).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
Both((k, deletion), (_, addition)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish()
|
||||
}
|
||||
|
||||
pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
|
||||
del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
|
||||
}
|
||||
|
||||
/// A function that extracts and returns the Add side of a DelAdd obkv.
|
||||
/// This is useful when there are no previous value in the database and
|
||||
/// therefore we don't need to do a diff with what's already there.
|
||||
///
|
||||
/// If there is no Add side we currently write an empty buffer
|
||||
/// which is a valid CboRoaringBitmap.
|
||||
#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
|
||||
pub fn deladd_serialize_add_side<'a>(
|
||||
obkv: &'a [u8],
|
||||
_buffer: &mut Vec<u8>,
|
||||
) -> crate::Result<&'a [u8]> {
|
||||
Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -1,10 +1,9 @@
|
||||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::{BytesEncode, Error, RoTxn, RwTxn};
|
||||
use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||
@ -13,17 +12,15 @@ use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
|
||||
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
|
||||
|
||||
/// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases
|
||||
/// by rebuilding the database "from scratch".
|
||||
///
|
||||
/// First, the new elements are inserted into the level 0 of the database. Then, the
|
||||
/// higher levels are cleared and recomputed from the content of level 0.
|
||||
///
|
||||
/// Finally, the `faceted_documents_ids` value in the main database of `Index`
|
||||
/// is updated to contain the new set of faceted documents.
|
||||
pub struct FacetsUpdateBulk<'i> {
|
||||
index: &'i Index,
|
||||
group_size: u8,
|
||||
@ -31,7 +28,7 @@ pub struct FacetsUpdateBulk<'i> {
|
||||
facet_type: FacetType,
|
||||
field_ids: Vec<FieldId>,
|
||||
// None if level 0 does not need to be updated
|
||||
new_data: Option<grenad::Reader<BufReader<File>>>,
|
||||
delta_data: Option<grenad::Reader<BufReader<File>>>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateBulk<'i> {
|
||||
@ -39,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<BufReader<File>>,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
@ -49,7 +46,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
group_size,
|
||||
min_level_size,
|
||||
facet_type,
|
||||
new_data: Some(new_data),
|
||||
delta_data: Some(delta_data),
|
||||
}
|
||||
}
|
||||
|
||||
@ -64,13 +61,13 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
new_data: None,
|
||||
delta_data: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time("FacetsUpdateBulk::{}")]
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||
let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self;
|
||||
let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
|
||||
|
||||
let db = match facet_type {
|
||||
FacetType::String => index
|
||||
@ -81,12 +78,9 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
}
|
||||
};
|
||||
|
||||
let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
|
||||
let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
|
||||
|
||||
inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
|
||||
index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?;
|
||||
Ok(())
|
||||
})?;
|
||||
inner.update(wtxn, &field_ids)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@ -95,26 +89,19 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
||||
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
||||
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||
pub new_data: Option<grenad::Reader<R>>,
|
||||
pub delta_data: Option<grenad::Reader<R>>,
|
||||
pub group_size: u8,
|
||||
pub min_level_size: u8,
|
||||
}
|
||||
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
pub fn update(
|
||||
mut self,
|
||||
wtxn: &mut RwTxn,
|
||||
field_ids: &[u16],
|
||||
mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> {
|
||||
self.update_level0(wtxn)?;
|
||||
for &field_id in field_ids.iter() {
|
||||
self.clear_levels(wtxn, field_id)?;
|
||||
}
|
||||
|
||||
for &field_id in field_ids.iter() {
|
||||
let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?;
|
||||
|
||||
handle_all_docids(wtxn, field_id, all_docids)?;
|
||||
let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
|
||||
|
||||
for level_reader in level_readers {
|
||||
let mut cursor = level_reader.into_cursor()?;
|
||||
@ -133,19 +120,27 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
self.db.delete_range(wtxn, &range).map(drop)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
|
||||
let new_data = match self.new_data.take() {
|
||||
let delta_data = match self.delta_data.take() {
|
||||
Some(x) => x,
|
||||
None => return Ok(()),
|
||||
};
|
||||
if self.db.is_empty(wtxn)? {
|
||||
let mut buffer = Vec::new();
|
||||
let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
|
||||
let mut cursor = new_data.into_cursor()?;
|
||||
let mut cursor = delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
let value = KvReaderDelAdd::new(value);
|
||||
|
||||
// DB is empty, it is safe to ignore Del operations
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.clear();
|
||||
// the group size for level 0
|
||||
buffer.push(1);
|
||||
@ -157,11 +152,14 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
let mut buffer = Vec::new();
|
||||
let database = self.db.remap_types::<ByteSlice, ByteSlice>();
|
||||
|
||||
let mut cursor = new_data.into_cursor()?;
|
||||
let mut cursor = delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = KvReaderDelAdd::new(value);
|
||||
|
||||
// the value is a CboRoaringBitmap, but I still need to prepend the
|
||||
// group size for level 0 (= 1) to it
|
||||
buffer.clear();
|
||||
@ -169,17 +167,27 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
// then we extend the buffer with the docids bitmap
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => {
|
||||
// prev_value is the group size for level 0, followed by the previous bitmap.
|
||||
let old_bitmap = &prev_value[1..];
|
||||
CboRoaringBitmapCodec::merge_into(
|
||||
&[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)],
|
||||
&mut buffer,
|
||||
)?;
|
||||
CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
|
||||
}
|
||||
None => {
|
||||
// it is safe to ignore the del in that case.
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
// won't put the key in DB as the value would be empty
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.extend_from_slice(value);
|
||||
}
|
||||
};
|
||||
database.put(wtxn, key, &buffer)?;
|
||||
let new_bitmap = &buffer[1..];
|
||||
// if the new bitmap is empty, let's remove it
|
||||
if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
|
||||
database.delete(wtxn, key)?;
|
||||
} else {
|
||||
database.put(wtxn, key, &buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@ -188,16 +196,10 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
txn: &RoTxn,
|
||||
) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> {
|
||||
let mut all_docids = RoaringBitmap::new();
|
||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
|
||||
for bitmap in bitmaps {
|
||||
all_docids |= bitmap;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
|
||||
|
||||
Ok((subwriters, all_docids))
|
||||
Ok(subwriters)
|
||||
}
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn read_level_0<'t>(
|
||||
@ -491,7 +493,6 @@ mod tests {
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
|
||||
db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -1,360 +0,0 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use heed::RwTxn;
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner};
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
||||
///
|
||||
/// Depending on the number of removed elements and the existing size of the database, we use either
|
||||
/// a bulk delete method or an incremental delete method.
|
||||
pub struct FacetsDelete<'i, 'b> {
|
||||
index: &'i Index,
|
||||
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||
facet_type: FacetType,
|
||||
affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
|
||||
docids_to_delete: &'b RoaringBitmap,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
}
|
||||
impl<'i, 'b> FacetsDelete<'i, 'b> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
facet_type: FacetType,
|
||||
affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
|
||||
docids_to_delete: &'b RoaringBitmap,
|
||||
) -> Self {
|
||||
let database = match facet_type {
|
||||
FacetType::String => index
|
||||
.facet_id_string_docids
|
||||
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
|
||||
FacetType::Number => {
|
||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>()
|
||||
}
|
||||
};
|
||||
Self {
|
||||
index,
|
||||
database,
|
||||
facet_type,
|
||||
affected_facet_values,
|
||||
docids_to_delete,
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
max_group_size: FACET_MAX_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> {
|
||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
for (field_id, affected_facet_values) in self.affected_facet_values {
|
||||
// This is an incorrect condition, since we assume that the length of the database is equal
|
||||
// to the number of facet values for the given field_id. It means that in some cases, we might
|
||||
// wrongly choose the incremental indexer over the bulk indexer. But the only case where that could
|
||||
// really be a performance problem is when we fully delete a large ratio of all facet values for
|
||||
// each field id. This would almost never happen. Still, to be overly cautious, I have added a
|
||||
// 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance
|
||||
// penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead.
|
||||
if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) {
|
||||
// Bulk delete
|
||||
let mut modified = false;
|
||||
|
||||
for facet_value in affected_facet_values {
|
||||
let key =
|
||||
FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() };
|
||||
let mut old = self.database.get(wtxn, &key)?.unwrap();
|
||||
let previous_len = old.bitmap.len();
|
||||
old.bitmap -= self.docids_to_delete;
|
||||
if old.bitmap.is_empty() {
|
||||
modified = true;
|
||||
self.database.delete(wtxn, &key)?;
|
||||
} else if old.bitmap.len() != previous_len {
|
||||
modified = true;
|
||||
self.database.put(wtxn, &key, &old)?;
|
||||
}
|
||||
}
|
||||
if modified {
|
||||
let builder = FacetsUpdateBulk::new_not_updating_level_0(
|
||||
self.index,
|
||||
vec![field_id],
|
||||
self.facet_type,
|
||||
);
|
||||
builder.execute(wtxn)?;
|
||||
}
|
||||
} else {
|
||||
// Incremental
|
||||
let inc = FacetsUpdateIncrementalInner {
|
||||
db: self.database,
|
||||
group_size: self.group_size,
|
||||
min_level_size: self.min_level_size,
|
||||
max_group_size: self.max_group_size,
|
||||
};
|
||||
for facet_value in affected_facet_values {
|
||||
inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use big_s::S;
|
||||
use maplit::hashset;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::SeedableRng;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::db_snap;
|
||||
use crate::documents::documents_batch_reader_from_objects;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::update::facet::test_helpers::ordered_string;
|
||||
use crate::update::{DeleteDocuments, DeletionStrategy};
|
||||
|
||||
#[test]
|
||||
fn delete_mixed_incremental_and_bulk() {
|
||||
// The point of this test is to create an index populated with documents
|
||||
// containing different filterable attributes. Then, we delete a bunch of documents
|
||||
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
|
||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(
|
||||
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
|
||||
);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"label": i / 10,
|
||||
"colour": i / 100,
|
||||
"timestamp": i / 2,
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
|
||||
db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf");
|
||||
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
||||
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
|
||||
// by deleting the first 100 documents, we expect that:
|
||||
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
|
||||
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
|
||||
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
|
||||
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
|
||||
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
|
||||
db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56");
|
||||
}
|
||||
|
||||
// Same test as above but working with string values for the facets
|
||||
#[test]
|
||||
fn delete_mixed_incremental_and_bulk_string() {
|
||||
// The point of this test is to create an index populated with documents
|
||||
// containing different filterable attributes. Then, we delete a bunch of documents
|
||||
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
|
||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(
|
||||
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
|
||||
);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"label": ordered_string(i / 10),
|
||||
"colour": ordered_string(i / 100),
|
||||
"timestamp": ordered_string(i / 2),
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
|
||||
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
|
||||
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
|
||||
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
||||
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
|
||||
// by deleting the first 100 documents, we expect that:
|
||||
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
|
||||
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
|
||||
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
|
||||
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
|
||||
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
|
||||
db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delete_almost_all_incrementally_string() {
|
||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(
|
||||
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
|
||||
);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"label": ordered_string(i / 10),
|
||||
"colour": ordered_string(i / 100),
|
||||
"timestamp": ordered_string(i / 2),
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
|
||||
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
|
||||
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
|
||||
|
||||
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
|
||||
|
||||
let mut docids_to_delete = (0..1000).collect::<Vec<_>>();
|
||||
docids_to_delete.shuffle(&mut rng);
|
||||
for docid in docids_to_delete.into_iter().take(990) {
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
||||
builder.delete_documents(&RoaringBitmap::from_iter([docid]));
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
|
||||
db_snap!(index, string_faceted_documents_ids, 2, @r###"
|
||||
0 []
|
||||
1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
|
||||
2 [292, 324, 358, 381, 493, 839, 852, ]
|
||||
3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
|
||||
"###);
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
#[cfg(test)]
|
||||
mod comparison_bench {
|
||||
use std::iter::once;
|
||||
|
||||
use rand::Rng;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::facet::OrderedF64Codec;
|
||||
use crate::update::facet::test_helpers::FacetIndex;
|
||||
|
||||
// This is a simple test to get an intuition on the relative speed
|
||||
// of the incremental vs. bulk indexer.
|
||||
//
|
||||
// The benchmark shows the worst-case scenario for the incremental indexer, since
|
||||
// each facet value contains only one document ID.
|
||||
//
|
||||
// In that scenario, it appears that the incremental indexer is about 70 times slower than the
|
||||
// bulk indexer.
|
||||
// #[test]
|
||||
fn benchmark_facet_indexing_delete() {
|
||||
let mut r = rand::thread_rng();
|
||||
|
||||
for i in 1..=20 {
|
||||
let size = 50_000 * i;
|
||||
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
|
||||
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..size {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, i as f64), once(i).collect()));
|
||||
}
|
||||
let timer = std::time::Instant::now();
|
||||
index.bulk_insert(&mut txn, &[0], elements.iter());
|
||||
let time_spent = timer.elapsed().as_millis();
|
||||
println!("bulk {size} : {time_spent}ms");
|
||||
|
||||
txn.commit().unwrap();
|
||||
|
||||
for nbr_doc in [1, 100, 1000, 10_000] {
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let timer = std::time::Instant::now();
|
||||
//
|
||||
// delete one document
|
||||
//
|
||||
for _ in 0..nbr_doc {
|
||||
let deleted_u32 = r.gen::<u32>() % size;
|
||||
let deleted_f64 = deleted_u32 as f64;
|
||||
index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32)
|
||||
}
|
||||
let time_spent = timer.elapsed().as_millis();
|
||||
println!(" delete {nbr_doc} : {time_spent}ms");
|
||||
txn.abort().unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,9 +1,9 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
||||
use obkv::KvReader;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
@ -12,8 +12,9 @@ use crate::heed_codec::facet::{
|
||||
};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::update::del_add::DelAdd;
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||
|
||||
enum InsertionResult {
|
||||
InPlace,
|
||||
@ -28,27 +29,21 @@ enum DeletionResult {
|
||||
|
||||
/// Algorithm to incrementally insert and delete elememts into the
|
||||
/// `facet_id_(string/f64)_docids` databases.
|
||||
///
|
||||
/// Rhe `faceted_documents_ids` value in the main database of `Index`
|
||||
/// is also updated to contain the new set of faceted documents.
|
||||
pub struct FacetsUpdateIncremental<'i> {
|
||||
index: &'i Index,
|
||||
pub struct FacetsUpdateIncremental {
|
||||
inner: FacetsUpdateIncrementalInner,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<BufReader<File>>,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateIncremental<'i> {
|
||||
impl FacetsUpdateIncremental {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
index: &Index,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<BufReader<File>>,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
max_group_size: u8,
|
||||
) -> Self {
|
||||
FacetsUpdateIncremental {
|
||||
index,
|
||||
inner: FacetsUpdateIncrementalInner {
|
||||
db: match facet_type {
|
||||
FacetType::String => index
|
||||
@ -62,31 +57,41 @@ impl<'i> FacetsUpdateIncremental<'i> {
|
||||
max_group_size,
|
||||
min_level_size,
|
||||
},
|
||||
facet_type,
|
||||
new_data,
|
||||
delta_data,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
|
||||
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
||||
|
||||
let mut cursor = self.new_data.into_cursor()?;
|
||||
pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
|
||||
let mut cursor = self.delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key)
|
||||
.ok_or(heed::Error::Encoding)?;
|
||||
let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
||||
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
||||
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
||||
let value = KvReader::new(value);
|
||||
|
||||
let docids_to_delete = value
|
||||
.get(DelAdd::Deletion)
|
||||
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||
.map(|o| o.ok_or(heed::Error::Encoding));
|
||||
|
||||
let docids_to_add = value
|
||||
.get(DelAdd::Addition)
|
||||
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||
.map(|o| o.ok_or(heed::Error::Encoding));
|
||||
|
||||
if let Some(docids_to_delete) = docids_to_delete {
|
||||
let docids_to_delete = docids_to_delete?;
|
||||
self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
|
||||
}
|
||||
|
||||
if let Some(docids_to_add) = docids_to_add {
|
||||
let docids_to_add = docids_to_add?;
|
||||
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
|
||||
}
|
||||
}
|
||||
|
||||
for (field_id, new_docids) in new_faceted_docids {
|
||||
let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
|
||||
docids |= new_docids;
|
||||
self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
@ -14,7 +14,7 @@ The databases must be able to return results for queries such as:
|
||||
The algorithms that implement these queries are found in the `src/search/facet` folder.
|
||||
|
||||
To make these queries fast to compute, the database adopts a tree structure:
|
||||
```ignore
|
||||
```text
|
||||
┌───────────────────────────────┬───────────────────────────────┬───────────────┐
|
||||
┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │
|
||||
│Level 2│ │ │ │ │
|
||||
@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`.
|
||||
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
|
||||
[`FacetGroupValue`], which have the following format:
|
||||
|
||||
```ignore
|
||||
```text
|
||||
FacetGroupKey:
|
||||
- field id : u16
|
||||
- level : u8
|
||||
@ -98,7 +98,6 @@ use crate::update::merge_btreeset_string;
|
||||
use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
pub mod bulk;
|
||||
pub mod delete;
|
||||
pub mod incremental;
|
||||
|
||||
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
||||
@ -109,7 +108,7 @@ pub struct FacetsUpdate<'i> {
|
||||
index: &'i Index,
|
||||
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<BufReader<File>>,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
@ -118,7 +117,7 @@ impl<'i> FacetsUpdate<'i> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<BufReader<File>>,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
) -> Self {
|
||||
let database = match facet_type {
|
||||
FacetType::String => index
|
||||
@ -135,26 +134,26 @@ impl<'i> FacetsUpdate<'i> {
|
||||
max_group_size: FACET_MAX_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
new_data,
|
||||
delta_data,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||
if self.new_data.is_empty() {
|
||||
if self.delta_data.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
// See self::comparison_bench::benchmark_facet_indexing
|
||||
if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
|
||||
if self.delta_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
|
||||
let field_ids =
|
||||
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
|
||||
let bulk_update = FacetsUpdateBulk::new(
|
||||
self.index,
|
||||
field_ids,
|
||||
self.facet_type,
|
||||
self.new_data,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
);
|
||||
@ -163,7 +162,7 @@ impl<'i> FacetsUpdate<'i> {
|
||||
let incremental_update = FacetsUpdateIncremental::new(
|
||||
self.index,
|
||||
self.facet_type,
|
||||
self.new_data,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
self.max_group_size,
|
||||
@ -279,6 +278,7 @@ pub(crate) mod test_helpers {
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::FacetsUpdateIncrementalInner;
|
||||
use crate::CboRoaringBitmapCodec;
|
||||
|
||||
@ -455,20 +455,22 @@ pub(crate) mod test_helpers {
|
||||
let key: FacetGroupKey<&[u8]> =
|
||||
FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
|
||||
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key).unwrap();
|
||||
let mut inner_writer = KvWriterDelAdd::memory();
|
||||
let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
|
||||
writer.insert(&key, &value).unwrap();
|
||||
inner_writer.insert(DelAdd::Addition, value).unwrap();
|
||||
writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
|
||||
}
|
||||
writer.finish().unwrap();
|
||||
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
|
||||
|
||||
let update = FacetsUpdateBulkInner {
|
||||
db: self.content,
|
||||
new_data: Some(reader),
|
||||
delta_data: Some(reader),
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
};
|
||||
|
||||
update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap();
|
||||
update.update(wtxn, field_ids).unwrap();
|
||||
}
|
||||
|
||||
pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) {
|
||||
@ -556,101 +558,6 @@ pub(crate) mod test_helpers {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use big_s::S;
|
||||
use maplit::hashset;
|
||||
|
||||
use crate::db_snap;
|
||||
use crate::documents::documents_batch_reader_from_objects;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::update::DeletionStrategy;
|
||||
|
||||
#[test]
|
||||
fn replace_all_identical_soft_deletion_then_hard_deletion() {
|
||||
let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
||||
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("id".to_owned());
|
||||
settings.set_filterable_fields(hashset! { S("size") });
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"size": i % 250,
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
|
||||
db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9");
|
||||
db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..999 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"size": i % 250,
|
||||
"other": 0,
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
|
||||
db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06");
|
||||
db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");
|
||||
|
||||
// Then replace the last document while disabling soft_deletion
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
|
||||
let mut documents = vec![];
|
||||
for i in 999..1000 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"size": i % 250,
|
||||
"other": 0,
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
|
||||
db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028");
|
||||
db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
#[cfg(test)]
|
||||
mod comparison_bench {
|
||||
|
@ -1,20 +1,17 @@
|
||||
use std::fmt;
|
||||
use std::io::{BufWriter, Read, Seek};
|
||||
use std::result::Result as StdResult;
|
||||
use std::{fmt, iter};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
|
||||
use crate::documents::{
|
||||
DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader,
|
||||
EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY,
|
||||
};
|
||||
use crate::error::{GeoError, InternalError, UserError};
|
||||
use crate::update::index_documents::{obkv_to_object, writer_into_reader};
|
||||
use crate::{FieldId, Index, Object, Result};
|
||||
|
||||
/// The symbol used to define levels in a nested primary key.
|
||||
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
|
||||
|
||||
/// The default primary that is used when not specified.
|
||||
const DEFAULT_PRIMARY_KEY: &str = "id";
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
/// This function validates and enrich the documents by checking that:
|
||||
/// - we can infer a primary key,
|
||||
@ -41,14 +38,12 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
// The primary key *field id* that has already been set for this index or the one
|
||||
// we will guess by searching for the first key that contains "id" as a substring.
|
||||
let primary_key = match index.primary_key(rtxn)? {
|
||||
Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => {
|
||||
PrimaryKey::nested(primary_key)
|
||||
}
|
||||
Some(primary_key) => match documents_batch_index.id(primary_key) {
|
||||
Some(id) => PrimaryKey::flat(primary_key, id),
|
||||
None if autogenerate_docids => {
|
||||
PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key))
|
||||
}
|
||||
Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) {
|
||||
Some(primary_key) => primary_key,
|
||||
None if autogenerate_docids => PrimaryKey::Flat {
|
||||
name: primary_key,
|
||||
field_id: documents_batch_index.insert(primary_key),
|
||||
},
|
||||
None => {
|
||||
return match cursor.next_document()? {
|
||||
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
|
||||
@ -76,14 +71,14 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
});
|
||||
|
||||
match guesses.as_slice() {
|
||||
[] if autogenerate_docids => PrimaryKey::flat(
|
||||
DEFAULT_PRIMARY_KEY,
|
||||
documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
|
||||
),
|
||||
[] if autogenerate_docids => PrimaryKey::Flat {
|
||||
name: DEFAULT_PRIMARY_KEY,
|
||||
field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
|
||||
},
|
||||
[] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
||||
[(field_id, name)] => {
|
||||
log::info!("Primary key was not specified in index. Inferred to '{name}'");
|
||||
PrimaryKey::flat(name, *field_id)
|
||||
PrimaryKey::Flat { name, field_id: *field_id }
|
||||
}
|
||||
multiple => {
|
||||
return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
||||
@ -156,92 +151,24 @@ fn fetch_or_generate_document_id(
|
||||
uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
|
||||
count: u32,
|
||||
) -> Result<StdResult<DocumentId, UserError>> {
|
||||
match primary_key {
|
||||
PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
|
||||
match document.get(primary_key_id) {
|
||||
Some(document_id_bytes) => {
|
||||
let document_id = serde_json::from_slice(document_id_bytes)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
match validate_document_id_value(document_id)? {
|
||||
Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
|
||||
Err(user_error) => Ok(Err(user_error)),
|
||||
}
|
||||
}
|
||||
None if autogenerate_docids => {
|
||||
let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
|
||||
Ok(Ok(DocumentId::generated(uuid.to_string(), count)))
|
||||
}
|
||||
None => Ok(Err(UserError::MissingDocumentId {
|
||||
primary_key: primary_key.to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
})),
|
||||
}
|
||||
Ok(match primary_key.document_id(document, documents_batch_index)? {
|
||||
Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }),
|
||||
Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
|
||||
Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => {
|
||||
let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
|
||||
Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count })
|
||||
}
|
||||
nested @ PrimaryKey::Nested { .. } => {
|
||||
let mut matching_documents_ids = Vec::new();
|
||||
for (first_level_name, right) in nested.possible_level_names() {
|
||||
if let Some(field_id) = documents_batch_index.id(first_level_name) {
|
||||
if let Some(value_bytes) = document.get(field_id) {
|
||||
let object = serde_json::from_slice(value_bytes)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
fetch_matching_values(object, right, &mut matching_documents_ids);
|
||||
|
||||
if matching_documents_ids.len() >= 2 {
|
||||
return Ok(Err(UserError::TooManyDocumentIds {
|
||||
primary_key: nested.name().to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
match matching_documents_ids.pop() {
|
||||
Some(document_id) => match validate_document_id_value(document_id)? {
|
||||
Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
|
||||
Err(user_error) => Ok(Err(user_error)),
|
||||
},
|
||||
None => Ok(Err(UserError::MissingDocumentId {
|
||||
primary_key: nested.name().to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
})),
|
||||
}
|
||||
Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId {
|
||||
primary_key: primary_key.name().to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
}),
|
||||
Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
|
||||
Err(UserError::TooManyDocumentIds {
|
||||
primary_key: primary_key.name().to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A type that represent the type of primary key that has been set
|
||||
/// for this index, a classic flat one or a nested one.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
enum PrimaryKey<'a> {
|
||||
Flat { name: &'a str, field_id: FieldId },
|
||||
Nested { name: &'a str },
|
||||
}
|
||||
|
||||
impl PrimaryKey<'_> {
|
||||
fn flat(name: &str, field_id: FieldId) -> PrimaryKey {
|
||||
PrimaryKey::Flat { name, field_id }
|
||||
}
|
||||
|
||||
fn nested(name: &str) -> PrimaryKey {
|
||||
PrimaryKey::Nested { name }
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
match self {
|
||||
PrimaryKey::Flat { name, .. } => name,
|
||||
PrimaryKey::Nested { name } => name,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an `Iterator` that gives all the possible fields names the primary key
|
||||
/// can have depending of the first level name and deepnes of the objects.
|
||||
fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
|
||||
let name = self.name();
|
||||
name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
|
||||
.map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
|
||||
.chain(iter::once((name, "")))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// A type that represents a document id that has been retrieved from a document or auto-generated.
|
||||
@ -255,14 +182,6 @@ pub enum DocumentId {
|
||||
}
|
||||
|
||||
impl DocumentId {
|
||||
fn retrieved(value: String) -> DocumentId {
|
||||
DocumentId::Retrieved { value }
|
||||
}
|
||||
|
||||
fn generated(value: String, document_nth: u32) -> DocumentId {
|
||||
DocumentId::Generated { value, document_nth }
|
||||
}
|
||||
|
||||
fn debug(&self) -> String {
|
||||
format!("{:?}", self)
|
||||
}
|
||||
@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId {
|
||||
}
|
||||
}
|
||||
|
||||
fn starts_with(selector: &str, key: &str) -> bool {
|
||||
selector.strip_prefix(key).map_or(false, |tail| {
|
||||
tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
|
||||
match value {
|
||||
Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
|
||||
otherwise => output.push(otherwise),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn fetch_matching_values_in_object(
|
||||
object: Object,
|
||||
selector: &str,
|
||||
base_key: &str,
|
||||
output: &mut Vec<Value>,
|
||||
) {
|
||||
for (key, value) in object {
|
||||
let base_key = if base_key.is_empty() {
|
||||
key.to_string()
|
||||
} else {
|
||||
format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
|
||||
};
|
||||
|
||||
if starts_with(selector, &base_key) {
|
||||
match value {
|
||||
Value::Object(object) => {
|
||||
fetch_matching_values_in_object(object, selector, &base_key, output)
|
||||
}
|
||||
value => output.push(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate_document_id(document_id: &str) -> Option<&str> {
|
||||
if !document_id.is_empty()
|
||||
&& document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
|
||||
{
|
||||
Some(document_id)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses a Json encoded document id and validate it, returning a user error when it is one.
|
||||
pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
|
||||
match document_id {
|
||||
Value::String(string) => match validate_document_id(&string) {
|
||||
Some(s) if s.len() == string.len() => Ok(Ok(string)),
|
||||
Some(s) => Ok(Ok(s.to_string())),
|
||||
None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
|
||||
},
|
||||
Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
|
||||
content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to extract an `f64` from a JSON `Value` and return the `Value`
|
||||
/// in the `Err` variant if it failed.
|
||||
pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
|
||||
|
@ -5,18 +5,16 @@ use std::io::BufReader;
|
||||
use std::{io, mem, str};
|
||||
|
||||
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
use obkv::KvReader;
|
||||
use obkv::{KvReader, KvWriterU16};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
|
||||
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::update::index_documents::MergeFn;
|
||||
use crate::{
|
||||
absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
|
||||
};
|
||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||
|
||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
|
||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
|
||||
|
||||
/// Extracts the word and positions where this word appear and
|
||||
/// prefixes it by the document id.
|
||||
@ -32,25 +30,162 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
// initialize destination values.
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
let mut script_language_docids = HashMap::new();
|
||||
let mut docid_word_positions_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
concat_u32s_array,
|
||||
keep_latest_obkv,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut buffers = Buffers::default();
|
||||
// initialize buffers.
|
||||
let mut del_buffers = Buffers::default();
|
||||
let mut add_buffers = Buffers::default();
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
// initialize tokenizer.
|
||||
let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None);
|
||||
let tokenizer = builder.build();
|
||||
|
||||
// iterate over documents.
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let document_id = key
|
||||
.try_into()
|
||||
.map(u32::from_be_bytes)
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
|
||||
continue;
|
||||
}
|
||||
|
||||
documents_ids.push(document_id);
|
||||
|
||||
// Update key buffer prefix.
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||
|
||||
// Tokenize deletions and additions in 2 diffferent threads.
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
// deletions
|
||||
lang_safe_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Deletion,
|
||||
&mut del_buffers,
|
||||
)
|
||||
},
|
||||
|| {
|
||||
// additions
|
||||
lang_safe_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Addition,
|
||||
&mut add_buffers,
|
||||
)
|
||||
},
|
||||
);
|
||||
|
||||
let (del_obkv, del_script_language_word_count) = del?;
|
||||
let (add_obkv, add_script_language_word_count) = add?;
|
||||
|
||||
// merge deletions and additions.
|
||||
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||
value_buffer.clear();
|
||||
del_add_from_two_obkvs(
|
||||
KvReader::<FieldId>::new(del_obkv),
|
||||
KvReader::<FieldId>::new(add_obkv),
|
||||
&mut value_buffer,
|
||||
)?;
|
||||
|
||||
// write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
|
||||
let obkv = KvReader::<FieldId>::new(&value_buffer);
|
||||
for (field_id, value) in obkv.iter() {
|
||||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
||||
}
|
||||
|
||||
// update script_language_docids deletions.
|
||||
for (script, languages_frequency) in del_script_language_word_count {
|
||||
for (language, _) in languages_frequency {
|
||||
let entry = script_language_docids
|
||||
.entry((script, language))
|
||||
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
||||
entry.0.push(document_id);
|
||||
}
|
||||
}
|
||||
|
||||
// update script_language_docids additions.
|
||||
for (script, languages_frequency) in add_script_language_word_count {
|
||||
for (language, _) in languages_frequency {
|
||||
let entry = script_language_docids
|
||||
.entry((script, language))
|
||||
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
||||
entry.1.push(document_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
|
||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||
.map(|reader| (reader, script_language_docids))
|
||||
}
|
||||
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
) -> bool {
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||
let del_add = KvReaderDelAdd::new(field_bytes);
|
||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
(None, None) => (),
|
||||
// if both contains a value and values are the same, check the next field.
|
||||
(Some(del), Some(add)) if del == add => (),
|
||||
// otherwise the fields are different, return true.
|
||||
_otherwise => return true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<&[u8]>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
|
||||
) -> TokenizerBuilder<'a, &'a [u8]> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
@ -61,130 +196,147 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
let tokenizer = tokenizer_builder.build();
|
||||
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let document_id = key
|
||||
.try_into()
|
||||
.map(u32::from_be_bytes)
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
if let Some(script_language) = script_language {
|
||||
tokenizer_builder.allow_list(script_language);
|
||||
}
|
||||
|
||||
documents_ids.push(document_id);
|
||||
buffers.key_buffer.clear();
|
||||
buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||
tokenizer_builder
|
||||
}
|
||||
|
||||
let mut script_language_word_count = HashMap::new();
|
||||
/// Extract words mapped with their positions of a document,
|
||||
/// ensuring no Language detection mistakes was made.
|
||||
#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
|
||||
fn lang_safe_tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
tokenizer: &Tokenizer,
|
||||
stop_words: Option<&fst::Set<&[u8]>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
buffers: &'a mut Buffers,
|
||||
) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
|
||||
let mut script_language_word_count = HashMap::new();
|
||||
|
||||
extract_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
max_positions_per_attributes,
|
||||
&mut buffers,
|
||||
&mut script_language_word_count,
|
||||
&mut docid_word_positions_sorter,
|
||||
)?;
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
searchable_fields,
|
||||
tokenizer,
|
||||
max_positions_per_attributes,
|
||||
del_add,
|
||||
buffers,
|
||||
&mut script_language_word_count,
|
||||
)?;
|
||||
|
||||
// if we detect a potetial mistake in the language detection,
|
||||
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
|
||||
// context: https://github.com/meilisearch/meilisearch/issues/3565
|
||||
if script_language_word_count
|
||||
.values()
|
||||
.map(Vec::as_slice)
|
||||
.any(potential_language_detection_error)
|
||||
{
|
||||
// build an allow list with the most frequent detected languages in the document.
|
||||
let script_language: HashMap<_, _> =
|
||||
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
|
||||
// if we detect a potetial mistake in the language detection,
|
||||
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
|
||||
// context: https://github.com/meilisearch/meilisearch/issues/3565
|
||||
if script_language_word_count
|
||||
.values()
|
||||
.map(Vec::as_slice)
|
||||
.any(potential_language_detection_error)
|
||||
{
|
||||
// build an allow list with the most frequent detected languages in the document.
|
||||
let script_language: HashMap<_, _> =
|
||||
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
|
||||
|
||||
// if the allow list is empty, meaning that no Language is considered frequent,
|
||||
// then we don't rerun the extraction.
|
||||
if !script_language.is_empty() {
|
||||
// build a new temporary tokenizer including the allow list.
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
tokenizer_builder.allow_list(&script_language);
|
||||
let tokenizer = tokenizer_builder.build();
|
||||
// if the allow list is empty, meaning that no Language is considered frequent,
|
||||
// then we don't rerun the extraction.
|
||||
if !script_language.is_empty() {
|
||||
// build a new temporary tokenizer including the allow list.
|
||||
let mut builder = tokenizer_builder(
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
Some(&script_language),
|
||||
);
|
||||
let tokenizer = builder.build();
|
||||
|
||||
script_language_word_count.clear();
|
||||
script_language_word_count.clear();
|
||||
|
||||
// rerun the extraction.
|
||||
extract_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
max_positions_per_attributes,
|
||||
&mut buffers,
|
||||
&mut script_language_word_count,
|
||||
&mut docid_word_positions_sorter,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
for (script, languages_frequency) in script_language_word_count {
|
||||
for (language, _) in languages_frequency {
|
||||
let entry = script_language_docids
|
||||
.entry((script, language))
|
||||
.or_insert_with(RoaringBitmap::new);
|
||||
entry.push(document_id);
|
||||
}
|
||||
// rerun the extraction.
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
max_positions_per_attributes,
|
||||
del_add,
|
||||
buffers,
|
||||
&mut script_language_word_count,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||
.map(|reader| (documents_ids, reader, script_language_docids))
|
||||
// returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
|
||||
Ok((&buffers.obkv_buffer, script_language_word_count))
|
||||
}
|
||||
|
||||
fn extract_tokens_from_document(
|
||||
/// Extract words mapped with their positions of a document.
|
||||
fn tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
tokenizer: &Tokenizer,
|
||||
max_positions_per_attributes: u32,
|
||||
buffers: &mut Buffers,
|
||||
del_add: DelAdd,
|
||||
buffers: &'a mut Buffers,
|
||||
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
|
||||
docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
) -> Result<&'a [u8]> {
|
||||
buffers.obkv_buffer.clear();
|
||||
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
// if field is searchable.
|
||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||
let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
buffers.field_buffer.clear();
|
||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||
let tokens = process_tokens(tokenizer.tokenize(field))
|
||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||
// extract deletion or addition only.
|
||||
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
|
||||
// parse json.
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
for (index, token) in tokens {
|
||||
// if a language has been detected for the token, we update the counter.
|
||||
if let Some(language) = token.language {
|
||||
let script = token.script;
|
||||
let entry =
|
||||
script_language_word_count.entry(script).or_insert_with(Vec::new);
|
||||
match entry.iter_mut().find(|(l, _)| *l == language) {
|
||||
Some((_, n)) => *n += 1,
|
||||
None => entry.push((language, 1)),
|
||||
// prepare writing destination.
|
||||
buffers.obkv_positions_buffer.clear();
|
||||
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
|
||||
|
||||
// convert json into a unique string.
|
||||
buffers.field_buffer.clear();
|
||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||
// create an iterator of token with their positions.
|
||||
let tokens = process_tokens(tokenizer.tokenize(field))
|
||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||
|
||||
for (index, token) in tokens {
|
||||
// if a language has been detected for the token, we update the counter.
|
||||
if let Some(language) = token.language {
|
||||
let script = token.script;
|
||||
let entry =
|
||||
script_language_word_count.entry(script).or_insert_with(Vec::new);
|
||||
match entry.iter_mut().find(|(l, _)| *l == language) {
|
||||
Some((_, n)) => *n += 1,
|
||||
None => entry.push((language, 1)),
|
||||
}
|
||||
}
|
||||
|
||||
// keep a word only if it is not empty and fit in a LMDB key.
|
||||
let token = token.lemma().trim();
|
||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||
let position: u16 = index
|
||||
.try_into()
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
writer.insert(position, token.as_bytes())?;
|
||||
}
|
||||
}
|
||||
let token = token.lemma().trim();
|
||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||
buffers.key_buffer.truncate(mem::size_of::<u32>());
|
||||
buffers.key_buffer.extend_from_slice(token.as_bytes());
|
||||
|
||||
let position: u16 = index
|
||||
.try_into()
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let position = absolute_from_relative_position(field_id, position);
|
||||
docid_word_positions_sorter
|
||||
.insert(&buffers.key_buffer, position.to_ne_bytes())?;
|
||||
}
|
||||
// write positions into document.
|
||||
let positions = writer.into_inner()?;
|
||||
document_writer.insert(field_id, positions)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
// returns a KV<FieldId, KV<u16, String>>
|
||||
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
|
||||
}
|
||||
|
||||
/// Transform a JSON value into a string that can be indexed.
|
||||
@ -287,10 +439,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)
|
||||
|
||||
#[derive(Default)]
|
||||
struct Buffers {
|
||||
// the key buffer is the concatenation of the internal document id with the field id.
|
||||
// The buffer has to be completelly cleared between documents,
|
||||
// and the field id part must be cleared between each field.
|
||||
key_buffer: Vec<u8>,
|
||||
// the field buffer for each fields desserialization, and must be cleared between each field.
|
||||
field_buffer: String,
|
||||
// buffer used to store the value data containing an obkv.
|
||||
obkv_buffer: Vec<u8>,
|
||||
// buffer used to store the value data containing an obkv of tokens with their positions.
|
||||
obkv_positions_buffer: Vec<u8>,
|
||||
}
|
||||
|
@ -4,11 +4,12 @@ use std::io::{self, BufReader};
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
|
||||
};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
||||
};
|
||||
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::Result;
|
||||
|
||||
/// Extracts the facet number and the documents ids where this facet number appear.
|
||||
@ -17,7 +18,7 @@ use crate::Result;
|
||||
/// documents ids from the given chunk of docid facet number positions.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_number: grenad::Reader<R>,
|
||||
fid_docid_facet_number: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
@ -26,21 +27,30 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut facet_number_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut cursor = docid_fid_facet_number.into_cursor()?;
|
||||
while let Some((key_bytes, _)) = cursor.move_on_next()? {
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
||||
while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
|
||||
let (field_id, document_id, number) =
|
||||
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
|
||||
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
|
||||
facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
facet_number_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(facet_number_docids_sorter, indexer)
|
||||
|
@ -1,13 +1,15 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::io::BufReader;
|
||||
use std::{io, str};
|
||||
|
||||
use heed::BytesEncode;
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||
use crate::heed_codec::StrRefCodec;
|
||||
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
|
||||
use crate::{FieldId, Result};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
@ -24,15 +26,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
@ -40,21 +43,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
let normalised_truncated_value: String;
|
||||
if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
|
||||
normalised_truncated_value = normalised_value
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect();
|
||||
normalised_value = normalised_truncated_value.as_str();
|
||||
}
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
// document id is encoded in native-endian because of the CBO roaring bitmap codec
|
||||
facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer)
|
||||
|
@ -1,24 +1,36 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::mem::size_of;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use grenad::Sorter;
|
||||
use heed::zerocopy::AsBytes;
|
||||
use heed::BytesEncode;
|
||||
use itertools::EitherOrBoth;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{from_slice, Value};
|
||||
use FilterableValues::{Empty, Null, Values};
|
||||
|
||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
|
||||
use crate::{
|
||||
CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH,
|
||||
};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
|
||||
|
||||
/// The extracted facet values stored in grenad files by type.
|
||||
pub struct ExtractedFacetValues {
|
||||
pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
@ -58,71 +70,150 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||
// The tuples represents the Del and Add side for a bitmap
|
||||
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
|
||||
// We create two buffers for mutable ref issues with closures.
|
||||
let mut numbers_key_buffer = Vec::new();
|
||||
let mut strings_key_buffer = Vec::new();
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if faceted_fields.contains(&field_id) {
|
||||
key_buffer.clear();
|
||||
numbers_key_buffer.clear();
|
||||
strings_key_buffer.clear();
|
||||
|
||||
// Set key to the field_id
|
||||
// Note: this encoding is consistent with FieldIdCodec
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
|
||||
// Here, we know already that the document must be added to the “field id exists” database
|
||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||
let document = BEU32::from(document).get();
|
||||
|
||||
facet_exists_docids.entry(field_id).or_default().insert(document);
|
||||
|
||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||
key_buffer.extend_from_slice(docid_bytes);
|
||||
numbers_key_buffer.extend_from_slice(docid_bytes);
|
||||
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
let del_add_obkv = obkv::KvReader::new(field_bytes);
|
||||
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
|
||||
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||
None => None,
|
||||
};
|
||||
let add_value = match del_add_obkv.get(DelAdd::Addition) {
|
||||
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
match extract_facet_values(
|
||||
&value,
|
||||
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
|
||||
) {
|
||||
FilterableValues::Null => {
|
||||
facet_is_null_docids.entry(field_id).or_default().insert(document);
|
||||
}
|
||||
FilterableValues::Empty => {
|
||||
facet_is_empty_docids.entry(field_id).or_default().insert(document);
|
||||
}
|
||||
FilterableValues::Values { numbers, strings } => {
|
||||
// insert facet numbers in sorter
|
||||
for number in numbers {
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
// We insert the document id on the Del and the Add side if the field exists.
|
||||
let (ref mut del_exists, ref mut add_exists) =
|
||||
facet_exists_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_null, ref mut add_is_null) =
|
||||
facet_is_null_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_empty, ref mut add_is_empty) =
|
||||
facet_is_empty_docids.entry(field_id).or_default();
|
||||
|
||||
fid_docid_facet_numbers_sorter
|
||||
.insert(&key_buffer, ().as_bytes())?;
|
||||
}
|
||||
if del_value.is_some() {
|
||||
del_exists.insert(document);
|
||||
}
|
||||
if add_value.is_some() {
|
||||
add_exists.insert(document);
|
||||
}
|
||||
|
||||
let geo_support =
|
||||
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let del_filterable_values =
|
||||
del_value.map(|value| extract_facet_values(&value, geo_support));
|
||||
let add_filterable_values =
|
||||
add_value.map(|value| extract_facet_values(&value, geo_support));
|
||||
|
||||
// Those closures are just here to simplify things a bit.
|
||||
let mut insert_numbers_diff = |del_numbers, add_numbers| {
|
||||
insert_numbers_diff(
|
||||
&mut fid_docid_facet_numbers_sorter,
|
||||
&mut numbers_key_buffer,
|
||||
del_numbers,
|
||||
add_numbers,
|
||||
)
|
||||
};
|
||||
let mut insert_strings_diff = |del_strings, add_strings| {
|
||||
insert_strings_diff(
|
||||
&mut fid_docid_facet_strings_sorter,
|
||||
&mut strings_key_buffer,
|
||||
del_strings,
|
||||
add_strings,
|
||||
)
|
||||
};
|
||||
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(None, None) => (),
|
||||
(Some(del_filterable_values), None) => match del_filterable_values {
|
||||
Null => {
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
|
||||
// insert normalized and original facet string in sorter
|
||||
for (normalized, original) in
|
||||
strings.into_iter().filter(|(n, _)| !n.is_empty())
|
||||
{
|
||||
let normalized_truncated_value: String = normalized
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect();
|
||||
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
|
||||
fid_docid_facet_strings_sorter
|
||||
.insert(&key_buffer, original.as_bytes())?;
|
||||
Empty => {
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
},
|
||||
(None, Some(add_filterable_values)) => match add_filterable_values {
|
||||
Null => {
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
}
|
||||
},
|
||||
(Some(del_filterable_values), Some(add_filterable_values)) => {
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(Null, Null) | (Empty, Empty) => (),
|
||||
(Null, Empty) => {
|
||||
del_is_null.insert(document);
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
(Empty, Null) => {
|
||||
del_is_empty.insert(document);
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
(Null, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
(Empty, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
(Values { numbers, strings }, Null) => {
|
||||
add_is_null.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(Values { numbers, strings }, Empty) => {
|
||||
add_is_empty.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(
|
||||
Values { numbers: del_numbers, strings: del_strings },
|
||||
Values { numbers: add_numbers, strings: add_strings },
|
||||
) => {
|
||||
insert_numbers_diff(del_numbers, add_numbers)?;
|
||||
insert_strings_diff(del_strings, add_strings)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -130,14 +221,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
}
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut facet_exists_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, bitmap) in facet_exists_docids.into_iter() {
|
||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||
facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||
|
||||
@ -146,9 +238,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, bitmap) in facet_is_null_docids.into_iter() {
|
||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
||||
|
||||
@ -157,21 +249,156 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
|
||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
||||
|
||||
Ok(ExtractedFacetValues {
|
||||
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
||||
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
||||
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
||||
})
|
||||
}
|
||||
|
||||
/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
|
||||
fn deladd_obkv_cbo_roaring_bitmaps(
|
||||
buffer: &mut Vec<u8>,
|
||||
del_bitmap: &RoaringBitmap,
|
||||
add_bitmap: &RoaringBitmap,
|
||||
) -> io::Result<()> {
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(buffer);
|
||||
let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
|
||||
let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
|
||||
obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
|
||||
obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
|
||||
obkv.finish()
|
||||
}
|
||||
|
||||
/// Truncates a string to the biggest valid LMDB key size.
|
||||
fn truncate_string(s: String) -> String {
|
||||
s.char_indices()
|
||||
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Computes the diff between both Del and Add numbers and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn insert_numbers_diff<MF>(
|
||||
fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_numbers: Vec<f64>,
|
||||
mut add_numbers: Vec<f64>,
|
||||
) -> Result<()>
|
||||
where
|
||||
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
|
||||
{
|
||||
// We sort and dedup the float numbers
|
||||
del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||
add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||
del_numbers.dedup_by_key(|f| OrderedFloat(*f));
|
||||
add_numbers.dedup_by_key(|f| OrderedFloat(*f));
|
||||
|
||||
let merged_numbers_iter = itertools::merge_join_by(
|
||||
del_numbers.into_iter().map(OrderedFloat),
|
||||
add_numbers.into_iter().map(OrderedFloat),
|
||||
|del, add| del.cmp(add),
|
||||
);
|
||||
|
||||
// insert facet numbers in sorter
|
||||
for eob in merged_numbers_iter {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
match eob {
|
||||
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||
EitherOrBoth::Left(OrderedFloat(number)) => {
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
// We insert only the Del part of the Obkv to inform
|
||||
// that we only want to remove all those numbers.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Deletion, ().as_bytes())?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
EitherOrBoth::Right(OrderedFloat(number)) => {
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
// that we only want to remove all those numbers.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, ().as_bytes())?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Computes the diff between both Del and Add strings and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn insert_strings_diff<MF>(
|
||||
fid_docid_facet_strings_sorter: &mut Sorter<MF>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_strings: Vec<(String, String)>,
|
||||
mut add_strings: Vec<(String, String)>,
|
||||
) -> Result<()>
|
||||
where
|
||||
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
|
||||
{
|
||||
// We sort and dedup the normalized and original strings
|
||||
del_strings.sort_unstable();
|
||||
add_strings.sort_unstable();
|
||||
del_strings.dedup();
|
||||
add_strings.dedup();
|
||||
|
||||
let merged_strings_iter = itertools::merge_join_by(
|
||||
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||
|del, add| del.cmp(add),
|
||||
);
|
||||
|
||||
// insert normalized and original facet string in sorter
|
||||
for eob in merged_strings_iter {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
match eob {
|
||||
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||
EitherOrBoth::Left((normalized, original)) => {
|
||||
let truncated = truncate_string(normalized);
|
||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Deletion, original)?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
EitherOrBoth::Right((normalized, original)) => {
|
||||
let truncated = truncate_string(normalized);
|
||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, original)?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Represent what a document field contains.
|
||||
enum FilterableValues {
|
||||
/// Corresponds to the JSON `null` value.
|
||||
@ -182,6 +409,7 @@ enum FilterableValues {
|
||||
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
|
||||
}
|
||||
|
||||
/// Extracts the facet values of a JSON field.
|
||||
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
|
||||
fn inner_extract_facet_values(
|
||||
value: &Value,
|
||||
|
@ -1,16 +1,18 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use grenad::Sorter;
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||
try_split_array_at, GrenadParameters, MergeFn,
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||
GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::Result;
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
/// Extracts the field id word count and the documents ids where
|
||||
/// this field id with this amount of words appear.
|
||||
@ -28,70 +30,62 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
// This map is assumed to not consume a lot of memory.
|
||||
let mut document_fid_wordcount = HashMap::new();
|
||||
let mut current_document_id = None;
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let curr_document_id = *current_document_id.get_or_insert(document_id);
|
||||
if curr_document_id != document_id {
|
||||
drain_document_fid_wordcount_into_sorter(
|
||||
&mut fid_word_count_docids_sorter,
|
||||
&mut document_fid_wordcount,
|
||||
curr_document_id,
|
||||
)?;
|
||||
current_document_id = Some(document_id);
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
let deletion = del_add_reader
|
||||
// get deleted words
|
||||
.get(DelAdd::Deletion)
|
||||
// count deleted words
|
||||
.map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count())
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
let addition = del_add_reader
|
||||
// get added words
|
||||
.get(DelAdd::Addition)
|
||||
// count added words
|
||||
.map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count())
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
|
||||
if deletion != addition {
|
||||
// Insert deleted word count in sorter if exist.
|
||||
if let Some(word_count) = deletion {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
// Insert added word count in sorter if exist.
|
||||
if let Some(word_count) = addition {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
|
||||
for position in read_u32_ne_bytes(value) {
|
||||
let (field_id, _) = relative_from_absolute_position(position);
|
||||
|
||||
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
|
||||
*value += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
// We must make sure that don't lose the current document field id
|
||||
// word count map if we break because we reached the end of the chunk.
|
||||
drain_document_fid_wordcount_into_sorter(
|
||||
&mut fid_word_count_docids_sorter,
|
||||
&mut document_fid_wordcount,
|
||||
document_id,
|
||||
)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(fid_word_count_docids_sorter, indexer)
|
||||
}
|
||||
|
||||
fn drain_document_fid_wordcount_into_sorter(
|
||||
fid_word_count_docids_sorter: &mut Sorter<MergeFn>,
|
||||
document_fid_wordcount: &mut HashMap<FieldId, u32>,
|
||||
document_id: DocumentId,
|
||||
) -> Result<()> {
|
||||
let mut key_buffer = Vec::new();
|
||||
|
||||
for (fid, count) in document_fid_wordcount.drain() {
|
||||
if count <= 30 {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
key_buffer.push(count as u8);
|
||||
|
||||
fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::GeoError;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::extract_finite_float_from_value;
|
||||
use crate::{FieldId, InternalError, Result};
|
||||
|
||||
@ -30,39 +31,71 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
// since we only needs the primary key when we throw an error we create this getter to
|
||||
// lazily get it when needed
|
||||
// since we only need the primary key when we throw an error
|
||||
// we create this getter to lazily get it when needed
|
||||
let document_id = || -> Value {
|
||||
let document_id = obkv.get(primary_key_id).unwrap();
|
||||
serde_json::from_slice(document_id).unwrap()
|
||||
};
|
||||
|
||||
// first we get the two fields
|
||||
let lat = obkv.get(lat_fid);
|
||||
let lng = obkv.get(lng_fid);
|
||||
match (obkv.get(lat_fid), obkv.get(lng_fid)) {
|
||||
(Some(lat), Some(lng)) => {
|
||||
let deladd_lat_obkv = KvReaderDelAdd::new(lat);
|
||||
let deladd_lng_obkv = KvReaderDelAdd::new(lng);
|
||||
|
||||
if let Some((lat, lng)) = lat.zip(lng) {
|
||||
// then we extract the values
|
||||
let lat = extract_finite_float_from_value(
|
||||
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
|
||||
)
|
||||
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
|
||||
// then we extract the values
|
||||
let del_lat_lng = deladd_lat_obkv
|
||||
.get(DelAdd::Deletion)
|
||||
.zip(deladd_lng_obkv.get(DelAdd::Deletion))
|
||||
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
|
||||
.transpose()?;
|
||||
let add_lat_lng = deladd_lat_obkv
|
||||
.get(DelAdd::Addition)
|
||||
.zip(deladd_lng_obkv.get(DelAdd::Addition))
|
||||
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
|
||||
.transpose()?;
|
||||
|
||||
let lng = extract_finite_float_from_value(
|
||||
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
|
||||
)
|
||||
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
|
||||
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
writer.insert(docid_bytes, bytes)?;
|
||||
} else if lat.is_none() && lng.is_some() {
|
||||
return Err(GeoError::MissingLatitude { document_id: document_id() })?;
|
||||
} else if lat.is_some() && lng.is_none() {
|
||||
return Err(GeoError::MissingLongitude { document_id: document_id() })?;
|
||||
if del_lat_lng != add_lat_lng {
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
if let Some([lat, lng]) = del_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Deletion, bytes)?;
|
||||
}
|
||||
if let Some([lat, lng]) = add_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Addition, bytes)?;
|
||||
}
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(docid_bytes, bytes)?;
|
||||
}
|
||||
}
|
||||
(None, Some(_)) => {
|
||||
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
|
||||
}
|
||||
(Some(_), None) => {
|
||||
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
|
||||
}
|
||||
(None, None) => (),
|
||||
}
|
||||
// else => the _geo object was `null`, there is nothing to do
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
/// Extract the finite floats lat and lng from two bytes slices.
|
||||
fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
|
||||
let lat = extract_finite_float_from_value(
|
||||
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
|
||||
)
|
||||
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
|
||||
|
||||
let lng = extract_finite_float_from_value(
|
||||
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
|
||||
)
|
||||
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
|
||||
|
||||
Ok([lat, lng])
|
||||
}
|
||||
|
@ -1,13 +1,24 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::convert::TryFrom;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::mem::size_of;
|
||||
use std::str::from_utf8;
|
||||
|
||||
use bytemuck::cast_slice;
|
||||
use grenad::Writer;
|
||||
use itertools::EitherOrBoth;
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde_json::{from_slice, Value};
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::UserError;
|
||||
use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::try_split_at;
|
||||
use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
|
||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||
///
|
||||
@ -16,7 +27,6 @@ use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
|
||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
primary_key_id: FieldId,
|
||||
vectors_fid: FieldId,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
@ -27,43 +37,112 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
// this must always be serialized as (docid, external_docid);
|
||||
let (docid_bytes, external_id_bytes) =
|
||||
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
|
||||
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
// since we only needs the primary key when we throw an error we create this getter to
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value {
|
||||
let document_id = obkv.get(primary_key_id).unwrap();
|
||||
from_slice(document_id).unwrap()
|
||||
};
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
// first we retrieve the _vectors field
|
||||
if let Some(vectors) = obkv.get(vectors_fid) {
|
||||
// extract the vectors
|
||||
let vectors = match from_slice(vectors) {
|
||||
Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors),
|
||||
Err(_) => {
|
||||
return Err(UserError::InvalidVectorsType {
|
||||
document_id: document_id(),
|
||||
value: from_slice(vectors).map_err(InternalError::SerdeJson)?,
|
||||
}
|
||||
.into())
|
||||
}
|
||||
};
|
||||
if let Some(value) = obkv.get(vectors_fid) {
|
||||
let vectors_obkv = KvReaderDelAdd::new(value);
|
||||
|
||||
if let Some(vectors) = vectors {
|
||||
for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
|
||||
let index = u16::try_from(i).unwrap();
|
||||
let mut key = docid_bytes.to_vec();
|
||||
key.extend_from_slice(&index.to_be_bytes());
|
||||
let bytes = cast_slice(&vector);
|
||||
writer.insert(key, bytes)?;
|
||||
}
|
||||
}
|
||||
// then we extract the values
|
||||
let del_vectors = vectors_obkv
|
||||
.get(DelAdd::Deletion)
|
||||
.map(|vectors| extract_vectors(vectors, document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
let add_vectors = vectors_obkv
|
||||
.get(DelAdd::Addition)
|
||||
.map(|vectors| extract_vectors(vectors, document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
|
||||
// and we finally push the unique vectors into the writer
|
||||
push_vectors_diff(
|
||||
&mut writer,
|
||||
&mut key_buffer,
|
||||
del_vectors.unwrap_or_default(),
|
||||
add_vectors.unwrap_or_default(),
|
||||
)?;
|
||||
}
|
||||
// else => the `_vectors` object was `null`, there is nothing to do
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
/// Computes the diff between both Del and Add numbers and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn push_vectors_diff(
|
||||
writer: &mut Writer<BufWriter<File>>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_vectors: Vec<Vec<f32>>,
|
||||
mut add_vectors: Vec<Vec<f32>>,
|
||||
) -> Result<()> {
|
||||
// We sort and dedup the vectors
|
||||
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||
|
||||
let merged_vectors_iter =
|
||||
itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
|
||||
|
||||
// insert vectors into the writer
|
||||
for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
|
||||
// Generate the key by extending the unique index to it.
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let index = u16::try_from(i).unwrap();
|
||||
key_buffer.extend_from_slice(&index.to_be_bytes());
|
||||
|
||||
match eob {
|
||||
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||
EitherOrBoth::Left(vector) => {
|
||||
// We insert only the Del part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
EitherOrBoth::Right(vector) => {
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compares two vectors by using the OrderingFloat helper.
|
||||
fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
|
||||
a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
|
||||
}
|
||||
|
||||
/// Extracts the vectors from a JSON value.
|
||||
fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Option<Vec<Vec<f32>>>> {
|
||||
match from_slice(value) {
|
||||
Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)),
|
||||
Err(_) => Err(UserError::InvalidVectorsType {
|
||||
document_id: document_id(),
|
||||
value: from_slice(value).map_err(InternalError::SerdeJson)?,
|
||||
}
|
||||
.into()),
|
||||
}
|
||||
}
|
||||
|
@ -1,18 +1,20 @@
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
use heed::BytesDecode;
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
|
||||
try_split_array_at, GrenadParameters,
|
||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
|
||||
try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::index_documents::helpers::read_u32_ne_bytes;
|
||||
use crate::{relative_from_absolute_position, FieldId, Result};
|
||||
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::MergeFn;
|
||||
use crate::{DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the word and the documents ids where this word appear.
|
||||
///
|
||||
@ -26,65 +28,152 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
exact_attributes: &HashSet<FieldId>,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
) -> Result<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
let mut word_fid_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|x| x / 2),
|
||||
max_memory.map(|x| x / 3),
|
||||
);
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut del_words = BTreeSet::new();
|
||||
let mut add_words = BTreeSet::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let (fid_bytes, _) = try_split_array_at(fid_bytes)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
let fid = u16::from_be_bytes(fid_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (_pos, word) in KvReaderU16::new(deletion).iter() {
|
||||
del_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (_pos, word) in KvReaderU16::new(addition).iter() {
|
||||
add_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
words_into_sorter(
|
||||
document_id,
|
||||
fid,
|
||||
&mut key_buffer,
|
||||
&del_words,
|
||||
&add_words,
|
||||
&mut word_fid_docids_sorter,
|
||||
)?;
|
||||
|
||||
del_words.clear();
|
||||
add_words.clear();
|
||||
}
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|x| x / 3),
|
||||
);
|
||||
|
||||
let mut exact_word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|x| x / 2),
|
||||
max_memory.map(|x| x / 3),
|
||||
);
|
||||
|
||||
let mut value_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, positions)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||
let mut word_fid_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
|
||||
// TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// only keep the value if their is a change to apply in the DB.
|
||||
if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
|
||||
word_fid_docids_writer.insert(key, value)?;
|
||||
}
|
||||
|
||||
let (word, fid) = StrBEU16Codec::bytes_decode(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let bitmap = RoaringBitmap::from_iter(Some(document_id));
|
||||
serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
|
||||
|
||||
// If there are no exact attributes, we do not need to iterate over positions.
|
||||
if exact_attributes.is_empty() {
|
||||
word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
||||
// every words contained in an attribute set to exact must be pushed in the exact_words list.
|
||||
if exact_attributes.contains(&fid) {
|
||||
exact_word_docids_sorter.insert(word.as_bytes(), value)?;
|
||||
} else {
|
||||
let mut added_to_exact = false;
|
||||
let mut added_to_word_docids = false;
|
||||
for position in read_u32_ne_bytes(positions) {
|
||||
// as soon as we know that this word had been to both readers, we don't need to
|
||||
// iterate over the positions.
|
||||
if added_to_exact && added_to_word_docids {
|
||||
break;
|
||||
}
|
||||
let (fid, _) = relative_from_absolute_position(position);
|
||||
if exact_attributes.contains(&fid) && !added_to_exact {
|
||||
exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
||||
added_to_exact = true;
|
||||
} else if !added_to_word_docids {
|
||||
word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
||||
added_to_word_docids = true;
|
||||
}
|
||||
}
|
||||
word_docids_sorter.insert(word.as_bytes(), value)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
sorter_into_reader(word_docids_sorter, indexer)?,
|
||||
sorter_into_reader(exact_word_docids_sorter, indexer)?,
|
||||
writer_into_reader(word_fid_docids_writer)?,
|
||||
))
|
||||
}
|
||||
|
||||
fn words_into_sorter(
|
||||
document_id: DocumentId,
|
||||
fid: FieldId,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_words: &BTreeSet<Vec<u8>>,
|
||||
add_words: &BTreeSet<Vec<u8>>,
|
||||
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let word_bytes = match eob {
|
||||
Left(word_bytes) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
Right(word_bytes) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
Both(word_bytes, _) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1,51 +0,0 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||
try_split_array_at, GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::{relative_from_absolute_position, DocumentId, Result};
|
||||
|
||||
/// Extracts the word, field id, and the documents ids where this word appear at this field id.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_fid_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||
|
||||
for position in read_u32_ne_bytes(value) {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
let (fid, _) = relative_from_absolute_position(position);
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;
|
||||
|
||||
Ok(word_fid_docids_reader)
|
||||
}
|
@ -1,16 +1,18 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{cmp, io, mem, str, vec};
|
||||
use std::{cmp, io};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||
try_split_array_at, GrenadParameters, MergeFn,
|
||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
|
||||
writer_into_reader, GrenadParameters, MergeFn,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::{positions_proximity, MAX_DISTANCE};
|
||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
||||
@ -26,58 +28,137 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_pair_proximity_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
||||
.map(|_| {
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / MAX_DISTANCE as usize),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
// This map is assumed to not consume a lot of memory.
|
||||
let mut document_word_positions_heap = BinaryHeap::new();
|
||||
let mut del_word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
let mut add_word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
let mut del_word_pair_proximity = BTreeMap::new();
|
||||
let mut add_word_pair_proximity = BTreeMap::new();
|
||||
let mut current_document_id = None;
|
||||
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
let word = str::from_utf8(word_bytes)?;
|
||||
|
||||
let curr_document_id = *current_document_id.get_or_insert(document_id);
|
||||
if curr_document_id != document_id {
|
||||
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
|
||||
// if we change document, we fill the sorter
|
||||
if current_document_id.map_or(false, |id| id != document_id) {
|
||||
puffin::profile_scope!("Document into sorter");
|
||||
|
||||
document_word_positions_into_sorter(
|
||||
curr_document_id,
|
||||
document_word_positions_heap,
|
||||
&mut word_pair_proximity_docids_sorter,
|
||||
current_document_id.unwrap(),
|
||||
&del_word_pair_proximity,
|
||||
&add_word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorters,
|
||||
)?;
|
||||
current_document_id = Some(document_id);
|
||||
del_word_pair_proximity.clear();
|
||||
add_word_pair_proximity.clear();
|
||||
}
|
||||
|
||||
let word = word.to_string();
|
||||
let mut positions: Vec<_> = read_u32_ne_bytes(value).collect();
|
||||
positions.sort_unstable();
|
||||
let mut iter = positions.into_iter();
|
||||
if let Some(position) = iter.next() {
|
||||
document_word_positions_heap.push(PeekedWordPosition { word, position, iter });
|
||||
}
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
// deletions
|
||||
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
|
||||
for (position, word) in KvReaderU16::new(deletion).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while del_word_positions.get(0).map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
}) {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut del_word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
let word = std::str::from_utf8(word)?;
|
||||
del_word_positions.push_back((word.to_string(), position));
|
||||
}
|
||||
|
||||
while !del_word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut del_word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
|| {
|
||||
// additions
|
||||
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||
for (position, word) in KvReaderU16::new(addition).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while add_word_positions.get(0).map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
}) {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut add_word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
let word = std::str::from_utf8(word)?;
|
||||
add_word_positions.push_back((word.to_string(), position));
|
||||
}
|
||||
|
||||
while !add_word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut add_word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
del?;
|
||||
add?;
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
// We must make sure that don't lose the current document field id
|
||||
// word count map if we break because we reached the end of the chunk.
|
||||
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
|
||||
puffin::profile_scope!("Final document into sorter");
|
||||
document_word_positions_into_sorter(
|
||||
document_id,
|
||||
document_word_positions_heap,
|
||||
&mut word_pair_proximity_docids_sorter,
|
||||
&del_word_pair_proximity,
|
||||
&add_word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorters,
|
||||
)?;
|
||||
}
|
||||
{
|
||||
puffin::profile_scope!("sorter_into_reader");
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
sorter_into_reader(word_pair_proximity_docids_sorter, indexer)
|
||||
for sorter in word_pair_proximity_docids_sorters {
|
||||
sorter.write_into_stream_writer(&mut writer)?;
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
||||
@ -86,96 +167,66 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
/// close to each other.
|
||||
fn document_word_positions_into_sorter(
|
||||
document_id: DocumentId,
|
||||
mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>,
|
||||
word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>],
|
||||
) -> Result<()> {
|
||||
let mut word_pair_proximity = HashMap::new();
|
||||
let mut ordered_peeked_word_positions = Vec::new();
|
||||
while !word_positions_heap.is_empty() {
|
||||
while let Some(peeked_word_position) = word_positions_heap.pop() {
|
||||
ordered_peeked_word_positions.push(peeked_word_position);
|
||||
if ordered_peeked_word_positions.len() == 7 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((head, tail)) = ordered_peeked_word_positions.split_first() {
|
||||
for PeekedWordPosition { word, position, .. } in tail {
|
||||
let prox = positions_proximity(head.position, *position);
|
||||
if prox > 0 && prox < MAX_DISTANCE {
|
||||
word_pair_proximity
|
||||
.entry((head.word.clone(), word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
}
|
||||
}
|
||||
|
||||
// Push the tail in the heap.
|
||||
let tail_iter = ordered_peeked_word_positions.drain(1..);
|
||||
word_positions_heap.extend(tail_iter);
|
||||
|
||||
// Advance the head and push it in the heap.
|
||||
if let Some(mut head) = ordered_peeked_word_positions.pop() {
|
||||
if let Some(next_position) = head.iter.next() {
|
||||
let prox = positions_proximity(head.position, next_position);
|
||||
|
||||
if prox > 0 && prox < MAX_DISTANCE {
|
||||
word_pair_proximity
|
||||
.entry((head.word.clone(), head.word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
}
|
||||
|
||||
word_positions_heap.push(PeekedWordPosition {
|
||||
word: head.word,
|
||||
position: next_position,
|
||||
iter: head.iter,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut key_buffer = Vec::new();
|
||||
for ((w1, w2), prox) in word_pair_proximity {
|
||||
for eob in
|
||||
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
|
||||
d.cmp(a)
|
||||
})
|
||||
{
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let ((w1, w2), prox) = match eob {
|
||||
Left(key_value) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
Right(key_value) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
Both(key_value, _) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.push(prox as u8);
|
||||
key_buffer.push(*prox);
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
|
||||
word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||
word_pair_proximity_docids_sorters[*prox as usize - 1]
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct PeekedWordPosition<I> {
|
||||
word: String,
|
||||
position: u32,
|
||||
iter: I,
|
||||
}
|
||||
|
||||
impl<I> Ord for PeekedWordPosition<I> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.position.cmp(&other.position).reverse()
|
||||
}
|
||||
}
|
||||
|
||||
impl<I> PartialOrd for PeekedWordPosition<I> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<I> Eq for PeekedWordPosition<I> {}
|
||||
|
||||
impl<I> PartialEq for PeekedWordPosition<I> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.position == other.position
|
||||
fn word_positions_into_word_pair_proximity(
|
||||
word_positions: &mut VecDeque<(String, u16)>,
|
||||
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||
) -> Result<()> {
|
||||
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||
for (word, position) in word_positions.iter() {
|
||||
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||
word_pair_proximity
|
||||
.entry((head_word.clone(), word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1,13 +1,18 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||
try_split_array_at, GrenadParameters,
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||
GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::MergeFn;
|
||||
use crate::{bucketed_position, DocumentId, Result};
|
||||
|
||||
/// Extracts the word positions and the documents ids where this word appear.
|
||||
///
|
||||
@ -24,32 +29,111 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut word_position_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||
let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||
let mut current_document_id: Option<u32> = None;
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||
|
||||
for position in read_u32_ne_bytes(value) {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
let (_, position) = relative_from_absolute_position(position);
|
||||
let position = bucketed_position(position);
|
||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||
if current_document_id.map_or(false, |id| document_id != id) {
|
||||
words_position_into_sorter(
|
||||
current_document_id.unwrap(),
|
||||
&mut key_buffer,
|
||||
&del_word_positions,
|
||||
&add_word_positions,
|
||||
&mut word_position_docids_sorter,
|
||||
)?;
|
||||
del_word_positions.clear();
|
||||
add_word_positions.clear();
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
|
||||
let position = bucketed_position(position);
|
||||
del_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (position, word_bytes) in KvReaderU16::new(addition).iter() {
|
||||
let position = bucketed_position(position);
|
||||
add_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
words_position_into_sorter(
|
||||
document_id,
|
||||
&mut key_buffer,
|
||||
&del_word_positions,
|
||||
&add_word_positions,
|
||||
&mut word_position_docids_sorter,
|
||||
)?;
|
||||
}
|
||||
|
||||
// TODO remove noop DelAdd OBKV
|
||||
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
||||
|
||||
Ok(word_position_docids_reader)
|
||||
}
|
||||
|
||||
fn words_position_into_sorter(
|
||||
document_id: DocumentId,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
|
||||
{
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let (position, word_bytes) = match eob {
|
||||
Left(key) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
Right(key) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
Both(key, _) => {
|
||||
// both values needs to be kept because it will be used in other extractors.
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||
word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -6,7 +6,6 @@ mod extract_fid_word_count_docids;
|
||||
mod extract_geo_points;
|
||||
mod extract_vector_points;
|
||||
mod extract_word_docids;
|
||||
mod extract_word_fid_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod extract_word_position_docids;
|
||||
|
||||
@ -26,12 +25,11 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_vector_points::extract_vector_points;
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_fid_docids::extract_word_fid_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{
|
||||
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
||||
GrenadParameters, MergeFn, MergeableReader,
|
||||
as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
|
||||
MergeFn, MergeableReader,
|
||||
};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::{FieldId, Result};
|
||||
@ -65,7 +63,6 @@ pub(crate) fn data_from_obkv_documents(
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
vectors_field_id,
|
||||
primary_key_id,
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()?;
|
||||
@ -94,9 +91,9 @@ pub(crate) fn data_from_obkv_documents(
|
||||
let (
|
||||
docid_word_positions_chunks,
|
||||
(
|
||||
docid_fid_facet_numbers_chunks,
|
||||
fid_docid_facet_numbers_chunks,
|
||||
(
|
||||
docid_fid_facet_strings_chunks,
|
||||
fid_docid_facet_strings_chunks,
|
||||
(
|
||||
facet_is_null_docids_chunks,
|
||||
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
|
||||
@ -110,7 +107,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-exists-docids");
|
||||
match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
||||
match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
|
||||
}
|
||||
@ -126,7 +123,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-is-null-docids");
|
||||
match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
||||
match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
|
||||
}
|
||||
@ -142,7 +139,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-is-empty-docids");
|
||||
match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
||||
match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
|
||||
}
|
||||
@ -158,7 +155,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
@ -168,24 +165,31 @@ pub(crate) fn data_from_obkv_documents(
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdWordcountDocids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
"field-id-wordcount-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)>,
|
||||
Vec<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)>,
|
||||
>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
||||
merge_roaring_bitmaps,
|
||||
|(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
},
|
||||
"word-docids",
|
||||
);
|
||||
@ -195,36 +199,27 @@ pub(crate) fn data_from_obkv_documents(
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordPositionDocids,
|
||||
"word-position-docids",
|
||||
);
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_word_positions_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_fid_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordFidDocids,
|
||||
"word-fid-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_fid_facet_strings_chunks,
|
||||
fid_docid_facet_strings_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
"field-id-facet-string-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_fid_facet_numbers_chunks,
|
||||
fid_docid_facet_numbers_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx,
|
||||
extract_facet_number_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
"field-id-facet-number-docids",
|
||||
);
|
||||
@ -278,7 +273,6 @@ fn send_original_documents_data(
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
vectors_field_id: Option<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
) -> Result<()> {
|
||||
let original_documents_chunk =
|
||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
@ -287,12 +281,7 @@ fn send_original_documents_data(
|
||||
let documents_chunk_cloned = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
let result = extract_vector_points(
|
||||
documents_chunk_cloned,
|
||||
indexer,
|
||||
primary_key_id,
|
||||
vectors_field_id,
|
||||
);
|
||||
let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id);
|
||||
let _ = match result {
|
||||
Ok(vector_points) => {
|
||||
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
|
||||
@ -356,10 +345,10 @@ fn send_and_extract_flattened_documents_data(
|
||||
});
|
||||
}
|
||||
|
||||
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
rayon::join(
|
||||
|| {
|
||||
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
|
||||
let (docid_word_positions_chunk, script_language_pair) =
|
||||
extract_docid_word_positions(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
@ -370,9 +359,6 @@ fn send_and_extract_flattened_documents_data(
|
||||
max_positions_per_attributes,
|
||||
)?;
|
||||
|
||||
// send documents_ids to DB writer
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));
|
||||
|
||||
// send docid_word_positions_chunk to DB writer
|
||||
let docid_word_positions_chunk =
|
||||
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
||||
@ -384,8 +370,8 @@ fn send_and_extract_flattened_documents_data(
|
||||
},
|
||||
|| {
|
||||
let ExtractedFacetValues {
|
||||
docid_fid_facet_numbers_chunk,
|
||||
docid_fid_facet_strings_chunk,
|
||||
fid_docid_facet_numbers_chunk,
|
||||
fid_docid_facet_strings_chunk,
|
||||
fid_facet_is_null_docids_chunk,
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
fid_facet_exists_docids_chunk,
|
||||
@ -396,26 +382,26 @@ fn send_and_extract_flattened_documents_data(
|
||||
geo_fields_ids,
|
||||
)?;
|
||||
|
||||
// send docid_fid_facet_numbers_chunk to DB writer
|
||||
let docid_fid_facet_numbers_chunk =
|
||||
unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? };
|
||||
// send fid_docid_facet_numbers_chunk to DB writer
|
||||
let fid_docid_facet_numbers_chunk =
|
||||
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
||||
docid_fid_facet_numbers_chunk.clone(),
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
)));
|
||||
|
||||
// send docid_fid_facet_strings_chunk to DB writer
|
||||
let docid_fid_facet_strings_chunk =
|
||||
unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? };
|
||||
// send fid_docid_facet_strings_chunk to DB writer
|
||||
let fid_docid_facet_strings_chunk =
|
||||
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
||||
docid_fid_facet_strings_chunk.clone(),
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
)));
|
||||
|
||||
Ok((
|
||||
docid_fid_facet_numbers_chunk,
|
||||
fid_docid_facet_numbers_chunk,
|
||||
(
|
||||
docid_fid_facet_strings_chunk,
|
||||
fid_docid_facet_strings_chunk,
|
||||
(
|
||||
fid_facet_is_null_docids_chunk,
|
||||
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
|
||||
@ -425,5 +411,5 @@ fn send_and_extract_flattened_documents_data(
|
||||
},
|
||||
);
|
||||
|
||||
Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?))
|
||||
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
|
||||
}
|
||||
|
@ -1,14 +1,12 @@
|
||||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter, Seek};
|
||||
use std::time::Instant;
|
||||
|
||||
use grenad::{CompressionType, Sorter};
|
||||
use heed::types::ByteSlice;
|
||||
use log::debug;
|
||||
|
||||
use super::{ClonableMmap, MergeFn};
|
||||
use crate::error::InternalError;
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::Result;
|
||||
|
||||
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
|
||||
@ -47,6 +45,7 @@ pub fn create_sorter(
|
||||
builder.allow_realloc(false);
|
||||
}
|
||||
builder.sort_algorithm(sort_algorithm);
|
||||
builder.sort_in_parallel(true);
|
||||
builder.build()
|
||||
}
|
||||
|
||||
@ -54,6 +53,7 @@ pub fn sorter_into_reader(
|
||||
sorter: grenad::Sorter<MergeFn>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
@ -115,6 +115,32 @@ impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<Bu
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeableReader
|
||||
for Vec<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)>
|
||||
{
|
||||
type Output = (
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
);
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||
let mut m1 = MergerBuilder::new(merge_fn);
|
||||
let mut m2 = MergerBuilder::new(merge_fn);
|
||||
let mut m3 = MergerBuilder::new(merge_fn);
|
||||
for (r1, r2, r3) in self.into_iter() {
|
||||
m1.push(r1)?;
|
||||
m2.push(r2)?;
|
||||
m3.push(r3)?;
|
||||
}
|
||||
Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?))
|
||||
}
|
||||
}
|
||||
|
||||
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
|
||||
|
||||
impl<R: io::Read + io::Seek> MergerBuilder<R> {
|
||||
@ -195,11 +221,13 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
||||
);
|
||||
|
||||
while let Some((document_id, obkv)) = cursor.move_on_next()? {
|
||||
obkv_documents.insert(document_id, obkv)?;
|
||||
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
|
||||
if !obkv.is_empty() {
|
||||
obkv_documents.insert(document_id, obkv)?;
|
||||
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
|
||||
|
||||
if current_chunk_size >= documents_chunk_size as u64 {
|
||||
return writer_into_reader(obkv_documents).map(Some);
|
||||
if current_chunk_size >= documents_chunk_size as u64 {
|
||||
return writer_into_reader(obkv_documents).map(Some);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -210,45 +238,46 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
||||
Ok(std::iter::from_fn(move || transposer().transpose()))
|
||||
}
|
||||
|
||||
pub fn sorter_into_lmdb_database(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
/// Write provided sorter in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
pub fn write_sorter_into_database<K, V, FS, FM>(
|
||||
sorter: Sorter<MergeFn>,
|
||||
merge: MergeFn,
|
||||
) -> Result<()> {
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut heed::RwTxn,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
debug!("Writing MTBL sorter...");
|
||||
let before = Instant::now();
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<ByteSlice, ByteSlice>();
|
||||
|
||||
let mut merger_iter = sorter.into_stream_merger_iter()?;
|
||||
if database.is_empty(wtxn)? {
|
||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||
while let Some((k, v)) = merger_iter.next()? {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { out_iter.append(k, v)? };
|
||||
}
|
||||
} else {
|
||||
while let Some((k, v)) = merger_iter.next()? {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
|
||||
match iter.next().transpose()? {
|
||||
Some((key, old_val)) if key == k => {
|
||||
let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
|
||||
let val = merge(k, &vals).map_err(|_| {
|
||||
// TODO just wrap this error?
|
||||
InternalError::IndexingMergingKeys { process: "get-put-merge" }
|
||||
})?;
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(k, &val)? };
|
||||
while let Some((key, value)) = merger_iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = if index_is_empty {
|
||||
Some(serialize_value(value, &mut buffer)?)
|
||||
} else {
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
};
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
None => {
|
||||
database.delete(wtxn, key)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -6,22 +6,12 @@ use std::result::Result as StdResult;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::transform::Operation;
|
||||
use crate::Result;
|
||||
|
||||
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
|
||||
|
||||
pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let capacity = values.iter().map(|v| v.len()).sum::<usize>();
|
||||
let mut output = Vec::with_capacity(capacity);
|
||||
values.iter().for_each(|integers| output.extend_from_slice(integers));
|
||||
Ok(Cow::Owned(output))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||
buffer.clear();
|
||||
buffer.reserve(bitmap.serialized_size());
|
||||
@ -75,57 +65,123 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<
|
||||
Ok(obkvs.last().unwrap().clone())
|
||||
}
|
||||
|
||||
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
|
||||
pub fn merge_two_del_add_obkvs(
|
||||
base: obkv::KvReaderU16,
|
||||
update: obkv::KvReaderU16,
|
||||
merge_additions: bool,
|
||||
buffer: &mut Vec<u8>,
|
||||
) {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
buffer.clear();
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
match eob {
|
||||
Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(),
|
||||
Left((k, v)) => {
|
||||
if merge_additions {
|
||||
writer.insert(k, v).unwrap()
|
||||
} else {
|
||||
// If merge_additions is false, recreate an obkv keeping the deletions only.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::new(v);
|
||||
|
||||
if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
value_writer.finish().unwrap();
|
||||
writer.insert(k, &value_buffer).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
Right((k, v)) => writer.insert(k, v).unwrap(),
|
||||
Both((k, base), (_, update)) => {
|
||||
// merge deletions and additions.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::new(base);
|
||||
let update_reader = KvReaderDelAdd::new(update);
|
||||
|
||||
// keep newest deletion.
|
||||
if let Some(deletion) = update_reader
|
||||
.get(DelAdd::Deletion)
|
||||
.or_else(|| base_reader.get(DelAdd::Deletion))
|
||||
{
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
}
|
||||
|
||||
// keep base addition only if merge_additions is true.
|
||||
let base_addition =
|
||||
merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
|
||||
// keep newest addition.
|
||||
// TODO use or_else
|
||||
if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
|
||||
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||
}
|
||||
|
||||
value_writer.finish().unwrap();
|
||||
writer.insert(k, &value_buffer).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
|
||||
/// Merge all the obks in the order we see them.
|
||||
pub fn merge_obkvs_and_operations<'a>(
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
fn inner_merge_del_add_obkvs<'a>(
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
merge_additions: bool,
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
// pop the newest operation from the list.
|
||||
let (newest, obkvs) = obkvs.split_last().unwrap();
|
||||
// keep the operation type for the returned value.
|
||||
let newest_operation_type = newest[0];
|
||||
|
||||
// treat the newest obkv as the starting point of the merge.
|
||||
let mut acc_operation_type = newest_operation_type;
|
||||
let mut acc = newest[1..].to_vec();
|
||||
let mut buffer = Vec::new();
|
||||
// reverse iter from the most recent to the oldest.
|
||||
for current in obkvs.iter().rev() {
|
||||
// if in the previous iteration there was a complete deletion,
|
||||
// stop the merge process.
|
||||
if acc_operation_type == Operation::Deletion as u8 {
|
||||
break;
|
||||
}
|
||||
|
||||
let newest = obkv::KvReader::new(&acc);
|
||||
let oldest = obkv::KvReader::new(¤t[1..]);
|
||||
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
|
||||
|
||||
// we want the result of the merge into our accumulator.
|
||||
std::mem::swap(&mut acc, &mut buffer);
|
||||
acc_operation_type = current[0];
|
||||
}
|
||||
|
||||
acc.insert(0, newest_operation_type);
|
||||
Ok(Cow::from(acc))
|
||||
}
|
||||
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
pub fn obkvs_merge_additions_and_deletions<'a>(
|
||||
_key: &[u8],
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
// [add, add, delete, add, add]
|
||||
// we can ignore everything that happened before the last delete.
|
||||
let starting_position =
|
||||
obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0);
|
||||
|
||||
// [add, add, delete]
|
||||
// if the last operation was a deletion then we simply return the deletion
|
||||
if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8
|
||||
{
|
||||
return Ok(obkvs[obkvs.len() - 1].clone());
|
||||
}
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
// (add, add, delete) [add, add]
|
||||
// in the other case, no deletion will be encountered during the merge
|
||||
let mut ret =
|
||||
obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| {
|
||||
let first = obkv::KvReader::new(&acc);
|
||||
let second = obkv::KvReader::new(¤t[1..]);
|
||||
merge_two_obkvs(first, second, &mut buffer);
|
||||
|
||||
// we want the result of the merge into our accumulator
|
||||
std::mem::swap(&mut acc, &mut buffer);
|
||||
acc
|
||||
});
|
||||
|
||||
ret.insert(0, Operation::Addition as u8);
|
||||
Ok(Cow::from(ret))
|
||||
inner_merge_del_add_obkvs(obkvs, true)
|
||||
}
|
||||
|
||||
/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
|
||||
pub fn obkvs_keep_last_addition_merge_deletions<'a>(
|
||||
_key: &[u8],
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, false)
|
||||
}
|
||||
|
||||
/// Do a union of all the CboRoaringBitmaps in the values.
|
||||
pub fn merge_cbo_roaring_bitmaps<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
@ -138,3 +194,52 @@ pub fn merge_cbo_roaring_bitmaps<'a>(
|
||||
Ok(Cow::from(vec))
|
||||
}
|
||||
}
|
||||
|
||||
/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
|
||||
/// separately and outputs a new DelAdd with both unions.
|
||||
pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_bitmaps_bytes = Vec::new();
|
||||
let mut add_bitmaps_bytes = Vec::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
|
||||
del_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
|
||||
add_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let mut buffer = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
/// A function that merges a DelAdd of bitmao into an already existing bitmap.
|
||||
///
|
||||
/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
|
||||
/// the second one is the CboRoaringBitmap to merge into.
|
||||
pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
|
||||
deladd_obkv: &[u8],
|
||||
previous: &[u8],
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> Result<Option<&'a [u8]>> {
|
||||
Ok(CboRoaringBitmapCodec::merge_deladd_into(
|
||||
KvReaderDelAdd::new(deladd_obkv),
|
||||
previous,
|
||||
buffer,
|
||||
)?)
|
||||
}
|
||||
|
@ -9,13 +9,14 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
pub use grenad_helpers::{
|
||||
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
||||
merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader,
|
||||
merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
|
||||
GrenadParameters, MergeableReader,
|
||||
};
|
||||
pub use merge_functions::{
|
||||
concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
|
||||
merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps,
|
||||
serialize_roaring_bitmap, MergeFn,
|
||||
keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
|
||||
obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn,
|
||||
};
|
||||
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
@ -44,10 +45,6 @@ where
|
||||
Some((head, tail))
|
||||
}
|
||||
|
||||
pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
|
||||
bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
|
||||
}
|
||||
|
||||
/// Converts an fst Stream into an HashSet of Strings.
|
||||
pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>>
|
||||
where
|
||||
|
@ -20,11 +20,13 @@ use slice_group_by::GroupBy;
|
||||
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
|
||||
|
||||
use self::enrich::enrich_documents_batch;
|
||||
pub use self::enrich::{extract_finite_float_from_value, DocumentId};
|
||||
pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId};
|
||||
pub use self::helpers::{
|
||||
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
|
||||
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
||||
sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
|
||||
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader,
|
||||
ClonableMmap, MergeFn,
|
||||
};
|
||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
@ -32,13 +34,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
||||
use crate::update::{
|
||||
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
|
||||
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
||||
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
||||
};
|
||||
use crate::{Index, Result, RoaringBitmapCodec};
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||
|
||||
static MERGED_DATABASE_COUNT: usize = 7;
|
||||
static PREFIX_DATABASE_COUNT: usize = 5;
|
||||
static PREFIX_DATABASE_COUNT: usize = 4;
|
||||
static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@ -86,7 +87,6 @@ pub struct IndexDocumentsConfig {
|
||||
pub words_positions_level_group_size: Option<NonZeroU32>,
|
||||
pub words_positions_min_level_size: Option<NonZeroU32>,
|
||||
pub update_method: IndexDocumentsMethod,
|
||||
pub deletion_strategy: DeletionStrategy,
|
||||
pub autogenerate_docids: bool,
|
||||
}
|
||||
|
||||
@ -178,6 +178,7 @@ where
|
||||
|
||||
// Early return when there is no document to add
|
||||
if to_delete.is_empty() {
|
||||
// Maintains Invariant: remove documents actually always returns Ok for the inner result
|
||||
return Ok((self, Ok(0)));
|
||||
}
|
||||
|
||||
@ -190,14 +191,48 @@ where
|
||||
|
||||
self.deleted_documents += deleted_documents;
|
||||
|
||||
// Maintains Invariant: remove documents actually always returns Ok for the inner result
|
||||
Ok((self, Ok(deleted_documents)))
|
||||
}
|
||||
|
||||
/// Removes documents from db using their internal document ids.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This function is dangerous and will only work correctly if:
|
||||
///
|
||||
/// - All the passed ids currently exist in the database
|
||||
/// - No batching using the standards `remove_documents` and `add_documents` took place
|
||||
///
|
||||
/// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
|
||||
pub fn remove_documents_from_db_no_batch(
|
||||
mut self,
|
||||
to_delete: &RoaringBitmap,
|
||||
) -> Result<(Self, u64)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
// Early return when there is no document to add
|
||||
if to_delete.is_empty() {
|
||||
return Ok((self, 0));
|
||||
}
|
||||
|
||||
let deleted_documents = self
|
||||
.transform
|
||||
.as_mut()
|
||||
.expect("Invalid document deletion state")
|
||||
.remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)?
|
||||
as u64;
|
||||
|
||||
self.deleted_documents += deleted_documents;
|
||||
|
||||
Ok((self, deleted_documents))
|
||||
}
|
||||
|
||||
#[logging_timer::time("IndexDocuments::{}")]
|
||||
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
|
||||
puffin::profile_function!();
|
||||
|
||||
if self.added_documents == 0 {
|
||||
if self.added_documents == 0 && self.deleted_documents == 0 {
|
||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
|
||||
}
|
||||
@ -241,9 +276,6 @@ where
|
||||
primary_key,
|
||||
fields_ids_map,
|
||||
field_distribution,
|
||||
new_external_documents_ids,
|
||||
new_documents_ids,
|
||||
replaced_documents_ids,
|
||||
documents_count,
|
||||
original_documents,
|
||||
flattened_documents,
|
||||
@ -367,29 +399,12 @@ where
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
|
||||
// needs to be droped to avoid channel waiting lock.
|
||||
// needs to be dropped to avoid channel waiting lock.
|
||||
drop(lmdb_writer_sx)
|
||||
});
|
||||
|
||||
// We delete the documents that this document addition replaces. This way we are
|
||||
// able to simply insert all the documents even if they already exist in the database.
|
||||
if !replaced_documents_ids.is_empty() {
|
||||
let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?;
|
||||
deletion_builder.strategy(self.config.deletion_strategy);
|
||||
debug!("documents to delete {:?}", replaced_documents_ids);
|
||||
deletion_builder.delete_documents(&replaced_documents_ids);
|
||||
let deleted_documents_result = deletion_builder.execute_inner()?;
|
||||
debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
|
||||
}
|
||||
|
||||
let index_documents_ids = self.index.documents_ids(self.wtxn)?;
|
||||
let index_is_empty = index_documents_ids.is_empty();
|
||||
let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
|
||||
let mut final_documents_ids = RoaringBitmap::new();
|
||||
let mut word_pair_proximity_docids = None;
|
||||
let mut word_position_docids = None;
|
||||
let mut word_fid_docids = None;
|
||||
let mut word_docids = None;
|
||||
let mut exact_word_docids = None;
|
||||
|
||||
let mut databases_seen = 0;
|
||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
@ -397,35 +412,40 @@ where
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
let mut word_position_docids = None;
|
||||
let mut word_fid_docids = None;
|
||||
let mut word_docids = None;
|
||||
let mut exact_word_docids = None;
|
||||
|
||||
for result in lmdb_writer_rx {
|
||||
if (self.should_abort)() {
|
||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||
}
|
||||
|
||||
let typed_chunk = match result? {
|
||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
||||
word_docids = Some(cloneable_chunk);
|
||||
let cloneable_chunk =
|
||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
||||
exact_word_docids = Some(cloneable_chunk);
|
||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(chunk) => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||
word_pair_proximity_docids = Some(cloneable_chunk);
|
||||
TypedChunk::WordPairProximityDocids(chunk)
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
|
||||
word_fid_docids = Some(cloneable_chunk);
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
}
|
||||
TypedChunk::WordPositionDocids(chunk) => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||
word_position_docids = Some(cloneable_chunk);
|
||||
TypedChunk::WordPositionDocids(chunk)
|
||||
}
|
||||
TypedChunk::WordFidDocids(chunk) => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||
word_fid_docids = Some(cloneable_chunk);
|
||||
TypedChunk::WordFidDocids(chunk)
|
||||
}
|
||||
otherwise => otherwise,
|
||||
};
|
||||
|
||||
@ -457,25 +477,16 @@ where
|
||||
|
||||
// We write the primary key field id into the main database
|
||||
self.index.put_primary_key(self.wtxn, &primary_key)?;
|
||||
|
||||
// We write the external documents ids into the main database.
|
||||
let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
|
||||
external_documents_ids.insert_ids(&new_external_documents_ids)?;
|
||||
let external_documents_ids = external_documents_ids.into_static();
|
||||
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
|
||||
|
||||
let all_documents_ids = index_documents_ids | new_documents_ids;
|
||||
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
|
||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
|
||||
self.execute_prefix_databases(
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
word_fid_docids,
|
||||
)?;
|
||||
|
||||
Ok(all_documents_ids.len())
|
||||
Ok(number_of_documents)
|
||||
}
|
||||
|
||||
#[logging_timer::time("IndexDocuments::{}")]
|
||||
@ -483,7 +494,6 @@ where
|
||||
self,
|
||||
word_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
) -> Result<()>
|
||||
@ -604,32 +614,6 @@ where
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
|
||||
// Run the word prefix pair proximity docids update operation.
|
||||
PrefixWordPairsProximityDocids::new(
|
||||
self.wtxn,
|
||||
self.index,
|
||||
self.indexer_config.chunk_compression_type,
|
||||
self.indexer_config.chunk_compression_level,
|
||||
)
|
||||
.execute(
|
||||
word_pair_proximity_docids,
|
||||
&new_prefix_fst_words,
|
||||
&common_prefix_fst_words,
|
||||
&del_prefix_fst_words,
|
||||
)?;
|
||||
}
|
||||
|
||||
if (self.should_abort)() {
|
||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||
}
|
||||
|
||||
databases_seen += 1;
|
||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
if let Some(word_position_docids) = word_position_docids {
|
||||
// Run the words prefix position docids update operation.
|
||||
let mut builder = WordPrefixIntegerDocids::new(
|
||||
@ -687,8 +671,8 @@ where
|
||||
fn execute_word_prefix_docids(
|
||||
txn: &mut heed::RwTxn,
|
||||
reader: grenad::Reader<Cursor<ClonableMmap>>,
|
||||
word_docids_db: Database<Str, RoaringBitmapCodec>,
|
||||
word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
|
||||
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||
indexer_config: &IndexerConfig,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
@ -709,14 +693,15 @@ fn execute_word_prefix_docids(
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use big_s::S;
|
||||
use fst::IntoStreamer;
|
||||
use heed::RwTxn;
|
||||
use maplit::hashset;
|
||||
|
||||
use super::*;
|
||||
use crate::documents::documents_batch_reader_from_objects;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::search::TermsMatchingStrategy;
|
||||
use crate::update::DeleteDocuments;
|
||||
use crate::{db_snap, BEU16};
|
||||
use crate::{db_snap, Filter, Search, BEU16};
|
||||
|
||||
#[test]
|
||||
fn simple_document_replacement() {
|
||||
@ -807,11 +792,10 @@ mod tests {
|
||||
assert_eq!(count, 1);
|
||||
|
||||
// Check that we get only one document from the database.
|
||||
// Since the document has been deleted and re-inserted, its internal docid has been incremented to 1
|
||||
let docs = index.documents(&rtxn, Some(1)).unwrap();
|
||||
let docs = index.documents(&rtxn, Some(0)).unwrap();
|
||||
assert_eq!(docs.len(), 1);
|
||||
let (id, doc) = docs[0];
|
||||
assert_eq!(id, 1);
|
||||
assert_eq!(id, 0);
|
||||
|
||||
// Check that this document is equal to the last one sent.
|
||||
let mut doc_iter = doc.iter();
|
||||
@ -872,7 +856,7 @@ mod tests {
|
||||
assert_eq!(count, 3);
|
||||
|
||||
// the document 0 has been deleted and reinserted with the id 3
|
||||
let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap();
|
||||
let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap();
|
||||
let kevin_position =
|
||||
docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
|
||||
assert_eq!(kevin_position, 2);
|
||||
@ -1018,7 +1002,6 @@ mod tests {
|
||||
assert_eq!(count, 6);
|
||||
|
||||
db_snap!(index, word_docids, "updated");
|
||||
db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]");
|
||||
|
||||
drop(rtxn);
|
||||
}
|
||||
@ -1121,17 +1104,15 @@ mod tests {
|
||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||
]))
|
||||
.unwrap();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));
|
||||
|
||||
// Delete not all of the documents but some of them.
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_external_id("30");
|
||||
builder.execute().unwrap();
|
||||
index.delete_document("30");
|
||||
|
||||
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
||||
assert!(external_documents_ids.get("30").is_none());
|
||||
wtxn.commit().unwrap();
|
||||
let txn = index.read_txn().unwrap();
|
||||
assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId"));
|
||||
|
||||
let external_documents_ids = index.external_documents_ids();
|
||||
assert!(external_documents_ids.get(&txn, "30").unwrap().is_none());
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
@ -1140,8 +1121,8 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let wtxn = index.write_txn().unwrap();
|
||||
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
||||
assert!(external_documents_ids.get("30").is_some());
|
||||
let external_documents_ids = index.external_documents_ids();
|
||||
assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some());
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index
|
||||
@ -1435,8 +1416,10 @@ mod tests {
|
||||
index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
|
||||
assert!(external_documents_ids.get("1").is_some());
|
||||
let all_documents_count = index.all_documents(&rtxn).unwrap().count();
|
||||
assert_eq!(all_documents_count, 1);
|
||||
let external_documents_ids = index.external_documents_ids();
|
||||
assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -1490,12 +1473,6 @@ mod tests {
|
||||
3 2 second second
|
||||
3 3 third third
|
||||
"###);
|
||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||
0 []
|
||||
1 []
|
||||
2 []
|
||||
3 [0, 1, 2, 3, ]
|
||||
"###);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
@ -1519,12 +1496,6 @@ mod tests {
|
||||
|
||||
db_snap!(index, facet_id_string_docids, @"");
|
||||
db_snap!(index, field_id_docid_facet_strings, @"");
|
||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||
0 []
|
||||
1 []
|
||||
2 []
|
||||
3 [0, 1, 2, 3, ]
|
||||
"###);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
@ -1551,12 +1522,6 @@ mod tests {
|
||||
3 2 second second
|
||||
3 3 third third
|
||||
"###);
|
||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||
0 []
|
||||
1 []
|
||||
2 []
|
||||
3 [0, 1, 2, 3, ]
|
||||
"###);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
@ -1719,7 +1684,7 @@ mod tests {
|
||||
|
||||
let wtxn = index.read_txn().unwrap();
|
||||
|
||||
let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map();
|
||||
let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap();
|
||||
let ids = map.values().collect::<HashSet<_>>();
|
||||
|
||||
assert_eq!(ids.len(), map.len());
|
||||
@ -2531,17 +2496,8 @@ mod tests {
|
||||
db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
|
||||
db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
// Delete not all of the documents but some of them.
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
||||
builder.delete_external_id("0");
|
||||
builder.delete_external_id("3");
|
||||
let result = builder.execute().unwrap();
|
||||
println!("{result:?}");
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
index.delete_documents(vec!["0".into(), "3".into()]);
|
||||
|
||||
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
|
||||
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
|
||||
@ -2596,8 +2552,7 @@ mod tests {
|
||||
),
|
||||
]
|
||||
*/
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
|
||||
let index = TempIndex::new();
|
||||
|
||||
// START OF BATCH
|
||||
|
||||
@ -2637,8 +2592,7 @@ mod tests {
|
||||
{"id":1,"doggo":"bernese"}
|
||||
"###);
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
soft:
|
||||
hard:
|
||||
docids:
|
||||
1 0
|
||||
"###);
|
||||
|
||||
@ -2683,13 +2637,10 @@ mod tests {
|
||||
"###);
|
||||
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
soft:
|
||||
hard:
|
||||
docids:
|
||||
0 1
|
||||
"###);
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
|
||||
// BATCH 3
|
||||
|
||||
println!("--- ENTERING BATCH 3");
|
||||
@ -2731,4 +2682,537 @@ mod tests {
|
||||
let res = index.search(&rtxn).execute().unwrap();
|
||||
index.documents(&rtxn, res.documents_ids).unwrap();
|
||||
}
|
||||
|
||||
fn delete_documents<'t>(
|
||||
wtxn: &mut RwTxn<'t, '_>,
|
||||
index: &'t TempIndex,
|
||||
external_ids: &[&str],
|
||||
) -> Vec<u32> {
|
||||
let external_document_ids = index.external_documents_ids();
|
||||
let ids_to_delete: Vec<u32> = external_ids
|
||||
.iter()
|
||||
.map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap())
|
||||
.collect();
|
||||
|
||||
// Delete some documents.
|
||||
index.delete_documents_using_wtxn(
|
||||
wtxn,
|
||||
external_ids.iter().map(ToString::to_string).collect(),
|
||||
);
|
||||
|
||||
ids_to_delete
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delete_documents_with_numbers_as_primary_key() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
|
||||
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
|
||||
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// delete those documents, ids are synchronous therefore 0, 1, and 2.
|
||||
index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]);
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// All these snapshots should be empty since the database was cleared
|
||||
db_snap!(index, documents_ids);
|
||||
db_snap!(index, word_docids);
|
||||
db_snap!(index, word_pair_proximity_docids);
|
||||
db_snap!(index, facet_id_exists_docids);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delete_documents_with_strange_primary_key() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()]))
|
||||
.unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "mysuperid": 0, "name": "kevin" },
|
||||
{ "mysuperid": 1, "name": "kevina" },
|
||||
{ "mysuperid": 2, "name": "benoit" }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
// Delete not all of the documents but some of them.
|
||||
index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]);
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents_ids);
|
||||
db_snap!(index, word_docids);
|
||||
db_snap!(index, word_pair_proximity_docids);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filtered_placeholder_search_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
settings.set_filterable_fields(hashset! { S("label"), S("label2") });
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "docid": "1_4", "label": ["sign"] },
|
||||
{ "docid": "1_5", "label": ["letter"] },
|
||||
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||
{ "docid": "1_39", "label": ["abstract"] },
|
||||
{ "docid": "1_40", "label": ["cartoon"] },
|
||||
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||
{ "docid": "1_44", "label": ["drawing"] },
|
||||
{ "docid": "1_45", "label": ["art"] },
|
||||
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||
{ "docid": "1_68", "label": ["design"] },
|
||||
{ "docid": "1_69", "label": ["geometry"] },
|
||||
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]);
|
||||
|
||||
// Placeholder search with filter
|
||||
let filter = Filter::from_str("label = sign").unwrap().unwrap();
|
||||
let results = index.search(&wtxn).filter(filter).execute().unwrap();
|
||||
assert!(results.documents_ids.is_empty());
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, word_docids);
|
||||
db_snap!(index, facet_id_f64_docids);
|
||||
db_snap!(index, word_pair_proximity_docids);
|
||||
db_snap!(index, facet_id_exists_docids);
|
||||
db_snap!(index, facet_id_string_docids);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn placeholder_search_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "docid": "1_4", "label": ["sign"] },
|
||||
{ "docid": "1_5", "label": ["letter"] },
|
||||
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||
{ "docid": "1_39", "label": ["abstract"] },
|
||||
{ "docid": "1_40", "label": ["cartoon"] },
|
||||
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||
{ "docid": "1_44", "label": ["drawing"] },
|
||||
{ "docid": "1_45", "label": ["art"] },
|
||||
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||
{ "docid": "1_68", "label": ["design"] },
|
||||
{ "docid": "1_69", "label": ["geometry"] },
|
||||
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]);
|
||||
|
||||
// Placeholder search
|
||||
let results = index.search(&wtxn).execute().unwrap();
|
||||
assert!(!results.documents_ids.is_empty());
|
||||
for id in results.documents_ids.iter() {
|
||||
assert!(
|
||||
!deleted_internal_ids.contains(id),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "docid": "1_4", "label": ["sign"] },
|
||||
{ "docid": "1_5", "label": ["letter"] },
|
||||
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||
{ "docid": "1_39", "label": ["abstract"] },
|
||||
{ "docid": "1_40", "label": ["cartoon"] },
|
||||
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||
{ "docid": "1_44", "label": ["drawing"] },
|
||||
{ "docid": "1_45", "label": ["art"] },
|
||||
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||
{ "docid": "1_68", "label": ["design"] },
|
||||
{ "docid": "1_69", "label": ["geometry"] },
|
||||
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
|
||||
|
||||
// search for abstract
|
||||
let results = index.search(&wtxn).query("abstract").execute().unwrap();
|
||||
assert!(!results.documents_ids.is_empty());
|
||||
for id in results.documents_ids.iter() {
|
||||
assert!(
|
||||
!deleted_internal_ids.contains(id),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("id"));
|
||||
settings.set_filterable_fields(hashset!(S("_geo")));
|
||||
settings.set_sortable_fields(hashset!(S("_geo")));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index.add_documents_using_wtxn(&mut wtxn, documents!([
|
||||
{ "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } },
|
||||
{ "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } },
|
||||
{ "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } },
|
||||
{ "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
|
||||
{ "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } },
|
||||
{ "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } },
|
||||
{ "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } },
|
||||
{ "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } },
|
||||
{ "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } },
|
||||
{ "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } },
|
||||
{ "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } },
|
||||
{ "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } },
|
||||
{ "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } },
|
||||
{ "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } },
|
||||
{ "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } },
|
||||
{ "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } },
|
||||
{ "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } },
|
||||
{ "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } },
|
||||
{ "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } },
|
||||
{ "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } }
|
||||
])).unwrap();
|
||||
|
||||
let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
|
||||
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete);
|
||||
|
||||
// Placeholder search with geo filter
|
||||
let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
|
||||
let results = index.search(&wtxn).filter(filter).execute().unwrap();
|
||||
assert!(!results.documents_ids.is_empty());
|
||||
for id in results.documents_ids.iter() {
|
||||
assert!(
|
||||
!deleted_internal_ids.contains(id),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids);
|
||||
db_snap!(index, facet_id_string_docids);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn get_documents_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "docid": "1_4", "label": ["sign"] },
|
||||
{ "docid": "1_5", "label": ["letter"] },
|
||||
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||
{ "docid": "1_39", "label": ["abstract"] },
|
||||
{ "docid": "1_40", "label": ["cartoon"] },
|
||||
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||
{ "docid": "1_44", "label": ["drawing"] },
|
||||
{ "docid": "1_45", "label": ["art"] },
|
||||
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||
{ "docid": "1_68", "label": ["design"] },
|
||||
{ "docid": "1_69", "label": ["geometry"] },
|
||||
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let deleted_external_ids = ["1_7", "1_52"];
|
||||
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids);
|
||||
|
||||
// list all documents
|
||||
let results = index.all_documents(&wtxn).unwrap();
|
||||
for result in results {
|
||||
let (id, _) = result.unwrap();
|
||||
assert!(
|
||||
!deleted_internal_ids.contains(&id),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
|
||||
// list internal document ids
|
||||
let results = index.documents_ids(&wtxn).unwrap();
|
||||
for id in results {
|
||||
assert!(
|
||||
!deleted_internal_ids.contains(&id),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
// get internal docids from deleted external document ids
|
||||
let results = index.external_documents_ids();
|
||||
for id in deleted_external_ids {
|
||||
assert!(
|
||||
results.get(&rtxn, id).unwrap().is_none(),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
drop(rtxn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stats_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index.add_documents_using_wtxn(&mut wtxn, documents!([
|
||||
{ "docid": "1_4", "label": ["sign"]},
|
||||
{ "docid": "1_5", "label": ["letter"]},
|
||||
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
|
||||
{ "docid": "1_36", "label": ["drawing","painting","pattern"]},
|
||||
{ "docid": "1_37", "label": ["art","drawing","outdoor"]},
|
||||
{ "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
|
||||
{ "docid": "1_39", "label": ["abstract"]},
|
||||
{ "docid": "1_40", "label": ["cartoon"]},
|
||||
{ "docid": "1_41", "label": ["art","drawing"]},
|
||||
{ "docid": "1_42", "label": ["art","pattern"]},
|
||||
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
|
||||
{ "docid": "1_44", "label": ["drawing"], "number": 44i32},
|
||||
{ "docid": "1_45", "label": ["art"]},
|
||||
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
|
||||
{ "docid": "1_47", "label": ["abstract","pattern"]},
|
||||
{ "docid": "1_52", "label": ["abstract","cartoon"]},
|
||||
{ "docid": "1_57", "label": ["abstract","drawing","pattern"]},
|
||||
{ "docid": "1_58", "label": ["abstract","art","cartoon"]},
|
||||
{ "docid": "1_68", "label": ["design"]},
|
||||
{ "docid": "1_69", "label": ["geometry"]}
|
||||
])).unwrap();
|
||||
|
||||
delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
|
||||
|
||||
// count internal documents
|
||||
let results = index.number_of_documents(&wtxn).unwrap();
|
||||
assert_eq!(18, results);
|
||||
|
||||
// count field distribution
|
||||
let results = index.field_distribution(&wtxn).unwrap();
|
||||
assert_eq!(Some(&18), results.get("label"));
|
||||
assert_eq!(Some(&1), results.get("title"));
|
||||
assert_eq!(Some(&2), results.get("number"));
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stored_detected_script_and_language_should_not_return_deleted_documents() {
|
||||
use charabia::{Language, Script};
|
||||
let index = TempIndex::new();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
|
||||
{ "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
|
||||
{ "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
|
||||
{ "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
|
||||
{ "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
|
||||
{ "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
let key_cmn = (Script::Cj, Language::Cmn);
|
||||
let cj_cmn_docs =
|
||||
index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
|
||||
let mut expected_cj_cmn_docids = RoaringBitmap::new();
|
||||
expected_cj_cmn_docids.push(1);
|
||||
expected_cj_cmn_docids.push(5);
|
||||
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
|
||||
|
||||
delete_documents(&mut wtxn, &index, &["1"]);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let cj_cmn_docs =
|
||||
index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
|
||||
let mut expected_cj_cmn_docids = RoaringBitmap::new();
|
||||
expected_cj_cmn_docids.push(5);
|
||||
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delete_words_exact_attributes() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key(S("id"));
|
||||
settings.set_searchable_fields(vec![S("text"), S("exact")]);
|
||||
settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{ "id": 0, "text": "hello" },
|
||||
{ "id": 1, "exact": "hello"}
|
||||
]))
|
||||
.unwrap();
|
||||
db_snap!(index, word_docids, 1, @r###"
|
||||
hello [0, ]
|
||||
"###);
|
||||
db_snap!(index, exact_word_docids, 1, @r###"
|
||||
hello [1, ]
|
||||
"###);
|
||||
db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, word_docids, 2, @r###"
|
||||
hello [0, ]
|
||||
"###);
|
||||
db_snap!(index, exact_word_docids, 2, @"");
|
||||
db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
|
||||
|
||||
insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
|
||||
let txn = index.read_txn().unwrap();
|
||||
let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
|
||||
insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.query("hello");
|
||||
let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
[]
|
@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
[2, ]
|
@ -0,0 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
benoit [2, ]
|
||||
|
@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/delete_documents.rs
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
|
||||
2 [21, ]
|
@ -0,0 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
2 0 2.2 1 [21, ]
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/delete_documents.rs
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ]
|
||||
1 0 aquarium 1 [5, ]
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/delete_documents.rs
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
|
||||
2 [21, ]
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/delete_documents.rs
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 1 36 [3, ]
|
||||
1 1 37 [4, ]
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/delete_documents.rs
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
3 0 48.9021 1 [19, ]
|
||||
3 0 49.9314 1 [17, ]
|
@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
@ -1,60 +1,56 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
0 [1, 7, ]
|
||||
0 [1, ]
|
||||
1 [2, ]
|
||||
10 [1, 7, ]
|
||||
12 [0, 8, ]
|
||||
10 [1, ]
|
||||
12 [0, ]
|
||||
1344 [3, ]
|
||||
1813 [8, ]
|
||||
2 [0, 8, ]
|
||||
1813 [0, ]
|
||||
2 [0, ]
|
||||
23 [5, ]
|
||||
25 [2, ]
|
||||
3 [0, 8, ]
|
||||
3 [0, ]
|
||||
35 [5, ]
|
||||
4 [4, 6, ]
|
||||
42 [0, 5, 8, ]
|
||||
456 [1, 7, ]
|
||||
5 [0, 8, ]
|
||||
4 [4, ]
|
||||
42 [0, 5, ]
|
||||
456 [1, ]
|
||||
5 [0, ]
|
||||
99 [2, ]
|
||||
adams [5, ]
|
||||
adventure [1, 7, ]
|
||||
adventure [1, ]
|
||||
alice [2, ]
|
||||
and [0, 4, 6, 8, ]
|
||||
antoine [1, 7, ]
|
||||
austen [8, ]
|
||||
austin [0, ]
|
||||
blood [4, 6, ]
|
||||
and [0, 4, ]
|
||||
antoine [1, ]
|
||||
austen [0, ]
|
||||
blood [4, ]
|
||||
carroll [2, ]
|
||||
de [1, 7, ]
|
||||
de [1, ]
|
||||
douglas [5, ]
|
||||
exupery [1, 7, ]
|
||||
fantasy [2, 3, 4, 6, ]
|
||||
exupery [1, ]
|
||||
fantasy [2, 3, 4, ]
|
||||
galaxy [5, ]
|
||||
guide [5, ]
|
||||
half [4, 6, ]
|
||||
harry [4, 6, ]
|
||||
half [4, ]
|
||||
harry [4, ]
|
||||
hitchhiker [5, ]
|
||||
hobbit [3, ]
|
||||
in [2, ]
|
||||
j [3, 4, 6, 8, ]
|
||||
jane [0, ]
|
||||
k [4, 6, ]
|
||||
le [1, ]
|
||||
j [0, 3, 4, ]
|
||||
k [4, ]
|
||||
lewis [2, ]
|
||||
little [7, ]
|
||||
petit [1, ]
|
||||
potter [4, 6, ]
|
||||
prejudice [0, 8, ]
|
||||
pride [0, 8, ]
|
||||
prince [1, 4, 7, ]
|
||||
princess [6, ]
|
||||
little [1, ]
|
||||
potter [4, ]
|
||||
prejudice [0, ]
|
||||
pride [0, ]
|
||||
prince [1, ]
|
||||
princess [4, ]
|
||||
r [3, ]
|
||||
romance [0, 8, ]
|
||||
rowling [4, 6, ]
|
||||
romance [0, ]
|
||||
rowling [4, ]
|
||||
s [5, ]
|
||||
saint [1, 7, ]
|
||||
the [3, 4, 5, 6, 7, ]
|
||||
saint [1, ]
|
||||
the [1, 3, 4, 5, ]
|
||||
to [5, ]
|
||||
tolkien [3, ]
|
||||
wonderland [2, ]
|
||||
|
@ -1,5 +1,6 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::btree_map::Entry as BEntry;
|
||||
use std::collections::hash_map::Entry as HEntry;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek};
|
||||
@ -7,18 +8,21 @@ use std::io::{Read, Seek};
|
||||
use fxhash::FxHashMap;
|
||||
use heed::RoTxn;
|
||||
use itertools::Itertools;
|
||||
use obkv::{KvReader, KvWriter};
|
||||
use obkv::{KvReader, KvReaderU16, KvWriter};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
use smartstring::SmartString;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn,
|
||||
create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions,
|
||||
obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn,
|
||||
};
|
||||
use super::{IndexDocumentsMethod, IndexerConfig};
|
||||
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::index::{db_name, main_key};
|
||||
use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
|
||||
use crate::update::index_documents::GrenadParameters;
|
||||
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
|
||||
use crate::{
|
||||
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
|
||||
@ -28,9 +32,6 @@ pub struct TransformOutput {
|
||||
pub primary_key: String,
|
||||
pub fields_ids_map: FieldsIdsMap,
|
||||
pub field_distribution: FieldDistribution,
|
||||
pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>,
|
||||
pub new_documents_ids: RoaringBitmap,
|
||||
pub replaced_documents_ids: RoaringBitmap,
|
||||
pub documents_count: usize,
|
||||
pub original_documents: File,
|
||||
pub flattened_documents: File,
|
||||
@ -106,8 +107,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
// We must choose the appropriate merge function for when two or more documents
|
||||
// with the same user id must be merged or fully replaced in the same batch.
|
||||
let merge_function = match index_documents_method {
|
||||
IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv,
|
||||
IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations,
|
||||
IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions,
|
||||
IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions,
|
||||
};
|
||||
|
||||
// We initialize the sorter with the user indexing settings.
|
||||
@ -130,17 +131,13 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
indexer_settings.max_memory.map(|mem| mem / 2),
|
||||
);
|
||||
let documents_ids = index.documents_ids(wtxn)?;
|
||||
let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?;
|
||||
|
||||
Ok(Transform {
|
||||
index,
|
||||
fields_ids_map: index.fields_ids_map(wtxn)?,
|
||||
indexer_settings,
|
||||
autogenerate_docids,
|
||||
available_documents_ids: AvailableDocumentsIds::from_documents_ids(
|
||||
&documents_ids,
|
||||
&soft_deleted_documents_ids,
|
||||
),
|
||||
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
|
||||
original_sorter,
|
||||
flattened_sorter,
|
||||
index_documents_method,
|
||||
@ -151,6 +148,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
})
|
||||
}
|
||||
|
||||
#[logging_timer::time]
|
||||
pub fn read_documents<R, FP, FA>(
|
||||
&mut self,
|
||||
reader: EnrichedDocumentsBatchReader<R>,
|
||||
@ -163,8 +161,10 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
FA: Fn() -> bool + Sync,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
|
||||
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||
let external_documents_ids = self.index.external_documents_ids();
|
||||
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
|
||||
|
||||
let primary_key = cursor.primary_key().to_string();
|
||||
@ -172,7 +172,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
|
||||
|
||||
let mut obkv_buffer = Vec::new();
|
||||
let mut document_sorter_buffer = Vec::new();
|
||||
let mut document_sorter_value_buffer = Vec::new();
|
||||
let mut document_sorter_key_buffer = Vec::new();
|
||||
let mut documents_count = 0;
|
||||
let mut docid_buffer: Vec<u8> = Vec::new();
|
||||
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
|
||||
@ -213,29 +214,30 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2));
|
||||
|
||||
// Build the new obkv document.
|
||||
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
|
||||
let mut writer = KvWriter::new(&mut obkv_buffer);
|
||||
for (k, v) in field_buffer_cache.iter() {
|
||||
writer.insert(*k, v)?;
|
||||
}
|
||||
|
||||
let mut original_docid = None;
|
||||
|
||||
let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
|
||||
Entry::Occupied(entry) => *entry.get() as u32,
|
||||
Entry::Vacant(entry) => {
|
||||
// If the document was already in the db we mark it as a replaced document.
|
||||
// It'll be deleted later.
|
||||
if let Some(docid) = external_documents_ids.get(entry.key()) {
|
||||
// If it was already in the list of replaced documents it means it was deleted
|
||||
// by the remove_document method. We should starts as if it never existed.
|
||||
if self.replaced_documents_ids.insert(docid) {
|
||||
original_docid = Some(docid);
|
||||
HEntry::Occupied(entry) => *entry.get() as u32,
|
||||
HEntry::Vacant(entry) => {
|
||||
let docid = match external_documents_ids.get(wtxn, entry.key())? {
|
||||
Some(docid) => {
|
||||
// If it was already in the list of replaced documents it means it was deleted
|
||||
// by the remove_document method. We should starts as if it never existed.
|
||||
if self.replaced_documents_ids.insert(docid) {
|
||||
original_docid = Some(docid);
|
||||
}
|
||||
|
||||
docid
|
||||
}
|
||||
}
|
||||
let docid = self
|
||||
.available_documents_ids
|
||||
.next()
|
||||
.ok_or(UserError::DocumentLimitReached)?;
|
||||
None => self
|
||||
.available_documents_ids
|
||||
.next()
|
||||
.ok_or(UserError::DocumentLimitReached)?,
|
||||
};
|
||||
entry.insert(docid as u64);
|
||||
docid
|
||||
}
|
||||
@ -263,47 +265,68 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
skip_insertion = true;
|
||||
} else {
|
||||
// we associate the base document with the new key, everything will get merged later.
|
||||
document_sorter_buffer.clear();
|
||||
document_sorter_buffer.push(Operation::Addition as u8);
|
||||
document_sorter_buffer.extend_from_slice(base_obkv);
|
||||
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
||||
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
|
||||
Some(flattened_obkv) => {
|
||||
// we recreate our buffer with the flattened documents
|
||||
document_sorter_buffer.clear();
|
||||
document_sorter_buffer.push(Operation::Addition as u8);
|
||||
document_sorter_buffer.extend_from_slice(&flattened_obkv);
|
||||
self.flattened_sorter
|
||||
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
|
||||
let deladd_operation = match self.index_documents_method {
|
||||
IndexDocumentsMethod::UpdateDocuments => {
|
||||
DelAddOperation::DeletionAndAddition
|
||||
}
|
||||
None => self
|
||||
.flattened_sorter
|
||||
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
|
||||
IndexDocumentsMethod::ReplaceDocuments => DelAddOperation::Deletion,
|
||||
};
|
||||
document_sorter_key_buffer.clear();
|
||||
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(base_obkv),
|
||||
deladd_operation,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
self.original_sorter
|
||||
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
let base_obkv = KvReader::new(base_obkv);
|
||||
if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? {
|
||||
// we recreate our buffer with the flattened documents
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&flattened_obkv),
|
||||
deladd_operation,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
}
|
||||
self.flattened_sorter
|
||||
.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !skip_insertion {
|
||||
self.new_documents_ids.insert(docid);
|
||||
|
||||
document_sorter_buffer.clear();
|
||||
document_sorter_buffer.push(Operation::Addition as u8);
|
||||
document_sorter_buffer.extend_from_slice(&obkv_buffer);
|
||||
document_sorter_key_buffer.clear();
|
||||
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&obkv_buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
// We use the extracted/generated user id as the key for this document.
|
||||
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
||||
self.original_sorter
|
||||
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
|
||||
match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
|
||||
Some(flattened_obkv) => {
|
||||
document_sorter_buffer.clear();
|
||||
document_sorter_buffer.push(Operation::Addition as u8);
|
||||
document_sorter_buffer.extend_from_slice(&flattened_obkv);
|
||||
self.flattened_sorter
|
||||
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
|
||||
}
|
||||
None => self
|
||||
.flattened_sorter
|
||||
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
|
||||
let flattened_obkv = KvReader::new(&obkv_buffer);
|
||||
if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&obkv),
|
||||
DelAddOperation::Addition,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?
|
||||
}
|
||||
self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||
}
|
||||
documents_count += 1;
|
||||
|
||||
@ -338,6 +361,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
/// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db,
|
||||
/// it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids.
|
||||
/// - If the document to remove was not present in either the db or the transform we do nothing.
|
||||
#[logging_timer::time]
|
||||
pub fn remove_documents<FA>(
|
||||
&mut self,
|
||||
mut to_remove: Vec<String>,
|
||||
@ -347,54 +371,176 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
where
|
||||
FA: Fn() -> bool + Sync,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
// there may be duplicates in the documents to remove.
|
||||
to_remove.sort_unstable();
|
||||
to_remove.dedup();
|
||||
|
||||
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||
let external_documents_ids = self.index.external_documents_ids();
|
||||
|
||||
let mut documents_deleted = 0;
|
||||
let mut document_sorter_value_buffer = Vec::new();
|
||||
let mut document_sorter_key_buffer = Vec::new();
|
||||
for to_remove in to_remove {
|
||||
if should_abort() {
|
||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||
}
|
||||
|
||||
match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
|
||||
// if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
|
||||
Entry::Occupied(entry) => {
|
||||
let doc_id = *entry.get() as u32;
|
||||
self.original_sorter
|
||||
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
|
||||
self.flattened_sorter
|
||||
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
|
||||
// Check if the document has been added in the current indexing process.
|
||||
let deleted_from_current =
|
||||
match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
|
||||
// if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
|
||||
HEntry::Occupied(entry) => {
|
||||
let docid = *entry.get() as u32;
|
||||
// Key is the concatenation of the internal docid and the external one.
|
||||
document_sorter_key_buffer.clear();
|
||||
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes());
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Deletion as u8);
|
||||
obkv::KvWriterU16::new(&mut document_sorter_value_buffer).finish().unwrap();
|
||||
self.original_sorter
|
||||
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
self.flattened_sorter
|
||||
.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||
|
||||
// we must NOT update the list of replaced_documents_ids
|
||||
// Either:
|
||||
// 1. It's already in it and there is nothing to do
|
||||
// 2. It wasn't in it because the document was created by a previous batch and since
|
||||
// we're removing it there is nothing to do.
|
||||
self.new_documents_ids.remove(doc_id);
|
||||
entry.remove_entry();
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
// If the document was already in the db we mark it as a `to_delete` document.
|
||||
// It'll be deleted later. We don't need to push anything to the sorters.
|
||||
if let Some(docid) = external_documents_ids.get(entry.key()) {
|
||||
self.replaced_documents_ids.insert(docid);
|
||||
} else {
|
||||
// if the document is nowehere to be found, there is nothing to do and we must NOT
|
||||
// increment the count of documents_deleted
|
||||
continue;
|
||||
// we must NOT update the list of replaced_documents_ids
|
||||
// Either:
|
||||
// 1. It's already in it and there is nothing to do
|
||||
// 2. It wasn't in it because the document was created by a previous batch and since
|
||||
// we're removing it there is nothing to do.
|
||||
self.new_documents_ids.remove(docid);
|
||||
entry.remove_entry();
|
||||
true
|
||||
}
|
||||
HEntry::Vacant(_) => false,
|
||||
};
|
||||
|
||||
// If the document was already in the db we mark it as a `to_delete` document.
|
||||
// Then we push the document in sorters in deletion mode.
|
||||
let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? {
|
||||
Some(docid) => {
|
||||
self.remove_document_from_db(
|
||||
docid,
|
||||
to_remove,
|
||||
wtxn,
|
||||
&mut document_sorter_key_buffer,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
true
|
||||
}
|
||||
None => false,
|
||||
};
|
||||
|
||||
// increase counter only if the document existed somewhere before.
|
||||
if deleted_from_current || deleted_from_db {
|
||||
documents_deleted += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(documents_deleted)
|
||||
}
|
||||
|
||||
/// Removes documents from db using their internal document ids.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This function is dangerous and will only work correctly if:
|
||||
///
|
||||
/// - All the passed ids currently exist in the database
|
||||
/// - No batching using the standards `remove_documents` and `add_documents` took place
|
||||
///
|
||||
/// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
|
||||
#[logging_timer::time]
|
||||
pub fn remove_documents_from_db_no_batch<FA>(
|
||||
&mut self,
|
||||
to_remove: &RoaringBitmap,
|
||||
wtxn: &mut heed::RwTxn,
|
||||
should_abort: FA,
|
||||
) -> Result<usize>
|
||||
where
|
||||
FA: Fn() -> bool + Sync,
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
let mut documents_deleted = 0;
|
||||
let mut document_sorter_value_buffer = Vec::new();
|
||||
let mut document_sorter_key_buffer = Vec::new();
|
||||
let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?;
|
||||
|
||||
for (internal_docid, external_docid) in to_remove.iter().zip(external_ids) {
|
||||
let external_docid = external_docid?;
|
||||
if should_abort() {
|
||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||
}
|
||||
self.remove_document_from_db(
|
||||
internal_docid,
|
||||
external_docid,
|
||||
wtxn,
|
||||
&mut document_sorter_key_buffer,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
|
||||
documents_deleted += 1;
|
||||
}
|
||||
|
||||
Ok(documents_deleted)
|
||||
}
|
||||
|
||||
fn remove_document_from_db(
|
||||
&mut self,
|
||||
internal_docid: u32,
|
||||
external_docid: String,
|
||||
txn: &heed::RoTxn,
|
||||
document_sorter_key_buffer: &mut Vec<u8>,
|
||||
document_sorter_value_buffer: &mut Vec<u8>,
|
||||
) -> Result<()> {
|
||||
self.replaced_documents_ids.insert(internal_docid);
|
||||
|
||||
// fetch the obkv document
|
||||
let original_key = BEU32::new(internal_docid);
|
||||
let base_obkv = self
|
||||
.index
|
||||
.documents
|
||||
.remap_data_type::<heed::types::ByteSlice>()
|
||||
.get(txn, &original_key)?
|
||||
.ok_or(InternalError::DatabaseMissingEntry {
|
||||
db_name: db_name::DOCUMENTS,
|
||||
key: None,
|
||||
})?;
|
||||
|
||||
// Key is the concatenation of the internal docid and the external one.
|
||||
document_sorter_key_buffer.clear();
|
||||
document_sorter_key_buffer.extend_from_slice(&internal_docid.to_be_bytes());
|
||||
document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes());
|
||||
// push it as to delete in the original_sorter
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Deletion as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(base_obkv),
|
||||
DelAddOperation::Deletion,
|
||||
document_sorter_value_buffer,
|
||||
)?;
|
||||
self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
|
||||
// flatten it and push it as to delete in the flattened_sorter
|
||||
let flattened_obkv = KvReader::new(base_obkv);
|
||||
if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
|
||||
// we recreate our buffer with the flattened documents
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Deletion as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&obkv),
|
||||
DelAddOperation::Deletion,
|
||||
document_sorter_value_buffer,
|
||||
)?;
|
||||
}
|
||||
self.flattened_sorter
|
||||
.insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Flatten a document from the fields ids map contained in self and insert the new
|
||||
// created fields. Returns `None` if the document doesn't need to be flattened.
|
||||
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
|
||||
@ -514,42 +660,10 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_deleted_documents_from_field_distribution(
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
field_distribution: &mut FieldDistribution,
|
||||
) -> Result<()> {
|
||||
for deleted_docid in self.replaced_documents_ids.iter() {
|
||||
let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
|
||||
)?;
|
||||
|
||||
for (key, _) in obkv.iter() {
|
||||
let name =
|
||||
self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||
field_id: key,
|
||||
process: "Computing field distribution in transform.",
|
||||
})?;
|
||||
// We checked that the document was in the db earlier. If we can't find it it means
|
||||
// there is an inconsistency between the field distribution and the field id map.
|
||||
let field =
|
||||
field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||
field_id: key,
|
||||
process: "Accessing field distribution in transform.",
|
||||
})?;
|
||||
*field -= 1;
|
||||
if *field == 0 {
|
||||
// since we were able to get the field right before it's safe to unwrap here
|
||||
field_distribution.remove(name).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generate the `TransformOutput` based on the given sorter that can be generated from any
|
||||
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
|
||||
/// id for the user side and the value must be an obkv where keys are valid fields ids.
|
||||
#[logging_timer::time]
|
||||
pub(crate) fn output_from_sorter<F>(
|
||||
self,
|
||||
wtxn: &mut heed::RwTxn,
|
||||
@ -581,17 +695,13 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
// 2. Add all the new documents to the field distribution
|
||||
let mut field_distribution = self.index.field_distribution(wtxn)?;
|
||||
|
||||
self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?;
|
||||
|
||||
// Here we are going to do the document count + field distribution + `write_into_stream_writer`
|
||||
let mut iter = self.original_sorter.into_stream_merger_iter()?;
|
||||
// used only for the callback
|
||||
let mut documents_count = 0;
|
||||
|
||||
while let Some((key, val)) = iter.next()? {
|
||||
if val[0] == Operation::Deletion as u8 {
|
||||
continue;
|
||||
}
|
||||
// skip first byte corresponding to the operation type (Deletion or Addition).
|
||||
let val = &val[1..];
|
||||
|
||||
// send a callback to show at which step we are
|
||||
@ -601,16 +711,51 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
total_documents: self.documents_count,
|
||||
});
|
||||
|
||||
// We increment all the field of the current document in the field distribution.
|
||||
let obkv = KvReader::new(val);
|
||||
|
||||
for (key, _) in obkv.iter() {
|
||||
let name =
|
||||
self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||
field_id: key,
|
||||
process: "Computing field distribution in transform.",
|
||||
})?;
|
||||
*field_distribution.entry(name.to_string()).or_insert(0) += 1;
|
||||
for (key, value) in KvReader::new(val) {
|
||||
let reader = KvReaderDelAdd::new(value);
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
(None, Some(_)) => {
|
||||
// New field
|
||||
let name = self.fields_ids_map.name(key).ok_or(
|
||||
FieldIdMapMissingEntry::FieldId {
|
||||
field_id: key,
|
||||
process: "Computing field distribution in transform.",
|
||||
},
|
||||
)?;
|
||||
*field_distribution.entry(name.to_string()).or_insert(0) += 1;
|
||||
}
|
||||
(Some(_), None) => {
|
||||
// Field removed
|
||||
let name = self.fields_ids_map.name(key).ok_or(
|
||||
FieldIdMapMissingEntry::FieldId {
|
||||
field_id: key,
|
||||
process: "Computing field distribution in transform.",
|
||||
},
|
||||
)?;
|
||||
match field_distribution.entry(name.to_string()) {
|
||||
BEntry::Vacant(_) => { /* Bug? trying to remove a non-existing field */
|
||||
}
|
||||
BEntry::Occupied(mut entry) => {
|
||||
// attempt to remove one
|
||||
match entry.get_mut().checked_sub(1) {
|
||||
Some(0) => {
|
||||
entry.remove();
|
||||
}
|
||||
Some(new_val) => {
|
||||
*entry.get_mut() = new_val;
|
||||
}
|
||||
None => {
|
||||
unreachable!("Attempting to remove a field that wasn't in the field distribution")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
(Some(_), Some(_)) => {
|
||||
// Value change, no field distribution change
|
||||
}
|
||||
}
|
||||
}
|
||||
writer.insert(key, val)?;
|
||||
}
|
||||
@ -631,9 +776,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
// We get rids of the `Operation` byte and skip the deleted documents as well.
|
||||
let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
|
||||
while let Some((key, val)) = iter.next()? {
|
||||
if val[0] == Operation::Deletion as u8 {
|
||||
continue;
|
||||
}
|
||||
// skip first byte corresponding to the operation type (Deletion or Addition).
|
||||
let val = &val[1..];
|
||||
writer.insert(key, val)?;
|
||||
}
|
||||
@ -649,15 +792,11 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| {
|
||||
fst_new_external_documents_ids_builder.insert(key, value)
|
||||
})?;
|
||||
let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map();
|
||||
|
||||
Ok(TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map: self.fields_ids_map,
|
||||
field_distribution,
|
||||
new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(),
|
||||
new_documents_ids: self.new_documents_ids,
|
||||
replaced_documents_ids: self.replaced_documents_ids,
|
||||
documents_count: self.documents_count,
|
||||
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
|
||||
flattened_documents: flattened_documents
|
||||
@ -687,37 +826,41 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
.to_string();
|
||||
let field_distribution = self.index.field_distribution(wtxn)?;
|
||||
|
||||
// Delete the soft deleted document ids from the maps inside the external_document_ids structure
|
||||
let new_external_documents_ids = {
|
||||
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
|
||||
// This call should be free and can't fail since the previous method merged both fsts.
|
||||
external_documents_ids.into_static().to_fst()?.into_owned()
|
||||
};
|
||||
|
||||
let documents_ids = self.index.documents_ids(wtxn)?;
|
||||
let documents_count = documents_ids.len() as usize;
|
||||
|
||||
// We create a final writer to write the new documents in order from the sorter.
|
||||
let mut original_writer = create_writer(
|
||||
// We initialize the sorter with the user indexing settings.
|
||||
let mut original_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
self.indexer_settings.chunk_compression_type,
|
||||
self.indexer_settings.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
self.indexer_settings.max_nb_chunks,
|
||||
self.indexer_settings.max_memory.map(|mem| mem / 2),
|
||||
);
|
||||
|
||||
// We create a final writer to write the new documents in order from the sorter.
|
||||
let mut flattened_writer = create_writer(
|
||||
// We initialize the sorter with the user indexing settings.
|
||||
let mut flattened_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
self.indexer_settings.chunk_compression_type,
|
||||
self.indexer_settings.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
self.indexer_settings.max_nb_chunks,
|
||||
self.indexer_settings.max_memory.map(|mem| mem / 2),
|
||||
);
|
||||
|
||||
let mut obkv_buffer = Vec::new();
|
||||
for result in self.index.all_documents(wtxn)? {
|
||||
let (docid, obkv) = result?;
|
||||
let mut document_sorter_key_buffer = Vec::new();
|
||||
let mut document_sorter_value_buffer = Vec::new();
|
||||
for result in self.index.external_documents_ids().iter(wtxn)? {
|
||||
let (external_id, docid) = result?;
|
||||
let obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
|
||||
)?;
|
||||
let docid = docid.get();
|
||||
|
||||
obkv_buffer.clear();
|
||||
let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
|
||||
let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer);
|
||||
|
||||
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
|
||||
for (id, name) in new_fields_ids_map.iter() {
|
||||
@ -727,7 +870,17 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
}
|
||||
|
||||
let buffer = obkv_writer.into_inner()?;
|
||||
original_writer.insert(docid.to_be_bytes(), &buffer)?;
|
||||
|
||||
document_sorter_key_buffer.clear();
|
||||
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
|
||||
document_sorter_value_buffer.clear();
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
|
||||
// Once we have the document. We're going to flatten it
|
||||
// and insert it in the flattened sorter.
|
||||
@ -762,29 +915,34 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(fid, &value)?;
|
||||
}
|
||||
flattened_writer.insert(docid.to_be_bytes(), &buffer)?;
|
||||
document_sorter_value_buffer.clear();
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||
}
|
||||
|
||||
// Once we have written all the documents, we extract
|
||||
// the file and reset the seek to be able to read it again.
|
||||
let mut original_documents = original_writer.into_inner()?;
|
||||
original_documents.rewind()?;
|
||||
let grenad_params = GrenadParameters {
|
||||
chunk_compression_type: self.indexer_settings.chunk_compression_type,
|
||||
chunk_compression_level: self.indexer_settings.chunk_compression_level,
|
||||
max_memory: self.indexer_settings.max_memory,
|
||||
max_nb_chunks: self.indexer_settings.max_nb_chunks, // default value, may be chosen.
|
||||
};
|
||||
|
||||
let mut flattened_documents = flattened_writer.into_inner()?;
|
||||
flattened_documents.rewind()?;
|
||||
// Once we have written all the documents, we merge everything into a Reader.
|
||||
let original_documents = sorter_into_reader(original_sorter, grenad_params)?;
|
||||
|
||||
let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?;
|
||||
|
||||
let output = TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map: new_fields_ids_map,
|
||||
field_distribution,
|
||||
new_external_documents_ids,
|
||||
new_documents_ids: documents_ids,
|
||||
replaced_documents_ids: RoaringBitmap::default(),
|
||||
documents_count,
|
||||
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
|
||||
flattened_documents: flattened_documents
|
||||
.into_inner()
|
||||
.map_err(|err| err.into_error())?,
|
||||
original_documents: original_documents.into_inner().into_inner(),
|
||||
flattened_documents: flattened_documents.into_inner().into_inner(),
|
||||
};
|
||||
|
||||
let new_facets = output.compute_real_facets(wtxn, self.index)?;
|
||||
@ -828,38 +986,111 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn merge_obkvs() {
|
||||
let mut doc_0 = Vec::new();
|
||||
let mut kv_writer = KvWriter::new(&mut doc_0);
|
||||
let mut additive_doc_0 = Vec::new();
|
||||
let mut deletive_doc_0 = Vec::new();
|
||||
let mut del_add_doc_0 = Vec::new();
|
||||
let mut kv_writer = KvWriter::memory();
|
||||
kv_writer.insert(0_u8, [0]).unwrap();
|
||||
kv_writer.finish().unwrap();
|
||||
doc_0.insert(0, Operation::Addition as u8);
|
||||
|
||||
let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap();
|
||||
assert_eq!(*ret, doc_0);
|
||||
|
||||
let ret = merge_obkvs_and_operations(
|
||||
&[],
|
||||
&[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())],
|
||||
let buffer = kv_writer.into_inner().unwrap();
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut additive_doc_0,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, doc_0);
|
||||
|
||||
let ret = merge_obkvs_and_operations(
|
||||
&[],
|
||||
&[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())],
|
||||
additive_doc_0.insert(0, Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
DelAddOperation::Deletion,
|
||||
&mut deletive_doc_0,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, [Operation::Deletion as u8]);
|
||||
deletive_doc_0.insert(0, Operation::Deletion as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
DelAddOperation::DeletionAndAddition,
|
||||
&mut del_add_doc_0,
|
||||
)
|
||||
.unwrap();
|
||||
del_add_doc_0.insert(0, Operation::Addition as u8);
|
||||
|
||||
let ret = merge_obkvs_and_operations(
|
||||
let mut additive_doc_1 = Vec::new();
|
||||
let mut kv_writer = KvWriter::memory();
|
||||
kv_writer.insert(1_u8, [1]).unwrap();
|
||||
let buffer = kv_writer.into_inner().unwrap();
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut additive_doc_1,
|
||||
)
|
||||
.unwrap();
|
||||
additive_doc_1.insert(0, Operation::Addition as u8);
|
||||
|
||||
let mut additive_doc_0_1 = Vec::new();
|
||||
let mut kv_writer = KvWriter::memory();
|
||||
kv_writer.insert(0_u8, [0]).unwrap();
|
||||
kv_writer.insert(1_u8, [1]).unwrap();
|
||||
let buffer = kv_writer.into_inner().unwrap();
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut additive_doc_0_1,
|
||||
)
|
||||
.unwrap();
|
||||
additive_doc_0_1.insert(0, Operation::Addition as u8);
|
||||
|
||||
let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])
|
||||
.unwrap();
|
||||
assert_eq!(*ret, additive_doc_0);
|
||||
|
||||
let ret = obkvs_merge_additions_and_deletions(
|
||||
&[],
|
||||
&[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, del_add_doc_0);
|
||||
|
||||
let ret = obkvs_merge_additions_and_deletions(
|
||||
&[],
|
||||
&[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, deletive_doc_0);
|
||||
|
||||
let ret = obkvs_merge_additions_and_deletions(
|
||||
&[],
|
||||
&[
|
||||
Cow::from([Operation::Addition as u8, 1].as_slice()),
|
||||
Cow::from([Operation::Deletion as u8].as_slice()),
|
||||
Cow::from(doc_0.as_slice()),
|
||||
Cow::from(additive_doc_1.as_slice()),
|
||||
Cow::from(deletive_doc_0.as_slice()),
|
||||
Cow::from(additive_doc_0.as_slice()),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, doc_0);
|
||||
assert_eq!(*ret, del_add_doc_0);
|
||||
|
||||
let ret = obkvs_merge_additions_and_deletions(
|
||||
&[],
|
||||
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, additive_doc_0_1);
|
||||
|
||||
let ret = obkvs_keep_last_addition_merge_deletions(
|
||||
&[],
|
||||
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, additive_doc_0);
|
||||
|
||||
let ret = obkvs_keep_last_addition_merge_deletions(
|
||||
&[],
|
||||
&[
|
||||
Cow::from(deletive_doc_0.as_slice()),
|
||||
Cow::from(additive_doc_1.as_slice()),
|
||||
Cow::from(additive_doc_0.as_slice()),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, del_add_doc_0);
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,4 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
@ -9,32 +8,40 @@ use charabia::{Language, Script};
|
||||
use grenad::MergerBuilder;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::RwTxn;
|
||||
use log::error;
|
||||
use obkv::{KvReader, KvWriter};
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
|
||||
self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values,
|
||||
valid_lmdb_key, CursorClonableMmap,
|
||||
};
|
||||
use super::{ClonableMmap, MergeFn};
|
||||
use crate::distance::NDotProductPoint;
|
||||
use crate::error::UserError;
|
||||
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||
use crate::facet::FacetType;
|
||||
use crate::index::db_name::DOCUMENTS;
|
||||
use crate::index::Hnsw;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
|
||||
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
|
||||
use crate::{
|
||||
lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError, BEU32,
|
||||
};
|
||||
|
||||
pub(crate) enum TypedChunk {
|
||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
||||
Documents(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdWordcountDocids(grenad::Reader<BufReader<File>>),
|
||||
NewDocumentsIds(RoaringBitmap),
|
||||
FieldIdWordCountDocids(grenad::Reader<BufReader<File>>),
|
||||
WordDocids {
|
||||
word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
exact_word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
word_fid_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
},
|
||||
WordPositionDocids(grenad::Reader<BufReader<File>>),
|
||||
WordFidDocids(grenad::Reader<BufReader<File>>),
|
||||
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
|
||||
@ -43,7 +50,7 @@ pub(crate) enum TypedChunk {
|
||||
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
||||
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||
VectorPoints(grenad::Reader<BufReader<File>>),
|
||||
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
||||
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
||||
}
|
||||
|
||||
impl TypedChunk {
|
||||
@ -58,23 +65,22 @@ impl TypedChunk {
|
||||
TypedChunk::Documents(grenad) => {
|
||||
format!("Documents {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::FieldIdWordcountDocids(grenad) => {
|
||||
TypedChunk::FieldIdWordCountDocids(grenad) => {
|
||||
format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::NewDocumentsIds(grenad) => {
|
||||
format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!(
|
||||
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}",
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} => format!(
|
||||
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}",
|
||||
word_docids_reader.len(),
|
||||
exact_word_docids_reader.len()
|
||||
exact_word_docids_reader.len(),
|
||||
word_fid_docids_reader.len()
|
||||
),
|
||||
TypedChunk::WordPositionDocids(grenad) => {
|
||||
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::WordFidDocids(grenad) => {
|
||||
format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(grenad) => {
|
||||
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
@ -99,8 +105,8 @@ impl TypedChunk {
|
||||
TypedChunk::VectorPoints(grenad) => {
|
||||
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::ScriptLanguageDocids(grenad) => {
|
||||
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -119,34 +125,75 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let mut is_merged_database = false;
|
||||
match typed_chunk {
|
||||
TypedChunk::Documents(obkv_documents_iter) => {
|
||||
let mut operations: Vec<DocumentOperation> = Default::default();
|
||||
|
||||
let mut docids = index.documents_ids(wtxn)?;
|
||||
let mut cursor = obkv_documents_iter.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?;
|
||||
while let Some((key, reader)) = cursor.move_on_next()? {
|
||||
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||
let reader: KvReader<FieldId> = KvReader::new(reader);
|
||||
|
||||
let (document_id_bytes, external_id_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?;
|
||||
let docid = DocumentId::from_be_bytes(document_id_bytes);
|
||||
let external_id = std::str::from_utf8(external_id_bytes)?;
|
||||
|
||||
for (field_id, value) in reader.iter() {
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
writer.insert(field_id, addition)?;
|
||||
}
|
||||
}
|
||||
|
||||
let db = index.documents.remap_data_type::<ByteSlice>();
|
||||
|
||||
if !writer.is_empty() {
|
||||
db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?;
|
||||
operations.push(DocumentOperation {
|
||||
external_id: external_id.to_string(),
|
||||
internal_id: docid,
|
||||
kind: DocumentOperationKind::Create,
|
||||
});
|
||||
docids.insert(docid);
|
||||
} else {
|
||||
db.delete(wtxn, &BEU32::new(docid))?;
|
||||
operations.push(DocumentOperation {
|
||||
external_id: external_id.to_string(),
|
||||
internal_id: docid,
|
||||
kind: DocumentOperationKind::Delete,
|
||||
});
|
||||
docids.remove(docid);
|
||||
}
|
||||
}
|
||||
let external_documents_docids = index.external_documents_ids();
|
||||
external_documents_docids.apply(wtxn, operations)?;
|
||||
index.put_documents_ids(wtxn, &docids)?;
|
||||
}
|
||||
TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => {
|
||||
TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
fid_word_count_docids_iter,
|
||||
&index.field_id_word_count_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::NewDocumentsIds(documents_ids) => {
|
||||
return Ok((documents_ids, is_merged_database))
|
||||
}
|
||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} => {
|
||||
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||
append_entries_into_database(
|
||||
word_docids_iter.clone(),
|
||||
&index.word_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_roaring_bitmaps,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||
@ -155,8 +202,18 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
&index.exact_word_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_roaring_bitmaps,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
|
||||
append_entries_into_database(
|
||||
word_fid_docids_iter,
|
||||
&index.word_fid_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
// create fst from word docids
|
||||
@ -177,19 +234,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
&index.word_position_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordFidDocids(word_fid_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
word_fid_docids_iter,
|
||||
&index.word_fid_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
@ -209,8 +255,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
&index.facet_id_exists_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
@ -220,8 +266,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
&index.facet_id_is_null_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
@ -231,8 +277,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
&index.facet_id_is_empty_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
@ -242,8 +288,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
@ -252,8 +298,18 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
index.field_id_docid_facet_f64s.remap_types::<ByteSlice, ByteSlice>();
|
||||
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let reader = KvReaderDelAdd::new(value);
|
||||
if valid_lmdb_key(key) {
|
||||
index_fid_docid_facet_numbers.put(wtxn, key, value)?;
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
(None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?,
|
||||
(Some(_), None) => {
|
||||
index_fid_docid_facet_numbers.delete(wtxn, key)?;
|
||||
}
|
||||
(Some(_), Some(new)) => {
|
||||
index_fid_docid_facet_numbers.put(wtxn, key, new)?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -262,8 +318,18 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
index.field_id_docid_facet_strings.remap_types::<ByteSlice, ByteSlice>();
|
||||
let mut cursor = fid_docid_facet_string.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let reader = KvReaderDelAdd::new(value);
|
||||
if valid_lmdb_key(key) {
|
||||
index_fid_docid_facet_strings.put(wtxn, key, value)?;
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
(None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?,
|
||||
(Some(_), None) => {
|
||||
index_fid_docid_facet_strings.delete(wtxn, key)?;
|
||||
}
|
||||
(Some(_), Some(new)) => {
|
||||
index_fid_docid_facet_strings.put(wtxn, key, new)?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -276,57 +342,86 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
// convert the latitude and longitude back to a f64 (8 bytes)
|
||||
let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
|
||||
let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
|
||||
let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
|
||||
let xyz_point = lat_lng_to_xyz(&point);
|
||||
|
||||
rtree.insert(GeoPoint::new(xyz_point, (docid, point)));
|
||||
geo_faceted_docids.insert(docid);
|
||||
let deladd_obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(value) = deladd_obkv.get(DelAdd::Deletion) {
|
||||
let geopoint = extract_geo_point(value, docid);
|
||||
rtree.remove(&geopoint);
|
||||
geo_faceted_docids.remove(docid);
|
||||
}
|
||||
if let Some(value) = deladd_obkv.get(DelAdd::Addition) {
|
||||
let geopoint = extract_geo_point(value, docid);
|
||||
rtree.insert(geopoint);
|
||||
geo_faceted_docids.insert(docid);
|
||||
}
|
||||
}
|
||||
index.put_geo_rtree(wtxn, &rtree)?;
|
||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||
}
|
||||
TypedChunk::VectorPoints(vector_points) => {
|
||||
let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? {
|
||||
Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(),
|
||||
None => Default::default(),
|
||||
};
|
||||
|
||||
// Convert the PointIds into DocumentIds
|
||||
let mut docids = Vec::new();
|
||||
for pid in pids {
|
||||
let docid =
|
||||
index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap();
|
||||
docids.push(docid.get());
|
||||
let mut vectors_set = HashSet::new();
|
||||
// We extract and store the previous vectors
|
||||
if let Some(hnsw) = index.vector_hnsw(wtxn)? {
|
||||
for (pid, point) in hnsw.iter() {
|
||||
let pid_key = BEU32::new(pid.into_inner());
|
||||
let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap().get();
|
||||
let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect();
|
||||
vectors_set.insert((docid, vector));
|
||||
}
|
||||
}
|
||||
|
||||
let mut expected_dimensions = points.get(0).map(|p| p.len());
|
||||
let mut cursor = vector_points.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let (left, _index) = try_split_array_at(key).unwrap();
|
||||
let docid = DocumentId::from_be_bytes(left);
|
||||
// convert the vector back to a Vec<f32>
|
||||
let vector: Vec<f32> = pod_collect_to_vec(value);
|
||||
|
||||
// TODO Inform the user about the document that has a wrong `_vectors`
|
||||
let found = vector.len();
|
||||
let expected = *expected_dimensions.get_or_insert(found);
|
||||
if expected != found {
|
||||
return Err(UserError::InvalidVectorDimensions { expected, found })?;
|
||||
let vector_deladd_obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
|
||||
// convert the vector back to a Vec<f32>
|
||||
let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||
let key = (docid, vector);
|
||||
if !vectors_set.remove(&key) {
|
||||
error!("Unable to delete the vector: {:?}", key.1);
|
||||
}
|
||||
}
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
||||
// convert the vector back to a Vec<f32>
|
||||
let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||
vectors_set.insert((docid, vector));
|
||||
}
|
||||
|
||||
points.push(NDotProductPoint::new(vector));
|
||||
docids.push(docid);
|
||||
}
|
||||
|
||||
assert_eq!(docids.len(), points.len());
|
||||
// Extract the most common vector dimension
|
||||
let expected_dimension_size = {
|
||||
let mut dims = HashMap::new();
|
||||
vectors_set.iter().for_each(|(_, v)| *dims.entry(v.len()).or_insert(0) += 1);
|
||||
dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len)
|
||||
};
|
||||
|
||||
// Ensure that the vector lengths are correct and
|
||||
// prepare the vectors before inserting them in the HNSW.
|
||||
let mut points = Vec::new();
|
||||
let mut docids = Vec::new();
|
||||
for (docid, vector) in vectors_set {
|
||||
if expected_dimension_size.map_or(false, |expected| expected != vector.len()) {
|
||||
return Err(UserError::InvalidVectorDimensions {
|
||||
expected: expected_dimension_size.unwrap_or(vector.len()),
|
||||
found: vector.len(),
|
||||
}
|
||||
.into());
|
||||
} else {
|
||||
let vector = vector.into_iter().map(OrderedFloat::into_inner).collect();
|
||||
points.push(NDotProductPoint::new(vector));
|
||||
docids.push(docid);
|
||||
}
|
||||
}
|
||||
|
||||
let hnsw_length = points.len();
|
||||
let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
|
||||
|
||||
assert_eq!(docids.len(), pids.len());
|
||||
|
||||
// Store the vectors in the point-docid relation database
|
||||
index.vector_id_docid.clear(wtxn)?;
|
||||
for (docid, pid) in docids.into_iter().zip(pids) {
|
||||
index.vector_id_docid.put(
|
||||
@ -339,22 +434,25 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
|
||||
index.put_vector_hnsw(wtxn, &new_hnsw)?;
|
||||
}
|
||||
TypedChunk::ScriptLanguageDocids(hash_pair) => {
|
||||
let mut buffer = Vec::new();
|
||||
for (key, value) in hash_pair {
|
||||
buffer.clear();
|
||||
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||
for (key, (deletion, addition)) in sl_map {
|
||||
let mut db_key_exists = false;
|
||||
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
||||
Some(db_values) => {
|
||||
let mut db_value_buffer = Vec::new();
|
||||
serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
|
||||
let mut new_value_buffer = Vec::new();
|
||||
serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
|
||||
merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
|
||||
RoaringBitmap::deserialize_from(&buffer[..])?
|
||||
db_key_exists = true;
|
||||
(db_values - deletion) | addition
|
||||
}
|
||||
None => value,
|
||||
None => addition,
|
||||
};
|
||||
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
||||
|
||||
if final_value.is_empty() {
|
||||
// If the database entry exists, delete it.
|
||||
if db_key_exists {
|
||||
index.script_language_docids.delete(wtxn, &key)?;
|
||||
}
|
||||
} else {
|
||||
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -362,6 +460,15 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
Ok((RoaringBitmap::new(), is_merged_database))
|
||||
}
|
||||
|
||||
/// Converts the latitude and longitude back to an xyz GeoPoint.
|
||||
fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
|
||||
let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
|
||||
let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
|
||||
let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
|
||||
let xyz_point = lat_lng_to_xyz(&point);
|
||||
GeoPoint::new(xyz_point, (docid, point))
|
||||
}
|
||||
|
||||
fn merge_word_docids_reader_into_fst(
|
||||
word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
|
||||
exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
|
||||
@ -379,24 +486,6 @@ fn merge_word_docids_reader_into_fst(
|
||||
Ok(builder.into_set())
|
||||
}
|
||||
|
||||
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
|
||||
let new_value = RoaringBitmap::deserialize_from(new_value)?;
|
||||
let db_value = RoaringBitmap::deserialize_from(db_value)?;
|
||||
let value = new_value | db_value;
|
||||
Ok(serialize_roaring_bitmap(&value, buffer)?)
|
||||
}
|
||||
|
||||
fn merge_cbo_roaring_bitmaps(
|
||||
new_value: &[u8],
|
||||
db_value: &[u8],
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<()> {
|
||||
Ok(CboRoaringBitmapCodec::merge_into(
|
||||
&[Cow::Borrowed(db_value), Cow::Borrowed(new_value)],
|
||||
buffer,
|
||||
)?)
|
||||
}
|
||||
|
||||
/// Write provided entries in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
fn write_entries_into_database<R, K, V, FS, FM>(
|
||||
@ -410,7 +499,7 @@ fn write_entries_into_database<R, K, V, FS, FM>(
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
{
|
||||
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
||||
|
||||
@ -422,17 +511,19 @@ where
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = if index_is_empty {
|
||||
serialize_value(value, &mut buffer)?
|
||||
Some(serialize_value(value, &mut buffer)?)
|
||||
} else {
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => {
|
||||
merge_values(value, prev_value, &mut buffer)?;
|
||||
&buffer[..]
|
||||
}
|
||||
None => serialize_value(value, &mut buffer)?,
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
}
|
||||
};
|
||||
database.put(wtxn, key, value)?;
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
None => {
|
||||
database.delete(wtxn, key)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -454,7 +545,8 @@ fn append_entries_into_database<R, K, V, FS, FM>(
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
K: for<'a> heed::BytesDecode<'a>,
|
||||
{
|
||||
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
||||
|
||||
@ -475,6 +567,12 @@ where
|
||||
let mut cursor = data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
debug_assert!(
|
||||
K::bytes_decode(key).is_some(),
|
||||
"Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
|
||||
key.len(),
|
||||
&key
|
||||
);
|
||||
buffer.clear();
|
||||
let value = serialize_value(value, &mut buffer)?;
|
||||
unsafe { database.append(key, value)? };
|
||||
|
@ -1,6 +1,5 @@
|
||||
pub use self::available_documents_ids::AvailableDocumentsIds;
|
||||
pub use self::clear_documents::ClearDocuments;
|
||||
pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDeletionResult};
|
||||
pub use self::facet::bulk::FacetsUpdateBulk;
|
||||
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
|
||||
pub use self::index_documents::{
|
||||
@ -9,10 +8,6 @@ pub use self::index_documents::{
|
||||
MergeFn,
|
||||
};
|
||||
pub use self::indexer_config::IndexerConfig;
|
||||
pub use self::prefix_word_pairs::{
|
||||
PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
|
||||
MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
|
||||
};
|
||||
pub use self::settings::{Setting, Settings};
|
||||
pub use self::update_step::UpdateIndexingStep;
|
||||
pub use self::word_prefix_docids::WordPrefixDocids;
|
||||
@ -21,11 +16,10 @@ pub use self::words_prefixes_fst::WordsPrefixesFst;
|
||||
|
||||
mod available_documents_ids;
|
||||
mod clear_documents;
|
||||
mod delete_documents;
|
||||
pub(crate) mod del_add;
|
||||
pub(crate) mod facet;
|
||||
mod index_documents;
|
||||
mod indexer_config;
|
||||
mod prefix_word_pairs;
|
||||
mod settings;
|
||||
mod update_step;
|
||||
mod word_prefix_docids;
|
||||
|
@ -1,579 +0,0 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashSet;
|
||||
use std::io::{BufReader, BufWriter};
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
|
||||
use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
|
||||
use crate::{Index, Result};
|
||||
|
||||
mod prefix_word;
|
||||
mod word_prefix;
|
||||
|
||||
pub use prefix_word::index_prefix_word_database;
|
||||
pub use word_prefix::index_word_prefix_database;
|
||||
|
||||
pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4;
|
||||
pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2;
|
||||
|
||||
pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
max_proximity: u8,
|
||||
max_prefix_length: usize,
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
}
|
||||
impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
) -> Self {
|
||||
Self {
|
||||
wtxn,
|
||||
index,
|
||||
max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
|
||||
max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
|
||||
pub fn execute<'a>(
|
||||
self,
|
||||
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
||||
new_prefix_fst_words: &'a [String],
|
||||
common_prefix_fst_words: &[&'a [String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
index_word_prefix_database(
|
||||
self.wtxn,
|
||||
self.index.word_pair_proximity_docids,
|
||||
self.index.word_prefix_pair_proximity_docids,
|
||||
self.max_proximity,
|
||||
self.max_prefix_length,
|
||||
new_word_pair_proximity_docids.clone(),
|
||||
new_prefix_fst_words,
|
||||
common_prefix_fst_words,
|
||||
del_prefix_fst_words,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
)?;
|
||||
|
||||
index_prefix_word_database(
|
||||
self.wtxn,
|
||||
self.index.word_pair_proximity_docids,
|
||||
self.index.prefix_word_pair_proximity_docids,
|
||||
self.max_proximity,
|
||||
self.max_prefix_length,
|
||||
new_word_pair_proximity_docids,
|
||||
new_prefix_fst_words,
|
||||
common_prefix_fst_words,
|
||||
del_prefix_fst_words,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// This is adapted from `sorter_into_lmdb_database`
|
||||
pub fn insert_into_database(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
new_key: &[u8],
|
||||
new_value: &[u8],
|
||||
) -> Result<()> {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
|
||||
match iter.next().transpose()? {
|
||||
Some((key, old_val)) if new_key == key => {
|
||||
let val =
|
||||
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
|
||||
.map_err(|_| {
|
||||
// TODO just wrap this error?
|
||||
crate::error::InternalError::IndexingMergingKeys {
|
||||
process: "get-put-merge",
|
||||
}
|
||||
})?;
|
||||
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
|
||||
unsafe { iter.put_current(new_key, &val)? };
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
|
||||
// but it uses `append` if the database is empty, and it assumes that the values in the
|
||||
// writer don't conflict with values in the database.
|
||||
pub fn write_into_lmdb_database_without_merging(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
writer: grenad::Writer<BufWriter<std::fs::File>>,
|
||||
) -> Result<()> {
|
||||
let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
|
||||
let reader = grenad::Reader::new(BufReader::new(file))?;
|
||||
if database.is_empty(wtxn)? {
|
||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
while let Some((k, v)) = cursor.move_on_next()? {
|
||||
// safety: the key comes from the grenad reader, not the database
|
||||
unsafe { out_iter.append(k, v)? };
|
||||
}
|
||||
} else {
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
while let Some((k, v)) = cursor.move_on_next()? {
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io::Cursor;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::db_snap;
|
||||
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::update::{DeleteDocuments, DeletionStrategy, IndexDocumentsMethod};
|
||||
|
||||
fn documents_with_enough_different_words_for_prefixes(
|
||||
prefixes: &[&str],
|
||||
start_id: usize,
|
||||
) -> Vec<crate::Object> {
|
||||
let mut documents = Vec::new();
|
||||
let mut id = start_id;
|
||||
for prefix in prefixes {
|
||||
for i in 0..50 {
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": id,
|
||||
"text": format!("{prefix}{i:x}"),
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
id += 1;
|
||||
}
|
||||
}
|
||||
documents
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_new_documents() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.words_prefix_threshold = Some(50);
|
||||
index.index_documents_config.autogenerate_docids = true;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_searchable_fields(vec!["text".to_owned()]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let batch_reader_from_documents = |documents| {
|
||||
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||
for object in documents {
|
||||
builder.append_json_object(&object).unwrap();
|
||||
}
|
||||
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
|
||||
};
|
||||
|
||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0);
|
||||
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": "9000",
|
||||
"text": "At an amazing and beautiful house"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": "9001",
|
||||
"text": "The bell rings at 5 am"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let documents = batch_reader_from_documents(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "initial");
|
||||
|
||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100);
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": "9002",
|
||||
"text": "At an extraordinary house"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
let documents = batch_reader_from_documents(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, word_pair_proximity_docids, "update");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "update");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "update");
|
||||
}
|
||||
#[test]
|
||||
fn batch_bug_3043() {
|
||||
// https://github.com/meilisearch/meilisearch/issues/3043
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.words_prefix_threshold = Some(50);
|
||||
index.index_documents_config.autogenerate_docids = true;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_searchable_fields(vec!["text".to_owned()]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let batch_reader_from_documents = |documents| {
|
||||
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||
for object in documents {
|
||||
builder.append_json_object(&object).unwrap();
|
||||
}
|
||||
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
|
||||
};
|
||||
|
||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0);
|
||||
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"text": "x y"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"text": "x a y"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let documents = batch_reader_from_documents(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, word_pair_proximity_docids);
|
||||
db_snap!(index, word_prefix_pair_proximity_docids);
|
||||
db_snap!(index, prefix_word_pair_proximity_docids);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hard_delete_and_reupdate() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.words_prefix_threshold = Some(50);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("id".to_owned());
|
||||
settings.set_searchable_fields(vec!["text".to_owned()]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let batch_reader_from_documents = |documents| {
|
||||
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||
for object in documents {
|
||||
builder.append_json_object(&object).unwrap();
|
||||
}
|
||||
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
|
||||
};
|
||||
|
||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
|
||||
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": 9000,
|
||||
"text": "At an amazing and beautiful house"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": 9001,
|
||||
"text": "The bell rings at 5 am"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let documents = batch_reader_from_documents(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "initial");
|
||||
db_snap!(index, word_docids, "initial");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "initial");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
delete.strategy(DeletionStrategy::AlwaysHard);
|
||||
delete.delete_documents(&RoaringBitmap::from_iter([50]));
|
||||
delete.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "first_delete");
|
||||
db_snap!(index, word_docids, "first_delete");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
delete.strategy(DeletionStrategy::AlwaysHard);
|
||||
delete.delete_documents(&RoaringBitmap::from_iter(0..50));
|
||||
delete.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "second_delete");
|
||||
db_snap!(index, word_docids, "second_delete");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");
|
||||
|
||||
let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
|
||||
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
||||
|
||||
index.add_documents(batch_reader_from_documents(documents)).unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "reupdate");
|
||||
db_snap!(index, word_docids, "reupdate");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn soft_delete_and_reupdate() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.words_prefix_threshold = Some(50);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("id".to_owned());
|
||||
settings.set_searchable_fields(vec!["text".to_owned()]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let batch_reader_from_documents = |documents| {
|
||||
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||
for object in documents {
|
||||
builder.append_json_object(&object).unwrap();
|
||||
}
|
||||
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
|
||||
};
|
||||
|
||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
|
||||
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": 9000,
|
||||
"text": "At an amazing and beautiful house"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": 9001,
|
||||
"text": "The bell rings at 5 am"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let documents = batch_reader_from_documents(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "initial");
|
||||
db_snap!(index, word_docids, "initial");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "initial");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
delete.strategy(DeletionStrategy::AlwaysSoft);
|
||||
delete.delete_documents(&RoaringBitmap::from_iter([50]));
|
||||
delete.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "first_delete");
|
||||
db_snap!(index, word_docids, "first_delete");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
delete.strategy(DeletionStrategy::AlwaysSoft);
|
||||
|
||||
delete.delete_documents(&RoaringBitmap::from_iter(0..50));
|
||||
delete.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "second_delete");
|
||||
db_snap!(index, word_docids, "second_delete");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");
|
||||
|
||||
let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
|
||||
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
||||
|
||||
index.add_documents(batch_reader_from_documents(documents)).unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "reupdate");
|
||||
db_snap!(index, word_docids, "reupdate");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn replace_soft_deletion() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.words_prefix_threshold = Some(50);
|
||||
index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("id".to_owned());
|
||||
settings.set_searchable_fields(vec!["text".to_owned()]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let batch_reader_from_documents = |documents| {
|
||||
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||
for object in documents {
|
||||
builder.append_json_object(&object).unwrap();
|
||||
}
|
||||
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
|
||||
};
|
||||
|
||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
|
||||
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": 9000,
|
||||
"text": "At an amazing house"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": 9001,
|
||||
"text": "The bell rings"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let documents = batch_reader_from_documents(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "initial");
|
||||
db_snap!(index, word_docids, "initial");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "initial");
|
||||
|
||||
let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
|
||||
index.add_documents(batch_reader_from_documents(documents)).unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "replaced");
|
||||
db_snap!(index, word_docids, "replaced");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
|
||||
db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn replace_hard_deletion() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.words_prefix_threshold = Some(50);
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
|
||||
index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("id".to_owned());
|
||||
settings.set_searchable_fields(vec!["text".to_owned()]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let batch_reader_from_documents = |documents| {
|
||||
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||
for object in documents {
|
||||
builder.append_json_object(&object).unwrap();
|
||||
}
|
||||
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
|
||||
};
|
||||
|
||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
|
||||
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": 9000,
|
||||
"text": "At an amazing house"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"id": 9001,
|
||||
"text": "The bell rings"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let documents = batch_reader_from_documents(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "initial");
|
||||
db_snap!(index, word_docids, "initial");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "initial");
|
||||
|
||||
let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
|
||||
index.add_documents(batch_reader_from_documents(documents)).unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, "replaced");
|
||||
db_snap!(index, word_docids, "replaced");
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
|
||||
db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
|
||||
db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]");
|
||||
}
|
||||
}
|
@ -1,182 +0,0 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::BytesDecode;
|
||||
use log::debug;
|
||||
|
||||
use crate::update::index_documents::{create_writer, CursorClonableMmap};
|
||||
use crate::update::prefix_word_pairs::{
|
||||
insert_into_database, write_into_lmdb_database_without_merging,
|
||||
};
|
||||
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[logging_timer::time]
|
||||
pub fn index_prefix_word_database(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
max_proximity: u8,
|
||||
max_prefix_length: usize,
|
||||
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_proximity = max_proximity - 1;
|
||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||
|
||||
let common_prefixes: Vec<_> = common_prefix_fst_words
|
||||
.iter()
|
||||
.flat_map(|s| s.iter())
|
||||
.map(|s| s.as_str())
|
||||
.filter(|s| s.len() <= max_prefix_length)
|
||||
.collect();
|
||||
|
||||
for proximity in 1..max_proximity {
|
||||
for prefix in common_prefixes.iter() {
|
||||
let mut prefix_key = vec![proximity];
|
||||
prefix_key.extend_from_slice(prefix.as_bytes());
|
||||
let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
|
||||
// This is the core of the algorithm
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
proximity,
|
||||
prefix.as_bytes(),
|
||||
// the next two arguments tell how to iterate over the new word pairs
|
||||
&mut cursor,
|
||||
|cursor| {
|
||||
if let Some((key, value)) = cursor.next()? {
|
||||
let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
|
||||
.ok_or(heed::Error::Decoding)?;
|
||||
Ok(Some((word2, value)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
},
|
||||
// and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
|
||||
|key, value| {
|
||||
insert_into_database(
|
||||
wtxn,
|
||||
*prefix_word_pair_proximity_docids.as_polymorph(),
|
||||
key,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Now we do the same thing with the new prefixes and all word pairs in the DB
|
||||
let new_prefixes: Vec<_> = new_prefix_fst_words
|
||||
.iter()
|
||||
.map(|s| s.as_str())
|
||||
.filter(|s| s.len() <= max_prefix_length)
|
||||
.collect();
|
||||
|
||||
// Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
|
||||
// element in an intermediary grenad
|
||||
let mut writer =
|
||||
create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
|
||||
|
||||
for proximity in 1..max_proximity {
|
||||
for prefix in new_prefixes.iter() {
|
||||
let mut prefix_key = vec![proximity];
|
||||
prefix_key.extend_from_slice(prefix.as_bytes());
|
||||
let mut db_iter = word_pair_proximity_docids
|
||||
.as_polymorph()
|
||||
.prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
|
||||
.remap_key_type::<UncheckedU8StrStrCodec>();
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
proximity,
|
||||
prefix.as_bytes(),
|
||||
&mut db_iter,
|
||||
|db_iter| {
|
||||
db_iter
|
||||
.next()
|
||||
.transpose()
|
||||
.map(|x| x.map(|((_, _, word2), value)| (word2, value)))
|
||||
.map_err(|e| e.into())
|
||||
},
|
||||
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
||||
)?;
|
||||
drop(db_iter);
|
||||
}
|
||||
}
|
||||
|
||||
// and then we write the grenad into the DB
|
||||
// Since the grenad contains only new prefixes, we know in advance that none
|
||||
// of its elements already exist in the DB, thus there is no need to specify
|
||||
// how to merge conflicting elements
|
||||
write_into_lmdb_database_without_merging(
|
||||
wtxn,
|
||||
*prefix_word_pair_proximity_docids.as_polymorph(),
|
||||
writer,
|
||||
)?;
|
||||
|
||||
// All of the word prefix pairs in the database that have a w2
|
||||
// that is contained in the `suppr_pw` set must be removed as well.
|
||||
if !del_prefix_fst_words.is_empty() {
|
||||
let mut iter =
|
||||
prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
|
||||
while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
|
||||
if del_prefix_fst_words.contains(prefix.as_bytes()) {
|
||||
// Delete this entry as the w2 prefix is no more in the words prefix fst.
|
||||
unsafe { iter.del_current()? };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
|
||||
///
|
||||
/// Its arguments are:
|
||||
/// - an iterator over the words following the given `prefix` with the given `proximity`
|
||||
/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements
|
||||
fn execute_on_word_pairs_and_prefixes<I>(
|
||||
proximity: u8,
|
||||
prefix: &[u8],
|
||||
iter: &mut I,
|
||||
mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>,
|
||||
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default();
|
||||
|
||||
// Memory usage check:
|
||||
// The content of the loop will be called for each `word2` that follows a word beginning
|
||||
// with `prefix` with the given proximity.
|
||||
// In practice, I don't think the batch can ever get too big.
|
||||
while let Some((word2, docids)) = next_word2_and_docids(iter)? {
|
||||
let entry = batch.entry(word2.to_owned()).or_default();
|
||||
entry.push(Cow::Owned(docids.to_owned()));
|
||||
}
|
||||
|
||||
let mut key_buffer = Vec::with_capacity(512);
|
||||
key_buffer.push(proximity);
|
||||
key_buffer.extend_from_slice(prefix);
|
||||
key_buffer.push(0);
|
||||
|
||||
let mut value_buffer = Vec::with_capacity(65_536);
|
||||
|
||||
for (word2, docids) in batch {
|
||||
key_buffer.truncate(prefix.len() + 2);
|
||||
value_buffer.clear();
|
||||
|
||||
key_buffer.extend_from_slice(&word2);
|
||||
let data = if docids.len() > 1 {
|
||||
CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?;
|
||||
value_buffer.as_slice()
|
||||
} else {
|
||||
&docids[0]
|
||||
};
|
||||
insert(key_buffer.as_slice(), data)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
@ -1,20 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 a 5 [101, ]
|
||||
1 a amazing [100, ]
|
||||
1 a an [100, ]
|
||||
1 a and [100, ]
|
||||
1 a beautiful [100, ]
|
||||
1 b house [100, ]
|
||||
1 b rings [101, ]
|
||||
1 be house [100, ]
|
||||
1 be rings [101, ]
|
||||
2 a am [101, ]
|
||||
2 a amazing [100, ]
|
||||
2 a and [100, ]
|
||||
2 a beautiful [100, ]
|
||||
2 a house [100, ]
|
||||
2 b at [101, ]
|
||||
2 be at [101, ]
|
||||
|
@ -1,23 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 5 a [101, ]
|
||||
1 amazing a [100, ]
|
||||
1 an a [100, ]
|
||||
1 and b [100, ]
|
||||
1 and be [100, ]
|
||||
1 at a [100, ]
|
||||
1 rings a [101, ]
|
||||
1 the b [101, ]
|
||||
1 the be [101, ]
|
||||
2 amazing b [100, ]
|
||||
2 amazing be [100, ]
|
||||
2 an a [100, ]
|
||||
2 at a [100, 101, ]
|
||||
2 bell a [101, ]
|
||||
3 an b [100, ]
|
||||
3 an be [100, ]
|
||||
3 at a [100, ]
|
||||
3 rings a [101, ]
|
||||
3 the a [101, ]
|
||||
|
@ -1,29 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 a 5 [101, ]
|
||||
1 a amazing [100, ]
|
||||
1 a an [100, 202, ]
|
||||
1 a and [100, ]
|
||||
1 a beautiful [100, ]
|
||||
1 a extraordinary [202, ]
|
||||
1 am and [100, ]
|
||||
1 an amazing [100, ]
|
||||
1 an beautiful [100, ]
|
||||
1 an extraordinary [202, ]
|
||||
1 b house [100, ]
|
||||
1 b rings [101, ]
|
||||
1 be house [100, ]
|
||||
1 be rings [101, ]
|
||||
2 a am [101, ]
|
||||
2 a amazing [100, ]
|
||||
2 a and [100, ]
|
||||
2 a beautiful [100, ]
|
||||
2 a extraordinary [202, ]
|
||||
2 a house [100, 202, ]
|
||||
2 am beautiful [100, ]
|
||||
2 an and [100, ]
|
||||
2 an house [100, 202, ]
|
||||
2 b at [101, ]
|
||||
2 be at [101, ]
|
||||
|
@ -1,33 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 5 am [101, ]
|
||||
1 amazing and [100, ]
|
||||
1 an amazing [100, ]
|
||||
1 an extraordinary [202, ]
|
||||
1 and beautiful [100, ]
|
||||
1 at 5 [101, ]
|
||||
1 at an [100, 202, ]
|
||||
1 beautiful house [100, ]
|
||||
1 bell rings [101, ]
|
||||
1 extraordinary house [202, ]
|
||||
1 rings at [101, ]
|
||||
1 the bell [101, ]
|
||||
2 amazing beautiful [100, ]
|
||||
2 an and [100, ]
|
||||
2 an house [202, ]
|
||||
2 and house [100, ]
|
||||
2 at am [101, ]
|
||||
2 at amazing [100, ]
|
||||
2 at extraordinary [202, ]
|
||||
2 bell at [101, ]
|
||||
2 rings 5 [101, ]
|
||||
2 the rings [101, ]
|
||||
3 amazing house [100, ]
|
||||
3 an beautiful [100, ]
|
||||
3 at and [100, ]
|
||||
3 at house [202, ]
|
||||
3 bell 5 [101, ]
|
||||
3 rings am [101, ]
|
||||
3 the at [101, ]
|
||||
|
@ -1,31 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 5 a [101, ]
|
||||
1 5 am [101, ]
|
||||
1 amazing a [100, ]
|
||||
1 amazing an [100, ]
|
||||
1 an a [100, ]
|
||||
1 an am [100, ]
|
||||
1 and b [100, ]
|
||||
1 and be [100, ]
|
||||
1 at a [100, 202, ]
|
||||
1 at an [100, 202, ]
|
||||
1 rings a [101, ]
|
||||
1 the b [101, ]
|
||||
1 the be [101, ]
|
||||
2 amazing b [100, ]
|
||||
2 amazing be [100, ]
|
||||
2 an a [100, ]
|
||||
2 an an [100, ]
|
||||
2 at a [100, 101, ]
|
||||
2 at am [100, 101, ]
|
||||
2 bell a [101, ]
|
||||
3 an b [100, ]
|
||||
3 an be [100, ]
|
||||
3 at a [100, ]
|
||||
3 at an [100, ]
|
||||
3 rings a [101, ]
|
||||
3 rings am [101, ]
|
||||
3 the a [101, ]
|
||||
|
@ -1,4 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
|
@ -1,8 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 a y [51, ]
|
||||
1 x a [51, ]
|
||||
1 x y [50, ]
|
||||
2 x y [51, ]
|
||||
|
@ -1,7 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 a y [51, ]
|
||||
1 x y [50, ]
|
||||
2 x y [51, ]
|
||||
|
@ -1,4 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ]
|
@ -1,6 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 a 5 [51, ]
|
||||
2 a am [51, ]
|
||||
|
@ -1,60 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
5 [51, ]
|
||||
a0 [0, ]
|
||||
a1 [1, ]
|
||||
a10 [16, ]
|
||||
a11 [17, ]
|
||||
a12 [18, ]
|
||||
a13 [19, ]
|
||||
a14 [20, ]
|
||||
a15 [21, ]
|
||||
a16 [22, ]
|
||||
a17 [23, ]
|
||||
a18 [24, ]
|
||||
a19 [25, ]
|
||||
a1a [26, ]
|
||||
a1b [27, ]
|
||||
a1c [28, ]
|
||||
a1d [29, ]
|
||||
a1e [30, ]
|
||||
a1f [31, ]
|
||||
a2 [2, ]
|
||||
a20 [32, ]
|
||||
a21 [33, ]
|
||||
a22 [34, ]
|
||||
a23 [35, ]
|
||||
a24 [36, ]
|
||||
a25 [37, ]
|
||||
a26 [38, ]
|
||||
a27 [39, ]
|
||||
a28 [40, ]
|
||||
a29 [41, ]
|
||||
a2a [42, ]
|
||||
a2b [43, ]
|
||||
a2c [44, ]
|
||||
a2d [45, ]
|
||||
a2e [46, ]
|
||||
a2f [47, ]
|
||||
a3 [3, ]
|
||||
a30 [48, ]
|
||||
a31 [49, ]
|
||||
a4 [4, ]
|
||||
a5 [5, ]
|
||||
a6 [6, ]
|
||||
a7 [7, ]
|
||||
a8 [8, ]
|
||||
a9 [9, ]
|
||||
aa [10, ]
|
||||
ab [11, ]
|
||||
ac [12, ]
|
||||
ad [13, ]
|
||||
ae [14, ]
|
||||
af [15, ]
|
||||
am [51, ]
|
||||
at [51, ]
|
||||
bell [51, ]
|
||||
rings [51, ]
|
||||
the [51, ]
|
||||
|
@ -1,10 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 5 a [51, ]
|
||||
1 rings a [51, ]
|
||||
2 at a [51, ]
|
||||
2 bell a [51, ]
|
||||
3 rings a [51, ]
|
||||
3 the a [51, ]
|
||||
|
@ -1,4 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ]
|
@ -1,14 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 a 5 [51, ]
|
||||
1 a amazing [50, ]
|
||||
1 a an [50, ]
|
||||
1 a and [50, ]
|
||||
1 a beautiful [50, ]
|
||||
2 a am [51, ]
|
||||
2 a amazing [50, ]
|
||||
2 a and [50, ]
|
||||
2 a beautiful [50, ]
|
||||
2 a house [50, ]
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user