mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-10-30 01:38:49 +01:00
Introduce indexer level bumpalo
This commit is contained in:
parent
39b27e42be
commit
68a2502388
46
Cargo.lock
generated
46
Cargo.lock
generated
@ -296,9 +296,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "allocator-api2"
|
name = "allocator-api2"
|
||||||
version = "0.2.16"
|
version = "0.2.18"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
|
checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anes"
|
name = "anes"
|
||||||
@ -664,6 +664,10 @@ name = "bumpalo"
|
|||||||
version = "3.16.0"
|
version = "3.16.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
||||||
|
dependencies = [
|
||||||
|
"allocator-api2",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "byte-unit"
|
name = "byte-unit"
|
||||||
@ -1887,6 +1891,12 @@ version = "1.0.7"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "foldhash"
|
||||||
|
version = "0.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "form_urlencoded"
|
name = "form_urlencoded"
|
||||||
version = "1.2.1"
|
version = "1.2.1"
|
||||||
@ -2315,6 +2325,18 @@ dependencies = [
|
|||||||
"allocator-api2",
|
"allocator-api2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hashbrown"
|
||||||
|
version = "0.15.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
|
||||||
|
dependencies = [
|
||||||
|
"allocator-api2",
|
||||||
|
"equivalent",
|
||||||
|
"foldhash",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "heapless"
|
name = "heapless"
|
||||||
version = "0.8.0"
|
version = "0.8.0"
|
||||||
@ -2557,6 +2579,7 @@ dependencies = [
|
|||||||
"arroy",
|
"arroy",
|
||||||
"big_s",
|
"big_s",
|
||||||
"bincode",
|
"bincode",
|
||||||
|
"bumpalo",
|
||||||
"crossbeam",
|
"crossbeam",
|
||||||
"csv",
|
"csv",
|
||||||
"derive_builder 0.20.0",
|
"derive_builder 0.20.0",
|
||||||
@ -3549,6 +3572,7 @@ dependencies = [
|
|||||||
"bimap",
|
"bimap",
|
||||||
"bincode",
|
"bincode",
|
||||||
"bstr",
|
"bstr",
|
||||||
|
"bumpalo",
|
||||||
"bytemuck",
|
"bytemuck",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"candle-core",
|
"candle-core",
|
||||||
@ -3585,6 +3609,7 @@ dependencies = [
|
|||||||
"once_cell",
|
"once_cell",
|
||||||
"ordered-float",
|
"ordered-float",
|
||||||
"rand",
|
"rand",
|
||||||
|
"raw-collections",
|
||||||
"rayon",
|
"rayon",
|
||||||
"rayon-par-bridge",
|
"rayon-par-bridge",
|
||||||
"rhai",
|
"rhai",
|
||||||
@ -4406,6 +4431,18 @@ dependencies = [
|
|||||||
"rand",
|
"rand",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "raw-collections"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "git+https://github.com/dureuill/raw-collections.git#0ecd143c1707d237e3c4d749bc685418da2fccc2"
|
||||||
|
dependencies = [
|
||||||
|
"allocator-api2",
|
||||||
|
"bumpalo",
|
||||||
|
"hashbrown 0.15.0",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "raw-cpuid"
|
name = "raw-cpuid"
|
||||||
version = "10.7.0"
|
version = "10.7.0"
|
||||||
@ -4869,12 +4906,13 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_json"
|
name = "serde_json"
|
||||||
version = "1.0.120"
|
version = "1.0.128"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5"
|
checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"indexmap",
|
"indexmap",
|
||||||
"itoa",
|
"itoa",
|
||||||
|
"memchr",
|
||||||
"ryu",
|
"ryu",
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
@ -39,6 +39,7 @@ time = { version = "0.3.36", features = [
|
|||||||
tracing = "0.1.40"
|
tracing = "0.1.40"
|
||||||
ureq = "2.10.0"
|
ureq = "2.10.0"
|
||||||
uuid = { version = "1.10.0", features = ["serde", "v4"] }
|
uuid = { version = "1.10.0", features = ["serde", "v4"] }
|
||||||
|
bumpalo = "3.16.0"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
|
arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
|
||||||
|
@ -23,14 +23,15 @@ use std::fmt;
|
|||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::BufWriter;
|
use std::io::BufWriter;
|
||||||
|
|
||||||
|
use bumpalo::collections::CollectIn;
|
||||||
|
use bumpalo::Bump;
|
||||||
use dump::IndexMetadata;
|
use dump::IndexMetadata;
|
||||||
use meilisearch_types::error::Code;
|
use meilisearch_types::error::Code;
|
||||||
use meilisearch_types::heed::{RoTxn, RwTxn};
|
use meilisearch_types::heed::{RoTxn, RwTxn};
|
||||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
||||||
use meilisearch_types::milli::heed::CompactionOption;
|
use meilisearch_types::milli::heed::CompactionOption;
|
||||||
use meilisearch_types::milli::update::new::indexer::{
|
use meilisearch_types::milli::update::new::indexer::document_changes::DocumentChanges;
|
||||||
self, retrieve_or_guess_primary_key, DocumentChanges,
|
use meilisearch_types::milli::update::new::indexer::{self, retrieve_or_guess_primary_key};
|
||||||
};
|
|
||||||
use meilisearch_types::milli::update::{
|
use meilisearch_types::milli::update::{
|
||||||
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
|
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
|
||||||
};
|
};
|
||||||
@ -1219,6 +1220,8 @@ impl IndexScheduler {
|
|||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
operation: IndexOperation,
|
operation: IndexOperation,
|
||||||
) -> Result<Vec<Task>> {
|
) -> Result<Vec<Task>> {
|
||||||
|
let indexer_alloc = Bump::new();
|
||||||
|
|
||||||
match operation {
|
match operation {
|
||||||
IndexOperation::DocumentClear { mut tasks, .. } => {
|
IndexOperation::DocumentClear { mut tasks, .. } => {
|
||||||
let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?;
|
let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?;
|
||||||
@ -1252,6 +1255,9 @@ impl IndexScheduler {
|
|||||||
let mut primary_key_has_been_set = false;
|
let mut primary_key_has_been_set = false;
|
||||||
let must_stop_processing = self.must_stop_processing.clone();
|
let must_stop_processing = self.must_stop_processing.clone();
|
||||||
let indexer_config = self.index_mapper.indexer_config();
|
let indexer_config = self.index_mapper.indexer_config();
|
||||||
|
// TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches.
|
||||||
|
// this is made difficult by the fact we're doing private clones of the index scheduler and sending it
|
||||||
|
// to a fresh thread.
|
||||||
|
|
||||||
/// TODO manage errors correctly
|
/// TODO manage errors correctly
|
||||||
let rtxn = index.read_txn()?;
|
let rtxn = index.read_txn()?;
|
||||||
@ -1274,7 +1280,9 @@ impl IndexScheduler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut fields_ids_map = index.fields_ids_map(&rtxn)?;
|
let db_fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||||
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let first_document = match content_files.first() {
|
let first_document = match content_files.first() {
|
||||||
Some(mmap) => {
|
Some(mmap) => {
|
||||||
let mut iter = serde_json::Deserializer::from_slice(mmap).into_iter();
|
let mut iter = serde_json::Deserializer::from_slice(mmap).into_iter();
|
||||||
@ -1286,7 +1294,7 @@ impl IndexScheduler {
|
|||||||
let primary_key = retrieve_or_guess_primary_key(
|
let primary_key = retrieve_or_guess_primary_key(
|
||||||
&rtxn,
|
&rtxn,
|
||||||
index,
|
index,
|
||||||
&mut fields_ids_map,
|
&mut new_fields_ids_map,
|
||||||
first_document.as_ref(),
|
first_document.as_ref(),
|
||||||
)?
|
)?
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@ -1320,7 +1328,11 @@ impl IndexScheduler {
|
|||||||
}
|
}
|
||||||
DocumentOperation::Delete(document_ids) => {
|
DocumentOperation::Delete(document_ids) => {
|
||||||
let count = document_ids.len();
|
let count = document_ids.len();
|
||||||
indexer.delete_documents(document_ids);
|
let document_ids: bumpalo::collections::vec::Vec<_> = document_ids
|
||||||
|
.iter()
|
||||||
|
.map(|s| &*indexer_alloc.alloc_str(s))
|
||||||
|
.collect_in(&indexer_alloc);
|
||||||
|
indexer.delete_documents(document_ids.into_bump_slice());
|
||||||
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
||||||
// let count = user_result.unwrap();
|
// let count = user_result.unwrap();
|
||||||
let provided_ids =
|
let provided_ids =
|
||||||
@ -1347,10 +1359,22 @@ impl IndexScheduler {
|
|||||||
// let pool = indexer_config.thread_pool.unwrap();
|
// let pool = indexer_config.thread_pool.unwrap();
|
||||||
let pool = rayon::ThreadPoolBuilder::new().build().unwrap();
|
let pool = rayon::ThreadPoolBuilder::new().build().unwrap();
|
||||||
|
|
||||||
let param = (index, &rtxn, &primary_key);
|
let document_changes = indexer.into_changes(
|
||||||
let document_changes = indexer.document_changes(&mut fields_ids_map, param)?;
|
&indexer_alloc,
|
||||||
/// TODO pass/write the FieldsIdsMap
|
index,
|
||||||
indexer::index(index_wtxn, index, fields_ids_map, &pool, document_changes)?;
|
&rtxn,
|
||||||
|
&primary_key,
|
||||||
|
&mut new_fields_ids_map,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
indexer::index(
|
||||||
|
index_wtxn,
|
||||||
|
index,
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
&pool,
|
||||||
|
&document_changes,
|
||||||
|
)?;
|
||||||
|
|
||||||
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||||
}
|
}
|
||||||
@ -1501,10 +1525,11 @@ impl IndexScheduler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let rtxn = index.read_txn()?;
|
let rtxn = index.read_txn()?;
|
||||||
let mut fields_ids_map = index.fields_ids_map(&rtxn)?;
|
let db_fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||||
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let primary_key =
|
let primary_key =
|
||||||
retrieve_or_guess_primary_key(&rtxn, index, &mut fields_ids_map, None)?
|
retrieve_or_guess_primary_key(&rtxn, index, &mut new_fields_ids_map, None)?
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
if !tasks.iter().all(|res| res.error.is_some()) {
|
if !tasks.iter().all(|res| res.error.is_some()) {
|
||||||
@ -1512,19 +1537,17 @@ impl IndexScheduler {
|
|||||||
// let pool = indexer_config.thread_pool.unwrap();
|
// let pool = indexer_config.thread_pool.unwrap();
|
||||||
let pool = rayon::ThreadPoolBuilder::new().build().unwrap();
|
let pool = rayon::ThreadPoolBuilder::new().build().unwrap();
|
||||||
|
|
||||||
let param = (index, &fields_ids_map, &primary_key);
|
|
||||||
let mut indexer = indexer::DocumentDeletion::new();
|
let mut indexer = indexer::DocumentDeletion::new();
|
||||||
indexer.delete_documents_by_docids(to_delete);
|
indexer.delete_documents_by_docids(to_delete);
|
||||||
/// TODO remove this fields-ids-map, it's useless for the deletion pipeline (the &mut cloned one).
|
let document_changes = indexer.into_changes(&indexer_alloc, primary_key);
|
||||||
let document_changes =
|
|
||||||
indexer.document_changes(&mut fields_ids_map.clone(), param)?;
|
|
||||||
/// TODO pass/write the FieldsIdsMap
|
|
||||||
indexer::index(
|
indexer::index(
|
||||||
index_wtxn,
|
index_wtxn,
|
||||||
index,
|
index,
|
||||||
fields_ids_map.clone(),
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
&pool,
|
&pool,
|
||||||
document_changes,
|
&document_changes,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||||
|
@ -29,8 +29,8 @@ fst = "0.4.7"
|
|||||||
fxhash = "0.2.1"
|
fxhash = "0.2.1"
|
||||||
geoutils = "0.5.1"
|
geoutils = "0.5.1"
|
||||||
grenad = { version = "0.4.7", default-features = false, features = [
|
grenad = { version = "0.4.7", default-features = false, features = [
|
||||||
"rayon", # TODO Should we keep this feature
|
"rayon", # TODO Should we keep this feature
|
||||||
"tempfile"
|
"tempfile",
|
||||||
], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" }
|
], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" }
|
||||||
heed = { version = "0.20.3", default-features = false, features = [
|
heed = { version = "0.20.3", default-features = false, features = [
|
||||||
"serde-json",
|
"serde-json",
|
||||||
@ -81,7 +81,13 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls",
|
|||||||
] }
|
] }
|
||||||
tiktoken-rs = "0.5.9"
|
tiktoken-rs = "0.5.9"
|
||||||
liquid = "0.26.6"
|
liquid = "0.26.6"
|
||||||
rhai = { version = "1.19.0", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
|
rhai = { version = "1.19.0", features = [
|
||||||
|
"serde",
|
||||||
|
"no_module",
|
||||||
|
"no_custom_syntax",
|
||||||
|
"no_time",
|
||||||
|
"sync",
|
||||||
|
] }
|
||||||
arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
|
arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
tracing = "0.1.40"
|
tracing = "0.1.40"
|
||||||
@ -89,6 +95,8 @@ ureq = { version = "2.10.0", features = ["json"] }
|
|||||||
url = "2.5.2"
|
url = "2.5.2"
|
||||||
rayon-par-bridge = "0.1.0"
|
rayon-par-bridge = "0.1.0"
|
||||||
hashbrown = "0.14.5"
|
hashbrown = "0.14.5"
|
||||||
|
raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
|
||||||
|
bumpalo = "3.16.0"
|
||||||
thread_local = "1.1.8"
|
thread_local = "1.1.8"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
@ -13,8 +13,8 @@ pub use builder::DocumentsBatchBuilder;
|
|||||||
pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
|
pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
|
||||||
use obkv::KvReader;
|
use obkv::KvReader;
|
||||||
pub use primary_key::{
|
pub use primary_key::{
|
||||||
validate_document_id_value, DocumentIdExtractionError, FieldIdMapper, PrimaryKey,
|
validate_document_id_str, validate_document_id_value, DocumentIdExtractionError, FieldIdMapper,
|
||||||
DEFAULT_PRIMARY_KEY,
|
PrimaryKey, DEFAULT_PRIMARY_KEY,
|
||||||
};
|
};
|
||||||
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
|
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
@ -96,6 +96,10 @@ impl FieldIdMapper for DocumentsBatchIndex {
|
|||||||
fn id(&self, name: &str) -> Option<FieldId> {
|
fn id(&self, name: &str) -> Option<FieldId> {
|
||||||
self.id(name)
|
self.id(name)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn name(&self, id: FieldId) -> Option<&str> {
|
||||||
|
self.name(id)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
|
@ -19,6 +19,21 @@ pub trait FieldIdMapper {
|
|||||||
///
|
///
|
||||||
/// `None` if the field with this name was not found.
|
/// `None` if the field with this name was not found.
|
||||||
fn id(&self, name: &str) -> Option<FieldId>;
|
fn id(&self, name: &str) -> Option<FieldId>;
|
||||||
|
|
||||||
|
fn name(&self, id: FieldId) -> Option<&str>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> FieldIdMapper for &T
|
||||||
|
where
|
||||||
|
T: FieldIdMapper,
|
||||||
|
{
|
||||||
|
fn id(&self, name: &str) -> Option<FieldId> {
|
||||||
|
T::id(self, name)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn name(&self, id: FieldId) -> Option<&str> {
|
||||||
|
T::name(self, id)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A type that represent the type of primary key that has been set
|
/// A type that represent the type of primary key that has been set
|
||||||
@ -190,7 +205,7 @@ fn starts_with(selector: &str, key: &str) -> bool {
|
|||||||
|
|
||||||
// FIXME: move to a DocumentId struct
|
// FIXME: move to a DocumentId struct
|
||||||
|
|
||||||
fn validate_document_id(document_id: &str) -> Option<&str> {
|
pub fn validate_document_id_str(document_id: &str) -> Option<&str> {
|
||||||
if !document_id.is_empty()
|
if !document_id.is_empty()
|
||||||
&& document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
|
&& document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
|
||||||
{
|
{
|
||||||
@ -202,7 +217,7 @@ fn validate_document_id(document_id: &str) -> Option<&str> {
|
|||||||
|
|
||||||
pub fn validate_document_id_value(document_id: Value) -> StdResult<String, UserError> {
|
pub fn validate_document_id_value(document_id: Value) -> StdResult<String, UserError> {
|
||||||
match document_id {
|
match document_id {
|
||||||
Value::String(string) => match validate_document_id(&string) {
|
Value::String(string) => match validate_document_id_str(&string) {
|
||||||
Some(s) if s.len() == string.len() => Ok(string),
|
Some(s) if s.len() == string.len() => Ok(string),
|
||||||
Some(s) => Ok(s.to_string()),
|
Some(s) => Ok(s.to_string()),
|
||||||
None => Err(UserError::InvalidDocumentId { document_id: Value::String(string) }),
|
None => Err(UserError::InvalidDocumentId { document_id: Value::String(string) }),
|
||||||
|
@ -98,6 +98,20 @@ impl crate::documents::FieldIdMapper for FieldsIdsMap {
|
|||||||
fn id(&self, name: &str) -> Option<FieldId> {
|
fn id(&self, name: &str) -> Option<FieldId> {
|
||||||
self.id(name)
|
self.id(name)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn name(&self, id: FieldId) -> Option<&str> {
|
||||||
|
self.name(id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub trait MutFieldIdMapper {
|
||||||
|
fn insert(&mut self, name: &str) -> Option<FieldId>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MutFieldIdMapper for FieldsIdsMap {
|
||||||
|
fn insert(&mut self, name: &str) -> Option<FieldId> {
|
||||||
|
self.insert(name)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::sync::RwLock;
|
use std::sync::RwLock;
|
||||||
|
|
||||||
|
use super::MutFieldIdMapper;
|
||||||
|
use crate::documents::FieldIdMapper;
|
||||||
use crate::{FieldId, FieldsIdsMap};
|
use crate::{FieldId, FieldsIdsMap};
|
||||||
|
|
||||||
/// A fields ids map that can be globally updated to add fields
|
/// A fields ids map that can be globally updated to add fields
|
||||||
@ -11,11 +13,21 @@ pub struct GlobalFieldsIdsMap<'indexing> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
struct LocalFieldsIdsMap {
|
pub struct LocalFieldsIdsMap {
|
||||||
names_ids: BTreeMap<String, FieldId>,
|
names_ids: BTreeMap<String, FieldId>,
|
||||||
ids_names: BTreeMap<FieldId, String>,
|
ids_names: BTreeMap<FieldId, String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl FieldIdMapper for LocalFieldsIdsMap {
|
||||||
|
fn id(&self, name: &str) -> Option<FieldId> {
|
||||||
|
self.id(name)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn name(&self, id: FieldId) -> Option<&str> {
|
||||||
|
self.name(id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl LocalFieldsIdsMap {
|
impl LocalFieldsIdsMap {
|
||||||
fn new(global: &RwLock<FieldsIdsMap>) -> Self {
|
fn new(global: &RwLock<FieldsIdsMap>) -> Self {
|
||||||
let global = global.read().unwrap();
|
let global = global.read().unwrap();
|
||||||
@ -83,4 +95,14 @@ impl<'indexing> GlobalFieldsIdsMap<'indexing> {
|
|||||||
|
|
||||||
self.local.name(id)
|
self.local.name(id)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn local_map(&self) -> &LocalFieldsIdsMap {
|
||||||
|
&self.local
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'indexing> MutFieldIdMapper for GlobalFieldsIdsMap<'indexing> {
|
||||||
|
fn insert(&mut self, name: &str) -> Option<FieldId> {
|
||||||
|
self.id_or_insert(name)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
255
milli/src/update/new/document.rs
Normal file
255
milli/src/update/new/document.rs
Normal file
@ -0,0 +1,255 @@
|
|||||||
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
|
use heed::RoTxn;
|
||||||
|
use serde_json::value::RawValue;
|
||||||
|
|
||||||
|
use super::document_change::{Entry, Versions};
|
||||||
|
use super::{KvReaderFieldId, KvWriterFieldId};
|
||||||
|
use crate::documents::FieldIdMapper;
|
||||||
|
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
|
||||||
|
use crate::{DocumentId, FieldId, Index, InternalError, Result};
|
||||||
|
|
||||||
|
/// A view into a document that can represent either the current version from the DB,
|
||||||
|
/// the update data from payload or other means, or the merged updated version.
|
||||||
|
///
|
||||||
|
/// The 'doc lifetime is meant to live sufficiently for the document to be handled by the extractors.
|
||||||
|
pub trait Document<'doc> {
|
||||||
|
/// Iterate over all **top-level** fields of the document, returning their name and raw JSON value.
|
||||||
|
///
|
||||||
|
/// - The returned values *may* contain nested fields.
|
||||||
|
/// - The `_vectors` field is **ignored** by this method, meaning it is **not returned** by this method.
|
||||||
|
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>>;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub struct DocumentFromDb<'t, Mapper: FieldIdMapper>
|
||||||
|
where
|
||||||
|
Mapper: FieldIdMapper,
|
||||||
|
{
|
||||||
|
fields_ids_map: &'t Mapper,
|
||||||
|
content: &'t KvReaderFieldId,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> {
|
||||||
|
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'t str, &'t RawValue)>> {
|
||||||
|
let mut it = self.content.iter();
|
||||||
|
|
||||||
|
std::iter::from_fn(move || {
|
||||||
|
let (fid, value) = it.next()?;
|
||||||
|
|
||||||
|
let res = (|| {
|
||||||
|
let value =
|
||||||
|
serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?;
|
||||||
|
|
||||||
|
let name = self.fields_ids_map.name(fid).ok_or(
|
||||||
|
InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId {
|
||||||
|
field_id: fid,
|
||||||
|
process: "getting current document",
|
||||||
|
}),
|
||||||
|
)?;
|
||||||
|
Ok((name, value))
|
||||||
|
})();
|
||||||
|
|
||||||
|
Some(res)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> {
|
||||||
|
pub fn new(
|
||||||
|
docid: DocumentId,
|
||||||
|
rtxn: &'t RoTxn,
|
||||||
|
index: &'t Index,
|
||||||
|
db_fields_ids_map: &'t Mapper,
|
||||||
|
) -> Result<Option<Self>> {
|
||||||
|
index.documents.get(rtxn, &docid).map_err(crate::Error::from).map(|reader| {
|
||||||
|
reader.map(|reader| Self { fields_ids_map: db_fields_ids_map, content: reader })
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn field_from_fid(&self, fid: FieldId) -> Result<Option<&'t RawValue>> {
|
||||||
|
Ok(self
|
||||||
|
.content
|
||||||
|
.get(fid)
|
||||||
|
.map(|v| serde_json::from_slice(v).map_err(InternalError::SerdeJson))
|
||||||
|
.transpose()?)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub struct DocumentFromVersions<'doc> {
|
||||||
|
versions: Versions<'doc>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'doc> DocumentFromVersions<'doc> {
|
||||||
|
pub fn new(versions: Versions<'doc>) -> Self {
|
||||||
|
Self { versions }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'doc> Document<'doc> for DocumentFromVersions<'doc> {
|
||||||
|
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>> {
|
||||||
|
match &self.versions {
|
||||||
|
Versions::Single(version) => either::Either::Left(version.iter_top_level_fields()),
|
||||||
|
Versions::Multiple(versions) => {
|
||||||
|
let mut seen_fields = BTreeSet::new();
|
||||||
|
let mut it = versions.iter().rev().flat_map(|version| version.iter()).copied();
|
||||||
|
either::Either::Right(std::iter::from_fn(move || loop {
|
||||||
|
let (name, value) = it.next()?;
|
||||||
|
|
||||||
|
if seen_fields.contains(name) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
seen_fields.insert(name);
|
||||||
|
return Some(Ok((name, value)));
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// used in document from payload
|
||||||
|
impl<'doc> Document<'doc> for &'doc [Entry<'doc>] {
|
||||||
|
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<Entry<'doc>>> {
|
||||||
|
self.iter().copied().map(|(k, v)| Ok((k, v)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct MergedDocument<'doc, 't, Mapper: FieldIdMapper> {
|
||||||
|
new_doc: DocumentFromVersions<'doc>,
|
||||||
|
db: Option<DocumentFromDb<'t, Mapper>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'doc, 't, Mapper: FieldIdMapper> MergedDocument<'doc, 't, Mapper> {
|
||||||
|
pub fn new(
|
||||||
|
new_doc: DocumentFromVersions<'doc>,
|
||||||
|
db: Option<DocumentFromDb<'t, Mapper>>,
|
||||||
|
) -> Self {
|
||||||
|
Self { new_doc, db }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_db(
|
||||||
|
docid: DocumentId,
|
||||||
|
rtxn: &'t RoTxn,
|
||||||
|
index: &'t Index,
|
||||||
|
db_fields_ids_map: &'t Mapper,
|
||||||
|
new_doc: DocumentFromVersions<'doc>,
|
||||||
|
) -> Result<Self> {
|
||||||
|
let db = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)?;
|
||||||
|
Ok(Self { new_doc, db })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn without_db(new_doc: DocumentFromVersions<'doc>) -> Self {
|
||||||
|
Self { new_doc, db: None }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d>
|
||||||
|
for MergedDocument<'doc, 't, Mapper>
|
||||||
|
{
|
||||||
|
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'d str, &'d RawValue)>> {
|
||||||
|
let mut new_doc_it = self.new_doc.iter_top_level_fields();
|
||||||
|
let mut db_it = self.db.iter().flat_map(|db| db.iter_top_level_fields());
|
||||||
|
|
||||||
|
std::iter::from_fn(move || {
|
||||||
|
let mut seen_fields = BTreeSet::new();
|
||||||
|
if let Some(next) = new_doc_it.next() {
|
||||||
|
if let Ok((name, _)) = next {
|
||||||
|
seen_fields.insert(name);
|
||||||
|
}
|
||||||
|
return Some(next);
|
||||||
|
}
|
||||||
|
loop {
|
||||||
|
match db_it.next()? {
|
||||||
|
Ok((name, value)) => {
|
||||||
|
if seen_fields.contains(name) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
return Some(Ok((name, value)));
|
||||||
|
}
|
||||||
|
Err(err) => return Some(Err(err)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'doc, D> Document<'doc> for &D
|
||||||
|
where
|
||||||
|
D: Document<'doc>,
|
||||||
|
{
|
||||||
|
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>> {
|
||||||
|
D::iter_top_level_fields(self)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Turn this document into an obkv, whose fields are indexed by the provided `FieldIdMapper`.
|
||||||
|
///
|
||||||
|
/// The produced obkv is suitable for storing into the documents DB, meaning:
|
||||||
|
///
|
||||||
|
/// - It contains the contains of `_vectors` that are not configured as an embedder
|
||||||
|
/// - It contains all the top-level fields of the document, with their raw JSON value as value.
|
||||||
|
///
|
||||||
|
/// # Panics
|
||||||
|
///
|
||||||
|
/// - If the document contains a top-level field that is not present in `fields_ids_map`.
|
||||||
|
///
|
||||||
|
pub fn write_to_obkv<'s, 'a, 'b>(
|
||||||
|
document: &'s impl Document<'s>,
|
||||||
|
vector_document: Option<()>,
|
||||||
|
fields_ids_map: &'a impl FieldIdMapper,
|
||||||
|
mut document_buffer: &'a mut Vec<u8>,
|
||||||
|
) -> Result<&'a KvReaderFieldId>
|
||||||
|
where
|
||||||
|
's: 'a,
|
||||||
|
's: 'b,
|
||||||
|
{
|
||||||
|
// will be used in 'inject_vectors
|
||||||
|
let vectors_value: Box<RawValue>;
|
||||||
|
|
||||||
|
document_buffer.clear();
|
||||||
|
let mut unordered_field_buffer = Vec::new();
|
||||||
|
unordered_field_buffer.clear();
|
||||||
|
|
||||||
|
let mut writer = KvWriterFieldId::new(&mut document_buffer);
|
||||||
|
|
||||||
|
for res in document.iter_top_level_fields() {
|
||||||
|
let (field_name, value) = res?;
|
||||||
|
let field_id = fields_ids_map.id(field_name).unwrap();
|
||||||
|
unordered_field_buffer.push((field_id, value));
|
||||||
|
}
|
||||||
|
|
||||||
|
'inject_vectors: {
|
||||||
|
let Some(vector_document) = vector_document else { break 'inject_vectors };
|
||||||
|
|
||||||
|
let Some(vectors_fid) = fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME) else {
|
||||||
|
break 'inject_vectors;
|
||||||
|
};
|
||||||
|
/*
|
||||||
|
let mut vectors = BTreeMap::new();
|
||||||
|
for (name, entry) in vector_document.iter_vectors() {
|
||||||
|
if entry.has_configured_embedder {
|
||||||
|
continue; // we don't write vectors with configured embedder in documents
|
||||||
|
}
|
||||||
|
vectors.insert(
|
||||||
|
name,
|
||||||
|
serde_json::json!({
|
||||||
|
"regenerate": entry.regenerate,
|
||||||
|
// TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object
|
||||||
|
"embeddings": entry.embeddings,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
vectors_value = serde_json::value::to_raw_value(&vectors).unwrap();
|
||||||
|
unordered_field_buffer.push((vectors_fid, &vectors_value));*/
|
||||||
|
}
|
||||||
|
|
||||||
|
unordered_field_buffer.sort_by_key(|(fid, _)| *fid);
|
||||||
|
for (fid, value) in unordered_field_buffer.iter() {
|
||||||
|
writer.insert(*fid, value.get().as_bytes()).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.finish().unwrap();
|
||||||
|
Ok(KvReaderFieldId::from_slice(document_buffer))
|
||||||
|
}
|
@ -1,35 +1,35 @@
|
|||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use obkv::KvReader;
|
use serde_json::value::RawValue;
|
||||||
|
|
||||||
use crate::update::new::KvReaderFieldId;
|
use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument};
|
||||||
use crate::{DocumentId, FieldId, Index, Result};
|
use crate::documents::FieldIdMapper;
|
||||||
|
use crate::{DocumentId, Index, Result};
|
||||||
|
|
||||||
pub enum DocumentChange {
|
pub enum DocumentChange<'doc> {
|
||||||
Deletion(Deletion),
|
Deletion(Deletion),
|
||||||
Update(Update),
|
Update(Update<'doc>),
|
||||||
Insertion(Insertion),
|
Insertion(Insertion<'doc>),
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Deletion {
|
pub struct Deletion {
|
||||||
pub docid: DocumentId,
|
docid: DocumentId,
|
||||||
pub external_document_id: String,
|
external_document_id: String,
|
||||||
current: Box<KvReaderFieldId>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Update {
|
pub struct Update<'doc> {
|
||||||
pub docid: DocumentId,
|
docid: DocumentId,
|
||||||
pub external_document_id: String,
|
external_document_id: String,
|
||||||
current: Box<KvReaderFieldId>,
|
new: DocumentFromVersions<'doc>,
|
||||||
pub new: Box<KvReaderFieldId>,
|
has_deletion: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Insertion {
|
pub struct Insertion<'doc> {
|
||||||
pub docid: DocumentId,
|
docid: DocumentId,
|
||||||
pub external_document_id: String,
|
external_document_id: String,
|
||||||
pub new: Box<KvReaderFieldId>,
|
new: DocumentFromVersions<'doc>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocumentChange {
|
impl<'doc> DocumentChange<'doc> {
|
||||||
pub fn docid(&self) -> DocumentId {
|
pub fn docid(&self) -> DocumentId {
|
||||||
match &self {
|
match &self {
|
||||||
Self::Deletion(inner) => inner.docid(),
|
Self::Deletion(inner) => inner.docid(),
|
||||||
@ -37,15 +37,19 @@ impl DocumentChange {
|
|||||||
Self::Insertion(inner) => inner.docid(),
|
Self::Insertion(inner) => inner.docid(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn external_docid(&self) -> &str {
|
||||||
|
match self {
|
||||||
|
DocumentChange::Deletion(deletion) => deletion.external_document_id(),
|
||||||
|
DocumentChange::Update(update) => update.external_document_id(),
|
||||||
|
DocumentChange::Insertion(insertion) => insertion.external_document_id(),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Deletion {
|
impl Deletion {
|
||||||
pub fn create(
|
pub fn create(docid: DocumentId, external_document_id: String) -> Self {
|
||||||
docid: DocumentId,
|
Self { docid, external_document_id }
|
||||||
external_document_id: String,
|
|
||||||
current: Box<KvReaderFieldId>,
|
|
||||||
) -> Self {
|
|
||||||
Self { docid, external_document_id, current }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn docid(&self) -> DocumentId {
|
pub fn docid(&self) -> DocumentId {
|
||||||
@ -56,21 +60,23 @@ impl Deletion {
|
|||||||
&self.external_document_id
|
&self.external_document_id
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO shouldn't we use the one in self?
|
pub fn current<'a, Mapper: FieldIdMapper>(
|
||||||
pub fn current<'a>(
|
|
||||||
&self,
|
&self,
|
||||||
rtxn: &'a RoTxn,
|
rtxn: &'a RoTxn,
|
||||||
index: &'a Index,
|
index: &'a Index,
|
||||||
) -> Result<Option<&'a KvReader<FieldId>>> {
|
mapper: &'a Mapper,
|
||||||
index.documents.get(rtxn, &self.docid).map_err(crate::Error::from)
|
) -> Result<DocumentFromDb<'a, Mapper>> {
|
||||||
|
Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
|
||||||
|
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
||||||
|
)?)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Insertion {
|
impl<'doc> Insertion<'doc> {
|
||||||
pub fn create(
|
pub fn create(
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_document_id: String,
|
external_document_id: String,
|
||||||
new: Box<KvReaderFieldId>,
|
new: DocumentFromVersions<'doc>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
Insertion { docid, external_document_id, new }
|
Insertion { docid, external_document_id, new }
|
||||||
}
|
}
|
||||||
@ -82,20 +88,19 @@ impl Insertion {
|
|||||||
pub fn external_document_id(&self) -> &str {
|
pub fn external_document_id(&self) -> &str {
|
||||||
&self.external_document_id
|
&self.external_document_id
|
||||||
}
|
}
|
||||||
|
pub fn new(&self) -> DocumentFromVersions<'doc> {
|
||||||
pub fn new(&self) -> &KvReader<FieldId> {
|
self.new
|
||||||
self.new.as_ref()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Update {
|
impl<'doc> Update<'doc> {
|
||||||
pub fn create(
|
pub fn create(
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_document_id: String,
|
external_document_id: String,
|
||||||
current: Box<KvReaderFieldId>,
|
new: DocumentFromVersions<'doc>,
|
||||||
new: Box<KvReaderFieldId>,
|
has_deletion: bool,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
Update { docid, external_document_id, current, new }
|
Update { docid, new, external_document_id, has_deletion }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn docid(&self) -> DocumentId {
|
pub fn docid(&self) -> DocumentId {
|
||||||
@ -105,16 +110,39 @@ impl Update {
|
|||||||
pub fn external_document_id(&self) -> &str {
|
pub fn external_document_id(&self) -> &str {
|
||||||
&self.external_document_id
|
&self.external_document_id
|
||||||
}
|
}
|
||||||
|
pub fn current<'a, Mapper: FieldIdMapper>(
|
||||||
pub fn current<'a>(
|
|
||||||
&self,
|
&self,
|
||||||
rtxn: &'a RoTxn,
|
rtxn: &'a RoTxn,
|
||||||
index: &'a Index,
|
index: &'a Index,
|
||||||
) -> Result<Option<&'a KvReader<FieldId>>> {
|
mapper: &'a Mapper,
|
||||||
index.documents.get(rtxn, &self.docid).map_err(crate::Error::from)
|
) -> Result<DocumentFromDb<'a, Mapper>> {
|
||||||
|
Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
|
||||||
|
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
||||||
|
)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn new(&self) -> &KvReader<FieldId> {
|
pub fn updated(&self) -> DocumentFromVersions<'doc> {
|
||||||
self.new.as_ref()
|
self.new
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn new<'a, Mapper: FieldIdMapper>(
|
||||||
|
&self,
|
||||||
|
rtxn: &'a RoTxn,
|
||||||
|
index: &'a Index,
|
||||||
|
mapper: &'a Mapper,
|
||||||
|
) -> Result<MergedDocument<'doc, 'a, Mapper>> {
|
||||||
|
if self.has_deletion {
|
||||||
|
Ok(MergedDocument::without_db(self.new))
|
||||||
|
} else {
|
||||||
|
MergedDocument::with_db(self.docid, rtxn, index, mapper, self.new)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub type Entry<'doc> = (&'doc str, &'doc RawValue);
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub enum Versions<'doc> {
|
||||||
|
Single(&'doc [Entry<'doc>]),
|
||||||
|
Multiple(&'doc [&'doc [Entry<'doc>]]),
|
||||||
|
}
|
||||||
|
@ -2,46 +2,90 @@ use std::cell::RefCell;
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::sync::Arc;
|
use std::ops::DerefMut as _;
|
||||||
|
|
||||||
|
use bumpalo::Bump;
|
||||||
use grenad::{MergeFunction, Merger};
|
use grenad::{MergeFunction, Merger};
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
|
use rayon::iter::{ParallelBridge as _, ParallelIterator as _};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use thread_local::ThreadLocal;
|
|
||||||
|
|
||||||
use super::super::cache::CboCachedSorter;
|
use super::super::cache::CboCachedSorter;
|
||||||
use super::facet_document::extract_document_facets;
|
use super::facet_document::extract_document_facets;
|
||||||
use super::FacetKind;
|
use super::FacetKind;
|
||||||
use crate::facet::value_encoding::f64_into_bytes;
|
use crate::facet::value_encoding::f64_into_bytes;
|
||||||
use crate::update::new::extract::DocidsExtractor;
|
use crate::update::new::extract::DocidsExtractor;
|
||||||
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt;
|
use crate::update::new::indexer::document_changes::{
|
||||||
|
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
|
||||||
|
IndexingContext, ThreadLocal,
|
||||||
|
};
|
||||||
use crate::update::new::DocumentChange;
|
use crate::update::new::DocumentChange;
|
||||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
use crate::{
|
use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH};
|
||||||
DocumentId, Error, FieldId, GlobalFieldsIdsMap, Index, Result, MAX_FACET_VALUE_LENGTH,
|
|
||||||
};
|
pub struct FacetedExtractorData<'extractor> {
|
||||||
|
attributes_to_extract: &'extractor [&'extractor str],
|
||||||
|
grenad_parameters: GrenadParameters,
|
||||||
|
max_memory: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> {
|
||||||
|
type Data = FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>;
|
||||||
|
|
||||||
|
fn init_data(
|
||||||
|
&self,
|
||||||
|
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
|
||||||
|
) -> Result<Self::Data> {
|
||||||
|
Ok(FullySend(RefCell::new(CboCachedSorter::new(
|
||||||
|
// TODO use a better value
|
||||||
|
1_000_000.try_into().unwrap(),
|
||||||
|
create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
MergeDeladdCboRoaringBitmaps,
|
||||||
|
self.grenad_parameters.chunk_compression_type,
|
||||||
|
self.grenad_parameters.chunk_compression_level,
|
||||||
|
self.grenad_parameters.max_nb_chunks,
|
||||||
|
self.max_memory,
|
||||||
|
),
|
||||||
|
))))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn process(
|
||||||
|
&self,
|
||||||
|
change: DocumentChange,
|
||||||
|
context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
|
||||||
|
) -> Result<()> {
|
||||||
|
FacetedDocidsExtractor::extract_document_change(
|
||||||
|
&context,
|
||||||
|
self.attributes_to_extract,
|
||||||
|
change,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct FacetedDocidsExtractor;
|
pub struct FacetedDocidsExtractor;
|
||||||
|
|
||||||
impl FacetedDocidsExtractor {
|
impl FacetedDocidsExtractor {
|
||||||
fn extract_document_change(
|
fn extract_document_change(
|
||||||
rtxn: &RoTxn,
|
context: &DocumentChangeContext<
|
||||||
index: &Index,
|
FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
|
||||||
buffer: &mut Vec<u8>,
|
>,
|
||||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
|
||||||
attributes_to_extract: &[&str],
|
attributes_to_extract: &[&str],
|
||||||
cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
|
||||||
document_change: DocumentChange,
|
document_change: DocumentChange,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
let index = &context.index;
|
||||||
|
let rtxn = &context.txn;
|
||||||
|
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut();
|
||||||
|
let mut cached_sorter = context.data.0.borrow_mut();
|
||||||
match document_change {
|
match document_change {
|
||||||
DocumentChange::Deletion(inner) => extract_document_facets(
|
DocumentChange::Deletion(inner) => extract_document_facets(
|
||||||
attributes_to_extract,
|
attributes_to_extract,
|
||||||
inner.current(rtxn, index)?.unwrap(),
|
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||||
fields_ids_map,
|
new_fields_ids_map.deref_mut(),
|
||||||
&mut |fid, value| {
|
&mut |fid, value| {
|
||||||
Self::facet_fn_with_options(
|
Self::facet_fn_with_options(
|
||||||
buffer,
|
&context.doc_alloc,
|
||||||
cached_sorter,
|
cached_sorter.deref_mut(),
|
||||||
CboCachedSorter::insert_del_u32,
|
CboCachedSorter::insert_del_u32,
|
||||||
inner.docid(),
|
inner.docid(),
|
||||||
fid,
|
fid,
|
||||||
@ -52,12 +96,12 @@ impl FacetedDocidsExtractor {
|
|||||||
DocumentChange::Update(inner) => {
|
DocumentChange::Update(inner) => {
|
||||||
extract_document_facets(
|
extract_document_facets(
|
||||||
attributes_to_extract,
|
attributes_to_extract,
|
||||||
inner.current(rtxn, index)?.unwrap(),
|
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||||
fields_ids_map,
|
new_fields_ids_map.deref_mut(),
|
||||||
&mut |fid, value| {
|
&mut |fid, value| {
|
||||||
Self::facet_fn_with_options(
|
Self::facet_fn_with_options(
|
||||||
buffer,
|
&context.doc_alloc,
|
||||||
cached_sorter,
|
cached_sorter.deref_mut(),
|
||||||
CboCachedSorter::insert_del_u32,
|
CboCachedSorter::insert_del_u32,
|
||||||
inner.docid(),
|
inner.docid(),
|
||||||
fid,
|
fid,
|
||||||
@ -68,12 +112,12 @@ impl FacetedDocidsExtractor {
|
|||||||
|
|
||||||
extract_document_facets(
|
extract_document_facets(
|
||||||
attributes_to_extract,
|
attributes_to_extract,
|
||||||
inner.new(),
|
inner.new(rtxn, index, context.db_fields_ids_map)?,
|
||||||
fields_ids_map,
|
new_fields_ids_map.deref_mut(),
|
||||||
&mut |fid, value| {
|
&mut |fid, value| {
|
||||||
Self::facet_fn_with_options(
|
Self::facet_fn_with_options(
|
||||||
buffer,
|
&context.doc_alloc,
|
||||||
cached_sorter,
|
cached_sorter.deref_mut(),
|
||||||
CboCachedSorter::insert_add_u32,
|
CboCachedSorter::insert_add_u32,
|
||||||
inner.docid(),
|
inner.docid(),
|
||||||
fid,
|
fid,
|
||||||
@ -85,11 +129,11 @@ impl FacetedDocidsExtractor {
|
|||||||
DocumentChange::Insertion(inner) => extract_document_facets(
|
DocumentChange::Insertion(inner) => extract_document_facets(
|
||||||
attributes_to_extract,
|
attributes_to_extract,
|
||||||
inner.new(),
|
inner.new(),
|
||||||
fields_ids_map,
|
new_fields_ids_map.deref_mut(),
|
||||||
&mut |fid, value| {
|
&mut |fid, value| {
|
||||||
Self::facet_fn_with_options(
|
Self::facet_fn_with_options(
|
||||||
buffer,
|
&context.doc_alloc,
|
||||||
cached_sorter,
|
cached_sorter.deref_mut(),
|
||||||
CboCachedSorter::insert_add_u32,
|
CboCachedSorter::insert_add_u32,
|
||||||
inner.docid(),
|
inner.docid(),
|
||||||
fid,
|
fid,
|
||||||
@ -101,7 +145,7 @@ impl FacetedDocidsExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn facet_fn_with_options<MF>(
|
fn facet_fn_with_options<MF>(
|
||||||
buffer: &mut Vec<u8>,
|
doc_alloc: &Bump,
|
||||||
cached_sorter: &mut CboCachedSorter<MF>,
|
cached_sorter: &mut CboCachedSorter<MF>,
|
||||||
cache_fn: impl Fn(&mut CboCachedSorter<MF>, &[u8], u32) -> grenad::Result<(), MF::Error>,
|
cache_fn: impl Fn(&mut CboCachedSorter<MF>, &[u8], u32) -> grenad::Result<(), MF::Error>,
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
@ -113,9 +157,9 @@ impl FacetedDocidsExtractor {
|
|||||||
MF::Error: Debug,
|
MF::Error: Debug,
|
||||||
grenad::Error<MF::Error>: Into<crate::Error>,
|
grenad::Error<MF::Error>: Into<crate::Error>,
|
||||||
{
|
{
|
||||||
|
let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||||
// Exists
|
// Exists
|
||||||
// key: fid
|
// key: fid
|
||||||
buffer.clear();
|
|
||||||
buffer.push(FacetKind::Exists as u8);
|
buffer.push(FacetKind::Exists as u8);
|
||||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||||
cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into)?;
|
cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into)?;
|
||||||
@ -197,58 +241,38 @@ fn truncate_str(s: &str) -> &str {
|
|||||||
|
|
||||||
impl DocidsExtractor for FacetedDocidsExtractor {
|
impl DocidsExtractor for FacetedDocidsExtractor {
|
||||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
|
||||||
fn run_extraction(
|
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
||||||
index: &Index,
|
grenad_parameters: GrenadParameters,
|
||||||
fields_ids_map: &GlobalFieldsIdsMap,
|
document_changes: &DC,
|
||||||
indexer: GrenadParameters,
|
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
||||||
document_changes: impl IntoParallelIterator<
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
Item = std::result::Result<DocumentChange, Arc<Error>>,
|
|
||||||
>,
|
|
||||||
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = grenad_parameters.max_memory_by_thread();
|
||||||
|
|
||||||
|
let index = indexing_context.index;
|
||||||
|
|
||||||
let rtxn = index.read_txn()?;
|
let rtxn = index.read_txn()?;
|
||||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
||||||
let attributes_to_extract: Vec<_> =
|
let attributes_to_extract: Vec<_> =
|
||||||
attributes_to_extract.iter().map(|s| s.as_ref()).collect();
|
attributes_to_extract.iter().map(|s| s.as_ref()).collect();
|
||||||
let thread_local = ThreadLocal::with_capacity(rayon::current_num_threads());
|
let datastore = ThreadLocal::new();
|
||||||
|
|
||||||
{
|
{
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
document_changes.into_par_iter().try_arc_for_each_try_init(
|
|
||||||
|| {
|
let extractor = FacetedExtractorData {
|
||||||
thread_local.get_or_try(|| {
|
attributes_to_extract: &attributes_to_extract,
|
||||||
let rtxn = index.read_txn().map_err(Error::from)?;
|
grenad_parameters,
|
||||||
let cache = CboCachedSorter::new(
|
max_memory,
|
||||||
/// TODO use a better value
|
};
|
||||||
100.try_into().unwrap(),
|
for_each_document_change(
|
||||||
create_sorter(
|
document_changes,
|
||||||
grenad::SortAlgorithm::Stable,
|
&extractor,
|
||||||
MergeDeladdCboRoaringBitmaps,
|
indexing_context,
|
||||||
indexer.chunk_compression_type,
|
extractor_allocs,
|
||||||
indexer.chunk_compression_level,
|
&datastore,
|
||||||
indexer.max_nb_chunks,
|
|
||||||
max_memory,
|
|
||||||
),
|
|
||||||
);
|
|
||||||
Ok((rtxn, RefCell::new((fields_ids_map.clone(), Vec::new(), cache))))
|
|
||||||
})
|
|
||||||
},
|
|
||||||
|(rtxn, rc), document_change| {
|
|
||||||
let (fields_ids_map, buffer, cached_sorter) = &mut *rc.borrow_mut();
|
|
||||||
Self::extract_document_change(
|
|
||||||
rtxn,
|
|
||||||
index,
|
|
||||||
buffer,
|
|
||||||
fields_ids_map,
|
|
||||||
&attributes_to_extract,
|
|
||||||
cached_sorter,
|
|
||||||
document_change?,
|
|
||||||
)
|
|
||||||
.map_err(Arc::new)
|
|
||||||
},
|
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
@ -257,11 +281,11 @@ impl DocidsExtractor for FacetedDocidsExtractor {
|
|||||||
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
|
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
|
||||||
let readers: Vec<_> = thread_local
|
let readers: Vec<_> = datastore
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.par_bridge()
|
.par_bridge()
|
||||||
.map(|(_, rc)| {
|
.map(|cached_sorter| {
|
||||||
let (_, _, cached_sorter) = rc.into_inner();
|
let cached_sorter = cached_sorter.0.into_inner();
|
||||||
let sorter = cached_sorter.into_sorter()?;
|
let sorter = cached_sorter.into_sorter()?;
|
||||||
sorter.into_reader_cursors()
|
sorter.into_reader_cursors()
|
||||||
})
|
})
|
||||||
|
@ -1,24 +1,17 @@
|
|||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
|
use crate::update::new::document::Document;
|
||||||
use crate::update::new::extract::perm_json_p;
|
use crate::update::new::extract::perm_json_p;
|
||||||
use crate::update::new::KvReaderFieldId;
|
|
||||||
use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError};
|
use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError};
|
||||||
|
|
||||||
pub fn extract_document_facets(
|
pub fn extract_document_facets<'doc>(
|
||||||
attributes_to_extract: &[&str],
|
attributes_to_extract: &[&str],
|
||||||
obkv: &KvReaderFieldId,
|
document: impl Document<'doc>,
|
||||||
field_id_map: &mut GlobalFieldsIdsMap,
|
field_id_map: &mut GlobalFieldsIdsMap,
|
||||||
facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
|
facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut field_name = String::new();
|
for res in document.iter_top_level_fields() {
|
||||||
for (field_id, field_bytes) in obkv {
|
let (field_name, value) = res?;
|
||||||
let Some(field_name) = field_id_map.name(field_id).map(|s| {
|
|
||||||
field_name.clear();
|
|
||||||
field_name.push_str(s);
|
|
||||||
&field_name
|
|
||||||
}) else {
|
|
||||||
unreachable!("field id not found in field id map");
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {
|
let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {
|
||||||
Some(field_id) => facet_fn(field_id, value),
|
Some(field_id) => facet_fn(field_id, value),
|
||||||
@ -28,7 +21,7 @@ pub fn extract_document_facets(
|
|||||||
// if the current field is searchable or contains a searchable attribute
|
// if the current field is searchable or contains a searchable attribute
|
||||||
if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
|
if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
|
||||||
// parse json.
|
// parse json.
|
||||||
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
|
match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? {
|
||||||
Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
|
Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
|
||||||
&object,
|
&object,
|
||||||
Some(attributes_to_extract),
|
Some(attributes_to_extract),
|
||||||
|
@ -3,26 +3,24 @@ mod faceted;
|
|||||||
mod lru;
|
mod lru;
|
||||||
mod searchable;
|
mod searchable;
|
||||||
|
|
||||||
|
use std::cell::RefCell;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
|
use bumpalo::Bump;
|
||||||
pub use faceted::*;
|
pub use faceted::*;
|
||||||
use grenad::Merger;
|
use grenad::Merger;
|
||||||
use rayon::iter::IntoParallelIterator;
|
|
||||||
pub use searchable::*;
|
pub use searchable::*;
|
||||||
|
|
||||||
use super::DocumentChange;
|
use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal};
|
||||||
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
use crate::{Error, GlobalFieldsIdsMap, Index, Result};
|
use crate::Result;
|
||||||
|
|
||||||
pub trait DocidsExtractor {
|
pub trait DocidsExtractor {
|
||||||
fn run_extraction(
|
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
||||||
index: &Index,
|
grenad_parameters: GrenadParameters,
|
||||||
fields_ids_map: &GlobalFieldsIdsMap,
|
document_changes: &DC,
|
||||||
indexer: GrenadParameters,
|
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
||||||
document_changes: impl IntoParallelIterator<
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
Item = std::result::Result<DocumentChange, Arc<Error>>,
|
|
||||||
>,
|
|
||||||
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>;
|
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,17 +2,23 @@ use std::cell::RefCell;
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::num::NonZero;
|
use std::num::NonZero;
|
||||||
|
use std::ops::DerefMut as _;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use bumpalo::Bump;
|
||||||
use grenad::{Merger, MergerBuilder};
|
use grenad::{Merger, MergerBuilder};
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use rayon::iter::IntoParallelIterator;
|
use rayon::iter::IntoParallelIterator;
|
||||||
use thread_local::ThreadLocal;
|
|
||||||
|
|
||||||
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||||
use super::SearchableExtractor;
|
use super::SearchableExtractor;
|
||||||
|
use crate::update::new::document::Document;
|
||||||
use crate::update::new::extract::cache::CboCachedSorter;
|
use crate::update::new::extract::cache::CboCachedSorter;
|
||||||
use crate::update::new::extract::perm_json_p::contained_in;
|
use crate::update::new::extract::perm_json_p::contained_in;
|
||||||
|
use crate::update::new::indexer::document_changes::{
|
||||||
|
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
|
||||||
|
IndexingContext, ThreadLocal,
|
||||||
|
};
|
||||||
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt;
|
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt;
|
||||||
use crate::update::new::DocumentChange;
|
use crate::update::new::DocumentChange;
|
||||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
@ -23,7 +29,7 @@ use crate::{
|
|||||||
|
|
||||||
const MAX_COUNTED_WORDS: usize = 30;
|
const MAX_COUNTED_WORDS: usize = 30;
|
||||||
|
|
||||||
struct WordDocidsCachedSorters {
|
pub struct WordDocidsCachedSorters {
|
||||||
word_fid_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
word_fid_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||||
word_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
word_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||||
exact_word_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
exact_word_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||||
@ -301,18 +307,47 @@ impl WordDocidsMergerBuilders {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct WordDocidsExtractorData<'extractor> {
|
||||||
|
tokenizer: &'extractor DocumentTokenizer<'extractor>,
|
||||||
|
grenad_parameters: GrenadParameters,
|
||||||
|
max_memory: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
|
||||||
|
type Data = FullySend<RefCell<WordDocidsCachedSorters>>;
|
||||||
|
|
||||||
|
fn init_data(
|
||||||
|
&self,
|
||||||
|
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
|
||||||
|
) -> Result<Self::Data> {
|
||||||
|
Ok(FullySend(RefCell::new(WordDocidsCachedSorters::new(
|
||||||
|
self.grenad_parameters,
|
||||||
|
self.max_memory,
|
||||||
|
// TODO use a better value
|
||||||
|
200_000.try_into().unwrap(),
|
||||||
|
))))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn process(
|
||||||
|
&self,
|
||||||
|
change: DocumentChange,
|
||||||
|
context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
|
||||||
|
) -> Result<()> {
|
||||||
|
WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct WordDocidsExtractors;
|
pub struct WordDocidsExtractors;
|
||||||
|
|
||||||
impl WordDocidsExtractors {
|
impl WordDocidsExtractors {
|
||||||
pub fn run_extraction(
|
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
||||||
index: &Index,
|
grenad_parameters: GrenadParameters,
|
||||||
fields_ids_map: &GlobalFieldsIdsMap,
|
document_changes: &DC,
|
||||||
indexer: GrenadParameters,
|
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
||||||
document_changes: impl IntoParallelIterator<
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
Item = std::result::Result<DocumentChange, Arc<Error>>,
|
|
||||||
>,
|
|
||||||
) -> Result<WordDocidsMergers> {
|
) -> Result<WordDocidsMergers> {
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = grenad_parameters.max_memory_by_thread();
|
||||||
|
let index = indexing_context.index;
|
||||||
|
|
||||||
let rtxn = index.read_txn()?;
|
let rtxn = index.read_txn()?;
|
||||||
let stop_words = index.stop_words(&rtxn)?;
|
let stop_words = index.stop_words(&rtxn)?;
|
||||||
@ -342,38 +377,25 @@ impl WordDocidsExtractors {
|
|||||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||||
};
|
};
|
||||||
|
|
||||||
let thread_local = ThreadLocal::with_capacity(rayon::current_num_threads());
|
let datastore = ThreadLocal::new();
|
||||||
|
|
||||||
{
|
{
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
document_changes.into_par_iter().try_arc_for_each_try_init(
|
|
||||||
|| {
|
let extractor = WordDocidsExtractorData {
|
||||||
thread_local.get_or_try(|| {
|
tokenizer: &document_tokenizer,
|
||||||
let rtxn = index.read_txn().map_err(Error::from)?;
|
grenad_parameters,
|
||||||
let fields_ids_map = fields_ids_map.clone();
|
max_memory,
|
||||||
let cache = WordDocidsCachedSorters::new(
|
};
|
||||||
indexer,
|
|
||||||
max_memory,
|
for_each_document_change(
|
||||||
// TODO use a better value
|
document_changes,
|
||||||
200_000.try_into().unwrap(),
|
&extractor,
|
||||||
);
|
indexing_context,
|
||||||
Ok((rtxn, &document_tokenizer, RefCell::new((fields_ids_map, cache))))
|
extractor_allocs,
|
||||||
})
|
&datastore,
|
||||||
},
|
|
||||||
|(rtxn, document_tokenizer, rc), document_change| {
|
|
||||||
let (fields_ids_map, cached_sorter) = &mut *rc.borrow_mut();
|
|
||||||
Self::extract_document_change(
|
|
||||||
rtxn,
|
|
||||||
index,
|
|
||||||
document_tokenizer,
|
|
||||||
fields_ids_map,
|
|
||||||
cached_sorter,
|
|
||||||
document_change?,
|
|
||||||
)
|
|
||||||
.map_err(Arc::new)
|
|
||||||
},
|
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -382,8 +404,7 @@ impl WordDocidsExtractors {
|
|||||||
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
|
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
let mut builder = WordDocidsMergerBuilders::new();
|
let mut builder = WordDocidsMergerBuilders::new();
|
||||||
for (_, _, rc) in thread_local.into_iter() {
|
for cache in datastore.into_iter().map(|cache| cache.0.into_inner()) {
|
||||||
let (_, cache) = rc.into_inner();
|
|
||||||
builder.add_sorters(cache)?;
|
builder.add_sorters(cache)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -392,13 +413,17 @@ impl WordDocidsExtractors {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn extract_document_change(
|
fn extract_document_change(
|
||||||
rtxn: &RoTxn,
|
context: &DocumentChangeContext<FullySend<RefCell<WordDocidsCachedSorters>>>,
|
||||||
index: &Index,
|
|
||||||
document_tokenizer: &DocumentTokenizer,
|
document_tokenizer: &DocumentTokenizer,
|
||||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
|
||||||
cached_sorter: &mut WordDocidsCachedSorters,
|
|
||||||
document_change: DocumentChange,
|
document_change: DocumentChange,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
let index = &context.index;
|
||||||
|
let rtxn = &context.txn;
|
||||||
|
let mut cached_sorter = context.data.0.borrow_mut();
|
||||||
|
let cached_sorter = cached_sorter.deref_mut();
|
||||||
|
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut();
|
||||||
|
let new_fields_ids_map = new_fields_ids_map.deref_mut();
|
||||||
|
|
||||||
let exact_attributes = index.exact_attributes(rtxn)?;
|
let exact_attributes = index.exact_attributes(rtxn)?;
|
||||||
let is_exact_attribute =
|
let is_exact_attribute =
|
||||||
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
|
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
|
||||||
@ -418,8 +443,8 @@ impl WordDocidsExtractors {
|
|||||||
.map_err(crate::Error::from)
|
.map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
document_tokenizer.tokenize_document(
|
document_tokenizer.tokenize_document(
|
||||||
inner.current(rtxn, index)?.unwrap(),
|
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||||
fields_ids_map,
|
new_fields_ids_map,
|
||||||
&mut token_fn,
|
&mut token_fn,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
@ -437,8 +462,8 @@ impl WordDocidsExtractors {
|
|||||||
.map_err(crate::Error::from)
|
.map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
document_tokenizer.tokenize_document(
|
document_tokenizer.tokenize_document(
|
||||||
inner.current(rtxn, index)?.unwrap(),
|
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||||
fields_ids_map,
|
new_fields_ids_map,
|
||||||
&mut token_fn,
|
&mut token_fn,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
@ -454,7 +479,11 @@ impl WordDocidsExtractors {
|
|||||||
)
|
)
|
||||||
.map_err(crate::Error::from)
|
.map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
document_tokenizer.tokenize_document(
|
||||||
|
inner.new(rtxn, index, context.db_fields_ids_map)?,
|
||||||
|
new_fields_ids_map,
|
||||||
|
&mut token_fn,
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
DocumentChange::Insertion(inner) => {
|
DocumentChange::Insertion(inner) => {
|
||||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||||
@ -469,7 +498,11 @@ impl WordDocidsExtractors {
|
|||||||
)
|
)
|
||||||
.map_err(crate::Error::from)
|
.map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
document_tokenizer.tokenize_document(
|
||||||
|
inner.new(),
|
||||||
|
new_fields_ids_map,
|
||||||
|
&mut token_fn,
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,13 +1,17 @@
|
|||||||
|
use std::cell::RefCell;
|
||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
use std::rc::Rc;
|
use std::rc::Rc;
|
||||||
|
|
||||||
|
use bumpalo::Bump;
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use obkv::KvReader;
|
use obkv::KvReader;
|
||||||
|
|
||||||
use super::tokenize_document::DocumentTokenizer;
|
use super::tokenize_document::DocumentTokenizer;
|
||||||
use super::SearchableExtractor;
|
use super::SearchableExtractor;
|
||||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||||
|
use crate::update::new::document::Document;
|
||||||
use crate::update::new::extract::cache::CboCachedSorter;
|
use crate::update::new::extract::cache::CboCachedSorter;
|
||||||
|
use crate::update::new::indexer::document_changes::{DocumentChangeContext, FullySend};
|
||||||
use crate::update::new::DocumentChange;
|
use crate::update::new::DocumentChange;
|
||||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||||
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
|
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
|
||||||
@ -28,27 +32,39 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
|||||||
// This method is reimplemented to count the number of words in the document in each field
|
// This method is reimplemented to count the number of words in the document in each field
|
||||||
// and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS.
|
// and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS.
|
||||||
fn extract_document_change(
|
fn extract_document_change(
|
||||||
rtxn: &RoTxn,
|
context: &DocumentChangeContext<
|
||||||
index: &Index,
|
FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
|
||||||
|
>,
|
||||||
document_tokenizer: &DocumentTokenizer,
|
document_tokenizer: &DocumentTokenizer,
|
||||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
|
||||||
cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
|
||||||
document_change: DocumentChange,
|
document_change: DocumentChange,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut key_buffer = Vec::new();
|
let doc_alloc = &context.doc_alloc;
|
||||||
let mut del_word_pair_proximity = Vec::new();
|
|
||||||
let mut add_word_pair_proximity = Vec::new();
|
let index = context.index;
|
||||||
|
let rtxn = &context.txn;
|
||||||
|
|
||||||
|
let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||||
|
let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||||
|
let mut add_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||||
|
|
||||||
|
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut();
|
||||||
|
let new_fields_ids_map = &mut *new_fields_ids_map;
|
||||||
|
|
||||||
|
let mut cached_sorter = context.data.0.borrow_mut();
|
||||||
|
let cached_sorter = &mut *cached_sorter;
|
||||||
|
|
||||||
|
// is a vecdequeue, and will be smol, so can stay on the heap for now
|
||||||
let mut word_positions: VecDeque<(Rc<str>, u16)> =
|
let mut word_positions: VecDeque<(Rc<str>, u16)> =
|
||||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||||
|
|
||||||
let docid = document_change.docid();
|
let docid = document_change.docid();
|
||||||
match document_change {
|
match document_change {
|
||||||
DocumentChange::Deletion(inner) => {
|
DocumentChange::Deletion(inner) => {
|
||||||
let document = inner.current(rtxn, index)?.unwrap();
|
let document = inner.current(rtxn, index, context.db_fields_ids_map)?;
|
||||||
process_document_tokens(
|
process_document_tokens(
|
||||||
document,
|
document,
|
||||||
document_tokenizer,
|
document_tokenizer,
|
||||||
fields_ids_map,
|
new_fields_ids_map,
|
||||||
&mut word_positions,
|
&mut word_positions,
|
||||||
&mut |(w1, w2), prox| {
|
&mut |(w1, w2), prox| {
|
||||||
del_word_pair_proximity.push(((w1, w2), prox));
|
del_word_pair_proximity.push(((w1, w2), prox));
|
||||||
@ -56,21 +72,21 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
|||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
DocumentChange::Update(inner) => {
|
DocumentChange::Update(inner) => {
|
||||||
let document = inner.current(rtxn, index)?.unwrap();
|
let document = inner.current(rtxn, index, context.db_fields_ids_map)?;
|
||||||
process_document_tokens(
|
process_document_tokens(
|
||||||
document,
|
document,
|
||||||
document_tokenizer,
|
document_tokenizer,
|
||||||
fields_ids_map,
|
new_fields_ids_map,
|
||||||
&mut word_positions,
|
&mut word_positions,
|
||||||
&mut |(w1, w2), prox| {
|
&mut |(w1, w2), prox| {
|
||||||
del_word_pair_proximity.push(((w1, w2), prox));
|
del_word_pair_proximity.push(((w1, w2), prox));
|
||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
let document = inner.new();
|
let document = inner.new(rtxn, index, context.db_fields_ids_map)?;
|
||||||
process_document_tokens(
|
process_document_tokens(
|
||||||
document,
|
document,
|
||||||
document_tokenizer,
|
document_tokenizer,
|
||||||
fields_ids_map,
|
new_fields_ids_map,
|
||||||
&mut word_positions,
|
&mut word_positions,
|
||||||
&mut |(w1, w2), prox| {
|
&mut |(w1, w2), prox| {
|
||||||
add_word_pair_proximity.push(((w1, w2), prox));
|
add_word_pair_proximity.push(((w1, w2), prox));
|
||||||
@ -82,7 +98,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
|||||||
process_document_tokens(
|
process_document_tokens(
|
||||||
document,
|
document,
|
||||||
document_tokenizer,
|
document_tokenizer,
|
||||||
fields_ids_map,
|
new_fields_ids_map,
|
||||||
&mut word_positions,
|
&mut word_positions,
|
||||||
&mut |(w1, w2), prox| {
|
&mut |(w1, w2), prox| {
|
||||||
add_word_pair_proximity.push(((w1, w2), prox));
|
add_word_pair_proximity.push(((w1, w2), prox));
|
||||||
@ -108,7 +124,12 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec<u8>) -> &'a [u8] {
|
fn build_key<'a>(
|
||||||
|
prox: u8,
|
||||||
|
w1: &str,
|
||||||
|
w2: &str,
|
||||||
|
key_buffer: &'a mut bumpalo::collections::Vec<u8>,
|
||||||
|
) -> &'a [u8] {
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.push(prox);
|
key_buffer.push(prox);
|
||||||
key_buffer.extend_from_slice(w1.as_bytes());
|
key_buffer.extend_from_slice(w1.as_bytes());
|
||||||
@ -131,8 +152,8 @@ fn word_positions_into_word_pair_proximity(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process_document_tokens(
|
fn process_document_tokens<'doc>(
|
||||||
document: &KvReader<FieldId>,
|
document: impl Document<'doc>,
|
||||||
document_tokenizer: &DocumentTokenizer,
|
document_tokenizer: &DocumentTokenizer,
|
||||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||||
word_positions: &mut VecDeque<(Rc<str>, u16)>,
|
word_positions: &mut VecDeque<(Rc<str>, u16)>,
|
||||||
|
@ -4,40 +4,81 @@ mod tokenize_document;
|
|||||||
|
|
||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::sync::Arc;
|
use std::marker::PhantomData;
|
||||||
|
use std::ops::DerefMut;
|
||||||
|
|
||||||
|
use bumpalo::Bump;
|
||||||
pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers};
|
pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers};
|
||||||
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
|
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
|
||||||
use grenad::Merger;
|
use grenad::Merger;
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
|
use rayon::iter::{ParallelBridge, ParallelIterator};
|
||||||
use thread_local::ThreadLocal;
|
|
||||||
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||||
|
|
||||||
use super::cache::CboCachedSorter;
|
use super::cache::CboCachedSorter;
|
||||||
use super::DocidsExtractor;
|
use super::DocidsExtractor;
|
||||||
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt;
|
use crate::update::new::indexer::document_changes::{
|
||||||
|
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
|
||||||
|
IndexingContext, ThreadLocal,
|
||||||
|
};
|
||||||
use crate::update::new::DocumentChange;
|
use crate::update::new::DocumentChange;
|
||||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
use crate::{Error, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||||
|
|
||||||
pub trait SearchableExtractor {
|
pub struct SearchableExtractorData<'extractor, EX: SearchableExtractor> {
|
||||||
fn run_extraction(
|
tokenizer: &'extractor DocumentTokenizer<'extractor>,
|
||||||
index: &Index,
|
grenad_parameters: GrenadParameters,
|
||||||
fields_ids_map: &GlobalFieldsIdsMap,
|
max_memory: Option<usize>,
|
||||||
indexer: GrenadParameters,
|
_ex: PhantomData<EX>,
|
||||||
document_changes: impl IntoParallelIterator<
|
}
|
||||||
Item = std::result::Result<DocumentChange, Arc<Error>>,
|
|
||||||
>,
|
impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
|
||||||
|
for SearchableExtractorData<'extractor, EX>
|
||||||
|
{
|
||||||
|
type Data = FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>;
|
||||||
|
|
||||||
|
fn init_data(
|
||||||
|
&self,
|
||||||
|
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
|
||||||
|
) -> Result<Self::Data> {
|
||||||
|
Ok(FullySend(RefCell::new(CboCachedSorter::new(
|
||||||
|
// TODO use a better value
|
||||||
|
1_000_000.try_into().unwrap(),
|
||||||
|
create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
MergeDeladdCboRoaringBitmaps,
|
||||||
|
self.grenad_parameters.chunk_compression_type,
|
||||||
|
self.grenad_parameters.chunk_compression_level,
|
||||||
|
self.grenad_parameters.max_nb_chunks,
|
||||||
|
self.max_memory,
|
||||||
|
),
|
||||||
|
))))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn process(
|
||||||
|
&self,
|
||||||
|
change: DocumentChange,
|
||||||
|
context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
|
||||||
|
) -> Result<()> {
|
||||||
|
EX::extract_document_change(context, self.tokenizer, change)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub trait SearchableExtractor: Sized + Sync {
|
||||||
|
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
||||||
|
grenad_parameters: GrenadParameters,
|
||||||
|
document_changes: &DC,
|
||||||
|
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
||||||
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = grenad_parameters.max_memory_by_thread();
|
||||||
|
|
||||||
let rtxn = index.read_txn()?;
|
let rtxn = indexing_context.index.read_txn()?;
|
||||||
let stop_words = index.stop_words(&rtxn)?;
|
let stop_words = indexing_context.index.stop_words(&rtxn)?;
|
||||||
let allowed_separators = index.allowed_separators(&rtxn)?;
|
let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
|
||||||
let allowed_separators: Option<Vec<_>> =
|
let allowed_separators: Option<Vec<_>> =
|
||||||
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||||
let dictionary = index.dictionary(&rtxn)?;
|
let dictionary = indexing_context.index.dictionary(&rtxn)?;
|
||||||
let dictionary: Option<Vec<_>> =
|
let dictionary: Option<Vec<_>> =
|
||||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||||
let builder = tokenizer_builder(
|
let builder = tokenizer_builder(
|
||||||
@ -47,10 +88,10 @@ pub trait SearchableExtractor {
|
|||||||
);
|
);
|
||||||
let tokenizer = builder.into_tokenizer();
|
let tokenizer = builder.into_tokenizer();
|
||||||
|
|
||||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
|
||||||
let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
|
let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
|
||||||
let localized_attributes_rules =
|
let localized_attributes_rules =
|
||||||
index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||||
|
|
||||||
let document_tokenizer = DocumentTokenizer {
|
let document_tokenizer = DocumentTokenizer {
|
||||||
tokenizer: &tokenizer,
|
tokenizer: &tokenizer,
|
||||||
@ -60,48 +101,26 @@ pub trait SearchableExtractor {
|
|||||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||||
};
|
};
|
||||||
|
|
||||||
let thread_local = ThreadLocal::with_capacity(rayon::current_num_threads());
|
let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
|
||||||
|
tokenizer: &document_tokenizer,
|
||||||
|
grenad_parameters,
|
||||||
|
max_memory,
|
||||||
|
_ex: PhantomData,
|
||||||
|
};
|
||||||
|
|
||||||
|
let datastore = ThreadLocal::new();
|
||||||
|
|
||||||
{
|
{
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
document_changes.into_par_iter().try_arc_for_each_try_init(
|
for_each_document_change(
|
||||||
|| {
|
document_changes,
|
||||||
thread_local.get_or_try(|| {
|
&extractor_data,
|
||||||
let rtxn = index.read_txn().map_err(Error::from)?;
|
indexing_context,
|
||||||
let cache = CboCachedSorter::new(
|
extractor_allocs,
|
||||||
/// TODO use a better value
|
&datastore,
|
||||||
1_000_000.try_into().unwrap(),
|
);
|
||||||
create_sorter(
|
|
||||||
grenad::SortAlgorithm::Stable,
|
|
||||||
MergeDeladdCboRoaringBitmaps,
|
|
||||||
indexer.chunk_compression_type,
|
|
||||||
indexer.chunk_compression_level,
|
|
||||||
indexer.max_nb_chunks,
|
|
||||||
max_memory,
|
|
||||||
),
|
|
||||||
);
|
|
||||||
Ok((
|
|
||||||
rtxn,
|
|
||||||
&document_tokenizer,
|
|
||||||
RefCell::new((fields_ids_map.clone(), cache)),
|
|
||||||
))
|
|
||||||
})
|
|
||||||
},
|
|
||||||
|(rtxn, document_tokenizer, rc), document_change| {
|
|
||||||
let (fields_ids_map, cached_sorter) = &mut *rc.borrow_mut();
|
|
||||||
Self::extract_document_change(
|
|
||||||
rtxn,
|
|
||||||
index,
|
|
||||||
document_tokenizer,
|
|
||||||
fields_ids_map,
|
|
||||||
cached_sorter,
|
|
||||||
document_change?,
|
|
||||||
)
|
|
||||||
.map_err(Arc::new)
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||||
@ -109,11 +128,14 @@ pub trait SearchableExtractor {
|
|||||||
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
|
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
|
||||||
let readers: Vec<_> = thread_local
|
let readers: Vec<_> = datastore
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.par_bridge()
|
.par_bridge()
|
||||||
.map(|(_, _, rc)| {
|
.map(|cache_entry| {
|
||||||
let (_, cached_sorter) = rc.into_inner();
|
let cached_sorter: FullySend<
|
||||||
|
RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>,
|
||||||
|
> = cache_entry;
|
||||||
|
let cached_sorter = cached_sorter.0.into_inner();
|
||||||
let sorter = cached_sorter.into_sorter()?;
|
let sorter = cached_sorter.into_sorter()?;
|
||||||
sorter.into_reader_cursors()
|
sorter.into_reader_cursors()
|
||||||
})
|
})
|
||||||
@ -122,16 +144,16 @@ pub trait SearchableExtractor {
|
|||||||
for reader in readers {
|
for reader in readers {
|
||||||
builder.extend(reader?);
|
builder.extend(reader?);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(builder.build())
|
Ok(builder.build())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_document_change(
|
fn extract_document_change(
|
||||||
rtxn: &RoTxn,
|
context: &DocumentChangeContext<
|
||||||
index: &Index,
|
FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
|
||||||
|
>,
|
||||||
document_tokenizer: &DocumentTokenizer,
|
document_tokenizer: &DocumentTokenizer,
|
||||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
|
||||||
cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
|
||||||
document_change: DocumentChange,
|
document_change: DocumentChange,
|
||||||
) -> Result<()>;
|
) -> Result<()>;
|
||||||
|
|
||||||
@ -142,14 +164,17 @@ pub trait SearchableExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<T: SearchableExtractor> DocidsExtractor for T {
|
impl<T: SearchableExtractor> DocidsExtractor for T {
|
||||||
fn run_extraction(
|
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
||||||
index: &Index,
|
grenad_parameters: GrenadParameters,
|
||||||
fields_ids_map: &GlobalFieldsIdsMap,
|
document_changes: &DC,
|
||||||
indexer: GrenadParameters,
|
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
||||||
document_changes: impl IntoParallelIterator<
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
Item = std::result::Result<DocumentChange, Arc<Error>>,
|
|
||||||
>,
|
|
||||||
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
||||||
Self::run_extraction(index, fields_ids_map, indexer, document_changes)
|
Self::run_extraction(
|
||||||
|
grenad_parameters,
|
||||||
|
document_changes,
|
||||||
|
indexing_context,
|
||||||
|
extractor_allocs,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4,6 +4,7 @@ use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
|||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use crate::proximity::MAX_DISTANCE;
|
use crate::proximity::MAX_DISTANCE;
|
||||||
|
use crate::update::new::document::Document;
|
||||||
use crate::update::new::extract::perm_json_p::{
|
use crate::update::new::extract::perm_json_p::{
|
||||||
seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
|
seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
|
||||||
};
|
};
|
||||||
@ -22,22 +23,16 @@ pub struct DocumentTokenizer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> DocumentTokenizer<'a> {
|
impl<'a> DocumentTokenizer<'a> {
|
||||||
pub fn tokenize_document(
|
pub fn tokenize_document<'doc>(
|
||||||
&self,
|
&self,
|
||||||
obkv: &KvReaderFieldId,
|
document: impl Document<'doc>,
|
||||||
field_id_map: &mut GlobalFieldsIdsMap,
|
field_id_map: &mut GlobalFieldsIdsMap,
|
||||||
token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
|
token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut field_position = HashMap::new();
|
let mut field_position = HashMap::new();
|
||||||
let mut field_name = String::new();
|
|
||||||
for (field_id, field_bytes) in obkv {
|
for entry in document.iter_top_level_fields() {
|
||||||
let Some(field_name) = field_id_map.name(field_id).map(|s| {
|
let (field_name, value) = entry?;
|
||||||
field_name.clear();
|
|
||||||
field_name.push_str(s);
|
|
||||||
&field_name
|
|
||||||
}) else {
|
|
||||||
unreachable!("field id not found in field id map");
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut tokenize_field = |name: &str, value: &Value| {
|
let mut tokenize_field = |name: &str, value: &Value| {
|
||||||
let Some(field_id) = field_id_map.id_or_insert(name) else {
|
let Some(field_id) = field_id_map.id_or_insert(name) else {
|
||||||
@ -94,7 +89,7 @@ impl<'a> DocumentTokenizer<'a> {
|
|||||||
// if the current field is searchable or contains a searchable attribute
|
// if the current field is searchable or contains a searchable attribute
|
||||||
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) {
|
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) {
|
||||||
// parse json.
|
// parse json.
|
||||||
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
|
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
|
||||||
Value::Object(object) => seek_leaf_values_in_object(
|
Value::Object(object) => seek_leaf_values_in_object(
|
||||||
&object,
|
&object,
|
||||||
self.attribute_to_extract,
|
self.attribute_to_extract,
|
||||||
@ -174,10 +169,13 @@ pub fn tokenizer_builder<'a>(
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
|
use bumpalo::Bump;
|
||||||
use charabia::TokenizerBuilder;
|
use charabia::TokenizerBuilder;
|
||||||
use meili_snap::snapshot;
|
use meili_snap::snapshot;
|
||||||
use obkv::KvReader;
|
use obkv::KvReader;
|
||||||
|
use raw_collections::RawMap;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
use serde_json::value::RawValue;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::FieldsIdsMap;
|
use crate::FieldsIdsMap;
|
||||||
@ -186,40 +184,25 @@ mod test {
|
|||||||
fn test_tokenize_document() {
|
fn test_tokenize_document() {
|
||||||
let mut fields_ids_map = FieldsIdsMap::new();
|
let mut fields_ids_map = FieldsIdsMap::new();
|
||||||
|
|
||||||
let field_1 = json!({
|
let document = json!({
|
||||||
"name": "doggo",
|
"doggo": { "name": "doggo",
|
||||||
"age": 10,
|
"age": 10,},
|
||||||
});
|
"catto": {
|
||||||
|
|
||||||
let field_2 = json!({
|
|
||||||
"catto": {
|
"catto": {
|
||||||
"name": "pesti",
|
"name": "pesti",
|
||||||
"age": 23,
|
"age": 23,
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"doggo.name": ["doggo", "catto"],
|
||||||
|
"not-me": "UNSEARCHABLE",
|
||||||
|
"me-nether": {"nope": "unsearchable"}
|
||||||
});
|
});
|
||||||
|
|
||||||
let field_3 = json!(["doggo", "catto"]);
|
let _field_1_id = fields_ids_map.insert("doggo").unwrap();
|
||||||
let field_4 = json!("UNSEARCHABLE");
|
let _field_2_id = fields_ids_map.insert("catto").unwrap();
|
||||||
let field_5 = json!({"nope": "unsearchable"});
|
let _field_3_id = fields_ids_map.insert("doggo.name").unwrap();
|
||||||
|
let _field_4_id = fields_ids_map.insert("not-me").unwrap();
|
||||||
let mut obkv = obkv::KvWriter::memory();
|
let _field_5_id = fields_ids_map.insert("me-nether").unwrap();
|
||||||
let field_1_id = fields_ids_map.insert("doggo").unwrap();
|
|
||||||
let field_1 = serde_json::to_string(&field_1).unwrap();
|
|
||||||
obkv.insert(field_1_id, field_1.as_bytes()).unwrap();
|
|
||||||
let field_2_id = fields_ids_map.insert("catto").unwrap();
|
|
||||||
let field_2 = serde_json::to_string(&field_2).unwrap();
|
|
||||||
obkv.insert(field_2_id, field_2.as_bytes()).unwrap();
|
|
||||||
let field_3_id = fields_ids_map.insert("doggo.name").unwrap();
|
|
||||||
let field_3 = serde_json::to_string(&field_3).unwrap();
|
|
||||||
obkv.insert(field_3_id, field_3.as_bytes()).unwrap();
|
|
||||||
let field_4_id = fields_ids_map.insert("not-me").unwrap();
|
|
||||||
let field_4 = serde_json::to_string(&field_4).unwrap();
|
|
||||||
obkv.insert(field_4_id, field_4.as_bytes()).unwrap();
|
|
||||||
let field_5_id = fields_ids_map.insert("me-nether").unwrap();
|
|
||||||
let field_5 = serde_json::to_string(&field_5).unwrap();
|
|
||||||
obkv.insert(field_5_id, field_5.as_bytes()).unwrap();
|
|
||||||
let value = obkv.into_inner().unwrap();
|
|
||||||
let obkv = KvReader::from_slice(value.as_slice());
|
|
||||||
|
|
||||||
let mut tb = TokenizerBuilder::default();
|
let mut tb = TokenizerBuilder::default();
|
||||||
let document_tokenizer = DocumentTokenizer {
|
let document_tokenizer = DocumentTokenizer {
|
||||||
@ -234,11 +217,23 @@ mod test {
|
|||||||
let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
|
let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
|
||||||
|
|
||||||
let mut words = std::collections::BTreeMap::new();
|
let mut words = std::collections::BTreeMap::new();
|
||||||
|
|
||||||
|
let document = document.to_string();
|
||||||
|
|
||||||
|
let bump = Bump::new();
|
||||||
|
let document: &RawValue = serde_json::from_str(&document).unwrap();
|
||||||
|
let document = RawMap::from_raw_value(document, &bump).unwrap();
|
||||||
|
let document = document.into_bump_slice();
|
||||||
|
|
||||||
document_tokenizer
|
document_tokenizer
|
||||||
.tokenize_document(obkv, &mut global_fields_ids_map, &mut |_fname, fid, pos, word| {
|
.tokenize_document(
|
||||||
words.insert([fid, pos], word.to_string());
|
document,
|
||||||
Ok(())
|
&mut global_fields_ids_map,
|
||||||
})
|
&mut |_fname, fid, pos, word| {
|
||||||
|
words.insert([fid, pos], word.to_string());
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
snapshot!(format!("{:#?}", words), @r###"
|
snapshot!(format!("{:#?}", words), @r###"
|
||||||
|
163
milli/src/update/new/indexer/de.rs
Normal file
163
milli/src/update/new/indexer/de.rs
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
use bumpalo::Bump;
|
||||||
|
use serde_json::value::RawValue;
|
||||||
|
|
||||||
|
use crate::documents::{validate_document_id_str, DocumentIdExtractionError, PrimaryKey};
|
||||||
|
use crate::fields_ids_map::MutFieldIdMapper;
|
||||||
|
use crate::{FieldId, UserError};
|
||||||
|
|
||||||
|
// visits a document to fill the top level fields of the field id map and retrieve the external document id.
|
||||||
|
pub struct DocumentVisitor<'p, 'indexer, Mapper: MutFieldIdMapper> {
|
||||||
|
fields_ids_map: &'p mut Mapper,
|
||||||
|
primary_key: &'p PrimaryKey<'p>,
|
||||||
|
indexer: &'indexer Bump,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'p, 'indexer, Mapper: MutFieldIdMapper> DocumentVisitor<'p, 'indexer, Mapper> {
|
||||||
|
pub fn new(
|
||||||
|
fields_ids_map: &'p mut Mapper,
|
||||||
|
primary_key: &'p PrimaryKey<'p>,
|
||||||
|
indexer: &'indexer Bump,
|
||||||
|
) -> Self {
|
||||||
|
Self { fields_ids_map, primary_key, indexer }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> serde::de::Visitor<'de>
|
||||||
|
for DocumentVisitor<'p, 'indexer, Mapper>
|
||||||
|
{
|
||||||
|
type Value = std::result::Result<&'de str, DocumentIdExtractionError>;
|
||||||
|
|
||||||
|
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||||
|
write!(formatter, "a map")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn visit_map<A>(mut self, mut map: A) -> std::result::Result<Self::Value, A::Error>
|
||||||
|
where
|
||||||
|
A: serde::de::MapAccess<'de>,
|
||||||
|
{
|
||||||
|
let mut docid = None;
|
||||||
|
while let Some((fid, fields_ids_map)) =
|
||||||
|
map.next_key_seed(FieldIdMapSeed(self.fields_ids_map))?
|
||||||
|
{
|
||||||
|
use serde::de::Deserializer as _;
|
||||||
|
self.fields_ids_map = fields_ids_map;
|
||||||
|
/// FIXME unwrap => too many fields
|
||||||
|
let fid = fid.unwrap();
|
||||||
|
|
||||||
|
match self.primary_key {
|
||||||
|
PrimaryKey::Flat { name, field_id } => {
|
||||||
|
let value: &'de RawValue = map.next_value()?;
|
||||||
|
if fid == *field_id {
|
||||||
|
let value = match value
|
||||||
|
.deserialize_any(DocumentIdVisitor(self.indexer))
|
||||||
|
.map_err(|_err| {
|
||||||
|
DocumentIdExtractionError::InvalidDocumentId(
|
||||||
|
UserError::InvalidDocumentId {
|
||||||
|
document_id: serde_json::to_value(value).unwrap(),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}) {
|
||||||
|
Ok(Ok(value)) => value,
|
||||||
|
Ok(Err(err)) | Err(err) => return Ok(Err(err)),
|
||||||
|
};
|
||||||
|
if let Some(_previous_value) = docid.replace(value) {
|
||||||
|
return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(2)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
PrimaryKey::Nested { name } => todo!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(match docid {
|
||||||
|
Some(docid) => Ok(docid),
|
||||||
|
None => Err(DocumentIdExtractionError::MissingDocumentId),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct FieldIdMapSeed<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper);
|
||||||
|
|
||||||
|
impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::DeserializeSeed<'de>
|
||||||
|
for FieldIdMapSeed<'a, Mapper>
|
||||||
|
{
|
||||||
|
type Value = (Option<FieldId>, &'a mut Mapper);
|
||||||
|
|
||||||
|
fn deserialize<D>(self, deserializer: D) -> std::result::Result<Self::Value, D::Error>
|
||||||
|
where
|
||||||
|
D: serde::Deserializer<'de>,
|
||||||
|
{
|
||||||
|
struct FieldIdMapVisitor<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper);
|
||||||
|
impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> for FieldIdMapVisitor<'a, Mapper> {
|
||||||
|
type Value = (Option<FieldId>, &'a mut Mapper);
|
||||||
|
|
||||||
|
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||||
|
write!(formatter, "expecting a string")
|
||||||
|
}
|
||||||
|
fn visit_borrowed_str<E>(self, v: &'de str) -> std::result::Result<Self::Value, E>
|
||||||
|
where
|
||||||
|
E: serde::de::Error,
|
||||||
|
{
|
||||||
|
Ok((self.0.insert(v), self.0))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn visit_str<E>(self, v: &str) -> std::result::Result<Self::Value, E>
|
||||||
|
where
|
||||||
|
E: serde::de::Error,
|
||||||
|
{
|
||||||
|
Ok((self.0.insert(v), self.0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
deserializer.deserialize_str(FieldIdMapVisitor(self.0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct DocumentIdVisitor<'indexer>(&'indexer Bump);
|
||||||
|
|
||||||
|
impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> {
|
||||||
|
type Value = std::result::Result<&'de str, DocumentIdExtractionError>;
|
||||||
|
|
||||||
|
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||||
|
write!(formatter, "an integer or a string")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn visit_borrowed_str<E>(self, v: &'de str) -> std::result::Result<Self::Value, E>
|
||||||
|
where
|
||||||
|
E: serde::de::Error,
|
||||||
|
{
|
||||||
|
Ok(validate_document_id_str(v).ok_or_else(|| {
|
||||||
|
DocumentIdExtractionError::InvalidDocumentId(UserError::InvalidDocumentId {
|
||||||
|
document_id: serde_json::Value::String(v.to_owned()),
|
||||||
|
})
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn visit_str<E>(self, v: &str) -> std::result::Result<Self::Value, E>
|
||||||
|
where
|
||||||
|
E: serde::de::Error,
|
||||||
|
{
|
||||||
|
let v = self.0.alloc_str(v);
|
||||||
|
self.visit_borrowed_str(v)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn visit_u64<E>(self, v: u64) -> std::result::Result<Self::Value, E>
|
||||||
|
where
|
||||||
|
E: serde::de::Error,
|
||||||
|
{
|
||||||
|
use std::fmt::Write as _;
|
||||||
|
|
||||||
|
let mut out = bumpalo::collections::String::new_in(&self.0);
|
||||||
|
write!(&mut out, "{v}");
|
||||||
|
Ok(Ok(out.into_bump_str()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn visit_i64<E>(self, v: i64) -> std::result::Result<Self::Value, E>
|
||||||
|
where
|
||||||
|
E: serde::de::Error,
|
||||||
|
{
|
||||||
|
use std::fmt::Write as _;
|
||||||
|
|
||||||
|
let mut out = bumpalo::collections::String::new_in(&self.0);
|
||||||
|
write!(&mut out, "{v}");
|
||||||
|
Ok(Ok(out.into_bump_str()))
|
||||||
|
}
|
||||||
|
}
|
378
milli/src/update/new/indexer/document_changes.rs
Normal file
378
milli/src/update/new/indexer/document_changes.rs
Normal file
@ -0,0 +1,378 @@
|
|||||||
|
use std::cell::{Cell, RefCell};
|
||||||
|
use std::sync::{Arc, RwLock};
|
||||||
|
|
||||||
|
use bumpalo::Bump;
|
||||||
|
use heed::RoTxn;
|
||||||
|
use raw_collections::alloc::RefBump;
|
||||||
|
use rayon::iter::IndexedParallelIterator;
|
||||||
|
|
||||||
|
use super::super::document_change::DocumentChange;
|
||||||
|
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
|
||||||
|
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result};
|
||||||
|
|
||||||
|
/// A trait for types that are **not** [`Send`] only because they would then allow concurrent access to a type that is not [`Sync`].
|
||||||
|
///
|
||||||
|
/// The primary example of such a type is `&T`, with `T: !Sync`.
|
||||||
|
///
|
||||||
|
/// In the authors' understanding, a type can be `!Send` for two distinct reasons:
|
||||||
|
///
|
||||||
|
/// 1. Because it contains data that *genuinely* cannot be moved between threads, such as thread-local data.
|
||||||
|
/// 2. Because sending the type would allow concurrent access to a `!Sync` type, which is undefined behavior.
|
||||||
|
///
|
||||||
|
/// `MostlySend` exists to be used in bounds where you need a type whose data is **not** *attached* to a thread
|
||||||
|
/// because you might access it from a different thread, but where you will never access the type **concurrently** from
|
||||||
|
/// multiple threads.
|
||||||
|
///
|
||||||
|
/// Like [`Send`], `MostlySend` assumes properties on types that cannot be verified by the compiler, which is why implementing
|
||||||
|
/// this trait is unsafe.
|
||||||
|
///
|
||||||
|
/// # Safety
|
||||||
|
///
|
||||||
|
/// Implementers of this trait promises that the following properties hold on the implementing type:
|
||||||
|
///
|
||||||
|
/// 1. Its data can be accessed from any thread and will be the same regardless of the thread accessing it.
|
||||||
|
/// 2. Any operation that can be performed on the type does not depend on the thread that executes it.
|
||||||
|
///
|
||||||
|
/// As these properties are subtle and are not generally tracked by the Rust type system, great care should be taken before
|
||||||
|
/// implementing `MostlySend` on a type, especially a foreign type.
|
||||||
|
///
|
||||||
|
/// - An example of a type that verifies (1) and (2) is [`std::rc::Rc`] (when `T` is `Send` and `Sync`).
|
||||||
|
/// - An example of a type that doesn't verify (1) is thread-local data.
|
||||||
|
/// - An example of a type that doesn't verify (2) is [`std::sync::MutexGuard`]: a lot of mutex implementations require that
|
||||||
|
/// a lock is returned to the operating system on the same thread that initially locked the mutex, failing to uphold this
|
||||||
|
/// invariant will cause Undefined Behavior
|
||||||
|
/// (see last § in [the nomicon](https://doc.rust-lang.org/nomicon/send-and-sync.html)).
|
||||||
|
///
|
||||||
|
/// It is **always safe** to implement this trait on a type that is `Send`, but no placeholder impl is provided due to limitations in
|
||||||
|
/// coherency. Use the [`FullySend`] wrapper in this situation.
|
||||||
|
pub unsafe trait MostlySend {}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, Default, Eq, PartialEq, Ord, PartialOrd, Hash)]
|
||||||
|
pub struct FullySend<T>(pub T);
|
||||||
|
|
||||||
|
// SAFETY: a type **fully** send is always mostly send as well.
|
||||||
|
unsafe impl<T> MostlySend for FullySend<T> where T: Send {}
|
||||||
|
|
||||||
|
impl<T> FullySend<T> {
|
||||||
|
pub fn into(self) -> T {
|
||||||
|
self.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> From<T> for FullySend<T> {
|
||||||
|
fn from(value: T) -> Self {
|
||||||
|
Self(value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
|
struct MostlySendWrapper<T>(T);
|
||||||
|
|
||||||
|
impl<T: MostlySend> MostlySendWrapper<T> {
|
||||||
|
/// # Safety
|
||||||
|
///
|
||||||
|
/// - (P1) Users of this type will never access the type concurrently from multiple threads without synchronization
|
||||||
|
unsafe fn new(t: T) -> Self {
|
||||||
|
Self(t)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new_send(t: T) -> Self
|
||||||
|
where
|
||||||
|
T: Send,
|
||||||
|
{
|
||||||
|
Self(t)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get(&self) -> T
|
||||||
|
where
|
||||||
|
T: Copy,
|
||||||
|
{
|
||||||
|
self.0
|
||||||
|
}
|
||||||
|
|
||||||
|
fn as_ref(&self) -> &T {
|
||||||
|
&self.0
|
||||||
|
}
|
||||||
|
|
||||||
|
fn as_mut(&mut self) -> &mut T {
|
||||||
|
&mut self.0
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_inner(self) -> T {
|
||||||
|
self.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// # Safety
|
||||||
|
///
|
||||||
|
/// 1. `T` is [`MostlySend`], so by its safety contract it can be accessed by any thread and all of its operations are available
|
||||||
|
/// from any thread.
|
||||||
|
/// 2. (P1) of `MostlySendWrapper::new` forces the user to never access the value from multiple threads concurrently.
|
||||||
|
unsafe impl<T: MostlySend> Send for MostlySendWrapper<T> {}
|
||||||
|
|
||||||
|
/// A wrapper around [`thread_local::ThreadLocal`] that accepts [`MostlySend`] `T`s.
|
||||||
|
pub struct ThreadLocal<T: MostlySend> {
|
||||||
|
inner: thread_local::ThreadLocal<MostlySendWrapper<T>>,
|
||||||
|
// FIXME: this should be necessary
|
||||||
|
//_no_send: PhantomData<*mut ()>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: MostlySend> ThreadLocal<T> {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self { inner: thread_local::ThreadLocal::new() }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_capacity(capacity: usize) -> Self {
|
||||||
|
Self { inner: thread_local::ThreadLocal::with_capacity(capacity) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn clear(&mut self) {
|
||||||
|
self.inner.clear()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get(&self) -> Option<&T> {
|
||||||
|
self.inner.get().map(|t| t.as_ref())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_or<F>(&self, create: F) -> &T
|
||||||
|
where
|
||||||
|
F: FnOnce() -> T,
|
||||||
|
{
|
||||||
|
self.inner.get_or(|| unsafe { MostlySendWrapper::new(create()) }).as_ref()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_or_try<F, E>(&self, create: F) -> std::result::Result<&T, E>
|
||||||
|
where
|
||||||
|
F: FnOnce() -> std::result::Result<T, E>,
|
||||||
|
{
|
||||||
|
self.inner
|
||||||
|
.get_or_try(|| unsafe { Ok(MostlySendWrapper::new(create()?)) })
|
||||||
|
.map(MostlySendWrapper::as_ref)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_or_default(&self) -> &T
|
||||||
|
where
|
||||||
|
T: Default,
|
||||||
|
{
|
||||||
|
self.inner.get_or_default().as_ref()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn iter_mut(&mut self) -> IterMut<T> {
|
||||||
|
IterMut(self.inner.iter_mut())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: MostlySend> IntoIterator for ThreadLocal<T> {
|
||||||
|
type Item = T;
|
||||||
|
|
||||||
|
type IntoIter = IntoIter<T>;
|
||||||
|
|
||||||
|
fn into_iter(self) -> Self::IntoIter {
|
||||||
|
IntoIter(self.inner.into_iter())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct IterMut<'a, T: MostlySend>(thread_local::IterMut<'a, MostlySendWrapper<T>>);
|
||||||
|
|
||||||
|
impl<'a, T: MostlySend> Iterator for IterMut<'a, T> {
|
||||||
|
type Item = &'a mut T;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
self.0.next().map(|t| t.as_mut())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct IntoIter<T: MostlySend>(thread_local::IntoIter<MostlySendWrapper<T>>);
|
||||||
|
|
||||||
|
impl<T: MostlySend> Iterator for IntoIter<T> {
|
||||||
|
type Item = T;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
self.0.next().map(|t| t.into_inner())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DocumentChangeContext<
|
||||||
|
'doc, // covariant lifetime of a single `process` call
|
||||||
|
'extractor: 'doc, // invariant lifetime of the extractor_allocs
|
||||||
|
'fid: 'doc, // invariant lifetime of the new_fields_ids_map
|
||||||
|
'indexer: 'doc, // covariant lifetime of objects that outlive a single `process` call
|
||||||
|
T: MostlySend,
|
||||||
|
> {
|
||||||
|
/// The index we're indexing in
|
||||||
|
pub index: &'indexer Index,
|
||||||
|
/// The fields ids map as it was at the start of this indexing process. Contains at least all top-level fields from documents
|
||||||
|
/// inside of the DB.
|
||||||
|
pub db_fields_ids_map: &'indexer FieldsIdsMap,
|
||||||
|
/// A transaction providing data from the DB before all indexing operations
|
||||||
|
pub txn: RoTxn<'indexer>,
|
||||||
|
|
||||||
|
/// Global field id map that is up to date with the current state of the indexing process.
|
||||||
|
///
|
||||||
|
/// - Inserting a field will take a lock
|
||||||
|
/// - Retrieving a field may take a lock as well
|
||||||
|
pub new_fields_ids_map: &'doc std::cell::RefCell<GlobalFieldsIdsMap<'fid>>,
|
||||||
|
|
||||||
|
/// Data allocated in this allocator is cleared between each call to `process`.
|
||||||
|
pub doc_alloc: Bump,
|
||||||
|
|
||||||
|
/// Data allocated in this allocator is not cleared between each call to `process`, unless the data spills.
|
||||||
|
pub extractor_alloc: RefBump<'extractor>,
|
||||||
|
|
||||||
|
/// Pool of doc allocators, used to retrieve the doc allocator we provided for the documents
|
||||||
|
doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>,
|
||||||
|
|
||||||
|
/// Extractor-specific data
|
||||||
|
pub data: &'doc T,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<
|
||||||
|
'doc, // covariant lifetime of a single `process` call
|
||||||
|
'data: 'doc, // invariant on T lifetime of the datastore
|
||||||
|
'extractor: 'doc, // invariant lifetime of extractor_allocs
|
||||||
|
'fid: 'doc, // invariant lifetime of fields ids map
|
||||||
|
'indexer: 'doc, // covariant lifetime of objects that survive a `process` call
|
||||||
|
T: MostlySend,
|
||||||
|
> DocumentChangeContext<'doc, 'extractor, 'fid, 'indexer, T>
|
||||||
|
{
|
||||||
|
pub fn new<F>(
|
||||||
|
index: &'indexer Index,
|
||||||
|
db_fields_ids_map: &'indexer FieldsIdsMap,
|
||||||
|
new_fields_ids_map: &'fid RwLock<FieldsIdsMap>,
|
||||||
|
extractor_allocs: &'extractor ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
|
doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>,
|
||||||
|
datastore: &'data ThreadLocal<T>,
|
||||||
|
fields_ids_map_store: &'doc ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
|
||||||
|
init_data: F,
|
||||||
|
) -> Result<Self>
|
||||||
|
where
|
||||||
|
F: FnOnce(RefBump<'extractor>) -> Result<T>,
|
||||||
|
{
|
||||||
|
let doc_alloc =
|
||||||
|
doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024))));
|
||||||
|
let doc_alloc = doc_alloc.0.take();
|
||||||
|
let fields_ids_map = fields_ids_map_store
|
||||||
|
.get_or(|| RefCell::new(GlobalFieldsIdsMap::new(&new_fields_ids_map)).into());
|
||||||
|
|
||||||
|
let fields_ids_map = &fields_ids_map.0;
|
||||||
|
let extractor_alloc = extractor_allocs.get_or_default();
|
||||||
|
|
||||||
|
let extractor_alloc = RefBump::new(extractor_alloc.0.borrow());
|
||||||
|
|
||||||
|
let data = datastore.get_or_try(|| init_data(RefBump::clone(&extractor_alloc)))?;
|
||||||
|
|
||||||
|
let txn = index.read_txn()?;
|
||||||
|
Ok(DocumentChangeContext {
|
||||||
|
index,
|
||||||
|
txn,
|
||||||
|
db_fields_ids_map,
|
||||||
|
new_fields_ids_map: fields_ids_map,
|
||||||
|
doc_alloc,
|
||||||
|
extractor_alloc,
|
||||||
|
data,
|
||||||
|
doc_allocs,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An internal iterator (i.e. using `foreach`) of `DocumentChange`s
|
||||||
|
pub trait Extractor<'extractor>: Sync {
|
||||||
|
type Data: MostlySend;
|
||||||
|
|
||||||
|
fn init_data<'doc>(&'doc self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data>;
|
||||||
|
|
||||||
|
fn process<'doc>(
|
||||||
|
&'doc self,
|
||||||
|
change: DocumentChange<'doc>,
|
||||||
|
context: &'doc DocumentChangeContext<Self::Data>,
|
||||||
|
) -> Result<()>;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub trait DocumentChanges<'pl // lifetime of the underlying payload
|
||||||
|
>: Sync {
|
||||||
|
type Item;
|
||||||
|
|
||||||
|
fn iter(&self) -> impl IndexedParallelIterator<Item = Self::Item>;
|
||||||
|
|
||||||
|
fn item_to_document_change<'doc, // lifetime of a single `process` call
|
||||||
|
T: MostlySend>(
|
||||||
|
&'doc self,
|
||||||
|
context: &'doc DocumentChangeContext<T>,
|
||||||
|
item: Self::Item,
|
||||||
|
) -> Result<DocumentChange<'doc>> where 'pl: 'doc // the payload must survive the process calls
|
||||||
|
;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub struct IndexingContext<
|
||||||
|
'fid, // invariant lifetime of fields ids map
|
||||||
|
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||||
|
'index, // covariant lifetime of the index
|
||||||
|
> {
|
||||||
|
pub index: &'index Index,
|
||||||
|
pub db_fields_ids_map: &'indexer FieldsIdsMap,
|
||||||
|
pub new_fields_ids_map: &'fid RwLock<FieldsIdsMap>,
|
||||||
|
pub doc_allocs: &'indexer ThreadLocal<FullySend<Cell<Bump>>>,
|
||||||
|
pub fields_ids_map_store: &'indexer ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn for_each_document_change<
|
||||||
|
'pl, // covariant lifetime of the underlying payload
|
||||||
|
'extractor, // invariant lifetime of extractor_alloc
|
||||||
|
'fid, // invariant lifetime of fields ids map
|
||||||
|
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing
|
||||||
|
'data, // invariant on EX::Data lifetime of datastore
|
||||||
|
'index, // covariant lifetime of the index
|
||||||
|
EX,
|
||||||
|
DC: DocumentChanges<'pl>,
|
||||||
|
>(
|
||||||
|
document_changes: &DC,
|
||||||
|
extractor: &EX,
|
||||||
|
IndexingContext {
|
||||||
|
index,
|
||||||
|
db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
doc_allocs,
|
||||||
|
fields_ids_map_store,
|
||||||
|
}: IndexingContext<'fid, 'indexer, 'index>,
|
||||||
|
extractor_allocs: &'extractor mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
|
datastore: &'data ThreadLocal<EX::Data>,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
EX: Extractor<'extractor>,
|
||||||
|
{
|
||||||
|
// Clean up and reuse the extractor allocs
|
||||||
|
for extractor_alloc in extractor_allocs.iter_mut() {
|
||||||
|
extractor_alloc.0.get_mut().reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
let pi = document_changes.iter();
|
||||||
|
pi.try_arc_for_each_try_init(
|
||||||
|
|| {
|
||||||
|
DocumentChangeContext::new(
|
||||||
|
index,
|
||||||
|
db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
extractor_allocs,
|
||||||
|
doc_allocs,
|
||||||
|
datastore,
|
||||||
|
fields_ids_map_store,
|
||||||
|
move |index_alloc| extractor.init_data(index_alloc),
|
||||||
|
)
|
||||||
|
},
|
||||||
|
|context, item| {
|
||||||
|
// Clean up and reuse the document-specific allocator
|
||||||
|
context.doc_alloc.reset();
|
||||||
|
|
||||||
|
let change =
|
||||||
|
document_changes.item_to_document_change(context, item).map_err(Arc::new)?;
|
||||||
|
|
||||||
|
let res = extractor.process(change, context).map_err(Arc::new);
|
||||||
|
|
||||||
|
// send back the doc_alloc in the pool
|
||||||
|
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
|
||||||
|
|
||||||
|
res
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
@ -1,14 +1,14 @@
|
|||||||
use std::sync::Arc;
|
use bumpalo::collections::CollectIn;
|
||||||
|
use bumpalo::Bump;
|
||||||
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator};
|
use rayon::iter::{IntoParallelIterator, ParallelIterator as _};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::DocumentChanges;
|
use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend};
|
||||||
use crate::documents::PrimaryKey;
|
use crate::documents::PrimaryKey;
|
||||||
use crate::index::db_name::EXTERNAL_DOCUMENTS_IDS;
|
use crate::index::db_name::EXTERNAL_DOCUMENTS_IDS;
|
||||||
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
|
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
|
||||||
use crate::update::new::{Deletion, DocumentChange};
|
use crate::update::new::{Deletion, DocumentChange};
|
||||||
use crate::{Error, FieldsIdsMap, Index, InternalError, Result};
|
use crate::{DocumentId, InternalError, Result};
|
||||||
|
|
||||||
pub struct DocumentDeletion {
|
pub struct DocumentDeletion {
|
||||||
pub to_delete: RoaringBitmap,
|
pub to_delete: RoaringBitmap,
|
||||||
@ -22,38 +22,163 @@ impl DocumentDeletion {
|
|||||||
pub fn delete_documents_by_docids(&mut self, docids: RoaringBitmap) {
|
pub fn delete_documents_by_docids(&mut self, docids: RoaringBitmap) {
|
||||||
self.to_delete |= docids;
|
self.to_delete |= docids;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl<'p> DocumentChanges<'p> for DocumentDeletion {
|
pub fn into_changes<'indexer>(
|
||||||
type Parameter = (&'p Index, &'p FieldsIdsMap, &'p PrimaryKey<'p>);
|
|
||||||
|
|
||||||
fn document_changes(
|
|
||||||
self,
|
self,
|
||||||
_fields_ids_map: &mut FieldsIdsMap,
|
indexer: &'indexer Bump,
|
||||||
param: Self::Parameter,
|
primary_key: PrimaryKey<'indexer>,
|
||||||
) -> Result<
|
) -> DocumentDeletionChanges<'indexer> {
|
||||||
impl IndexedParallelIterator<Item = std::result::Result<DocumentChange, Arc<Error>>>
|
let to_delete: bumpalo::collections::Vec<_> =
|
||||||
+ Clone
|
self.to_delete.into_iter().collect_in(indexer);
|
||||||
+ 'p,
|
|
||||||
> {
|
let to_delete = to_delete.into_bump_slice();
|
||||||
let (index, fields_ids_map, primary_key) = param;
|
|
||||||
let to_delete: Vec<_> = self.to_delete.into_iter().collect();
|
DocumentDeletionChanges { to_delete, primary_key }
|
||||||
Ok(to_delete.into_par_iter().try_map_try_init(
|
}
|
||||||
|| index.read_txn().map_err(crate::Error::from),
|
}
|
||||||
|rtxn, docid| {
|
|
||||||
let current = index.document(rtxn, docid)?;
|
pub struct DocumentDeletionChanges<'indexer> {
|
||||||
let external_document_id = primary_key
|
to_delete: &'indexer [DocumentId],
|
||||||
.document_id(current, fields_ids_map)?
|
primary_key: PrimaryKey<'indexer>,
|
||||||
.map_err(|_| InternalError::DatabaseMissingEntry {
|
}
|
||||||
db_name: EXTERNAL_DOCUMENTS_IDS,
|
|
||||||
key: None,
|
impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
|
||||||
})?;
|
type Item = DocumentId;
|
||||||
Ok(DocumentChange::Deletion(Deletion::create(
|
|
||||||
docid,
|
fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator<Item = Self::Item> {
|
||||||
external_document_id,
|
self.to_delete.into_par_iter().copied()
|
||||||
current.boxed(),
|
}
|
||||||
)))
|
|
||||||
},
|
fn item_to_document_change<
|
||||||
))
|
'doc, // lifetime of a single `process` call
|
||||||
|
T: MostlySend,
|
||||||
|
>(
|
||||||
|
&'doc self,
|
||||||
|
context: &'doc DocumentChangeContext<T>,
|
||||||
|
docid: Self::Item,
|
||||||
|
) -> Result<DocumentChange<'doc>>
|
||||||
|
where
|
||||||
|
'pl: 'doc, // the payload must survive the process calls
|
||||||
|
{
|
||||||
|
let current = context.index.document(&context.txn, docid)?;
|
||||||
|
let new_fields_ids_map = context.new_fields_ids_map.borrow();
|
||||||
|
let new_fields_ids_map = new_fields_ids_map.local_map();
|
||||||
|
let external_document_id =
|
||||||
|
self.primary_key.document_id(current, new_fields_ids_map)?.map_err(|_| {
|
||||||
|
InternalError::DatabaseMissingEntry { db_name: EXTERNAL_DOCUMENTS_IDS, key: None }
|
||||||
|
})?;
|
||||||
|
Ok(DocumentChange::Deletion(Deletion::create(docid, external_document_id)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: implement Allocator for Ref<'bump, Bump>
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use std::cell::RefCell;
|
||||||
|
use std::marker::PhantomData;
|
||||||
|
use std::sync::RwLock;
|
||||||
|
|
||||||
|
use bumpalo::Bump;
|
||||||
|
use raw_collections::alloc::RefBump;
|
||||||
|
|
||||||
|
use crate::index::tests::TempIndex;
|
||||||
|
use crate::update::new::indexer::document_changes::{
|
||||||
|
for_each_document_change, DocumentChangeContext, Extractor, IndexingContext, MostlySend,
|
||||||
|
ThreadLocal,
|
||||||
|
};
|
||||||
|
use crate::update::new::indexer::DocumentDeletion;
|
||||||
|
use crate::update::new::DocumentChange;
|
||||||
|
use crate::DocumentId;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_deletions() {
|
||||||
|
struct DeletionWithData<'extractor> {
|
||||||
|
deleted: RefCell<
|
||||||
|
hashbrown::HashSet<
|
||||||
|
DocumentId,
|
||||||
|
hashbrown::hash_map::DefaultHashBuilder,
|
||||||
|
RefBump<'extractor>,
|
||||||
|
>,
|
||||||
|
>,
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe impl<'extractor> MostlySend for DeletionWithData<'extractor> {}
|
||||||
|
|
||||||
|
struct TrackDeletion<'extractor>(PhantomData<&'extractor ()>);
|
||||||
|
|
||||||
|
impl<'extractor> Extractor<'extractor> for TrackDeletion<'extractor> {
|
||||||
|
type Data = DeletionWithData<'extractor>;
|
||||||
|
|
||||||
|
fn init_data(
|
||||||
|
&self,
|
||||||
|
extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
|
||||||
|
) -> crate::Result<Self::Data> {
|
||||||
|
let deleted = RefCell::new(hashbrown::HashSet::new_in(extractor_alloc));
|
||||||
|
Ok(DeletionWithData { deleted })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn process(
|
||||||
|
&self,
|
||||||
|
change: DocumentChange,
|
||||||
|
context: &DocumentChangeContext<Self::Data>,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
context.data.deleted.borrow_mut().insert(change.docid());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut deletions = DocumentDeletion::new();
|
||||||
|
deletions.delete_documents_by_docids(vec![0, 2, 42].into_iter().collect());
|
||||||
|
let indexer = Bump::new();
|
||||||
|
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
|
let fields_ids_map = RwLock::new(db_fields_ids_map.clone());
|
||||||
|
|
||||||
|
let fields_ids_map_store = ThreadLocal::new();
|
||||||
|
|
||||||
|
let mut extractor_allocs = ThreadLocal::new();
|
||||||
|
let doc_allocs = ThreadLocal::new();
|
||||||
|
|
||||||
|
let deletion_tracker = TrackDeletion(PhantomData);
|
||||||
|
|
||||||
|
let changes = deletions
|
||||||
|
.into_changes(&indexer, crate::documents::PrimaryKey::Flat { name: "id", field_id: 0 });
|
||||||
|
|
||||||
|
let context = IndexingContext {
|
||||||
|
index: &index,
|
||||||
|
db_fields_ids_map: &db_fields_ids_map,
|
||||||
|
new_fields_ids_map: &fields_ids_map,
|
||||||
|
doc_allocs: &doc_allocs,
|
||||||
|
fields_ids_map_store: &fields_ids_map_store,
|
||||||
|
};
|
||||||
|
|
||||||
|
for _ in 0..3 {
|
||||||
|
let datastore = ThreadLocal::new();
|
||||||
|
|
||||||
|
for_each_document_change(
|
||||||
|
&changes,
|
||||||
|
&deletion_tracker,
|
||||||
|
context,
|
||||||
|
&mut extractor_allocs,
|
||||||
|
&datastore,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
for (index, data) in datastore.into_iter().enumerate() {
|
||||||
|
println!("deleted by {index}: {:?}", data.deleted.borrow());
|
||||||
|
}
|
||||||
|
for alloc in extractor_allocs.iter_mut() {
|
||||||
|
let alloc = &mut alloc.0;
|
||||||
|
alloc.get_mut().reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
drop(deletion_tracker);
|
||||||
|
drop(changes);
|
||||||
|
drop(rtxn);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,19 +1,18 @@
|
|||||||
use std::borrow::Cow;
|
use bumpalo::collections::CollectIn;
|
||||||
use std::collections::{BTreeMap, HashMap};
|
use bumpalo::Bump;
|
||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use heed::types::Bytes;
|
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator};
|
use rayon::iter::IntoParallelIterator;
|
||||||
|
use serde_json::value::RawValue;
|
||||||
use IndexDocumentsMethod as Idm;
|
use IndexDocumentsMethod as Idm;
|
||||||
|
|
||||||
use super::super::document_change::DocumentChange;
|
use super::super::document_change::DocumentChange;
|
||||||
use super::super::{CowStr, TopLevelMap};
|
use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend};
|
||||||
use super::DocumentChanges;
|
|
||||||
use crate::documents::{DocumentIdExtractionError, PrimaryKey};
|
use crate::documents::{DocumentIdExtractionError, PrimaryKey};
|
||||||
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
|
use crate::update::new::document::DocumentFromVersions;
|
||||||
use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update};
|
use crate::update::new::document_change::Versions;
|
||||||
|
use crate::update::new::indexer::de::DocumentVisitor;
|
||||||
|
use crate::update::new::{Deletion, Insertion, Update};
|
||||||
use crate::update::{AvailableIds, IndexDocumentsMethod};
|
use crate::update::{AvailableIds, IndexDocumentsMethod};
|
||||||
use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError};
|
use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError};
|
||||||
|
|
||||||
@ -22,9 +21,14 @@ pub struct DocumentOperation<'pl> {
|
|||||||
index_documents_method: IndexDocumentsMethod,
|
index_documents_method: IndexDocumentsMethod,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct DocumentOperationChanges<'pl> {
|
||||||
|
docids_version_offsets: &'pl [(&'pl str, ((u32, bool), &'pl [InnerDocOp<'pl>]))],
|
||||||
|
index_documents_method: IndexDocumentsMethod,
|
||||||
|
}
|
||||||
|
|
||||||
pub enum Payload<'pl> {
|
pub enum Payload<'pl> {
|
||||||
Addition(&'pl [u8]),
|
Addition(&'pl [u8]),
|
||||||
Deletion(Vec<String>),
|
Deletion(&'pl [&'pl str]),
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct PayloadStats {
|
pub struct PayloadStats {
|
||||||
@ -33,7 +37,7 @@ pub struct PayloadStats {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
enum InnerDocOp<'pl> {
|
pub enum InnerDocOp<'pl> {
|
||||||
Addition(DocumentOffset<'pl>),
|
Addition(DocumentOffset<'pl>),
|
||||||
Deletion,
|
Deletion,
|
||||||
}
|
}
|
||||||
@ -61,83 +65,89 @@ impl<'pl> DocumentOperation<'pl> {
|
|||||||
Ok(PayloadStats { bytes: payload.len() as u64, document_count })
|
Ok(PayloadStats { bytes: payload.len() as u64, document_count })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn delete_documents(&mut self, to_delete: Vec<String>) {
|
pub fn delete_documents(&mut self, to_delete: &'pl [&'pl str]) {
|
||||||
self.operations.push(Payload::Deletion(to_delete))
|
self.operations.push(Payload::Deletion(to_delete))
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> {
|
pub fn into_changes(
|
||||||
type Parameter = (&'p Index, &'p RoTxn<'p>, &'p PrimaryKey<'p>);
|
|
||||||
|
|
||||||
fn document_changes(
|
|
||||||
self,
|
self,
|
||||||
fields_ids_map: &mut FieldsIdsMap,
|
indexer: &'pl Bump,
|
||||||
param: Self::Parameter,
|
index: &Index,
|
||||||
) -> Result<
|
rtxn: &RoTxn,
|
||||||
impl IndexedParallelIterator<Item = std::result::Result<DocumentChange, Arc<Error>>>
|
primary_key: &PrimaryKey,
|
||||||
+ Clone
|
new_fields_ids_map: &mut FieldsIdsMap,
|
||||||
+ 'p,
|
) -> Result<DocumentOperationChanges<'pl>> {
|
||||||
> {
|
use serde::de::Deserializer;
|
||||||
let (index, rtxn, primary_key) = param;
|
// will contain nodes from the intermediate hashmap
|
||||||
|
let document_changes_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1 MiB
|
||||||
|
|
||||||
let documents_ids = index.documents_ids(rtxn)?;
|
let documents_ids = index.documents_ids(rtxn)?;
|
||||||
let mut available_docids = AvailableIds::new(&documents_ids);
|
let mut available_docids = AvailableIds::new(&documents_ids);
|
||||||
let mut docids_version_offsets = HashMap::<CowStr<'pl>, _>::new();
|
let mut docids_version_offsets =
|
||||||
|
hashbrown::HashMap::<&'pl str, _, _, _>::new_in(&document_changes_alloc);
|
||||||
|
|
||||||
for operation in self.operations {
|
for operation in self.operations {
|
||||||
match operation {
|
match operation {
|
||||||
Payload::Addition(payload) => {
|
Payload::Addition(payload) => {
|
||||||
let mut iter =
|
let mut iter =
|
||||||
serde_json::Deserializer::from_slice(payload).into_iter::<TopLevelMap>();
|
serde_json::Deserializer::from_slice(payload).into_iter::<&RawValue>();
|
||||||
|
|
||||||
/// TODO manage the error
|
/// TODO manage the error
|
||||||
let mut previous_offset = 0;
|
let mut previous_offset = 0;
|
||||||
while let Some(document) = iter.next().transpose().unwrap() {
|
while let Some(document) =
|
||||||
// TODO Fetch all document fields to fill the fields ids map
|
iter.next().transpose().map_err(UserError::SerdeJson)?
|
||||||
document.0.keys().for_each(|key| {
|
{
|
||||||
fields_ids_map.insert(key.as_ref());
|
let res = document
|
||||||
});
|
.deserialize_map(DocumentVisitor::new(
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
indexer,
|
||||||
|
))
|
||||||
|
.map_err(UserError::SerdeJson)?;
|
||||||
|
|
||||||
// TODO we must manage the TooManyDocumentIds,InvalidDocumentId
|
let external_document_id = match res {
|
||||||
// we must manage the unwrap
|
Ok(document_id) => Ok(document_id),
|
||||||
let external_document_id =
|
Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e),
|
||||||
match primary_key.document_id_from_top_level_map(&document)? {
|
Err(DocumentIdExtractionError::MissingDocumentId) => {
|
||||||
Ok(document_id) => Ok(document_id),
|
Err(UserError::MissingDocumentId {
|
||||||
Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e),
|
primary_key: primary_key.name().to_string(),
|
||||||
Err(DocumentIdExtractionError::MissingDocumentId) => {
|
document: serde_json::from_str(document.get()).unwrap(),
|
||||||
Err(UserError::MissingDocumentId {
|
})
|
||||||
primary_key: primary_key.name().to_string(),
|
}
|
||||||
document: document.try_into().unwrap(),
|
Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
|
||||||
})
|
Err(UserError::TooManyDocumentIds {
|
||||||
}
|
primary_key: primary_key.name().to_string(),
|
||||||
Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
|
document: serde_json::from_str(document.get()).unwrap(),
|
||||||
Err(UserError::TooManyDocumentIds {
|
})
|
||||||
primary_key: primary_key.name().to_string(),
|
}
|
||||||
document: document.try_into().unwrap(),
|
}?;
|
||||||
})
|
|
||||||
}
|
|
||||||
}?;
|
|
||||||
|
|
||||||
let current_offset = iter.byte_offset();
|
let current_offset = iter.byte_offset();
|
||||||
let document_operation = InnerDocOp::Addition(DocumentOffset {
|
let document_operation = InnerDocOp::Addition(DocumentOffset {
|
||||||
content: &payload[previous_offset..current_offset],
|
content: &payload[previous_offset..current_offset],
|
||||||
});
|
});
|
||||||
|
|
||||||
match docids_version_offsets.get_mut(external_document_id.as_ref()) {
|
match docids_version_offsets.get_mut(external_document_id) {
|
||||||
None => {
|
None => {
|
||||||
let docid = match index
|
let (docid, is_new) = match index
|
||||||
.external_documents_ids()
|
.external_documents_ids()
|
||||||
.get(rtxn, &external_document_id)?
|
.get(rtxn, &external_document_id)?
|
||||||
{
|
{
|
||||||
Some(docid) => docid,
|
Some(docid) => (docid, false),
|
||||||
None => available_docids
|
None => (
|
||||||
.next()
|
available_docids.next().ok_or(Error::UserError(
|
||||||
.ok_or(Error::UserError(UserError::DocumentLimitReached))?,
|
UserError::DocumentLimitReached,
|
||||||
|
))?,
|
||||||
|
true,
|
||||||
|
),
|
||||||
};
|
};
|
||||||
|
|
||||||
docids_version_offsets.insert(
|
docids_version_offsets.insert(
|
||||||
external_document_id,
|
external_document_id,
|
||||||
(docid, vec![document_operation]),
|
(
|
||||||
|
(docid, is_new),
|
||||||
|
bumpalo::vec![in indexer; document_operation],
|
||||||
|
),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
Some((_, offsets)) => {
|
Some((_, offsets)) => {
|
||||||
@ -163,21 +173,27 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> {
|
|||||||
}
|
}
|
||||||
Payload::Deletion(to_delete) => {
|
Payload::Deletion(to_delete) => {
|
||||||
for external_document_id in to_delete {
|
for external_document_id in to_delete {
|
||||||
match docids_version_offsets.get_mut(external_document_id.as_str()) {
|
match docids_version_offsets.get_mut(external_document_id) {
|
||||||
None => {
|
None => {
|
||||||
let docid = match index
|
let (docid, is_new) = match index
|
||||||
.external_documents_ids()
|
.external_documents_ids()
|
||||||
.get(rtxn, &external_document_id)?
|
.get(rtxn, external_document_id)?
|
||||||
{
|
{
|
||||||
Some(docid) => docid,
|
Some(docid) => (docid, false),
|
||||||
None => available_docids
|
None => (
|
||||||
.next()
|
available_docids.next().ok_or(Error::UserError(
|
||||||
.ok_or(Error::UserError(UserError::DocumentLimitReached))?,
|
UserError::DocumentLimitReached,
|
||||||
|
))?,
|
||||||
|
true,
|
||||||
|
),
|
||||||
};
|
};
|
||||||
|
|
||||||
docids_version_offsets.insert(
|
docids_version_offsets.insert(
|
||||||
CowStr(external_document_id.into()),
|
external_document_id,
|
||||||
(docid, vec![InnerDocOp::Deletion]),
|
(
|
||||||
|
(docid, is_new),
|
||||||
|
bumpalo::vec![in indexer; InnerDocOp::Deletion],
|
||||||
|
),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
Some((_, offsets)) => {
|
Some((_, offsets)) => {
|
||||||
@ -190,10 +206,11 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// TODO is it the best way to provide FieldsIdsMap to the parallel iterator?
|
|
||||||
let fields_ids_map = fields_ids_map.clone();
|
|
||||||
// TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
|
// TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
|
||||||
let mut docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect();
|
let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> = docids_version_offsets
|
||||||
|
.drain()
|
||||||
|
.map(|(item, (docid, v))| (item, (docid, v.into_bump_slice())))
|
||||||
|
.collect_in(indexer);
|
||||||
// Reorder the offsets to make sure we iterate on the file sequentially
|
// Reorder the offsets to make sure we iterate on the file sequentially
|
||||||
let sort_function_key = match self.index_documents_method {
|
let sort_function_key = match self.index_documents_method {
|
||||||
Idm::ReplaceDocuments => MergeDocumentForReplacement::sort_key,
|
Idm::ReplaceDocuments => MergeDocumentForReplacement::sort_key,
|
||||||
@ -202,43 +219,61 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> {
|
|||||||
|
|
||||||
// And finally sort them
|
// And finally sort them
|
||||||
docids_version_offsets.sort_unstable_by_key(|(_, (_, docops))| sort_function_key(docops));
|
docids_version_offsets.sort_unstable_by_key(|(_, (_, docops))| sort_function_key(docops));
|
||||||
|
let docids_version_offsets = docids_version_offsets.into_bump_slice();
|
||||||
|
Ok(DocumentOperationChanges {
|
||||||
|
docids_version_offsets,
|
||||||
|
index_documents_method: self.index_documents_method,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(docids_version_offsets.into_par_iter().try_map_try_init(
|
impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> {
|
||||||
|| index.read_txn().map_err(Error::from),
|
type Item = &'pl (&'pl str, ((u32, bool), &'pl [InnerDocOp<'pl>]));
|
||||||
move |rtxn, (external_docid, (internal_docid, operations))| {
|
|
||||||
let document_merge_function = match self.index_documents_method {
|
|
||||||
Idm::ReplaceDocuments => MergeDocumentForReplacement::merge,
|
|
||||||
Idm::UpdateDocuments => MergeDocumentForUpdates::merge,
|
|
||||||
};
|
|
||||||
|
|
||||||
document_merge_function(
|
fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator<Item = Self::Item> {
|
||||||
rtxn,
|
self.docids_version_offsets.into_par_iter()
|
||||||
index,
|
}
|
||||||
&fields_ids_map,
|
|
||||||
internal_docid,
|
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
||||||
external_docid.to_string(), // TODO do not clone
|
&'doc self,
|
||||||
&operations,
|
context: &'doc DocumentChangeContext<T>,
|
||||||
)
|
item: Self::Item,
|
||||||
},
|
) -> Result<DocumentChange<'doc>>
|
||||||
))
|
where
|
||||||
|
'pl: 'doc,
|
||||||
|
{
|
||||||
|
let document_merge_function = match self.index_documents_method {
|
||||||
|
Idm::ReplaceDocuments => MergeDocumentForReplacement::merge,
|
||||||
|
Idm::UpdateDocuments => MergeDocumentForUpdates::merge,
|
||||||
|
};
|
||||||
|
|
||||||
|
let (external_doc, ((internal_docid, is_new), operations)) = *item;
|
||||||
|
|
||||||
|
let change = document_merge_function(
|
||||||
|
internal_docid,
|
||||||
|
external_doc,
|
||||||
|
is_new,
|
||||||
|
&context.doc_alloc,
|
||||||
|
operations,
|
||||||
|
)?;
|
||||||
|
Ok(change)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
trait MergeChanges {
|
trait MergeChanges {
|
||||||
/// Wether the payloads in the list of operations are useless or not.
|
/// Whether the payloads in the list of operations are useless or not.
|
||||||
const USELESS_PREVIOUS_CHANGES: bool;
|
const USELESS_PREVIOUS_CHANGES: bool;
|
||||||
|
|
||||||
/// Returns a key that is used to order the payloads the right way.
|
/// Returns a key that is used to order the payloads the right way.
|
||||||
fn sort_key(docops: &[InnerDocOp]) -> usize;
|
fn sort_key(docops: &[InnerDocOp]) -> usize;
|
||||||
|
|
||||||
fn merge(
|
fn merge<'doc>(
|
||||||
rtxn: &RoTxn,
|
|
||||||
index: &Index,
|
|
||||||
fields_ids_map: &FieldsIdsMap,
|
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_docid: String,
|
external_docid: &'doc str,
|
||||||
operations: &[InnerDocOp],
|
is_new: bool,
|
||||||
) -> Result<DocumentChange>;
|
doc_alloc: &'doc Bump,
|
||||||
|
operations: &'doc [InnerDocOp],
|
||||||
|
) -> Result<DocumentChange<'doc>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct MergeDocumentForReplacement;
|
struct MergeDocumentForReplacement;
|
||||||
@ -258,48 +293,42 @@ impl MergeChanges for MergeDocumentForReplacement {
|
|||||||
/// Returns only the most recent version of a document based on the updates from the payloads.
|
/// Returns only the most recent version of a document based on the updates from the payloads.
|
||||||
///
|
///
|
||||||
/// This function is only meant to be used when doing a replacement and not an update.
|
/// This function is only meant to be used when doing a replacement and not an update.
|
||||||
fn merge(
|
fn merge<'doc>(
|
||||||
rtxn: &RoTxn,
|
|
||||||
index: &Index,
|
|
||||||
fields_ids_map: &FieldsIdsMap,
|
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_docid: String,
|
external_doc: &'doc str,
|
||||||
operations: &[InnerDocOp],
|
is_new: bool,
|
||||||
) -> Result<DocumentChange> {
|
doc_alloc: &'doc Bump,
|
||||||
let current = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
|
operations: &'doc [InnerDocOp],
|
||||||
let current: Option<&KvReaderFieldId> = current.map(Into::into);
|
) -> Result<DocumentChange<'doc>> {
|
||||||
|
|
||||||
match operations.last() {
|
match operations.last() {
|
||||||
Some(InnerDocOp::Addition(DocumentOffset { content })) => {
|
Some(InnerDocOp::Addition(DocumentOffset { content })) => {
|
||||||
let map: TopLevelMap = serde_json::from_slice(content).unwrap();
|
let document = serde_json::from_slice(content).unwrap();
|
||||||
let mut document_entries = Vec::new();
|
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
|
||||||
for (key, v) in map.0 {
|
.map_err(UserError::SerdeJson)?;
|
||||||
let id = fields_ids_map.id(key.as_ref()).unwrap();
|
|
||||||
document_entries.push((id, v));
|
|
||||||
}
|
|
||||||
|
|
||||||
document_entries.sort_unstable_by_key(|(id, _)| *id);
|
let document = document.into_bump_slice();
|
||||||
|
let document = DocumentFromVersions::new(Versions::Single(document));
|
||||||
|
|
||||||
let mut writer = KvWriterFieldId::memory();
|
if is_new {
|
||||||
document_entries
|
Ok(DocumentChange::Insertion(Insertion::create(
|
||||||
.into_iter()
|
docid,
|
||||||
.for_each(|(id, value)| writer.insert(id, value.get()).unwrap());
|
external_doc.to_owned(),
|
||||||
let new = writer.into_boxed();
|
document,
|
||||||
|
)))
|
||||||
match current {
|
} else {
|
||||||
Some(current) => {
|
Ok(DocumentChange::Update(Update::create(
|
||||||
let update = Update::create(docid, external_docid, current.boxed(), new);
|
docid,
|
||||||
Ok(DocumentChange::Update(update))
|
external_doc.to_owned(),
|
||||||
}
|
document,
|
||||||
None => {
|
true,
|
||||||
Ok(DocumentChange::Insertion(Insertion::create(docid, external_docid, new)))
|
)))
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Some(InnerDocOp::Deletion) => {
|
Some(InnerDocOp::Deletion) => {
|
||||||
let deletion = match current {
|
let deletion = if is_new {
|
||||||
Some(current) => Deletion::create(docid, external_docid, current.boxed()),
|
Deletion::create(docid, external_doc.to_owned())
|
||||||
None => todo!("Do that with Louis"),
|
} else {
|
||||||
|
todo!("Do that with Louis")
|
||||||
};
|
};
|
||||||
Ok(DocumentChange::Deletion(deletion))
|
Ok(DocumentChange::Deletion(deletion))
|
||||||
}
|
}
|
||||||
@ -326,18 +355,13 @@ impl MergeChanges for MergeDocumentForUpdates {
|
|||||||
/// in the grenad update files and merges them to generate a new boxed obkv.
|
/// in the grenad update files and merges them to generate a new boxed obkv.
|
||||||
///
|
///
|
||||||
/// This function is only meant to be used when doing an update and not a replacement.
|
/// This function is only meant to be used when doing an update and not a replacement.
|
||||||
fn merge(
|
fn merge<'doc>(
|
||||||
rtxn: &RoTxn,
|
|
||||||
index: &Index,
|
|
||||||
fields_ids_map: &FieldsIdsMap,
|
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_docid: String,
|
external_docid: &'doc str,
|
||||||
operations: &[InnerDocOp],
|
is_new: bool,
|
||||||
) -> Result<DocumentChange> {
|
doc_alloc: &'doc Bump,
|
||||||
let mut document = BTreeMap::<_, Cow<_>>::new();
|
operations: &'doc [InnerDocOp],
|
||||||
let current = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
|
) -> Result<DocumentChange<'doc>> {
|
||||||
let current: Option<&KvReaderFieldId> = current.map(Into::into);
|
|
||||||
|
|
||||||
if operations.is_empty() {
|
if operations.is_empty() {
|
||||||
unreachable!("We must not have empty set of operations on a document");
|
unreachable!("We must not have empty set of operations on a document");
|
||||||
}
|
}
|
||||||
@ -345,24 +369,20 @@ impl MergeChanges for MergeDocumentForUpdates {
|
|||||||
let last_deletion = operations.iter().rposition(|op| matches!(op, InnerDocOp::Deletion));
|
let last_deletion = operations.iter().rposition(|op| matches!(op, InnerDocOp::Deletion));
|
||||||
let operations = &operations[last_deletion.map_or(0, |i| i + 1)..];
|
let operations = &operations[last_deletion.map_or(0, |i| i + 1)..];
|
||||||
|
|
||||||
// If there was a deletion we must not start
|
let has_deletion = last_deletion.is_some();
|
||||||
// from the original document but from scratch.
|
|
||||||
if last_deletion.is_none() {
|
|
||||||
if let Some(current) = current {
|
|
||||||
current.into_iter().for_each(|(k, v)| {
|
|
||||||
document.insert(k, v.into());
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if operations.is_empty() {
|
if operations.is_empty() {
|
||||||
let deletion = match current {
|
let deletion = if !is_new {
|
||||||
Some(current) => Deletion::create(docid, external_docid, current.boxed()),
|
Deletion::create(docid, external_docid.to_owned())
|
||||||
None => todo!("Do that with Louis"),
|
} else {
|
||||||
|
todo!("Do that with Louis")
|
||||||
};
|
};
|
||||||
|
|
||||||
return Ok(DocumentChange::Deletion(deletion));
|
return Ok(DocumentChange::Deletion(deletion));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut versions = bumpalo::collections::Vec::with_capacity_in(operations.len(), doc_alloc);
|
||||||
|
|
||||||
for operation in operations {
|
for operation in operations {
|
||||||
let DocumentOffset { content } = match operation {
|
let DocumentOffset { content } = match operation {
|
||||||
InnerDocOp::Addition(offset) => offset,
|
InnerDocOp::Addition(offset) => offset,
|
||||||
@ -371,26 +391,35 @@ impl MergeChanges for MergeDocumentForUpdates {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let map: TopLevelMap = serde_json::from_slice(content).unwrap();
|
let document = serde_json::from_slice(content).unwrap();
|
||||||
for (key, v) in map.0 {
|
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
|
||||||
let id = fields_ids_map.id(key.as_ref()).unwrap();
|
.map_err(UserError::SerdeJson)?;
|
||||||
document.insert(id, v.get().as_bytes().to_vec().into());
|
|
||||||
}
|
let document = document.into_bump_slice();
|
||||||
|
versions.push(document);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut writer = KvWriterFieldId::memory();
|
let versions = versions.into_bump_slice();
|
||||||
document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap());
|
let versions = match versions {
|
||||||
let new = writer.into_boxed();
|
[single] => Versions::Single(*single),
|
||||||
|
versions => Versions::Multiple(versions),
|
||||||
|
};
|
||||||
|
|
||||||
match current {
|
let document = DocumentFromVersions::new(versions);
|
||||||
Some(current) => {
|
|
||||||
let update = Update::create(docid, external_docid, current.boxed(), new);
|
if is_new {
|
||||||
Ok(DocumentChange::Update(update))
|
Ok(DocumentChange::Insertion(Insertion::create(
|
||||||
}
|
docid,
|
||||||
None => {
|
external_docid.to_owned(),
|
||||||
let insertion = Insertion::create(docid, external_docid, new);
|
document,
|
||||||
Ok(DocumentChange::Insertion(insertion))
|
)))
|
||||||
}
|
} else {
|
||||||
|
Ok(DocumentChange::Update(Update::create(
|
||||||
|
docid,
|
||||||
|
external_docid.to_owned(),
|
||||||
|
document,
|
||||||
|
has_deletion,
|
||||||
|
)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,7 +1,12 @@
|
|||||||
|
use std::cell::RefCell;
|
||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
use std::thread::{self, Builder};
|
use std::thread::{self, Builder};
|
||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
|
use bumpalo::Bump;
|
||||||
|
use document_changes::{
|
||||||
|
for_each_document_change, DocumentChanges, Extractor, FullySend, IndexingContext, ThreadLocal,
|
||||||
|
};
|
||||||
pub use document_deletion::DocumentDeletion;
|
pub use document_deletion::DocumentDeletion;
|
||||||
pub use document_operation::DocumentOperation;
|
pub use document_operation::DocumentOperation;
|
||||||
use heed::{RoTxn, RwTxn};
|
use heed::{RoTxn, RwTxn};
|
||||||
@ -11,6 +16,7 @@ use rayon::ThreadPool;
|
|||||||
pub use update_by_function::UpdateByFunction;
|
pub use update_by_function::UpdateByFunction;
|
||||||
|
|
||||||
use super::channel::*;
|
use super::channel::*;
|
||||||
|
use super::document::write_to_obkv;
|
||||||
use super::document_change::{Deletion, DocumentChange, Insertion, Update};
|
use super::document_change::{Deletion, DocumentChange, Insertion, Update};
|
||||||
use super::extract::*;
|
use super::extract::*;
|
||||||
use super::merger::{merge_grenad_entries, FacetFieldIdsDelta};
|
use super::merger::{merge_grenad_entries, FacetFieldIdsDelta};
|
||||||
@ -18,32 +24,75 @@ use super::word_fst_builder::PrefixDelta;
|
|||||||
use super::words_prefix_docids::{
|
use super::words_prefix_docids::{
|
||||||
compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids,
|
compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids,
|
||||||
};
|
};
|
||||||
use super::{StdResult, TopLevelMap};
|
use super::{extract, StdResult, TopLevelMap};
|
||||||
use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
|
use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::update::new::channel::ExtractorSender;
|
use crate::update::new::channel::ExtractorSender;
|
||||||
use crate::update::settings::InnerIndexSettings;
|
|
||||||
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt;
|
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt;
|
||||||
use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
use crate::update::settings::InnerIndexSettings;
|
||||||
use crate::update::{FacetsUpdateBulk, GrenadParameters};
|
use crate::update::{FacetsUpdateBulk, GrenadParameters};
|
||||||
|
use crate::{fields_ids_map, Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
||||||
|
|
||||||
|
mod de;
|
||||||
|
pub mod document_changes;
|
||||||
mod document_deletion;
|
mod document_deletion;
|
||||||
mod document_operation;
|
mod document_operation;
|
||||||
mod partial_dump;
|
mod partial_dump;
|
||||||
mod update_by_function;
|
mod update_by_function;
|
||||||
|
|
||||||
pub trait DocumentChanges<'p> {
|
struct DocumentExtractor<'a> {
|
||||||
type Parameter: 'p;
|
document_sender: &'a DocumentSender<'a>,
|
||||||
|
}
|
||||||
|
|
||||||
fn document_changes(
|
impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> {
|
||||||
self,
|
type Data = FullySend<()>;
|
||||||
fields_ids_map: &mut FieldsIdsMap,
|
|
||||||
param: Self::Parameter,
|
fn init_data(
|
||||||
) -> Result<
|
&self,
|
||||||
impl IndexedParallelIterator<Item = std::result::Result<DocumentChange, Arc<Error>>>
|
extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
|
||||||
+ Clone
|
) -> Result<Self::Data> {
|
||||||
+ 'p,
|
Ok(FullySend(()))
|
||||||
>;
|
}
|
||||||
|
|
||||||
|
fn process(
|
||||||
|
&self,
|
||||||
|
change: DocumentChange,
|
||||||
|
context: &document_changes::DocumentChangeContext<Self::Data>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut document_buffer = Vec::new();
|
||||||
|
|
||||||
|
let new_fields_ids_map = context.new_fields_ids_map.borrow();
|
||||||
|
let new_fields_ids_map = &*new_fields_ids_map;
|
||||||
|
let new_fields_ids_map = new_fields_ids_map.local_map();
|
||||||
|
|
||||||
|
let external_docid = change.external_docid().to_owned();
|
||||||
|
|
||||||
|
// document but we need to create a function that collects and compresses documents.
|
||||||
|
match change {
|
||||||
|
DocumentChange::Deletion(deletion) => {
|
||||||
|
let docid = deletion.docid();
|
||||||
|
self.document_sender.delete(docid, external_docid).unwrap();
|
||||||
|
}
|
||||||
|
/// TODO: change NONE by SOME(vector) when implemented
|
||||||
|
DocumentChange::Update(update) => {
|
||||||
|
let docid = update.docid();
|
||||||
|
let content =
|
||||||
|
update.new(&context.txn, context.index, &context.db_fields_ids_map)?;
|
||||||
|
let content =
|
||||||
|
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
|
||||||
|
self.document_sender.insert(docid, external_docid, content.boxed()).unwrap();
|
||||||
|
}
|
||||||
|
DocumentChange::Insertion(insertion) => {
|
||||||
|
let docid = insertion.docid();
|
||||||
|
let content = insertion.new();
|
||||||
|
let content =
|
||||||
|
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
|
||||||
|
self.document_sender.insert(docid, external_docid, content.boxed()).unwrap();
|
||||||
|
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This is the main function of this crate.
|
/// This is the main function of this crate.
|
||||||
@ -51,25 +100,34 @@ pub trait DocumentChanges<'p> {
|
|||||||
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
|
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
|
||||||
///
|
///
|
||||||
/// TODO return stats
|
/// TODO return stats
|
||||||
pub fn index<PI>(
|
pub fn index<'pl, 'indexer, 'index, DC>(
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
index: &Index,
|
index: &'index Index,
|
||||||
fields_ids_map: FieldsIdsMap,
|
db_fields_ids_map: &'indexer FieldsIdsMap,
|
||||||
|
new_fields_ids_map: FieldsIdsMap,
|
||||||
pool: &ThreadPool,
|
pool: &ThreadPool,
|
||||||
document_changes: PI,
|
document_changes: &DC,
|
||||||
) -> Result<()>
|
) -> Result<()>
|
||||||
where
|
where
|
||||||
PI: IndexedParallelIterator<Item = std::result::Result<DocumentChange, Arc<Error>>>
|
DC: DocumentChanges<'pl>,
|
||||||
+ Send
|
|
||||||
+ Clone,
|
|
||||||
{
|
{
|
||||||
let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
|
let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
|
||||||
// This channel acts as a rendezvous point to ensure that we are one task ahead
|
// This channel acts as a rendezvous point to ensure that we are one task ahead
|
||||||
let (extractor_sender, merger_receiver) = extractors_merger_channels(4);
|
let (extractor_sender, merger_receiver) = extractors_merger_channels(4);
|
||||||
|
|
||||||
let fields_ids_map_lock = RwLock::new(fields_ids_map);
|
let new_fields_ids_map = RwLock::new(new_fields_ids_map);
|
||||||
let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
|
|
||||||
let global_fields_ids_map_clone = global_fields_ids_map.clone();
|
let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads());
|
||||||
|
let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads());
|
||||||
|
let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads());
|
||||||
|
|
||||||
|
let indexing_context = IndexingContext {
|
||||||
|
index,
|
||||||
|
db_fields_ids_map,
|
||||||
|
new_fields_ids_map: &new_fields_ids_map,
|
||||||
|
doc_allocs: &doc_allocs,
|
||||||
|
fields_ids_map_store: &fields_ids_map_store,
|
||||||
|
};
|
||||||
|
|
||||||
thread::scope(|s| {
|
thread::scope(|s| {
|
||||||
let indexer_span = tracing::Span::current();
|
let indexer_span = tracing::Span::current();
|
||||||
@ -78,26 +136,12 @@ where
|
|||||||
pool.in_place_scope(|_s| {
|
pool.in_place_scope(|_s| {
|
||||||
let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract");
|
let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
let document_changes = document_changes.into_par_iter();
|
|
||||||
|
|
||||||
// document but we need to create a function that collects and compresses documents.
|
// document but we need to create a function that collects and compresses documents.
|
||||||
let document_sender = extractor_sender.document_sender();
|
let document_sender = extractor_sender.document_sender();
|
||||||
document_changes.clone().into_par_iter().try_arc_for_each::<_, Error>(
|
let document_extractor = DocumentExtractor { document_sender: &document_sender};
|
||||||
|result| {
|
let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
|
||||||
match result? {
|
for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?;
|
||||||
DocumentChange::Deletion(Deletion { docid, external_document_id, ..}) => {
|
|
||||||
document_sender.delete(docid, external_document_id).unwrap();
|
|
||||||
}
|
|
||||||
DocumentChange::Update(Update { docid, external_document_id, new, ..}) => {
|
|
||||||
document_sender.insert(docid, external_document_id, new).unwrap();
|
|
||||||
}
|
|
||||||
DocumentChange::Insertion(Insertion { docid, external_document_id, new, ..}) => {
|
|
||||||
document_sender.insert(docid, external_document_id, new).unwrap();
|
|
||||||
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
})?;
|
|
||||||
|
|
||||||
document_sender.finish().unwrap();
|
document_sender.finish().unwrap();
|
||||||
|
|
||||||
@ -112,13 +156,14 @@ where
|
|||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
extract_and_send_docids::<
|
extract_and_send_docids::<
|
||||||
|
_,
|
||||||
FacetedDocidsExtractor,
|
FacetedDocidsExtractor,
|
||||||
FacetDocids,
|
FacetDocids,
|
||||||
>(
|
>(
|
||||||
index,
|
|
||||||
&global_fields_ids_map,
|
|
||||||
grenad_parameters,
|
grenad_parameters,
|
||||||
document_changes.clone(),
|
document_changes,
|
||||||
|
indexing_context,
|
||||||
|
&mut extractor_allocs,
|
||||||
&extractor_sender,
|
&extractor_sender,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
@ -133,7 +178,7 @@ where
|
|||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
word_position_docids,
|
word_position_docids,
|
||||||
fid_word_count_docids,
|
fid_word_count_docids,
|
||||||
} = WordDocidsExtractors::run_extraction(index, &global_fields_ids_map, grenad_parameters, document_changes.clone())?;
|
} = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?;
|
||||||
extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
|
extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
|
||||||
extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
|
extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
|
||||||
extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
|
extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
|
||||||
@ -145,13 +190,14 @@ where
|
|||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
extract_and_send_docids::<
|
extract_and_send_docids::<
|
||||||
|
_,
|
||||||
WordPairProximityDocidsExtractor,
|
WordPairProximityDocidsExtractor,
|
||||||
WordPairProximityDocids,
|
WordPairProximityDocids,
|
||||||
>(
|
>(
|
||||||
index,
|
|
||||||
&global_fields_ids_map,
|
|
||||||
grenad_parameters,
|
grenad_parameters,
|
||||||
document_changes.clone(),
|
document_changes,
|
||||||
|
indexing_context,
|
||||||
|
&mut extractor_allocs,
|
||||||
&extractor_sender,
|
&extractor_sender,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
@ -180,6 +226,8 @@ where
|
|||||||
})
|
})
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
|
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
|
||||||
|
|
||||||
let indexer_span = tracing::Span::current();
|
let indexer_span = tracing::Span::current();
|
||||||
// TODO manage the errors correctly
|
// TODO manage the errors correctly
|
||||||
let merger_thread = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || {
|
let merger_thread = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || {
|
||||||
@ -192,7 +240,7 @@ where
|
|||||||
merger_sender,
|
merger_sender,
|
||||||
&rtxn,
|
&rtxn,
|
||||||
index,
|
index,
|
||||||
global_fields_ids_map_clone,
|
global_fields_ids_map,
|
||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
@ -223,7 +271,10 @@ where
|
|||||||
Ok(()) as Result<_>
|
Ok(()) as Result<_>
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let fields_ids_map = fields_ids_map_lock.into_inner().unwrap();
|
drop(indexing_context);
|
||||||
|
drop(fields_ids_map_store);
|
||||||
|
|
||||||
|
let fields_ids_map = new_fields_ids_map.into_inner().unwrap();
|
||||||
index.put_fields_ids_map(wtxn, &fields_ids_map)?;
|
index.put_fields_ids_map(wtxn, &fields_ids_map)?;
|
||||||
|
|
||||||
// used to update the localized and weighted maps while sharing the update code with the settings pipeline.
|
// used to update the localized and weighted maps while sharing the update code with the settings pipeline.
|
||||||
@ -284,14 +335,23 @@ fn compute_facet_level_database(
|
|||||||
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
|
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
|
||||||
/// TODO: manage the errors correctly
|
/// TODO: manage the errors correctly
|
||||||
/// TODO: we must have a single trait that also gives the extractor type
|
/// TODO: we must have a single trait that also gives the extractor type
|
||||||
fn extract_and_send_docids<E: DocidsExtractor, D: MergerOperationType>(
|
fn extract_and_send_docids<
|
||||||
index: &Index,
|
'pl,
|
||||||
fields_ids_map: &GlobalFieldsIdsMap,
|
'fid,
|
||||||
indexer: GrenadParameters,
|
'indexer,
|
||||||
document_changes: impl IntoParallelIterator<Item = std::result::Result<DocumentChange, Arc<Error>>>,
|
'index,
|
||||||
|
DC: DocumentChanges<'pl>,
|
||||||
|
E: DocidsExtractor,
|
||||||
|
D: MergerOperationType,
|
||||||
|
>(
|
||||||
|
grenad_parameters: GrenadParameters,
|
||||||
|
document_changes: &DC,
|
||||||
|
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
||||||
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
sender: &ExtractorSender,
|
sender: &ExtractorSender,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let merger = E::run_extraction(index, fields_ids_map, indexer, document_changes)?;
|
let merger =
|
||||||
|
E::run_extraction(grenad_parameters, document_changes, indexing_context, extractor_allocs)?;
|
||||||
sender.send_searchable::<D>(merger).unwrap();
|
sender.send_searchable::<D>(merger).unwrap();
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -1,13 +1,17 @@
|
|||||||
use std::sync::Arc;
|
use std::ops::DerefMut;
|
||||||
|
|
||||||
use rayon::iter::IndexedParallelIterator;
|
use rayon::iter::IndexedParallelIterator;
|
||||||
|
use serde::Deserializer;
|
||||||
|
use serde_json::value::RawValue;
|
||||||
|
|
||||||
use super::DocumentChanges;
|
use super::de::DocumentVisitor;
|
||||||
|
use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend};
|
||||||
use crate::documents::{DocumentIdExtractionError, PrimaryKey};
|
use crate::documents::{DocumentIdExtractionError, PrimaryKey};
|
||||||
use crate::update::concurrent_available_ids::ConcurrentAvailableIds;
|
use crate::update::concurrent_available_ids::ConcurrentAvailableIds;
|
||||||
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt;
|
use crate::update::new::document::DocumentFromVersions;
|
||||||
use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId};
|
use crate::update::new::document_change::Versions;
|
||||||
use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};
|
use crate::update::new::{DocumentChange, Insertion};
|
||||||
|
use crate::{Error, InternalError, Result, UserError};
|
||||||
|
|
||||||
pub struct PartialDump<I> {
|
pub struct PartialDump<I> {
|
||||||
iter: I,
|
iter: I,
|
||||||
@ -17,69 +21,81 @@ impl<I> PartialDump<I> {
|
|||||||
pub fn new_from_jsonlines(iter: I) -> Self {
|
pub fn new_from_jsonlines(iter: I) -> Self {
|
||||||
PartialDump { iter }
|
PartialDump { iter }
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl<'p, I> DocumentChanges<'p> for PartialDump<I>
|
pub fn into_changes<'index>(
|
||||||
where
|
|
||||||
I: IndexedParallelIterator<Item = Object> + Clone + 'p,
|
|
||||||
{
|
|
||||||
type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>);
|
|
||||||
|
|
||||||
/// Note for future self:
|
|
||||||
/// - the field ids map must already be valid so you must have to generate it beforehand.
|
|
||||||
/// - We should probably expose another method that generates the fields ids map from an iterator of JSON objects.
|
|
||||||
/// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items).
|
|
||||||
fn document_changes(
|
|
||||||
self,
|
self,
|
||||||
_fields_ids_map: &mut FieldsIdsMap,
|
concurrent_available_ids: &'index ConcurrentAvailableIds,
|
||||||
param: Self::Parameter,
|
primary_key: &'index PrimaryKey,
|
||||||
) -> Result<
|
) -> PartialDumpChanges<'index, I> {
|
||||||
impl IndexedParallelIterator<Item = std::result::Result<DocumentChange, Arc<Error>>>
|
/// Note for future self:
|
||||||
+ Clone
|
/// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items).
|
||||||
+ 'p,
|
PartialDumpChanges { iter: self.iter, concurrent_available_ids, primary_key }
|
||||||
> {
|
}
|
||||||
let (fields_ids_map, concurrent_available_ids, primary_key) = param;
|
}
|
||||||
|
|
||||||
Ok(self.iter.try_map_try_init(
|
pub struct PartialDumpChanges<'doc, I> {
|
||||||
|| Ok(()),
|
iter: I,
|
||||||
|_, object| {
|
concurrent_available_ids: &'doc ConcurrentAvailableIds,
|
||||||
let docid = match concurrent_available_ids.next() {
|
primary_key: &'doc PrimaryKey<'doc>,
|
||||||
Some(id) => id,
|
}
|
||||||
None => return Err(Error::UserError(UserError::DocumentLimitReached)),
|
|
||||||
};
|
impl<'index, Iter> DocumentChanges<'index> for PartialDumpChanges<'index, Iter>
|
||||||
|
where
|
||||||
let mut writer = KvWriterFieldId::memory();
|
Iter: IndexedParallelIterator<Item = Box<RawValue>> + Clone + Sync + 'index,
|
||||||
object.iter().for_each(|(key, value)| {
|
{
|
||||||
let key = fields_ids_map.id(key).unwrap();
|
type Item = Box<RawValue>;
|
||||||
/// TODO better error management
|
|
||||||
let value = serde_json::to_vec(&value).unwrap();
|
fn iter(&self) -> impl IndexedParallelIterator<Item = Self::Item> {
|
||||||
/// TODO it is not ordered
|
self.iter.clone()
|
||||||
writer.insert(key, value).unwrap();
|
}
|
||||||
});
|
|
||||||
|
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
||||||
let document = writer.into_boxed();
|
&'doc self,
|
||||||
let external_docid = match primary_key.document_id(&document, fields_ids_map)? {
|
context: &'doc DocumentChangeContext<T>,
|
||||||
Ok(document_id) => Ok(document_id),
|
document: Self::Item,
|
||||||
Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => {
|
) -> Result<DocumentChange<'doc>>
|
||||||
Err(user_error)
|
where
|
||||||
}
|
'index: 'doc,
|
||||||
Err(DocumentIdExtractionError::MissingDocumentId) => {
|
{
|
||||||
Err(UserError::MissingDocumentId {
|
let doc_alloc = &context.doc_alloc;
|
||||||
primary_key: primary_key.name().to_string(),
|
let docid = match self.concurrent_available_ids.next() {
|
||||||
document: all_obkv_to_json(&document, fields_ids_map)?,
|
Some(id) => id,
|
||||||
})
|
None => return Err(Error::UserError(UserError::DocumentLimitReached)),
|
||||||
}
|
};
|
||||||
Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
|
|
||||||
Err(UserError::TooManyDocumentIds {
|
let mut fields_ids_map = context.new_fields_ids_map.borrow_mut();
|
||||||
primary_key: primary_key.name().to_string(),
|
let fields_ids_map = fields_ids_map.deref_mut();
|
||||||
document: all_obkv_to_json(&document, fields_ids_map)?,
|
|
||||||
})
|
let res = document
|
||||||
}
|
.deserialize_map(DocumentVisitor::new(fields_ids_map, self.primary_key, &doc_alloc))
|
||||||
}?;
|
.map_err(UserError::SerdeJson)?;
|
||||||
|
|
||||||
let insertion = Insertion::create(docid, external_docid, document);
|
let external_document_id = match res {
|
||||||
Ok(DocumentChange::Insertion(insertion))
|
Ok(document_id) => Ok(document_id),
|
||||||
},
|
Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e),
|
||||||
))
|
Err(DocumentIdExtractionError::MissingDocumentId) => {
|
||||||
|
Err(UserError::MissingDocumentId {
|
||||||
|
primary_key: self.primary_key.name().to_string(),
|
||||||
|
document: serde_json::from_str(document.get()).unwrap(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
|
||||||
|
Err(UserError::TooManyDocumentIds {
|
||||||
|
primary_key: self.primary_key.name().to_string(),
|
||||||
|
document: serde_json::from_str(document.get()).unwrap(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}?;
|
||||||
|
let document = doc_alloc.alloc_str(document.get());
|
||||||
|
let document: &RawValue = unsafe { std::mem::transmute(document) };
|
||||||
|
|
||||||
|
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
|
||||||
|
.map_err(InternalError::SerdeJson)?;
|
||||||
|
|
||||||
|
let document = document.into_bump_slice();
|
||||||
|
let document = DocumentFromVersions::new(Versions::Single(document));
|
||||||
|
|
||||||
|
let insertion = Insertion::create(docid, external_document_id.to_owned(), document);
|
||||||
|
Ok(DocumentChange::Insertion(insertion))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,25 +1,33 @@
|
|||||||
use std::sync::Arc;
|
use rayon::iter::IntoParallelIterator;
|
||||||
|
|
||||||
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
|
use super::document_changes::{DocumentChangeContext, DocumentChanges};
|
||||||
|
use crate::Result;
|
||||||
use super::DocumentChanges;
|
|
||||||
use crate::update::new::DocumentChange;
|
|
||||||
use crate::{Error, FieldsIdsMap, Result};
|
|
||||||
|
|
||||||
pub struct UpdateByFunction;
|
pub struct UpdateByFunction;
|
||||||
|
|
||||||
impl<'p> DocumentChanges<'p> for UpdateByFunction {
|
impl UpdateByFunction {
|
||||||
type Parameter = ();
|
pub fn into_changes(self) -> UpdateByFunctionChanges {
|
||||||
|
UpdateByFunctionChanges
|
||||||
fn document_changes(
|
}
|
||||||
self,
|
}
|
||||||
_fields_ids_map: &mut FieldsIdsMap,
|
|
||||||
_param: Self::Parameter,
|
pub struct UpdateByFunctionChanges;
|
||||||
) -> Result<
|
|
||||||
impl IndexedParallelIterator<Item = std::result::Result<DocumentChange, Arc<Error>>>
|
impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges {
|
||||||
+ Clone
|
type Item = u32;
|
||||||
+ 'p,
|
|
||||||
> {
|
fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator<Item = Self::Item> {
|
||||||
Ok((0..100).into_par_iter().map(|_| todo!()))
|
(0..100).into_par_iter()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn item_to_document_change<'doc, T: super::document_changes::MostlySend + 'doc>(
|
||||||
|
&self,
|
||||||
|
_context: &'doc DocumentChangeContext<T>,
|
||||||
|
_item: Self::Item,
|
||||||
|
) -> Result<crate::update::new::DocumentChange<'doc>>
|
||||||
|
where
|
||||||
|
'index: 'doc,
|
||||||
|
{
|
||||||
|
todo!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,10 +3,10 @@ use std::io::{self};
|
|||||||
|
|
||||||
use bincode::ErrorKind;
|
use bincode::ErrorKind;
|
||||||
use grenad::Merger;
|
use grenad::Merger;
|
||||||
|
use hashbrown::HashSet;
|
||||||
use heed::types::Bytes;
|
use heed::types::Bytes;
|
||||||
use heed::{Database, RoTxn};
|
use heed::{Database, RoTxn};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use std::collections::HashSet;
|
|
||||||
|
|
||||||
use super::channel::*;
|
use super::channel::*;
|
||||||
use super::extract::FacetKind;
|
use super::extract::FacetKind;
|
||||||
@ -149,17 +149,8 @@ pub fn merge_grenad_entries(
|
|||||||
let current = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
|
let current = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
|
||||||
let current: Option<&KvReaderFieldId> = current.map(Into::into);
|
let current: Option<&KvReaderFieldId> = current.map(Into::into);
|
||||||
let change = match current {
|
let change = match current {
|
||||||
Some(current) => DocumentChange::Update(Update::create(
|
Some(current) => DocumentChange::Update(todo!()),
|
||||||
docid,
|
None => DocumentChange::Insertion(todo!()),
|
||||||
external_id,
|
|
||||||
current.boxed(),
|
|
||||||
document,
|
|
||||||
)),
|
|
||||||
None => DocumentChange::Insertion(Insertion::create(
|
|
||||||
docid,
|
|
||||||
external_id,
|
|
||||||
document,
|
|
||||||
)),
|
|
||||||
};
|
};
|
||||||
geo_extractor.manage_change(&mut global_fields_ids_map, &change)?;
|
geo_extractor.manage_change(&mut global_fields_ids_map, &change)?;
|
||||||
}
|
}
|
||||||
@ -174,12 +165,7 @@ pub fn merge_grenad_entries(
|
|||||||
sender.documents().delete(docid, external_id.clone()).unwrap();
|
sender.documents().delete(docid, external_id.clone()).unwrap();
|
||||||
|
|
||||||
if let Some(geo_extractor) = geo_extractor.as_mut() {
|
if let Some(geo_extractor) = geo_extractor.as_mut() {
|
||||||
let current = index.document(rtxn, docid)?;
|
let change = DocumentChange::Deletion(Deletion::create(docid, todo!()));
|
||||||
let change = DocumentChange::Deletion(Deletion::create(
|
|
||||||
docid,
|
|
||||||
external_id,
|
|
||||||
current.boxed(),
|
|
||||||
));
|
|
||||||
geo_extractor.manage_change(&mut global_fields_ids_map, &change)?;
|
geo_extractor.manage_change(&mut global_fields_ids_map, &change)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,7 @@ use super::del_add::DelAdd;
|
|||||||
use crate::FieldId;
|
use crate::FieldId;
|
||||||
|
|
||||||
mod channel;
|
mod channel;
|
||||||
|
pub mod document;
|
||||||
mod document_change;
|
mod document_change;
|
||||||
mod extract;
|
mod extract;
|
||||||
pub mod indexer;
|
pub mod indexer;
|
||||||
|
Loading…
Reference in New Issue
Block a user