Find a temporary solution to par into iter on a HashMap

Spoiler: Do not use a HashMap but drain it into a Vec
This commit is contained in:
Clément Renault 2024-09-02 19:39:48 +02:00
parent 9b7858fb90
commit bcb1aa3d22
No known key found for this signature in database
GPG key ID: F250A4C4E3AE5F5F
12 changed files with 254 additions and 152 deletions

View file

@ -4,6 +4,8 @@ use serde::{Deserialize, Serialize};
use crate::FieldId;
mod global;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldsIdsMap {
names_ids: BTreeMap<String, FieldId>,

View file

@ -0,0 +1,84 @@
use std::collections::BTreeMap;
use std::sync::RwLock;
use crate::{FieldId, FieldsIdsMap};
/// A fields ids map that can be globally updated to add fields
///
/// Lookups are answered from the `local` copy first; the shared `global` map is
/// only locked when the local copy does not know the requested name or id.
pub struct GlobalFieldsIdsMap<'indexing> {
// Shared map, borrowed for `'indexing` and guarded by an `RwLock`.
global: &'indexing RwLock<FieldsIdsMap>,
// Private copy of the entries seen so far, readable without taking the lock.
local: LocalFieldsIdsMap,
}
/// Lock-free, two-way copy of the field name/id associations known locally.
struct LocalFieldsIdsMap {
// Field name -> field id.
names_ids: BTreeMap<String, FieldId>,
// Field id -> field name.
ids_names: BTreeMap<FieldId, String>,
}
impl LocalFieldsIdsMap {
    /// Builds a local map by cloning both lookup tables of the shared map
    /// while holding its read lock.
    ///
    /// Panics if the lock is poisoned.
    fn new(global: &RwLock<FieldsIdsMap>) -> Self {
        let shared = global.read().unwrap();
        let names_ids = shared.names_ids.clone();
        let ids_names = shared.ids_names.clone();
        Self { names_ids, ids_names }
    }

    /// Records the `name` <-> `field_id` association in both directions.
    fn insert(&mut self, name: &str, field_id: FieldId) {
        let owned_name = String::from(name);
        self.names_ids.insert(owned_name.clone(), field_id);
        self.ids_names.insert(field_id, owned_name);
    }

    /// Returns the name registered locally for `id`, if any.
    fn name(&self, id: FieldId) -> Option<&str> {
        let stored = self.ids_names.get(&id)?;
        Some(stored.as_str())
    }

    /// Returns the id registered locally for `name`, if any.
    fn id(&self, name: &str) -> Option<FieldId> {
        let field_id = self.names_ids.get(name)?;
        Some(*field_id)
    }
}
impl<'indexing> GlobalFieldsIdsMap<'indexing> {
    /// Wraps the shared map and seeds the local copy with a snapshot of it.
    pub fn new(global: &'indexing RwLock<FieldsIdsMap>) -> Self {
        let local = LocalFieldsIdsMap::new(global);
        Self { global, local }
    }

    /// Returns the field id related to a field name, it will create a new field id if the
    /// name is not already known. Returns `None` if the maximum field id has been reached.
    ///
    /// Panics if the lock guarding the global map is poisoned.
    pub fn id_or_insert(&mut self, name: &str) -> Option<FieldId> {
        // Cheapest path: the name is already known locally, no lock needed.
        if let Some(known) = self.local.id(name) {
            return Some(known);
        }

        // Optimistic path: only a read lock on the global map, released as soon
        // as the lookup is done.
        let from_global = self.global.read().unwrap().id(name);
        if let Some(found) = from_global {
            self.local.insert(name, found);
            return Some(found);
        }

        // Slow path: take the write lock. The name is looked up once more because
        // it may have been registered between the read lock being released and the
        // write lock being acquired.
        let mut shared = self.global.write().unwrap();
        let field_id = match shared.id(name) {
            Some(existing) => existing,
            None => shared.insert(name)?,
        };
        self.local.insert(name, field_id);
        Some(field_id)
    }

    /// Get the name of a field based on its id.
    ///
    /// When the id is unknown locally, the global map is read and the association
    /// is copied into the local map before being returned. Returns `None` when the
    /// global map does not know the id either.
    pub fn name(&mut self, id: FieldId) -> Option<&str> {
        let missing_locally = self.local.name(id).is_none();
        if missing_locally {
            let shared = self.global.read().unwrap();
            let fetched = shared.name(id)?;
            self.local.insert(fetched, id);
        }
        self.local.name(id)
    }
}

View file

@ -1,65 +0,0 @@
use std::sync::{Arc, RwLock};
use crate::{FieldId, FieldsIdsMap};
/// A fields ids map that can be globally updated to add fields
pub struct GlobalFieldsIdsMap {
// Shared map, reference-counted and guarded by an `RwLock`.
global: Arc<RwLock<FieldsIdsMap>>,
// Owned copy of the map held by this instance.
local: FieldsIdsMap,
}
// NOTE(review): every method below except `new`, `global_len` and `global_is_empty`
// reads `self.names_ids`, `self.ids_names` or `self.next_id`, none of which are
// fields of `GlobalFieldsIdsMap` as declared (`global`, `local`). They look like
// they were carried over from `FieldsIdsMap` — confirm before reusing this code.
impl GlobalFieldsIdsMap {
/// Builds the map from an owned `FieldsIdsMap`: `local` receives a clone and the
/// original is moved behind a new `Arc<RwLock<_>>`.
pub fn new(global: FieldsIdsMap) -> Self {
Self { local: global.clone(), global: Arc::new(RwLock::new(global)) }
}
/// Returns the number of fields ids in the map.
///
/// Not implemented: the body is `todo!()` and panics when called.
pub fn global_len(&self) -> usize {
todo!()
}
/// Returns `true` if the map is empty.
///
/// Not implemented: the body is `todo!()` and panics when called.
pub fn global_is_empty(&self) -> bool {
todo!()
}
/// Returns the field id related to a field name, it will create a new field id if the
/// name is not already known. Returns `None` if the maximum field id has been reached.
pub fn insert(&mut self, name: &str) -> Option<FieldId> {
match self.names_ids.get(name) {
Some(id) => Some(*id),
None => {
// `next_id` is `None` once `checked_add` has overflowed, which makes `?` bail out.
let id = self.next_id?;
self.next_id = id.checked_add(1);
self.names_ids.insert(name.to_owned(), id);
self.ids_names.insert(id, name.to_owned());
Some(id)
}
}
}
/// Get the id of a field based on its name.
pub fn id(&self, name: &str) -> Option<FieldId> {
self.names_ids.get(name).copied()
}
/// Get the name of a field based on its id.
pub fn name(&self, id: FieldId) -> Option<&str> {
self.ids_names.get(&id).map(String::as_str)
}
/// Iterate over the ids and names in the ids order.
pub fn iter(&self) -> impl Iterator<Item = (FieldId, &str)> {
self.ids_names.iter().map(|(id, name)| (*id, name.as_str()))
}
/// Iterate over the ids in the order of the ids.
pub fn ids(&'_ self) -> impl Iterator<Item = FieldId> + '_ {
self.ids_names.keys().copied()
}
/// Iterate over the names in the order of the ids.
pub fn names(&self) -> impl Iterator<Item = &str> {
self.ids_names.values().map(AsRef::as_ref)
}
}

View file

@ -28,7 +28,7 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion {
fn document_changes(
self,
param: Self::Parameter,
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + 'p> {
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
let (index, fields, primary_key) = param;
let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from)));
Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| {

View file

@ -34,6 +34,7 @@ pub struct PayloadStats {
pub bytes: u64,
}
#[derive(Clone)]
enum InnerDocOp {
Addition(DocumentOffset),
Deletion,
@ -41,6 +42,7 @@ enum InnerDocOp {
/// Represents an offset where a document lives
/// in an mmapped grenad reader file.
#[derive(Clone)]
pub struct DocumentOffset {
/// The mmapped grenad reader file.
pub content: Arc<Mmap>, // grenad::Reader
@ -76,7 +78,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation {
fn document_changes(
self,
param: Self::Parameter,
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + 'p> {
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
let (index, rtxn, fields_ids_map, primary_key) = param;
let documents_ids = index.documents_ids(rtxn)?;
@ -170,6 +172,11 @@ impl<'p> DocumentChanges<'p> for DocumentOperation {
}
}
/// TODO is it the best way to provide FieldsIdsMap to the parallel iterator?
let fields_ids_map = fields_ids_map.clone();
// We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
let docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect();
Ok(docids_version_offsets
.into_par_iter()
.map_with(
@ -177,6 +184,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation {
move |context_pool, (external_docid, (internal_docid, operations))| {
context_pool.with(|rtxn| {
use IndexDocumentsMethod as Idm;
let document_merge_function = match self.index_documents_method {
Idm::ReplaceDocuments => merge_document_for_replacements,
Idm::UpdateDocuments => merge_document_for_updates,
@ -185,7 +193,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation {
document_merge_function(
rtxn,
index,
fields_ids_map,
&fields_ids_map,
internal_docid,
external_docid,
&operations,

View file

@ -1,9 +1,10 @@
use std::fs::File;
use std::thread::{self, Builder};
use big_s::S;
pub use document_deletion::DocumentDeletion;
pub use document_operation::DocumentOperation;
use heed::RwTxn;
use heed::{RoTxn, RwTxn};
pub use partial_dump::PartialDump;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use rayon::ThreadPool;
@ -15,7 +16,11 @@ use super::channel::{
};
use super::document_change::DocumentChange;
use super::merger::merge_grenad_entries;
use crate::{Index, Result};
use super::StdResult;
use crate::documents::{
obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY,
};
use crate::{Index, Result, UserError};
mod document_deletion;
mod document_operation;
@ -28,7 +33,7 @@ pub trait DocumentChanges<'p> {
fn document_changes(
self,
param: Self::Parameter,
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + 'p>;
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p>;
}
/// This is the main function of this crate.
@ -40,7 +45,7 @@ pub fn index<PI>(
wtxn: &mut RwTxn,
index: &Index,
pool: &ThreadPool,
document_changes: PI,
_document_changes: PI,
) -> Result<()>
where
PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send,
@ -88,3 +93,56 @@ where
Ok(())
})
}
/// TODO move this elsewhere
///
/// Resolves the primary key to use for a batch of documents.
///
/// - When the index already defines a primary key, it is resolved against
///   `documents_batch_index`. If that fails, the first document of the batch is
///   reported in a `UserError::MissingDocumentId`.
/// - Otherwise the primary key is inferred from the batch field names: the
///   candidates are the names that, once lowercased, end with `DEFAULT_PRIMARY_KEY`.
///   Exactly one candidate is accepted; zero or several candidates are user errors.
///
/// The outer `Result` carries internal errors, the inner one carries user errors.
///
/// # Panics
///
/// Panics if the index primary key cannot be resolved and `cursor` yields no document.
pub fn guess_primary_key<'a>(
    rtxn: &'a RoTxn<'a>,
    index: &Index,
    mut cursor: DocumentsBatchCursor<File>,
    documents_batch_index: &'a DocumentsBatchIndex,
) -> Result<StdResult<PrimaryKey<'a>, UserError>> {
    // The index already has a primary key: it must exist among the batch fields.
    if let Some(index_primary_key) = index.primary_key(rtxn)? {
        if let Some(resolved) = PrimaryKey::new(index_primary_key, documents_batch_index) {
            return Ok(Ok(resolved));
        }

        // Use the first document of the batch to illustrate the error.
        let first_document = match cursor.next_document()? {
            Some(document) => document,
            None => unreachable!("Called with reader.is_empty()"),
        };
        let primary_key = index_primary_key.to_string();
        let document = obkv_to_object(first_document, documents_batch_index)?;
        return Ok(Err(UserError::MissingDocumentId { primary_key, document }));
    }

    // No primary key on the index: collect every field whose lowercased name
    // ends with the default primary key.
    let mut candidates: Vec<(u16, &str)> = documents_batch_index
        .iter()
        .filter_map(|(field_id, name)| {
            if name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY) {
                Some((*field_id, name.as_str()))
            } else {
                None
            }
        })
        .collect();

    // Deterministic order: shortest name first, then alphabetical order.
    candidates.sort_by_key(|(_, name)| (name.len(), *name));

    match candidates.as_slice() {
        [] => Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
        [(field_id, name)] => {
            tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
            Ok(Ok(PrimaryKey::Flat { name, field_id: *field_id }))
        }
        several => {
            let names = several.iter().map(|(_, candidate)| candidate.to_string()).collect();
            Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { candidates: names }))
        }
    }
}

View file

@ -7,7 +7,7 @@ use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId};
use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};
pub struct PartialDump<I> {
pub iter: I,
iter: I,
}
impl<I> PartialDump<I> {
@ -19,7 +19,7 @@ impl<I> PartialDump<I> {
impl<'p, I> DocumentChanges<'p> for PartialDump<I>
where
I: IntoIterator<Item = Object>,
I::IntoIter: Send + 'p,
I::IntoIter: Send + Clone + 'p,
I::Item: Send,
{
type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>);
@ -31,7 +31,7 @@ where
fn document_changes(
self,
param: Self::Parameter,
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + 'p> {
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
let (fields_ids_map, concurrent_available_ids, primary_key) = param;
Ok(self.iter.into_iter().par_bridge().map(|object| {

View file

@ -12,8 +12,7 @@ impl<'p> DocumentChanges<'p> for UpdateByFunction {
fn document_changes(
self,
_param: Self::Parameter,
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + 'p> {
todo!();
Ok(vec![].into_par_iter())
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
Ok((0..100).into_par_iter().map(|_| todo!()))
}
}

View file

@ -7,8 +7,8 @@ use crate::FieldId;
mod document_change;
mod merger;
// mod extract;
// mod global_fields_ids_map;
mod channel;
//mod global_fields_ids_map;
pub mod indexer;
mod items_pool;