From ff5d3b59f5067b000ba3a0e321002c8723da01f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 12 Sep 2024 12:01:31 +0200 Subject: [PATCH] Move the document id extraction to the primary key code --- milli/src/documents/primary_key.rs | 43 +++++++++++- .../update/new/indexer/document_operation.rs | 67 +++++-------------- milli/src/update/new/indexer/top_level_map.rs | 8 +++ 3 files changed, 65 insertions(+), 53 deletions(-) diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs index 22918f8fc..b6a236623 100644 --- a/milli/src/documents/primary_key.rs +++ b/milli/src/documents/primary_key.rs @@ -1,8 +1,10 @@ +use std::borrow::Cow; use std::iter; use std::result::Result as StdResult; -use serde_json::Value; +use serde_json::{from_str, Value}; +use crate::update::new::{CowStr, TopLevelMap}; use crate::{FieldId, InternalError, Object, Result, UserError}; /// The symbol used to define levels in a nested primary key. @@ -100,6 +102,45 @@ impl<'a> PrimaryKey<'a> { } } + /// Returns the document ID based on the primary and + /// search for it recursively in zero-copy-deserialized documents. + pub fn document_id_from_top_level_map<'p>( + &self, + document: &TopLevelMap<'p>, + ) -> Result, DocumentIdExtractionError>> { + fn get_docid<'p>( + document: &TopLevelMap<'p>, + primary_key: &[&str], + ) -> Result, DocumentIdExtractionError>> { + match primary_key { + [] => unreachable!("arrrgh"), // would None be ok? + [primary_key] => match document.0.get(*primary_key) { + Some(value) => match from_str::(value.get()) { + Ok(value) => Ok(Ok(CowStr(Cow::Owned(value.to_string())))), + Err(_) => match from_str(value.get()) { + Ok(document_id) => Ok(Ok(document_id)), + Err(e) => Ok(Err(DocumentIdExtractionError::InvalidDocumentId( + UserError::SerdeJson(e), + ))), + }, + }, + None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), + }, + [head, tail @ ..] => match document.0.get(*head) { + Some(value) => { + let document = from_str(value.get()).map_err(InternalError::SerdeJson)?; + get_docid(&document, tail) + } + None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), + }, + } + } + + /// TODO do not allocate a vec everytime here + let primary_key: Vec<_> = self.name().split(PRIMARY_KEY_SPLIT_SYMBOL).collect(); + get_docid(document, &primary_key) + } + /// Returns an `Iterator` that gives all the possible fields names the primary key /// can have depending of the first level name and depth of the objects. pub fn possible_level_names(&self) -> impl Iterator + '_ { diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 935c130e5..ed8f1c93f 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -13,7 +13,7 @@ use super::super::document_change::DocumentChange; use super::super::items_pool::ItemsPool; use super::top_level_map::{CowStr, TopLevelMap}; use super::DocumentChanges; -use crate::documents::PrimaryKey; +use crate::documents::{DocumentIdExtractionError, PrimaryKey}; use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; @@ -98,37 +98,22 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { // TODO we must manage the TooManyDocumentIds,InvalidDocumentId // we must manage the unwrap let external_document_id = - match get_docid(&document, &[primary_key.name()]).unwrap() { - Some(document_id) => document_id, - None => { - return Err(UserError::MissingDocumentId { + match primary_key.document_id_from_top_level_map(&document)? { + Ok(document_id) => Ok(document_id), + Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e), + Err(DocumentIdExtractionError::MissingDocumentId) => { + Err(UserError::MissingDocumentId { primary_key: primary_key.name().to_string(), - document: todo!(), - // document: obkv_to_object(document, &batch_index)?, - } - .into()); + document: document.try_into().unwrap(), + }) } - }; - - // let external_document_id = - // match primary_key.document_id(document, &batch_index)? { - // Ok(document_id) => Ok(document_id), - // Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => { - // Err(user_error) - // } - // Err(DocumentIdExtractionError::MissingDocumentId) => { - // Err(UserError::MissingDocumentId { - // primary_key: primary_key.name().to_string(), - // document: obkv_to_object(document, &batch_index)?, - // }) - // } - // Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { - // Err(UserError::TooManyDocumentIds { - // primary_key: primary_key.name().to_string(), - // document: obkv_to_object(document, &batch_index)?, - // }) - // } - // }?; + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: document.try_into().unwrap(), + }) + } + }?; let current_offset = iter.byte_offset(); let document_operation = InnerDocOp::Addition(DocumentOffset { @@ -397,25 +382,3 @@ impl MergeChanges for MergeDocumentForUpdates { } } } - -/// Returns the document ID based on the primary and -/// search for it recursively in zero-copy-deserialized documents. -fn get_docid<'p>( - map: &TopLevelMap<'p>, - primary_key: &[&str], -) -> serde_json::Result>> { - match primary_key { - [] => unreachable!("arrrgh"), // would None be ok? - [primary_key] => match map.0.get(*primary_key) { - Some(value) => match from_str::(value.get()) { - Ok(value) => Ok(Some(CowStr(Cow::Owned(value.to_string())))), - Err(_) => Ok(Some(from_str(value.get())?)), - }, - None => Ok(None), - }, - [head, tail @ ..] => match map.0.get(*head) { - Some(value) => get_docid(&from_str(value.get())?, tail), - None => Ok(None), - }, - } -} diff --git a/milli/src/update/new/indexer/top_level_map.rs b/milli/src/update/new/indexer/top_level_map.rs index f79b6e9ee..aebb64bc9 100644 --- a/milli/src/update/new/indexer/top_level_map.rs +++ b/milli/src/update/new/indexer/top_level_map.rs @@ -22,6 +22,14 @@ impl TryFrom<&'_ TopLevelMap<'_>> for Map { } } +impl TryFrom> for Map { + type Error = serde_json::Error; + + fn try_from(tlmap: TopLevelMap<'_>) -> Result { + TryFrom::try_from(&tlmap) + } +} + impl<'p> ops::Deref for TopLevelMap<'p> { type Target = BTreeMap, &'p RawValue>;