diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 22b16f253..03eb3f4de 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -1,20 +1,17 @@ +use std::fmt; use std::io::{BufWriter, Read, Seek}; use std::result::Result as StdResult; -use std::{fmt, iter}; use serde::{Deserialize, Serialize}; use serde_json::Value; -use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; +use crate::documents::{ + DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader, + EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY, +}; use crate::error::{GeoError, InternalError, UserError}; use crate::update::index_documents::{obkv_to_object, writer_into_reader}; -use crate::{FieldId, Index, Object, Result}; - -/// The symbol used to define levels in a nested primary key. -const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; - -/// The default primary that is used when not specified. -const DEFAULT_PRIMARY_KEY: &str = "id"; +use crate::{FieldId, Index, Result}; /// This function validates and enrich the documents by checking that: /// - we can infer a primary key, @@ -41,14 +38,12 @@ pub fn enrich_documents_batch( // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. let primary_key = match index.primary_key(rtxn)? { - Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { - PrimaryKey::nested(primary_key) - } - Some(primary_key) => match documents_batch_index.id(primary_key) { - Some(id) => PrimaryKey::flat(primary_key, id), - None if autogenerate_docids => { - PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) - } + Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) { + Some(primary_key) => primary_key, + None if autogenerate_docids => PrimaryKey::Flat { + name: primary_key, + field_id: documents_batch_index.insert(primary_key), + }, None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { @@ -76,14 +71,14 @@ pub fn enrich_documents_batch( }); match guesses.as_slice() { - [] if autogenerate_docids => PrimaryKey::flat( - DEFAULT_PRIMARY_KEY, - documents_batch_index.insert(DEFAULT_PRIMARY_KEY), - ), + [] if autogenerate_docids => PrimaryKey::Flat { + name: DEFAULT_PRIMARY_KEY, + field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY), + }, [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), [(field_id, name)] => { log::info!("Primary key was not specified in index. Inferred to '{name}'"); - PrimaryKey::flat(name, *field_id) + PrimaryKey::Flat { name, field_id: *field_id } } multiple => { return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { @@ -156,92 +151,24 @@ fn fetch_or_generate_document_id( uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], count: u32, ) -> Result> { - match primary_key { - PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { - match document.get(primary_key_id) { - Some(document_id_bytes) => { - let document_id = serde_json::from_slice(document_id_bytes) - .map_err(InternalError::SerdeJson)?; - match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), - Err(user_error) => Ok(Err(user_error)), - } - } - None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); - Ok(Ok(DocumentId::generated(uuid.to_string(), count))) - } - None => Ok(Err(UserError::MissingDocumentId { - primary_key: primary_key.to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })), - } + Ok(match primary_key.document_id(document, documents_batch_index)? { + Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }), + Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error), + Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => { + let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); + Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count }) } - nested @ PrimaryKey::Nested { .. } => { - let mut matching_documents_ids = Vec::new(); - for (first_level_name, right) in nested.possible_level_names() { - if let Some(field_id) = documents_batch_index.id(first_level_name) { - if let Some(value_bytes) = document.get(field_id) { - let object = serde_json::from_slice(value_bytes) - .map_err(InternalError::SerdeJson)?; - fetch_matching_values(object, right, &mut matching_documents_ids); - - if matching_documents_ids.len() >= 2 { - return Ok(Err(UserError::TooManyDocumentIds { - primary_key: nested.name().to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })); - } - } - } - } - - match matching_documents_ids.pop() { - Some(document_id) => match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), - Err(user_error) => Ok(Err(user_error)), - }, - None => Ok(Err(UserError::MissingDocumentId { - primary_key: nested.name().to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })), - } + Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(document, documents_batch_index)?, + }), + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(document, documents_batch_index)?, + }) } - } -} - -/// A type that represent the type of primary key that has been set -/// for this index, a classic flat one or a nested one. -#[derive(Debug, Clone, Copy)] -enum PrimaryKey<'a> { - Flat { name: &'a str, field_id: FieldId }, - Nested { name: &'a str }, -} - -impl PrimaryKey<'_> { - fn flat(name: &str, field_id: FieldId) -> PrimaryKey { - PrimaryKey::Flat { name, field_id } - } - - fn nested(name: &str) -> PrimaryKey { - PrimaryKey::Nested { name } - } - - fn name(&self) -> &str { - match self { - PrimaryKey::Flat { name, .. } => name, - PrimaryKey::Nested { name } => name, - } - } - - /// Returns an `Iterator` that gives all the possible fields names the primary key - /// can have depending of the first level name and deepnes of the objects. - fn possible_level_names(&self) -> impl Iterator + '_ { - let name = self.name(); - name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) - .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) - .chain(iter::once((name, ""))) - } + }) } /// A type that represents a document id that has been retrieved from a document or auto-generated. @@ -255,14 +182,6 @@ pub enum DocumentId { } impl DocumentId { - fn retrieved(value: String) -> DocumentId { - DocumentId::Retrieved { value } - } - - fn generated(value: String, document_nth: u32) -> DocumentId { - DocumentId::Generated { value, document_nth } - } - fn debug(&self) -> String { format!("{:?}", self) } @@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId { } } -fn starts_with(selector: &str, key: &str) -> bool { - selector.strip_prefix(key).map_or(false, |tail| { - tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) - }) -} - -pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { - match value { - Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), - otherwise => output.push(otherwise), - } -} - -pub fn fetch_matching_values_in_object( - object: Object, - selector: &str, - base_key: &str, - output: &mut Vec, -) { - for (key, value) in object { - let base_key = if base_key.is_empty() { - key.to_string() - } else { - format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) - }; - - if starts_with(selector, &base_key) { - match value { - Value::Object(object) => { - fetch_matching_values_in_object(object, selector, &base_key, output) - } - value => output.push(value), - } - } - } -} - -pub fn validate_document_id(document_id: &str) -> Option<&str> { - if !document_id.is_empty() - && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) - { - Some(document_id) - } else { - None - } -} - -/// Parses a Json encoded document id and validate it, returning a user error when it is one. -pub fn validate_document_id_value(document_id: Value) -> Result> { - match document_id { - Value::String(string) => match validate_document_id(&string) { - Some(s) if s.len() == string.len() => Ok(Ok(string)), - Some(s) => Ok(Ok(s.to_string())), - None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), - }, - Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), - content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), - } -} - /// Try to extract an `f64` from a JSON `Value` and return the `Value` /// in the `Err` variant if it failed. pub fn extract_finite_float_from_value(value: Value) -> StdResult { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2be410ace..d60006289 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,10 +20,7 @@ use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; -pub use self::enrich::{ - extract_finite_float_from_value, validate_document_id, validate_document_id_value, - validate_geo_from_json, DocumentId, -}; +pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,