Mirror of https://github.com/meilisearch/MeiliSearch
Enrich uses the new type

parent db2fb86b8b
commit 9cef800b2a
@@ -1,20 +1,17 @@
use std::fmt;
use std::io::{BufWriter, Read, Seek};
use std::result::Result as StdResult;
use std::{fmt, iter};

use serde::{Deserialize, Serialize};
use serde_json::Value;

use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
use crate::documents::{
    DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader,
    EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY,
};
use crate::error::{GeoError, InternalError, UserError};
use crate::update::index_documents::{obkv_to_object, writer_into_reader};
use crate::{FieldId, Index, Object, Result};

/// The symbol used to define levels in a nested primary key.
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';

/// The default primary that is used when not specified.
const DEFAULT_PRIMARY_KEY: &str = "id";
use crate::{FieldId, Index, Result};

/// This function validates and enrich the documents by checking that:
/// - we can infer a primary key,
@@ -41,14 +38,12 @@ pub fn enrich_documents_batch<R: Read + Seek>(
    // The primary key *field id* that has already been set for this index or the one
    // we will guess by searching for the first key that contains "id" as a substring.
    let primary_key = match index.primary_key(rtxn)? {
        Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => {
            PrimaryKey::nested(primary_key)
        }
        Some(primary_key) => match documents_batch_index.id(primary_key) {
            Some(id) => PrimaryKey::flat(primary_key, id),
            None if autogenerate_docids => {
                PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key))
            }
        Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) {
            Some(primary_key) => primary_key,
            None if autogenerate_docids => PrimaryKey::Flat {
                name: primary_key,
                field_id: documents_batch_index.insert(primary_key),
            },
            None => {
                return match cursor.next_document()? {
                    Some(first_document) => Ok(Err(UserError::MissingDocumentId {
@@ -76,14 +71,14 @@ pub fn enrich_documents_batch<R: Read + Seek>(
            });

            match guesses.as_slice() {
                [] if autogenerate_docids => PrimaryKey::flat(
                    DEFAULT_PRIMARY_KEY,
                    documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
                ),
                [] if autogenerate_docids => PrimaryKey::Flat {
                    name: DEFAULT_PRIMARY_KEY,
                    field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
                },
                [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
                [(field_id, name)] => {
                    log::info!("Primary key was not specified in index. Inferred to '{name}'");
                    PrimaryKey::flat(name, *field_id)
                    PrimaryKey::Flat { name, field_id: *field_id }
                }
                multiple => {
                    return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
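Aside (not part of the diff): the two hunks above swap the old local flat()/nested() helpers for the shared PrimaryKey type and its single PrimaryKey::new constructor. Below is a minimal, self-contained Rust sketch of that construction path, assuming new() treats a dotted name as nested and otherwise resolves the field id against the batch index; the names mirror the diff, but every type here is a simplified stand-in, not milli's crate::documents implementation.

use std::collections::HashMap;

type FieldId = u16;

// Stand-in for milli's DocumentsBatchIndex: maps field names to field ids.
#[derive(Default)]
struct DocumentsBatchIndex(HashMap<String, FieldId>);

impl DocumentsBatchIndex {
    fn id(&self, name: &str) -> Option<FieldId> {
        self.0.get(name).copied()
    }

    fn insert(&mut self, name: &str) -> FieldId {
        let next = self.0.len() as FieldId;
        *self.0.entry(name.to_string()).or_insert(next)
    }
}

#[derive(Debug)]
enum PrimaryKey<'a> {
    Flat { name: &'a str, field_id: FieldId },
    Nested { name: &'a str },
}

impl<'a> PrimaryKey<'a> {
    // One constructor replaces the removed flat()/nested() helpers: a dotted name is
    // treated as nested, anything else must already be known to the batch index.
    fn new(name: &'a str, fields: &DocumentsBatchIndex) -> Option<Self> {
        if name.contains('.') {
            Some(PrimaryKey::Nested { name })
        } else {
            fields.id(name).map(|field_id| PrimaryKey::Flat { name, field_id })
        }
    }
}

fn main() {
    let mut fields = DocumentsBatchIndex::default();
    fields.insert("id");
    fields.insert("title");

    // A stored flat key present in the batch resolves in one call.
    println!("{:?}", PrimaryKey::new("id", &fields));
    // A nested key needs no field lookup up front.
    println!("{:?}", PrimaryKey::new("user.uid", &fields));
    // An unknown flat key yields None; the hunks above handle that case by inserting
    // the field when autogenerate_docids is set (the PrimaryKey::Flat fallback).
    println!("{:?}", PrimaryKey::new("uuid", &fields));
}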
@@ -156,92 +151,24 @@ fn fetch_or_generate_document_id(
    uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
    count: u32,
) -> Result<StdResult<DocumentId, UserError>> {
    match primary_key {
        PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
            match document.get(primary_key_id) {
                Some(document_id_bytes) => {
                    let document_id = serde_json::from_slice(document_id_bytes)
                        .map_err(InternalError::SerdeJson)?;
                    match validate_document_id_value(document_id)? {
                        Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
                        Err(user_error) => Ok(Err(user_error)),
                    }
                }
                None if autogenerate_docids => {
    Ok(match primary_key.document_id(document, documents_batch_index)? {
        Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }),
        Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
        Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => {
            let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
                    Ok(Ok(DocumentId::generated(uuid.to_string(), count)))
            Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count })
        }
                None => Ok(Err(UserError::MissingDocumentId {
                    primary_key: primary_key.to_string(),
        Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId {
            primary_key: primary_key.name().to_string(),
            document: obkv_to_object(document, documents_batch_index)?,
                })),
            }
        }
        nested @ PrimaryKey::Nested { .. } => {
            let mut matching_documents_ids = Vec::new();
            for (first_level_name, right) in nested.possible_level_names() {
                if let Some(field_id) = documents_batch_index.id(first_level_name) {
                    if let Some(value_bytes) = document.get(field_id) {
                        let object = serde_json::from_slice(value_bytes)
                            .map_err(InternalError::SerdeJson)?;
                        fetch_matching_values(object, right, &mut matching_documents_ids);

                        if matching_documents_ids.len() >= 2 {
                            return Ok(Err(UserError::TooManyDocumentIds {
                                primary_key: nested.name().to_string(),
        }),
        Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
            Err(UserError::TooManyDocumentIds {
                primary_key: primary_key.name().to_string(),
                document: obkv_to_object(document, documents_batch_index)?,
                            }));
                        }
                    }
                }
            }

            match matching_documents_ids.pop() {
                Some(document_id) => match validate_document_id_value(document_id)? {
                    Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
                    Err(user_error) => Ok(Err(user_error)),
                },
                None => Ok(Err(UserError::MissingDocumentId {
                    primary_key: nested.name().to_string(),
                    document: obkv_to_object(document, documents_batch_index)?,
                })),
            }
        }
    }
}

/// A type that represent the type of primary key that has been set
/// for this index, a classic flat one or a nested one.
#[derive(Debug, Clone, Copy)]
enum PrimaryKey<'a> {
    Flat { name: &'a str, field_id: FieldId },
    Nested { name: &'a str },
}

impl PrimaryKey<'_> {
    fn flat(name: &str, field_id: FieldId) -> PrimaryKey {
        PrimaryKey::Flat { name, field_id }
    }

    fn nested(name: &str) -> PrimaryKey {
        PrimaryKey::Nested { name }
    }

    fn name(&self) -> &str {
        match self {
            PrimaryKey::Flat { name, .. } => name,
            PrimaryKey::Nested { name } => name,
        }
    }

    /// Returns an `Iterator` that gives all the possible fields names the primary key
    /// can have depending of the first level name and deepnes of the objects.
    fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
        let name = self.name();
        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
            .chain(iter::once((name, "")))
            })
        }
    })
}

/// A type that represents a document id that has been retrieved from a document or auto-generated.
@@ -255,14 +182,6 @@ pub enum DocumentId {
}

impl DocumentId {
    fn retrieved(value: String) -> DocumentId {
        DocumentId::Retrieved { value }
    }

    fn generated(value: String, document_nth: u32) -> DocumentId {
        DocumentId::Generated { value, document_nth }
    }

    fn debug(&self) -> String {
        format!("{:?}", self)
    }
@@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId {
    }
}

fn starts_with(selector: &str, key: &str) -> bool {
    selector.strip_prefix(key).map_or(false, |tail| {
        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
    })
}

pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
    match value {
        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
        otherwise => output.push(otherwise),
    }
}

pub fn fetch_matching_values_in_object(
    object: Object,
    selector: &str,
    base_key: &str,
    output: &mut Vec<Value>,
) {
    for (key, value) in object {
        let base_key = if base_key.is_empty() {
            key.to_string()
        } else {
            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
        };

        if starts_with(selector, &base_key) {
            match value {
                Value::Object(object) => {
                    fetch_matching_values_in_object(object, selector, &base_key, output)
                }
                value => output.push(value),
            }
        }
    }
}

pub fn validate_document_id(document_id: &str) -> Option<&str> {
    if !document_id.is_empty()
        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
    {
        Some(document_id)
    } else {
        None
    }
}

/// Parses a Json encoded document id and validate it, returning a user error when it is one.
pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
    match document_id {
        Value::String(string) => match validate_document_id(&string) {
            Some(s) if s.len() == string.len() => Ok(Ok(string)),
            Some(s) => Ok(Ok(s.to_string())),
            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
        },
        Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
        content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
    }
}

/// Try to extract an `f64` from a JSON `Value` and return the `Value`
/// in the `Err` variant if it failed.
pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
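Aside (not part of the diff): the hunks above delete the inline lookup and validation code (the local PrimaryKey enum, fetch_matching_values, validate_document_id, validate_document_id_value) and leave fetch_or_generate_document_id with a single match over primary_key.document_id(), which presumably performs those steps inside crate::documents and reports failures as DocumentIdExtractionError. Below is a self-contained Rust sketch of that control flow with simplified stand-in types; in milli the generated value is a hyphenated UUID v4 and the errors wrap UserError, neither of which is shown in this commit.

use std::collections::HashMap;

#[derive(Debug)]
enum DocumentIdExtractionError {
    InvalidDocumentId(String), // the real variant wraps milli's UserError
    MissingDocumentId,
    TooManyDocumentIds(usize),
}

#[derive(Debug)]
enum DocumentId {
    Retrieved { value: String },
    Generated { value: String, document_nth: u32 },
}

// Stand-in for PrimaryKey::document_id: look the value up and validate it here,
// instead of leaving both steps to the caller as the removed code did.
fn document_id(
    document: &HashMap<&str, &str>,
    primary_key: &str,
) -> Result<String, DocumentIdExtractionError> {
    let is_valid = |id: &str| {
        !id.is_empty() && id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
    };
    match document.get(primary_key) {
        Some(value) if is_valid(value) => Ok(value.to_string()),
        Some(value) => Err(DocumentIdExtractionError::InvalidDocumentId(value.to_string())),
        None => Err(DocumentIdExtractionError::MissingDocumentId),
    }
}

// Mirrors the new match in the diff: every outcome is handled in one flat match,
// and only the autogenerate decision stays in the enrich step.
fn fetch_or_generate_document_id(
    document: &HashMap<&str, &str>,
    primary_key: &str,
    autogenerate_docids: bool,
    count: u32,
) -> Result<DocumentId, String> {
    match document_id(document, primary_key) {
        Ok(value) => Ok(DocumentId::Retrieved { value }),
        Err(DocumentIdExtractionError::InvalidDocumentId(value)) => {
            Err(format!("invalid document id: {value:?}"))
        }
        Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => {
            // milli generates a hyphenated UUID v4 here; a counter keeps this sketch dependency-free.
            Ok(DocumentId::Generated { value: format!("generated-{count}"), document_nth: count })
        }
        Err(DocumentIdExtractionError::MissingDocumentId) => {
            Err(format!("document misses its {primary_key:?} field"))
        }
        Err(DocumentIdExtractionError::TooManyDocumentIds(n)) => {
            Err(format!("{n} candidate ids found for a nested primary key"))
        }
    }
}

fn main() {
    let doc: HashMap<&str, &str> = [("title", "Dune")].into_iter().collect();
    println!("{:?}", fetch_or_generate_document_id(&doc, "id", true, 0));
    println!("{:?}", fetch_or_generate_document_id(&doc, "id", false, 0));
}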
@@ -20,10 +20,7 @@ use slice_group_by::GroupBy;
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};

use self::enrich::enrich_documents_batch;
pub use self::enrich::{
    extract_finite_float_from_value, validate_document_id, validate_document_id_value,
    validate_geo_from_json, DocumentId,
};
pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId};
pub use self::helpers::{
    as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
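Aside (not part of the diff): this module's pub use list shrinks to the names that enrich.rs still defines after the deletions above. A toy, runnable reproduction of that re-export pattern follows; the module and item bodies are stand-ins, not milli's code, where the parent module is update/index_documents/mod.rs.

// A minimal module whose public surface matches what the new pub use line keeps.
mod enrich {
    #[derive(Debug)]
    pub struct DocumentId;
    pub fn extract_finite_float_from_value() {}
    pub fn validate_geo_from_json() {}
}

// Only what enrich still defines (and callers still need) stays re-exported.
pub use enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId};

fn main() {
    // The re-exported names resolve at the crate root, as they would for milli's callers.
    extract_finite_float_from_value();
    validate_geo_from_json();
    println!("{:?}", DocumentId);
}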