Introduce the validate_documents_batch function

This commit is contained in:
Kerollmops 2022-06-14 18:12:15 +02:00
parent cefffde9af
commit 0146175fe6
No known key found for this signature in database
GPG key ID: 92ADA4E935E71FA4
7 changed files with 208 additions and 73 deletions

View file

@ -180,24 +180,10 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) {
mod test {
use std::io::Cursor;
use serde_json::{json, Map};
use serde_json::json;
use super::*;
use crate::documents::DocumentsBatchReader;
use crate::FieldId;
fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
let mut map = Map::new();
for (fid, value) in obkv.iter() {
let field_name = index.name(fid).unwrap().clone();
let value: Value = serde_json::from_slice(value).unwrap();
map.insert(field_name.to_string(), value);
}
Value::Object(map)
}
use crate::documents::{obkv_to_object, DocumentsBatchReader};
#[test]
fn add_single_documents_json() {
@ -272,7 +258,7 @@ mod test {
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
assert_eq!(
val,
@ -301,7 +287,7 @@ mod test {
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
assert_eq!(
val,
@ -328,7 +314,7 @@ mod test {
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
assert_eq!(
val,
@ -355,7 +341,7 @@ mod test {
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
assert_eq!(
val,
@ -382,7 +368,7 @@ mod test {
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
assert_eq!(
val,
@ -409,7 +395,7 @@ mod test {
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
assert_eq!(
val,
@ -436,7 +422,7 @@ mod test {
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
assert_eq!(
val,
@ -463,7 +449,7 @@ mod test {
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
assert_eq!(
val,
@ -507,7 +493,7 @@ mod test {
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
assert_eq!(
val,

View file

@ -6,15 +6,30 @@ use std::io;
use bimap::BiHashMap;
pub use builder::DocumentsBatchBuilder;
use obkv::KvReader;
pub use reader::{DocumentsBatchCursor, DocumentsBatchReader};
use serde::{Deserialize, Serialize};
use crate::FieldId;
use crate::error::{FieldIdMapMissingEntry, InternalError};
use crate::{FieldId, Object, Result};
/// The key that is used to store the `DocumentsBatchIndex` data structure.
///
/// It is `u64::MAX` encoded as big-endian bytes, which makes it the absolute
/// last key of the list.
const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes();
/// Helper function to convert an obkv reader into a JSON object.
pub fn obkv_to_object(obkv: &KvReader<FieldId>, index: &DocumentsBatchIndex) -> Result<Object> {
obkv.iter()
.map(|(field_id, value)| {
let field_name = index.name(field_id).ok_or_else(|| {
FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" }
})?;
let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
Ok((field_name.to_string(), value))
})
.collect()
}
/// A bidirectional map that links field ids to their names in a document batch.
///
/// It is a thin wrapper around a `BiHashMap<FieldId, String>`: the left side
/// holds the field ids and the right side holds the field names.
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
@ -48,11 +63,12 @@ impl DocumentsBatchIndex {
self.0.get_by_left(&id).map(AsRef::as_ref)
}
pub fn recreate_json(
&self,
document: &obkv::KvReaderU16,
) -> Result<serde_json::Map<String, serde_json::Value>, crate::Error> {
let mut map = serde_json::Map::new();
pub fn id(&self, name: &str) -> Option<FieldId> {
self.0.get_by_right(name).cloned()
}
pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result<Object> {
let mut map = Object::new();
for (k, v) in document.iter() {
// TODO: TAMO: update the error type