mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-06-03 03:18:03 +02:00
aa6c5df0bc
document reader transform remove update format support document sequences fix document transform clean transform improve error handling add documents! macro fix transform bug fix tests remove csv dependency Add comments on the transform process replace search cli fmt review edits fix http ui fix clippy warnings Revert "fix clippy warnings" This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620. fix review comments remove smallvec in transform loop review edits
466 lines
13 KiB
Rust
466 lines
13 KiB
Rust
use std::convert::TryInto;
|
|
use std::{fmt, io};
|
|
|
|
use byteorder::{BigEndian, WriteBytesExt};
|
|
use obkv::KvWriter;
|
|
use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer};
|
|
|
|
use super::{ByteCounter, DocumentsBatchIndex, Error};
|
|
use crate::FieldId;
|
|
|
|
pub struct DocumentSerializer<W> {
|
|
pub writer: ByteCounter<W>,
|
|
pub buffer: Vec<u8>,
|
|
pub index: DocumentsBatchIndex,
|
|
pub count: usize,
|
|
pub allow_seq: bool,
|
|
}
|
|
|
|
impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer<W> {
|
|
type Ok = ();
|
|
|
|
type Error = Error;
|
|
|
|
type SerializeSeq = SeqSerializer<'a, W>;
|
|
type SerializeTuple = Impossible<(), Self::Error>;
|
|
type SerializeTupleStruct = Impossible<(), Self::Error>;
|
|
type SerializeTupleVariant = Impossible<(), Self::Error>;
|
|
type SerializeMap = MapSerializer<'a, &'a mut ByteCounter<W>>;
|
|
type SerializeStruct = Impossible<(), Self::Error>;
|
|
type SerializeStructVariant = Impossible<(), Self::Error>;
|
|
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
|
self.buffer.clear();
|
|
let cursor = io::Cursor::new(&mut self.buffer);
|
|
self.count += 1;
|
|
let map_serializer = MapSerializer {
|
|
map: KvWriter::new(cursor),
|
|
index: &mut self.index,
|
|
writer: &mut self.writer,
|
|
buffer: Vec::new(),
|
|
};
|
|
|
|
Ok(map_serializer)
|
|
}
|
|
|
|
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
|
if self.allow_seq {
|
|
// Only allow sequence of documents of depth 1.
|
|
self.allow_seq = false;
|
|
Ok(SeqSerializer { serializer: self })
|
|
} else {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
}
|
|
|
|
fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
|
where
|
|
T: Serialize,
|
|
{
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_unit_variant(
|
|
self,
|
|
_name: &'static str,
|
|
_variant_index: u32,
|
|
_variant: &'static str,
|
|
) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_newtype_struct<T: ?Sized>(
|
|
self,
|
|
_name: &'static str,
|
|
_value: &T,
|
|
) -> Result<Self::Ok, Self::Error>
|
|
where
|
|
T: Serialize,
|
|
{
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_newtype_variant<T: ?Sized>(
|
|
self,
|
|
_name: &'static str,
|
|
_variant_index: u32,
|
|
_variant: &'static str,
|
|
_value: &T,
|
|
) -> Result<Self::Ok, Self::Error>
|
|
where
|
|
T: Serialize,
|
|
{
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_tuple_struct(
|
|
self,
|
|
_name: &'static str,
|
|
_len: usize,
|
|
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_tuple_variant(
|
|
self,
|
|
_name: &'static str,
|
|
_variant_index: u32,
|
|
_variant: &'static str,
|
|
_len: usize,
|
|
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_struct(
|
|
self,
|
|
_name: &'static str,
|
|
_len: usize,
|
|
) -> Result<Self::SerializeStruct, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_struct_variant(
|
|
self,
|
|
_name: &'static str,
|
|
_variant_index: u32,
|
|
_variant: &'static str,
|
|
_len: usize,
|
|
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
}
|
|
|
|
pub struct SeqSerializer<'a, W> {
|
|
serializer: &'a mut DocumentSerializer<W>,
|
|
}
|
|
|
|
impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> {
|
|
type Ok = ();
|
|
type Error = Error;
|
|
|
|
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
|
where
|
|
T: Serialize,
|
|
{
|
|
value.serialize(&mut *self.serializer)?;
|
|
Ok(())
|
|
}
|
|
|
|
fn end(self) -> Result<Self::Ok, Self::Error> {
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
pub struct MapSerializer<'a, W> {
|
|
map: KvWriter<io::Cursor<&'a mut Vec<u8>>, FieldId>,
|
|
index: &'a mut DocumentsBatchIndex,
|
|
writer: W,
|
|
buffer: Vec<u8>,
|
|
}
|
|
|
|
/// This implementation of SerializeMap uses serilialize_entry instead of seriliaze_key and
|
|
/// serialize_value, therefore these to methods remain unimplemented.
|
|
impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> {
|
|
type Ok = ();
|
|
type Error = Error;
|
|
|
|
fn serialize_key<T: ?Sized + Serialize>(&mut self, _key: &T) -> Result<(), Self::Error> {
|
|
unreachable!()
|
|
}
|
|
|
|
fn serialize_value<T: ?Sized>(&mut self, _value: &T) -> Result<(), Self::Error> {
|
|
unreachable!()
|
|
}
|
|
|
|
fn end(mut self) -> Result<Self::Ok, Self::Error> {
|
|
let data = self.map.into_inner().map_err(Error::Io)?.into_inner();
|
|
let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?;
|
|
|
|
self.writer.write_u32::<BigEndian>(data_len).map_err(Error::Io)?;
|
|
self.writer.write_all(&data).map_err(Error::Io)?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
|
&mut self,
|
|
key: &K,
|
|
value: &V,
|
|
) -> Result<(), Self::Error>
|
|
where
|
|
K: Serialize,
|
|
V: Serialize,
|
|
{
|
|
let field_serializer = FieldSerializer { index: &mut self.index };
|
|
let field_id: FieldId = key.serialize(field_serializer)?;
|
|
|
|
self.buffer.clear();
|
|
let mut cursor = io::Cursor::new(&mut self.buffer);
|
|
serde_json::to_writer(&mut cursor, value).map_err(Error::JsonError)?;
|
|
|
|
self.map.insert(field_id, cursor.into_inner()).map_err(Error::Io)?;
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
struct FieldSerializer<'a> {
|
|
index: &'a mut DocumentsBatchIndex,
|
|
}
|
|
|
|
impl<'a> serde::Serializer for FieldSerializer<'a> {
|
|
type Ok = FieldId;
|
|
|
|
type Error = Error;
|
|
|
|
type SerializeSeq = Impossible<FieldId, Self::Error>;
|
|
type SerializeTuple = Impossible<FieldId, Self::Error>;
|
|
type SerializeTupleStruct = Impossible<FieldId, Self::Error>;
|
|
type SerializeTupleVariant = Impossible<FieldId, Self::Error>;
|
|
type SerializeMap = Impossible<FieldId, Self::Error>;
|
|
type SerializeStruct = Impossible<FieldId, Self::Error>;
|
|
type SerializeStructVariant = Impossible<FieldId, Self::Error>;
|
|
|
|
fn serialize_str(self, ws: &str) -> Result<Self::Ok, Self::Error> {
|
|
let field_id = match self.index.get_by_right(ws) {
|
|
Some(field_id) => *field_id,
|
|
None => {
|
|
let field_id = self.index.len() as FieldId;
|
|
self.index.insert(field_id, ws.to_string());
|
|
field_id
|
|
}
|
|
};
|
|
|
|
Ok(field_id)
|
|
}
|
|
|
|
fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
|
where
|
|
T: Serialize,
|
|
{
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_unit_variant(
|
|
self,
|
|
_name: &'static str,
|
|
_variant_index: u32,
|
|
_variant: &'static str,
|
|
) -> Result<Self::Ok, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_newtype_struct<T: ?Sized>(
|
|
self,
|
|
_name: &'static str,
|
|
_value: &T,
|
|
) -> Result<Self::Ok, Self::Error>
|
|
where
|
|
T: Serialize,
|
|
{
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_newtype_variant<T: ?Sized>(
|
|
self,
|
|
_name: &'static str,
|
|
_variant_index: u32,
|
|
_variant: &'static str,
|
|
_value: &T,
|
|
) -> Result<Self::Ok, Self::Error>
|
|
where
|
|
T: Serialize,
|
|
{
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_tuple_struct(
|
|
self,
|
|
_name: &'static str,
|
|
_len: usize,
|
|
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_tuple_variant(
|
|
self,
|
|
_name: &'static str,
|
|
_variant_index: u32,
|
|
_variant: &'static str,
|
|
_len: usize,
|
|
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_struct(
|
|
self,
|
|
_name: &'static str,
|
|
_len: usize,
|
|
) -> Result<Self::SerializeStruct, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
|
|
fn serialize_struct_variant(
|
|
self,
|
|
_name: &'static str,
|
|
_variant_index: u32,
|
|
_variant: &'static str,
|
|
_len: usize,
|
|
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
|
Err(Error::InvalidDocumentFormat)
|
|
}
|
|
}
|
|
|
|
impl serde::ser::Error for Error {
|
|
fn custom<T: fmt::Display>(msg: T) -> Self {
|
|
Error::Custom(msg.to_string())
|
|
}
|
|
}
|