mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
Implement documents format
document reader transform remove update format support document sequences fix document transform clean transform improve error handling add documents! macro fix transform bug fix tests remove csv dependency Add comments on the transform process replace search cli fmt review edits fix http ui fix clippy warnings Revert "fix clippy warnings" This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620. fix review comments remove smallvec in transform loop review edits
This commit is contained in:
parent
94764e5c7c
commit
aa6c5df0bc
25 changed files with 5114 additions and 713 deletions
80
milli/src/documents/builder.rs
Normal file
80
milli/src/documents/builder.rs
Normal file
|
@ -0,0 +1,80 @@
|
|||
use std::io;
|
||||
|
||||
use byteorder::{BigEndian, WriteBytesExt};
|
||||
use serde::ser::Serialize;
|
||||
|
||||
use super::serde::DocumentSerializer;
|
||||
use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
|
||||
|
||||
/// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
|
||||
/// format used by milli.
|
||||
///
|
||||
/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
|
||||
/// iterate other the documents.
|
||||
///
|
||||
/// ## example:
|
||||
/// ```
|
||||
/// use milli::documents::DocumentBatchBuilder;
|
||||
/// use serde_json::json;
|
||||
/// use std::io::Cursor;
|
||||
///
|
||||
/// let mut writer = Cursor::new(Vec::new());
|
||||
/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
|
||||
/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap();
|
||||
/// builder.finish().unwrap();
|
||||
/// ```
|
||||
pub struct DocumentBatchBuilder<W> {
|
||||
serializer: DocumentSerializer<W>,
|
||||
}
|
||||
|
||||
impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
||||
pub fn new(writer: W) -> Result<Self, Error> {
|
||||
let index = DocumentsBatchIndex::new();
|
||||
let mut writer = ByteCounter::new(writer);
|
||||
// add space to write the offset of the metadata at the end of the writer
|
||||
writer.write_u64::<BigEndian>(0)?;
|
||||
|
||||
let serializer =
|
||||
DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true };
|
||||
|
||||
Ok(Self { serializer })
|
||||
}
|
||||
|
||||
/// Returns the number of documents that have been written to the builder.
|
||||
pub fn len(&self) -> usize {
|
||||
self.serializer.count
|
||||
}
|
||||
|
||||
/// This method must be called after the document addition is terminated. It will put the
|
||||
/// metadata at the end of the file, and write the metadata offset at the beginning on the
|
||||
/// file.
|
||||
pub fn finish(self) -> Result<(), Error> {
|
||||
let DocumentSerializer {
|
||||
writer: ByteCounter { mut writer, count: offset },
|
||||
index,
|
||||
count,
|
||||
..
|
||||
} = self.serializer;
|
||||
|
||||
let meta = DocumentsMetadata { count, index };
|
||||
|
||||
bincode::serialize_into(&mut writer, &meta)?;
|
||||
|
||||
writer.seek(io::SeekFrom::Start(0))?;
|
||||
writer.write_u64::<BigEndian>(offset as u64)?;
|
||||
|
||||
writer.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Adds documents to the builder.
|
||||
///
|
||||
/// The internal index is updated with the fields found
|
||||
/// in the documents. Document must either be a map or a sequences of map, anything else will
|
||||
/// fail.
|
||||
pub fn add_documents<T: Serialize>(&mut self, document: T) -> Result<(), Error> {
|
||||
document.serialize(&mut self.serializer)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
233
milli/src/documents/mod.rs
Normal file
233
milli/src/documents/mod.rs
Normal file
|
@ -0,0 +1,233 @@
|
|||
mod builder;
|
||||
/// The documents module defines an intermediary document format that milli uses for indexation, and
|
||||
/// provides an API to easily build and read such documents.
|
||||
///
|
||||
/// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can
|
||||
/// later be read by milli using the `DocumentBatchReader` interface.
|
||||
mod reader;
|
||||
mod serde;
|
||||
|
||||
use std::{fmt, io};
|
||||
|
||||
use ::serde::{Deserialize, Serialize};
|
||||
use bimap::BiHashMap;
|
||||
pub use builder::DocumentBatchBuilder;
|
||||
pub use reader::DocumentBatchReader;
|
||||
|
||||
use crate::FieldId;
|
||||
|
||||
/// A bidirectional map that links field ids to their name in a document batch.
|
||||
pub type DocumentsBatchIndex = BiHashMap<FieldId, String>;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DocumentsMetadata {
|
||||
count: usize,
|
||||
index: DocumentsBatchIndex,
|
||||
}
|
||||
|
||||
pub struct ByteCounter<W> {
|
||||
count: usize,
|
||||
writer: W,
|
||||
}
|
||||
|
||||
impl<W> ByteCounter<W> {
|
||||
fn new(writer: W) -> Self {
|
||||
Self { count: 0, writer }
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: io::Write> io::Write for ByteCounter<W> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
let count = self.writer.write(buf)?;
|
||||
self.count += count;
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.writer.flush()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Error {
|
||||
InvalidDocumentFormat,
|
||||
Custom(String),
|
||||
JsonError(serde_json::Error),
|
||||
Serialize(bincode::Error),
|
||||
Io(io::Error),
|
||||
DocumentTooLarge,
|
||||
}
|
||||
|
||||
impl From<io::Error> for Error {
|
||||
fn from(other: io::Error) -> Self {
|
||||
Self::Io(other)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<bincode::Error> for Error {
|
||||
fn from(other: bincode::Error) -> Self {
|
||||
Self::Serialize(other)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
|
||||
Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
|
||||
Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
|
||||
Error::Io(e) => e.fmt(f),
|
||||
Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"),
|
||||
Error::Serialize(e) => e.fmt(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for Error {}
|
||||
|
||||
/// Macro used to generate documents, with the same syntax as `serde_json::json`
|
||||
#[cfg(test)]
|
||||
macro_rules! documents {
|
||||
($data:tt) => {{
|
||||
let documents = serde_json::json!($data);
|
||||
let mut writer = std::io::Cursor::new(Vec::new());
|
||||
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
|
||||
builder.add_documents(documents).unwrap();
|
||||
builder.finish().unwrap();
|
||||
|
||||
writer.set_position(0);
|
||||
|
||||
crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
|
||||
}};
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn create_documents_no_errors() {
|
||||
let json = json!({
|
||||
"number": 1,
|
||||
"string": "this is a field",
|
||||
"array": ["an", "array"],
|
||||
"object": {
|
||||
"key": "value",
|
||||
},
|
||||
"bool": true
|
||||
});
|
||||
|
||||
let mut v = Vec::new();
|
||||
let mut cursor = io::Cursor::new(&mut v);
|
||||
|
||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||
|
||||
builder.add_documents(json).unwrap();
|
||||
|
||||
builder.finish().unwrap();
|
||||
|
||||
let mut documents =
|
||||
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
|
||||
|
||||
assert_eq!(documents.index().iter().count(), 5);
|
||||
|
||||
let reader = documents.next_document_with_index().unwrap().unwrap();
|
||||
|
||||
assert_eq!(reader.1.iter().count(), 5);
|
||||
assert!(documents.next_document_with_index().unwrap().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_add_multiple_documents() {
|
||||
let doc1 = json!({
|
||||
"bool": true,
|
||||
});
|
||||
let doc2 = json!({
|
||||
"toto": false,
|
||||
});
|
||||
|
||||
let mut v = Vec::new();
|
||||
let mut cursor = io::Cursor::new(&mut v);
|
||||
|
||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||
|
||||
builder.add_documents(doc1).unwrap();
|
||||
builder.add_documents(doc2).unwrap();
|
||||
|
||||
builder.finish().unwrap();
|
||||
|
||||
let mut documents =
|
||||
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
|
||||
|
||||
assert_eq!(documents.index().iter().count(), 2);
|
||||
|
||||
let reader = documents.next_document_with_index().unwrap().unwrap();
|
||||
|
||||
assert_eq!(reader.1.iter().count(), 1);
|
||||
assert!(documents.next_document_with_index().unwrap().is_some());
|
||||
assert!(documents.next_document_with_index().unwrap().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_documents_array() {
|
||||
let docs = json!([
|
||||
{ "toto": false },
|
||||
{ "tata": "hello" },
|
||||
]);
|
||||
|
||||
let mut v = Vec::new();
|
||||
let mut cursor = io::Cursor::new(&mut v);
|
||||
|
||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||
|
||||
builder.add_documents(docs).unwrap();
|
||||
|
||||
builder.finish().unwrap();
|
||||
|
||||
let mut documents =
|
||||
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
|
||||
|
||||
assert_eq!(documents.index().iter().count(), 2);
|
||||
|
||||
let reader = documents.next_document_with_index().unwrap().unwrap();
|
||||
|
||||
assert_eq!(reader.1.iter().count(), 1);
|
||||
assert!(documents.next_document_with_index().unwrap().is_some());
|
||||
assert!(documents.next_document_with_index().unwrap().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_invalid_document_format() {
|
||||
let mut v = Vec::new();
|
||||
let mut cursor = io::Cursor::new(&mut v);
|
||||
|
||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||
|
||||
let docs = json!([[
|
||||
{ "toto": false },
|
||||
{ "tata": "hello" },
|
||||
]]);
|
||||
|
||||
assert!(builder.add_documents(docs).is_err());
|
||||
|
||||
let docs = json!("hello");
|
||||
|
||||
assert!(builder.add_documents(docs).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_nested() {
|
||||
let mut docs = documents!([{
|
||||
"hello": {
|
||||
"toto": ["hello"]
|
||||
}
|
||||
}]);
|
||||
|
||||
let (_index, doc) = docs.next_document_with_index().unwrap().unwrap();
|
||||
|
||||
let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
|
||||
assert_eq!(nested, json!({ "toto": ["hello"] }));
|
||||
}
|
||||
}
|
75
milli/src/documents/reader.rs
Normal file
75
milli/src/documents/reader.rs
Normal file
|
@ -0,0 +1,75 @@
|
|||
use std::io;
|
||||
use std::io::{BufReader, Read};
|
||||
use std::mem::size_of;
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use obkv::KvReader;
|
||||
|
||||
use super::{DocumentsBatchIndex, DocumentsMetadata, Error};
|
||||
use crate::FieldId;
|
||||
|
||||
/// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with
|
||||
/// a `DocumentsBatchWriter`.
|
||||
///
|
||||
/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
|
||||
/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
|
||||
pub struct DocumentBatchReader<R> {
|
||||
reader: BufReader<R>,
|
||||
metadata: DocumentsMetadata,
|
||||
buffer: Vec<u8>,
|
||||
seen_documents: usize,
|
||||
}
|
||||
|
||||
impl<R: io::Read + io::Seek> DocumentBatchReader<R> {
|
||||
/// Construct a `DocumentsReader` from a reader.
|
||||
///
|
||||
/// It first retrieves the index, then moves to the first document. Subsequent calls to
|
||||
/// `next_document` advance the document reader until all the documents have been read.
|
||||
pub fn from_reader(mut reader: R) -> Result<Self, Error> {
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
let meta_offset = reader.read_u64::<BigEndian>()?;
|
||||
reader.seek(io::SeekFrom::Start(meta_offset))?;
|
||||
reader.read_to_end(&mut buffer)?;
|
||||
let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?;
|
||||
|
||||
reader.seek(io::SeekFrom::Start(size_of::<u64>() as u64))?;
|
||||
buffer.clear();
|
||||
|
||||
let reader = BufReader::new(reader);
|
||||
|
||||
Ok(Self { reader, metadata, buffer, seen_documents: 0 })
|
||||
}
|
||||
|
||||
/// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a
|
||||
/// reference to the addition index.
|
||||
pub fn next_document_with_index<'a>(
|
||||
&'a mut self,
|
||||
) -> io::Result<Option<(&'a DocumentsBatchIndex, KvReader<'a, FieldId>)>> {
|
||||
if self.seen_documents < self.metadata.count {
|
||||
let doc_len = self.reader.read_u32::<BigEndian>()?;
|
||||
self.buffer.resize(doc_len as usize, 0);
|
||||
self.reader.read_exact(&mut self.buffer)?;
|
||||
self.seen_documents += 1;
|
||||
|
||||
let reader = KvReader::new(&self.buffer);
|
||||
Ok(Some((&self.metadata.index, reader)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the fields index for the documents batch.
|
||||
pub fn index(&self) -> &DocumentsBatchIndex {
|
||||
&self.metadata.index
|
||||
}
|
||||
|
||||
/// Returns the number of documents in the reader.
|
||||
pub fn len(&self) -> usize {
|
||||
self.metadata.count
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
}
|
465
milli/src/documents/serde.rs
Normal file
465
milli/src/documents/serde.rs
Normal file
|
@ -0,0 +1,465 @@
|
|||
use std::convert::TryInto;
|
||||
use std::{fmt, io};
|
||||
|
||||
use byteorder::{BigEndian, WriteBytesExt};
|
||||
use obkv::KvWriter;
|
||||
use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer};
|
||||
|
||||
use super::{ByteCounter, DocumentsBatchIndex, Error};
|
||||
use crate::FieldId;
|
||||
|
||||
pub struct DocumentSerializer<W> {
|
||||
pub writer: ByteCounter<W>,
|
||||
pub buffer: Vec<u8>,
|
||||
pub index: DocumentsBatchIndex,
|
||||
pub count: usize,
|
||||
pub allow_seq: bool,
|
||||
}
|
||||
|
||||
impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer<W> {
|
||||
type Ok = ();
|
||||
|
||||
type Error = Error;
|
||||
|
||||
type SerializeSeq = SeqSerializer<'a, W>;
|
||||
type SerializeTuple = Impossible<(), Self::Error>;
|
||||
type SerializeTupleStruct = Impossible<(), Self::Error>;
|
||||
type SerializeTupleVariant = Impossible<(), Self::Error>;
|
||||
type SerializeMap = MapSerializer<'a, &'a mut ByteCounter<W>>;
|
||||
type SerializeStruct = Impossible<(), Self::Error>;
|
||||
type SerializeStructVariant = Impossible<(), Self::Error>;
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
self.buffer.clear();
|
||||
let cursor = io::Cursor::new(&mut self.buffer);
|
||||
self.count += 1;
|
||||
let map_serializer = MapSerializer {
|
||||
map: KvWriter::new(cursor),
|
||||
index: &mut self.index,
|
||||
writer: &mut self.writer,
|
||||
buffer: Vec::new(),
|
||||
};
|
||||
|
||||
Ok(map_serializer)
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
if self.allow_seq {
|
||||
// Only allow sequence of documents of depth 1.
|
||||
self.allow_seq = false;
|
||||
Ok(SeqSerializer { serializer: self })
|
||||
} else {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SeqSerializer<'a, W> {
|
||||
serializer: &'a mut DocumentSerializer<W>,
|
||||
}
|
||||
|
||||
impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> {
|
||||
type Ok = ();
|
||||
type Error = Error;
|
||||
|
||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
value.serialize(&mut *self.serializer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapSerializer<'a, W> {
|
||||
map: KvWriter<io::Cursor<&'a mut Vec<u8>>, FieldId>,
|
||||
index: &'a mut DocumentsBatchIndex,
|
||||
writer: W,
|
||||
buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
/// This implementation of SerializeMap uses serilialize_entry instead of seriliaze_key and
|
||||
/// serialize_value, therefore these to methods remain unimplemented.
|
||||
impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> {
|
||||
type Ok = ();
|
||||
type Error = Error;
|
||||
|
||||
fn serialize_key<T: ?Sized + Serialize>(&mut self, _key: &T) -> Result<(), Self::Error> {
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, _value: &T) -> Result<(), Self::Error> {
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
fn end(mut self) -> Result<Self::Ok, Self::Error> {
|
||||
let data = self.map.into_inner().map_err(Error::Io)?.into_inner();
|
||||
let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?;
|
||||
|
||||
self.writer.write_u32::<BigEndian>(data_len).map_err(Error::Io)?;
|
||||
self.writer.write_all(&data).map_err(Error::Io)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V,
|
||||
) -> Result<(), Self::Error>
|
||||
where
|
||||
K: Serialize,
|
||||
V: Serialize,
|
||||
{
|
||||
let field_serializer = FieldSerializer { index: &mut self.index };
|
||||
let field_id: FieldId = key.serialize(field_serializer)?;
|
||||
|
||||
self.buffer.clear();
|
||||
let mut cursor = io::Cursor::new(&mut self.buffer);
|
||||
serde_json::to_writer(&mut cursor, value).map_err(Error::JsonError)?;
|
||||
|
||||
self.map.insert(field_id, cursor.into_inner()).map_err(Error::Io)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct FieldSerializer<'a> {
|
||||
index: &'a mut DocumentsBatchIndex,
|
||||
}
|
||||
|
||||
impl<'a> serde::Serializer for FieldSerializer<'a> {
|
||||
type Ok = FieldId;
|
||||
|
||||
type Error = Error;
|
||||
|
||||
type SerializeSeq = Impossible<FieldId, Self::Error>;
|
||||
type SerializeTuple = Impossible<FieldId, Self::Error>;
|
||||
type SerializeTupleStruct = Impossible<FieldId, Self::Error>;
|
||||
type SerializeTupleVariant = Impossible<FieldId, Self::Error>;
|
||||
type SerializeMap = Impossible<FieldId, Self::Error>;
|
||||
type SerializeStruct = Impossible<FieldId, Self::Error>;
|
||||
type SerializeStructVariant = Impossible<FieldId, Self::Error>;
|
||||
|
||||
fn serialize_str(self, ws: &str) -> Result<Self::Ok, Self::Error> {
|
||||
let field_id = match self.index.get_by_right(ws) {
|
||||
Some(field_id) => *field_id,
|
||||
None => {
|
||||
let field_id = self.index.len() as FieldId;
|
||||
self.index.insert(field_id, ws.to_string());
|
||||
field_id
|
||||
}
|
||||
};
|
||||
|
||||
Ok(field_id)
|
||||
}
|
||||
|
||||
fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(Error::InvalidDocumentFormat)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::ser::Error for Error {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
Error::Custom(msg.to_string())
|
||||
}
|
||||
}
|
|
@ -55,7 +55,6 @@ pub enum FieldIdMapMissingEntry {
|
|||
#[derive(Debug)]
|
||||
pub enum UserError {
|
||||
AttributeLimitReached,
|
||||
Csv(csv::Error),
|
||||
DocumentLimitReached,
|
||||
InvalidAscDescSyntax { name: String },
|
||||
InvalidDocumentId { document_id: Value },
|
||||
|
@ -212,7 +211,6 @@ impl fmt::Display for UserError {
|
|||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"),
|
||||
Self::Csv(error) => error.fmt(f),
|
||||
Self::DocumentLimitReached => f.write_str("maximum number of documents reached"),
|
||||
Self::InvalidFacetsDistribution { invalid_facets_name } => {
|
||||
let name_list =
|
||||
|
|
|
@ -868,7 +868,7 @@ pub(crate) mod tests {
|
|||
use maplit::btreemap;
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::update::{IndexDocuments, UpdateFormat};
|
||||
use crate::update::IndexDocuments;
|
||||
use crate::Index;
|
||||
|
||||
pub(crate) struct TempIndex {
|
||||
|
@ -904,13 +904,12 @@ pub(crate) mod tests {
|
|||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "id": 1, "name": "kevin" },
|
||||
{ "id": 2, "name": "bob", "age": 20 },
|
||||
{ "id": 2, "name": "bob", "age": 20 }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -929,8 +928,12 @@ pub(crate) mod tests {
|
|||
// we add all the documents a second time. we are supposed to get the same
|
||||
// field_distribution in the end
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
let content = documents!([
|
||||
{ "id": 1, "name": "kevin" },
|
||||
{ "id": 2, "name": "bob", "age": 20 },
|
||||
{ "id": 2, "name": "bob", "age": 20 }
|
||||
]);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -947,13 +950,12 @@ pub(crate) mod tests {
|
|||
);
|
||||
|
||||
// then we update a document by removing one field and another by adding one field
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "id": 1, "name": "kevin", "has_dog": true },
|
||||
{ "id": 2, "name": "bob" }
|
||||
]"#[..];
|
||||
]);
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
#[macro_use]
|
||||
extern crate pest_derive;
|
||||
|
||||
#[macro_use]
|
||||
pub mod documents;
|
||||
|
||||
mod criterion;
|
||||
mod error;
|
||||
mod external_documents_ids;
|
||||
|
|
|
@ -27,6 +27,7 @@ pub trait Distinct {
|
|||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::collections::HashSet;
|
||||
use std::io::Cursor;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::seq::SliceRandom;
|
||||
|
@ -34,19 +35,20 @@ mod test {
|
|||
use roaring::RoaringBitmap;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::documents::{DocumentBatchBuilder, DocumentBatchReader};
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::index::Index;
|
||||
use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
|
||||
use crate::update::{IndexDocumentsMethod, UpdateBuilder};
|
||||
use crate::{DocumentId, FieldId, BEU32};
|
||||
|
||||
static JSON: Lazy<Value> = Lazy::new(generate_json);
|
||||
static JSON: Lazy<Vec<u8>> = Lazy::new(generate_documents);
|
||||
|
||||
fn generate_json() -> Value {
|
||||
fn generate_documents() -> Vec<u8> {
|
||||
let mut rng = rand::thread_rng();
|
||||
let num_docs = rng.gen_range(10..30);
|
||||
|
||||
let mut documents = Vec::new();
|
||||
|
||||
let mut cursor = Cursor::new(Vec::new());
|
||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||
let txts = ["Toto", "Titi", "Tata"];
|
||||
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
|
||||
let cat_ints = (1..10).collect::<Vec<_>>();
|
||||
|
@ -66,10 +68,11 @@ mod test {
|
|||
"txts": sample_txts[..(rng.gen_range(0..3))],
|
||||
"cat-ints": sample_ints[..(rng.gen_range(0..3))],
|
||||
});
|
||||
documents.push(doc);
|
||||
builder.add_documents(doc).unwrap();
|
||||
}
|
||||
|
||||
Value::Array(documents)
|
||||
builder.finish().unwrap();
|
||||
cursor.into_inner()
|
||||
}
|
||||
|
||||
/// Returns a temporary index populated with random test documents, the FieldId for the
|
||||
|
@ -89,13 +92,15 @@ mod test {
|
|||
let mut addition = builder.index_documents(&mut txn, &index);
|
||||
|
||||
addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
|
||||
addition.update_format(UpdateFormat::Json);
|
||||
addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap();
|
||||
let reader =
|
||||
crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
|
||||
addition.execute(reader, |_, _| ()).unwrap();
|
||||
|
||||
let fields_map = index.fields_ids_map(&txn).unwrap();
|
||||
let fid = fields_map.id(&distinct).unwrap();
|
||||
|
||||
let map = (0..JSON.as_array().unwrap().len() as u32).collect();
|
||||
let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
|
||||
let map = (0..documents.len() as u32).collect();
|
||||
|
||||
txn.commit().unwrap();
|
||||
|
||||
|
|
|
@ -82,7 +82,7 @@ mod tests {
|
|||
use heed::EnvOpenOptions;
|
||||
|
||||
use super::*;
|
||||
use crate::update::{IndexDocuments, UpdateFormat};
|
||||
use crate::update::IndexDocuments;
|
||||
|
||||
#[test]
|
||||
fn clear_documents() {
|
||||
|
@ -92,14 +92,12 @@ mod tests {
|
|||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "id": 0, "name": "kevin", "age": 20 },
|
||||
{ "id": 1, "name": "kevina" },
|
||||
{ "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
]);
|
||||
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
|
||||
|
||||
// Clear all documents from the database.
|
||||
let builder = ClearDocuments::new(&mut wtxn, &index, 1);
|
||||
|
|
|
@ -567,7 +567,7 @@ mod tests {
|
|||
use maplit::hashset;
|
||||
|
||||
use super::*;
|
||||
use crate::update::{IndexDocuments, Settings, UpdateFormat};
|
||||
use crate::update::{IndexDocuments, Settings};
|
||||
use crate::FilterCondition;
|
||||
|
||||
#[test]
|
||||
|
@ -578,13 +578,12 @@ mod tests {
|
|||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
|
||||
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
|
||||
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
|
||||
// delete those documents, ids are synchronous therefore 0, 1, and 2.
|
||||
|
@ -609,13 +608,12 @@ mod tests {
|
|||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "mysuperid": 0, "name": "kevin" },
|
||||
{ "mysuperid": 1, "name": "kevina" },
|
||||
{ "mysuperid": 2, "name": "benoit" }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
|
||||
// Delete not all of the documents but some of them.
|
||||
|
@ -640,7 +638,7 @@ mod tests {
|
|||
builder.set_filterable_fields(hashset! { S("label") });
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{"docid":"1_4","label":"sign"},
|
||||
{"docid":"1_5","label":"letter"},
|
||||
{"docid":"1_7","label":"abstract,cartoon,design,pattern"},
|
||||
|
@ -661,9 +659,8 @@ mod tests {
|
|||
{"docid":"1_58","label":"abstract,art,cartoon"},
|
||||
{"docid":"1_68","label":"design"},
|
||||
{"docid":"1_69","label":"geometry"}
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
|
||||
// Delete not all of the documents but some of them.
|
||||
|
@ -692,7 +689,7 @@ mod tests {
|
|||
builder.set_sortable_fields(hashset!(S("_geo")));
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
|
||||
let content = &r#"[
|
||||
let content = documents!([
|
||||
{"id":"1","city":"Lille", "_geo": { "lat": 50.629973371633746, "lng": 3.0569447399419570 } },
|
||||
{"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.641586120121050, "lng": 3.1106593480348670 } },
|
||||
{"id":"3","city":"Hellemmes", "_geo": { "lat": 50.631220965518080, "lng": 3.1106399673339933 } },
|
||||
|
@ -713,12 +710,10 @@ mod tests {
|
|||
{"id":"18","city":"Amiens", "_geo": { "lat": 49.931472529669996, "lng": 2.2710499758317080 } },
|
||||
{"id":"19","city":"Compiègne", "_geo": { "lat": 49.444980887725656, "lng": 2.7913841281529015 } },
|
||||
{"id":"20","city":"Paris", "_geo": { "lat": 48.902100060895480, "lng": 2.3708400867406930 } }
|
||||
]"#[..];
|
||||
]);
|
||||
let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
|
||||
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
|
||||
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
|
||||
|
||||
let external_document_ids = index.external_documents_ids(&wtxn).unwrap();
|
||||
let ids_to_delete: Vec<u32> = external_ids_to_delete
|
||||
|
|
|
@ -4,7 +4,7 @@ mod transform;
|
|||
mod typed_chunk;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::io::{self, BufRead, BufReader};
|
||||
use std::io::{Read, Seek};
|
||||
use std::iter::FromIterator;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::time::Instant;
|
||||
|
@ -24,6 +24,7 @@ pub use self::helpers::{
|
|||
};
|
||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
use crate::documents::DocumentBatchReader;
|
||||
use crate::update::{
|
||||
Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
|
||||
WordsLevelPositions, WordsPrefixesFst,
|
||||
|
@ -57,17 +58,6 @@ pub enum WriteMethod {
|
|||
GetMergePut,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
#[non_exhaustive]
|
||||
pub enum UpdateFormat {
|
||||
/// The given update is a real **comma seperated** CSV with headers on the first line.
|
||||
Csv,
|
||||
/// The given update is a JSON array with documents inside.
|
||||
Json,
|
||||
/// The given update is a JSON stream with a document on each line.
|
||||
JsonStream,
|
||||
}
|
||||
|
||||
pub struct IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
|
@ -85,7 +75,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
|
|||
words_positions_level_group_size: Option<NonZeroU32>,
|
||||
words_positions_min_level_size: Option<NonZeroU32>,
|
||||
update_method: IndexDocumentsMethod,
|
||||
update_format: UpdateFormat,
|
||||
autogenerate_docids: bool,
|
||||
update_id: u64,
|
||||
}
|
||||
|
@ -113,18 +102,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||
words_positions_level_group_size: None,
|
||||
words_positions_min_level_size: None,
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
update_format: UpdateFormat::Json,
|
||||
autogenerate_docids: false,
|
||||
update_id,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn index_documents_method(&mut self, method: IndexDocumentsMethod) {
|
||||
self.update_method = method;
|
||||
pub fn log_every_n(&mut self, n: usize) {
|
||||
self.log_every_n = Some(n);
|
||||
}
|
||||
|
||||
pub fn update_format(&mut self, format: UpdateFormat) {
|
||||
self.update_format = format;
|
||||
pub fn index_documents_method(&mut self, method: IndexDocumentsMethod) {
|
||||
self.update_method = method;
|
||||
}
|
||||
|
||||
pub fn enable_autogenerate_docids(&mut self) {
|
||||
|
@ -136,16 +124,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||
}
|
||||
|
||||
#[logging_timer::time("IndexDocuments::{}")]
|
||||
pub fn execute<R, F>(self, reader: R, progress_callback: F) -> Result<DocumentAdditionResult>
|
||||
pub fn execute<R, F>(
|
||||
self,
|
||||
reader: DocumentBatchReader<R>,
|
||||
progress_callback: F,
|
||||
) -> Result<DocumentAdditionResult>
|
||||
where
|
||||
R: io::Read,
|
||||
R: Read + Seek,
|
||||
F: Fn(UpdateIndexingStep, u64) + Sync,
|
||||
{
|
||||
let mut reader = BufReader::new(reader);
|
||||
reader.fill_buf()?;
|
||||
|
||||
// Early return when there is no document to add
|
||||
if reader.buffer().is_empty() {
|
||||
if reader.is_empty() {
|
||||
return Ok(DocumentAdditionResult { nb_documents: 0 });
|
||||
}
|
||||
|
||||
|
@ -165,14 +154,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||
autogenerate_docids: self.autogenerate_docids,
|
||||
};
|
||||
|
||||
let output = match self.update_format {
|
||||
UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?,
|
||||
UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?,
|
||||
UpdateFormat::JsonStream => {
|
||||
transform.output_from_json_stream(reader, &progress_callback)?
|
||||
}
|
||||
};
|
||||
|
||||
let output = transform.read_documents(reader, progress_callback)?;
|
||||
let nb_documents = output.documents_count;
|
||||
|
||||
info!("Update transformed in {:.02?}", before_transform.elapsed());
|
||||
|
@ -462,6 +444,7 @@ mod tests {
|
|||
use heed::EnvOpenOptions;
|
||||
|
||||
use super::*;
|
||||
use crate::documents::DocumentBatchBuilder;
|
||||
use crate::update::DeleteDocuments;
|
||||
use crate::HashMap;
|
||||
|
||||
|
@ -474,9 +457,12 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
let content = documents!([
|
||||
{ "id": 1, "name": "kevin" },
|
||||
{ "id": 2, "name": "kevina" },
|
||||
{ "id": 3, "name": "benoit" }
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -488,9 +474,8 @@ mod tests {
|
|||
|
||||
// Second we send 1 document with id 1, to erase the previous ones.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"id,name\n1,updated kevin\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
let content = documents!([ { "id": 1, "name": "updated kevin" } ]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -502,9 +487,12 @@ mod tests {
|
|||
|
||||
// Third we send 3 documents again to replace the existing ones.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 2);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
let content = documents!([
|
||||
{ "id": 1, "name": "updated second kevin" },
|
||||
{ "id": 2, "name": "updated kevina" },
|
||||
{ "id": 3, "name": "updated benoit" }
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 2);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -525,9 +513,12 @@ mod tests {
|
|||
// First we send 3 documents with duplicate ids and
|
||||
// change the index method to merge documents.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"id,name\n1,kevin\n1,kevina\n1,benoit\n"[..];
|
||||
let content = documents!([
|
||||
{ "id": 1, "name": "kevin" },
|
||||
{ "id": 1, "name": "kevina" },
|
||||
{ "id": 1, "name": "benoit" }
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -552,9 +543,8 @@ mod tests {
|
|||
|
||||
// Second we send 1 document with id 1, to force it to be merged with the previous one.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"id,age\n1,25\n"[..];
|
||||
let content = documents!([ { "id": 1, "age": 25 } ]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -574,13 +564,13 @@ mod tests {
|
|||
let mut doc_iter = doc.iter();
|
||||
assert_eq!(doc_iter.next(), Some((0, &br#""1""#[..])));
|
||||
assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
|
||||
assert_eq!(doc_iter.next(), Some((2, &br#""25""#[..])));
|
||||
assert_eq!(doc_iter.next(), Some((2, &br#"25"#[..])));
|
||||
assert_eq!(doc_iter.next(), None);
|
||||
drop(rtxn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn not_auto_generated_csv_documents_ids() {
|
||||
fn not_auto_generated_documents_ids() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
|
@ -588,35 +578,12 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
assert!(builder.execute(content, |_, _| ()).is_err());
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is no document.
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let count = index.number_of_documents(&rtxn).unwrap();
|
||||
assert_eq!(count, 0);
|
||||
drop(rtxn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn not_auto_generated_json_documents_ids() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
// First we send 3 documents and 2 without ids.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &br#"[
|
||||
{ "name": "kevina", "id": 21 },
|
||||
let content = documents!([
|
||||
{ "name": "kevin" },
|
||||
{ "name": "kevina" },
|
||||
{ "name": "benoit" }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
assert!(builder.execute(content, |_, _| ()).is_err());
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -636,10 +603,13 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
|
||||
let content = documents!([
|
||||
{ "name": "kevin" },
|
||||
{ "name": "kevina" },
|
||||
{ "name": "benoit" }
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -655,10 +625,9 @@ mod tests {
|
|||
|
||||
// Second we send 1 document with the generated uuid, to erase the previous ones.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = format!("id,name\n{},updated kevin", kevin_uuid);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
|
||||
let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is **always** 3 documents.
|
||||
|
@ -689,9 +658,12 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
let content = documents!([
|
||||
{ "id": 1, "name": "kevin" },
|
||||
{ "id": 2, "name": "kevina" },
|
||||
{ "id": 3, "name": "benoit" }
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -703,9 +675,9 @@ mod tests {
|
|||
|
||||
// Second we send 1 document without specifying the id.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"name\nnew kevin"[..];
|
||||
let content = documents!([ { "name": "new kevin" } ]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -717,7 +689,7 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn empty_csv_update() {
|
||||
fn empty_update() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
|
@ -725,9 +697,8 @@ mod tests {
|
|||
|
||||
// First we send 0 documents and only headers.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"id,name\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
let content = documents!([]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -738,83 +709,6 @@ mod tests {
|
|||
drop(rtxn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn json_documents() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
// First we send 3 documents with an id for only one of them.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &br#"[
|
||||
{ "name": "kevin" },
|
||||
{ "name": "kevina", "id": 21 },
|
||||
{ "name": "benoit" }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is 3 documents now.
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let count = index.number_of_documents(&rtxn).unwrap();
|
||||
assert_eq!(count, 3);
|
||||
drop(rtxn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_json_update() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
// First we send 0 documents.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"[]"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is no documents.
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let count = index.number_of_documents(&rtxn).unwrap();
|
||||
assert_eq!(count, 0);
|
||||
drop(rtxn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn json_stream_documents() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
// First we send 3 documents with an id for only one of them.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &br#"
|
||||
{ "name": "kevin" }
|
||||
{ "name": "kevina", "id": 21 }
|
||||
{ "name": "benoit" }
|
||||
"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::JsonStream);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is 3 documents now.
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let count = index.number_of_documents(&rtxn).unwrap();
|
||||
assert_eq!(count, 3);
|
||||
drop(rtxn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn invalid_documents_ids() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
|
@ -825,18 +719,16 @@ mod tests {
|
|||
// First we send 1 document with an invalid id.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
// There is a space in the document id.
|
||||
let content = &b"id,name\nbrume bleue,kevin\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
assert!(builder.execute(content, |_, _| ()).is_err());
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// First we send 1 document with a valid id.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
// There is a space in the document id.
|
||||
let content = &b"id,name\n32,kevin\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
let content = documents!([ { "id": 32, "name": "kevin" } ]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -848,7 +740,7 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn complex_json_documents() {
|
||||
fn complex_documents() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
|
@ -856,13 +748,12 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with an id for only one of them.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
|
||||
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
|
||||
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -893,33 +784,31 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with an id for only one of them.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let documents = &r#"[
|
||||
let documents = documents!([
|
||||
{ "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } },
|
||||
{ "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 },
|
||||
{ "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 },
|
||||
{ "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" },
|
||||
{ "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" },
|
||||
{ "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } }
|
||||
]"#[..];
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
|
||||
builder.execute(Cursor::new(documents), |_, _| ()).unwrap();
|
||||
builder.execute(documents, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
|
||||
let documents = &r#"[
|
||||
let documents = documents!([
|
||||
{
|
||||
"id": 2,
|
||||
"author": "J. Austen",
|
||||
"date": "1813"
|
||||
}
|
||||
]"#[..];
|
||||
]);
|
||||
|
||||
builder.execute(Cursor::new(documents), |_, _| ()).unwrap();
|
||||
builder.execute(documents, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
|
@ -931,15 +820,13 @@ mod tests {
|
|||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" },
|
||||
{ "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" },
|
||||
{ "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" },
|
||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
]);
|
||||
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
|
||||
|
||||
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));
|
||||
|
||||
|
@ -951,22 +838,18 @@ mod tests {
|
|||
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
||||
assert!(external_documents_ids.get("30").is_none());
|
||||
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
]);
|
||||
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
|
||||
|
||||
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
||||
assert!(external_documents_ids.get("30").is_some());
|
||||
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
]);
|
||||
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
@ -987,12 +870,16 @@ mod tests {
|
|||
big_object.insert(key, "I am a text!");
|
||||
}
|
||||
|
||||
let content = vec![big_object];
|
||||
let content = serde_json::to_string(&content).unwrap();
|
||||
let mut cursor = Cursor::new(Vec::new());
|
||||
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(Cursor::new(content), |_, _| ()).unwrap();
|
||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||
builder.add_documents(big_object).unwrap();
|
||||
builder.finish().unwrap();
|
||||
cursor.set_position(0);
|
||||
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
||||
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
@ -1005,16 +892,38 @@ mod tests {
|
|||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = r#"#id,title,au{hor,genre,price$
|
||||
2,"Prideand Prejudice","Jane Austin","romance",3.5$
|
||||
456,"Le Petit Prince","Antoine de Saint-Exupéry","adventure",10.0$
|
||||
1,Wonderland","Lewis Carroll","fantasy",25.99$
|
||||
4,"Harry Potter ing","fantasy\0lood Prince","J. K. Rowling","fantasy\0,
|
||||
"#;
|
||||
let content = documents!([
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Prideand Prejudice",
|
||||
"au{hor": "Jane Austin",
|
||||
"genre": "romance",
|
||||
"price$": "3.5$",
|
||||
},
|
||||
{
|
||||
"id": 456,
|
||||
"title": "Le Petit Prince",
|
||||
"au{hor": "Antoine de Saint-Exupéry",
|
||||
"genre": "adventure",
|
||||
"price$": "10.0$",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Wonderland",
|
||||
"au{hor": "Lewis Carroll",
|
||||
"genre": "fantasy",
|
||||
"price$": "25.99$",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Harry Potter ing fantasy\0lood Prince",
|
||||
"au{hor": "J. K. Rowling",
|
||||
"genre": "fantasy\0",
|
||||
},
|
||||
]);
|
||||
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
use std::borrow::Cow;
|
||||
use std::collections::btree_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek, SeekFrom};
|
||||
use std::iter::Peekable;
|
||||
use std::result::Result as StdResult;
|
||||
use std::time::Instant;
|
||||
|
||||
use grenad::CompressionType;
|
||||
use itertools::Itertools;
|
||||
use log::info;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{Map, Value};
|
||||
|
@ -15,7 +15,8 @@ use super::helpers::{
|
|||
create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn,
|
||||
};
|
||||
use super::IndexDocumentsMethod;
|
||||
use crate::error::{InternalError, UserError};
|
||||
use crate::documents::{DocumentBatchReader, DocumentsBatchIndex};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::index::db_name;
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32};
|
||||
|
@ -51,90 +52,63 @@ pub struct Transform<'t, 'i> {
|
|||
pub autogenerate_docids: bool,
|
||||
}
|
||||
|
||||
fn is_primary_key(field: impl AsRef<str>) -> bool {
|
||||
field.as_ref().to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME)
|
||||
/// Create a mapping between the field ids found in the document batch and the one that were
|
||||
/// already present in the index.
|
||||
///
|
||||
/// If new fields are present in the addition, they are added to the index field ids map.
|
||||
fn create_fields_mapping(
|
||||
index_field_map: &mut FieldsIdsMap,
|
||||
batch_field_map: &DocumentsBatchIndex,
|
||||
) -> Result<HashMap<FieldId, FieldId>> {
|
||||
batch_field_map
|
||||
.iter()
|
||||
// we sort by id here to ensure a deterministic mapping of the fields, that preserves
|
||||
// the original ordering.
|
||||
.sorted_by_key(|(&id, _)| id)
|
||||
.map(|(field, name)| match index_field_map.id(&name) {
|
||||
Some(id) => Ok((*field, id)),
|
||||
None => index_field_map
|
||||
.insert(&name)
|
||||
.ok_or(Error::UserError(UserError::AttributeLimitReached))
|
||||
.map(|id| (*field, id)),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn find_primary_key(index: &bimap::BiHashMap<u16, String>) -> Option<&str> {
|
||||
index
|
||||
.right_values()
|
||||
.find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME))
|
||||
.map(String::as_str)
|
||||
}
|
||||
|
||||
impl Transform<'_, '_> {
|
||||
pub fn output_from_json<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
|
||||
where
|
||||
R: Read,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
self.output_from_generic_json(reader, false, progress_callback)
|
||||
}
|
||||
|
||||
pub fn output_from_json_stream<R, F>(
|
||||
pub fn read_documents<R, F>(
|
||||
self,
|
||||
reader: R,
|
||||
mut reader: DocumentBatchReader<R>,
|
||||
progress_callback: F,
|
||||
) -> Result<TransformOutput>
|
||||
where
|
||||
R: Read,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
self.output_from_generic_json(reader, true, progress_callback)
|
||||
}
|
||||
|
||||
fn output_from_generic_json<R, F>(
|
||||
self,
|
||||
reader: R,
|
||||
is_stream: bool,
|
||||
progress_callback: F,
|
||||
) -> Result<TransformOutput>
|
||||
where
|
||||
R: Read,
|
||||
R: Read + Seek,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let fields_index = reader.index();
|
||||
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
||||
let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?;
|
||||
|
||||
// Deserialize the whole batch of documents in memory.
|
||||
let mut documents: Peekable<
|
||||
Box<dyn Iterator<Item = serde_json::Result<Map<String, Value>>>>,
|
||||
> = if is_stream {
|
||||
let iter = serde_json::Deserializer::from_reader(reader).into_iter();
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
|
||||
iter.peekable()
|
||||
} else {
|
||||
let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?;
|
||||
let iter = vec.into_iter().map(Ok);
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
|
||||
iter.peekable()
|
||||
};
|
||||
let alternative_name = self
|
||||
.index
|
||||
.primary_key(self.rtxn)?
|
||||
.or_else(|| find_primary_key(fields_index))
|
||||
.map(String::from);
|
||||
|
||||
// We extract the primary key from the first document in
|
||||
// the batch if it hasn't already been defined in the index
|
||||
let first = match documents.peek().map(StdResult::as_ref).transpose() {
|
||||
Ok(first) => first,
|
||||
Err(_) => {
|
||||
let error = documents.next().unwrap().unwrap_err();
|
||||
return Err(UserError::SerdeJson(error).into());
|
||||
}
|
||||
};
|
||||
|
||||
let alternative_name =
|
||||
first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
|
||||
let (primary_key_id, primary_key) = compute_primary_key_pair(
|
||||
let (primary_key_id, primary_key_name) = compute_primary_key_pair(
|
||||
self.index.primary_key(self.rtxn)?,
|
||||
&mut fields_ids_map,
|
||||
alternative_name,
|
||||
self.autogenerate_docids,
|
||||
)?;
|
||||
|
||||
if documents.peek().is_none() {
|
||||
return Ok(TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map,
|
||||
field_distribution: self.index.field_distribution(self.rtxn)?,
|
||||
external_documents_ids: ExternalDocumentsIds::default(),
|
||||
new_documents_ids: RoaringBitmap::new(),
|
||||
replaced_documents_ids: RoaringBitmap::new(),
|
||||
documents_count: 0,
|
||||
documents_file: tempfile::tempfile()?,
|
||||
});
|
||||
}
|
||||
|
||||
// We must choose the appropriate merge function for when two or more documents
|
||||
// with the same user id must be merged or fully replaced in the same batch.
|
||||
let merge_function = match self.index_documents_method {
|
||||
|
@ -151,204 +125,103 @@ impl Transform<'_, '_> {
|
|||
self.max_memory,
|
||||
);
|
||||
|
||||
let mut json_buffer = Vec::new();
|
||||
let mut obkv_buffer = Vec::new();
|
||||
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
|
||||
let mut documents_count = 0;
|
||||
|
||||
for result in documents {
|
||||
let document = result.map_err(UserError::SerdeJson)?;
|
||||
|
||||
let mut external_id_buffer = Vec::new();
|
||||
let mut field_buffer: Vec<(u16, &[u8])> = Vec::new();
|
||||
while let Some((addition_index, document)) = reader.next_document_with_index()? {
|
||||
let mut field_buffer_cache = drop_and_reuse(field_buffer);
|
||||
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
|
||||
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
|
||||
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
||||
documents_seen: documents_count,
|
||||
});
|
||||
}
|
||||
|
||||
obkv_buffer.clear();
|
||||
let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
|
||||
|
||||
// We prepare the fields ids map with the documents keys.
|
||||
for (key, _value) in &document {
|
||||
fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
||||
for (k, v) in document.iter() {
|
||||
let mapped_id = *mapping.get(&k).unwrap();
|
||||
field_buffer_cache.push((mapped_id, v));
|
||||
}
|
||||
|
||||
// We retrieve the user id from the document based on the primary key name,
|
||||
// if the document id isn't present we generate a uuid.
|
||||
let external_id = match document.get(&primary_key) {
|
||||
Some(value) => match value {
|
||||
Value::String(string) => Cow::Borrowed(string.as_str()),
|
||||
Value::Number(number) => Cow::Owned(number.to_string()),
|
||||
content => {
|
||||
return Err(
|
||||
UserError::InvalidDocumentId { document_id: content.clone() }.into()
|
||||
)
|
||||
// We need to make sure that every document has a primary key. After we have remapped
|
||||
// all the fields in the document, we try to find the primary key value. If we can find
|
||||
// it, transform it into a string and validate it, and then update it in the
|
||||
// document. If none is found, and we were told to generate missing document ids, then
|
||||
// we create the missing field, and update the new document.
|
||||
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
|
||||
let external_id =
|
||||
match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) {
|
||||
Some((_, bytes)) => {
|
||||
let value = match serde_json::from_slice(bytes).unwrap() {
|
||||
Value::String(string) => match validate_document_id(&string) {
|
||||
Some(s) if s.len() == string.len() => string,
|
||||
Some(s) => s.to_string(),
|
||||
None => {
|
||||
return Err(UserError::InvalidDocumentId {
|
||||
document_id: Value::String(string),
|
||||
}
|
||||
.into())
|
||||
}
|
||||
},
|
||||
Value::Number(number) => number.to_string(),
|
||||
content => {
|
||||
return Err(UserError::InvalidDocumentId {
|
||||
document_id: content.clone(),
|
||||
}
|
||||
.into())
|
||||
}
|
||||
};
|
||||
serde_json::to_writer(&mut external_id_buffer, &value).unwrap();
|
||||
*bytes = &external_id_buffer;
|
||||
Cow::Owned(value)
|
||||
}
|
||||
},
|
||||
None => {
|
||||
if !self.autogenerate_docids {
|
||||
return Err(UserError::MissingDocumentId { document }.into());
|
||||
}
|
||||
let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
|
||||
Cow::Borrowed(uuid)
|
||||
}
|
||||
};
|
||||
None => {
|
||||
if !self.autogenerate_docids {
|
||||
let mut json = Map::new();
|
||||
for (key, value) in document.iter() {
|
||||
let key = addition_index.get_by_left(&key).cloned();
|
||||
let value = serde_json::from_slice::<Value>(&value).ok();
|
||||
|
||||
// We iterate in the fields ids ordered.
|
||||
for (field_id, name) in fields_ids_map.iter() {
|
||||
json_buffer.clear();
|
||||
if let Some((k, v)) = key.zip(value) {
|
||||
json.insert(k, v);
|
||||
}
|
||||
}
|
||||
|
||||
// We try to extract the value from the document and if we don't find anything
|
||||
// and this should be the document id we return the one we generated.
|
||||
if let Some(value) = document.get(name) {
|
||||
// We serialize the attribute values.
|
||||
serde_json::to_writer(&mut json_buffer, value)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(field_id, &json_buffer)?;
|
||||
}
|
||||
// We validate the document id [a-zA-Z0-9\-_].
|
||||
if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
|
||||
return Err(UserError::InvalidDocumentId {
|
||||
document_id: Value::from(external_id),
|
||||
return Err(UserError::MissingDocumentId { document: json }.into());
|
||||
}
|
||||
|
||||
let uuid =
|
||||
uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
|
||||
serde_json::to_writer(&mut external_id_buffer, &uuid).unwrap();
|
||||
field_buffer_cache.push((primary_key_id, &external_id_buffer));
|
||||
Cow::Borrowed(&*uuid)
|
||||
}
|
||||
.into());
|
||||
}
|
||||
};
|
||||
|
||||
// Insertion in a obkv need to be done with keys ordered. For now they are ordered
|
||||
// according to the document addition key order, so we sort it according to the
|
||||
// fieldids map keys order.
|
||||
field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2));
|
||||
|
||||
// The last step is to build the new obkv document, and insert it in the sorter.
|
||||
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
|
||||
for (k, v) in field_buffer_cache.iter() {
|
||||
writer.insert(*k, v)?;
|
||||
}
|
||||
|
||||
// We use the extracted/generated user id as the key for this document.
|
||||
sorter.insert(external_id.as_bytes(), &obkv_buffer)?;
|
||||
sorter.insert(&external_id.as_ref().as_bytes(), &obkv_buffer)?;
|
||||
documents_count += 1;
|
||||
}
|
||||
|
||||
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
|
||||
documents_seen: documents_count,
|
||||
});
|
||||
|
||||
// Now that we have a valid sorter that contains the user id and the obkv we
|
||||
// give it to the last transforming function which returns the TransformOutput.
|
||||
self.output_from_sorter(
|
||||
sorter,
|
||||
primary_key,
|
||||
fields_ids_map,
|
||||
documents_count,
|
||||
external_documents_ids,
|
||||
progress_callback,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn output_from_csv<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
|
||||
where
|
||||
R: Read,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
||||
|
||||
let mut csv = csv::Reader::from_reader(reader);
|
||||
let headers = csv.headers().map_err(UserError::Csv)?;
|
||||
|
||||
let mut fields_ids = Vec::new();
|
||||
// Generate the new fields ids based on the current fields ids and this CSV headers.
|
||||
for (i, header) in headers.iter().enumerate() {
|
||||
let id = fields_ids_map.insert(header).ok_or(UserError::AttributeLimitReached)?;
|
||||
fields_ids.push((id, i));
|
||||
}
|
||||
|
||||
// Extract the position of the primary key in the current headers, None if not found.
|
||||
let primary_key_pos = match self.index.primary_key(self.rtxn)? {
|
||||
Some(primary_key) => {
|
||||
// The primary key is known so we must find the position in the CSV headers.
|
||||
headers.iter().position(|h| h == primary_key)
|
||||
}
|
||||
None => headers.iter().position(is_primary_key),
|
||||
};
|
||||
|
||||
// Returns the field id in the fields ids map, create an "id" field
|
||||
// in case it is not in the current headers.
|
||||
let alternative_name = primary_key_pos.map(|pos| headers[pos].to_string());
|
||||
let (primary_key_id, primary_key_name) = compute_primary_key_pair(
|
||||
self.index.primary_key(self.rtxn)?,
|
||||
&mut fields_ids_map,
|
||||
alternative_name,
|
||||
self.autogenerate_docids,
|
||||
)?;
|
||||
|
||||
// The primary key field is not present in the header, so we need to create it.
|
||||
if primary_key_pos.is_none() {
|
||||
fields_ids.push((primary_key_id, usize::max_value()));
|
||||
}
|
||||
|
||||
// We sort the fields ids by the fields ids map id, this way we are sure to iterate over
|
||||
// the records fields in the fields ids map order and correctly generate the obkv.
|
||||
fields_ids.sort_unstable_by_key(|(field_id, _)| *field_id);
|
||||
|
||||
// We initialize the sorter with the user indexing settings.
|
||||
let mut sorter = create_sorter(
|
||||
keep_latest_obkv,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
// We write into the sorter to merge and deduplicate the documents
|
||||
// based on the external ids.
|
||||
let mut json_buffer = Vec::new();
|
||||
let mut obkv_buffer = Vec::new();
|
||||
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
|
||||
let mut documents_count = 0;
|
||||
|
||||
let mut record = csv::StringRecord::new();
|
||||
while csv.read_record(&mut record).map_err(UserError::Csv)? {
|
||||
obkv_buffer.clear();
|
||||
let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
|
||||
|
||||
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
|
||||
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
|
||||
documents_seen: documents_count,
|
||||
});
|
||||
}
|
||||
|
||||
// We extract the user id if we know where it is or generate an UUID V4 otherwise.
|
||||
let external_id = match primary_key_pos {
|
||||
Some(pos) => {
|
||||
let external_id = &record[pos];
|
||||
// We validate the document id [a-zA-Z0-9\-_].
|
||||
match validate_document_id(&external_id) {
|
||||
Some(valid) => valid,
|
||||
None => {
|
||||
return Err(UserError::InvalidDocumentId {
|
||||
document_id: Value::from(external_id),
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
|
||||
};
|
||||
|
||||
// When the primary_key_field_id is found in the fields ids list
|
||||
// we return the generated document id instead of the record field.
|
||||
let iter = fields_ids.iter().map(|(fi, i)| {
|
||||
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
|
||||
(fi, field)
|
||||
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
||||
documents_seen: documents_count,
|
||||
});
|
||||
|
||||
// We retrieve the field id based on the fields ids map fields ids order.
|
||||
for (field_id, field) in iter {
|
||||
// We serialize the attribute values as JSON strings.
|
||||
json_buffer.clear();
|
||||
serde_json::to_writer(&mut json_buffer, &field)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(*field_id, &json_buffer)?;
|
||||
}
|
||||
|
||||
// We use the extracted/generated user id as the key for this document.
|
||||
sorter.insert(external_id, &obkv_buffer)?;
|
||||
documents_count += 1;
|
||||
obkv_buffer.clear();
|
||||
field_buffer = drop_and_reuse(field_buffer_cache);
|
||||
external_id_buffer.clear();
|
||||
}
|
||||
|
||||
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
|
||||
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
||||
documents_seen: documents_count,
|
||||
});
|
||||
|
||||
|
@ -359,7 +232,6 @@ impl Transform<'_, '_> {
|
|||
primary_key_name,
|
||||
fields_ids_map,
|
||||
documents_count,
|
||||
external_documents_ids,
|
||||
progress_callback,
|
||||
)
|
||||
}
|
||||
|
@ -373,12 +245,12 @@ impl Transform<'_, '_> {
|
|||
primary_key: String,
|
||||
fields_ids_map: FieldsIdsMap,
|
||||
approximate_number_of_documents: usize,
|
||||
mut external_documents_ids: ExternalDocumentsIds<'_>,
|
||||
progress_callback: F,
|
||||
) -> Result<TransformOutput>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let mut external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
||||
let documents_ids = self.index.documents_ids(self.rtxn)?;
|
||||
let mut field_distribution = self.index.field_distribution(self.rtxn)?;
|
||||
let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
|
||||
|
@ -610,6 +482,17 @@ fn validate_document_id(document_id: &str) -> Option<&str> {
|
|||
})
|
||||
}
|
||||
|
||||
/// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec<T>`.
|
||||
///
|
||||
/// The size and alignment of T and U must match.
|
||||
fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {
|
||||
debug_assert_eq!(std::mem::align_of::<U>(), std::mem::align_of::<T>());
|
||||
debug_assert_eq!(std::mem::size_of::<U>(), std::mem::size_of::<T>());
|
||||
vec.clear();
|
||||
debug_assert!(vec.is_empty());
|
||||
vec.into_iter().map(|_| unreachable!()).collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
|
|
@ -2,9 +2,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
|
|||
pub use self::clear_documents::ClearDocuments;
|
||||
pub use self::delete_documents::DeleteDocuments;
|
||||
pub use self::facets::Facets;
|
||||
pub use self::index_documents::{
|
||||
DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat,
|
||||
};
|
||||
pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod};
|
||||
pub use self::settings::{Setting, Settings};
|
||||
pub use self::update_builder::UpdateBuilder;
|
||||
pub use self::update_step::UpdateIndexingStep;
|
||||
|
|
|
@ -111,6 +111,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn log_every_n(&mut self, n: usize) {
|
||||
self.log_every_n = Some(n);
|
||||
}
|
||||
|
||||
pub fn reset_searchable_fields(&mut self) {
|
||||
self.searchable_fields = Setting::Reset;
|
||||
}
|
||||
|
@ -501,7 +505,7 @@ mod tests {
|
|||
|
||||
use super::*;
|
||||
use crate::error::Error;
|
||||
use crate::update::{IndexDocuments, UpdateFormat};
|
||||
use crate::update::IndexDocuments;
|
||||
use crate::{Criterion, FilterCondition, SearchResult};
|
||||
|
||||
#[test]
|
||||
|
@ -513,9 +517,13 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"id,name,age\n0,kevin,23\n1,kevina,21\n2,benoit,34\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
|
||||
let content = documents!([
|
||||
{ "id": 1, "name": "kevin", "age": 23 },
|
||||
{ "id": 2, "name": "kevina", "age": 21},
|
||||
{ "id": 3, "name": "benoit", "age": 34 }
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -567,10 +575,13 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
||||
let content = documents!([
|
||||
{ "name": "kevin", "age": 23},
|
||||
{ "name": "kevina", "age": 21 },
|
||||
{ "name": "benoit", "age": 34 }
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -611,10 +622,13 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
||||
let content = documents!([
|
||||
{ "name": "kevin", "age": 23},
|
||||
{ "name": "kevina", "age": 21 },
|
||||
{ "name": "benoit", "age": 34 }
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -633,10 +647,13 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
||||
let content = documents!([
|
||||
{ "name": "kevin", "age": 23},
|
||||
{ "name": "kevina", "age": 21 },
|
||||
{ "name": "benoit", "age": 34 }
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
|
||||
// In the same transaction we change the displayed fields to be only the age.
|
||||
|
@ -678,13 +695,12 @@ mod tests {
|
|||
builder.execute(|_, _| ()).unwrap();
|
||||
|
||||
// Then index some documents.
|
||||
let content = &br#"[
|
||||
{ "name": "kevin", "age": 23 },
|
||||
let content = documents!([
|
||||
{ "name": "kevin", "age": 23},
|
||||
{ "name": "kevina", "age": 21 },
|
||||
{ "name": "benoit", "age": 34 }
|
||||
]"#[..];
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -695,11 +711,19 @@ mod tests {
|
|||
assert_eq!(fields_ids, hashset! { S("age") });
|
||||
// Only count the field_id 0 and level 0 facet values.
|
||||
// TODO we must support typed CSVs for numbers to be understood.
|
||||
let fidmap = index.fields_ids_map(&rtxn).unwrap();
|
||||
println!("fidmap: {:?}", fidmap);
|
||||
for document in index.all_documents(&rtxn).unwrap() {
|
||||
let document = document.unwrap();
|
||||
let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1)
|
||||
.unwrap();
|
||||
println!("json: {:?}", json);
|
||||
}
|
||||
let count = index
|
||||
.facet_id_f64_docids
|
||||
.remap_key_type::<ByteSlice>()
|
||||
// The faceted field id is 2u16
|
||||
.prefix_iter(&rtxn, &[0, 2, 0])
|
||||
// The faceted field id is 1u16
|
||||
.prefix_iter(&rtxn, &[0, 1, 0])
|
||||
.unwrap()
|
||||
.count();
|
||||
assert_eq!(count, 3);
|
||||
|
@ -707,25 +731,23 @@ mod tests {
|
|||
|
||||
// Index a little more documents with new and current facets values.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &br#"[
|
||||
{ "name": "kevin2", "age": 23 },
|
||||
let content = documents!([
|
||||
{ "name": "kevin2", "age": 23},
|
||||
{ "name": "kevina2", "age": 21 },
|
||||
{ "name": "benoit", "age": 35 }
|
||||
]"#[..];
|
||||
{ "name": "benoit", "age": 35 }
|
||||
]);
|
||||
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 2);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
// Only count the field_id 0 and level 0 facet values.
|
||||
// TODO we must support typed CSVs for numbers to be understood.
|
||||
let count = index
|
||||
.facet_id_f64_docids
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(&rtxn, &[0, 2, 0])
|
||||
.prefix_iter(&rtxn, &[0, 1, 0])
|
||||
.unwrap()
|
||||
.count();
|
||||
assert_eq!(count, 4);
|
||||
|
@ -747,13 +769,12 @@ mod tests {
|
|||
builder.execute(|_, _| ()).unwrap();
|
||||
|
||||
// Then index some documents.
|
||||
let content = &br#"[
|
||||
{ "name": "kevin", "age": 23 },
|
||||
let content = documents!([
|
||||
{ "name": "kevin", "age": 23},
|
||||
{ "name": "kevina", "age": 21 },
|
||||
{ "name": "benoit", "age": 34 }
|
||||
]"#[..];
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -790,7 +811,7 @@ mod tests {
|
|||
builder.execute(|_, _| ()).unwrap();
|
||||
|
||||
// Then index some documents.
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "name": "kevin", "age": 23 },
|
||||
{ "name": "kevina", "age": 21 },
|
||||
{ "name": "benoit", "age": 34 },
|
||||
|
@ -798,9 +819,8 @@ mod tests {
|
|||
{ "name": "bertrand", "age": 34 },
|
||||
{ "name": "bernie", "age": 34 },
|
||||
{ "name": "ben", "age": 34 }
|
||||
]"#[..];
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -822,10 +842,13 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
||||
let content = documents!([
|
||||
{ "name": "kevin", "age": 23},
|
||||
{ "name": "kevina", "age": 21 },
|
||||
{ "name": "benoit", "age": 34 }
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
@ -844,10 +867,13 @@ mod tests {
|
|||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
|
||||
let content = documents!([
|
||||
{ "name": "kevin", "age": 23, "maxim": "I love dogs" },
|
||||
{ "name": "kevina", "age": 21, "maxim": "Doggos are the best" },
|
||||
{ "name": "benoit", "age": 34, "maxim": "The crepes are really good" },
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
|
||||
// In the same transaction we provide some stop_words
|
||||
|
@ -915,10 +941,13 @@ mod tests {
|
|||
|
||||
// Send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
|
||||
let content = documents!([
|
||||
{ "name": "kevin", "age": 23, "maxim": "I love dogs"},
|
||||
{ "name": "kevina", "age": 21, "maxim": "Doggos are the best"},
|
||||
{ "name": "benoit", "age": 34, "maxim": "The crepes are really good"},
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
|
||||
// In the same transaction provide some synonyms
|
||||
|
@ -1038,7 +1067,7 @@ mod tests {
|
|||
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey"));
|
||||
|
||||
// Then index some documents with the "mykey" primary key.
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{ "mykey": 1, "name": "kevin", "age": 23 },
|
||||
{ "mykey": 2, "name": "kevina", "age": 21 },
|
||||
{ "mykey": 3, "name": "benoit", "age": 34 },
|
||||
|
@ -1046,9 +1075,8 @@ mod tests {
|
|||
{ "mykey": 5, "name": "bertrand", "age": 34 },
|
||||
{ "mykey": 6, "name": "bernie", "age": 34 },
|
||||
{ "mykey": 7, "name": "ben", "age": 34 }
|
||||
]"#[..];
|
||||
]);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.disable_autogenerate_docids();
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -1087,7 +1115,7 @@ mod tests {
|
|||
builder.set_filterable_fields(hashset! { S("genres") });
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
|
||||
let content = &br#"[
|
||||
let content = documents!([
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Star Wars",
|
||||
|
@ -1105,9 +1133,8 @@ mod tests {
|
|||
"poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg",
|
||||
"release_date": 819676800
|
||||
}
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
]);
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
|
|
|
@ -2,10 +2,9 @@ use UpdateIndexingStep::*;
|
|||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum UpdateIndexingStep {
|
||||
/// Transform from the original user given format (CSV, JSON, JSON lines)
|
||||
/// into a generic format based on the obkv and grenad crates. This step also
|
||||
/// deduplicate potential documents in this batch update by merging or replacing them.
|
||||
TransformFromUserIntoGenericFormat { documents_seen: usize },
|
||||
/// Remap document addition fields the one present in the database, adding new fields in to the
|
||||
/// schema on the go.
|
||||
RemapDocumentAddition { documents_seen: usize },
|
||||
|
||||
/// This step check the external document id, computes the internal ids and merge
|
||||
/// the documents that are already present in the database.
|
||||
|
@ -23,7 +22,7 @@ pub enum UpdateIndexingStep {
|
|||
impl UpdateIndexingStep {
|
||||
pub const fn step(&self) -> usize {
|
||||
match self {
|
||||
TransformFromUserIntoGenericFormat { .. } => 0,
|
||||
RemapDocumentAddition { .. } => 0,
|
||||
ComputeIdsAndMergeDocuments { .. } => 1,
|
||||
IndexDocuments { .. } => 2,
|
||||
MergeDataIntoFinalDatabase { .. } => 3,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue