mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-25 12:47:28 +01:00
merge with main
This commit is contained in:
commit
6831c23449
@ -1,7 +1,7 @@
|
|||||||
#![allow(dead_code)]
|
#![allow(dead_code)]
|
||||||
|
|
||||||
use std::fs::{create_dir_all, remove_dir_all, File};
|
use std::fs::{create_dir_all, remove_dir_all, File};
|
||||||
use std::io::{self, Cursor, Read, Seek};
|
use std::io::{self, BufRead, BufReader, Cursor, Read, Seek};
|
||||||
use std::num::ParseFloatError;
|
use std::num::ParseFloatError;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
@ -146,44 +146,34 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<imp
|
|||||||
DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap()
|
DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
fn documents_from_jsonl(reader: impl Read) -> anyhow::Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut writer = Cursor::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
||||||
|
|
||||||
let values = serde_json::Deserializer::from_reader(reader)
|
let mut buf = String::new();
|
||||||
.into_iter::<serde_json::Map<String, serde_json::Value>>();
|
let mut reader = BufReader::new(reader);
|
||||||
for document in values {
|
|
||||||
let document = document?;
|
while reader.read_line(&mut buf)? > 0 {
|
||||||
documents.add_documents(document)?;
|
documents.extend_from_json(&mut buf.as_bytes())?;
|
||||||
}
|
}
|
||||||
documents.finish()?;
|
documents.finish()?;
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
Ok(writer.into_inner())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut writer = Cursor::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
||||||
|
|
||||||
let json: serde_json::Value = serde_json::from_reader(reader)?;
|
documents.extend_from_json(reader)?;
|
||||||
documents.add_documents(json)?;
|
|
||||||
documents.finish()?;
|
documents.finish()?;
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
Ok(writer.into_inner())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
fn documents_from_csv(reader: impl Read) -> anyhow::Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut writer = Cursor::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
|
||||||
|
|
||||||
let iter = CSVDocumentDeserializer::from_reader(reader)?;
|
|
||||||
|
|
||||||
for doc in iter {
|
|
||||||
let doc = doc?;
|
|
||||||
documents.add_documents(doc)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
documents.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
Ok(writer.into_inner())
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "cli"
|
name = "cli"
|
||||||
version = "0.1.0"
|
version = "0.20.2"
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
description = "A CLI to interact with a milli index"
|
description = "A CLI to interact with a milli index"
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{stdin, Cursor, Read};
|
use std::io::{stdin, BufRead, BufReader, Cursor, Read};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
@ -9,7 +9,6 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
|
|||||||
use milli::update::UpdateIndexingStep::{
|
use milli::update::UpdateIndexingStep::{
|
||||||
ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
|
ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
|
||||||
};
|
};
|
||||||
use serde_json::{Map, Value};
|
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
@ -202,11 +201,11 @@ fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
|
|||||||
let mut writer = Cursor::new(Vec::new());
|
let mut writer = Cursor::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
||||||
|
|
||||||
let values = serde_json::Deserializer::from_reader(reader)
|
let mut buf = String::new();
|
||||||
.into_iter::<serde_json::Map<String, serde_json::Value>>();
|
let mut reader = BufReader::new(reader);
|
||||||
for document in values {
|
|
||||||
let document = document?;
|
while reader.read_line(&mut buf)? > 0 {
|
||||||
documents.add_documents(document)?;
|
documents.extend_from_json(&mut buf.as_bytes())?;
|
||||||
}
|
}
|
||||||
documents.finish()?;
|
documents.finish()?;
|
||||||
|
|
||||||
@ -217,8 +216,7 @@ fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
|
|||||||
let mut writer = Cursor::new(Vec::new());
|
let mut writer = Cursor::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
||||||
|
|
||||||
let json: serde_json::Value = serde_json::from_reader(reader)?;
|
documents.extend_from_json(reader)?;
|
||||||
documents.add_documents(json)?;
|
|
||||||
documents.finish()?;
|
documents.finish()?;
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
Ok(writer.into_inner())
|
||||||
@ -226,17 +224,7 @@ fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
|
|||||||
|
|
||||||
fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> {
|
fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut writer = Cursor::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
|
||||||
|
|
||||||
let mut records = csv::Reader::from_reader(reader);
|
|
||||||
let iter = records.deserialize::<Map<String, Value>>();
|
|
||||||
|
|
||||||
for doc in iter {
|
|
||||||
let doc = doc?;
|
|
||||||
documents.add_documents(doc)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
documents.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
Ok(writer.into_inner())
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "helpers"
|
name = "helpers"
|
||||||
version = "0.18.0"
|
version = "0.20.2"
|
||||||
authors = ["Clément Renault <clement@meilisearch.com>"]
|
authors = ["Clément Renault <clement@meilisearch.com>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "http-ui"
|
name = "http-ui"
|
||||||
description = "The HTTP user interface of the milli search engine"
|
description = "The HTTP user interface of the milli search engine"
|
||||||
version = "0.18.0"
|
version = "0.20.2"
|
||||||
authors = ["Clément Renault <clement@meilisearch.com>"]
|
authors = ["Clément Renault <clement@meilisearch.com>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|
||||||
|
@ -1,285 +0,0 @@
|
|||||||
use std::io::{Read, Result as IoResult};
|
|
||||||
use std::num::ParseFloatError;
|
|
||||||
|
|
||||||
use serde_json::{Map, Value};
|
|
||||||
|
|
||||||
enum AllowedType {
|
|
||||||
String,
|
|
||||||
Number,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_csv_header(header: &str) -> (String, AllowedType) {
|
|
||||||
// if there are several separators we only split on the last one.
|
|
||||||
match header.rsplit_once(':') {
|
|
||||||
Some((field_name, field_type)) => match field_type {
|
|
||||||
"string" => (field_name.to_string(), AllowedType::String),
|
|
||||||
"number" => (field_name.to_string(), AllowedType::Number),
|
|
||||||
// we may return an error in this case.
|
|
||||||
_otherwise => (header.to_string(), AllowedType::String),
|
|
||||||
},
|
|
||||||
None => (header.to_string(), AllowedType::String),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct CSVDocumentDeserializer<R>
|
|
||||||
where
|
|
||||||
R: Read,
|
|
||||||
{
|
|
||||||
documents: csv::StringRecordsIntoIter<R>,
|
|
||||||
headers: Vec<(String, AllowedType)>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<R: Read> CSVDocumentDeserializer<R> {
|
|
||||||
pub fn from_reader(reader: R) -> IoResult<Self> {
|
|
||||||
let mut records = csv::Reader::from_reader(reader);
|
|
||||||
|
|
||||||
let headers = records.headers()?.into_iter().map(parse_csv_header).collect();
|
|
||||||
|
|
||||||
Ok(Self { documents: records.into_records(), headers })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<R: Read> Iterator for CSVDocumentDeserializer<R> {
|
|
||||||
type Item = anyhow::Result<Map<String, Value>>;
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
let csv_document = self.documents.next()?;
|
|
||||||
|
|
||||||
match csv_document {
|
|
||||||
Ok(csv_document) => {
|
|
||||||
let mut document = Map::new();
|
|
||||||
|
|
||||||
for ((field_name, field_type), value) in
|
|
||||||
self.headers.iter().zip(csv_document.into_iter())
|
|
||||||
{
|
|
||||||
let parsed_value: Result<Value, ParseFloatError> = match field_type {
|
|
||||||
AllowedType::Number => {
|
|
||||||
value.parse::<f64>().map(Value::from).map_err(Into::into)
|
|
||||||
}
|
|
||||||
AllowedType::String => Ok(Value::String(value.to_string())),
|
|
||||||
};
|
|
||||||
|
|
||||||
match parsed_value {
|
|
||||||
Ok(value) => drop(document.insert(field_name.to_string(), value)),
|
|
||||||
Err(_e) => {
|
|
||||||
return Some(Err(anyhow::anyhow!(
|
|
||||||
"Value '{}' is not a valid number",
|
|
||||||
value
|
|
||||||
)))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(Ok(document))
|
|
||||||
}
|
|
||||||
Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod test {
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn simple_csv_document() {
|
|
||||||
let documents = r#"city,country,pop
|
|
||||||
"Boston","United States","4628910""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
Value::Object(csv_iter.next().unwrap().unwrap()),
|
|
||||||
json!({
|
|
||||||
"city": "Boston",
|
|
||||||
"country": "United States",
|
|
||||||
"pop": "4628910",
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn coma_in_field() {
|
|
||||||
let documents = r#"city,country,pop
|
|
||||||
"Boston","United, States","4628910""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
Value::Object(csv_iter.next().unwrap().unwrap()),
|
|
||||||
json!({
|
|
||||||
"city": "Boston",
|
|
||||||
"country": "United, States",
|
|
||||||
"pop": "4628910",
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn quote_in_field() {
|
|
||||||
let documents = r#"city,country,pop
|
|
||||||
"Boston","United"" States","4628910""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
Value::Object(csv_iter.next().unwrap().unwrap()),
|
|
||||||
json!({
|
|
||||||
"city": "Boston",
|
|
||||||
"country": "United\" States",
|
|
||||||
"pop": "4628910",
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn integer_in_field() {
|
|
||||||
let documents = r#"city,country,pop:number
|
|
||||||
"Boston","United States","4628910""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
Value::Object(csv_iter.next().unwrap().unwrap()),
|
|
||||||
json!({
|
|
||||||
"city": "Boston",
|
|
||||||
"country": "United States",
|
|
||||||
"pop": 4628910.0,
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn float_in_field() {
|
|
||||||
let documents = r#"city,country,pop:number
|
|
||||||
"Boston","United States","4628910.01""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
Value::Object(csv_iter.next().unwrap().unwrap()),
|
|
||||||
json!({
|
|
||||||
"city": "Boston",
|
|
||||||
"country": "United States",
|
|
||||||
"pop": 4628910.01,
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn several_double_dot_in_header() {
|
|
||||||
let documents = r#"city:love:string,country:state,pop
|
|
||||||
"Boston","United States","4628910""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
Value::Object(csv_iter.next().unwrap().unwrap()),
|
|
||||||
json!({
|
|
||||||
"city:love": "Boston",
|
|
||||||
"country:state": "United States",
|
|
||||||
"pop": "4628910",
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn ending_by_double_dot_in_header() {
|
|
||||||
let documents = r#"city:,country,pop
|
|
||||||
"Boston","United States","4628910""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
Value::Object(csv_iter.next().unwrap().unwrap()),
|
|
||||||
json!({
|
|
||||||
"city:": "Boston",
|
|
||||||
"country": "United States",
|
|
||||||
"pop": "4628910",
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn starting_by_double_dot_in_header() {
|
|
||||||
let documents = r#":city,country,pop
|
|
||||||
"Boston","United States","4628910""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
Value::Object(csv_iter.next().unwrap().unwrap()),
|
|
||||||
json!({
|
|
||||||
":city": "Boston",
|
|
||||||
"country": "United States",
|
|
||||||
"pop": "4628910",
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn starting_by_double_dot_in_header2() {
|
|
||||||
let documents = r#":string,country,pop
|
|
||||||
"Boston","United States","4628910""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
Value::Object(csv_iter.next().unwrap().unwrap()),
|
|
||||||
json!({
|
|
||||||
"": "Boston",
|
|
||||||
"country": "United States",
|
|
||||||
"pop": "4628910",
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn double_double_dot_in_header() {
|
|
||||||
let documents = r#"city::string,country,pop
|
|
||||||
"Boston","United States","4628910""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
Value::Object(csv_iter.next().unwrap().unwrap()),
|
|
||||||
json!({
|
|
||||||
"city:": "Boston",
|
|
||||||
"country": "United States",
|
|
||||||
"pop": "4628910",
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn bad_type_in_header() {
|
|
||||||
let documents = r#"city,country:number,pop
|
|
||||||
"Boston","United States","4628910""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert!(csv_iter.next().unwrap().is_err());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn bad_column_count1() {
|
|
||||||
let documents = r#"city,country,pop
|
|
||||||
"Boston","United States","4628910", "too much""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert!(csv_iter.next().unwrap().is_err());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn bad_column_count2() {
|
|
||||||
let documents = r#"city,country,pop
|
|
||||||
"Boston","United States""#;
|
|
||||||
|
|
||||||
let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
|
|
||||||
|
|
||||||
assert!(csv_iter.next().unwrap().is_err());
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,10 +1,9 @@
|
|||||||
mod documents_from_csv;
|
|
||||||
mod update_store;
|
mod update_store;
|
||||||
|
|
||||||
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||||
use std::fmt::Display;
|
use std::fmt::Display;
|
||||||
use std::fs::{create_dir_all, File};
|
use std::fs::{create_dir_all, File};
|
||||||
use std::io::Cursor;
|
use std::io::{BufRead, BufReader, Cursor};
|
||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
use std::num::{NonZeroU32, NonZeroUsize};
|
use std::num::{NonZeroU32, NonZeroUsize};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
@ -40,7 +39,6 @@ use warp::http::Response;
|
|||||||
use warp::Filter;
|
use warp::Filter;
|
||||||
|
|
||||||
use self::update_store::UpdateStore;
|
use self::update_store::UpdateStore;
|
||||||
use crate::documents_from_csv::CSVDocumentDeserializer;
|
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
#[global_allocator]
|
#[global_allocator]
|
||||||
@ -1048,11 +1046,11 @@ fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
|||||||
let mut writer = Cursor::new(Vec::new());
|
let mut writer = Cursor::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
||||||
|
|
||||||
let values = serde_json::Deserializer::from_reader(reader)
|
let mut buf = String::new();
|
||||||
.into_iter::<serde_json::Map<String, serde_json::Value>>();
|
let mut reader = BufReader::new(reader);
|
||||||
for document in values {
|
|
||||||
let document = document?;
|
while reader.read_line(&mut buf)? > 0 {
|
||||||
documents.add_documents(document)?;
|
documents.extend_from_json(&mut buf.as_bytes())?;
|
||||||
}
|
}
|
||||||
documents.finish()?;
|
documents.finish()?;
|
||||||
|
|
||||||
@ -1063,8 +1061,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
|||||||
let mut writer = Cursor::new(Vec::new());
|
let mut writer = Cursor::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
||||||
|
|
||||||
let json: serde_json::Value = serde_json::from_reader(reader)?;
|
documents.extend_from_json(reader)?;
|
||||||
documents.add_documents(json)?;
|
|
||||||
documents.finish()?;
|
documents.finish()?;
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
Ok(writer.into_inner())
|
||||||
@ -1072,16 +1069,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
|||||||
|
|
||||||
fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut writer = Cursor::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
|
||||||
|
|
||||||
let iter = CSVDocumentDeserializer::from_reader(reader)?;
|
|
||||||
|
|
||||||
for doc in iter {
|
|
||||||
let doc = doc?;
|
|
||||||
documents.add_documents(doc)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
documents.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
Ok(writer.into_inner())
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "infos"
|
name = "infos"
|
||||||
version = "0.18.0"
|
version = "0.20.2"
|
||||||
authors = ["Clément Renault <clement@meilisearch.com>"]
|
authors = ["Clément Renault <clement@meilisearch.com>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "milli"
|
name = "milli"
|
||||||
version = "0.18.0"
|
version = "0.20.2"
|
||||||
authors = ["Kerollmops <clement@meilisearch.com>"]
|
authors = ["Kerollmops <clement@meilisearch.com>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|
||||||
@ -46,6 +46,7 @@ itertools = "0.10.0"
|
|||||||
# logging
|
# logging
|
||||||
log = "0.4.14"
|
log = "0.4.14"
|
||||||
logging_timer = "1.0.0"
|
logging_timer = "1.0.0"
|
||||||
|
csv = "1.1.6"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
big_s = "1.0.2"
|
big_s = "1.0.2"
|
||||||
|
@ -28,12 +28,12 @@ impl fmt::Display for AscDescError {
|
|||||||
write!(f, "Longitude must be contained between -180 and 180 degrees.",)
|
write!(f, "Longitude must be contained between -180 and 180 degrees.",)
|
||||||
}
|
}
|
||||||
Self::InvalidSyntax { name } => {
|
Self::InvalidSyntax { name } => {
|
||||||
write!(f, "invalid asc/desc syntax for {}.", name)
|
write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
|
||||||
}
|
}
|
||||||
Self::ReservedKeyword { name } => {
|
Self::ReservedKeyword { name } => {
|
||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"{} is a reserved keyword and thus can't be used as a asc/desc rule.",
|
"`{}` is a reserved keyword and thus can't be used as a asc/desc rule.",
|
||||||
name
|
name
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -192,18 +192,18 @@ impl fmt::Display for SortError {
|
|||||||
Self::BadGeoPointUsage { name } => {
|
Self::BadGeoPointUsage { name } => {
|
||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"invalid syntax for the `_geoPoint` parameter: `{}`. \
|
"Invalid syntax for the geo parameter: expected expression formated like \
|
||||||
Usage: `_geoPoint(latitude, longitude):asc`.",
|
`_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{}`.",
|
||||||
name
|
name
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
Self::InvalidName { name } => {
|
Self::InvalidName { name } => {
|
||||||
write!(f, "invalid syntax for the sort parameter `{}`.", name)
|
write!(f, "Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
|
||||||
}
|
}
|
||||||
Self::ReservedName { name } => {
|
Self::ReservedName { name } => {
|
||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"{} is a reserved keyword and thus can't be used as a sort expression.",
|
"`{}` is a reserved keyword and thus can't be used as a sort expression.",
|
||||||
name
|
name
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -211,7 +211,7 @@ impl fmt::Display for SortError {
|
|||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"`{}` is a reserved keyword and thus can't be used as a sort expression. \
|
"`{}` is a reserved keyword and thus can't be used as a sort expression. \
|
||||||
Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates.",
|
Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.",
|
||||||
name,
|
name,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -318,50 +318,4 @@ mod tests {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn sort_error_message() {
|
|
||||||
let errors = [
|
|
||||||
(
|
|
||||||
AscDescError::InvalidSyntax { name: S("truc:machin") },
|
|
||||||
S("invalid syntax for the sort parameter `truc:machin`."),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
AscDescError::InvalidSyntax { name: S("hello:world") },
|
|
||||||
S("invalid syntax for the sort parameter `hello:world`."),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
AscDescError::ReservedKeyword { name: S("_geo") },
|
|
||||||
S("`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
AscDescError::ReservedKeyword { name: S("_geoDistance") },
|
|
||||||
S("_geoDistance is a reserved keyword and thus can't be used as a sort expression.")
|
|
||||||
),
|
|
||||||
(
|
|
||||||
AscDescError::ReservedKeyword { name: S("_geoRadius(12, 13)") },
|
|
||||||
S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
AscDescError::InvalidLatitude,
|
|
||||||
S("Latitude must be contained between -90 and 90 degrees."),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
AscDescError::InvalidLongitude,
|
|
||||||
S("Longitude must be contained between -180 and 180 degrees."),
|
|
||||||
),
|
|
||||||
];
|
|
||||||
|
|
||||||
for (asc_desc_error, expected_message) in errors {
|
|
||||||
let sort_error = SortError::from(asc_desc_error);
|
|
||||||
assert_eq!(
|
|
||||||
sort_error.to_string(),
|
|
||||||
expected_message,
|
|
||||||
"was expecting {} for the error {:?} but instead got {}",
|
|
||||||
expected_message,
|
|
||||||
sort_error,
|
|
||||||
sort_error.to_string()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -17,23 +17,27 @@ pub enum CriterionError {
|
|||||||
impl fmt::Display for CriterionError {
|
impl fmt::Display for CriterionError {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name),
|
Self::InvalidName { name } => write!(f, "`{}` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.", name),
|
||||||
Self::ReservedName { name } => {
|
Self::ReservedName { name } => {
|
||||||
write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name)
|
write!(
|
||||||
|
f,
|
||||||
|
"`{}` is a reserved keyword and thus can't be used as a ranking rule",
|
||||||
|
name
|
||||||
|
)
|
||||||
}
|
}
|
||||||
Self::ReservedNameForSort { name } => {
|
Self::ReservedNameForSort { name } => {
|
||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"{} is a reserved keyword and thus can't be used as a ranking rule. \
|
"`{}` is a reserved keyword and thus can't be used as a ranking rule. \
|
||||||
{} can only be used for sorting at search time",
|
`{}` can only be used for sorting at search time",
|
||||||
name, name
|
name, name
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
Self::ReservedNameForFilter { name } => {
|
Self::ReservedNameForFilter { name } => {
|
||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"{} is a reserved keyword and thus can't be used as a ranking rule. \
|
"`{}` is a reserved keyword and thus can't be used as a ranking rule. \
|
||||||
{} can only be used for filtering at search time",
|
`{}` can only be used for filtering at search time",
|
||||||
name, name
|
name, name
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@ -1,16 +1,20 @@
|
|||||||
|
use std::collections::BTreeMap;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
use std::io::{Cursor, Write};
|
||||||
|
|
||||||
use byteorder::{BigEndian, WriteBytesExt};
|
use byteorder::{BigEndian, WriteBytesExt};
|
||||||
use serde::ser::Serialize;
|
use serde::Deserializer;
|
||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
use super::serde::DocumentSerializer;
|
use super::serde::DocumentVisitor;
|
||||||
use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
|
use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
|
||||||
|
use crate::FieldId;
|
||||||
|
|
||||||
/// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
|
/// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
|
||||||
/// format used by milli.
|
/// format used by milli.
|
||||||
///
|
///
|
||||||
/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
|
/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
|
||||||
/// iterate other the documents.
|
/// iterate over the documents.
|
||||||
///
|
///
|
||||||
/// ## example:
|
/// ## example:
|
||||||
/// ```
|
/// ```
|
||||||
@ -18,43 +22,48 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
|
|||||||
/// use serde_json::json;
|
/// use serde_json::json;
|
||||||
/// use std::io::Cursor;
|
/// use std::io::Cursor;
|
||||||
///
|
///
|
||||||
|
/// let json = r##"{"id": 1, "name": "foo"}"##;
|
||||||
/// let mut writer = Cursor::new(Vec::new());
|
/// let mut writer = Cursor::new(Vec::new());
|
||||||
/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
|
/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
|
||||||
/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap();
|
/// builder.extend_from_json(&mut json.as_bytes()).unwrap();
|
||||||
/// builder.finish().unwrap();
|
/// builder.finish().unwrap();
|
||||||
/// ```
|
/// ```
|
||||||
pub struct DocumentBatchBuilder<W> {
|
pub struct DocumentBatchBuilder<W> {
|
||||||
serializer: DocumentSerializer<W>,
|
inner: ByteCounter<W>,
|
||||||
|
index: DocumentsBatchIndex,
|
||||||
|
obkv_buffer: Vec<u8>,
|
||||||
|
value_buffer: Vec<u8>,
|
||||||
|
values: BTreeMap<FieldId, Value>,
|
||||||
|
count: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
||||||
pub fn new(writer: W) -> Result<Self, Error> {
|
pub fn new(writer: W) -> Result<Self, Error> {
|
||||||
let index = DocumentsBatchIndex::new();
|
let index = DocumentsBatchIndex::default();
|
||||||
let mut writer = ByteCounter::new(writer);
|
let mut writer = ByteCounter::new(writer);
|
||||||
// add space to write the offset of the metadata at the end of the writer
|
// add space to write the offset of the metadata at the end of the writer
|
||||||
writer.write_u64::<BigEndian>(0)?;
|
writer.write_u64::<BigEndian>(0)?;
|
||||||
|
|
||||||
let serializer =
|
Ok(Self {
|
||||||
DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true };
|
inner: writer,
|
||||||
|
index,
|
||||||
Ok(Self { serializer })
|
obkv_buffer: Vec::new(),
|
||||||
|
value_buffer: Vec::new(),
|
||||||
|
values: BTreeMap::new(),
|
||||||
|
count: 0,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the number of documents that have been written to the builder.
|
/// Returns the number of documents that have been written to the builder.
|
||||||
pub fn len(&self) -> usize {
|
pub fn len(&self) -> usize {
|
||||||
self.serializer.count
|
self.count
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This method must be called after the document addition is terminated. It will put the
|
/// This method must be called after the document addition is terminated. It will put the
|
||||||
/// metadata at the end of the file, and write the metadata offset at the beginning on the
|
/// metadata at the end of the file, and write the metadata offset at the beginning on the
|
||||||
/// file.
|
/// file.
|
||||||
pub fn finish(self) -> Result<(), Error> {
|
pub fn finish(self) -> Result<usize, Error> {
|
||||||
let DocumentSerializer {
|
let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self;
|
||||||
writer: ByteCounter { mut writer, count: offset },
|
|
||||||
index,
|
|
||||||
count,
|
|
||||||
..
|
|
||||||
} = self.serializer;
|
|
||||||
|
|
||||||
let meta = DocumentsMetadata { count, index };
|
let meta = DocumentsMetadata { count, index };
|
||||||
|
|
||||||
@ -65,16 +74,481 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
|||||||
|
|
||||||
writer.flush()?;
|
writer.flush()?;
|
||||||
|
|
||||||
Ok(())
|
Ok(count)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds documents to the builder.
|
/// Extends the builder with json documents from a reader.
|
||||||
|
pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
|
||||||
|
let mut de = serde_json::Deserializer::from_reader(reader);
|
||||||
|
|
||||||
|
let mut visitor = DocumentVisitor {
|
||||||
|
inner: &mut self.inner,
|
||||||
|
index: &mut self.index,
|
||||||
|
obkv_buffer: &mut self.obkv_buffer,
|
||||||
|
value_buffer: &mut self.value_buffer,
|
||||||
|
values: &mut self.values,
|
||||||
|
count: &mut self.count,
|
||||||
|
};
|
||||||
|
|
||||||
|
de.deserialize_any(&mut visitor).map_err(Error::JsonError)?
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a builder from a reader of CSV documents.
|
||||||
///
|
///
|
||||||
/// The internal index is updated with the fields found
|
/// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform
|
||||||
/// in the documents. Document must either be a map or a sequences of map, anything else will
|
/// optimisations, and extending from another CSV is not allowed.
|
||||||
/// fail.
|
pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> {
|
||||||
pub fn add_documents<T: Serialize>(&mut self, document: T) -> Result<(), Error> {
|
let mut this = Self::new(writer)?;
|
||||||
document.serialize(&mut self.serializer)?;
|
// Ensure that this is the first and only addition made with this builder
|
||||||
Ok(())
|
debug_assert!(this.index.is_empty());
|
||||||
|
|
||||||
|
let mut records = csv::Reader::from_reader(reader);
|
||||||
|
|
||||||
|
let headers = records
|
||||||
|
.headers()?
|
||||||
|
.into_iter()
|
||||||
|
.map(parse_csv_header)
|
||||||
|
.map(|(k, t)| (this.index.insert(&k), t))
|
||||||
|
.collect::<BTreeMap<_, _>>();
|
||||||
|
|
||||||
|
for (i, record) in records.into_records().enumerate() {
|
||||||
|
let record = record?;
|
||||||
|
this.obkv_buffer.clear();
|
||||||
|
let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer);
|
||||||
|
for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
|
||||||
|
let value = match ty {
|
||||||
|
AllowedType::Number => {
|
||||||
|
value.parse::<f64>().map(Value::from).map_err(|error| {
|
||||||
|
Error::ParseFloat {
|
||||||
|
error,
|
||||||
|
// +1 for the header offset.
|
||||||
|
line: i + 1,
|
||||||
|
value: value.to_string(),
|
||||||
|
}
|
||||||
|
})?
|
||||||
|
}
|
||||||
|
AllowedType::String => Value::String(value.to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
this.value_buffer.clear();
|
||||||
|
serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
|
||||||
|
writer.insert(*fid, &this.value_buffer)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
|
||||||
|
this.inner.write_all(&this.obkv_buffer)?;
|
||||||
|
|
||||||
|
this.count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(this)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
enum AllowedType {
|
||||||
|
String,
|
||||||
|
Number,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_csv_header(header: &str) -> (String, AllowedType) {
|
||||||
|
// if there are several separators we only split on the last one.
|
||||||
|
match header.rsplit_once(':') {
|
||||||
|
Some((field_name, field_type)) => match field_type {
|
||||||
|
"string" => (field_name.to_string(), AllowedType::String),
|
||||||
|
"number" => (field_name.to_string(), AllowedType::Number),
|
||||||
|
// if the pattern isn't reconized, we keep the whole field.
|
||||||
|
_otherwise => (header.to_string(), AllowedType::String),
|
||||||
|
},
|
||||||
|
None => (header.to_string(), AllowedType::String),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use std::io::Cursor;
|
||||||
|
|
||||||
|
use serde_json::{json, Map};
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
use crate::documents::DocumentBatchReader;
|
||||||
|
|
||||||
|
fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
|
||||||
|
let mut map = Map::new();
|
||||||
|
|
||||||
|
for (fid, value) in obkv.iter() {
|
||||||
|
let field_name = index.name(fid).unwrap().clone();
|
||||||
|
let value: Value = serde_json::from_slice(value).unwrap();
|
||||||
|
|
||||||
|
map.insert(field_name, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
Value::Object(map)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn add_single_documents_json() {
|
||||||
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
|
|
||||||
|
let json = serde_json::json!({
|
||||||
|
"id": 1,
|
||||||
|
"field": "hello!",
|
||||||
|
});
|
||||||
|
|
||||||
|
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
|
||||||
|
|
||||||
|
let json = serde_json::json!({
|
||||||
|
"blabla": false,
|
||||||
|
"field": "hello!",
|
||||||
|
"id": 1,
|
||||||
|
});
|
||||||
|
|
||||||
|
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(builder.len(), 2);
|
||||||
|
|
||||||
|
builder.finish().unwrap();
|
||||||
|
|
||||||
|
cursor.set_position(0);
|
||||||
|
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
|
||||||
|
|
||||||
|
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
assert_eq!(index.len(), 3);
|
||||||
|
assert_eq!(document.iter().count(), 2);
|
||||||
|
|
||||||
|
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
assert_eq!(index.len(), 3);
|
||||||
|
assert_eq!(document.iter().count(), 3);
|
||||||
|
|
||||||
|
assert!(reader.next_document_with_index().unwrap().is_none());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn add_documents_seq_json() {
|
||||||
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
|
|
||||||
|
let json = serde_json::json!([{
|
||||||
|
"id": 1,
|
||||||
|
"field": "hello!",
|
||||||
|
},{
|
||||||
|
"blabla": false,
|
||||||
|
"field": "hello!",
|
||||||
|
"id": 1,
|
||||||
|
}
|
||||||
|
]);
|
||||||
|
|
||||||
|
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(builder.len(), 2);
|
||||||
|
|
||||||
|
builder.finish().unwrap();
|
||||||
|
|
||||||
|
cursor.set_position(0);
|
||||||
|
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
|
||||||
|
|
||||||
|
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
assert_eq!(index.len(), 3);
|
||||||
|
assert_eq!(document.iter().count(), 2);
|
||||||
|
|
||||||
|
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
assert_eq!(index.len(), 3);
|
||||||
|
assert_eq!(document.iter().count(), 3);
|
||||||
|
|
||||||
|
assert!(reader.next_document_with_index().unwrap().is_none());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn add_documents_csv() {
|
||||||
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
|
|
||||||
|
let csv = "id:number,field:string\n1,hello!\n2,blabla";
|
||||||
|
|
||||||
|
let builder =
|
||||||
|
DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap();
|
||||||
|
builder.finish().unwrap();
|
||||||
|
|
||||||
|
cursor.set_position(0);
|
||||||
|
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
|
||||||
|
|
||||||
|
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
assert_eq!(index.len(), 2);
|
||||||
|
assert_eq!(document.iter().count(), 2);
|
||||||
|
|
||||||
|
let (_index, document) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
assert_eq!(document.iter().count(), 2);
|
||||||
|
|
||||||
|
assert!(reader.next_document_with_index().unwrap().is_none());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn simple_csv_document() {
|
||||||
|
let documents = r#"city,country,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
|
||||||
|
assert!(reader.next_document_with_index().unwrap().is_none());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn coma_in_field() {
|
||||||
|
let documents = r#"city,country,pop
|
||||||
|
"Boston","United, States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city": "Boston",
|
||||||
|
"country": "United, States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn quote_in_field() {
|
||||||
|
let documents = r#"city,country,pop
|
||||||
|
"Boston","United"" States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city": "Boston",
|
||||||
|
"country": "United\" States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn integer_in_field() {
|
||||||
|
let documents = r#"city,country,pop:number
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": 4628910.0,
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn float_in_field() {
|
||||||
|
let documents = r#"city,country,pop:number
|
||||||
|
"Boston","United States","4628910.01""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": 4628910.01,
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn several_colon_in_header() {
|
||||||
|
let documents = r#"city:love:string,country:state,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city:love": "Boston",
|
||||||
|
"country:state": "United States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn ending_by_colon_in_header() {
|
||||||
|
let documents = r#"city:,country,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city:": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn starting_by_colon_in_header() {
|
||||||
|
let documents = r#":city,country,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
":city": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[ignore]
|
||||||
|
#[test]
|
||||||
|
fn starting_by_colon_in_header2() {
|
||||||
|
let documents = r#":string,country,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
|
||||||
|
assert!(reader.next_document_with_index().is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn double_colon_in_header() {
|
||||||
|
let documents = r#"city::string,country,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city:": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bad_type_in_header() {
|
||||||
|
let documents = r#"city,country:number,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
assert!(
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bad_column_count1() {
|
||||||
|
let documents = r#"city,country,pop
|
||||||
|
"Boston","United States","4628910", "too much""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
assert!(
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bad_column_count2() {
|
||||||
|
let documents = r#"city,country,pop
|
||||||
|
"Boston","United States""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
assert!(
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,8 @@ mod builder;
|
|||||||
mod reader;
|
mod reader;
|
||||||
mod serde;
|
mod serde;
|
||||||
|
|
||||||
use std::{fmt, io};
|
use std::fmt::{self, Debug};
|
||||||
|
use std::io;
|
||||||
|
|
||||||
use ::serde::{Deserialize, Serialize};
|
use ::serde::{Deserialize, Serialize};
|
||||||
use bimap::BiHashMap;
|
use bimap::BiHashMap;
|
||||||
@ -17,7 +18,38 @@ pub use reader::DocumentBatchReader;
|
|||||||
use crate::FieldId;
|
use crate::FieldId;
|
||||||
|
|
||||||
/// A bidirectional map that links field ids to their name in a document batch.
|
/// A bidirectional map that links field ids to their name in a document batch.
|
||||||
pub type DocumentsBatchIndex = BiHashMap<FieldId, String>;
|
#[derive(Default, Debug, Serialize, Deserialize)]
|
||||||
|
pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
|
||||||
|
|
||||||
|
impl DocumentsBatchIndex {
|
||||||
|
/// Insert the field in the map, or return it's field id if it doesn't already exists.
|
||||||
|
pub fn insert(&mut self, field: &str) -> FieldId {
|
||||||
|
match self.0.get_by_right(field) {
|
||||||
|
Some(field_id) => *field_id,
|
||||||
|
None => {
|
||||||
|
let field_id = self.0.len() as FieldId;
|
||||||
|
self.0.insert(field_id, field.to_string());
|
||||||
|
field_id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.0.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.0.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn iter(&self) -> bimap::hash::Iter<FieldId, String> {
|
||||||
|
self.0.iter()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn name(&self, id: FieldId) -> Option<&String> {
|
||||||
|
self.0.get_by_left(&id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
struct DocumentsMetadata {
|
struct DocumentsMetadata {
|
||||||
@ -50,14 +82,22 @@ impl<W: io::Write> io::Write for ByteCounter<W> {
|
|||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum Error {
|
pub enum Error {
|
||||||
|
ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
|
||||||
InvalidDocumentFormat,
|
InvalidDocumentFormat,
|
||||||
Custom(String),
|
Custom(String),
|
||||||
JsonError(serde_json::Error),
|
JsonError(serde_json::Error),
|
||||||
|
CsvError(csv::Error),
|
||||||
Serialize(bincode::Error),
|
Serialize(bincode::Error),
|
||||||
Io(io::Error),
|
Io(io::Error),
|
||||||
DocumentTooLarge,
|
DocumentTooLarge,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<csv::Error> for Error {
|
||||||
|
fn from(e: csv::Error) -> Self {
|
||||||
|
Self::CsvError(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl From<io::Error> for Error {
|
impl From<io::Error> for Error {
|
||||||
fn from(other: io::Error) -> Self {
|
fn from(other: io::Error) -> Self {
|
||||||
Self::Io(other)
|
Self::Io(other)
|
||||||
@ -70,15 +110,25 @@ impl From<bincode::Error> for Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<serde_json::Error> for Error {
|
||||||
|
fn from(other: serde_json::Error) -> Self {
|
||||||
|
Self::JsonError(other)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl fmt::Display for Error {
|
impl fmt::Display for Error {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
|
Error::ParseFloat { error, line, value } => {
|
||||||
|
write!(f, "Error parsing number {:?} at line {}: {}", value, line, error)
|
||||||
|
}
|
||||||
Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
|
Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
|
||||||
Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
|
Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
|
||||||
Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
|
Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
|
||||||
Error::Io(e) => e.fmt(f),
|
Error::Io(e) => write!(f, "{}", e),
|
||||||
Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"),
|
Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"),
|
||||||
Error::Serialize(e) => e.fmt(f),
|
Error::Serialize(e) => write!(f, "{}", e),
|
||||||
|
Error::CsvError(e) => write!(f, "{}", e),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -92,7 +142,8 @@ macro_rules! documents {
|
|||||||
let documents = serde_json::json!($data);
|
let documents = serde_json::json!($data);
|
||||||
let mut writer = std::io::Cursor::new(Vec::new());
|
let mut writer = std::io::Cursor::new(Vec::new());
|
||||||
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
|
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
let documents = serde_json::to_vec(&documents).unwrap();
|
||||||
|
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
|
|
||||||
writer.set_position(0);
|
writer.set_position(0);
|
||||||
@ -103,6 +154,8 @@ macro_rules! documents {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
|
use std::io::Cursor;
|
||||||
|
|
||||||
use serde_json::{json, Value};
|
use serde_json::{json, Value};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
@ -119,12 +172,14 @@ mod test {
|
|||||||
"bool": true
|
"bool": true
|
||||||
});
|
});
|
||||||
|
|
||||||
|
let json = serde_json::to_vec(&json).unwrap();
|
||||||
|
|
||||||
let mut v = Vec::new();
|
let mut v = Vec::new();
|
||||||
let mut cursor = io::Cursor::new(&mut v);
|
let mut cursor = io::Cursor::new(&mut v);
|
||||||
|
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
|
|
||||||
builder.add_documents(json).unwrap();
|
builder.extend_from_json(Cursor::new(json)).unwrap();
|
||||||
|
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
|
|
||||||
@ -148,13 +203,16 @@ mod test {
|
|||||||
"toto": false,
|
"toto": false,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
let doc1 = serde_json::to_vec(&doc1).unwrap();
|
||||||
|
let doc2 = serde_json::to_vec(&doc2).unwrap();
|
||||||
|
|
||||||
let mut v = Vec::new();
|
let mut v = Vec::new();
|
||||||
let mut cursor = io::Cursor::new(&mut v);
|
let mut cursor = io::Cursor::new(&mut v);
|
||||||
|
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
|
|
||||||
builder.add_documents(doc1).unwrap();
|
builder.extend_from_json(Cursor::new(doc1)).unwrap();
|
||||||
builder.add_documents(doc2).unwrap();
|
builder.extend_from_json(Cursor::new(doc2)).unwrap();
|
||||||
|
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
|
|
||||||
@ -177,12 +235,14 @@ mod test {
|
|||||||
{ "tata": "hello" },
|
{ "tata": "hello" },
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
let docs = serde_json::to_vec(&docs).unwrap();
|
||||||
|
|
||||||
let mut v = Vec::new();
|
let mut v = Vec::new();
|
||||||
let mut cursor = io::Cursor::new(&mut v);
|
let mut cursor = io::Cursor::new(&mut v);
|
||||||
|
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
|
|
||||||
builder.add_documents(docs).unwrap();
|
builder.extend_from_json(Cursor::new(docs)).unwrap();
|
||||||
|
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
|
|
||||||
@ -210,11 +270,13 @@ mod test {
|
|||||||
{ "tata": "hello" },
|
{ "tata": "hello" },
|
||||||
]]);
|
]]);
|
||||||
|
|
||||||
assert!(builder.add_documents(docs).is_err());
|
let docs = serde_json::to_vec(&docs).unwrap();
|
||||||
|
assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
|
||||||
|
|
||||||
let docs = json!("hello");
|
let docs = json!("hello");
|
||||||
|
let docs = serde_json::to_vec(&docs).unwrap();
|
||||||
|
|
||||||
assert!(builder.add_documents(docs).is_err());
|
assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -1,474 +1,134 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::convert::TryInto;
|
use std::fmt;
|
||||||
use std::io::Cursor;
|
use std::io::{Cursor, Write};
|
||||||
use std::{fmt, io};
|
|
||||||
|
|
||||||
use byteorder::{BigEndian, WriteBytesExt};
|
use byteorder::WriteBytesExt;
|
||||||
use obkv::KvWriter;
|
use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
|
||||||
use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer};
|
use serde::Deserialize;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use super::{ByteCounter, DocumentsBatchIndex, Error};
|
use super::{ByteCounter, DocumentsBatchIndex, Error};
|
||||||
use crate::FieldId;
|
use crate::FieldId;
|
||||||
|
|
||||||
pub struct DocumentSerializer<W> {
|
macro_rules! tri {
|
||||||
pub writer: ByteCounter<W>,
|
($e:expr) => {
|
||||||
pub buffer: Vec<u8>,
|
match $e {
|
||||||
pub index: DocumentsBatchIndex,
|
Ok(r) => r,
|
||||||
pub count: usize,
|
Err(e) => return Ok(Err(e.into())),
|
||||||
pub allow_seq: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer<W> {
|
|
||||||
type Ok = ();
|
|
||||||
|
|
||||||
type Error = Error;
|
|
||||||
|
|
||||||
type SerializeSeq = SeqSerializer<'a, W>;
|
|
||||||
type SerializeTuple = Impossible<(), Self::Error>;
|
|
||||||
type SerializeTupleStruct = Impossible<(), Self::Error>;
|
|
||||||
type SerializeTupleVariant = Impossible<(), Self::Error>;
|
|
||||||
type SerializeMap = MapSerializer<'a, &'a mut ByteCounter<W>>;
|
|
||||||
type SerializeStruct = Impossible<(), Self::Error>;
|
|
||||||
type SerializeStructVariant = Impossible<(), Self::Error>;
|
|
||||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
|
||||||
self.buffer.clear();
|
|
||||||
let cursor = io::Cursor::new(&mut self.buffer);
|
|
||||||
self.count += 1;
|
|
||||||
let map_serializer = MapSerializer {
|
|
||||||
map: KvWriter::new(cursor),
|
|
||||||
index: &mut self.index,
|
|
||||||
writer: &mut self.writer,
|
|
||||||
mapped_documents: BTreeMap::new(),
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(map_serializer)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
|
||||||
if self.allow_seq {
|
|
||||||
// Only allow sequence of documents of depth 1.
|
|
||||||
self.allow_seq = false;
|
|
||||||
Ok(SeqSerializer { serializer: self })
|
|
||||||
} else {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
|
||||||
where
|
|
||||||
T: Serialize,
|
|
||||||
{
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_unit_variant(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_variant_index: u32,
|
|
||||||
_variant: &'static str,
|
|
||||||
) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_newtype_struct<T: ?Sized>(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_value: &T,
|
|
||||||
) -> Result<Self::Ok, Self::Error>
|
|
||||||
where
|
|
||||||
T: Serialize,
|
|
||||||
{
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_newtype_variant<T: ?Sized>(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_variant_index: u32,
|
|
||||||
_variant: &'static str,
|
|
||||||
_value: &T,
|
|
||||||
) -> Result<Self::Ok, Self::Error>
|
|
||||||
where
|
|
||||||
T: Serialize,
|
|
||||||
{
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_tuple_struct(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_len: usize,
|
|
||||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_tuple_variant(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_variant_index: u32,
|
|
||||||
_variant: &'static str,
|
|
||||||
_len: usize,
|
|
||||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_struct(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_len: usize,
|
|
||||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_struct_variant(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_variant_index: u32,
|
|
||||||
_variant: &'static str,
|
|
||||||
_len: usize,
|
|
||||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct SeqSerializer<'a, W> {
|
|
||||||
serializer: &'a mut DocumentSerializer<W>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> {
|
|
||||||
type Ok = ();
|
|
||||||
type Error = Error;
|
|
||||||
|
|
||||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
|
||||||
where
|
|
||||||
T: Serialize,
|
|
||||||
{
|
|
||||||
value.serialize(&mut *self.serializer)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct MapSerializer<'a, W> {
|
|
||||||
map: KvWriter<io::Cursor<&'a mut Vec<u8>>, FieldId>,
|
|
||||||
index: &'a mut DocumentsBatchIndex,
|
|
||||||
writer: W,
|
|
||||||
mapped_documents: BTreeMap<FieldId, Value>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// This implementation of SerializeMap uses serilialize_entry instead of seriliaze_key and
|
|
||||||
/// serialize_value, therefore these to methods remain unimplemented.
|
|
||||||
impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> {
|
|
||||||
type Ok = ();
|
|
||||||
type Error = Error;
|
|
||||||
|
|
||||||
fn serialize_key<T: ?Sized + Serialize>(&mut self, _key: &T) -> Result<(), Self::Error> {
|
|
||||||
unreachable!()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_value<T: ?Sized>(&mut self, _value: &T) -> Result<(), Self::Error> {
|
|
||||||
unreachable!()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn end(mut self) -> Result<Self::Ok, Self::Error> {
|
|
||||||
let mut buf = Vec::new();
|
|
||||||
for (key, value) in self.mapped_documents {
|
|
||||||
buf.clear();
|
|
||||||
let mut cursor = Cursor::new(&mut buf);
|
|
||||||
serde_json::to_writer(&mut cursor, &value).map_err(Error::JsonError)?;
|
|
||||||
self.map.insert(key, cursor.into_inner()).map_err(Error::Io)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
let data = self.map.into_inner().map_err(Error::Io)?.into_inner();
|
|
||||||
let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?;
|
|
||||||
|
|
||||||
self.writer.write_u32::<BigEndian>(data_len).map_err(Error::Io)?;
|
|
||||||
self.writer.write_all(&data).map_err(Error::Io)?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
|
||||||
&mut self,
|
|
||||||
key: &K,
|
|
||||||
value: &V,
|
|
||||||
) -> Result<(), Self::Error>
|
|
||||||
where
|
|
||||||
K: Serialize,
|
|
||||||
V: Serialize,
|
|
||||||
{
|
|
||||||
let field_serializer = FieldSerializer { index: &mut self.index };
|
|
||||||
let field_id: FieldId = key.serialize(field_serializer)?;
|
|
||||||
|
|
||||||
let value = serde_json::to_value(value).map_err(Error::JsonError)?;
|
|
||||||
|
|
||||||
self.mapped_documents.insert(field_id, value);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct FieldSerializer<'a> {
|
|
||||||
index: &'a mut DocumentsBatchIndex,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> serde::Serializer for FieldSerializer<'a> {
|
|
||||||
type Ok = FieldId;
|
|
||||||
|
|
||||||
type Error = Error;
|
|
||||||
|
|
||||||
type SerializeSeq = Impossible<FieldId, Self::Error>;
|
|
||||||
type SerializeTuple = Impossible<FieldId, Self::Error>;
|
|
||||||
type SerializeTupleStruct = Impossible<FieldId, Self::Error>;
|
|
||||||
type SerializeTupleVariant = Impossible<FieldId, Self::Error>;
|
|
||||||
type SerializeMap = Impossible<FieldId, Self::Error>;
|
|
||||||
type SerializeStruct = Impossible<FieldId, Self::Error>;
|
|
||||||
type SerializeStructVariant = Impossible<FieldId, Self::Error>;
|
|
||||||
|
|
||||||
fn serialize_str(self, ws: &str) -> Result<Self::Ok, Self::Error> {
|
|
||||||
let field_id = match self.index.get_by_right(ws) {
|
|
||||||
Some(field_id) => *field_id,
|
|
||||||
None => {
|
|
||||||
let field_id = self.index.len() as FieldId;
|
|
||||||
self.index.insert(field_id, ws.to_string());
|
|
||||||
field_id
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(field_id)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
|
struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex);
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
|
impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> {
|
||||||
Err(Error::InvalidDocumentFormat)
|
type Value = FieldId;
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
|
fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
|
||||||
where
|
where
|
||||||
T: Serialize,
|
D: serde::Deserializer<'de>,
|
||||||
{
|
{
|
||||||
Err(Error::InvalidDocumentFormat)
|
deserializer.deserialize_str(self)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> {
|
||||||
Err(Error::InvalidDocumentFormat)
|
type Value = FieldId;
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_unit_variant(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_variant_index: u32,
|
|
||||||
_variant: &'static str,
|
|
||||||
) -> Result<Self::Ok, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_newtype_struct<T: ?Sized>(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_value: &T,
|
|
||||||
) -> Result<Self::Ok, Self::Error>
|
|
||||||
where
|
where
|
||||||
T: Serialize,
|
E: serde::de::Error,
|
||||||
{
|
{
|
||||||
Err(Error::InvalidDocumentFormat)
|
Ok(self.0.insert(v))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_newtype_variant<T: ?Sized>(
|
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
self,
|
write!(f, "a string")
|
||||||
_name: &'static str,
|
}
|
||||||
_variant_index: u32,
|
}
|
||||||
_variant: &'static str,
|
|
||||||
_value: &T,
|
struct ValueDeserializer;
|
||||||
) -> Result<Self::Ok, Self::Error>
|
|
||||||
|
impl<'de> DeserializeSeed<'de> for ValueDeserializer {
|
||||||
|
type Value = serde_json::Value;
|
||||||
|
|
||||||
|
fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
|
||||||
where
|
where
|
||||||
T: Serialize,
|
D: serde::Deserializer<'de>,
|
||||||
{
|
{
|
||||||
Err(Error::InvalidDocumentFormat)
|
serde_json::Value::deserialize(deserializer)
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_tuple_struct(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_len: usize,
|
|
||||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_tuple_variant(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_variant_index: u32,
|
|
||||||
_variant: &'static str,
|
|
||||||
_len: usize,
|
|
||||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_struct(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_len: usize,
|
|
||||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize_struct_variant(
|
|
||||||
self,
|
|
||||||
_name: &'static str,
|
|
||||||
_variant_index: u32,
|
|
||||||
_variant: &'static str,
|
|
||||||
_len: usize,
|
|
||||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
|
||||||
Err(Error::InvalidDocumentFormat)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl serde::ser::Error for Error {
|
pub struct DocumentVisitor<'a, W> {
|
||||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
pub inner: &'a mut ByteCounter<W>,
|
||||||
Error::Custom(msg.to_string())
|
pub index: &'a mut DocumentsBatchIndex,
|
||||||
|
pub obkv_buffer: &'a mut Vec<u8>,
|
||||||
|
pub value_buffer: &'a mut Vec<u8>,
|
||||||
|
pub values: &'a mut BTreeMap<FieldId, Value>,
|
||||||
|
pub count: &'a mut usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
|
||||||
|
/// This Visitor value is nothing, since it write the value to a file.
|
||||||
|
type Value = Result<(), Error>;
|
||||||
|
|
||||||
|
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
|
||||||
|
where
|
||||||
|
A: SeqAccess<'de>,
|
||||||
|
{
|
||||||
|
while let Some(v) = seq.next_element_seed(&mut *self)? {
|
||||||
|
tri!(v)
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Ok(()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
|
||||||
|
where
|
||||||
|
A: MapAccess<'de>,
|
||||||
|
{
|
||||||
|
while let Some((key, value)) =
|
||||||
|
map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)?
|
||||||
|
{
|
||||||
|
self.values.insert(key, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.obkv_buffer.clear();
|
||||||
|
let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer));
|
||||||
|
for (key, value) in self.values.iter() {
|
||||||
|
self.value_buffer.clear();
|
||||||
|
// This is guaranteed to work
|
||||||
|
tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value));
|
||||||
|
tri!(obkv.insert(*key, &self.value_buffer));
|
||||||
|
}
|
||||||
|
|
||||||
|
let reader = tri!(obkv.into_inner()).into_inner();
|
||||||
|
|
||||||
|
tri!(self.inner.write_u32::<byteorder::BigEndian>(reader.len() as u32));
|
||||||
|
tri!(self.inner.write_all(reader));
|
||||||
|
|
||||||
|
*self.count += 1;
|
||||||
|
self.values.clear();
|
||||||
|
|
||||||
|
Ok(Ok(()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
write!(f, "a documents, or a sequence of documents.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W>
|
||||||
|
where
|
||||||
|
W: Write,
|
||||||
|
{
|
||||||
|
type Value = Result<(), Error>;
|
||||||
|
|
||||||
|
fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
|
||||||
|
where
|
||||||
|
D: serde::Deserializer<'de>,
|
||||||
|
{
|
||||||
|
deserializer.deserialize_map(self)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::BTreeSet;
|
||||||
use std::convert::Infallible;
|
use std::convert::Infallible;
|
||||||
use std::error::Error as StdError;
|
use std::error::Error as StdError;
|
||||||
use std::{fmt, io, str};
|
use std::{fmt, io, str};
|
||||||
@ -57,19 +57,17 @@ pub enum UserError {
|
|||||||
CriterionError(CriterionError),
|
CriterionError(CriterionError),
|
||||||
DocumentLimitReached,
|
DocumentLimitReached,
|
||||||
InvalidDocumentId { document_id: Value },
|
InvalidDocumentId { document_id: Value },
|
||||||
InvalidFacetsDistribution { invalid_facets_name: HashSet<String> },
|
InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
|
||||||
InvalidGeoField { document_id: Value, object: Value },
|
InvalidGeoField { document_id: Value, object: Value },
|
||||||
InvalidFilter(String),
|
InvalidFilter(String),
|
||||||
InvalidSortName { name: String },
|
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
|
||||||
InvalidSortableAttribute { field: String, valid_fields: HashSet<String> },
|
|
||||||
SortRankingRuleMissing,
|
SortRankingRuleMissing,
|
||||||
InvalidStoreFile,
|
InvalidStoreFile,
|
||||||
MaxDatabaseSizeReached,
|
MaxDatabaseSizeReached,
|
||||||
MissingDocumentId { document: Object },
|
MissingDocumentId { primary_key: String, document: Object },
|
||||||
MissingPrimaryKey,
|
MissingPrimaryKey,
|
||||||
NoSpaceLeftOnDevice,
|
NoSpaceLeftOnDevice,
|
||||||
PrimaryKeyCannotBeChanged,
|
PrimaryKeyCannotBeChanged(String),
|
||||||
PrimaryKeyCannotBeReset,
|
|
||||||
SerdeJson(serde_json::Error),
|
SerdeJson(serde_json::Error),
|
||||||
SortError(SortError),
|
SortError(SortError),
|
||||||
UnknownInternalDocumentId { document_id: DocumentId },
|
UnknownInternalDocumentId { document_id: DocumentId },
|
||||||
@ -168,7 +166,7 @@ impl From<SerializationError> for Error {
|
|||||||
impl fmt::Display for Error {
|
impl fmt::Display for Error {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
Self::InternalError(error) => write!(f, "internal: {}", error),
|
Self::InternalError(error) => write!(f, "internal: {}.", error),
|
||||||
Self::IoError(error) => error.fmt(f),
|
Self::IoError(error) => error.fmt(f),
|
||||||
Self::UserError(error) => error.fmt(f),
|
Self::UserError(error) => error.fmt(f),
|
||||||
}
|
}
|
||||||
@ -181,15 +179,15 @@ impl fmt::Display for InternalError {
|
|||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
Self::DatabaseMissingEntry { db_name, key } => {
|
Self::DatabaseMissingEntry { db_name, key } => {
|
||||||
write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name)
|
write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name)
|
||||||
}
|
}
|
||||||
Self::FieldIdMapMissingEntry(error) => error.fmt(f),
|
Self::FieldIdMapMissingEntry(error) => error.fmt(f),
|
||||||
Self::Fst(error) => error.fmt(f),
|
Self::Fst(error) => error.fmt(f),
|
||||||
Self::GrenadInvalidCompressionType => {
|
Self::GrenadInvalidCompressionType => {
|
||||||
f.write_str("invalid compression type have been specified to grenad")
|
f.write_str("Invalid compression type have been specified to grenad.")
|
||||||
}
|
}
|
||||||
Self::IndexingMergingKeys { process } => {
|
Self::IndexingMergingKeys { process } => {
|
||||||
write!(f, "invalid merge while processing {}", process)
|
write!(f, "Invalid merge while processing {}.", process)
|
||||||
}
|
}
|
||||||
Self::Serialization(error) => error.fmt(f),
|
Self::Serialization(error) => error.fmt(f),
|
||||||
Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f),
|
Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f),
|
||||||
@ -207,69 +205,75 @@ impl StdError for InternalError {}
|
|||||||
impl fmt::Display for UserError {
|
impl fmt::Display for UserError {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
Self::InvalidFilter(input) => write!(f, "{}", input),
|
Self::InvalidFilter(error) => f.write_str(error),
|
||||||
Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"),
|
Self::AttributeLimitReached => f.write_str("A document cannot contain more than 65,535 fields."),
|
||||||
Self::CriterionError(error) => write!(f, "{}", error),
|
Self::CriterionError(error) => write!(f, "{}", error),
|
||||||
Self::DocumentLimitReached => f.write_str("maximum number of documents reached"),
|
Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."),
|
||||||
Self::InvalidFacetsDistribution { invalid_facets_name } => {
|
Self::InvalidFacetsDistribution { invalid_facets_name } => {
|
||||||
let name_list =
|
let name_list =
|
||||||
invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ");
|
invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ");
|
||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"invalid facet distribution, the fields {} are not set as filterable",
|
"Invalid facet distribution, the fields `{}` are not set as filterable.",
|
||||||
name_list
|
name_list
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
Self::InvalidGeoField { document_id, object } => write!(
|
Self::InvalidGeoField { document_id, object } => {
|
||||||
f,
|
let document_id = match document_id {
|
||||||
"the document with the id: {} contains an invalid _geo field: {}",
|
Value::String(id) => id.clone(),
|
||||||
document_id, object
|
_ => document_id.to_string(),
|
||||||
),
|
};
|
||||||
Self::InvalidDocumentId { document_id } => {
|
let object = match object {
|
||||||
let json = serde_json::to_string(document_id).unwrap();
|
Value::String(id) => id.clone(),
|
||||||
|
_ => object.to_string(),
|
||||||
|
};
|
||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"document identifier is invalid {}, \
|
"The document with the id: `{}` contains an invalid _geo field: `{}`.",
|
||||||
a document id can be of type integer or string \
|
document_id, object
|
||||||
only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)",
|
)
|
||||||
json
|
},
|
||||||
|
Self::InvalidDocumentId { document_id } => {
|
||||||
|
let document_id = match document_id {
|
||||||
|
Value::String(id) => id.clone(),
|
||||||
|
_ => document_id.to_string(),
|
||||||
|
};
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"Document identifier `{}` is invalid. \
|
||||||
|
A document identifier can be of type integer or string, \
|
||||||
|
only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).",
|
||||||
|
document_id
|
||||||
)
|
)
|
||||||
}
|
|
||||||
Self::InvalidSortName { name } => {
|
|
||||||
write!(f, "Invalid syntax for the sort parameter: {}", name)
|
|
||||||
}
|
}
|
||||||
Self::InvalidSortableAttribute { field, valid_fields } => {
|
Self::InvalidSortableAttribute { field, valid_fields } => {
|
||||||
let valid_names =
|
let valid_names =
|
||||||
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ");
|
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ");
|
||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"Attribute {} is not sortable, available sortable attributes are: {}",
|
"Attribute `{}` is not sortable. Available sortable attributes are: `{}`.",
|
||||||
field, valid_names
|
field, valid_names
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
Self::SortRankingRuleMissing => f.write_str(
|
Self::SortRankingRuleMissing => f.write_str(
|
||||||
"You must specify where \"sort\" is listed in the \
|
"The sort ranking rule must be specified in the \
|
||||||
rankingRules setting to use the sort parameter at search time",
|
ranking rules settings to use the sort parameter at search time.",
|
||||||
),
|
),
|
||||||
Self::MissingDocumentId { document } => {
|
Self::MissingDocumentId { primary_key, document } => {
|
||||||
let json = serde_json::to_string(document).unwrap();
|
let json = serde_json::to_string(document).unwrap();
|
||||||
write!(f, "document doesn't have an identifier {}", json)
|
write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json)
|
||||||
}
|
}
|
||||||
Self::MissingPrimaryKey => f.write_str("missing primary key"),
|
Self::MissingPrimaryKey => f.write_str("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index."),
|
||||||
Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"),
|
Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."),
|
||||||
// TODO where can we find it instead of writing the text ourselves?
|
Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."),
|
||||||
Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"),
|
Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."),
|
||||||
Self::InvalidStoreFile => f.write_str("store file is not a valid database file"),
|
Self::PrimaryKeyCannotBeChanged(primary_key) => {
|
||||||
Self::PrimaryKeyCannotBeChanged => {
|
write!(f, "Index already has a primary key: `{}`.", primary_key)
|
||||||
f.write_str("primary key cannot be changed if the database contains documents")
|
|
||||||
}
|
|
||||||
Self::PrimaryKeyCannotBeReset => {
|
|
||||||
f.write_str("primary key cannot be reset if the database contains documents")
|
|
||||||
}
|
}
|
||||||
Self::SerdeJson(error) => error.fmt(f),
|
Self::SerdeJson(error) => error.fmt(f),
|
||||||
Self::SortError(error) => write!(f, "{}", error),
|
Self::SortError(error) => write!(f, "{}", error),
|
||||||
Self::UnknownInternalDocumentId { document_id } => {
|
Self::UnknownInternalDocumentId { document_id } => {
|
||||||
write!(f, "an unknown internal document id have been used ({})", document_id)
|
write!(f, "An unknown internal document id have been used: `{}`.", document_id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -953,6 +953,7 @@ pub(crate) mod tests {
|
|||||||
{ "id": 1, "name": "kevin", "has_dog": true },
|
{ "id": 1, "name": "kevin", "has_dog": true },
|
||||||
{ "id": 2, "name": "bob" }
|
{ "id": 2, "name": "bob" }
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||||
builder.execute(content, |_, _| ()).unwrap();
|
builder.execute(content, |_, _| ()).unwrap();
|
||||||
|
@ -68,7 +68,9 @@ mod test {
|
|||||||
"txts": sample_txts[..(rng.gen_range(0..3))],
|
"txts": sample_txts[..(rng.gen_range(0..3))],
|
||||||
"cat-ints": sample_ints[..(rng.gen_range(0..3))],
|
"cat-ints": sample_ints[..(rng.gen_range(0..3))],
|
||||||
});
|
});
|
||||||
builder.add_documents(doc).unwrap();
|
|
||||||
|
let doc = Cursor::new(serde_json::to_vec(&doc).unwrap());
|
||||||
|
builder.extend_from_json(doc).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
|
33
milli/src/search/facet/grammar.pest
Normal file
33
milli/src/search/facet/grammar.pest
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
key = _{reserved | quoted | word }
|
||||||
|
value = _{quoted | word }
|
||||||
|
quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP }
|
||||||
|
string = {char*}
|
||||||
|
word = ${(LETTER | NUMBER | "_" | "-" | ".")+}
|
||||||
|
|
||||||
|
char = _{ !(PEEK | "\\") ~ ANY
|
||||||
|
| "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t")
|
||||||
|
| "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}
|
||||||
|
|
||||||
|
reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" }
|
||||||
|
// we deliberately choose to allow empty parameters to generate more specific error message later
|
||||||
|
parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""}
|
||||||
|
condition = _{between | eq | greater | less | geq | leq | neq}
|
||||||
|
between = {key ~ value ~ "TO" ~ value}
|
||||||
|
geq = {key ~ ">=" ~ value}
|
||||||
|
leq = {key ~ "<=" ~ value}
|
||||||
|
neq = {key ~ "!=" ~ value}
|
||||||
|
eq = {key ~ "=" ~ value}
|
||||||
|
greater = {key ~ ">" ~ value}
|
||||||
|
less = {key ~ "<" ~ value}
|
||||||
|
geo_radius = {"_geoRadius" ~ parameters }
|
||||||
|
|
||||||
|
prgm = {SOI ~ expr ~ EOI}
|
||||||
|
expr = _{ ( term ~ (operation ~ term)* ) }
|
||||||
|
term = { ("(" ~ expr ~ ")") | condition | not | geo_radius }
|
||||||
|
operation = _{ and | or }
|
||||||
|
and = {"AND"}
|
||||||
|
or = {"OR"}
|
||||||
|
|
||||||
|
not = {"NOT" ~ term}
|
||||||
|
|
||||||
|
WHITESPACE = _{ " " }
|
@ -151,13 +151,13 @@ impl<'a> Search<'a> {
|
|||||||
Member::Field(ref field) if !sortable_fields.contains(field) => {
|
Member::Field(ref field) if !sortable_fields.contains(field) => {
|
||||||
return Err(UserError::InvalidSortableAttribute {
|
return Err(UserError::InvalidSortableAttribute {
|
||||||
field: field.to_string(),
|
field: field.to_string(),
|
||||||
valid_fields: sortable_fields,
|
valid_fields: sortable_fields.into_iter().collect(),
|
||||||
})?
|
})?
|
||||||
}
|
}
|
||||||
Member::Geo(_) if !sortable_fields.contains("_geo") => {
|
Member::Geo(_) if !sortable_fields.contains("_geo") => {
|
||||||
return Err(UserError::InvalidSortableAttribute {
|
return Err(UserError::InvalidSortableAttribute {
|
||||||
field: "_geo".to_string(),
|
field: "_geo".to_string(),
|
||||||
valid_fields: sortable_fields,
|
valid_fields: sortable_fields.into_iter().collect(),
|
||||||
})?
|
})?
|
||||||
}
|
}
|
||||||
_ => (),
|
_ => (),
|
||||||
|
@ -877,7 +877,8 @@ mod tests {
|
|||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
|
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
builder.add_documents(big_object).unwrap();
|
let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap());
|
||||||
|
builder.extend_from_json(big_object).unwrap();
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
cursor.set_position(0);
|
cursor.set_position(0);
|
||||||
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
||||||
@ -905,8 +906,9 @@ mod tests {
|
|||||||
|
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
|
|
||||||
|
let big_object = serde_json::to_string(&big_object).unwrap();
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
builder.add_documents(big_object).unwrap();
|
builder.extend_from_json(&mut big_object.as_bytes()).unwrap();
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
cursor.set_position(0);
|
cursor.set_position(0);
|
||||||
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
||||||
|
@ -75,7 +75,7 @@ fn create_fields_mapping(
|
|||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn find_primary_key(index: &bimap::BiHashMap<u16, String>) -> Option<&str> {
|
fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> {
|
||||||
index
|
index
|
||||||
.iter()
|
.iter()
|
||||||
.sorted_by_key(|(k, _)| *k)
|
.sorted_by_key(|(k, _)| *k)
|
||||||
@ -179,7 +179,7 @@ impl Transform<'_, '_> {
|
|||||||
if !self.autogenerate_docids {
|
if !self.autogenerate_docids {
|
||||||
let mut json = Map::new();
|
let mut json = Map::new();
|
||||||
for (key, value) in document.iter() {
|
for (key, value) in document.iter() {
|
||||||
let key = addition_index.get_by_left(&key).cloned();
|
let key = addition_index.name(key).cloned();
|
||||||
let value = serde_json::from_slice::<Value>(&value).ok();
|
let value = serde_json::from_slice::<Value>(&value).ok();
|
||||||
|
|
||||||
if let Some((k, v)) = key.zip(value) {
|
if let Some((k, v)) = key.zip(value) {
|
||||||
@ -187,7 +187,11 @@ impl Transform<'_, '_> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Err(UserError::MissingDocumentId { document: json }.into());
|
return Err(UserError::MissingDocumentId {
|
||||||
|
primary_key: primary_key_name,
|
||||||
|
document: json,
|
||||||
|
}
|
||||||
|
.into());
|
||||||
}
|
}
|
||||||
|
|
||||||
let uuid =
|
let uuid =
|
||||||
@ -544,6 +548,7 @@ mod test {
|
|||||||
mod primary_key_inference {
|
mod primary_key_inference {
|
||||||
use bimap::BiHashMap;
|
use bimap::BiHashMap;
|
||||||
|
|
||||||
|
use crate::documents::DocumentsBatchIndex;
|
||||||
use crate::update::index_documents::transform::find_primary_key;
|
use crate::update::index_documents::transform::find_primary_key;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -557,7 +562,7 @@ mod test {
|
|||||||
map.insert(4, "fakeId".to_string());
|
map.insert(4, "fakeId".to_string());
|
||||||
map.insert(0, "realId".to_string());
|
map.insert(0, "realId".to_string());
|
||||||
|
|
||||||
assert_eq!(find_primary_key(&map), Some("realId"));
|
assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -465,7 +465,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
self.index.put_primary_key(self.wtxn, primary_key)?;
|
self.index.put_primary_key(self.wtxn, primary_key)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
} else {
|
} else {
|
||||||
Err(UserError::PrimaryKeyCannotBeChanged.into())
|
let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
|
||||||
|
Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Setting::Reset => {
|
Setting::Reset => {
|
||||||
@ -473,7 +474,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
self.index.delete_primary_key(self.wtxn)?;
|
self.index.delete_primary_key(self.wtxn)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
} else {
|
} else {
|
||||||
Err(UserError::PrimaryKeyCannotBeReset.into())
|
let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
|
||||||
|
Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Setting::NotSet => Ok(()),
|
Setting::NotSet => Ok(()),
|
||||||
@ -1106,7 +1108,7 @@ mod tests {
|
|||||||
builder.reset_primary_key();
|
builder.reset_primary_key();
|
||||||
|
|
||||||
let err = builder.execute(|_, _| ()).unwrap_err();
|
let err = builder.execute(|_, _| ()).unwrap_err();
|
||||||
assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset)));
|
assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_))));
|
||||||
wtxn.abort().unwrap();
|
wtxn.abort().unwrap();
|
||||||
|
|
||||||
// But if we clear the database...
|
// But if we clear the database...
|
||||||
|
@ -61,9 +61,12 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
|||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
let reader = Cursor::new(CONTENT.as_bytes());
|
let reader = Cursor::new(CONTENT.as_bytes());
|
||||||
|
|
||||||
for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
|
for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
|
||||||
documents_builder.add_documents(doc.unwrap()).unwrap();
|
let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap());
|
||||||
|
documents_builder.extend_from_json(doc).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
documents_builder.finish().unwrap();
|
documents_builder.finish().unwrap();
|
||||||
|
|
||||||
cursor.set_position(0);
|
cursor.set_position(0);
|
||||||
|
@ -409,7 +409,8 @@ fn criteria_ascdesc() {
|
|||||||
"age": age,
|
"age": age,
|
||||||
});
|
});
|
||||||
|
|
||||||
batch_builder.add_documents(json).unwrap();
|
let json = Cursor::new(serde_json::to_vec(&json).unwrap());
|
||||||
|
batch_builder.extend_from_json(json).unwrap();
|
||||||
});
|
});
|
||||||
|
|
||||||
batch_builder.finish().unwrap();
|
batch_builder.finish().unwrap();
|
||||||
|
@ -1,18 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "search"
|
|
||||||
version = "0.18.0"
|
|
||||||
authors = ["Clément Renault <clement@meilisearch.com>"]
|
|
||||||
edition = "2018"
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
anyhow = "1.0.38"
|
|
||||||
byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
|
|
||||||
heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
|
|
||||||
log = "0.4.14"
|
|
||||||
milli = { path = "../milli" }
|
|
||||||
serde_json = "1.0.62"
|
|
||||||
stderrlog = "0.5.1"
|
|
||||||
structopt = { version = "0.3.21", default-features = false }
|
|
||||||
|
|
||||||
[target.'cfg(target_os = "linux")'.dependencies]
|
|
||||||
jemallocator = "0.3.2"
|
|
Loading…
x
Reference in New Issue
Block a user