Implement documents format

document reader transform

remove update format

support document sequences

fix document transform

clean transform

improve error handling

add documents! macro

fix transform bug

fix tests

remove csv dependency

Add comments on the transform process

replace search cli

fmt

review edits

fix http ui

fix clippy warnings

Revert "fix clippy warnings"

This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620.

fix review comments

remove smallvec in transform loop

review edits
mpostma 2021-08-31 11:44:15 +02:00
parent 94764e5c7c
commit aa6c5df0bc
25 changed files with 5114 additions and 713 deletions

Cargo.lock (generated, 3410 lines)

File diff suppressed because it is too large.

Cargo.toml

@ -1,5 +1,5 @@
[workspace]
members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"]
members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "cli"]
default-members = ["milli"]
[profile.dev]

cli/Cargo.toml (new file, 24 lines)

@ -0,0 +1,24 @@
[package]
name = "cli"
version = "0.1.0"
edition = "2018"
description = "A CLI to interact with a milli index"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
indicatif = "0.16.2"
serde = "1.0.129"
serde_json = "1.0.66"
structopt = "0.3.22"
milli = { path = "../milli" }
eyre = "0.6.5"
color-eyre = "0.5.11"
heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
byte-unit = { version = "4.0.12", features = ["serde"] }
bimap = "0.6.1"
csv = "1.1.6"
stderrlog = "0.5.1"
[target.'cfg(target_os = "linux")'.dependencies]
jemallocator = "0.3.2"

cli/src/main.rs (new file, 335 lines)

@ -0,0 +1,335 @@
use std::fs::File;
use std::io::{stdin, Cursor, Read};
use std::path::PathBuf;
use std::str::FromStr;
use byte_unit::Byte;
use eyre::Result;
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
use milli::update::UpdateIndexingStep::{
ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
};
use serde_json::{Map, Value};
use structopt::StructOpt;
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
#[derive(Debug, StructOpt)]
#[structopt(name = "Milli CLI", about = "A simple CLI to manipulate a milli index.")]
struct Cli {
#[structopt(short, long)]
index_path: PathBuf,
#[structopt(short = "s", long, default_value = "100GiB")]
index_size: Byte,
/// Verbose mode (-v, -vv, -vvv, etc.)
#[structopt(short, long, parse(from_occurrences))]
verbose: usize,
#[structopt(subcommand)]
subcommand: Command,
}
#[derive(Debug, StructOpt)]
enum Command {
DocumentAddition(DocumentAddition),
Search(Search),
SettingsUpdate(SettingsUpdate),
}
fn setup(opt: &Cli) -> Result<()> {
color_eyre::install()?;
stderrlog::new()
.verbosity(opt.verbose)
.show_level(false)
.timestamp(stderrlog::Timestamp::Off)
.init()?;
Ok(())
}
fn main() -> Result<()> {
let command = Cli::from_args();
setup(&command)?;
let mut options = heed::EnvOpenOptions::new();
options.map_size(command.index_size.get_bytes() as usize);
let index = milli::Index::new(options, command.index_path)?;
match command.subcommand {
Command::DocumentAddition(addition) => addition.perform(index)?,
Command::Search(search) => search.perform(index)?,
Command::SettingsUpdate(update) => update.perform(index)?,
}
Ok(())
}
#[derive(Debug)]
enum DocumentAdditionFormat {
Csv,
Json,
Jsonl,
}
impl FromStr for DocumentAdditionFormat {
type Err = eyre::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"csv" => Ok(Self::Csv),
"jsonl" => Ok(Self::Jsonl),
"json" => Ok(Self::Json),
other => eyre::bail!("invalid format: {}", other),
}
}
}
#[derive(Debug, StructOpt)]
struct DocumentAddition {
#[structopt(short, long, default_value = "json", possible_values = &["csv", "jsonl", "json"])]
format: DocumentAdditionFormat,
/// Path to the update file, if not present, will read from stdin.
#[structopt(short, long)]
path: Option<PathBuf>,
/// Whether to generate missing document ids.
#[structopt(short, long)]
autogen_docids: bool,
/// Whether to update or replace the documents if they already exist.
#[structopt(short, long)]
update_documents: bool,
}
impl DocumentAddition {
fn perform(&self, index: milli::Index) -> Result<()> {
let reader: Box<dyn Read> = match self.path {
Some(ref path) => {
let file = File::open(path)?;
Box::new(file)
}
None => Box::new(stdin()),
};
println!("parsing documents...");
let documents = match self.format {
DocumentAdditionFormat::Csv => documents_from_csv(reader)?,
DocumentAdditionFormat::Json => documents_from_json(reader)?,
DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?,
};
let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?;
println!("Adding {} documents to the index.", reader.len());
let mut txn = index.env.write_txn()?;
let mut addition = milli::update::IndexDocuments::new(&mut txn, &index, 0);
if self.update_documents {
addition.index_documents_method(milli::update::IndexDocumentsMethod::UpdateDocuments);
}
addition.log_every_n(100);
if self.autogen_docids {
addition.enable_autogenerate_docids()
}
let mut bars = Vec::new();
let progesses = MultiProgress::new();
for _ in 0..4 {
let bar = ProgressBar::hidden();
let bar = progesses.add(bar);
bars.push(bar);
}
std::thread::spawn(move || {
progesses.join().unwrap();
});
let result = addition.execute(reader, |step, _| indexing_callback(step, &bars))?;
txn.commit()?;
println!("{:?}", result);
Ok(())
}
}
fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBar]) {
let step_index = step.step();
let bar = &bars[step_index];
if step_index > 0 {
let prev = &bars[step_index - 1];
if !prev.is_finished() {
prev.disable_steady_tick();
prev.finish_at_current_pos();
}
}
let style = ProgressStyle::default_bar()
.template("[eta: {eta_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}")
.progress_chars("##-");
match step {
RemapDocumentAddition { documents_seen } => {
bar.set_style(ProgressStyle::default_spinner());
bar.set_message(format!("remapped {} documents so far.", documents_seen));
}
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
bar.set_style(style);
bar.set_length(total_documents as u64);
bar.set_message("Merging documents...");
bar.set_position(documents_seen as u64);
}
IndexDocuments { documents_seen, total_documents } => {
bar.set_style(style);
bar.set_length(total_documents as u64);
bar.set_message("Indexing documents...");
bar.set_position(documents_seen as u64);
}
MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
bar.set_style(style);
bar.set_length(total_databases as u64);
bar.set_message("Merging databases...");
bar.set_position(databases_seen as u64);
}
}
bar.enable_steady_tick(200);
}
fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
let mut writer = Cursor::new(Vec::new());
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
let values = serde_json::Deserializer::from_reader(reader)
.into_iter::<serde_json::Map<String, serde_json::Value>>();
for document in values {
let document = document?;
documents.add_documents(document)?;
}
documents.finish()?;
Ok(writer.into_inner())
}
fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
let mut writer = Cursor::new(Vec::new());
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
let json: serde_json::Value = serde_json::from_reader(reader)?;
documents.add_documents(json)?;
documents.finish()?;
Ok(writer.into_inner())
}
fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> {
let mut writer = Cursor::new(Vec::new());
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
let mut records = csv::Reader::from_reader(reader);
let iter = records.deserialize::<Map<String, Value>>();
for doc in iter {
let doc = doc?;
documents.add_documents(doc)?;
}
documents.finish()?;
Ok(writer.into_inner())
}
#[derive(Debug, StructOpt)]
struct Search {
query: Option<String>,
#[structopt(short, long)]
filter: Option<String>,
#[structopt(short, long)]
offset: Option<usize>,
#[structopt(short, long)]
limit: Option<usize>,
}
impl Search {
fn perform(&self, index: milli::Index) -> Result<()> {
let txn = index.env.read_txn()?;
let mut search = index.search(&txn);
if let Some(ref query) = self.query {
search.query(query);
}
if let Some(ref filter) = self.filter {
let condition = milli::FilterCondition::from_str(&txn, &index, filter)?;
search.filter(condition);
}
if let Some(offset) = self.offset {
search.offset(offset);
}
if let Some(limit) = self.limit {
search.limit(limit);
}
let result = search.execute()?;
let fields_ids_map = index.fields_ids_map(&txn)?;
let displayed_fields =
index.displayed_fields_ids(&txn)?.unwrap_or_else(|| fields_ids_map.ids().collect());
let documents = index.documents(&txn, result.documents_ids)?;
let mut jsons = Vec::new();
for (_, obkv) in documents {
let json = milli::obkv_to_json(&displayed_fields, &fields_ids_map, obkv)?;
jsons.push(json);
}
let hits = serde_json::to_string_pretty(&jsons)?;
println!("{}", hits);
Ok(())
}
}
#[derive(Debug, StructOpt)]
struct SettingsUpdate {
#[structopt(short, long)]
filterable_attributes: Option<Vec<String>>,
}
impl SettingsUpdate {
fn perform(&self, index: milli::Index) -> Result<()> {
let mut txn = index.env.write_txn()?;
let mut update = milli::update::Settings::new(&mut txn, &index, 0);
update.log_every_n(100);
if let Some(ref filterable_attributes) = self.filterable_attributes {
if !filterable_attributes.is_empty() {
update.set_filterable_fields(filterable_attributes.iter().cloned().collect());
} else {
update.reset_filterable_fields();
}
}
let mut bars = Vec::new();
let progesses = MultiProgress::new();
for _ in 0..4 {
let bar = ProgressBar::hidden();
let bar = progesses.add(bar);
bars.push(bar);
}
std::thread::spawn(move || {
progesses.join().unwrap();
});
update.execute(|step, _| indexing_callback(step, &bars))?;
txn.commit()?;
Ok(())
}
}
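
The addition subcommand converts its input (CSV, JSON or JSONL) into milli's intermediary batch format before handing it to IndexDocuments::execute. As a minimal sketch of that conversion step, assuming it lives inside this cli crate next to the helpers above (the helper name inspect_jsonl_payload is hypothetical, everything else comes from this file):

fn inspect_jsonl_payload() -> eyre::Result<()> {
    // Two documents in JSONL form, as they would arrive on stdin or from a file.
    let payload = br#"{"id": 1, "name": "foo"}
{"id": 2, "name": "bar"}"#;
    // Convert the raw JSONL into the intermediary batch format.
    let batch = documents_from_jsonl(&payload[..])?;
    // Wrap the bytes in a reader, exactly like DocumentAddition::perform does.
    let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(batch))?;
    println!("Adding {} documents to the index.", reader.len());
    Ok(())
}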

http-ui/Cargo.toml

@ -37,6 +37,8 @@ fst = "0.4.5"
# Temporary fix for bitvec, remove once fixed. (https://github.com/bitvecto-rs/bitvec/issues/105)
funty = "=1.1"
bimap = "0.6.1"
csv = "1.1.6"
[dev-dependencies]
maplit = "1.0.2"

http-ui/src/main.rs

@ -3,6 +3,7 @@ mod update_store;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fmt::Display;
use std::fs::{create_dir_all, File};
use std::io::Cursor;
use std::net::SocketAddr;
use std::num::{NonZeroU32, NonZeroUsize};
use std::path::PathBuf;
@ -18,8 +19,9 @@ use flate2::read::GzDecoder;
use futures::{stream, FutureExt, StreamExt};
use heed::EnvOpenOptions;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use milli::documents::DocumentBatchReader;
use milli::update::UpdateIndexingStep::*;
use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat};
use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder};
use milli::{obkv_to_json, CompressionType, FilterCondition, Index, MatchingWords, SearchResult};
use once_cell::sync::OnceCell;
use rayon::ThreadPool;
@ -350,19 +352,12 @@ async fn main() -> anyhow::Result<()> {
let before_update = Instant::now();
// we extract the update type and execute the update itself.
let result: anyhow::Result<()> =
match meta {
(|| match meta {
UpdateMeta::DocumentsAddition { method, format, encoding } => {
// We must use the write transaction of the update here.
let mut wtxn = index_cloned.write_txn()?;
let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned);
match format.as_str() {
"csv" => builder.update_format(UpdateFormat::Csv),
"json" => builder.update_format(UpdateFormat::Json),
"json-stream" => builder.update_format(UpdateFormat::JsonStream),
otherwise => panic!("invalid update format {:?}", otherwise),
};
match method.as_str() {
"replace" => builder
.index_documents_method(IndexDocumentsMethod::ReplaceDocuments),
@ -377,11 +372,18 @@ async fn main() -> anyhow::Result<()> {
otherwise => panic!("invalid encoding format {:?}", otherwise),
};
let result = builder.execute(reader, |indexing_step, update_id| {
let documents = match format.as_str() {
"csv" => documents_from_csv(reader)?,
"json" => documents_from_json(reader)?,
"jsonl" => documents_from_jsonl(reader)?,
otherwise => panic!("invalid update format {:?}", otherwise),
};
let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?;
let result = builder.execute(documents, |indexing_step, update_id| {
let (current, total) = match indexing_step {
TransformFromUserIntoGenericFormat { documents_seen } => {
(documents_seen, None)
}
RemapDocumentAddition { documents_seen } => (documents_seen, None),
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
(documents_seen, Some(total_documents))
}
@ -482,9 +484,7 @@ async fn main() -> anyhow::Result<()> {
let result = builder.execute(|indexing_step, update_id| {
let (current, total) = match indexing_step {
TransformFromUserIntoGenericFormat { documents_seen } => {
(documents_seen, None)
}
RemapDocumentAddition { documents_seen } => (documents_seen, None),
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
(documents_seen, Some(total_documents))
}
@ -526,7 +526,7 @@ async fn main() -> anyhow::Result<()> {
Err(e) => Err(e.into()),
}
}
};
})();
let meta = match result {
Ok(()) => {
@ -842,7 +842,7 @@ async fn main() -> anyhow::Result<()> {
UpdateStatus<UpdateMeta, UpdateMetaProgress, String>,
>,
update_method: Option<String>,
update_format: UpdateFormat,
format: String,
encoding: Option<String>,
mut stream: impl futures::Stream<Item = Result<impl bytes::Buf, warp::Error>> + Unpin,
) -> Result<impl warp::Reply, warp::Rejection> {
@ -863,13 +863,6 @@ async fn main() -> anyhow::Result<()> {
_ => String::from("replace"),
};
let format = match update_format {
UpdateFormat::Csv => String::from("csv"),
UpdateFormat::Json => String::from("json"),
UpdateFormat::JsonStream => String::from("json-stream"),
_ => panic!("Unknown update format"),
};
let meta = UpdateMeta::DocumentsAddition { method, format, encoding };
let update_id = update_store.register_update(&meta, &mmap[..]).unwrap();
let _ = update_status_sender.send(UpdateStatus::Pending { update_id, meta });
@ -893,9 +886,9 @@ async fn main() -> anyhow::Result<()> {
.and(warp::body::stream())
.and_then(move |content_type: String, content_encoding, params: QueryUpdate, stream| {
let format = match content_type.as_str() {
"text/csv" => UpdateFormat::Csv,
"application/json" => UpdateFormat::Json,
"application/x-ndjson" => UpdateFormat::JsonStream,
"text/csv" => "csv",
"application/json" => "json",
"application/x-ndjson" => "jsonl",
otherwise => panic!("invalid update format: {}", otherwise),
};
@ -903,7 +896,7 @@ async fn main() -> anyhow::Result<()> {
update_store_cloned.clone(),
update_status_sender_cloned.clone(),
params.method,
format,
format.to_string(),
content_encoding,
stream,
)
@ -1031,6 +1024,49 @@ async fn main() -> anyhow::Result<()> {
Ok(())
}
fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
let mut writer = Cursor::new(Vec::new());
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
let values = serde_json::Deserializer::from_reader(reader)
.into_iter::<serde_json::Map<String, serde_json::Value>>();
for document in values {
let document = document?;
documents.add_documents(document)?;
}
documents.finish()?;
Ok(writer.into_inner())
}
fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
let mut writer = Cursor::new(Vec::new());
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
let json: serde_json::Value = serde_json::from_reader(reader)?;
documents.add_documents(json)?;
documents.finish()?;
Ok(writer.into_inner())
}
fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
let mut writer = Cursor::new(Vec::new());
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
let mut records = csv::Reader::from_reader(reader);
let iter = records.deserialize::<Map<String, Value>>();
for doc in iter {
let doc = doc?;
documents.add_documents(doc)?;
}
documents.finish()?;
Ok(writer.into_inner())
}
#[cfg(test)]
mod tests {
use maplit::{btreeset, hashmap, hashset};

milli/Cargo.toml

@ -5,12 +5,13 @@ authors = ["Kerollmops <clement@meilisearch.com>"]
edition = "2018"
[dependencies]
bimap = { version = "0.6.1", features = ["serde"] }
bincode = "1.3.3"
bstr = "0.2.15"
byteorder = "1.4.2"
chrono = { version = "0.4.19", features = ["serde"] }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.1"
csv = "1.1.5"
either = "1.6.1"
flate2 = "1.0.20"
fst = "0.4.5"

milli/src/documents/builder.rs (new file)

@ -0,0 +1,80 @@
use std::io;
use byteorder::{BigEndian, WriteBytesExt};
use serde::ser::Serialize;
use super::serde::DocumentSerializer;
use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
/// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
/// format used by milli.
///
/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
/// iterate over the documents.
///
/// ## example:
/// ```
/// use milli::documents::DocumentBatchBuilder;
/// use serde_json::json;
/// use std::io::Cursor;
///
/// let mut writer = Cursor::new(Vec::new());
/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap();
/// builder.finish().unwrap();
/// ```
pub struct DocumentBatchBuilder<W> {
serializer: DocumentSerializer<W>,
}
impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
pub fn new(writer: W) -> Result<Self, Error> {
let index = DocumentsBatchIndex::new();
let mut writer = ByteCounter::new(writer);
// add space to write the offset of the metadata at the end of the writer
writer.write_u64::<BigEndian>(0)?;
let serializer =
DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true };
Ok(Self { serializer })
}
/// Returns the number of documents that have been written to the builder.
pub fn len(&self) -> usize {
self.serializer.count
}
/// This method must be called once all the documents have been added. It will put the
/// metadata at the end of the file, and write the metadata offset at the beginning of the
/// file.
pub fn finish(self) -> Result<(), Error> {
let DocumentSerializer {
writer: ByteCounter { mut writer, count: offset },
index,
count,
..
} = self.serializer;
let meta = DocumentsMetadata { count, index };
bincode::serialize_into(&mut writer, &meta)?;
writer.seek(io::SeekFrom::Start(0))?;
writer.write_u64::<BigEndian>(offset as u64)?;
writer.flush()?;
Ok(())
}
/// Adds documents to the builder.
///
/// The internal index is updated with the fields found
/// in the documents. A document must either be a map or a sequence of maps; anything else
/// will fail.
pub fn add_documents<T: Serialize>(&mut self, document: T) -> Result<(), Error> {
document.serialize(&mut self.serializer)?;
Ok(())
}
}
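
To make the file layout concrete, here is a small round-trip sketch; it is not part of the diff and relies only on the DocumentBatchBuilder and DocumentBatchReader APIs introduced in this commit:

use std::io::Cursor;

use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
use serde_json::json;

fn roundtrip() -> Result<(), Box<dyn std::error::Error>> {
    // The builder reserves a u64 at the start of the writer for the metadata offset,
    // then appends each document; finish() serializes the metadata at the end and
    // patches the offset back at the start.
    let mut writer = Cursor::new(Vec::new());
    let mut builder = DocumentBatchBuilder::new(&mut writer)?;
    builder.add_documents(json!([{ "id": 1, "name": "foo" }, { "id": 2, "name": "bar" }]))?;
    builder.finish()?;

    // The same bytes can then be read back from the beginning.
    writer.set_position(0);
    let mut reader = DocumentBatchReader::from_reader(writer)?;
    assert_eq!(reader.len(), 2);
    Ok(())
}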

milli/src/documents/mod.rs (new file, 233 lines)

@ -0,0 +1,233 @@
mod builder;
/// The documents module defines an intermediary document format that milli uses for indexation, and
/// provides an API to easily build and read such documents.
///
/// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can
/// later be read by milli using the `DocumentBatchReader` interface.
mod reader;
mod serde;
use std::{fmt, io};
use ::serde::{Deserialize, Serialize};
use bimap::BiHashMap;
pub use builder::DocumentBatchBuilder;
pub use reader::DocumentBatchReader;
use crate::FieldId;
/// A bidirectional map that links field ids to their name in a document batch.
pub type DocumentsBatchIndex = BiHashMap<FieldId, String>;
#[derive(Debug, Serialize, Deserialize)]
struct DocumentsMetadata {
count: usize,
index: DocumentsBatchIndex,
}
pub struct ByteCounter<W> {
count: usize,
writer: W,
}
impl<W> ByteCounter<W> {
fn new(writer: W) -> Self {
Self { count: 0, writer }
}
}
impl<W: io::Write> io::Write for ByteCounter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
let count = self.writer.write(buf)?;
self.count += count;
Ok(count)
}
fn flush(&mut self) -> io::Result<()> {
self.writer.flush()
}
}
#[derive(Debug)]
pub enum Error {
InvalidDocumentFormat,
Custom(String),
JsonError(serde_json::Error),
Serialize(bincode::Error),
Io(io::Error),
DocumentTooLarge,
}
impl From<io::Error> for Error {
fn from(other: io::Error) -> Self {
Self::Io(other)
}
}
impl From<bincode::Error> for Error {
fn from(other: bincode::Error) -> Self {
Self::Serialize(other)
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
Error::Io(e) => e.fmt(f),
Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"),
Error::Serialize(e) => e.fmt(f),
}
}
}
impl std::error::Error for Error {}
/// Macro used to generate documents, with the same syntax as `serde_json::json`
#[cfg(test)]
macro_rules! documents {
($data:tt) => {{
let documents = serde_json::json!($data);
let mut writer = std::io::Cursor::new(Vec::new());
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
builder.add_documents(documents).unwrap();
builder.finish().unwrap();
writer.set_position(0);
crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
}};
}
#[cfg(test)]
mod test {
use serde_json::{json, Value};
use super::*;
#[test]
fn create_documents_no_errors() {
let json = json!({
"number": 1,
"string": "this is a field",
"array": ["an", "array"],
"object": {
"key": "value",
},
"bool": true
});
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.add_documents(json).unwrap();
builder.finish().unwrap();
let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
assert_eq!(documents.index().iter().count(), 5);
let reader = documents.next_document_with_index().unwrap().unwrap();
assert_eq!(reader.1.iter().count(), 5);
assert!(documents.next_document_with_index().unwrap().is_none());
}
#[test]
fn test_add_multiple_documents() {
let doc1 = json!({
"bool": true,
});
let doc2 = json!({
"toto": false,
});
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.add_documents(doc1).unwrap();
builder.add_documents(doc2).unwrap();
builder.finish().unwrap();
let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
assert_eq!(documents.index().iter().count(), 2);
let reader = documents.next_document_with_index().unwrap().unwrap();
assert_eq!(reader.1.iter().count(), 1);
assert!(documents.next_document_with_index().unwrap().is_some());
assert!(documents.next_document_with_index().unwrap().is_none());
}
#[test]
fn add_documents_array() {
let docs = json!([
{ "toto": false },
{ "tata": "hello" },
]);
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.add_documents(docs).unwrap();
builder.finish().unwrap();
let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
assert_eq!(documents.index().iter().count(), 2);
let reader = documents.next_document_with_index().unwrap().unwrap();
assert_eq!(reader.1.iter().count(), 1);
assert!(documents.next_document_with_index().unwrap().is_some());
assert!(documents.next_document_with_index().unwrap().is_none());
}
#[test]
fn add_invalid_document_format() {
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let docs = json!([[
{ "toto": false },
{ "tata": "hello" },
]]);
assert!(builder.add_documents(docs).is_err());
let docs = json!("hello");
assert!(builder.add_documents(docs).is_err());
}
#[test]
fn test_nested() {
let mut docs = documents!([{
"hello": {
"toto": ["hello"]
}
}]);
let (_index, doc) = docs.next_document_with_index().unwrap().unwrap();
let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
assert_eq!(nested, json!({ "toto": ["hello"] }));
}
}
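
The tests above already exercise this, but as a short, hypothetical snippet showing the macro's ergonomics inside the crate's tests (same syntax as serde_json::json!, returning a DocumentBatchReader directly):

let mut reader = documents!([
    { "id": 1, "name": "foo" },
    { "id": 2, "name": "bar" }
]);
assert_eq!(reader.len(), 2);
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.iter().count(), 2); // the two field names, "id" and "name"
assert_eq!(doc.iter().count(), 2);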

milli/src/documents/reader.rs (new file)

@ -0,0 +1,75 @@
use std::io;
use std::io::{BufReader, Read};
use std::mem::size_of;
use byteorder::{BigEndian, ReadBytesExt};
use obkv::KvReader;
use super::{DocumentsBatchIndex, DocumentsMetadata, Error};
use crate::FieldId;
/// The `DocumentBatchReader` provides a way to iterate over documents that have been created with
/// a `DocumentBatchBuilder`.
///
/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
/// `FieldId`. The mapping between field ids and field names is provided by the index.
pub struct DocumentBatchReader<R> {
reader: BufReader<R>,
metadata: DocumentsMetadata,
buffer: Vec<u8>,
seen_documents: usize,
}
impl<R: io::Read + io::Seek> DocumentBatchReader<R> {
/// Constructs a `DocumentBatchReader` from a reader.
///
/// It first retrieves the index, then moves to the first document. Subsequent calls to
/// `next_document_with_index` advance the document reader until all the documents have been read.
pub fn from_reader(mut reader: R) -> Result<Self, Error> {
let mut buffer = Vec::new();
let meta_offset = reader.read_u64::<BigEndian>()?;
reader.seek(io::SeekFrom::Start(meta_offset))?;
reader.read_to_end(&mut buffer)?;
let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?;
reader.seek(io::SeekFrom::Start(size_of::<u64>() as u64))?;
buffer.clear();
let reader = BufReader::new(reader);
Ok(Self { reader, metadata, buffer, seen_documents: 0 })
}
/// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a
/// reference to the addition index.
pub fn next_document_with_index<'a>(
&'a mut self,
) -> io::Result<Option<(&'a DocumentsBatchIndex, KvReader<'a, FieldId>)>> {
if self.seen_documents < self.metadata.count {
let doc_len = self.reader.read_u32::<BigEndian>()?;
self.buffer.resize(doc_len as usize, 0);
self.reader.read_exact(&mut self.buffer)?;
self.seen_documents += 1;
let reader = KvReader::new(&self.buffer);
Ok(Some((&self.metadata.index, reader)))
} else {
Ok(None)
}
}
/// Return the fields index for the documents batch.
pub fn index(&self) -> &DocumentsBatchIndex {
&self.metadata.index
}
/// Returns the number of documents in the reader.
pub fn len(&self) -> usize {
self.metadata.count
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
}
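
A hedged sketch of consuming a batch end to end; it uses only the APIs above plus get_by_left, which DocumentsBatchIndex inherits from bimap's BiHashMap. Each document is rebuilt as a serde_json::Map by resolving field ids through the index:

use std::io::Cursor;

use milli::documents::DocumentBatchReader;
use serde_json::{Map, Value};

// `bytes` is assumed to hold a batch produced by a DocumentBatchBuilder.
fn decode(bytes: Vec<u8>) -> Result<Vec<Map<String, Value>>, Box<dyn std::error::Error>> {
    let mut reader = DocumentBatchReader::from_reader(Cursor::new(bytes))?;
    let mut out = Vec::with_capacity(reader.len());
    while let Some((index, document)) = reader.next_document_with_index()? {
        let mut map = Map::new();
        for (field_id, value) in document.iter() {
            // Field ids are resolved back to names through the batch index;
            // the values themselves are stored as raw JSON bytes.
            let name = index.get_by_left(&field_id).expect("field id is in the index");
            map.insert(name.clone(), serde_json::from_slice(value)?);
        }
        out.push(map);
    }
    Ok(out)
}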

milli/src/documents/serde.rs (new file)

@ -0,0 +1,465 @@
use std::convert::TryInto;
use std::{fmt, io};
use byteorder::{BigEndian, WriteBytesExt};
use obkv::KvWriter;
use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer};
use super::{ByteCounter, DocumentsBatchIndex, Error};
use crate::FieldId;
pub struct DocumentSerializer<W> {
pub writer: ByteCounter<W>,
pub buffer: Vec<u8>,
pub index: DocumentsBatchIndex,
pub count: usize,
pub allow_seq: bool,
}
impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer<W> {
type Ok = ();
type Error = Error;
type SerializeSeq = SeqSerializer<'a, W>;
type SerializeTuple = Impossible<(), Self::Error>;
type SerializeTupleStruct = Impossible<(), Self::Error>;
type SerializeTupleVariant = Impossible<(), Self::Error>;
type SerializeMap = MapSerializer<'a, &'a mut ByteCounter<W>>;
type SerializeStruct = Impossible<(), Self::Error>;
type SerializeStructVariant = Impossible<(), Self::Error>;
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
self.buffer.clear();
let cursor = io::Cursor::new(&mut self.buffer);
self.count += 1;
let map_serializer = MapSerializer {
map: KvWriter::new(cursor),
index: &mut self.index,
writer: &mut self.writer,
buffer: Vec::new(),
};
Ok(map_serializer)
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
if self.allow_seq {
// Only allow sequence of documents of depth 1.
self.allow_seq = false;
Ok(SeqSerializer { serializer: self })
} else {
Err(Error::InvalidDocumentFormat)
}
}
fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
Err(Error::InvalidDocumentFormat)
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
_value: &T,
) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
Err(Error::InvalidDocumentFormat)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T,
) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
Err(Error::InvalidDocumentFormat)
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
}
pub struct SeqSerializer<'a, W> {
serializer: &'a mut DocumentSerializer<W>,
}
impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> {
type Ok = ();
type Error = Error;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
value.serialize(&mut *self.serializer)?;
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct MapSerializer<'a, W> {
map: KvWriter<io::Cursor<&'a mut Vec<u8>>, FieldId>,
index: &'a mut DocumentsBatchIndex,
writer: W,
buffer: Vec<u8>,
}
/// This implementation of SerializeMap uses serialize_entry instead of serialize_key and
/// serialize_value, therefore these two methods remain unimplemented.
impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> {
type Ok = ();
type Error = Error;
fn serialize_key<T: ?Sized + Serialize>(&mut self, _key: &T) -> Result<(), Self::Error> {
unreachable!()
}
fn serialize_value<T: ?Sized>(&mut self, _value: &T) -> Result<(), Self::Error> {
unreachable!()
}
fn end(mut self) -> Result<Self::Ok, Self::Error> {
let data = self.map.into_inner().map_err(Error::Io)?.into_inner();
let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?;
self.writer.write_u32::<BigEndian>(data_len).map_err(Error::Io)?;
self.writer.write_all(&data).map_err(Error::Io)?;
Ok(())
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V,
) -> Result<(), Self::Error>
where
K: Serialize,
V: Serialize,
{
let field_serializer = FieldSerializer { index: &mut self.index };
let field_id: FieldId = key.serialize(field_serializer)?;
self.buffer.clear();
let mut cursor = io::Cursor::new(&mut self.buffer);
serde_json::to_writer(&mut cursor, value).map_err(Error::JsonError)?;
self.map.insert(field_id, cursor.into_inner()).map_err(Error::Io)?;
Ok(())
}
}
struct FieldSerializer<'a> {
index: &'a mut DocumentsBatchIndex,
}
impl<'a> serde::Serializer for FieldSerializer<'a> {
type Ok = FieldId;
type Error = Error;
type SerializeSeq = Impossible<FieldId, Self::Error>;
type SerializeTuple = Impossible<FieldId, Self::Error>;
type SerializeTupleStruct = Impossible<FieldId, Self::Error>;
type SerializeTupleVariant = Impossible<FieldId, Self::Error>;
type SerializeMap = Impossible<FieldId, Self::Error>;
type SerializeStruct = Impossible<FieldId, Self::Error>;
type SerializeStructVariant = Impossible<FieldId, Self::Error>;
fn serialize_str(self, ws: &str) -> Result<Self::Ok, Self::Error> {
let field_id = match self.index.get_by_right(ws) {
Some(field_id) => *field_id,
None => {
let field_id = self.index.len() as FieldId;
self.index.insert(field_id, ws.to_string());
field_id
}
};
Ok(field_id)
}
fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
Err(Error::InvalidDocumentFormat)
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
_value: &T,
) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
Err(Error::InvalidDocumentFormat)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T,
) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
Err(Error::InvalidDocumentFormat)
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> {
Err(Error::InvalidDocumentFormat)
}
}
impl serde::ser::Error for Error {
fn custom<T: fmt::Display>(msg: T) -> Self {
Error::Custom(msg.to_string())
}
}

milli/src/error.rs

@ -55,7 +55,6 @@ pub enum FieldIdMapMissingEntry {
#[derive(Debug)]
pub enum UserError {
AttributeLimitReached,
Csv(csv::Error),
DocumentLimitReached,
InvalidAscDescSyntax { name: String },
InvalidDocumentId { document_id: Value },
@ -212,7 +211,6 @@ impl fmt::Display for UserError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"),
Self::Csv(error) => error.fmt(f),
Self::DocumentLimitReached => f.write_str("maximum number of documents reached"),
Self::InvalidFacetsDistribution { invalid_facets_name } => {
let name_list =

milli/src/index.rs

@ -868,7 +868,7 @@ pub(crate) mod tests {
use maplit::btreemap;
use tempfile::TempDir;
use crate::update::{IndexDocuments, UpdateFormat};
use crate::update::IndexDocuments;
use crate::Index;
pub(crate) struct TempIndex {
@ -904,13 +904,12 @@ pub(crate) mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "id": 1, "name": "kevin" },
{ "id": 2, "name": "bob", "age": 20 },
{ "id": 2, "name": "bob", "age": 20 }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -929,8 +928,12 @@ pub(crate) mod tests {
// we add all the documents a second time. we are supposed to get the same
// field_distribution in the end
let mut wtxn = index.write_txn().unwrap();
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
let content = documents!([
{ "id": 1, "name": "kevin" },
{ "id": 2, "name": "bob", "age": 20 },
{ "id": 2, "name": "bob", "age": 20 }
]);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -947,13 +950,12 @@ pub(crate) mod tests {
);
// then we update a document by removing one field and another by adding one field
let content = &br#"[
let content = documents!([
{ "id": 1, "name": "kevin", "has_dog": true },
{ "id": 2, "name": "bob" }
]"#[..];
]);
let mut wtxn = index.write_txn().unwrap();
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();

milli/src/lib.rs

@ -1,6 +1,9 @@
#[macro_use]
extern crate pest_derive;
#[macro_use]
pub mod documents;
mod criterion;
mod error;
mod external_documents_ids;

milli/src/search/distinct/mod.rs

@ -27,6 +27,7 @@ pub trait Distinct {
#[cfg(test)]
mod test {
use std::collections::HashSet;
use std::io::Cursor;
use once_cell::sync::Lazy;
use rand::seq::SliceRandom;
@ -34,19 +35,20 @@ mod test {
use roaring::RoaringBitmap;
use serde_json::{json, Value};
use crate::documents::{DocumentBatchBuilder, DocumentBatchReader};
use crate::index::tests::TempIndex;
use crate::index::Index;
use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
use crate::update::{IndexDocumentsMethod, UpdateBuilder};
use crate::{DocumentId, FieldId, BEU32};
static JSON: Lazy<Value> = Lazy::new(generate_json);
static JSON: Lazy<Vec<u8>> = Lazy::new(generate_documents);
fn generate_json() -> Value {
fn generate_documents() -> Vec<u8> {
let mut rng = rand::thread_rng();
let num_docs = rng.gen_range(10..30);
let mut documents = Vec::new();
let mut cursor = Cursor::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let txts = ["Toto", "Titi", "Tata"];
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
let cat_ints = (1..10).collect::<Vec<_>>();
@ -66,10 +68,11 @@ mod test {
"txts": sample_txts[..(rng.gen_range(0..3))],
"cat-ints": sample_ints[..(rng.gen_range(0..3))],
});
documents.push(doc);
builder.add_documents(doc).unwrap();
}
Value::Array(documents)
builder.finish().unwrap();
cursor.into_inner()
}
/// Returns a temporary index populated with random test documents, the FieldId for the
@ -89,13 +92,15 @@ mod test {
let mut addition = builder.index_documents(&mut txn, &index);
addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
addition.update_format(UpdateFormat::Json);
addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap();
let reader =
crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
addition.execute(reader, |_, _| ()).unwrap();
let fields_map = index.fields_ids_map(&txn).unwrap();
let fid = fields_map.id(&distinct).unwrap();
let map = (0..JSON.as_array().unwrap().len() as u32).collect();
let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
let map = (0..documents.len() as u32).collect();
txn.commit().unwrap();

milli/src/update/clear_documents.rs

@ -82,7 +82,7 @@ mod tests {
use heed::EnvOpenOptions;
use super::*;
use crate::update::{IndexDocuments, UpdateFormat};
use crate::update::IndexDocuments;
#[test]
fn clear_documents() {
@ -92,14 +92,12 @@ mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "id": 0, "name": "kevin", "age": 20 },
{ "id": 1, "name": "kevina" },
{ "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
]);
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
// Clear all documents from the database.
let builder = ClearDocuments::new(&mut wtxn, &index, 1);

milli/src/update/delete_documents.rs

@ -567,7 +567,7 @@ mod tests {
use maplit::hashset;
use super::*;
use crate::update::{IndexDocuments, Settings, UpdateFormat};
use crate::update::{IndexDocuments, Settings};
use crate::FilterCondition;
#[test]
@ -578,13 +578,12 @@ mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
// delete those documents, ids are synchronous therefore 0, 1, and 2.
@ -609,13 +608,12 @@ mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "mysuperid": 0, "name": "kevin" },
{ "mysuperid": 1, "name": "kevina" },
{ "mysuperid": 2, "name": "benoit" }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
// Delete not all of the documents but some of them.
@ -640,7 +638,7 @@ mod tests {
builder.set_filterable_fields(hashset! { S("label") });
builder.execute(|_, _| ()).unwrap();
let content = &br#"[
let content = documents!([
{"docid":"1_4","label":"sign"},
{"docid":"1_5","label":"letter"},
{"docid":"1_7","label":"abstract,cartoon,design,pattern"},
@ -661,9 +659,8 @@ mod tests {
{"docid":"1_58","label":"abstract,art,cartoon"},
{"docid":"1_68","label":"design"},
{"docid":"1_69","label":"geometry"}
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
// Delete not all of the documents but some of them.
@ -692,7 +689,7 @@ mod tests {
builder.set_sortable_fields(hashset!(S("_geo")));
builder.execute(|_, _| ()).unwrap();
let content = &r#"[
let content = documents!([
{"id":"1","city":"Lille", "_geo": { "lat": 50.629973371633746, "lng": 3.0569447399419570 } },
{"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.641586120121050, "lng": 3.1106593480348670 } },
{"id":"3","city":"Hellemmes", "_geo": { "lat": 50.631220965518080, "lng": 3.1106399673339933 } },
@ -713,12 +710,10 @@ mod tests {
{"id":"18","city":"Amiens", "_geo": { "lat": 49.931472529669996, "lng": 2.2710499758317080 } },
{"id":"19","city":"Compiègne", "_geo": { "lat": 49.444980887725656, "lng": 2.7913841281529015 } },
{"id":"20","city":"Paris", "_geo": { "lat": 48.902100060895480, "lng": 2.3708400867406930 } }
]"#[..];
]);
let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
let external_document_ids = index.external_documents_ids(&wtxn).unwrap();
let ids_to_delete: Vec<u32> = external_ids_to_delete

milli/src/update/index_documents/mod.rs

@ -4,7 +4,7 @@ mod transform;
mod typed_chunk;
use std::collections::HashSet;
use std::io::{self, BufRead, BufReader};
use std::io::{Read, Seek};
use std::iter::FromIterator;
use std::num::{NonZeroU32, NonZeroUsize};
use std::time::Instant;
@ -24,6 +24,7 @@ pub use self::helpers::{
};
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use crate::documents::DocumentBatchReader;
use crate::update::{
Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
WordsLevelPositions, WordsPrefixesFst,
@ -57,17 +58,6 @@ pub enum WriteMethod {
GetMergePut,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum UpdateFormat {
/// The given update is a real **comma seperated** CSV with headers on the first line.
Csv,
/// The given update is a JSON array with documents inside.
Json,
/// The given update is a JSON stream with a document on each line.
JsonStream,
}
pub struct IndexDocuments<'t, 'u, 'i, 'a> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
@ -85,7 +75,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
words_positions_level_group_size: Option<NonZeroU32>,
words_positions_min_level_size: Option<NonZeroU32>,
update_method: IndexDocumentsMethod,
update_format: UpdateFormat,
autogenerate_docids: bool,
update_id: u64,
}
@ -113,18 +102,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
words_positions_level_group_size: None,
words_positions_min_level_size: None,
update_method: IndexDocumentsMethod::ReplaceDocuments,
update_format: UpdateFormat::Json,
autogenerate_docids: false,
update_id,
}
}
pub fn index_documents_method(&mut self, method: IndexDocumentsMethod) {
self.update_method = method;
pub fn log_every_n(&mut self, n: usize) {
self.log_every_n = Some(n);
}
pub fn update_format(&mut self, format: UpdateFormat) {
self.update_format = format;
pub fn index_documents_method(&mut self, method: IndexDocumentsMethod) {
self.update_method = method;
}
pub fn enable_autogenerate_docids(&mut self) {
@ -136,16 +124,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
}
#[logging_timer::time("IndexDocuments::{}")]
pub fn execute<R, F>(self, reader: R, progress_callback: F) -> Result<DocumentAdditionResult>
pub fn execute<R, F>(
self,
reader: DocumentBatchReader<R>,
progress_callback: F,
) -> Result<DocumentAdditionResult>
where
R: io::Read,
R: Read + Seek,
F: Fn(UpdateIndexingStep, u64) + Sync,
{
let mut reader = BufReader::new(reader);
reader.fill_buf()?;
// Early return when there is no document to add
if reader.buffer().is_empty() {
if reader.is_empty() {
return Ok(DocumentAdditionResult { nb_documents: 0 });
}
@ -165,14 +154,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
autogenerate_docids: self.autogenerate_docids,
};
let output = match self.update_format {
UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?,
UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?,
UpdateFormat::JsonStream => {
transform.output_from_json_stream(reader, &progress_callback)?
}
};
let output = transform.read_documents(reader, progress_callback)?;
let nb_documents = output.documents_count;
info!("Update transformed in {:.02?}", before_transform.elapsed());
@ -462,6 +444,7 @@ mod tests {
use heed::EnvOpenOptions;
use super::*;
use crate::documents::DocumentBatchBuilder;
use crate::update::DeleteDocuments;
use crate::HashMap;
@ -474,9 +457,12 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
let content = documents!([
{ "id": 1, "name": "kevin" },
{ "id": 2, "name": "kevina" },
{ "id": 3, "name": "benoit" }
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -488,9 +474,8 @@ mod tests {
// Second we send 1 document with id 1, to erase the previous ones.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n1,updated kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Csv);
let content = documents!([ { "id": 1, "name": "updated kevin" } ]);
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -502,9 +487,12 @@ mod tests {
// Third we send 3 documents again to replace the existing ones.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 2);
builder.update_format(UpdateFormat::Csv);
let content = documents!([
{ "id": 1, "name": "updated second kevin" },
{ "id": 2, "name": "updated kevina" },
{ "id": 3, "name": "updated benoit" }
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 2);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -525,9 +513,12 @@ mod tests {
// First we send 3 documents with duplicate ids and
// change the index method to merge documents.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n1,kevin\n1,kevina\n1,benoit\n"[..];
let content = documents!([
{ "id": 1, "name": "kevin" },
{ "id": 1, "name": "kevina" },
{ "id": 1, "name": "benoit" }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -552,9 +543,8 @@ mod tests {
// Second we send 1 document with id 1, to force it to be merged with the previous one.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,age\n1,25\n"[..];
let content = documents!([ { "id": 1, "age": 25 } ]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -574,13 +564,13 @@ mod tests {
let mut doc_iter = doc.iter();
assert_eq!(doc_iter.next(), Some((0, &br#""1""#[..])));
assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
assert_eq!(doc_iter.next(), Some((2, &br#""25""#[..])));
assert_eq!(doc_iter.next(), Some((2, &br#"25"#[..])));
assert_eq!(doc_iter.next(), None);
drop(rtxn);
}
#[test]
fn not_auto_generated_csv_documents_ids() {
fn not_auto_generated_documents_ids() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
@ -588,35 +578,12 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
assert!(builder.execute(content, |_, _| ()).is_err());
wtxn.commit().unwrap();
// Check that there is no document.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 0);
drop(rtxn);
}
#[test]
fn not_auto_generated_json_documents_ids() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// First we send 3 documents and 2 without ids.
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
{ "name": "kevina", "id": 21 },
let content = documents!([
{ "name": "kevin" },
{ "name": "kevina" },
{ "name": "benoit" }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
assert!(builder.execute(content, |_, _| ()).is_err());
wtxn.commit().unwrap();
@ -636,10 +603,13 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
let content = documents!([
{ "name": "kevin" },
{ "name": "kevina" },
{ "name": "benoit" }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -655,10 +625,9 @@ mod tests {
// Second we send 1 document with the generated uuid, to erase the previous ones.
let mut wtxn = index.write_txn().unwrap();
let content = format!("id,name\n{},updated kevin", kevin_uuid);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Csv);
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]);
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is **always** 3 documents.
@ -689,9 +658,12 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
let content = documents!([
{ "id": 1, "name": "kevin" },
{ "id": 2, "name": "kevina" },
{ "id": 3, "name": "benoit" }
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -703,9 +675,9 @@ mod tests {
// Second we send 1 document without specifying the id.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name\nnew kevin"[..];
let content = documents!([ { "name": "new kevin" } ]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Csv);
builder.enable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -717,7 +689,7 @@ mod tests {
}
#[test]
fn empty_csv_update() {
fn empty_update() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
@ -725,9 +697,8 @@ mod tests {
// First we send 0 documents.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
let content = documents!([]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -738,83 +709,6 @@ mod tests {
drop(rtxn);
}
#[test]
fn json_documents() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// First we send 3 documents with an id for only one of them.
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
{ "name": "kevin" },
{ "name": "kevina", "id": 21 },
{ "name": "benoit" }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Check that there are 3 documents now.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
drop(rtxn);
}
#[test]
fn empty_json_update() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// First we send 0 documents.
let mut wtxn = index.write_txn().unwrap();
let content = &b"[]"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Check that there are no documents.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 0);
drop(rtxn);
}
#[test]
fn json_stream_documents() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// First we send 3 documents with an id for only one of them.
let mut wtxn = index.write_txn().unwrap();
let content = &br#"
{ "name": "kevin" }
{ "name": "kevina", "id": 21 }
{ "name": "benoit" }
"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::JsonStream);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Check that there are 3 documents now.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
drop(rtxn);
}
#[test]
fn invalid_documents_ids() {
let path = tempfile::tempdir().unwrap();
@ -825,18 +719,16 @@ mod tests {
// First we send 1 document with an invalid id.
let mut wtxn = index.write_txn().unwrap();
// There is a space in the document id.
let content = &b"id,name\nbrume bleue,kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
assert!(builder.execute(content, |_, _| ()).is_err());
wtxn.commit().unwrap();
// Then we send 1 document with a valid id.
let mut wtxn = index.write_txn().unwrap();
// This time the document id contains no space.
let content = &b"id,name\n32,kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Csv);
let content = documents!([ { "id": 32, "name": "kevin" } ]);
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -848,7 +740,7 @@ mod tests {
}
#[test]
fn complex_json_documents() {
fn complex_documents() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
@ -856,13 +748,12 @@ mod tests {
// First we send 3 documents, each with an explicit id.
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -893,33 +784,31 @@ mod tests {
// First we send a batch of documents, each with an id.
let mut wtxn = index.write_txn().unwrap();
let documents = &r#"[
let documents = documents!([
{ "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } },
{ "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 },
{ "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 },
{ "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" },
{ "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" },
{ "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } }
]"#[..];
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
builder.execute(Cursor::new(documents), |_, _| ()).unwrap();
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();
let mut wtxn = index.write_txn().unwrap();
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
let documents = &r#"[
let documents = documents!([
{
"id": 2,
"author": "J. Austen",
"date": "1813"
}
]"#[..];
]);
builder.execute(Cursor::new(documents), |_, _| ()).unwrap();
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();
}
@ -931,15 +820,13 @@ mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" },
{ "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" },
{ "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" },
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
]);
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));
@ -951,22 +838,18 @@ mod tests {
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
assert!(external_documents_ids.get("30").is_none());
let content = &br#"[
let content = documents!([
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
]);
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
assert!(external_documents_ids.get("30").is_some());
let content = &br#"[
let content = documents!([
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
]);
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
}
@ -987,12 +870,16 @@ mod tests {
big_object.insert(key, "I am a text!");
}
let content = vec![big_object];
let content = serde_json::to_string(&content).unwrap();
let mut cursor = Cursor::new(Vec::new());
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(Cursor::new(content), |_, _| ()).unwrap();
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.add_documents(big_object).unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let content = DocumentBatchReader::from_reader(cursor).unwrap();
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
}
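// Editorial note, not part of this change: the `documents!` macro used throughout these
// tests is assumed to wrap the same manual flow shown in the test above (a
// `DocumentBatchBuilder` writing into an in-memory cursor that is rewound and handed to
// `DocumentBatchReader::from_reader`). A hedged sketch of an equivalent helper, reusing
// only calls that appear in this diff; the helper name and the exact return type are
// hypothetical, and the imports are assumed to come from the surrounding test module.
fn documents_from_json(value: serde_json::Value) -> DocumentBatchReader<Cursor<Vec<u8>>> {
    // Serialize the documents into an in-memory batch.
    let mut cursor = Cursor::new(Vec::new());
    let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
    builder.add_documents(value).unwrap();
    builder.finish().unwrap();
    // Rewind so the reader starts at the beginning of the batch.
    cursor.set_position(0);
    DocumentBatchReader::from_reader(cursor).unwrap()
}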
@ -1005,16 +892,38 @@ mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = r#"#id,title,au{hor,genre,price$
2,"Prideand Prejudice","Jane Austin","romance",3.5$
456,"Le Petit Prince","Antoine de Saint-Exupéry","adventure",10.0$
1,Wonderland","Lewis Carroll","fantasy",25.99$
4,"Harry Potter ing","fantasy\0lood Prince","J. K. Rowling","fantasy\0,
"#;
let content = documents!([
{
"id": 2,
"title": "Prideand Prejudice",
"au{hor": "Jane Austin",
"genre": "romance",
"price$": "3.5$",
},
{
"id": 456,
"title": "Le Petit Prince",
"au{hor": "Antoine de Saint-Exupéry",
"genre": "adventure",
"price$": "10.0$",
},
{
"id": 1,
"title": "Wonderland",
"au{hor": "Lewis Carroll",
"genre": "fantasy",
"price$": "25.99$",
},
{
"id": 4,
"title": "Harry Potter ing fantasy\0lood Prince",
"au{hor": "J. K. Rowling",
"genre": "fantasy\0",
},
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
}

View File

@ -1,12 +1,12 @@
use std::borrow::Cow;
use std::collections::btree_map::Entry;
use std::collections::HashMap;
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::iter::Peekable;
use std::result::Result as StdResult;
use std::time::Instant;
use grenad::CompressionType;
use itertools::Itertools;
use log::info;
use roaring::RoaringBitmap;
use serde_json::{Map, Value};
@ -15,7 +15,8 @@ use super::helpers::{
create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn,
};
use super::IndexDocumentsMethod;
use crate::error::{InternalError, UserError};
use crate::documents::{DocumentBatchReader, DocumentsBatchIndex};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32};
@ -51,90 +52,63 @@ pub struct Transform<'t, 'i> {
pub autogenerate_docids: bool,
}
fn is_primary_key(field: impl AsRef<str>) -> bool {
field.as_ref().to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME)
/// Create a mapping between the field ids found in the document batch and the ones that were
/// already present in the index.
///
/// If new fields are present in the addition, they are added to the index field ids map.
fn create_fields_mapping(
index_field_map: &mut FieldsIdsMap,
batch_field_map: &DocumentsBatchIndex,
) -> Result<HashMap<FieldId, FieldId>> {
batch_field_map
.iter()
// We sort by id here to ensure a deterministic mapping of the fields that preserves
// the original ordering.
.sorted_by_key(|(&id, _)| id)
.map(|(field, name)| match index_field_map.id(&name) {
Some(id) => Ok((*field, id)),
None => index_field_map
.insert(&name)
.ok_or(Error::UserError(UserError::AttributeLimitReached))
.map(|id| (*field, id)),
})
.collect()
}
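// For context (editorial annotation, not part of this change): the mapping returned above
// is consumed in `read_documents` further down in this diff, where every field id coming
// from the batch index is translated into the id used by the index's own fields ids map
// (variable names below are adjusted for clarity):
//
//     let fields_index = reader.index();
//     let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
//     let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?;
//     for (batch_field_id, value) in document.iter() {
//         let index_field_id = *mapping.get(&batch_field_id).unwrap();
//         field_buffer_cache.push((index_field_id, value));
//     }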
fn find_primary_key(index: &bimap::BiHashMap<u16, String>) -> Option<&str> {
index
.right_values()
.find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME))
.map(String::as_str)
}
impl Transform<'_, '_> {
pub fn output_from_json<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
{
self.output_from_generic_json(reader, false, progress_callback)
}
pub fn output_from_json_stream<R, F>(
pub fn read_documents<R, F>(
self,
reader: R,
mut reader: DocumentBatchReader<R>,
progress_callback: F,
) -> Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
{
self.output_from_generic_json(reader, true, progress_callback)
}
fn output_from_generic_json<R, F>(
self,
reader: R,
is_stream: bool,
progress_callback: F,
) -> Result<TransformOutput>
where
R: Read,
R: Read + Seek,
F: Fn(UpdateIndexingStep) + Sync,
{
let fields_index = reader.index();
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?;
// Deserialize the whole batch of documents in memory.
let mut documents: Peekable<
Box<dyn Iterator<Item = serde_json::Result<Map<String, Value>>>>,
> = if is_stream {
let iter = serde_json::Deserializer::from_reader(reader).into_iter();
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable()
} else {
let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?;
let iter = vec.into_iter().map(Ok);
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable()
};
let alternative_name = self
.index
.primary_key(self.rtxn)?
.or_else(|| find_primary_key(fields_index))
.map(String::from);
// We extract the primary key from the first document in
// the batch if it hasn't already been defined in the index
let first = match documents.peek().map(StdResult::as_ref).transpose() {
Ok(first) => first,
Err(_) => {
let error = documents.next().unwrap().unwrap_err();
return Err(UserError::SerdeJson(error).into());
}
};
let alternative_name =
first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
let (primary_key_id, primary_key) = compute_primary_key_pair(
let (primary_key_id, primary_key_name) = compute_primary_key_pair(
self.index.primary_key(self.rtxn)?,
&mut fields_ids_map,
alternative_name,
self.autogenerate_docids,
)?;
if documents.peek().is_none() {
return Ok(TransformOutput {
primary_key,
fields_ids_map,
field_distribution: self.index.field_distribution(self.rtxn)?,
external_documents_ids: ExternalDocumentsIds::default(),
new_documents_ids: RoaringBitmap::new(),
replaced_documents_ids: RoaringBitmap::new(),
documents_count: 0,
documents_file: tempfile::tempfile()?,
});
}
// We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch.
let merge_function = match self.index_documents_method {
@ -151,204 +125,103 @@ impl Transform<'_, '_> {
self.max_memory,
);
let mut json_buffer = Vec::new();
let mut obkv_buffer = Vec::new();
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
let mut documents_count = 0;
for result in documents {
let document = result.map_err(UserError::SerdeJson)?;
let mut external_id_buffer = Vec::new();
let mut field_buffer: Vec<(u16, &[u8])> = Vec::new();
while let Some((addition_index, document)) = reader.next_document_with_index()? {
let mut field_buffer_cache = drop_and_reuse(field_buffer);
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
documents_seen: documents_count,
});
}
obkv_buffer.clear();
let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
// We prepare the fields ids map with the documents keys.
for (key, _value) in &document {
fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
for (k, v) in document.iter() {
let mapped_id = *mapping.get(&k).unwrap();
field_buffer_cache.push((mapped_id, v));
}
// We retrieve the user id from the document based on the primary key name,
// if the document id isn't present we generate a uuid.
let external_id = match document.get(&primary_key) {
Some(value) => match value {
Value::String(string) => Cow::Borrowed(string.as_str()),
Value::Number(number) => Cow::Owned(number.to_string()),
content => {
return Err(
UserError::InvalidDocumentId { document_id: content.clone() }.into()
)
// We need to make sure that every document has a primary key. After we have remapped
// all the fields in the document, we try to find the primary key value. If we can find
// it, transform it into a string and validate it, and then update it in the
// document. If none is found, and we were told to generate missing document ids, then
// we create the missing field, and update the new document.
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
let external_id =
match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) {
Some((_, bytes)) => {
let value = match serde_json::from_slice(bytes).unwrap() {
Value::String(string) => match validate_document_id(&string) {
Some(s) if s.len() == string.len() => string,
Some(s) => s.to_string(),
None => {
return Err(UserError::InvalidDocumentId {
document_id: Value::String(string),
}
.into())
}
},
Value::Number(number) => number.to_string(),
content => {
return Err(UserError::InvalidDocumentId {
document_id: content.clone(),
}
.into())
}
};
serde_json::to_writer(&mut external_id_buffer, &value).unwrap();
*bytes = &external_id_buffer;
Cow::Owned(value)
}
},
None => {
if !self.autogenerate_docids {
return Err(UserError::MissingDocumentId { document }.into());
}
let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
Cow::Borrowed(uuid)
}
};
None => {
if !self.autogenerate_docids {
let mut json = Map::new();
for (key, value) in document.iter() {
let key = addition_index.get_by_left(&key).cloned();
let value = serde_json::from_slice::<Value>(&value).ok();
// We iterate over the fields ids in order.
for (field_id, name) in fields_ids_map.iter() {
json_buffer.clear();
if let Some((k, v)) = key.zip(value) {
json.insert(k, v);
}
}
// We try to extract the value from the document and if we don't find anything
// and this should be the document id we return the one we generated.
if let Some(value) = document.get(name) {
// We serialize the attribute values.
serde_json::to_writer(&mut json_buffer, value)
.map_err(InternalError::SerdeJson)?;
writer.insert(field_id, &json_buffer)?;
}
// We validate the document id [a-zA-Z0-9\-_].
if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
return Err(UserError::MissingDocumentId { document: json }.into());
}
let uuid =
uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
serde_json::to_writer(&mut external_id_buffer, &uuid).unwrap();
field_buffer_cache.push((primary_key_id, &external_id_buffer));
Cow::Borrowed(&*uuid)
}
.into());
}
};
// Insertion in an obkv needs to be done with the keys ordered. For now they are ordered
// according to the document addition key order, so we sort them according to the
// fields ids map key order.
field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2));
// The last step is to build the new obkv document, and insert it in the sorter.
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
for (k, v) in field_buffer_cache.iter() {
writer.insert(*k, v)?;
}
// We use the extracted/generated user id as the key for this document.
sorter.insert(external_id.as_bytes(), &obkv_buffer)?;
sorter.insert(&external_id.as_ref().as_bytes(), &obkv_buffer)?;
documents_count += 1;
}
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
documents_seen: documents_count,
});
// Now that we have a valid sorter that contains the user id and the obkv we
// give it to the last transforming function which returns the TransformOutput.
self.output_from_sorter(
sorter,
primary_key,
fields_ids_map,
documents_count,
external_documents_ids,
progress_callback,
)
}
pub fn output_from_csv<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
{
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
let mut csv = csv::Reader::from_reader(reader);
let headers = csv.headers().map_err(UserError::Csv)?;
let mut fields_ids = Vec::new();
// Generate the new fields ids based on the current fields ids and this CSV headers.
for (i, header) in headers.iter().enumerate() {
let id = fields_ids_map.insert(header).ok_or(UserError::AttributeLimitReached)?;
fields_ids.push((id, i));
}
// Extract the position of the primary key in the current headers, None if not found.
let primary_key_pos = match self.index.primary_key(self.rtxn)? {
Some(primary_key) => {
// The primary key is known so we must find the position in the CSV headers.
headers.iter().position(|h| h == primary_key)
}
None => headers.iter().position(is_primary_key),
};
// Returns the field id in the fields ids map, create an "id" field
// in case it is not in the current headers.
let alternative_name = primary_key_pos.map(|pos| headers[pos].to_string());
let (primary_key_id, primary_key_name) = compute_primary_key_pair(
self.index.primary_key(self.rtxn)?,
&mut fields_ids_map,
alternative_name,
self.autogenerate_docids,
)?;
// The primary key field is not present in the header, so we need to create it.
if primary_key_pos.is_none() {
fields_ids.push((primary_key_id, usize::max_value()));
}
// We sort the fields ids by the fields ids map id, this way we are sure to iterate over
// the records fields in the fields ids map order and correctly generate the obkv.
fields_ids.sort_unstable_by_key(|(field_id, _)| *field_id);
// We initialize the sorter with the user indexing settings.
let mut sorter = create_sorter(
keep_latest_obkv,
self.chunk_compression_type,
self.chunk_compression_level,
self.max_nb_chunks,
self.max_memory,
);
// We write into the sorter to merge and deduplicate the documents
// based on the external ids.
let mut json_buffer = Vec::new();
let mut obkv_buffer = Vec::new();
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
let mut documents_count = 0;
let mut record = csv::StringRecord::new();
while csv.read_record(&mut record).map_err(UserError::Csv)? {
obkv_buffer.clear();
let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
documents_seen: documents_count,
});
}
// We extract the user id if we know where it is or generate an UUID V4 otherwise.
let external_id = match primary_key_pos {
Some(pos) => {
let external_id = &record[pos];
// We validate the document id [a-zA-Z0-9\-_].
match validate_document_id(&external_id) {
Some(valid) => valid,
None => {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
}
.into())
}
}
}
None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
};
// When the primary_key_field_id is found in the fields ids list
// we return the generated document id instead of the record field.
let iter = fields_ids.iter().map(|(fi, i)| {
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
(fi, field)
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
documents_seen: documents_count,
});
// We retrieve the field id based on the fields ids map fields ids order.
for (field_id, field) in iter {
// We serialize the attribute values as JSON strings.
json_buffer.clear();
serde_json::to_writer(&mut json_buffer, &field)
.map_err(InternalError::SerdeJson)?;
writer.insert(*field_id, &json_buffer)?;
}
// We use the extracted/generated user id as the key for this document.
sorter.insert(external_id, &obkv_buffer)?;
documents_count += 1;
obkv_buffer.clear();
field_buffer = drop_and_reuse(field_buffer_cache);
external_id_buffer.clear();
}
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
documents_seen: documents_count,
});
@ -359,7 +232,6 @@ impl Transform<'_, '_> {
primary_key_name,
fields_ids_map,
documents_count,
external_documents_ids,
progress_callback,
)
}
@ -373,12 +245,12 @@ impl Transform<'_, '_> {
primary_key: String,
fields_ids_map: FieldsIdsMap,
approximate_number_of_documents: usize,
mut external_documents_ids: ExternalDocumentsIds<'_>,
progress_callback: F,
) -> Result<TransformOutput>
where
F: Fn(UpdateIndexingStep) + Sync,
{
let mut external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
let documents_ids = self.index.documents_ids(self.rtxn)?;
let mut field_distribution = self.index.field_distribution(self.rtxn)?;
let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
@ -610,6 +482,17 @@ fn validate_document_id(document_id: &str) -> Option<&str> {
})
}
/// Drops all the values of type `U` in the vec, and reuses the allocation to create a `Vec<T>`.
///
/// The size and alignment of T and U must match.
fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {
debug_assert_eq!(std::mem::align_of::<U>(), std::mem::align_of::<T>());
debug_assert_eq!(std::mem::size_of::<U>(), std::mem::size_of::<T>());
vec.clear();
debug_assert!(vec.is_empty());
vec.into_iter().map(|_| unreachable!()).collect()
}
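// Editorial annotation, not part of this change: this helper is what lets the indexing
// loop above recycle a single allocation for `field_buffer` on every document, even though
// the borrowed `&[u8]` values it holds only live for one iteration:
//
//     let mut field_buffer: Vec<(u16, &[u8])> = Vec::new();
//     while let Some((_, document)) = reader.next_document_with_index()? {
//         let mut field_buffer_cache = drop_and_reuse(field_buffer);
//         // ... fill `field_buffer_cache` and build the obkv for this document ...
//         field_buffer = drop_and_reuse(field_buffer_cache);
//     }
//
// The size and alignment debug assertions document the condition under which the
// allocation is actually reused (presumably via `Vec`'s in-place `collect` specialization);
// with an empty vec the function is sound either way, at worst costing a fresh allocation.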
#[cfg(test)]
mod test {
use super::*;

View File

@ -2,9 +2,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
pub use self::clear_documents::ClearDocuments;
pub use self::delete_documents::DeleteDocuments;
pub use self::facets::Facets;
pub use self::index_documents::{
DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat,
};
pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod};
pub use self::settings::{Setting, Settings};
pub use self::update_builder::UpdateBuilder;
pub use self::update_step::UpdateIndexingStep;

View File

@ -111,6 +111,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
}
pub fn log_every_n(&mut self, n: usize) {
self.log_every_n = Some(n);
}
pub fn reset_searchable_fields(&mut self) {
self.searchable_fields = Setting::Reset;
}
@ -501,7 +505,7 @@ mod tests {
use super::*;
use crate::error::Error;
use crate::update::{IndexDocuments, UpdateFormat};
use crate::update::IndexDocuments;
use crate::{Criterion, FilterCondition, SearchResult};
#[test]
@ -513,9 +517,13 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name,age\n0,kevin,23\n1,kevina,21\n2,benoit,34\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
let content = documents!([
{ "id": 1, "name": "kevin", "age": 23 },
{ "id": 2, "name": "kevina", "age": 21},
{ "id": 3, "name": "benoit", "age": 34 }
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -567,10 +575,13 @@ mod tests {
// First we send 3 documents without ids.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -611,10 +622,13 @@ mod tests {
// First we send 3 documents without ids.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -633,10 +647,13 @@ mod tests {
// First we send 3 documents without ids.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
// In the same transaction we change the displayed fields to be only the age.
@ -678,13 +695,12 @@ mod tests {
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
let content = &br#"[
{ "name": "kevin", "age": 23 },
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]"#[..];
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.enable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -695,11 +711,19 @@ mod tests {
assert_eq!(fields_ids, hashset! { S("age") });
// Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood.
let fidmap = index.fields_ids_map(&rtxn).unwrap();
println!("fidmap: {:?}", fidmap);
for document in index.all_documents(&rtxn).unwrap() {
let document = document.unwrap();
let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1)
.unwrap();
println!("json: {:?}", json);
}
let count = index
.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
// The faceted field id is 2u16
.prefix_iter(&rtxn, &[0, 2, 0])
// The faceted field id is 1u16
.prefix_iter(&rtxn, &[0, 1, 0])
.unwrap()
.count();
assert_eq!(count, 3);
@ -707,25 +731,23 @@ mod tests {
// Index a little more documents with new and current facets values.
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
{ "name": "kevin2", "age": 23 },
let content = documents!([
{ "name": "kevin2", "age": 23},
{ "name": "kevina2", "age": 21 },
{ "name": "benoit", "age": 35 }
]"#[..];
{ "name": "benoit", "age": 35 }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 2);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood.
let count = index
.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 2, 0])
.prefix_iter(&rtxn, &[0, 1, 0])
.unwrap()
.count();
assert_eq!(count, 4);
@ -747,13 +769,12 @@ mod tests {
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
let content = &br#"[
{ "name": "kevin", "age": 23 },
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]"#[..];
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.enable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -790,7 +811,7 @@ mod tests {
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
let content = &br#"[
let content = documents!([
{ "name": "kevin", "age": 23 },
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 },
@ -798,9 +819,8 @@ mod tests {
{ "name": "bertrand", "age": 34 },
{ "name": "bernie", "age": 34 },
{ "name": "ben", "age": 34 }
]"#[..];
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.enable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -822,10 +842,13 @@ mod tests {
// First we send 3 documents without ids.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -844,10 +867,13 @@ mod tests {
// First we send 3 documents without ids.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23, "maxim": "I love dogs" },
{ "name": "kevina", "age": 21, "maxim": "Doggos are the best" },
{ "name": "benoit", "age": 34, "maxim": "The crepes are really good" },
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
// In the same transaction we provide some stop_words
@ -915,10 +941,13 @@ mod tests {
// Send 3 documents without ids.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23, "maxim": "I love dogs"},
{ "name": "kevina", "age": 21, "maxim": "Doggos are the best"},
{ "name": "benoit", "age": 34, "maxim": "The crepes are really good"},
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
// In the same transaction provide some synonyms
@ -1038,7 +1067,7 @@ mod tests {
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey"));
// Then index some documents with the "mykey" primary key.
let content = &br#"[
let content = documents!([
{ "mykey": 1, "name": "kevin", "age": 23 },
{ "mykey": 2, "name": "kevina", "age": 21 },
{ "mykey": 3, "name": "benoit", "age": 34 },
@ -1046,9 +1075,8 @@ mod tests {
{ "mykey": 5, "name": "bertrand", "age": 34 },
{ "mykey": 6, "name": "bernie", "age": 34 },
{ "mykey": 7, "name": "ben", "age": 34 }
]"#[..];
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.disable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -1087,7 +1115,7 @@ mod tests {
builder.set_filterable_fields(hashset! { S("genres") });
builder.execute(|_, _| ()).unwrap();
let content = &br#"[
let content = documents!([
{
"id": 11,
"title": "Star Wars",
@ -1105,9 +1133,8 @@ mod tests {
"poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg",
"release_date": 819676800
}
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();

View File

@ -2,10 +2,9 @@ use UpdateIndexingStep::*;
#[derive(Debug, Clone, Copy)]
pub enum UpdateIndexingStep {
/// Transform from the original user given format (CSV, JSON, JSON lines)
/// into a generic format based on the obkv and grenad crates. This step also
/// deduplicates potential documents in this batch update by merging or replacing them.
TransformFromUserIntoGenericFormat { documents_seen: usize },
/// Remap the fields of the document addition to the ones present in the database, adding any
/// new fields to the schema on the go.
RemapDocumentAddition { documents_seen: usize },
/// This step checks the external document ids, computes the internal ids and merges
/// the documents that are already present in the database.
@ -23,7 +22,7 @@ pub enum UpdateIndexingStep {
impl UpdateIndexingStep {
pub const fn step(&self) -> usize {
match self {
TransformFromUserIntoGenericFormat { .. } => 0,
RemapDocumentAddition { .. } => 0,
ComputeIdsAndMergeDocuments { .. } => 1,
IndexDocuments { .. } => 2,
MergeDataIntoFinalDatabase { .. } => 3,
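// Editorial sketch, not part of this change: a progress callback can match on the renamed
// first step. The one-argument `Fn(UpdateIndexingStep)` signature below is the one used by
// the transform's `progress_callback`; the `log::debug!` calls are illustrative only.
//
//     let callback = |step: UpdateIndexingStep| match step {
//         UpdateIndexingStep::RemapDocumentAddition { documents_seen } => {
//             log::debug!("remapping: {} documents seen", documents_seen)
//         }
//         other => log::debug!("indexing step {} in progress", other.step()),
//     };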

View File

@ -1,11 +1,13 @@
use std::cmp::Reverse;
use std::collections::HashSet;
use std::io::Cursor;
use big_s::S;
use either::{Either, Left, Right};
use heed::EnvOpenOptions;
use maplit::{hashmap, hashset};
use milli::update::{Settings, UpdateBuilder, UpdateFormat};
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
use milli::update::{Settings, UpdateBuilder};
use milli::{AscDesc, Criterion, DocumentId, Index, Member};
use serde::Deserialize;
use slice_group_by::GroupBy;
@ -55,9 +57,20 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
let mut builder = UpdateBuilder::new(0);
builder.max_memory(10 * 1024 * 1024); // 10MiB
let mut builder = builder.index_documents(&mut wtxn, &index);
builder.update_format(UpdateFormat::JsonStream);
builder.enable_autogenerate_docids();
builder.execute(CONTENT.as_bytes(), |_, _| ()).unwrap();
let mut cursor = Cursor::new(Vec::new());
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let reader = Cursor::new(CONTENT.as_bytes());
for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
documents_builder.add_documents(doc.unwrap()).unwrap();
}
documents_builder.finish().unwrap();
cursor.set_position(0);
// index documents
let content = DocumentBatchReader::from_reader(cursor).unwrap();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();

View File

@ -1,10 +1,12 @@
use std::cmp::Reverse;
use std::io::Cursor;
use big_s::S;
use heed::EnvOpenOptions;
use itertools::Itertools;
use maplit::hashset;
use milli::update::{Settings, UpdateBuilder, UpdateFormat};
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
use milli::update::{Settings, UpdateBuilder};
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult};
use rand::Rng;
use Criterion::*;
@ -386,31 +388,37 @@ fn criteria_ascdesc() {
let mut builder = UpdateBuilder::new(0);
builder.max_memory(10 * 1024 * 1024); // 10MiB
let mut builder = builder.index_documents(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
builder.enable_autogenerate_docids();
let content = [
vec![S("name,age")],
(0..ASC_DESC_CANDIDATES_THRESHOLD + 1)
.map(|_| {
let mut rng = rand::thread_rng();
let mut cursor = Cursor::new(Vec::new());
let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let age = rng.gen::<u32>().to_string();
let name = rng
.sample_iter(&rand::distributions::Alphanumeric)
.map(char::from)
.filter(|c| *c >= 'a' && *c <= 'z')
.take(10)
.collect::<String>();
(0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| {
let mut rng = rand::thread_rng();
format!("{},{}", name, age)
})
.collect::<Vec<_>>(),
]
.iter()
.flatten()
.join("\n");
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
let age = rng.gen::<u32>().to_string();
let name = rng
.sample_iter(&rand::distributions::Alphanumeric)
.map(char::from)
.filter(|c| *c >= 'a' && *c <= 'z')
.take(10)
.collect::<String>();
let json = serde_json::json!({
"name": name,
"age": age,
});
batch_builder.add_documents(json).unwrap();
});
batch_builder.finish().unwrap();
cursor.set_position(0);
let reader = DocumentBatchReader::from_reader(cursor).unwrap();
builder.execute(reader, |_, _| ()).unwrap();
wtxn.commit().unwrap();

View File

@ -1,98 +0,0 @@
use std::io::{self, BufRead, Write};
use std::iter::once;
use std::path::PathBuf;
use std::time::Instant;
use byte_unit::Byte;
use heed::EnvOpenOptions;
use log::debug;
use milli::{obkv_to_json, Index};
use structopt::StructOpt;
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
#[derive(Debug, StructOpt)]
/// A simple search helper binary for the milli project.
pub struct Opt {
/// The database path where the database is located.
/// It is created if it doesn't already exist.
#[structopt(long = "db", parse(from_os_str))]
database: PathBuf,
/// The maximum size the database can take on disk. It is recommended to specify
/// the whole disk space (value must be a multiple of a page size).
#[structopt(long = "db-size", default_value = "100 GiB")]
database_size: Byte,
/// Verbose mode (-v, -vv, -vvv, etc.)
#[structopt(short, long, parse(from_occurrences))]
verbose: usize,
/// The query string to search for (doesn't support prefix search yet).
query: Option<String>,
/// Compute and print the facet distribution of all the faceted fields.
#[structopt(long)]
print_facet_distribution: bool,
}
fn main() -> anyhow::Result<()> {
let opt = Opt::from_args();
stderrlog::new()
.verbosity(opt.verbose)
.show_level(false)
.timestamp(stderrlog::Timestamp::Off)
.init()?;
// Return an error if the database does not exist.
if !opt.database.exists() {
anyhow::bail!("The database ({}) does not exist.", opt.database.display());
}
let mut options = EnvOpenOptions::new();
options.map_size(opt.database_size.get_bytes() as usize);
// Open the LMDB database.
let index = Index::new(options, &opt.database)?;
let rtxn = index.read_txn()?;
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let displayed_fields = match index.displayed_fields_ids(&rtxn)? {
Some(fields) => fields,
None => fields_ids_map.iter().map(|(id, _)| id).collect(),
};
let stdin = io::stdin();
let lines = match opt.query {
Some(query) => Box::new(once(Ok(query))),
None => Box::new(stdin.lock().lines()) as Box<dyn Iterator<Item = _>>,
};
let mut stdout = io::stdout();
for result in lines {
let before = Instant::now();
let query = result?;
let result = index.search(&rtxn).query(query).execute()?;
let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?;
for (_id, record) in documents {
let val = obkv_to_json(&displayed_fields, &fields_ids_map, record)?;
serde_json::to_writer(&mut stdout, &val)?;
let _ = writeln!(&mut stdout);
}
if opt.print_facet_distribution {
let facets =
index.facets_distribution(&rtxn).candidates(result.candidates).execute()?;
serde_json::to_writer(&mut stdout, &facets)?;
let _ = writeln!(&mut stdout);
}
debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len());
}
Ok(())
}