Fix the CLI for the new DocumentsBatchBuilder/DocumentsBatchReader structs

This commit is contained in:
Kerollmops 2022-06-14 16:35:59 +02:00
parent 6d0498df24
commit a4ceef9624
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -8,6 +8,7 @@ use std::time::Instant;
use byte_unit::Byte; use byte_unit::Byte;
use eyre::Result; use eyre::Result;
use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::UpdateIndexingStep::{ use milli::update::UpdateIndexingStep::{
ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
}; };
@ -225,9 +226,9 @@ impl Performer for DocumentAddition {
DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?, DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?,
}; };
let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?; let reader = DocumentsBatchReader::from_reader(Cursor::new(documents))?;
println!("Adding {} documents to the index.", reader.len()); println!("Adding {} documents to the index.", reader.documents_count());
let mut txn = index.write_txn()?; let mut txn = index.write_txn()?;
let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() }; let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() };
@ -321,35 +322,35 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa
} }
fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> { fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
let mut writer = Cursor::new(Vec::new()); let mut documents = DocumentsBatchBuilder::new(Vec::new());
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; let reader = BufReader::new(reader);
let mut buf = String::new(); for result in serde_json::Deserializer::from_reader(reader).into_iter::<Map<String, Value>>() {
let mut reader = BufReader::new(reader); let object = result?;
documents.append_json_object(&object)?;
while reader.read_line(&mut buf)? > 0 {
documents.extend_from_json(&mut buf.as_bytes())?;
} }
documents.finish()?;
Ok(writer.into_inner()) documents.into_inner().map_err(Into::into)
} }
fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> { fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
let mut writer = Cursor::new(Vec::new()); let mut documents = DocumentsBatchBuilder::new(Vec::new());
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; let list: Vec<Map<String, Value>> = serde_json::from_reader(reader)?;
documents.extend_from_json(reader)?; for object in list {
documents.finish()?; documents.append_json_object(&object)?;
}
Ok(writer.into_inner()) documents.into_inner().map_err(Into::into)
} }
fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> { fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> {
let mut writer = Cursor::new(Vec::new()); let csv = csv::Reader::from_reader(reader);
milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
Ok(writer.into_inner()) let mut documents = DocumentsBatchBuilder::new(Vec::new());
documents.append_csv(csv)?;
documents.into_inner().map_err(Into::into)
} }
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]