Make the Transform read from an EnrichedDocumentsBatchReader

This commit is contained in:
Kerollmops 2022-06-20 13:48:02 +02:00
parent ea852200bb
commit 6a0a0ae94f
No known key found for this signature in database
GPG key ID: 92ADA4E935E71FA4
7 changed files with 158 additions and 24 deletions

View file

@ -0,0 +1,103 @@
use std::fs::File;
use std::{io, str};
use obkv::KvReader;
use super::{
DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader,
Error,
};
use crate::FieldId;
/// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have
/// been created with a `DocumentsBatchWriter` and, for the enriched data,
/// a simple `grenad::Reader<File>`.
///
/// The documents are returned in the form of `obkv::KvReader` where each field is identified
/// with a `FieldId`. The mapping between the field ids and the field names is done thanks to
/// the index.
pub struct EnrichedDocumentsBatchReader<R> {
/// The reader over the batch of documents.
documents: DocumentsBatchReader<R>,
/// Cursor over the enriched data; the constructor requires it to hold exactly as many
/// entries as there are documents, and each entry's value is read as a UTF-8 external id.
external_ids: grenad::ReaderCursor<File>,
}
impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
pub fn new(
documents: DocumentsBatchReader<R>,
external_ids: grenad::Reader<File>,
) -> Result<Self, Error> {
if documents.documents_count() as u64 == external_ids.len() {
Ok(EnrichedDocumentsBatchReader {
documents,
external_ids: external_ids.into_cursor()?,
})
} else {
Err(Error::InvalidEnrichedData)
}
}
pub fn documents_count(&self) -> u32 {
self.documents.documents_count()
}
pub fn is_empty(&self) -> bool {
self.documents.is_empty()
}
pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
self.documents.documents_batch_index()
}
/// This method returns a forward cursor over the enriched documents.
pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor<R> {
let EnrichedDocumentsBatchReader { documents, mut external_ids } = self;
external_ids.reset();
EnrichedDocumentsBatchCursor { documents: documents.into_cursor(), external_ids }
}
}
/// A document together with its external id, as yielded by
/// `EnrichedDocumentsBatchCursor::next_enriched_document`.
#[derive(Debug, Clone, Copy)]
pub struct EnrichedDocument<'a> {
/// The document itself, with its fields keyed by `FieldId`.
pub document: KvReader<'a, FieldId>,
/// The external id associated with the document, decoded as UTF-8 from the enriched data.
pub external_id: &'a str,
}
/// A cursor that walks a batch of documents and the matching enriched data side by side,
/// producing one `EnrichedDocument` per step.
pub struct EnrichedDocumentsBatchCursor<R> {
/// Cursor over the documents of the batch.
documents: DocumentsBatchCursor<R>,
/// Cursor over the enriched data, advanced together with `documents`.
external_ids: grenad::ReaderCursor<File>,
}
impl<R> EnrichedDocumentsBatchCursor<R> {
    /// Consumes the cursor and gives back the reader it was built from.
    ///
    /// The external ids cursor is moved into the reader as is.
    pub fn into_reader(self) -> EnrichedDocumentsBatchReader<R> {
        EnrichedDocumentsBatchReader {
            documents: self.documents.into_reader(),
            external_ids: self.external_ids,
        }
    }

    /// Returns the documents batch index of the underlying documents cursor.
    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
        let inner = &self.documents;
        inner.documents_batch_index()
    }

    /// Rewinds both the documents cursor and the external ids cursor so that reading
    /// can start again from the beginning.
    pub fn reset(&mut self) {
        let Self { documents, external_ids } = self;
        documents.reset();
        external_ids.reset();
    }
}
impl<R: io::Read + io::Seek> EnrichedDocumentsBatchCursor<R> {
    /// Returns the next enriched document, starting from the first one. Subsequent calls to
    /// `next_enriched_document` advance both underlying cursors by one entry.
    ///
    /// `Ok(None)` is returned as soon as either the documents or the external ids are
    /// exhausted.
    ///
    /// # Errors
    ///
    /// Forwards the errors raised while reading the next document or the next enriched
    /// entry, and fails when the bytes of the external id are not valid UTF-8.
    pub fn next_enriched_document(
        &mut self,
    ) -> Result<Option<EnrichedDocument>, DocumentsBatchCursorError> {
        // The document is fetched first, then the matching enriched entry.
        let maybe_document = self.documents.next_document()?;
        let maybe_external_id = self
            .external_ids
            .move_on_next()?
            .map(|(_, bytes)| str::from_utf8(bytes))
            .transpose()?;

        let enriched = maybe_document
            .zip(maybe_external_id)
            .map(|(document, external_id)| EnrichedDocument { document, external_id });
        Ok(enriched)
    }
}

View file

@ -1,11 +1,14 @@
mod builder;
mod enriched;
mod reader;
use std::fmt::{self, Debug};
use std::io;
use std::str::Utf8Error;
use bimap::BiHashMap;
pub use builder::DocumentsBatchBuilder;
pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
use obkv::KvReader;
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
use serde::{Deserialize, Serialize};
@ -87,6 +90,8 @@ impl DocumentsBatchIndex {
pub enum Error {
ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
InvalidDocumentFormat,
InvalidEnrichedData,
InvalidUtf8(Utf8Error),
Csv(csv::Error),
Json(serde_json::Error),
Serialize(serde_json::Error),
@ -118,6 +123,12 @@ impl From<grenad::Error> for Error {
}
}
impl From<Utf8Error> for Error {
fn from(other: Utf8Error) -> Self {
Self::InvalidUtf8(other)
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
@ -127,6 +138,8 @@ impl fmt::Display for Error {
Error::InvalidDocumentFormat => {
f.write_str("Invalid document addition format, missing the documents batch index.")
}
Error::InvalidEnrichedData => f.write_str("Invalid enriched data."),
Error::InvalidUtf8(e) => write!(f, "{}", e),
Error::Io(e) => write!(f, "{}", e),
Error::Serialize(e) => write!(f, "{}", e),
Error::Grenad(e) => write!(f, "{}", e),

View file

@ -1,5 +1,5 @@
use std::convert::TryInto;
use std::{error, fmt, io};
use std::{error, fmt, io, str};
use obkv::KvReader;
@ -93,19 +93,20 @@ impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> {
/// The possible error thrown by the `DocumentsBatchCursor` when iterating on the documents.
#[derive(Debug)]
pub struct DocumentsBatchCursorError {
inner: grenad::Error,
pub enum DocumentsBatchCursorError {
Grenad(grenad::Error),
Utf8(str::Utf8Error),
}
impl From<grenad::Error> for DocumentsBatchCursorError {
fn from(error: grenad::Error) -> DocumentsBatchCursorError {
DocumentsBatchCursorError { inner: error }
DocumentsBatchCursorError::Grenad(error)
}
}
impl Into<grenad::Error> for DocumentsBatchCursorError {
fn into(self) -> grenad::Error {
self.inner
impl From<str::Utf8Error> for DocumentsBatchCursorError {
fn from(error: str::Utf8Error) -> DocumentsBatchCursorError {
DocumentsBatchCursorError::Utf8(error)
}
}
@ -113,6 +114,9 @@ impl error::Error for DocumentsBatchCursorError {}
impl fmt::Display for DocumentsBatchCursorError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.inner.fmt(f)
match self {
DocumentsBatchCursorError::Grenad(e) => e.fmt(f),
DocumentsBatchCursorError::Utf8(e) => e.fmt(f),
}
}
}