2022-06-20 13:48:02 +02:00
|
|
|
use std::fs::File;
|
2023-09-28 16:26:01 +02:00
|
|
|
use std::io::BufReader;
|
2022-06-20 13:48:02 +02:00
|
|
|
use std::{io, str};
|
|
|
|
|
|
|
|
use obkv::KvReader;
|
|
|
|
|
|
|
|
use super::{
|
|
|
|
DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader,
|
|
|
|
Error,
|
|
|
|
};
|
2022-06-21 14:41:19 +02:00
|
|
|
use crate::update::DocumentId;
|
2022-06-20 13:48:02 +02:00
|
|
|
use crate::FieldId;
|
|
|
|
|
|
|
|
/// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have
|
|
|
|
/// been created with a `DocumentsBatchWriter` and, for the enriched data,
|
|
|
|
/// a simple `grenad::Reader<File>`.
|
|
|
|
///
|
|
|
|
/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
|
|
|
|
/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
|
|
|
|
pub struct EnrichedDocumentsBatchReader<R> {
|
|
|
|
documents: DocumentsBatchReader<R>,
|
2022-06-21 10:45:25 +02:00
|
|
|
primary_key: String,
|
2023-09-28 16:26:01 +02:00
|
|
|
external_ids: grenad::ReaderCursor<BufReader<File>>,
|
2022-06-20 13:48:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
|
|
|
|
pub fn new(
|
|
|
|
documents: DocumentsBatchReader<R>,
|
2022-06-21 10:45:25 +02:00
|
|
|
primary_key: String,
|
2023-09-28 16:26:01 +02:00
|
|
|
external_ids: grenad::Reader<BufReader<File>>,
|
2022-06-20 13:48:02 +02:00
|
|
|
) -> Result<Self, Error> {
|
|
|
|
if documents.documents_count() as u64 == external_ids.len() {
|
|
|
|
Ok(EnrichedDocumentsBatchReader {
|
|
|
|
documents,
|
2022-06-21 10:45:25 +02:00
|
|
|
primary_key,
|
2022-06-20 13:48:02 +02:00
|
|
|
external_ids: external_ids.into_cursor()?,
|
|
|
|
})
|
|
|
|
} else {
|
|
|
|
Err(Error::InvalidEnrichedData)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn documents_count(&self) -> u32 {
|
|
|
|
self.documents.documents_count()
|
|
|
|
}
|
|
|
|
|
2022-06-21 10:45:25 +02:00
|
|
|
pub fn primary_key(&self) -> &str {
|
|
|
|
&self.primary_key
|
|
|
|
}
|
|
|
|
|
2022-06-20 13:48:02 +02:00
|
|
|
pub fn is_empty(&self) -> bool {
|
|
|
|
self.documents.is_empty()
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
|
|
|
|
self.documents.documents_batch_index()
|
|
|
|
}
|
|
|
|
|
|
|
|
/// This method returns a forward cursor over the enriched documents.
|
2022-07-18 16:08:01 +02:00
|
|
|
pub fn into_cursor_and_fields_index(
|
|
|
|
self,
|
|
|
|
) -> (EnrichedDocumentsBatchCursor<R>, DocumentsBatchIndex) {
|
2022-06-21 10:45:25 +02:00
|
|
|
let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self;
|
2022-07-18 16:08:01 +02:00
|
|
|
let (documents, fields_index) = documents.into_cursor_and_fields_index();
|
2022-06-20 13:48:02 +02:00
|
|
|
external_ids.reset();
|
2022-07-18 16:08:01 +02:00
|
|
|
(EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index)
|
2022-06-20 13:48:02 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-21 14:41:19 +02:00
|
|
|
#[derive(Debug, Clone)]
|
2022-06-20 13:48:02 +02:00
|
|
|
pub struct EnrichedDocument<'a> {
|
|
|
|
pub document: KvReader<'a, FieldId>,
|
2022-06-21 14:41:19 +02:00
|
|
|
pub document_id: DocumentId,
|
2022-06-20 13:48:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
pub struct EnrichedDocumentsBatchCursor<R> {
|
|
|
|
documents: DocumentsBatchCursor<R>,
|
2022-06-21 10:45:25 +02:00
|
|
|
primary_key: String,
|
2023-09-28 16:26:01 +02:00
|
|
|
external_ids: grenad::ReaderCursor<BufReader<File>>,
|
2022-06-20 13:48:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
impl<R> EnrichedDocumentsBatchCursor<R> {
|
2022-06-21 10:45:25 +02:00
|
|
|
pub fn primary_key(&self) -> &str {
|
|
|
|
&self.primary_key
|
2022-06-20 13:48:02 +02:00
|
|
|
}
|
|
|
|
/// Resets the cursor to be able to read from the start again.
|
|
|
|
pub fn reset(&mut self) {
|
|
|
|
self.documents.reset();
|
|
|
|
self.external_ids.reset();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<R: io::Read + io::Seek> EnrichedDocumentsBatchCursor<R> {
|
|
|
|
/// Returns the next document, starting from the first one. Subsequent calls to
|
|
|
|
/// `next_document` advance the document reader until all the documents have been read.
|
|
|
|
pub fn next_enriched_document(
|
|
|
|
&mut self,
|
|
|
|
) -> Result<Option<EnrichedDocument>, DocumentsBatchCursorError> {
|
|
|
|
let document = self.documents.next_document()?;
|
2022-06-21 14:41:19 +02:00
|
|
|
let document_id = match self.external_ids.move_on_next()? {
|
|
|
|
Some((_, bytes)) => serde_json::from_slice(bytes).map(Some)?,
|
2022-06-20 13:48:02 +02:00
|
|
|
None => None,
|
|
|
|
};
|
|
|
|
|
2022-06-21 14:41:19 +02:00
|
|
|
match document.zip(document_id) {
|
|
|
|
Some((document, document_id)) => Ok(Some(EnrichedDocument { document, document_id })),
|
2022-06-20 13:48:02 +02:00
|
|
|
None => Ok(None),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|