Change DocumentsBatchReader to access cursor and index at same time

Otherwise it is not possible to iterate over all documents while
using the fields index at the same time.
This commit is contained in:
Loïc Lecrenier 2022-07-18 16:08:01 +02:00
parent ab1571cdec
commit fc9f3f31e7
6 changed files with 65 additions and 82 deletions

View File

@ -216,9 +216,9 @@ mod test {
assert_eq!(builder.documents_count(), 2);
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
assert_eq!(index.len(), 3);
let document = cursor.next_document().unwrap().unwrap();
@ -240,9 +240,9 @@ mod test {
assert_eq!(builder.documents_count(), 2);
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
assert_eq!(index.len(), 2);
let document = cursor.next_document().unwrap().unwrap();
@ -264,9 +264,9 @@ mod test {
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
@ -292,9 +292,9 @@ mod test {
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
@ -319,9 +319,9 @@ mod test {
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
@ -346,9 +346,9 @@ mod test {
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
@ -373,9 +373,9 @@ mod test {
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
@ -400,9 +400,9 @@ mod test {
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
@ -427,9 +427,9 @@ mod test {
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
@ -454,9 +454,9 @@ mod test {
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
@ -482,8 +482,9 @@ mod test {
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
assert!(cursor.next_document().is_err());
}
@ -498,9 +499,9 @@ mod test {
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

View File

@ -56,14 +56,13 @@ impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
}
/// This method returns a forward cursor over the enriched documents.
pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor<R> {
pub fn into_cursor_and_fields_index(
self,
) -> (EnrichedDocumentsBatchCursor<R>, DocumentsBatchIndex) {
let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self;
let (documents, fields_index) = documents.into_cursor_and_fields_index();
external_ids.reset();
EnrichedDocumentsBatchCursor {
documents: documents.into_cursor(),
primary_key,
external_ids,
}
(EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index)
}
}
@ -80,23 +79,9 @@ pub struct EnrichedDocumentsBatchCursor<R> {
}
impl<R> EnrichedDocumentsBatchCursor<R> {
pub fn into_reader(self) -> EnrichedDocumentsBatchReader<R> {
let EnrichedDocumentsBatchCursor { documents, primary_key, external_ids } = self;
EnrichedDocumentsBatchReader {
documents: documents.into_reader(),
primary_key,
external_ids,
}
}
pub fn primary_key(&self) -> &str {
&self.primary_key
}
pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
self.documents.documents_batch_index()
}
/// Resets the cursor to be able to read from the start again.
pub fn reset(&mut self) {
self.documents.reset();

View File

@ -203,10 +203,11 @@ mod test {
builder.append_json_object(value.as_object().unwrap()).unwrap();
let vector = builder.into_inner().unwrap();
let mut documents =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (mut documents, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
assert_eq!(documents.documents_batch_index().iter().count(), 5);
assert_eq!(index.iter().count(), 5);
let reader = documents.next_document().unwrap().unwrap();
assert_eq!(reader.iter().count(), 5);
assert!(documents.next_document().unwrap().is_none());
@ -226,9 +227,10 @@ mod test {
builder.append_json_object(doc2.as_object().unwrap()).unwrap();
let vector = builder.into_inner().unwrap();
let mut documents =
DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor();
assert_eq!(documents.documents_batch_index().iter().count(), 2);
let (mut documents, index) = DocumentsBatchReader::from_reader(io::Cursor::new(vector))
.unwrap()
.into_cursor_and_fields_index();
assert_eq!(index.iter().count(), 2);
let reader = documents.next_document().unwrap().unwrap();
assert_eq!(reader.iter().count(), 1);
assert!(documents.next_document().unwrap().is_some());
@ -243,7 +245,7 @@ mod test {
}
}]);
let mut cursor = docs_reader.into_cursor();
let (mut cursor, _) = docs_reader.into_cursor_and_fields_index();
let doc = cursor.next_document().unwrap().unwrap();
let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
assert_eq!(nested, json!({ "toto": ["hello"] }));

View File

@ -17,6 +17,10 @@ pub struct DocumentsBatchReader<R> {
}
impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
pub fn new(cursor: DocumentsBatchCursor<R>, fields_index: DocumentsBatchIndex) -> Self {
Self { cursor: cursor.cursor, fields_index }
}
/// Construct a `DocumentsReader` from a reader.
///
/// It first retrieves the index, then moves to the first document. Use the `into_cursor`
@ -46,30 +50,20 @@ impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
}
/// This method returns a forward cursor over the documents.
pub fn into_cursor(self) -> DocumentsBatchCursor<R> {
pub fn into_cursor_and_fields_index(self) -> (DocumentsBatchCursor<R>, DocumentsBatchIndex) {
let DocumentsBatchReader { cursor, fields_index } = self;
let mut cursor = DocumentsBatchCursor { cursor, fields_index };
let mut cursor = DocumentsBatchCursor { cursor };
cursor.reset();
cursor
(cursor, fields_index)
}
}
/// A forward cursor over the documents in a `DocumentsBatchReader`.
pub struct DocumentsBatchCursor<R> {
cursor: grenad::ReaderCursor<R>,
fields_index: DocumentsBatchIndex,
}
impl<R> DocumentsBatchCursor<R> {
pub fn into_reader(self) -> DocumentsBatchReader<R> {
let DocumentsBatchCursor { cursor, fields_index, .. } = self;
DocumentsBatchReader { cursor, fields_index }
}
pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
&self.fields_index
}
/// Resets the cursor to be able to read from the start again.
pub fn reset(&mut self) {
self.cursor.reset();

View File

@ -27,8 +27,8 @@ pub fn enrich_documents_batch<R: Read + Seek>(
autogenerate_docids: bool,
reader: DocumentsBatchReader<R>,
) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
let mut cursor = reader.into_cursor();
let mut documents_batch_index = cursor.documents_batch_index().clone();
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
@ -103,9 +103,10 @@ pub fn enrich_documents_batch<R: Read + Seek>(
}
let external_ids = writer_into_reader(external_ids)?;
let primary_key_name = primary_key.name().to_string();
let reader = EnrichedDocumentsBatchReader::new(
cursor.into_reader(),
primary_key.name().to_string(),
DocumentsBatchReader::new(cursor, documents_batch_index),
primary_key_name,
external_ids,
)?;

View File

@ -146,11 +146,11 @@ impl<'a, 'i> Transform<'a, 'i> {
R: Read + Seek,
F: Fn(UpdateIndexingStep) + Sync,
{
let mut cursor = reader.into_cursor();
let fields_index = cursor.documents_batch_index();
let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
let primary_key = cursor.primary_key().to_string();
let primary_key_id =