mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 06:44:27 +01:00
Change DocumentsBatchReader to access cursor and index at same time
Otherwise it is not possible to iterate over all documents while using the fields index at the same time.
This commit is contained in:
parent
ab1571cdec
commit
fc9f3f31e7
@ -216,9 +216,9 @@ mod test {
|
||||
assert_eq!(builder.documents_count(), 2);
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
assert_eq!(index.len(), 3);
|
||||
|
||||
let document = cursor.next_document().unwrap().unwrap();
|
||||
@ -240,9 +240,9 @@ mod test {
|
||||
assert_eq!(builder.documents_count(), 2);
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
assert_eq!(index.len(), 2);
|
||||
|
||||
let document = cursor.next_document().unwrap().unwrap();
|
||||
@ -264,9 +264,9 @@ mod test {
|
||||
builder.append_csv(csv).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
|
||||
@ -292,9 +292,9 @@ mod test {
|
||||
builder.append_csv(csv).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
@ -319,9 +319,9 @@ mod test {
|
||||
builder.append_csv(csv).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
@ -346,9 +346,9 @@ mod test {
|
||||
builder.append_csv(csv).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
@ -373,9 +373,9 @@ mod test {
|
||||
builder.append_csv(csv).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
@ -400,9 +400,9 @@ mod test {
|
||||
builder.append_csv(csv).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
@ -427,9 +427,9 @@ mod test {
|
||||
builder.append_csv(csv).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
@ -454,9 +454,9 @@ mod test {
|
||||
builder.append_csv(csv).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
@ -482,8 +482,9 @@ mod test {
|
||||
builder.append_csv(csv).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
assert!(cursor.next_document().is_err());
|
||||
}
|
||||
@ -498,9 +499,9 @@ mod test {
|
||||
builder.append_csv(csv).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut cursor =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
|
@ -56,14 +56,13 @@ impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
|
||||
}
|
||||
|
||||
/// This method returns a forward cursor over the enriched documents.
|
||||
pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor<R> {
|
||||
pub fn into_cursor_and_fields_index(
|
||||
self,
|
||||
) -> (EnrichedDocumentsBatchCursor<R>, DocumentsBatchIndex) {
|
||||
let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self;
|
||||
let (documents, fields_index) = documents.into_cursor_and_fields_index();
|
||||
external_ids.reset();
|
||||
EnrichedDocumentsBatchCursor {
|
||||
documents: documents.into_cursor(),
|
||||
primary_key,
|
||||
external_ids,
|
||||
}
|
||||
(EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index)
|
||||
}
|
||||
}
|
||||
|
||||
@ -80,23 +79,9 @@ pub struct EnrichedDocumentsBatchCursor<R> {
|
||||
}
|
||||
|
||||
impl<R> EnrichedDocumentsBatchCursor<R> {
|
||||
pub fn into_reader(self) -> EnrichedDocumentsBatchReader<R> {
|
||||
let EnrichedDocumentsBatchCursor { documents, primary_key, external_ids } = self;
|
||||
EnrichedDocumentsBatchReader {
|
||||
documents: documents.into_reader(),
|
||||
primary_key,
|
||||
external_ids,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn primary_key(&self) -> &str {
|
||||
&self.primary_key
|
||||
}
|
||||
|
||||
pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
|
||||
self.documents.documents_batch_index()
|
||||
}
|
||||
|
||||
/// Resets the cursor to be able to read from the start again.
|
||||
pub fn reset(&mut self) {
|
||||
self.documents.reset();
|
||||
|
@ -203,10 +203,11 @@ mod test {
|
||||
builder.append_json_object(value.as_object().unwrap()).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut documents =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
|
||||
let (mut documents, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
assert_eq!(documents.documents_batch_index().iter().count(), 5);
|
||||
assert_eq!(index.iter().count(), 5);
|
||||
let reader = documents.next_document().unwrap().unwrap();
|
||||
assert_eq!(reader.iter().count(), 5);
|
||||
assert!(documents.next_document().unwrap().is_none());
|
||||
@ -226,9 +227,10 @@ mod test {
|
||||
builder.append_json_object(doc2.as_object().unwrap()).unwrap();
|
||||
let vector = builder.into_inner().unwrap();
|
||||
|
||||
let mut documents =
|
||||
DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor();
|
||||
assert_eq!(documents.documents_batch_index().iter().count(), 2);
|
||||
let (mut documents, index) = DocumentsBatchReader::from_reader(io::Cursor::new(vector))
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
assert_eq!(index.iter().count(), 2);
|
||||
let reader = documents.next_document().unwrap().unwrap();
|
||||
assert_eq!(reader.iter().count(), 1);
|
||||
assert!(documents.next_document().unwrap().is_some());
|
||||
@ -243,7 +245,7 @@ mod test {
|
||||
}
|
||||
}]);
|
||||
|
||||
let mut cursor = docs_reader.into_cursor();
|
||||
let (mut cursor, _) = docs_reader.into_cursor_and_fields_index();
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
|
||||
assert_eq!(nested, json!({ "toto": ["hello"] }));
|
||||
|
@ -17,6 +17,10 @@ pub struct DocumentsBatchReader<R> {
|
||||
}
|
||||
|
||||
impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
|
||||
pub fn new(cursor: DocumentsBatchCursor<R>, fields_index: DocumentsBatchIndex) -> Self {
|
||||
Self { cursor: cursor.cursor, fields_index }
|
||||
}
|
||||
|
||||
/// Construct a `DocumentsReader` from a reader.
|
||||
///
|
||||
/// It first retrieves the index, then moves to the first document. Use the `into_cursor`
|
||||
@ -46,30 +50,20 @@ impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
|
||||
}
|
||||
|
||||
/// This method returns a forward cursor over the documents.
|
||||
pub fn into_cursor(self) -> DocumentsBatchCursor<R> {
|
||||
pub fn into_cursor_and_fields_index(self) -> (DocumentsBatchCursor<R>, DocumentsBatchIndex) {
|
||||
let DocumentsBatchReader { cursor, fields_index } = self;
|
||||
let mut cursor = DocumentsBatchCursor { cursor, fields_index };
|
||||
let mut cursor = DocumentsBatchCursor { cursor };
|
||||
cursor.reset();
|
||||
cursor
|
||||
(cursor, fields_index)
|
||||
}
|
||||
}
|
||||
|
||||
/// A forward cursor over the documents in a `DocumentsBatchReader`.
|
||||
pub struct DocumentsBatchCursor<R> {
|
||||
cursor: grenad::ReaderCursor<R>,
|
||||
fields_index: DocumentsBatchIndex,
|
||||
}
|
||||
|
||||
impl<R> DocumentsBatchCursor<R> {
|
||||
pub fn into_reader(self) -> DocumentsBatchReader<R> {
|
||||
let DocumentsBatchCursor { cursor, fields_index, .. } = self;
|
||||
DocumentsBatchReader { cursor, fields_index }
|
||||
}
|
||||
|
||||
pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
|
||||
&self.fields_index
|
||||
}
|
||||
|
||||
/// Resets the cursor to be able to read from the start again.
|
||||
pub fn reset(&mut self) {
|
||||
self.cursor.reset();
|
||||
|
@ -27,8 +27,8 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
autogenerate_docids: bool,
|
||||
reader: DocumentsBatchReader<R>,
|
||||
) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
|
||||
let mut cursor = reader.into_cursor();
|
||||
let mut documents_batch_index = cursor.documents_batch_index().clone();
|
||||
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||
|
||||
let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
|
||||
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
|
||||
|
||||
@ -103,9 +103,10 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
}
|
||||
|
||||
let external_ids = writer_into_reader(external_ids)?;
|
||||
let primary_key_name = primary_key.name().to_string();
|
||||
let reader = EnrichedDocumentsBatchReader::new(
|
||||
cursor.into_reader(),
|
||||
primary_key.name().to_string(),
|
||||
DocumentsBatchReader::new(cursor, documents_batch_index),
|
||||
primary_key_name,
|
||||
external_ids,
|
||||
)?;
|
||||
|
||||
|
@ -146,11 +146,11 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
R: Read + Seek,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let mut cursor = reader.into_cursor();
|
||||
let fields_index = cursor.documents_batch_index();
|
||||
let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
|
||||
|
||||
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||
|
||||
let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
|
||||
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
|
||||
|
||||
let primary_key = cursor.primary_key().to_string();
|
||||
let primary_key_id =
|
||||
|
Loading…
Reference in New Issue
Block a user