Support the auto-generated ids when validating documents

This commit is contained in:
Kerollmops 2022-06-15 15:14:20 +02:00
parent 399eec5c01
commit 2ceeb51c37
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 13 additions and 5 deletions

View File

@ -778,7 +778,8 @@ mod tests {
let indexing_config = IndexDocumentsConfig::default(); let indexing_config = IndexDocumentsConfig::default();
let builder = let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
assert!(builder.add_documents(content).is_err()); let (_builder, user_error) = builder.add_documents(content).unwrap();
assert!(user_error.is_err());
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is no document. // Check that there is no document.
@ -943,7 +944,8 @@ mod tests {
let builder = let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
.unwrap(); .unwrap();
assert!(builder.add_documents(content).is_err()); let (_builder, user_error) = builder.add_documents(content).unwrap();
assert!(user_error.is_err());
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// First we send 1 document with a valid id. // First we send 1 document with a valid id.

View File

@ -19,20 +19,20 @@ pub fn validate_documents_batch<R: Read + Seek>(
reader: DocumentsBatchReader<R>, reader: DocumentsBatchReader<R>,
) -> Result<StdResult<DocumentsBatchReader<R>, UserError>> { ) -> Result<StdResult<DocumentsBatchReader<R>, UserError>> {
let mut cursor = reader.into_cursor(); let mut cursor = reader.into_cursor();
let documents_batch_index = cursor.documents_batch_index().clone(); let mut documents_batch_index = cursor.documents_batch_index().clone();
// The primary key *field id* that has already been set for this index or the one // The primary key *field id* that has already been set for this index or the one
// we will guess by searching for the first key that contains "id" as a substring. // we will guess by searching for the first key that contains "id" as a substring.
let (primary_key, primary_key_id) = match index.primary_key(rtxn)? { let (primary_key, primary_key_id) = match index.primary_key(rtxn)? {
Some(primary_key) => match documents_batch_index.id(primary_key) { Some(primary_key) => match documents_batch_index.id(primary_key) {
Some(id) => (primary_key, id), Some(id) => (primary_key, id),
None if autogenerate_docids => (primary_key, documents_batch_index.insert(primary_key)),
None => { None => {
return match cursor.next_document()? { return match cursor.next_document()? {
Some(first_document) => Ok(Err(UserError::MissingDocumentId { Some(first_document) => Ok(Err(UserError::MissingDocumentId {
primary_key: primary_key.to_string(), primary_key: primary_key.to_string(),
document: obkv_to_object(&first_document, &documents_batch_index)?, document: obkv_to_object(&first_document, &documents_batch_index)?,
})), })),
// If there is no document in this batch the best we can do is to return this error.
None => Ok(Err(UserError::MissingPrimaryKey)), None => Ok(Err(UserError::MissingPrimaryKey)),
}; };
} }
@ -40,10 +40,11 @@ pub fn validate_documents_batch<R: Read + Seek>(
None => { None => {
let guessed = documents_batch_index let guessed = documents_batch_index
.iter() .iter()
.filter(|(_, name)| name.contains("id")) .filter(|(_, name)| name.to_lowercase().contains("id"))
.min_by_key(|(fid, _)| *fid); .min_by_key(|(fid, _)| *fid);
match guessed { match guessed {
Some((id, name)) => (name.as_str(), *id), Some((id, name)) => (name.as_str(), *id),
None if autogenerate_docids => ("id", documents_batch_index.insert("id")),
None => return Ok(Err(UserError::MissingPrimaryKey)), None => return Ok(Err(UserError::MissingPrimaryKey)),
} }
} }
@ -56,12 +57,16 @@ pub fn validate_documents_batch<R: Read + Seek>(
_otherwise => None, _otherwise => None,
}; };
let mut count = 0;
while let Some(document) = cursor.next_document()? { while let Some(document) = cursor.next_document()? {
let document_id = match document.get(primary_key_id) { let document_id = match document.get(primary_key_id) {
Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? { Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? {
Ok(document_id) => document_id, Ok(document_id) => document_id,
Err(user_error) => return Ok(Err(user_error)), Err(user_error) => return Ok(Err(user_error)),
}, },
None if autogenerate_docids => {
format!("{{auto-generated id of the {}nth document}}", count)
}
None => { None => {
return Ok(Err(UserError::MissingDocumentId { return Ok(Err(UserError::MissingDocumentId {
primary_key: primary_key.to_string(), primary_key: primary_key.to_string(),
@ -75,6 +80,7 @@ pub fn validate_documents_batch<R: Read + Seek>(
return Ok(Err(UserError::from(user_error))); return Ok(Err(UserError::from(user_error)));
} }
} }
count += 1;
} }
Ok(Ok(cursor.into_reader())) Ok(Ok(cursor.into_reader()))