From 6d52c5b2f0a7fb30c29705f883e7f908a9aab285 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sat, 31 Oct 2020 21:46:55 +0100 Subject: [PATCH] Introduce a parameter to disable the engine to autogenerate docids --- src/update/index_documents/mod.rs | 63 +++++++++++++++++++++++++ src/update/index_documents/transform.rs | 24 ++++++++-- 2 files changed, 83 insertions(+), 4 deletions(-) diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 25264e7ae..8a1571328 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -202,6 +202,7 @@ pub struct IndexDocuments<'t, 'u, 'i> { indexing_jobs: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, + autogenerate_docids: bool, } impl<'t, 'u, 'i> IndexDocuments<'t, 'u, 'i> { @@ -219,6 +220,7 @@ impl<'t, 'u, 'i> IndexDocuments<'t, 'u, 'i> { indexing_jobs: None, update_method: IndexDocumentsMethod::ReplaceDocuments, update_format: UpdateFormat::Json, + autogenerate_docids: true, } } @@ -272,6 +274,16 @@ impl<'t, 'u, 'i> IndexDocuments<'t, 'u, 'i> { self } + pub fn enable_autogenerate_docids(&mut self) -> &mut Self { + self.autogenerate_docids = true; + self + } + + pub fn disable_autogenerate_docids(&mut self) -> &mut Self { + self.autogenerate_docids = false; + self + } + pub fn execute(self, reader: R, progress_callback: F) -> anyhow::Result<()> where R: io::Read, @@ -288,6 +300,7 @@ impl<'t, 'u, 'i> IndexDocuments<'t, 'u, 'i> { max_nb_chunks: self.max_nb_chunks, max_memory: self.max_memory, index_documents_method: self.update_method, + autogenerate_docids: self.autogenerate_docids, }; let output = match self.update_format { @@ -636,6 +649,56 @@ mod tests { drop(rtxn); } + #[test] + fn not_auto_generated_csv_documents_ids() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // First we send 3 documents with 
no ids at all. + let mut wtxn = index.write_txn().unwrap(); + let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index); + builder.disable_autogenerate_docids(); + builder.update_format(UpdateFormat::Csv); + assert!(builder.execute(content, |_, _| ()).is_err()); + wtxn.commit().unwrap(); + + // Check that there is no document. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 0); + drop(rtxn); + } + + #[test] + fn not_auto_generated_json_documents_ids() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // First we send 3 documents, 2 of which have no id. + let mut wtxn = index.write_txn().unwrap(); + let content = &br#"[ + { "name": "kevina", "id": 21 }, + { "name": "kevin" }, + { "name": "benoit" } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index); + builder.disable_autogenerate_docids(); + builder.update_format(UpdateFormat::Json); + assert!(builder.execute(content, |_, _| ()).is_err()); + wtxn.commit().unwrap(); + + // Check that there is no document. 
+ let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 0); + drop(rtxn); + } + #[test] fn simple_auto_generated_documents_ids() { let path = tempfile::tempdir().unwrap(); diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs index b68d37b42..1a4c599eb 100644 --- a/src/update/index_documents/transform.rs +++ b/src/update/index_documents/transform.rs @@ -33,6 +33,7 @@ pub struct Transform<'t, 'i> { pub max_nb_chunks: Option, pub max_memory: Option, pub index_documents_method: IndexDocumentsMethod, + pub autogenerate_docids: bool, } impl Transform<'_, '_> { @@ -57,7 +58,14 @@ impl Transform<'_, '_> { None => { match documents.get(0).and_then(|doc| doc.keys().find(|k| k.contains("id"))) { Some(key) => fields_ids_map.insert(&key).context("field id limit reached")?, - None => fields_ids_map.insert("id").context("field id limit reached")?, + None => { + if !self.autogenerate_docids { + // If there is no primary key in the current document batch, we must + // return an error and not automatically generate any document id. + return Err(anyhow!("missing primary key")) + } + fields_ids_map.insert("id").context("field id limit reached")? 
+ }, } }, }; @@ -130,6 +138,9 @@ impl Transform<'_, '_> { _ => return Err(anyhow!("documents ids must be either strings or numbers")), }, None => { + if !self.autogenerate_docids { + return Err(anyhow!("missing primary key")); + } let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); Cow::Borrowed(uuid) }, @@ -180,11 +191,16 @@ impl Transform<'_, '_> { let primary_key_field_id = match user_id_pos { Some(pos) => fields_ids_map.id(&headers[pos]).expect("found the primary key"), None => { - let id = fields_ids_map.insert("id").context("field id limit reached")?; + if !self.autogenerate_docids { + // If there is no primary key in the current document batch, we must + // return an error and not automatically generate any document id. + return Err(anyhow!("missing primary key")) + } + let field_id = fields_ids_map.insert("id").context("field id limit reached")?; // We make sure to add the primary key field id to the fields ids, // this way it is added to the obks. - fields_ids.push((id, usize::max_value())); - id + fields_ids.push((field_id, usize::max_value())); + field_id }, };