From 13c95d25aa94abd33cfab9dedd06e59fe6637505 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 19 Dec 2022 15:59:22 +0100 Subject: [PATCH 1/5] Remove uses of UserError::MissingPrimaryKey not related to inference --- milli/src/update/index_documents/enrich.rs | 6 +++++- milli/src/update/index_documents/transform.rs | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 7eda5dca4..8874b836c 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -21,6 +21,10 @@ const DEFAULT_PRIMARY_KEY: &str = "id"; /// - all the documents id exist and are extracted, /// - the validity of them but also, /// - the validity of the `_geo` field depending on the settings. +/// +/// # Panics +/// +/// - if reader.is_empty(), this function may panic in some cases pub fn enrich_documents_batch( rtxn: &heed::RoTxn, index: &Index, @@ -49,7 +53,7 @@ pub fn enrich_documents_batch( primary_key: primary_key.to_string(), document: obkv_to_object(&first_document, &documents_batch_index)?, })), - None => Ok(Err(UserError::MissingPrimaryKey)), + None => unreachable!("Called with reader.is_empty()"), }; } }, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f414569b9..68ef2b7ee 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -16,7 +16,7 @@ use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; -use crate::index::db_name; +use crate::index::{db_name, main_key}; use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, @@ -459,7 +459,10 @@ impl<'a, 'i> Transform<'a, 'i> { let primary_key = self .index .primary_key(wtxn)? - .ok_or(Error::UserError(UserError::MissingPrimaryKey))? + .ok_or(Error::InternalError(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + }))? .to_string(); let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; @@ -557,8 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> { mut new_fields_ids_map: FieldsIdsMap, ) -> Result { // There already has been a document addition, the primary key should be set by now. - let primary_key = - self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string(); + let primary_key = self + .index + .primary_key(wtxn)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + })? + .to_string(); let field_distribution = self.index.field_distribution(wtxn)?; // Delete the soft deleted document ids from the maps inside the external_document_ids structure From 402dcd6b2fb8f21c89d3eac8d94297c3023af4b3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 19 Dec 2022 17:37:44 +0100 Subject: [PATCH 2/5] Simplify primary key inference --- milli/src/error.rs | 6 ++-- milli/src/update/index_documents/enrich.rs | 33 +++++++++++++++++----- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index bd691ab1d..0b8649067 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -130,8 +130,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco MissingDocumentId { primary_key: String, document: Object }, #[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] TooManyDocumentIds { primary_key: String, document: Object }, - #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")] - MissingPrimaryKey, + #[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")] + NoPrimaryKeyCandidateFound, + #[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())] + MultiplePrimaryKeyCandidatesFound { candidates: Vec }, #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")] NoSpaceLeftOnDevice, #[error("Index already has a primary key: `{0}`.")] diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 8874b836c..0d6a8dcbf 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -58,17 +58,36 @@ pub fn enrich_documents_batch( } }, None => { - let guessed = documents_batch_index + let mut guesses: Vec<(u16, &str)> = documents_batch_index .iter() - .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY)) - .min_by_key(|(fid, _)| *fid); - match guessed { - Some((id, name)) => PrimaryKey::flat(name.as_str(), *id), - None if autogenerate_docids => PrimaryKey::flat( + .filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY)) + .map(|(field_id, name)| (*field_id, name.as_str())) + .collect(); + + // sort the keys in a deterministic, obvious way, so that fields are always in the same order. + guesses.sort_by(|(_, left_name), (_, right_name)| { + // shortest name first + left_name.len().cmp(&right_name.len()).then_with( + // then alphabetical order + || left_name.cmp(right_name), + ) + }); + + match guesses.as_slice() { + [] if autogenerate_docids => PrimaryKey::flat( DEFAULT_PRIMARY_KEY, documents_batch_index.insert(DEFAULT_PRIMARY_KEY), ), - None => return Ok(Err(UserError::MissingPrimaryKey)), + [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), + [(field_id, name)] => PrimaryKey::flat(name, *field_id), + multiple => { + return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { + candidates: multiple + .iter() + .map(|(_, candidate)| candidate.to_string()) + .collect(), + })); + } } } }; From b24def328180af74daf9b2a562fcdb24b224f873 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 21 Dec 2022 10:20:39 +0100 Subject: [PATCH 3/5] Add logging when inference took place. Displays log message in the form: ``` [2022-12-21T09:19:42Z INFO milli::update::index_documents::enrich] Primary key was not specified in index. Inferred to 'id' ``` --- milli/src/update/index_documents/enrich.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 0d6a8dcbf..3331497c9 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -79,7 +79,10 @@ pub fn enrich_documents_batch( documents_batch_index.insert(DEFAULT_PRIMARY_KEY), ), [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), - [(field_id, name)] => PrimaryKey::flat(name, *field_id), + [(field_id, name)] => { + log::info!("Primary key was not specified in index. Inferred to '{name}'"); + PrimaryKey::flat(name, *field_id) + } multiple => { return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { candidates: multiple From 59431007542cfd7851c49cda9fd0bba504ce8322 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 20 Dec 2022 11:21:29 +0100 Subject: [PATCH 4/5] Fix existing tests --- milli/src/update/index_documents/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7b8408fe4..3656a3bc4 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1658,6 +1658,12 @@ mod tests { "branch_id_number": 0 }]}; + { + let mut wtxn = index.write_txn().unwrap(); + index.put_primary_key(&mut wtxn, "id").unwrap(); + wtxn.commit().unwrap(); + } + index.add_documents(doc1).unwrap(); index.add_documents(doc2).unwrap(); From 4b166bea2b7198fc0124bf85c06aeab4b6db01f3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 20 Dec 2022 11:21:42 +0100 Subject: [PATCH 5/5] Add primary_key_inference test --- milli/src/update/index_documents/mod.rs | 50 +++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3656a3bc4..9e55318ca 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1820,6 +1820,56 @@ mod tests { index.add_documents(doc4).unwrap_err(); } + #[test] + fn primary_key_inference() { + let index = TempIndex::new(); + + let doc_no_id = documents! {[{ + "title": "asdsad", + "state": "automated", + "priority": "normal", + "branch_id_number": 0 + }]}; + assert!(matches!( + index.add_documents(doc_no_id), + Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound)) + )); + + let doc_multiple_ids = documents! {[{ + "id": 228143, + "title": "something", + "state": "automated", + "priority": "normal", + "public_uid": "39c6499b", + "project_id": 78207, + "branch_id_number": 0 + }]}; + + let Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound { + candidates + })) = + index.add_documents(doc_multiple_ids) else { panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)") }; + + assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]); + + let doc_inferable = documents! {[{ + "video": "test.mp4", + "id": 228143, + "title": "something", + "state": "automated", + "priority": "normal", + "public_uid_": "39c6499b", + "project_id_": 78207, + "branch_id_number": 0 + }]}; + + index.add_documents(doc_inferable).unwrap(); + + let txn = index.read_txn().unwrap(); + + assert_eq!(index.primary_key(&txn).unwrap().unwrap(), "id"); + } + #[test] fn long_words_must_be_skipped() { let index = TempIndex::new();