From 402dcd6b2fb8f21c89d3eac8d94297c3023af4b3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 19 Dec 2022 17:37:44 +0100 Subject: [PATCH] Simplify primary key inference --- milli/src/error.rs | 6 ++-- milli/src/update/index_documents/enrich.rs | 33 +++++++++++++++++----- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index bd691ab1d..0b8649067 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -130,8 +130,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco MissingDocumentId { primary_key: String, document: Object }, #[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] TooManyDocumentIds { primary_key: String, document: Object }, - #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")] - MissingPrimaryKey, + #[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")] + NoPrimaryKeyCandidateFound, + #[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())] + MultiplePrimaryKeyCandidatesFound { candidates: Vec }, #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")] NoSpaceLeftOnDevice, #[error("Index already has a primary key: `{0}`.")] diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 8874b836c..0d6a8dcbf 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -58,17 +58,36 @@ pub fn enrich_documents_batch( } }, None => { - let guessed = documents_batch_index + let mut guesses: Vec<(u16, &str)> = documents_batch_index .iter() - .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY)) - .min_by_key(|(fid, _)| *fid); - match guessed { - Some((id, name)) => PrimaryKey::flat(name.as_str(), *id), - None if autogenerate_docids => PrimaryKey::flat( + .filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY)) + .map(|(field_id, name)| (*field_id, name.as_str())) + .collect(); + + // sort the keys in a deterministic, obvious way, so that fields are always in the same order. + guesses.sort_by(|(_, left_name), (_, right_name)| { + // shortest name first + left_name.len().cmp(&right_name.len()).then_with( + // then alphabetical order + || left_name.cmp(right_name), + ) + }); + + match guesses.as_slice() { + [] if autogenerate_docids => PrimaryKey::flat( DEFAULT_PRIMARY_KEY, documents_batch_index.insert(DEFAULT_PRIMARY_KEY), ), - None => return Ok(Err(UserError::MissingPrimaryKey)), + [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), + [(field_id, name)] => PrimaryKey::flat(name, *field_id), + multiple => { + return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { + candidates: multiple + .iter() + .map(|(_, candidate)| candidate.to_string()) + .collect(), + })); + } } } };