diff --git a/milli/src/error.rs b/milli/src/error.rs index bd691ab1d..0b8649067 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -130,8 +130,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco MissingDocumentId { primary_key: String, document: Object }, #[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] TooManyDocumentIds { primary_key: String, document: Object }, - #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")] - MissingPrimaryKey, + #[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")] + NoPrimaryKeyCandidateFound, + #[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())] + MultiplePrimaryKeyCandidatesFound { candidates: Vec }, #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")] NoSpaceLeftOnDevice, #[error("Index already has a primary key: `{0}`.")] diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 7eda5dca4..3331497c9 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -21,6 +21,10 @@ const DEFAULT_PRIMARY_KEY: &str = "id"; /// - all the documents id exist and are extracted, /// - the validity of them but also, /// - the validity of the `_geo` field depending on the settings. +/// +/// # Panics +/// +/// - if reader.is_empty(), this function may panic in some cases pub fn enrich_documents_batch( rtxn: &heed::RoTxn, index: &Index, @@ -49,22 +53,44 @@ pub fn enrich_documents_batch( primary_key: primary_key.to_string(), document: obkv_to_object(&first_document, &documents_batch_index)?, })), - None => Ok(Err(UserError::MissingPrimaryKey)), + None => unreachable!("Called with reader.is_empty()"), }; } }, None => { - let guessed = documents_batch_index + let mut guesses: Vec<(u16, &str)> = documents_batch_index .iter() - .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY)) - .min_by_key(|(fid, _)| *fid); - match guessed { - Some((id, name)) => PrimaryKey::flat(name.as_str(), *id), - None if autogenerate_docids => PrimaryKey::flat( + .filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY)) + .map(|(field_id, name)| (*field_id, name.as_str())) + .collect(); + + // sort the keys in a deterministic, obvious way, so that fields are always in the same order. + guesses.sort_by(|(_, left_name), (_, right_name)| { + // shortest name first + left_name.len().cmp(&right_name.len()).then_with( + // then alphabetical order + || left_name.cmp(right_name), + ) + }); + + match guesses.as_slice() { + [] if autogenerate_docids => PrimaryKey::flat( DEFAULT_PRIMARY_KEY, documents_batch_index.insert(DEFAULT_PRIMARY_KEY), ), - None => return Ok(Err(UserError::MissingPrimaryKey)), + [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), + [(field_id, name)] => { + log::info!("Primary key was not specified in index. Inferred to '{name}'"); + PrimaryKey::flat(name, *field_id) + } + multiple => { + return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { + candidates: multiple + .iter() + .map(|(_, candidate)| candidate.to_string()) + .collect(), + })); + } } } }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7b8408fe4..9e55318ca 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1658,6 +1658,12 @@ mod tests { "branch_id_number": 0 }]}; + { + let mut wtxn = index.write_txn().unwrap(); + index.put_primary_key(&mut wtxn, "id").unwrap(); + wtxn.commit().unwrap(); + } + index.add_documents(doc1).unwrap(); index.add_documents(doc2).unwrap(); @@ -1814,6 +1820,56 @@ mod tests { index.add_documents(doc4).unwrap_err(); } + #[test] + fn primary_key_inference() { + let index = TempIndex::new(); + + let doc_no_id = documents! {[{ + "title": "asdsad", + "state": "automated", + "priority": "normal", + "branch_id_number": 0 + }]}; + assert!(matches!( + index.add_documents(doc_no_id), + Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound)) + )); + + let doc_multiple_ids = documents! {[{ + "id": 228143, + "title": "something", + "state": "automated", + "priority": "normal", + "public_uid": "39c6499b", + "project_id": 78207, + "branch_id_number": 0 + }]}; + + let Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound { + candidates + })) = + index.add_documents(doc_multiple_ids) else { panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)") }; + + assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]); + + let doc_inferable = documents! {[{ + "video": "test.mp4", + "id": 228143, + "title": "something", + "state": "automated", + "priority": "normal", + "public_uid_": "39c6499b", + "project_id_": 78207, + "branch_id_number": 0 + }]}; + + index.add_documents(doc_inferable).unwrap(); + + let txn = index.read_txn().unwrap(); + + assert_eq!(index.primary_key(&txn).unwrap().unwrap(), "id"); + } + #[test] fn long_words_must_be_skipped() { let index = TempIndex::new(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f414569b9..68ef2b7ee 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -16,7 +16,7 @@ use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; -use crate::index::db_name; +use crate::index::{db_name, main_key}; use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, @@ -459,7 +459,10 @@ impl<'a, 'i> Transform<'a, 'i> { let primary_key = self .index .primary_key(wtxn)? - .ok_or(Error::UserError(UserError::MissingPrimaryKey))? + .ok_or(Error::InternalError(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + }))? .to_string(); let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; @@ -557,8 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> { mut new_fields_ids_map: FieldsIdsMap, ) -> Result { // There already has been a document addition, the primary key should be set by now. - let primary_key = - self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string(); + let primary_key = self + .index + .primary_key(wtxn)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + })? + .to_string(); let field_distribution = self.index.field_distribution(wtxn)?; // Delete the soft deleted document ids from the maps inside the external_document_ids structure