752: Simplify primary key inference r=irevoire a=dureuill

# Pull Request

## Related issue
Related to https://github.com/meilisearch/meilisearch/issues/3233

## What does this PR do?

### User PoV

- Change primary key inference to only consider a value as a candidate when it ends with "id", rather than when it simply contains "id".
- Change primary key inference to always fail when there are multiple candidates.
- Replace UserError::MissingPrimaryKey with `UserError::NoPrimaryKeyCandidateFound` and `UserError::MultiplePrimaryKeyCandidatesFound`

### Implementation-wise

- Remove uses of UserError::MissingPrimaryKey not pertaining to inference. This introduces a possible panicking path.

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
bors[bot] 2023-01-02 11:44:22 +00:00 committed by GitHub
commit 31155dce4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 107 additions and 14 deletions

View File

@ -130,8 +130,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
MissingDocumentId { primary_key: String, document: Object },
#[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
TooManyDocumentIds { primary_key: String, document: Object },
#[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")]
MissingPrimaryKey,
#[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")]
NoPrimaryKeyCandidateFound,
#[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())]
MultiplePrimaryKeyCandidatesFound { candidates: Vec<String> },
#[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]
NoSpaceLeftOnDevice,
#[error("Index already has a primary key: `{0}`.")]

View File

@ -21,6 +21,10 @@ const DEFAULT_PRIMARY_KEY: &str = "id";
/// - all the documents id exist and are extracted,
/// - the validity of them but also,
/// - the validity of the `_geo` field depending on the settings.
///
/// # Panics
///
/// - if reader.is_empty(), this function may panic in some cases
pub fn enrich_documents_batch<R: Read + Seek>(
rtxn: &heed::RoTxn,
index: &Index,
@ -49,22 +53,44 @@ pub fn enrich_documents_batch<R: Read + Seek>(
primary_key: primary_key.to_string(),
document: obkv_to_object(&first_document, &documents_batch_index)?,
})),
None => Ok(Err(UserError::MissingPrimaryKey)),
None => unreachable!("Called with reader.is_empty()"),
};
}
},
None => {
let guessed = documents_batch_index
let mut guesses: Vec<(u16, &str)> = documents_batch_index
.iter()
.filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY))
.min_by_key(|(fid, _)| *fid);
match guessed {
Some((id, name)) => PrimaryKey::flat(name.as_str(), *id),
None if autogenerate_docids => PrimaryKey::flat(
.filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
.map(|(field_id, name)| (*field_id, name.as_str()))
.collect();
// sort the keys in a deterministic, obvious way, so that fields are always in the same order.
guesses.sort_by(|(_, left_name), (_, right_name)| {
// shortest name first
left_name.len().cmp(&right_name.len()).then_with(
// then alphabetical order
|| left_name.cmp(right_name),
)
});
match guesses.as_slice() {
[] if autogenerate_docids => PrimaryKey::flat(
DEFAULT_PRIMARY_KEY,
documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
),
None => return Ok(Err(UserError::MissingPrimaryKey)),
[] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
[(field_id, name)] => {
log::info!("Primary key was not specified in index. Inferred to '{name}'");
PrimaryKey::flat(name, *field_id)
}
multiple => {
return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
candidates: multiple
.iter()
.map(|(_, candidate)| candidate.to_string())
.collect(),
}));
}
}
}
};

View File

@ -1658,6 +1658,12 @@ mod tests {
"branch_id_number": 0
}]};
{
let mut wtxn = index.write_txn().unwrap();
index.put_primary_key(&mut wtxn, "id").unwrap();
wtxn.commit().unwrap();
}
index.add_documents(doc1).unwrap();
index.add_documents(doc2).unwrap();
@ -1814,6 +1820,56 @@ mod tests {
index.add_documents(doc4).unwrap_err();
}
#[test]
fn primary_key_inference() {
let index = TempIndex::new();
let doc_no_id = documents! {[{
"title": "asdsad",
"state": "automated",
"priority": "normal",
"branch_id_number": 0
}]};
assert!(matches!(
index.add_documents(doc_no_id),
Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound))
));
let doc_multiple_ids = documents! {[{
"id": 228143,
"title": "something",
"state": "automated",
"priority": "normal",
"public_uid": "39c6499b",
"project_id": 78207,
"branch_id_number": 0
}]};
let Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound {
candidates
})) =
index.add_documents(doc_multiple_ids) else { panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)") };
assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]);
let doc_inferable = documents! {[{
"video": "test.mp4",
"id": 228143,
"title": "something",
"state": "automated",
"priority": "normal",
"public_uid_": "39c6499b",
"project_id_": 78207,
"branch_id_number": 0
}]};
index.add_documents(doc_inferable).unwrap();
let txn = index.read_txn().unwrap();
assert_eq!(index.primary_key(&txn).unwrap().unwrap(), "id");
}
#[test]
fn long_words_must_be_skipped() {
let index = TempIndex::new();

View File

@ -16,7 +16,7 @@ use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs
use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
use crate::index::{db_name, main_key};
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
use crate::{
ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
@ -459,7 +459,10 @@ impl<'a, 'i> Transform<'a, 'i> {
let primary_key = self
.index
.primary_key(wtxn)?
.ok_or(Error::UserError(UserError::MissingPrimaryKey))?
.ok_or(Error::InternalError(InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN,
key: Some(main_key::PRIMARY_KEY_KEY),
}))?
.to_string();
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
@ -557,8 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> {
mut new_fields_ids_map: FieldsIdsMap,
) -> Result<TransformOutput> {
// There already has been a document addition, the primary key should be set by now.
let primary_key =
self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string();
let primary_key = self
.index
.primary_key(wtxn)?
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN,
key: Some(main_key::PRIMARY_KEY_KEY),
})?
.to_string();
let field_distribution = self.index.field_distribution(wtxn)?;
// Delete the soft deleted document ids from the maps inside the external_document_ids structure