Simplify primary key inference

This commit is contained in:
Louis Dureuil 2022-12-19 17:37:44 +01:00
parent 13c95d25aa
commit 402dcd6b2f
No known key found for this signature in database
2 changed files with 30 additions and 9 deletions

View File

@ -130,8 +130,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
MissingDocumentId { primary_key: String, document: Object },
#[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
TooManyDocumentIds { primary_key: String, document: Object },
#[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")]
MissingPrimaryKey,
#[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")]
NoPrimaryKeyCandidateFound,
#[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())]
MultiplePrimaryKeyCandidatesFound { candidates: Vec<String> },
#[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]
NoSpaceLeftOnDevice,
#[error("Index already has a primary key: `{0}`.")]

View File

@ -58,17 +58,36 @@ pub fn enrich_documents_batch<R: Read + Seek>(
}
},
None => {
let guessed = documents_batch_index
let mut guesses: Vec<(u16, &str)> = documents_batch_index
.iter()
.filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY))
.min_by_key(|(fid, _)| *fid);
match guessed {
Some((id, name)) => PrimaryKey::flat(name.as_str(), *id),
None if autogenerate_docids => PrimaryKey::flat(
.filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
.map(|(field_id, name)| (*field_id, name.as_str()))
.collect();
// sort the keys in a deterministic, obvious way, so that fields are always in the same order.
guesses.sort_by(|(_, left_name), (_, right_name)| {
// shortest name first
left_name.len().cmp(&right_name.len()).then_with(
// then alphabetical order
|| left_name.cmp(right_name),
)
});
match guesses.as_slice() {
[] if autogenerate_docids => PrimaryKey::flat(
DEFAULT_PRIMARY_KEY,
documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
),
None => return Ok(Err(UserError::MissingPrimaryKey)),
[] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
[(field_id, name)] => PrimaryKey::flat(name, *field_id),
multiple => {
return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
candidates: multiple
.iter()
.map(|(_, candidate)| candidate.to_string())
.collect(),
}));
}
}
}
};