Mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-29)
Merge #752
752: Simplify primary key inference r=irevoire a=dureuill

# Pull Request

## Related issue

Related to https://github.com/meilisearch/meilisearch/issues/3233

## What does this PR do?

### User PoV

- Change primary key inference to only consider an attribute as a candidate when its name ends with "id", rather than when it simply contains "id".
- Change primary key inference to always fail when there are multiple candidates.
- Replace `UserError::MissingPrimaryKey` with `UserError::NoPrimaryKeyCandidateFound` and `UserError::MultiplePrimaryKeyCandidatesFound`.

### Implementation-wise

- Remove uses of `UserError::MissingPrimaryKey` not pertaining to inference. This introduces a possible panicking path.

## PR checklist

Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
Commit: `31155dce4c`
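The inference rule described in the PR is small enough to sketch in isolation. The snippet below is not milli code: `guess_primary_key`, the `Inference` enum, and the plain slice of field names are hypothetical stand-ins, used only to illustrate the ends-with-`id` filter, the deterministic ordering (shortest name first, then alphabetical), and the three possible outcomes.

```rust
/// Outcome of primary-key inference, mirroring the new user errors.
#[derive(Debug, PartialEq)]
enum Inference {
    NoCandidate,
    Single(String),
    Multiple(Vec<String>),
}

/// Hypothetical helper: keep only field names that end with "id" (case-insensitive),
/// order them deterministically, then decide between the three outcomes.
fn guess_primary_key(field_names: &[&str]) -> Inference {
    let mut candidates: Vec<&str> = field_names
        .iter()
        .copied()
        .filter(|name| name.to_lowercase().ends_with("id"))
        .collect();

    // Shortest name first, then alphabetical, so the reported candidates are stable.
    candidates.sort_by(|left, right| left.len().cmp(&right.len()).then_with(|| left.cmp(right)));

    match candidates.as_slice() {
        [] => Inference::NoCandidate,
        [single] => Inference::Single(single.to_string()),
        multiple => Inference::Multiple(multiple.iter().map(|s| s.to_string()).collect()),
    }
}

fn main() {
    // "branch_id_number" contains "id" but does not end with it, so it is no longer a candidate.
    assert_eq!(
        guess_primary_key(&["title", "branch_id_number"]),
        Inference::NoCandidate
    );

    // Several fields end with "id": inference now fails instead of silently picking one.
    assert_eq!(
        guess_primary_key(&["id", "public_uid", "project_id", "branch_id_number"]),
        Inference::Multiple(vec!["id".into(), "project_id".into(), "public_uid".into()])
    );

    // Exactly one field ends with "id": it is inferred as the primary key.
    assert_eq!(
        guess_primary_key(&["video", "id", "title"]),
        Inference::Single("id".into())
    );
}
```

When several candidates remain, the sorted order is what ends up in the error's `candidates` list, which is why the new test further down expects `["id", "project_id", "public_uid"]`.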
```diff
@@ -130,8 +130,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
     MissingDocumentId { primary_key: String, document: Object },
     #[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
     TooManyDocumentIds { primary_key: String, document: Object },
-    #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")]
-    MissingPrimaryKey,
+    #[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")]
+    NoPrimaryKeyCandidateFound,
+    #[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())]
+    MultiplePrimaryKeyCandidatesFound { candidates: Vec<String> },
     #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]
     NoSpaceLeftOnDevice,
     #[error("Index already has a primary key: `{0}`.")]
```
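The display strings above use thiserror's field-access shorthand inside the `#[error(...)]` attribute. The `.get(0).unwrap()` and `.get(1).unwrap()` calls can only panic if fewer than two candidates are stored, and the inference code further down only builds this variant from the multi-candidate branch, so at least two entries are always present. A minimal standalone sketch of how the new messages render, assuming a `thiserror` dependency and trimming the enum to the two new variants:

```rust
use thiserror::Error;

#[derive(Debug, Error)]
enum UserError {
    #[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")]
    NoPrimaryKeyCandidateFound,
    #[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())]
    MultiplePrimaryKeyCandidatesFound { candidates: Vec<String> },
}

fn main() {
    let err = UserError::MultiplePrimaryKeyCandidatesFound {
        // Candidates arrive pre-sorted (shortest name first, then alphabetical),
        // so the two examples quoted in the message are stable across runs.
        candidates: vec!["id".into(), "project_id".into(), "public_uid".into()],
    };
    // Renders the count (3) and the first two candidates ('id' and 'project_id').
    println!("{err}");

    println!("{}", UserError::NoPrimaryKeyCandidateFound);
}
```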
```diff
@@ -21,6 +21,10 @@ const DEFAULT_PRIMARY_KEY: &str = "id";
 /// - all the documents id exist and are extracted,
 /// - the validity of them but also,
 /// - the validity of the `_geo` field depending on the settings.
+///
+/// # Panics
+///
+/// - if reader.is_empty(), this function may panic in some cases
 pub fn enrich_documents_batch<R: Read + Seek>(
     rtxn: &heed::RoTxn,
     index: &Index,
```
```diff
@@ -49,22 +53,44 @@ pub fn enrich_documents_batch<R: Read + Seek>(
                         primary_key: primary_key.to_string(),
                         document: obkv_to_object(&first_document, &documents_batch_index)?,
                     })),
-                    None => Ok(Err(UserError::MissingPrimaryKey)),
+                    None => unreachable!("Called with reader.is_empty()"),
                 };
             }
         },
         None => {
-            let guessed = documents_batch_index
+            let mut guesses: Vec<(u16, &str)> = documents_batch_index
                 .iter()
-                .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY))
-                .min_by_key(|(fid, _)| *fid);
-            match guessed {
-                Some((id, name)) => PrimaryKey::flat(name.as_str(), *id),
-                None if autogenerate_docids => PrimaryKey::flat(
+                .filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
+                .map(|(field_id, name)| (*field_id, name.as_str()))
+                .collect();
+
+            // sort the keys in a deterministic, obvious way, so that fields are always in the same order.
+            guesses.sort_by(|(_, left_name), (_, right_name)| {
+                // shortest name first
+                left_name.len().cmp(&right_name.len()).then_with(
+                    // then alphabetical order
+                    || left_name.cmp(right_name),
+                )
+            });
+
+            match guesses.as_slice() {
+                [] if autogenerate_docids => PrimaryKey::flat(
                     DEFAULT_PRIMARY_KEY,
                     documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
                 ),
-                None => return Ok(Err(UserError::MissingPrimaryKey)),
+                [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
+                [(field_id, name)] => {
+                    log::info!("Primary key was not specified in index. Inferred to '{name}'");
+                    PrimaryKey::flat(name, *field_id)
+                }
+                multiple => {
+                    return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
+                        candidates: multiple
+                            .iter()
+                            .map(|(_, candidate)| candidate.to_string())
+                            .collect(),
+                    }));
+                }
             }
         }
     };
```
```diff
@@ -1658,6 +1658,12 @@ mod tests {
             "branch_id_number": 0
         }]};
 
+        {
+            let mut wtxn = index.write_txn().unwrap();
+            index.put_primary_key(&mut wtxn, "id").unwrap();
+            wtxn.commit().unwrap();
+        }
+
         index.add_documents(doc1).unwrap();
         index.add_documents(doc2).unwrap();
 
```
```diff
@@ -1814,6 +1820,56 @@ mod tests {
         index.add_documents(doc4).unwrap_err();
     }
 
+    #[test]
+    fn primary_key_inference() {
+        let index = TempIndex::new();
+
+        let doc_no_id = documents! {[{
+            "title": "asdsad",
+            "state": "automated",
+            "priority": "normal",
+            "branch_id_number": 0
+        }]};
+        assert!(matches!(
+            index.add_documents(doc_no_id),
+            Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound))
+        ));
+
+        let doc_multiple_ids = documents! {[{
+            "id": 228143,
+            "title": "something",
+            "state": "automated",
+            "priority": "normal",
+            "public_uid": "39c6499b",
+            "project_id": 78207,
+            "branch_id_number": 0
+        }]};
+
+        let Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound {
+            candidates
+        })) =
+            index.add_documents(doc_multiple_ids) else { panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)") };
+
+        assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]);
+
+        let doc_inferable = documents! {[{
+            "video": "test.mp4",
+            "id": 228143,
+            "title": "something",
+            "state": "automated",
+            "priority": "normal",
+            "public_uid_": "39c6499b",
+            "project_id_": 78207,
+            "branch_id_number": 0
+        }]};
+
+        index.add_documents(doc_inferable).unwrap();
+
+        let txn = index.read_txn().unwrap();
+
+        assert_eq!(index.primary_key(&txn).unwrap().unwrap(), "id");
+    }
+
     #[test]
     fn long_words_must_be_skipped() {
         let index = TempIndex::new();
```
```diff
@@ -16,7 +16,7 @@ use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs
 use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
-use crate::index::db_name;
+use crate::index::{db_name, main_key};
 use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
 use crate::{
     ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
```
```diff
@@ -459,7 +459,10 @@ impl<'a, 'i> Transform<'a, 'i> {
         let primary_key = self
             .index
             .primary_key(wtxn)?
-            .ok_or(Error::UserError(UserError::MissingPrimaryKey))?
+            .ok_or(Error::InternalError(InternalError::DatabaseMissingEntry {
+                db_name: db_name::MAIN,
+                key: Some(main_key::PRIMARY_KEY_KEY),
+            }))?
             .to_string();
 
         let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
```
```diff
@@ -557,8 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> {
         mut new_fields_ids_map: FieldsIdsMap,
     ) -> Result<TransformOutput> {
         // There already has been a document addition, the primary key should be set by now.
-        let primary_key =
-            self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string();
+        let primary_key = self
+            .index
+            .primary_key(wtxn)?
+            .ok_or(InternalError::DatabaseMissingEntry {
+                db_name: db_name::MAIN,
+                key: Some(main_key::PRIMARY_KEY_KEY),
+            })?
+            .to_string();
         let field_distribution = self.index.field_distribution(wtxn)?;
 
         // Delete the soft deleted document ids from the maps inside the external_document_ids structure
```