mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-02 01:25:45 +01:00
Merge #752
752: Simplify primary key inference r=irevoire a=dureuill # Pull Request ## Related issue Related to https://github.com/meilisearch/meilisearch/issues/3233 ## What does this PR do? ### User PoV - Change primary key inference to only consider a field as a candidate when its name ends with `id`, rather than when its name simply contains `id`. - Change primary key inference to always fail when there are multiple candidates. - Replace `UserError::MissingPrimaryKey` with `UserError::NoPrimaryKeyCandidateFound` and `UserError::MultiplePrimaryKeyCandidatesFound` ### Implementation-wise - Remove uses of `UserError::MissingPrimaryKey` not pertaining to inference. This introduces a possible panicking path. ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
commit
31155dce4c
@ -130,8 +130,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
|||||||
MissingDocumentId { primary_key: String, document: Object },
|
MissingDocumentId { primary_key: String, document: Object },
|
||||||
#[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
|
#[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
|
||||||
TooManyDocumentIds { primary_key: String, document: Object },
|
TooManyDocumentIds { primary_key: String, document: Object },
|
||||||
#[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")]
|
#[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")]
|
||||||
MissingPrimaryKey,
|
NoPrimaryKeyCandidateFound,
|
||||||
|
#[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())]
|
||||||
|
MultiplePrimaryKeyCandidatesFound { candidates: Vec<String> },
|
||||||
#[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]
|
#[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]
|
||||||
NoSpaceLeftOnDevice,
|
NoSpaceLeftOnDevice,
|
||||||
#[error("Index already has a primary key: `{0}`.")]
|
#[error("Index already has a primary key: `{0}`.")]
|
||||||
|
@ -21,6 +21,10 @@ const DEFAULT_PRIMARY_KEY: &str = "id";
|
|||||||
/// - all the documents id exist and are extracted,
|
/// - all the documents id exist and are extracted,
|
||||||
/// - the validity of them but also,
|
/// - the validity of them but also,
|
||||||
/// - the validity of the `_geo` field depending on the settings.
|
/// - the validity of the `_geo` field depending on the settings.
|
||||||
|
///
|
||||||
|
/// # Panics
|
||||||
|
///
|
||||||
|
/// - if reader.is_empty(), this function may panic in some cases
|
||||||
pub fn enrich_documents_batch<R: Read + Seek>(
|
pub fn enrich_documents_batch<R: Read + Seek>(
|
||||||
rtxn: &heed::RoTxn,
|
rtxn: &heed::RoTxn,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
@ -49,22 +53,44 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
|||||||
primary_key: primary_key.to_string(),
|
primary_key: primary_key.to_string(),
|
||||||
document: obkv_to_object(&first_document, &documents_batch_index)?,
|
document: obkv_to_object(&first_document, &documents_batch_index)?,
|
||||||
})),
|
})),
|
||||||
None => Ok(Err(UserError::MissingPrimaryKey)),
|
None => unreachable!("Called with reader.is_empty()"),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
None => {
|
None => {
|
||||||
let guessed = documents_batch_index
|
let mut guesses: Vec<(u16, &str)> = documents_batch_index
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY))
|
.filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
|
||||||
.min_by_key(|(fid, _)| *fid);
|
.map(|(field_id, name)| (*field_id, name.as_str()))
|
||||||
match guessed {
|
.collect();
|
||||||
Some((id, name)) => PrimaryKey::flat(name.as_str(), *id),
|
|
||||||
None if autogenerate_docids => PrimaryKey::flat(
|
// sort the keys in a deterministic, obvious way, so that fields are always in the same order.
|
||||||
|
guesses.sort_by(|(_, left_name), (_, right_name)| {
|
||||||
|
// shortest name first
|
||||||
|
left_name.len().cmp(&right_name.len()).then_with(
|
||||||
|
// then alphabetical order
|
||||||
|
|| left_name.cmp(right_name),
|
||||||
|
)
|
||||||
|
});
|
||||||
|
|
||||||
|
match guesses.as_slice() {
|
||||||
|
[] if autogenerate_docids => PrimaryKey::flat(
|
||||||
DEFAULT_PRIMARY_KEY,
|
DEFAULT_PRIMARY_KEY,
|
||||||
documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
|
documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
|
||||||
),
|
),
|
||||||
None => return Ok(Err(UserError::MissingPrimaryKey)),
|
[] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
||||||
|
[(field_id, name)] => {
|
||||||
|
log::info!("Primary key was not specified in index. Inferred to '{name}'");
|
||||||
|
PrimaryKey::flat(name, *field_id)
|
||||||
|
}
|
||||||
|
multiple => {
|
||||||
|
return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
||||||
|
candidates: multiple
|
||||||
|
.iter()
|
||||||
|
.map(|(_, candidate)| candidate.to_string())
|
||||||
|
.collect(),
|
||||||
|
}));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -1658,6 +1658,12 @@ mod tests {
|
|||||||
"branch_id_number": 0
|
"branch_id_number": 0
|
||||||
}]};
|
}]};
|
||||||
|
|
||||||
|
{
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
index.put_primary_key(&mut wtxn, "id").unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
index.add_documents(doc1).unwrap();
|
index.add_documents(doc1).unwrap();
|
||||||
index.add_documents(doc2).unwrap();
|
index.add_documents(doc2).unwrap();
|
||||||
|
|
||||||
@ -1814,6 +1820,56 @@ mod tests {
|
|||||||
index.add_documents(doc4).unwrap_err();
|
index.add_documents(doc4).unwrap_err();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn primary_key_inference() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
let doc_no_id = documents! {[{
|
||||||
|
"title": "asdsad",
|
||||||
|
"state": "automated",
|
||||||
|
"priority": "normal",
|
||||||
|
"branch_id_number": 0
|
||||||
|
}]};
|
||||||
|
assert!(matches!(
|
||||||
|
index.add_documents(doc_no_id),
|
||||||
|
Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound))
|
||||||
|
));
|
||||||
|
|
||||||
|
let doc_multiple_ids = documents! {[{
|
||||||
|
"id": 228143,
|
||||||
|
"title": "something",
|
||||||
|
"state": "automated",
|
||||||
|
"priority": "normal",
|
||||||
|
"public_uid": "39c6499b",
|
||||||
|
"project_id": 78207,
|
||||||
|
"branch_id_number": 0
|
||||||
|
}]};
|
||||||
|
|
||||||
|
let Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound {
|
||||||
|
candidates
|
||||||
|
})) =
|
||||||
|
index.add_documents(doc_multiple_ids) else { panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)") };
|
||||||
|
|
||||||
|
assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]);
|
||||||
|
|
||||||
|
let doc_inferable = documents! {[{
|
||||||
|
"video": "test.mp4",
|
||||||
|
"id": 228143,
|
||||||
|
"title": "something",
|
||||||
|
"state": "automated",
|
||||||
|
"priority": "normal",
|
||||||
|
"public_uid_": "39c6499b",
|
||||||
|
"project_id_": 78207,
|
||||||
|
"branch_id_number": 0
|
||||||
|
}]};
|
||||||
|
|
||||||
|
index.add_documents(doc_inferable).unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
assert_eq!(index.primary_key(&txn).unwrap().unwrap(), "id");
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn long_words_must_be_skipped() {
|
fn long_words_must_be_skipped() {
|
||||||
let index = TempIndex::new();
|
let index = TempIndex::new();
|
||||||
|
@ -16,7 +16,7 @@ use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs
|
|||||||
use super::{IndexDocumentsMethod, IndexerConfig};
|
use super::{IndexDocumentsMethod, IndexerConfig};
|
||||||
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
||||||
use crate::error::{Error, InternalError, UserError};
|
use crate::error::{Error, InternalError, UserError};
|
||||||
use crate::index::db_name;
|
use crate::index::{db_name, main_key};
|
||||||
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
|
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
|
||||||
use crate::{
|
use crate::{
|
||||||
ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
|
ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
|
||||||
@ -459,7 +459,10 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
let primary_key = self
|
let primary_key = self
|
||||||
.index
|
.index
|
||||||
.primary_key(wtxn)?
|
.primary_key(wtxn)?
|
||||||
.ok_or(Error::UserError(UserError::MissingPrimaryKey))?
|
.ok_or(Error::InternalError(InternalError::DatabaseMissingEntry {
|
||||||
|
db_name: db_name::MAIN,
|
||||||
|
key: Some(main_key::PRIMARY_KEY_KEY),
|
||||||
|
}))?
|
||||||
.to_string();
|
.to_string();
|
||||||
|
|
||||||
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||||
@ -557,8 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
mut new_fields_ids_map: FieldsIdsMap,
|
mut new_fields_ids_map: FieldsIdsMap,
|
||||||
) -> Result<TransformOutput> {
|
) -> Result<TransformOutput> {
|
||||||
// There already has been a document addition, the primary key should be set by now.
|
// There already has been a document addition, the primary key should be set by now.
|
||||||
let primary_key =
|
let primary_key = self
|
||||||
self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string();
|
.index
|
||||||
|
.primary_key(wtxn)?
|
||||||
|
.ok_or(InternalError::DatabaseMissingEntry {
|
||||||
|
db_name: db_name::MAIN,
|
||||||
|
key: Some(main_key::PRIMARY_KEY_KEY),
|
||||||
|
})?
|
||||||
|
.to_string();
|
||||||
let field_distribution = self.index.field_distribution(wtxn)?;
|
let field_distribution = self.index.field_distribution(wtxn)?;
|
||||||
|
|
||||||
// Delete the soft deleted document ids from the maps inside the external_document_ids structure
|
// Delete the soft deleted document ids from the maps inside the external_document_ids structure
|
||||||
|
Loading…
Reference in New Issue
Block a user