mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 05:54:30 +01:00
Display docid when there are documents without manual embeddings for a manual embedder
This commit is contained in:
parent
553440632e
commit
2413592bbf
@ -95,6 +95,84 @@ enum ExtractionAction {
|
|||||||
DocumentOperation(DocumentOperation),
|
DocumentOperation(DocumentOperation),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ManualEmbedderErrors {
|
||||||
|
embedder_name: String,
|
||||||
|
docid: String,
|
||||||
|
other_docids: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ManualEmbedderErrors {
|
||||||
|
pub fn push_error(
|
||||||
|
errors: &mut Option<ManualEmbedderErrors>,
|
||||||
|
embedder_name: &str,
|
||||||
|
document_id: impl Fn() -> Value,
|
||||||
|
) {
|
||||||
|
match errors {
|
||||||
|
Some(errors) => {
|
||||||
|
if errors.embedder_name == embedder_name {
|
||||||
|
errors.other_docids = errors.other_docids.saturating_add(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
*errors = Some(Self {
|
||||||
|
embedder_name: embedder_name.to_owned(),
|
||||||
|
docid: document_id().to_string(),
|
||||||
|
other_docids: 0,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn to_result(
|
||||||
|
errors: Option<ManualEmbedderErrors>,
|
||||||
|
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||||
|
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||||
|
) -> Result<()> {
|
||||||
|
match errors {
|
||||||
|
Some(errors) => {
|
||||||
|
let embedder_name = &errors.embedder_name;
|
||||||
|
let mut msg = format!(
|
||||||
|
r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document {}{}",
|
||||||
|
errors.docid,
|
||||||
|
if errors.other_docids != 0 {
|
||||||
|
format!(" and at least {} other document(s)", errors.other_docids)
|
||||||
|
} else {
|
||||||
|
"".to_string()
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||||
|
|
||||||
|
let mut hint_count = 0;
|
||||||
|
|
||||||
|
for (vector_misspelling, count) in
|
||||||
|
possible_embedding_mistakes.vector_mistakes().take(2)
|
||||||
|
{
|
||||||
|
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||||
|
hint_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||||
|
.embedder_mistakes(embedder_name, unused_vectors_distribution)
|
||||||
|
.take(2)
|
||||||
|
{
|
||||||
|
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||||
|
hint_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if hint_count == 0 {
|
||||||
|
msg += &format!(
|
||||||
|
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
|
||||||
|
}
|
||||||
|
None => Ok(()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||||
///
|
///
|
||||||
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
|
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
|
||||||
@ -104,8 +182,10 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
embedders_configs: &[IndexEmbeddingConfig],
|
embedders_configs: &[IndexEmbeddingConfig],
|
||||||
settings_diff: &InnerIndexSettingsDiff,
|
settings_diff: &InnerIndexSettingsDiff,
|
||||||
|
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||||
) -> Result<(Vec<ExtractedVectorPoints>, UnusedVectorsDistribution)> {
|
) -> Result<(Vec<ExtractedVectorPoints>, UnusedVectorsDistribution)> {
|
||||||
let mut unused_vectors_distribution = UnusedVectorsDistribution::new();
|
let mut unused_vectors_distribution = UnusedVectorsDistribution::new();
|
||||||
|
let mut manual_errors = None;
|
||||||
let reindex_vectors = settings_diff.reindex_vectors();
|
let reindex_vectors = settings_diff.reindex_vectors();
|
||||||
|
|
||||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||||
@ -246,7 +326,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
for EmbedderVectorExtractor {
|
for EmbedderVectorExtractor {
|
||||||
embedder_name,
|
embedder_name,
|
||||||
embedder: _,
|
embedder,
|
||||||
prompt,
|
prompt,
|
||||||
prompts_writer,
|
prompts_writer,
|
||||||
remove_vectors_writer,
|
remove_vectors_writer,
|
||||||
@ -255,6 +335,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
action,
|
action,
|
||||||
} in extractors.iter_mut()
|
} in extractors.iter_mut()
|
||||||
{
|
{
|
||||||
|
let embedder_is_manual = matches!(**embedder, Embedder::UserProvided(_));
|
||||||
|
|
||||||
let (old, new) = parsed_vectors.remove(embedder_name);
|
let (old, new) = parsed_vectors.remove(embedder_name);
|
||||||
let delta = match action {
|
let delta = match action {
|
||||||
ExtractionAction::SettingsFullReindex => match old {
|
ExtractionAction::SettingsFullReindex => match old {
|
||||||
@ -285,11 +367,29 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
|
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
|
||||||
VectorState::Manual => VectorStateDelta::NoChange,
|
VectorState::Manual => VectorStateDelta::NoChange,
|
||||||
// generated vectors must be regenerated
|
// generated vectors must be regenerated
|
||||||
VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?,
|
VectorState::Generated => {
|
||||||
|
if embedder_is_manual {
|
||||||
|
ManualEmbedderErrors::push_error(
|
||||||
|
&mut manual_errors,
|
||||||
|
embedder_name.as_str(),
|
||||||
|
document_id,
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
regenerate_prompt(obkv, prompt, new_fields_ids_map)?
|
||||||
|
}
|
||||||
},
|
},
|
||||||
// prompt regeneration is only triggered for existing embedders
|
// prompt regeneration is only triggered for existing embedders
|
||||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
|
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
|
||||||
if old.must_regenerate() {
|
if old.must_regenerate() {
|
||||||
|
if embedder_is_manual {
|
||||||
|
ManualEmbedderErrors::push_error(
|
||||||
|
&mut manual_errors,
|
||||||
|
embedder_name.as_str(),
|
||||||
|
document_id,
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
regenerate_if_prompt_changed(
|
regenerate_if_prompt_changed(
|
||||||
obkv,
|
obkv,
|
||||||
(old_prompt, prompt),
|
(old_prompt, prompt),
|
||||||
@ -311,6 +411,9 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
(old, new),
|
(old, new),
|
||||||
(old_fields_ids_map, new_fields_ids_map),
|
(old_fields_ids_map, new_fields_ids_map),
|
||||||
document_id,
|
document_id,
|
||||||
|
embedder_name,
|
||||||
|
embedder_is_manual,
|
||||||
|
&mut manual_errors,
|
||||||
)?,
|
)?,
|
||||||
};
|
};
|
||||||
// and we finally push the unique vectors into the writer
|
// and we finally push the unique vectors into the writer
|
||||||
@ -326,6 +429,12 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
unused_vectors_distribution.append(parsed_vectors);
|
unused_vectors_distribution.append(parsed_vectors);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ManualEmbedderErrors::to_result(
|
||||||
|
manual_errors,
|
||||||
|
possible_embedding_mistakes,
|
||||||
|
&unused_vectors_distribution,
|
||||||
|
)?;
|
||||||
|
|
||||||
let mut results = Vec::new();
|
let mut results = Vec::new();
|
||||||
|
|
||||||
for EmbedderVectorExtractor {
|
for EmbedderVectorExtractor {
|
||||||
@ -371,6 +480,9 @@ fn extract_vector_document_diff(
|
|||||||
(old, new): (VectorState, VectorState),
|
(old, new): (VectorState, VectorState),
|
||||||
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
|
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
|
||||||
document_id: impl Fn() -> Value,
|
document_id: impl Fn() -> Value,
|
||||||
|
embedder_name: &str,
|
||||||
|
embedder_is_manual: bool,
|
||||||
|
manual_errors: &mut Option<ManualEmbedderErrors>,
|
||||||
) -> Result<VectorStateDelta> {
|
) -> Result<VectorStateDelta> {
|
||||||
match (old.must_regenerate(), new.must_regenerate()) {
|
match (old.must_regenerate(), new.must_regenerate()) {
|
||||||
(true, true) | (false, false) => {}
|
(true, true) | (false, false) => {}
|
||||||
@ -408,6 +520,10 @@ fn extract_vector_document_diff(
|
|||||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||||
|
|
||||||
if document_is_kept {
|
if document_is_kept {
|
||||||
|
if embedder_is_manual {
|
||||||
|
ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id);
|
||||||
|
return Ok(VectorStateDelta::NoChange);
|
||||||
|
}
|
||||||
// Don't give up if the old prompt was failing
|
// Don't give up if the old prompt was failing
|
||||||
let old_prompt = Some(&prompt).map(|p| {
|
let old_prompt = Some(&prompt).map(|p| {
|
||||||
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
|
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
|
||||||
@ -439,6 +555,10 @@ fn extract_vector_document_diff(
|
|||||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||||
if document_is_kept {
|
if document_is_kept {
|
||||||
|
if embedder_is_manual {
|
||||||
|
ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id);
|
||||||
|
return Ok(VectorStateDelta::NoChange);
|
||||||
|
}
|
||||||
// becomes autogenerated
|
// becomes autogenerated
|
||||||
VectorStateDelta::NowGenerated(prompt.render(
|
VectorStateDelta::NowGenerated(prompt.render(
|
||||||
obkv,
|
obkv,
|
||||||
|
@ -251,6 +251,7 @@ fn send_original_documents_data(
|
|||||||
indexer,
|
indexer,
|
||||||
&embedders_configs,
|
&embedders_configs,
|
||||||
&settings_diff,
|
&settings_diff,
|
||||||
|
&possible_embedding_mistakes,
|
||||||
) {
|
) {
|
||||||
Ok((extracted_vectors, unused_vectors_distribution)) => {
|
Ok((extracted_vectors, unused_vectors_distribution)) => {
|
||||||
for ExtractedVectorPoints {
|
for ExtractedVectorPoints {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user