4906: Add searchable fields to template r=dureuill a=dureuill

# Pull Request

## Related issue
Fixes #4886 

See [public usage](https://meilisearch.notion.site/v1-11-AI-search-changes-0e37727193884a70999f254fa953ce6e#1dd6f0eee5a1422888e1c5d48e107cd1)

## What does this PR do?
- `Prompt::render` now requires and uses metadata to indicate if the fields are searchable or not
- Changes default template
- Updated tests
- Correctly reindex vectors when the list of searchable fields changes in a settings update.


Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-09-03 07:14:58 +00:00 committed by GitHub
commit 80408c92dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 274 additions and 24 deletions

View File

@ -5617,7 +5617,7 @@ mod tests {
},
),
prompt: PromptData {
template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}",
},
},
user_provided: RoaringBitmap<[0]>,
@ -5657,7 +5657,7 @@ mod tests {
},
),
prompt: PromptData {
template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}",
},
},
user_provided: RoaringBitmap<[]>,

View File

@ -190,7 +190,7 @@ async fn secrets_are_hidden_in_settings() {
"source": "rest",
"apiKey": "My suXXXXXX...",
"dimensions": 4,
"documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
"documentTemplate": "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}",
"url": "https://localhost:7777",
"request": "{{text}}",
"response": "{{embedding}}",

View File

@ -1,3 +1,4 @@
use std::collections::BTreeMap;
use std::sync::atomic::{AtomicUsize, Ordering};
use meili_snap::{json_string, snapshot};
@ -37,6 +38,46 @@ async fn create_mock() -> (MockServer, Value) {
(mock_server, embedder_settings)
}
async fn create_mock_map() -> (MockServer, Value) {
let mock_server = MockServer::start().await;
let text_to_embedding: BTreeMap<_, _> = vec![
// text -> embedding
("name: kefir\n", [0.0, 0.1, 0.2]),
]
// turn into btree
.into_iter()
.collect();
Mock::given(method("POST"))
.and(path("/"))
.respond_with(move |req: &Request| {
let text: String = req.body_json().unwrap();
match text_to_embedding.get(text.as_str()) {
Some(embedding) => {
ResponseTemplate::new(200).set_body_json(json!({ "data": embedding }))
}
None => ResponseTemplate::new(404)
.set_body_json(json!({"error": "text not found", "text": text})),
}
})
.mount(&mock_server)
.await;
let url = mock_server.uri();
let embedder_settings = json!({
"source": "rest",
"url": url,
"dimensions": 3,
"request": "{{text}}",
"response": {
"data": "{{embedding}}"
}
});
(mock_server, embedder_settings)
}
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
struct MultipleRequest {
input: Vec<String>,
@ -1100,6 +1141,7 @@ async fn server_returns_bad_request() {
let (response, code) = index
.update_settings(json!({
"searchableAttributes": ["name", "missing_field"],
"embedders": {
"rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": "{{embedding}}", "dimensions": 3 }),
},
@ -1115,6 +1157,10 @@ async fn server_returns_bad_request() {
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"searchableAttributes": [
"name",
"missing_field"
],
"embedders": {
"rest": {
"source": "rest",
@ -1148,7 +1194,7 @@ async fn server_returns_bad_request() {
"indexedDocuments": 0
},
"error": {
"message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\" id: 1\\\\n name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 24\"}`",
"message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1891,3 +1937,109 @@ async fn server_custom_header() {
}
"###);
}
#[actix_rt::test]
async fn searchable_reindex() {
let (_mock, setting) = create_mock_map().await;
let server = get_server_vector().await;
let index = server.index("doggo");
let (response, code) = index
.update_settings(json!({
"searchableAttributes": ["name", "missing_field"],
"embedders": {
"rest": setting,
},
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"indexUid": "doggo",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"searchableAttributes": [
"name",
"missing_field"
],
"embedders": {
"rest": {
"source": "rest",
"dimensions": 3,
"url": "[url]",
"request": "{{text}}",
"response": {
"data": "{{embedding}}"
}
}
}
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let (response, code) =
index.add_documents(json!( { "id": 1, "name": "kefir", "breed": "patou" }), None).await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"indexUid": "doggo",
"status": "succeeded",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 1
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// triggers reindexing with the new searchable attribute.
// as the mock intentionally doesn't know of this text, the task will fail, outputting the putative rendered text.
let (response, code) = index
.update_settings(json!({
"searchableAttributes": ["breed"],
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"indexUid": "doggo",
"status": "failed",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"searchableAttributes": [
"breed"
]
},
"error": {
"message": "While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}

View File

@ -5,7 +5,7 @@ use liquid::{ObjectView, ValueView};
use super::document::Document;
use super::fields::Fields;
use crate::FieldsIdsMap;
use super::FieldsIdsMapWithMetadata;
#[derive(Debug, Clone)]
pub struct Context<'a> {
@ -14,7 +14,7 @@ pub struct Context<'a> {
}
impl<'a> Context<'a> {
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMap) -> Self {
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self {
Self { document, fields: Fields::new(document, field_id_map) }
}
}

View File

@ -4,16 +4,20 @@ use liquid::model::{
use liquid::{ObjectView, ValueView};
use super::document::Document;
use crate::FieldsIdsMap;
use super::{FieldMetadata, FieldsIdsMapWithMetadata};
#[derive(Debug, Clone)]
pub struct Fields<'a>(Vec<FieldValue<'a>>);
impl<'a> Fields<'a> {
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMap) -> Self {
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self {
Self(
std::iter::repeat(document)
.zip(field_id_map.iter())
.map(|(document, (_fid, name))| FieldValue { document, name })
.map(|(document, (fid, name))| FieldValue {
document,
name,
metadata: field_id_map.metadata(fid).unwrap_or_default(),
})
.collect(),
)
}
@ -23,6 +27,7 @@ impl<'a> Fields<'a> {
pub struct FieldValue<'a> {
name: &'a str,
document: &'a Document<'a>,
metadata: FieldMetadata,
}
impl<'a> ValueView for FieldValue<'a> {
@ -74,6 +79,10 @@ impl<'a> FieldValue<'a> {
self.document.get(self.name).unwrap_or(&LiquidValue::Nil)
}
pub fn is_searchable(&self) -> &bool {
&self.metadata.searchable
}
pub fn is_empty(&self) -> bool {
self.size() == 0
}
@ -89,12 +98,14 @@ impl<'a> ObjectView for FieldValue<'a> {
}
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
Box::new(["name", "value"].iter().map(|&x| KStringCow::from_static(x)))
Box::new(["name", "value", "is_searchable"].iter().map(|&x| KStringCow::from_static(x)))
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(
std::iter::once(self.name() as &dyn ValueView).chain(std::iter::once(self.value())),
std::iter::once(self.name() as &dyn ValueView)
.chain(std::iter::once(self.value()))
.chain(std::iter::once(self.is_searchable() as &dyn ValueView)),
)
}
@ -103,13 +114,14 @@ impl<'a> ObjectView for FieldValue<'a> {
}
fn contains_key(&self, index: &str) -> bool {
index == "name" || index == "value"
index == "name" || index == "value" || index == "is_searchable"
}
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
match index {
"name" => Some(self.name()),
"value" => Some(self.value()),
"is_searchable" => Some(self.is_searchable()),
_ => None,
}
}

View File

@ -4,14 +4,16 @@ pub(crate) mod error;
mod fields;
mod template_checker;
use std::collections::BTreeMap;
use std::convert::TryFrom;
use std::ops::Deref;
use error::{NewPromptError, RenderPromptError};
use self::context::Context;
use self::document::Document;
use crate::update::del_add::DelAdd;
use crate::FieldsIdsMap;
use crate::{FieldId, FieldsIdsMap};
pub struct Prompt {
template: liquid::Template,
@ -53,8 +55,10 @@ fn default_template() -> liquid::Template {
}
fn default_template_text() -> &'static str {
"{% for field in fields %} \
"{% for field in fields %}\
{% if field.is_searchable and field.value != nil %}\
{{ field.name }}: {{ field.value }}\n\
{% endif %}\
{% endfor %}"
}
@ -93,7 +97,7 @@ impl Prompt {
&self,
document: obkv::KvReaderU16<'_>,
side: DelAdd,
field_id_map: &FieldsIdsMap,
field_id_map: &FieldsIdsMapWithMetadata,
) -> Result<String, RenderPromptError> {
let document = Document::new(document, side, field_id_map);
let context = Context::new(&document, field_id_map);
@ -102,6 +106,40 @@ impl Prompt {
}
}
pub struct FieldsIdsMapWithMetadata<'a> {
fields_ids_map: &'a FieldsIdsMap,
metadata: BTreeMap<FieldId, FieldMetadata>,
}
impl<'a> FieldsIdsMapWithMetadata<'a> {
pub fn new(fields_ids_map: &'a FieldsIdsMap, searchable_fields_ids: &'_ [FieldId]) -> Self {
let mut metadata: BTreeMap<FieldId, FieldMetadata> =
fields_ids_map.ids().map(|id| (id, Default::default())).collect();
for searchable_field_id in searchable_fields_ids {
let Some(metadata) = metadata.get_mut(searchable_field_id) else { continue };
metadata.searchable = true;
}
Self { fields_ids_map, metadata }
}
pub fn metadata(&self, field_id: FieldId) -> Option<FieldMetadata> {
self.metadata.get(&field_id).copied()
}
}
impl<'a> Deref for FieldsIdsMapWithMetadata<'a> {
type Target = FieldsIdsMap;
fn deref(&self) -> &Self::Target {
self.fields_ids_map
}
}
#[derive(Debug, Default, Clone, Copy)]
pub struct FieldMetadata {
pub searchable: bool,
}
#[cfg(test)]
mod test {
use super::Prompt;

View File

@ -15,14 +15,14 @@ use serde_json::Value;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::FaultSource;
use crate::index::IndexEmbeddingConfig;
use crate::prompt::Prompt;
use crate::prompt::{FieldsIdsMapWithMetadata, Prompt};
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME};
use crate::vector::settings::{EmbedderAction, ReindexAction};
use crate::vector::{Embedder, Embeddings};
use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort};
use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@ -189,7 +189,13 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
let reindex_vectors = settings_diff.reindex_vectors();
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
let old_fields_ids_map =
FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids);
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
let new_fields_ids_map =
FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids);
// the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
@ -376,7 +382,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
);
continue;
}
regenerate_prompt(obkv, prompt, new_fields_ids_map)?
regenerate_prompt(obkv, prompt, &new_fields_ids_map)?
}
},
// prompt regeneration is only triggered for existing embedders
@ -393,7 +399,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
regenerate_if_prompt_changed(
obkv,
(old_prompt, prompt),
(old_fields_ids_map, new_fields_ids_map),
(&old_fields_ids_map, &new_fields_ids_map),
)?
} else {
// we can simply ignore user provided vectors as they are not regenerated and are
@ -409,7 +415,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
prompt,
(add_to_user_provided, remove_from_user_provided),
(old, new),
(old_fields_ids_map, new_fields_ids_map),
(&old_fields_ids_map, &new_fields_ids_map),
document_id,
embedder_name,
embedder_is_manual,
@ -479,7 +485,10 @@ fn extract_vector_document_diff(
prompt: &Prompt,
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
(old, new): (VectorState, VectorState),
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
(old_fields_ids_map, new_fields_ids_map): (
&FieldsIdsMapWithMetadata,
&FieldsIdsMapWithMetadata,
),
document_id: impl Fn() -> Value,
embedder_name: &str,
embedder_is_manual: bool,
@ -599,7 +608,10 @@ fn extract_vector_document_diff(
fn regenerate_if_prompt_changed(
obkv: obkv::KvReader<'_, FieldId>,
(old_prompt, new_prompt): (&Prompt, &Prompt),
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
(old_fields_ids_map, new_fields_ids_map): (
&FieldsIdsMapWithMetadata,
&FieldsIdsMapWithMetadata,
),
) -> Result<VectorStateDelta> {
let old_prompt =
old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default());
@ -614,7 +626,7 @@ fn regenerate_if_prompt_changed(
fn regenerate_prompt(
obkv: obkv::KvReader<'_, FieldId>,
prompt: &Prompt,
new_fields_ids_map: &FieldsIdsMap,
new_fields_ids_map: &FieldsIdsMapWithMetadata,
) -> Result<VectorStateDelta> {
let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;

View File

@ -1238,7 +1238,7 @@ impl InnerIndexSettingsDiff {
old_settings: InnerIndexSettings,
new_settings: InnerIndexSettings,
primary_key_id: Option<FieldId>,
embedding_config_updates: BTreeMap<String, EmbedderAction>,
mut embedding_config_updates: BTreeMap<String, EmbedderAction>,
settings_update_only: bool,
) -> Self {
let only_additional_fields = match (
@ -1273,6 +1273,32 @@ impl InnerIndexSettingsDiff {
let cache_user_defined_searchables = old_settings.user_defined_searchable_fields
!= new_settings.user_defined_searchable_fields;
// if the user-defined searchables changed, then we need to reindex prompts.
if cache_user_defined_searchables {
for (embedder_name, (config, _)) in new_settings.embedding_configs.inner_as_ref() {
// skip embedders that don't use document templates
if !config.uses_document_template() {
continue;
}
// note: this could currently be entry.or_insert(..), but we're future-proofing with an explicit match
// this always makes the code clearer by explicitly handling the cases
match embedding_config_updates.entry(embedder_name.clone()) {
std::collections::btree_map::Entry::Vacant(entry) => {
entry.insert(EmbedderAction::Reindex(ReindexAction::RegeneratePrompts));
}
std::collections::btree_map::Entry::Occupied(entry) => match entry.get() {
EmbedderAction::WriteBackToDocuments(_) => { /* we are deleting this embedder, so no point in regeneration */
}
EmbedderAction::Reindex(ReindexAction::FullReindex) => { /* we are already fully reindexing */
}
EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => { /* we are already regenerating prompts */
}
},
};
}
}
InnerIndexSettingsDiff {
old: old_settings,
new: new_settings,

View File

@ -305,6 +305,16 @@ impl Embedder {
Embedder::Rest(embedder) => embedder.distribution(),
}
}
pub fn uses_document_template(&self) -> bool {
match self {
Embedder::HuggingFace(_)
| Embedder::OpenAi(_)
| Embedder::Ollama(_)
| Embedder::Rest(_) => true,
Embedder::UserProvided(_) => false,
}
}
}
/// Describes the mean and sigma of distribution of embedding similarity in the embedding space.