mirror of https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 20:07:09 +02:00

Merge branch 'main' into granular-filterable-attributes

commit 6d52c6e711
227 changed files with 6074 additions and 1626 deletions
@@ -13,6 +13,7 @@ use thiserror::Error;
 use crate::constants::RESERVED_GEO_FIELD_NAME;
 use crate::documents::{self, DocumentsBatchCursorError};
 use crate::thread_pool_no_abort::PanicCatched;
+use crate::vector::settings::EmbeddingSettings;
 use crate::{CriterionError, DocumentId, FieldId, Object, SortError};
 
 pub fn is_reserved_keyword(keyword: &str) -> bool {
@@ -240,28 +241,52 @@ and can not be more than 511 bytes.", .document_id.to_string()
     InvalidSimilarEmbedder(String),
     #[error("Too many vectors for document with id {0}: found {1}, but limited to 256.")]
     TooManyVectors(String, usize),
-    #[error("`.embedders.{embedder_name}`: Field `{field}` unavailable for source `{source_}` (only available for sources: {}). Available fields: {}",
-        allowed_sources_for_field
-            .iter()
-            .map(|accepted| format!("`{}`", accepted))
-            .collect::<Vec<String>>()
-            .join(", "),
-        allowed_fields_for_source
-            .iter()
-            .map(|accepted| format!("`{}`", accepted))
-            .collect::<Vec<String>>()
-            .join(", ")
-    )]
+    #[error("`.embedders.{embedder_name}`: Field `{field}` unavailable for source `{source_}`{for_context}.{available_sources}{available_fields}{available_contexts}",
+        field=field.name(),
+        for_context={
+            context.in_context()
+        },
+        available_sources={
+            let allowed_sources_for_field = EmbeddingSettings::allowed_sources_for_field(*field, *context);
+            if allowed_sources_for_field.is_empty() {
+                String::new()
+            } else {
+                format!("\n - note: `{}` is available for sources: {}",
+                    field.name(),
+                    allowed_sources_for_field
+                        .iter()
+                        .map(|accepted| format!("`{}`", accepted))
+                        .collect::<Vec<String>>()
+                        .join(", "),
+                )
+            }
+        },
+        available_fields={
+            let allowed_fields_for_source = EmbeddingSettings::allowed_fields_for_source(*source_, *context);
+            format!("\n - note: available fields for source `{source_}`{}: {}", context.in_context(), allowed_fields_for_source
+                .iter()
+                .map(|accepted| format!("`{}`", accepted))
+                .collect::<Vec<String>>()
+                .join(", "),)
+        },
+        available_contexts={
+            let available_not_nested = !matches!(EmbeddingSettings::field_status(*source_, *field, crate::vector::settings::NestingContext::NotNested), crate::vector::settings::FieldStatus::Disallowed);
+            if available_not_nested {
+                format!("\n - note: `{}` is available when source `{source_}` is not{}", field.name(), context.in_context())
+            } else {
+                String::new()
+            }
+        }
+    )]
     InvalidFieldForSource {
         embedder_name: String,
         source_: crate::vector::settings::EmbedderSource,
-        field: &'static str,
-        allowed_fields_for_source: &'static [&'static str],
-        allowed_sources_for_field: &'static [crate::vector::settings::EmbedderSource],
+        context: crate::vector::settings::NestingContext,
+        field: crate::vector::settings::MetaEmbeddingSetting,
     },
     #[error("`.embedders.{embedder_name}.model`: Invalid model `{model}` for OpenAI. Supported models: {:?}", crate::vector::openai::EmbeddingModel::supported_models())]
     InvalidOpenAiModel { embedder_name: String, model: String },
-    #[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source {source_})")]
+    #[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source `{source_}`)")]
     MissingFieldForSource {
         field: &'static str,
         source_: crate::vector::settings::EmbedderSource,
@@ -281,6 +306,15 @@ and can not be more than 511 bytes.", .document_id.to_string()
         dimensions: usize,
         max_dimensions: usize,
     },
+    #[error("`.embedders.{embedder_name}.source`: Source `{source_}` is not available in a nested embedder")]
+    InvalidSourceForNested {
+        embedder_name: String,
+        source_: crate::vector::settings::EmbedderSource,
+    },
+    #[error("`.embedders.{embedder_name}`: Missing field `source`.\n - note: this field is mandatory for nested embedders")]
+    MissingSourceForNested { embedder_name: String },
+    #[error("`.embedders.{embedder_name}`: {message}")]
+    InvalidSettingsEmbedder { embedder_name: String, message: String },
     #[error("`.embedders.{embedder_name}.dimensions`: `dimensions` cannot be zero")]
     InvalidSettingsDimensions { embedder_name: String },
     #[error(
@@ -203,7 +203,7 @@ impl<'a> Search<'a> {
 
                 let deadline = std::time::Instant::now() + std::time::Duration::from_secs(3);
 
-                match embedder.embed_one(query, Some(deadline)) {
+                match embedder.embed_search(query, Some(deadline)) {
                     Ok(embedding) => embedding,
                     Err(error) => {
                         tracing::error!(error=%error, "Embedding failed");
@@ -786,7 +786,7 @@ fn embed_chunks(
     unused_vectors_distribution: &UnusedVectorsDistribution,
     request_threads: &ThreadPoolNoAbort,
 ) -> Result<Vec<Vec<Embedding>>> {
-    match embedder.embed_chunks(text_chunks, request_threads) {
+    match embedder.embed_index(text_chunks, request_threads) {
         Ok(chunks) => Ok(chunks),
         Err(error) => {
             if let FaultSource::Bug = error.fault {
@@ -2772,6 +2772,8 @@ mod tests {
                 response: Setting::NotSet,
                 distribution: Setting::NotSet,
                 headers: Setting::NotSet,
+                search_embedder: Setting::NotSet,
+                indexing_embedder: Setting::NotSet,
                 binary_quantized: Setting::NotSet,
             }),
         );
@@ -416,7 +416,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
                 return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)));
             }
 
-            let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) {
+            let res = match embedder.embed_index_ref(texts.as_slice(), threads) {
                 Ok(embeddings) => {
                     for (docid, embedding) in ids.into_iter().zip(embeddings) {
                         sender.set_vector(*docid, embedder_id, embedding).unwrap();
File diff suppressed because it is too large

956  crates/milli/src/update/test_settings.rs  Normal file
@@ -0,0 +1,956 @@
use big_s::S;
use heed::types::Bytes;
use maplit::{btreemap, btreeset};
use meili_snap::snapshot;

use super::*;
use crate::error::Error;
use crate::index::tests::TempIndex;
use crate::update::ClearDocuments;
use crate::{db_snap, Criterion, Filter, SearchResult};

#[test]
fn set_and_reset_searchable_fields() {
    let index = TempIndex::new();

    // First we send 3 documents with ids from 1 to 3.
    let mut wtxn = index.write_txn().unwrap();

    index
        .add_documents_using_wtxn(
            &mut wtxn,
            documents!([
                { "id": 1, "name": "kevin", "age": 23 },
                { "id": 2, "name": "kevina", "age": 21},
                { "id": 3, "name": "benoit", "age": 34 }
            ]),
        )
        .unwrap();

    // We change the searchable fields to be the "name" field only.
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_searchable_fields(vec!["name".into()]);
        })
        .unwrap();

    wtxn.commit().unwrap();

    db_snap!(index, fields_ids_map, @r###"
    0   id |
    1   name |
    2   age |
    "###);
    db_snap!(index, searchable_fields, @r###"["name"]"###);
    db_snap!(index, fieldids_weights_map, @r###"
    fid weight
    1   0 |
    "###);

    // Check that the searchable field is correctly set to "name" only.
    let rtxn = index.read_txn().unwrap();
    // When we search for something that is not in
    // the searchable fields it must not return any document.
    let result = index.search(&rtxn).query("23").execute().unwrap();
    assert_eq!(result.documents_ids, Vec::<u32>::new());

    // When we search for something that is in the searchable fields
    // we must find the appropriate document.
    let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap();
    let documents = index.documents(&rtxn, result.documents_ids).unwrap();
    let fid_map = index.fields_ids_map(&rtxn).unwrap();
    assert_eq!(documents.len(), 1);
    assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
    drop(rtxn);

    // We now reset the searchable fields.
    index
        .update_settings(|settings| {
            settings.reset_searchable_fields();
        })
        .unwrap();

    db_snap!(index, fields_ids_map, @r###"
    0   id |
    1   name |
    2   age |
    "###);
    db_snap!(index, searchable_fields, @r###"["id", "name", "age"]"###);
    db_snap!(index, fieldids_weights_map, @r###"
    fid weight
    0   0 |
    1   0 |
    2   0 |
    "###);

    // Check that the searchable fields have been reset and documents are found now.
    let rtxn = index.read_txn().unwrap();
    let fid_map = index.fields_ids_map(&rtxn).unwrap();
    let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap();
    snapshot!(format!("{user_defined_searchable_fields:?}"), @"None");
    // the searchable fields should contain all the fields
    let searchable_fields = index.searchable_fields(&rtxn).unwrap();
    snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###);
    let result = index.search(&rtxn).query("23").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 1);
    let documents = index.documents(&rtxn, result.documents_ids).unwrap();
    assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
}

#[test]
fn mixup_searchable_with_displayed_fields() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    // First we send 3 documents with ids from 0 to 2.
    index
        .add_documents_using_wtxn(
            &mut wtxn,
            documents!([
                { "id": 0, "name": "kevin", "age": 23},
                { "id": 1, "name": "kevina", "age": 21 },
                { "id": 2, "name": "benoit", "age": 34 }
            ]),
        )
        .unwrap();

    // In the same transaction we change the displayed fields to be only the "age".
    // We also change the searchable fields to be the "name" field only.
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_displayed_fields(vec!["age".into()]);
            settings.set_searchable_fields(vec!["name".into()]);
        })
        .unwrap();
    wtxn.commit().unwrap();

    // Check that the displayed fields are correctly set to only the "age" field.
    let rtxn = index.read_txn().unwrap();
    let fields_ids = index.displayed_fields(&rtxn).unwrap();
    assert_eq!(fields_ids.unwrap(), (&["age"][..]));
    drop(rtxn);

    // We now reset the searchable fields.
    index
        .update_settings(|settings| {
            settings.reset_searchable_fields();
        })
        .unwrap();

    // Check that the displayed fields still contain only the "age" field.
    let rtxn = index.read_txn().unwrap();
    let fields_ids = index.displayed_fields(&rtxn).unwrap();
    assert_eq!(fields_ids.unwrap(), &["age"][..]);
}

#[test]
fn default_displayed_fields() {
    let index = TempIndex::new();

    // First we send 3 documents with ids from 0 to 2.
    index
        .add_documents(documents!([
            { "id": 0, "name": "kevin", "age": 23},
            { "id": 1, "name": "kevina", "age": 21 },
            { "id": 2, "name": "benoit", "age": 34 }
        ]))
        .unwrap();

    // Check that the displayed fields are correctly set to `None` (default value).
    let rtxn = index.read_txn().unwrap();
    let fields_ids = index.displayed_fields(&rtxn).unwrap();
    assert_eq!(fields_ids, None);
}

#[test]
fn set_and_reset_displayed_field() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    index
        .add_documents_using_wtxn(
            &mut wtxn,
            documents!([
                { "id": 0, "name": "kevin", "age": 23},
                { "id": 1, "name": "kevina", "age": 21 },
                { "id": 2, "name": "benoit", "age": 34 }
            ]),
        )
        .unwrap();
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_displayed_fields(vec!["age".into()]);
        })
        .unwrap();
    wtxn.commit().unwrap();

    // Check that the displayed fields are correctly set to only the "age" field.
    let rtxn = index.read_txn().unwrap();
    let fields_ids = index.displayed_fields(&rtxn).unwrap();
    assert_eq!(fields_ids.unwrap(), &["age"][..]);
    drop(rtxn);

    // We reset the fields ids to become `None`, the default value.
    index
        .update_settings(|settings| {
            settings.reset_displayed_fields();
        })
        .unwrap();

    // Check that the displayed fields are correctly set to `None` (default value).
    let rtxn = index.read_txn().unwrap();
    let fields_ids = index.displayed_fields(&rtxn).unwrap();
    assert_eq!(fields_ids, None);
}

#[test]
fn set_filterable_fields() {
    let index = TempIndex::new();

    // Set the filterable fields to be the age.
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("age"))]);
        })
        .unwrap();

    // Then index some documents.
    index
        .add_documents(documents!([
            { "id": 0, "name": "kevin", "age": 23},
            { "id": 1, "name": "kevina", "age": 21 },
            { "id": 2, "name": "benoit", "age": 34 }
        ]))
        .unwrap();

    // Check that the filterable fields are correctly set.
    let rtxn = index.read_txn().unwrap();
    // Only count the field_id 2 and level 0 facet values.
    // TODO we must support typed CSVs for numbers to be understood.
    let fidmap = index.fields_ids_map(&rtxn).unwrap();
    for document in index.all_documents(&rtxn).unwrap() {
        let document = document.unwrap();
        let json =
            crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1).unwrap();
        println!("json: {:?}", json);
    }
    let count = index
        .facet_id_f64_docids
        .remap_key_type::<Bytes>()
        // The faceted field id is 2u16
        .prefix_iter(&rtxn, &[0, 2, 0])
        .unwrap()
        .count();
    assert_eq!(count, 3);
    drop(rtxn);

    // Index a few more documents with new and current facets values.
    index
        .add_documents(documents!([
            { "id": 3, "name": "kevin2", "age": 23},
            { "id": 4, "name": "kevina2", "age": 21 },
            { "id": 5, "name": "benoit", "age": 35 }
        ]))
        .unwrap();

    let rtxn = index.read_txn().unwrap();
    // Only count the field_id 2 and level 0 facet values.
    let count = index
        .facet_id_f64_docids
        .remap_key_type::<Bytes>()
        .prefix_iter(&rtxn, &[0, 2, 0])
        .unwrap()
        .count();
    assert_eq!(count, 4);

    // Set the filterable fields to be the age and the name.
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(vec![
                FilterableAttributesRule::Field(S("age")),
                FilterableAttributesRule::Field(S("name")),
            ]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();
    // Only count the field_id 2 and level 0 facet values.
    let count = index
        .facet_id_f64_docids
        .remap_key_type::<Bytes>()
        .prefix_iter(&rtxn, &[0, 2, 0])
        .unwrap()
        .count();
    assert_eq!(count, 4);

    let rtxn = index.read_txn().unwrap();
    // Only count the field_id 1 and level 0 facet values.
    let count = index
        .facet_id_string_docids
        .remap_key_type::<Bytes>()
        .prefix_iter(&rtxn, &[0, 1])
        .unwrap()
        .count();
    assert_eq!(count, 5);

    // Remove the age from the filterable fields.
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("name"))]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();
    // Only count the field_id 2 and level 0 facet values.
    let count = index
        .facet_id_f64_docids
        .remap_key_type::<Bytes>()
        .prefix_iter(&rtxn, &[0, 2, 0])
        .unwrap()
        .count();
    assert_eq!(count, 0);

    let rtxn = index.read_txn().unwrap();
    // Only count the field_id 1 and level 0 facet values.
    let count = index
        .facet_id_string_docids
        .remap_key_type::<Bytes>()
        .prefix_iter(&rtxn, &[0, 1])
        .unwrap()
        .count();
    assert_eq!(count, 5);
}

#[test]
fn set_asc_desc_field() {
    let index = TempIndex::new();

    // Set the ranking rules to sort ascending by age.
    index
        .update_settings(|settings| {
            settings.set_displayed_fields(vec![S("name")]);
            settings.set_criteria(vec![Criterion::Asc("age".to_owned())]);
        })
        .unwrap();

    // Then index some documents.
    index
        .add_documents(documents!([
            { "id": 0, "name": "kevin", "age": 23},
            { "id": 1, "name": "kevina", "age": 21 },
            { "id": 2, "name": "benoit", "age": 34 }
        ]))
        .unwrap();

    // Run an empty query just to ensure that the search results are ordered.
    let rtxn = index.read_txn().unwrap();
    let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();
    let documents = index.documents(&rtxn, documents_ids).unwrap();

    // Fetch the documents "age" field in the order in which the documents appear.
    let age_field_id = index.fields_ids_map(&rtxn).unwrap().id("age").unwrap();
    let iter = documents.into_iter().map(|(_, doc)| {
        let bytes = doc.get(age_field_id).unwrap();
        let string = std::str::from_utf8(bytes).unwrap();
        string.parse::<u32>().unwrap()
    });

    assert_eq!(iter.collect::<Vec<_>>(), vec![21, 23, 34]);
}

#[test]
fn set_distinct_field() {
    let index = TempIndex::new();

    // Set the distinct field to the age.
    index
        .update_settings(|settings| {
            // Don't display the generated `id` field.
            settings.set_displayed_fields(vec![S("name"), S("age")]);
            settings.set_distinct_field(S("age"));
        })
        .unwrap();

    // Then index some documents.
    index
        .add_documents(documents!([
            { "id": 0, "name": "kevin", "age": 23 },
            { "id": 1, "name": "kevina", "age": 21 },
            { "id": 2, "name": "benoit", "age": 34 },
            { "id": 3, "name": "bernard", "age": 34 },
            { "id": 4, "name": "bertrand", "age": 34 },
            { "id": 5, "name": "bernie", "age": 34 },
            { "id": 6, "name": "ben", "age": 34 }
        ]))
        .unwrap();

    // Run an empty query just to ensure that the search results are ordered.
    let rtxn = index.read_txn().unwrap();
    let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();

    // Only one of the documents with age 34 must be kept, so 3 documents remain.
    assert_eq!(documents_ids.len(), 3);
}

#[test]
fn set_nested_distinct_field() {
    let index = TempIndex::new();

    // Set the distinct field to the nested person.age.
    index
        .update_settings(|settings| {
            // Don't display the generated `id` field.
            settings.set_displayed_fields(vec![S("person")]);
            settings.set_distinct_field(S("person.age"));
        })
        .unwrap();

    // Then index some documents.
    index
        .add_documents(documents!([
            { "id": 0, "person": { "name": "kevin", "age": 23 }},
            { "id": 1, "person": { "name": "kevina", "age": 21 }},
            { "id": 2, "person": { "name": "benoit", "age": 34 }},
            { "id": 3, "person": { "name": "bernard", "age": 34 }},
            { "id": 4, "person": { "name": "bertrand", "age": 34 }},
            { "id": 5, "person": { "name": "bernie", "age": 34 }},
            { "id": 6, "person": { "name": "ben", "age": 34 }}
        ]))
        .unwrap();

    // Run an empty query just to ensure that the search results are ordered.
    let rtxn = index.read_txn().unwrap();
    let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();

    // Only one of the documents with age 34 must be kept, so 3 documents remain.
    assert_eq!(documents_ids.len(), 3);
}

#[test]
fn default_stop_words() {
    let index = TempIndex::new();

    // First we send 3 documents with ids from 0 to 2.
    index
        .add_documents(documents!([
            { "id": 0, "name": "kevin", "age": 23},
            { "id": 1, "name": "kevina", "age": 21 },
            { "id": 2, "name": "benoit", "age": 34 }
        ]))
        .unwrap();

    // Ensure there are no stop_words by default
    let rtxn = index.read_txn().unwrap();
    let stop_words = index.stop_words(&rtxn).unwrap();
    assert!(stop_words.is_none());
}

#[test]
fn set_and_reset_stop_words() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    // First we send 3 documents with ids from 0 to 2.
    index
        .add_documents_using_wtxn(
            &mut wtxn,
            documents!([
                { "id": 0, "name": "kevin", "age": 23, "maxim": "I love dogs" },
                { "id": 1, "name": "kevina", "age": 21, "maxim": "Doggos are the best" },
                { "id": 2, "name": "benoit", "age": 34, "maxim": "The crepes are really good" },
            ]),
        )
        .unwrap();

    // In the same transaction we provide some stop_words
    let set = btreeset! { "i".to_string(), "the".to_string(), "are".to_string() };
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_stop_words(set.clone());
        })
        .unwrap();

    wtxn.commit().unwrap();

    // Ensure stop_words are effectively stored
    let rtxn = index.read_txn().unwrap();
    let stop_words = index.stop_words(&rtxn).unwrap();
    assert!(stop_words.is_some()); // at this point the index should return something

    let stop_words = stop_words.unwrap();
    let expected = fst::Set::from_iter(&set).unwrap();
    assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes());

    // when we search for something that is a non-prefix stop word it should be ignored
    // thus we should get a placeholder search (all the results = 3)
    let result = index.search(&rtxn).query("the ").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 3);
    let result = index.search(&rtxn).query("i ").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 3);
    let result = index.search(&rtxn).query("are ").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 3);

    let result = index.search(&rtxn).query("dog").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
    let result = index.search(&rtxn).query("benoît").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data

    // now we'll reset the stop_words and ensure it's None
    index
        .update_settings(|settings| {
            settings.reset_stop_words();
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();
    let stop_words = index.stop_words(&rtxn).unwrap();
    assert!(stop_words.is_none());

    // now we can search for the stop words
    let result = index.search(&rtxn).query("the").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 2);
    let result = index.search(&rtxn).query("i").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 1);
    let result = index.search(&rtxn).query("are").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 2);

    // the rest of the search is still not impacted
    let result = index.search(&rtxn).query("dog").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
    let result = index.search(&rtxn).query("benoît").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
}

#[test]
fn set_and_reset_synonyms() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    // Send 3 documents with ids from 0 to 2.
    index
        .add_documents_using_wtxn(
            &mut wtxn,
            documents!([
                { "id": 0, "name": "kevin", "age": 23, "maxim": "I love dogs"},
                { "id": 1, "name": "kevina", "age": 21, "maxim": "Doggos are the best"},
                { "id": 2, "name": "benoit", "age": 34, "maxim": "The crepes are really good"},
            ]),
        )
        .unwrap();

    // In the same transaction provide some synonyms
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_synonyms(btreemap! {
                "blini".to_string() => vec!["crepes".to_string()],
                "super like".to_string() => vec!["love".to_string()],
                "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()]
            });
        })
        .unwrap();
    wtxn.commit().unwrap();

    // Ensure synonyms are effectively stored
    let rtxn = index.read_txn().unwrap();
    let synonyms = index.synonyms(&rtxn).unwrap();
    assert!(!synonyms.is_empty()); // at this point the index should return something

    // Check that we can use synonyms
    let result = index.search(&rtxn).query("blini").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 1);
    let result = index.search(&rtxn).query("super like").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 1);
    let result = index.search(&rtxn).query("puppies").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 2);

    // Reset the synonyms
    index
        .update_settings(|settings| {
            settings.reset_synonyms();
        })
        .unwrap();

    // Ensure synonyms are reset
    let rtxn = index.read_txn().unwrap();
    let synonyms = index.synonyms(&rtxn).unwrap();
    assert!(synonyms.is_empty());

    // Check that synonyms no longer work
    let result = index.search(&rtxn).query("blini").execute().unwrap();
    assert!(result.documents_ids.is_empty());
    let result = index.search(&rtxn).query("super like").execute().unwrap();
    assert!(result.documents_ids.is_empty());
    let result = index.search(&rtxn).query("puppies").execute().unwrap();
    assert!(result.documents_ids.is_empty());
}

#[test]
fn thai_synonyms() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    // Send 2 documents.
    index
        .add_documents_using_wtxn(
            &mut wtxn,
            documents!([
                { "id": 0, "name": "ยี่ปุ่น" },
                { "id": 1, "name": "ญี่ปุ่น" },
            ]),
        )
        .unwrap();

    // In the same transaction provide some synonyms
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_synonyms(btreemap! {
                "japanese".to_string() => vec![S("ญี่ปุ่น"), S("ยี่ปุ่น")],
            });
        })
        .unwrap();
    wtxn.commit().unwrap();

    // Ensure synonyms are effectively stored
    let rtxn = index.read_txn().unwrap();
    let synonyms = index.synonyms(&rtxn).unwrap();
    assert!(!synonyms.is_empty()); // at this point the index should return something

    // Check that we can use synonyms
    let result = index.search(&rtxn).query("japanese").execute().unwrap();
    assert_eq!(result.documents_ids.len(), 2);
}

#[test]
fn setting_searchable_recomputes_other_settings() {
    let index = TempIndex::new();

    // Set all the settings except searchable
    index
        .update_settings(|settings| {
            settings.set_displayed_fields(vec!["hello".to_string()]);
            settings.set_filterable_fields(vec![
                FilterableAttributesRule::Field(S("age")),
                FilterableAttributesRule::Field(S("toto")),
            ]);
            settings.set_criteria(vec![Criterion::Asc(S("toto"))]);
        })
        .unwrap();

    // check the output
    let rtxn = index.read_txn().unwrap();
    assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap());
    // since no documents have been pushed the primary key is still unset
    assert!(index.primary_key(&rtxn).unwrap().is_none());
    assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap());
    drop(rtxn);

    // We set toto and age as searchable to force reordering of the fields
    index
        .update_settings(|settings| {
            settings.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();
    assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap());
    assert!(index.primary_key(&rtxn).unwrap().is_none());
    assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap());
}

#[test]
fn setting_not_filterable_cant_filter() {
    let index = TempIndex::new();

    // Set all the settings except searchable
    index
        .update_settings(|settings| {
            settings.set_displayed_fields(vec!["hello".to_string()]);
            // Only Asc(toto) is set: a facet database exists, but filtering with toto is denied.
            settings.set_criteria(vec![Criterion::Asc(S("toto"))]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();
    let filter = Filter::from_str("toto = 32").unwrap().unwrap();
    let _ = filter.evaluate(&rtxn, &index).unwrap_err();
}

#[test]
fn setting_primary_key() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    // Set the primary key settings
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_primary_key(S("mykey"));
        })
        .unwrap();
    wtxn.commit().unwrap();
    let mut wtxn = index.write_txn().unwrap();
    assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey"));

    // Then index some documents with the "mykey" primary key.
    index
        .add_documents_using_wtxn(
            &mut wtxn,
            documents!([
                { "mykey": 1, "name": "kevin", "age": 23 },
                { "mykey": 2, "name": "kevina", "age": 21 },
                { "mykey": 3, "name": "benoit", "age": 34 },
                { "mykey": 4, "name": "bernard", "age": 34 },
                { "mykey": 5, "name": "bertrand", "age": 34 },
                { "mykey": 6, "name": "bernie", "age": 34 },
                { "mykey": 7, "name": "ben", "age": 34 }
            ]),
        )
        .unwrap();
    wtxn.commit().unwrap();

    // Updating settings with the same primary key should do nothing
    let mut wtxn = index.write_txn().unwrap();
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_primary_key(S("mykey"));
        })
        .unwrap();
    assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey"));
    wtxn.commit().unwrap();

    // Updating the settings with a different (or no) primary key causes an error
    let mut wtxn = index.write_txn().unwrap();
    let error = index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.reset_primary_key();
        })
        .unwrap_err();
    assert!(matches!(error, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_))));
    wtxn.abort();

    // But if we clear the database...
    let mut wtxn = index.write_txn().unwrap();
    let builder = ClearDocuments::new(&mut wtxn, &index);
    builder.execute().unwrap();
    wtxn.commit().unwrap();

    // ...we can change the primary key
    index
        .update_settings(|settings| {
            settings.set_primary_key(S("myid"));
        })
        .unwrap();
}

#[test]
fn setting_impact_relevancy() {
    let index = TempIndex::new();

    // Set the genres setting
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("genres"))]);
        })
        .unwrap();

    index.add_documents(documents!([
        {
            "id": 11,
            "title": "Star Wars",
            "overview":
                "Princess Leia is captured and held hostage by the evil Imperial forces in their effort to take over the galactic Empire. Venturesome Luke Skywalker and dashing captain Han Solo team together with the loveable robot duo R2-D2 and C-3PO to rescue the beautiful princess and restore peace and justice in the Empire.",
            "genres": ["Adventure", "Action", "Science Fiction"],
            "poster": "https://image.tmdb.org/t/p/w500/6FfCtAuVAW8XJjZ7eWeLibRLWTw.jpg",
            "release_date": 233366400
        },
        {
            "id": 30,
            "title": "Magnetic Rose",
            "overview": "",
            "genres": ["Animation", "Science Fiction"],
            "poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg",
            "release_date": 819676800
        }
    ])).unwrap();

    let rtxn = index.read_txn().unwrap();
    let SearchResult { documents_ids, .. } = index.search(&rtxn).query("S").execute().unwrap();
    let first_id = documents_ids[0];
    let documents = index.documents(&rtxn, documents_ids).unwrap();
    let (_, content) = documents.iter().find(|(id, _)| *id == first_id).unwrap();

    let fid = index.fields_ids_map(&rtxn).unwrap().id("title").unwrap();
    let line = std::str::from_utf8(content.get(fid).unwrap()).unwrap();
    assert_eq!(line, r#""Star Wars""#);
}

#[test]
fn test_disable_typo() {
    let index = TempIndex::new();

    let mut txn = index.write_txn().unwrap();
    assert!(index.authorize_typos(&txn).unwrap());

    index
        .update_settings_using_wtxn(&mut txn, |settings| {
            settings.set_autorize_typos(false);
        })
        .unwrap();

    assert!(!index.authorize_typos(&txn).unwrap());
}

#[test]
fn update_min_word_len_for_typo() {
    let index = TempIndex::new();

    // Set the minimum word lengths for typos.
    index
        .update_settings(|settings| {
            settings.set_min_word_len_one_typo(8);
            settings.set_min_word_len_two_typos(8);
        })
        .unwrap();

    let txn = index.read_txn().unwrap();
    assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 8);
    assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 8);

    index
        .update_settings(|settings| {
            settings.reset_min_word_len_one_typo();
            settings.reset_min_word_len_two_typos();
        })
        .unwrap();

    let txn = index.read_txn().unwrap();
    assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO);
    assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS);
}

#[test]
fn update_invalid_min_word_len_for_typo() {
    let index = TempIndex::new();

    // Set invalid minimum word lengths for typos (one-typo length above the two-typo length).
    index
        .update_settings(|settings| {
            settings.set_min_word_len_one_typo(10);
            settings.set_min_word_len_two_typos(7);
        })
        .unwrap_err();
}

#[test]
fn update_exact_words_normalization() {
    let index = TempIndex::new();

    let mut txn = index.write_txn().unwrap();
    // Set the exact words setting
    index
        .update_settings_using_wtxn(&mut txn, |settings| {
            let words = btreeset! { S("Ab"), S("ac") };
            settings.set_exact_words(words);
        })
        .unwrap();

    let exact_words = index.exact_words(&txn).unwrap().unwrap();
    for word in exact_words.into_fst().stream().into_str_vec().unwrap() {
        assert!(word.0 == "ac" || word.0 == "ab");
    }
}

#[test]
fn test_correct_settings_init() {
    let index = TempIndex::new();

    index
        .update_settings(|settings| {
            // we don't actually update the settings, just check their content
            let Settings {
                wtxn: _,
                index: _,
                indexer_config: _,
                searchable_fields,
                displayed_fields,
                filterable_fields,
                sortable_fields,
                criteria,
                stop_words,
                non_separator_tokens,
                separator_tokens,
                dictionary,
                distinct_field,
                synonyms,
                primary_key,
                authorize_typos,
                min_word_len_two_typos,
                min_word_len_one_typo,
                exact_words,
                exact_attributes,
                max_values_per_facet,
                sort_facet_values_by,
                pagination_max_total_hits,
                proximity_precision,
                embedder_settings,
                search_cutoff,
                localized_attributes_rules,
                prefix_search,
                facet_search,
            } = settings;
            assert!(matches!(searchable_fields, Setting::NotSet));
            assert!(matches!(displayed_fields, Setting::NotSet));
            assert!(matches!(filterable_fields, Setting::NotSet));
            assert!(matches!(sortable_fields, Setting::NotSet));
            assert!(matches!(criteria, Setting::NotSet));
            assert!(matches!(stop_words, Setting::NotSet));
            assert!(matches!(non_separator_tokens, Setting::NotSet));
            assert!(matches!(separator_tokens, Setting::NotSet));
            assert!(matches!(dictionary, Setting::NotSet));
            assert!(matches!(distinct_field, Setting::NotSet));
            assert!(matches!(synonyms, Setting::NotSet));
            assert!(matches!(primary_key, Setting::NotSet));
            assert!(matches!(authorize_typos, Setting::NotSet));
            assert!(matches!(min_word_len_two_typos, Setting::NotSet));
            assert!(matches!(min_word_len_one_typo, Setting::NotSet));
            assert!(matches!(exact_words, Setting::NotSet));
            assert!(matches!(exact_attributes, Setting::NotSet));
            assert!(matches!(max_values_per_facet, Setting::NotSet));
            assert!(matches!(sort_facet_values_by, Setting::NotSet));
            assert!(matches!(pagination_max_total_hits, Setting::NotSet));
            assert!(matches!(proximity_precision, Setting::NotSet));
            assert!(matches!(embedder_settings, Setting::NotSet));
            assert!(matches!(search_cutoff, Setting::NotSet));
            assert!(matches!(localized_attributes_rules, Setting::NotSet));
            assert!(matches!(prefix_search, Setting::NotSet));
            assert!(matches!(facet_search, Setting::NotSet));
        })
        .unwrap();
}

#[test]
fn settings_must_ignore_soft_deleted() {
    use serde_json::json;

    let index = TempIndex::new();

    let mut docs = vec![];
    for i in 0..10 {
        docs.push(json!({ "id": i, "title": format!("{:x}", i) }));
    }
    index.add_documents(documents! { docs }).unwrap();

    index.delete_documents((0..5).map(|id| id.to_string()).collect());

    let mut wtxn = index.write_txn().unwrap();
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_searchable_fields(vec!["id".to_string()]);
        })
        .unwrap();
    wtxn.commit().unwrap();

    let rtxn = index.write_txn().unwrap();
    let docs: StdResult<Vec<_>, _> = index.all_documents(&rtxn).unwrap().collect();
    let docs = docs.unwrap();
    assert_eq!(docs.len(), 5);
}
@@ -39,9 +39,8 @@ pub fn upgrade(
         (1, 12, 0..=2) => 0,
         (1, 12, 3..) => 1,
         (1, 13, 0) => 2,
         (1, 13, 1) => 3,
-        // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other.
-        (1, 13, _) => return Ok(false),
+        (1, 13, _) => 3,
         (major, minor, patch) => {
            return Err(InternalError::CannotUpgradeToVersion(major, minor, patch).into())
        }
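The match above maps an on-disk (major, minor, patch) version to the index of the first upgrade step that still needs to run; handling the current `(1, 13, _)` version explicitly matters because a failed upgrade can leave some indexes migrated and others not. A minimal sketch of how such an index could drive the remaining steps (hypothetical helper, not code from this commit):

    // Sketch only: `steps` is an ordered list of single-version upgrade
    // functions and `start` is the index produced by the version match above.
    fn run_remaining_upgrades(
        start: usize,
        steps: &[fn() -> Result<(), String>],
    ) -> Result<(), String> {
        for step in &steps[start..] {
            step()?; // each step moves the index one stored version forward
        }
        Ok(())
    }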
280  crates/milli/src/vector/composite.rs  Normal file
@@ -0,0 +1,280 @@
use std::time::Instant;

use arroy::Distance;

use super::error::CompositeEmbedderContainsHuggingFace;
use super::{
    hf, manual, ollama, openai, rest, DistributionShift, EmbedError, Embedding, NewEmbedderError,
};
use crate::ThreadPoolNoAbort;

#[derive(Debug)]
pub enum SubEmbedder {
    /// An embedder based on running local models, fetched from the Hugging Face Hub.
    HuggingFace(hf::Embedder),
    /// An embedder based on making embedding queries against the OpenAI API.
    OpenAi(openai::Embedder),
    /// An embedder based on the user providing the embeddings in the documents and queries.
    UserProvided(manual::Embedder),
    /// An embedder based on making embedding queries against an <https://ollama.com> embedding server.
    Ollama(ollama::Embedder),
    /// An embedder based on making embedding queries against a generic JSON/REST embedding server.
    Rest(rest::Embedder),
}

#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub enum SubEmbedderOptions {
    HuggingFace(hf::EmbedderOptions),
    OpenAi(openai::EmbedderOptions),
    Ollama(ollama::EmbedderOptions),
    UserProvided(manual::EmbedderOptions),
    Rest(rest::EmbedderOptions),
}

impl SubEmbedderOptions {
    pub fn distribution(&self) -> Option<DistributionShift> {
        match self {
            SubEmbedderOptions::HuggingFace(embedder_options) => embedder_options.distribution,
            SubEmbedderOptions::OpenAi(embedder_options) => embedder_options.distribution,
            SubEmbedderOptions::Ollama(embedder_options) => embedder_options.distribution,
            SubEmbedderOptions::UserProvided(embedder_options) => embedder_options.distribution,
            SubEmbedderOptions::Rest(embedder_options) => embedder_options.distribution,
        }
    }
}

#[derive(Debug)]
pub struct Embedder {
    pub(super) search: SubEmbedder,
    pub(super) index: SubEmbedder,
}

#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
    pub search: SubEmbedderOptions,
    pub index: SubEmbedderOptions,
}

impl Embedder {
    pub fn new(
        EmbedderOptions { search, index }: EmbedderOptions,
    ) -> Result<Self, NewEmbedderError> {
        let search = SubEmbedder::new(search)?;
        let index = SubEmbedder::new(index)?;

        // check dimensions
        if search.dimensions() != index.dimensions() {
            return Err(NewEmbedderError::composite_dimensions_mismatch(
                search.dimensions(),
                index.dimensions(),
            ));
        }
        // check similarity
        let search_embeddings = search
            .embed(
                vec![
                    "test".into(),
                    "a brave dog".into(),
                    "This is a sample text. It is meant to compare similarity.".into(),
                ],
                None,
            )
            .map_err(|error| NewEmbedderError::composite_test_embedding_failed(error, "search"))?;

        let index_embeddings = index
            .embed(
                vec![
                    "test".into(),
                    "a brave dog".into(),
                    "This is a sample text. It is meant to compare similarity.".into(),
                ],
                None,
            )
            .map_err(|error| {
                NewEmbedderError::composite_test_embedding_failed(error, "indexing")
            })?;

        let hint = configuration_hint(&search, &index);

        check_similarity(search_embeddings, index_embeddings, hint)?;

        Ok(Self { search, index })
    }

    /// Indicates the dimensions of a single embedding produced by the embedder.
    pub fn dimensions(&self) -> usize {
        // can use the dimensions of any embedder since they should match
        self.index.dimensions()
    }

    /// An optional distribution used to apply an affine transformation to the similarity score of a document.
    pub fn distribution(&self) -> Option<DistributionShift> {
        // 3 cases here:
        // 1. distribution provided by user => use that one, which was stored in search
        // 2. no user-provided distribution, distribution in search embedder => use that one
        // 3. no user-provided distribution, no distribution in search embedder => use the distribution in indexing embedder
        self.search.distribution().or_else(|| self.index.distribution())
    }
}

impl SubEmbedder {
    pub fn new(options: SubEmbedderOptions) -> std::result::Result<Self, NewEmbedderError> {
        Ok(match options {
            SubEmbedderOptions::HuggingFace(options) => {
                Self::HuggingFace(hf::Embedder::new(options)?)
            }
            SubEmbedderOptions::OpenAi(options) => Self::OpenAi(openai::Embedder::new(options)?),
            SubEmbedderOptions::Ollama(options) => Self::Ollama(ollama::Embedder::new(options)?),
            SubEmbedderOptions::UserProvided(options) => {
                Self::UserProvided(manual::Embedder::new(options))
            }
            SubEmbedderOptions::Rest(options) => {
                Self::Rest(rest::Embedder::new(options, rest::ConfigurationSource::User)?)
            }
        })
    }

    pub fn embed(
        &self,
        texts: Vec<String>,
        deadline: Option<Instant>,
    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.embed(texts),
            SubEmbedder::OpenAi(embedder) => embedder.embed(&texts, deadline),
            SubEmbedder::Ollama(embedder) => embedder.embed(&texts, deadline),
            SubEmbedder::UserProvided(embedder) => embedder.embed(&texts),
            SubEmbedder::Rest(embedder) => embedder.embed(texts, deadline),
        }
    }

    /// Embed multiple chunks of texts.
    ///
    /// Each chunk is composed of one or multiple texts.
    pub fn embed_index(
        &self,
        text_chunks: Vec<Vec<String>>,
        threads: &ThreadPoolNoAbort,
    ) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks),
            SubEmbedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads),
            SubEmbedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads),
            SubEmbedder::UserProvided(embedder) => embedder.embed_index(text_chunks),
            SubEmbedder::Rest(embedder) => embedder.embed_index(text_chunks, threads),
        }
    }

    /// Non-owning variant of [`Self::embed_index`].
    pub fn embed_index_ref(
        &self,
        texts: &[&str],
        threads: &ThreadPoolNoAbort,
    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts),
            SubEmbedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads),
            SubEmbedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads),
            SubEmbedder::UserProvided(embedder) => embedder.embed_index_ref(texts),
            SubEmbedder::Rest(embedder) => embedder.embed_index_ref(texts, threads),
        }
    }

    /// Indicates the preferred number of chunks to pass to [`Self::embed_index`]
    pub fn chunk_count_hint(&self) -> usize {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.chunk_count_hint(),
            SubEmbedder::OpenAi(embedder) => embedder.chunk_count_hint(),
            SubEmbedder::Ollama(embedder) => embedder.chunk_count_hint(),
            SubEmbedder::UserProvided(_) => 100,
            SubEmbedder::Rest(embedder) => embedder.chunk_count_hint(),
        }
    }

    /// Indicates the preferred number of texts in a single chunk passed to [`Self::embed`]
    pub fn prompt_count_in_chunk_hint(&self) -> usize {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.prompt_count_in_chunk_hint(),
            SubEmbedder::OpenAi(embedder) => embedder.prompt_count_in_chunk_hint(),
            SubEmbedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(),
            SubEmbedder::UserProvided(_) => 1,
            SubEmbedder::Rest(embedder) => embedder.prompt_count_in_chunk_hint(),
        }
    }

    pub fn uses_document_template(&self) -> bool {
        match self {
            SubEmbedder::HuggingFace(_)
            | SubEmbedder::OpenAi(_)
            | SubEmbedder::Ollama(_)
            | SubEmbedder::Rest(_) => true,
            SubEmbedder::UserProvided(_) => false,
        }
    }

    /// Indicates the dimensions of a single embedding produced by the embedder.
    pub fn dimensions(&self) -> usize {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.dimensions(),
            SubEmbedder::OpenAi(embedder) => embedder.dimensions(),
            SubEmbedder::Ollama(embedder) => embedder.dimensions(),
            SubEmbedder::UserProvided(embedder) => embedder.dimensions(),
            SubEmbedder::Rest(embedder) => embedder.dimensions(),
        }
    }

    /// An optional distribution used to apply an affine transformation to the similarity score of a document.
    pub fn distribution(&self) -> Option<DistributionShift> {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.distribution(),
            SubEmbedder::OpenAi(embedder) => embedder.distribution(),
            SubEmbedder::Ollama(embedder) => embedder.distribution(),
            SubEmbedder::UserProvided(embedder) => embedder.distribution(),
            SubEmbedder::Rest(embedder) => embedder.distribution(),
        }
    }
}

fn check_similarity(
    left: Vec<Embedding>,
    right: Vec<Embedding>,
    hint: CompositeEmbedderContainsHuggingFace,
) -> Result<(), NewEmbedderError> {
    if left.len() != right.len() {
        return Err(NewEmbedderError::composite_embedding_count_mismatch(left.len(), right.len()));
    }

    for (left, right) in left.into_iter().zip(right) {
        let left = arroy::internals::UnalignedVector::from_slice(&left);
        let right = arroy::internals::UnalignedVector::from_slice(&right);
        let left = arroy::internals::Leaf {
            header: arroy::distances::Cosine::new_header(&left),
            vector: left,
        };
        let right = arroy::internals::Leaf {
            header: arroy::distances::Cosine::new_header(&right),
            vector: right,
        };

        let distance = arroy::distances::Cosine::built_distance(&left, &right);

        if distance > super::MAX_COMPOSITE_DISTANCE {
            return Err(NewEmbedderError::composite_embedding_value_mismatch(distance, hint));
        }
    }
    Ok(())
}

fn configuration_hint(
    search: &SubEmbedder,
    index: &SubEmbedder,
) -> CompositeEmbedderContainsHuggingFace {
    match (search, index) {
        (SubEmbedder::HuggingFace(_), SubEmbedder::HuggingFace(_)) => {
            CompositeEmbedderContainsHuggingFace::Both
        }
        (SubEmbedder::HuggingFace(_), _) => CompositeEmbedderContainsHuggingFace::Search,
        (_, SubEmbedder::HuggingFace(_)) => CompositeEmbedderContainsHuggingFace::Indexing,
        _ => CompositeEmbedderContainsHuggingFace::None,
    }
}
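`check_similarity` above compares paired search-time and indexing-time embeddings with arroy's `Cosine` distance and rejects the configuration when any pair exceeds `MAX_COMPOSITE_DISTANCE`. As a rough illustration of the quantity involved, here is a plain cosine-distance helper (a sketch using the common `1 - cos` definition; arroy's exact normalization may differ):

    // Sketch only: cosine distance between two embeddings, assuming
    // both vectors are non-zero and of equal length.
    fn cosine_distance(a: &[f32], b: &[f32]) -> f32 {
        let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
        let norm_a = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        let norm_b = b.iter().map(|x| x * x).sum::<f32>().sqrt();
        1.0 - dot / (norm_a * norm_b)
    }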
@@ -6,6 +6,7 @@ use hf_hub::api::sync::ApiError;
 
 use super::parsed_vectors::ParsedVectorsDiff;
 use super::rest::ConfigurationSource;
+use super::MAX_COMPOSITE_DISTANCE;
 use crate::error::FaultSource;
 use crate::update::new::vector_document::VectorDocument;
 use crate::{FieldDistribution, PanicCatched};
@@ -335,6 +336,77 @@ impl NewEmbedderError {
     pub(crate) fn ollama_unsupported_url(url: String) -> NewEmbedderError {
         Self { kind: NewEmbedderErrorKind::OllamaUnsupportedUrl(url), fault: FaultSource::User }
     }
+
+    pub(crate) fn composite_dimensions_mismatch(
+        search_dimensions: usize,
+        index_dimensions: usize,
+    ) -> NewEmbedderError {
+        Self {
+            kind: NewEmbedderErrorKind::CompositeDimensionsMismatch {
+                search_dimensions,
+                index_dimensions,
+            },
+            fault: FaultSource::User,
+        }
+    }
+
+    pub(crate) fn composite_test_embedding_failed(
+        inner: EmbedError,
+        failing_embedder: &'static str,
+    ) -> NewEmbedderError {
+        Self {
+            kind: NewEmbedderErrorKind::CompositeTestEmbeddingFailed { inner, failing_embedder },
+            fault: FaultSource::Runtime,
+        }
+    }
+
+    pub(crate) fn composite_embedding_count_mismatch(
+        search_count: usize,
+        index_count: usize,
+    ) -> NewEmbedderError {
+        Self {
+            kind: NewEmbedderErrorKind::CompositeEmbeddingCountMismatch {
+                search_count,
+                index_count,
+            },
+            fault: FaultSource::Runtime,
+        }
+    }
+
+    pub(crate) fn composite_embedding_value_mismatch(
+        distance: f32,
+        hint: CompositeEmbedderContainsHuggingFace,
+    ) -> NewEmbedderError {
+        Self {
+            kind: NewEmbedderErrorKind::CompositeEmbeddingValueMismatch { distance, hint },
+            fault: FaultSource::User,
+        }
+    }
 }
+
+#[derive(Debug, Clone, Copy)]
+pub enum CompositeEmbedderContainsHuggingFace {
+    Both,
+    Search,
+    Indexing,
+    None,
+}
+
+impl std::fmt::Display for CompositeEmbedderContainsHuggingFace {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            CompositeEmbedderContainsHuggingFace::Both => f.write_str(
+                "\n - Make sure the `model`, `revision` and `pooling` of both embedders match.",
+            ),
+            CompositeEmbedderContainsHuggingFace::Search => f.write_str(
+                "\n - Consider trying a different `pooling` method for the search embedder.",
+            ),
+            CompositeEmbedderContainsHuggingFace::Indexing => f.write_str(
+                "\n - Consider trying a different `pooling` method for the indexing embedder.",
+            ),
+            CompositeEmbedderContainsHuggingFace::None => Ok(()),
+        }
+    }
+}
 
 #[derive(Debug, thiserror::Error)]
@@ -419,6 +491,14 @@ pub enum NewEmbedderErrorKind {
     CouldNotParseTemplate(String),
     #[error("unsupported Ollama URL.\n - For `ollama` sources, the URL must end with `/api/embed` or `/api/embeddings`\n - Got `{0}`")]
     OllamaUnsupportedUrl(String),
+    #[error("error while generating test embeddings.\n - the dimensions of embeddings produced at search time and at indexing time don't match.\n - Search time dimensions: {search_dimensions}\n - Indexing time dimensions: {index_dimensions}\n - Note: Dimensions of embeddings produced by both embedders are required to match.")]
+    CompositeDimensionsMismatch { search_dimensions: usize, index_dimensions: usize },
+    #[error("error while generating test embeddings.\n - could not generate test embedding with embedder at {failing_embedder} time.\n - Embedding failed with {inner}")]
+    CompositeTestEmbeddingFailed { inner: EmbedError, failing_embedder: &'static str },
+    #[error("error while generating test embeddings.\n - the number of generated embeddings differs.\n - {search_count} embeddings for the search time embedder.\n - {index_count} embeddings for the indexing time embedder.")]
+    CompositeEmbeddingCountMismatch { search_count: usize, index_count: usize },
+    #[error("error while generating test embeddings.\n - the embeddings produced at search time and indexing time are not similar enough.\n - angular distance {distance:.2}\n - Meilisearch requires a maximum distance of {MAX_COMPOSITE_DISTANCE}.\n - Note: check that both embedders produce similar embeddings.{hint}")]
+    CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace },
 }
 
 pub struct PossibleEmbeddingMistakes {
@ -255,34 +255,8 @@ impl Embedder {
|
|||
Ok(this)
|
||||
}
 
-    pub fn embed(&self, mut texts: Vec<String>) -> std::result::Result<Vec<Embedding>, EmbedError> {
-        let tokens = match texts.len() {
-            1 => vec![self
-                .tokenizer
-                .encode(texts.pop().unwrap(), true)
-                .map_err(EmbedError::tokenize)?],
-            _ => self.tokenizer.encode_batch(texts, true).map_err(EmbedError::tokenize)?,
-        };
-        let token_ids = tokens
-            .iter()
-            .map(|tokens| {
-                let mut tokens = tokens.get_ids().to_vec();
-                tokens.truncate(512);
-                Tensor::new(tokens.as_slice(), &self.model.device).map_err(EmbedError::tensor_shape)
-            })
-            .collect::<Result<Vec<_>, EmbedError>>()?;
-
-        let token_ids = Tensor::stack(&token_ids, 0).map_err(EmbedError::tensor_shape)?;
-        let token_type_ids = token_ids.zeros_like().map_err(EmbedError::tensor_shape)?;
-        let embeddings = self
-            .model
-            .forward(&token_ids, &token_type_ids, None)
-            .map_err(EmbedError::model_forward)?;
-
-        let embeddings = Self::pooling(embeddings, self.pooling)?;
-
-        let embeddings: Vec<Embedding> = embeddings.to_vec2().map_err(EmbedError::tensor_shape)?;
-        Ok(embeddings)
+    pub fn embed(&self, texts: Vec<String>) -> std::result::Result<Vec<Embedding>, EmbedError> {
+        texts.into_iter().map(|text| self.embed_one(&text)).collect()
     }
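The batched tokenize-and-forward path is gone: `embed` now maps `embed_one` over its inputs, so each text is tokenized, run through the model, and pooled individually. A usage sketch, assuming `embedder` is an already-constructed Hugging Face embedder and the caller returns `Result<_, EmbedError>`:

    let texts = vec!["first document".to_string(), "second document".to_string()];
    // One embedding per input text; the first per-text failure aborts the batch.
    let embeddings: Vec<Embedding> = embedder.embed(texts)?;
    assert_eq!(embeddings.len(), 2);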
 
     fn pooling(embeddings: Tensor, pooling: Pooling) -> Result<Tensor, EmbedError> {
@@ -346,7 +320,7 @@ impl Embedder {
         Ok(embedding)
     }
 
-    pub fn embed_chunks(
+    pub fn embed_index(
         &self,
         text_chunks: Vec<Vec<String>>,
     ) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
@@ -378,7 +352,7 @@ impl Embedder {
         })
     }
 
-    pub(crate) fn embed_chunks_ref(&self, texts: &[&str]) -> Result<Vec<Embedding>, EmbedError> {
+    pub(crate) fn embed_index_ref(&self, texts: &[&str]) -> Result<Vec<Embedding>, EmbedError> {
         texts.iter().map(|text| self.embed_one(text)).collect()
     }
 }
@@ -30,7 +30,7 @@ impl Embedder {
         self.dimensions
     }
 
-    pub fn embed_chunks(
+    pub fn embed_index(
         &self,
         text_chunks: Vec<Vec<String>>,
     ) -> Result<Vec<Vec<Embedding>>, EmbedError> {
@@ -41,7 +41,7 @@ impl Embedder {
         self.distribution
     }
 
-    pub(crate) fn embed_chunks_ref(&self, texts: &[&str]) -> Result<Vec<Embedding>, EmbedError> {
+    pub(crate) fn embed_index_ref(&self, texts: &[&str]) -> Result<Vec<Embedding>, EmbedError> {
         texts.iter().map(|text| self.embed_one(text)).collect()
     }
 }
@@ -15,6 +15,7 @@ use self::error::{EmbedError, NewEmbedderError};
 use crate::prompt::{Prompt, PromptData};
 use crate::ThreadPoolNoAbort;
 
+pub mod composite;
 pub mod error;
 pub mod hf;
 pub mod json_template;
@@ -31,6 +32,7 @@ pub use self::error::Error;
 pub type Embedding = Vec<f32>;
 
 pub const REQUEST_PARALLELISM: usize = 40;
+pub const MAX_COMPOSITE_DISTANCE: f32 = 0.01;
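`MAX_COMPOSITE_DISTANCE` is the threshold referenced by the `CompositeEmbeddingValueMismatch` message above. The sketch below shows one common definition of angular distance, normalized to [0, 1] by dividing the angle by pi; the metric actually used by the composite check lives in composite.rs and is not shown in this diff, so treat this as an assumption:

    // Illustrative definition only; not necessarily the one in composite.rs.
    fn angular_distance(a: &[f32], b: &[f32]) -> f32 {
        let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
        let norm_a = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        let norm_b = b.iter().map(|x| x * x).sum::<f32>().sqrt();
        let cos = (dot / (norm_a * norm_b)).clamp(-1.0, 1.0);
        cos.acos() / std::f32::consts::PI
    }
    // Near-identical embeddings pass easily:
    // angular_distance(&[1.0, 0.0], &[1.0, 1e-4]) is about 3e-5 <= MAX_COMPOSITE_DISTANCE.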
 
 pub struct ArroyWrapper {
     quantized: bool,
@@ -536,6 +538,8 @@ pub enum Embedder {
     Ollama(ollama::Embedder),
     /// An embedder based on making embedding queries against a generic JSON/REST embedding server.
     Rest(rest::Embedder),
+    /// An embedder composed of an embedder at search time and an embedder at indexing time.
+    Composite(composite::Embedder),
 }
 
 /// Configuration for an embedder.
@@ -605,6 +609,7 @@ pub enum EmbedderOptions {
     Ollama(ollama::EmbedderOptions),
     UserProvided(manual::EmbedderOptions),
     Rest(rest::EmbedderOptions),
+    Composite(composite::EmbedderOptions),
 }
 
 impl Default for EmbedderOptions {
@@ -626,33 +631,29 @@ impl Embedder {
             EmbedderOptions::Rest(options) => {
                 Self::Rest(rest::Embedder::new(options, rest::ConfigurationSource::User)?)
             }
+            EmbedderOptions::Composite(options) => {
+                Self::Composite(composite::Embedder::new(options)?)
+            }
         })
     }
 
-    /// Embed one or multiple texts.
-    ///
-    /// Each text can be embedded as one or multiple embeddings.
-    pub fn embed(
+    /// Embed in search context
+
+    #[tracing::instrument(level = "debug", skip_all, target = "search")]
+    pub fn embed_search(
         &self,
-        texts: Vec<String>,
+        text: String,
         deadline: Option<Instant>,
-    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
-        match self {
+    ) -> std::result::Result<Embedding, EmbedError> {
+        let texts = vec![text];
+        let mut embedding = match self {
             Embedder::HuggingFace(embedder) => embedder.embed(texts),
             Embedder::OpenAi(embedder) => embedder.embed(&texts, deadline),
             Embedder::Ollama(embedder) => embedder.embed(&texts, deadline),
             Embedder::UserProvided(embedder) => embedder.embed(&texts),
             Embedder::Rest(embedder) => embedder.embed(texts, deadline),
-        }
-    }
-
-    #[tracing::instrument(level = "debug", skip_all, target = "search")]
-    pub fn embed_one(
-        &self,
-        text: String,
-        deadline: Option<Instant>,
-    ) -> std::result::Result<Embedding, EmbedError> {
-        let mut embedding = self.embed(vec![text], deadline)?;
+            Embedder::Composite(embedder) => embedder.search.embed(texts, deadline),
+        }?;
         let embedding = embedding.pop().ok_or_else(EmbedError::missing_embedding)?;
         Ok(embedding)
     }
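`embed_search` replaces the old `embed`/`embed_one` pair for query-time use: one query string in, exactly one embedding out, failing with `EmbedError::missing_embedding` if the backend returns none, and with the `Composite` arm routing to the search-time sub-embedder. A caller-side sketch, assuming `embedder: &Embedder` in a function that returns `Result<_, EmbedError>`:

    use std::time::{Duration, Instant};

    // The optional deadline lets the HTTP-backed embedders (OpenAI, Ollama,
    // REST) give up in time for the search request to still answer.
    let deadline = Some(Instant::now() + Duration::from_millis(500));
    let query_embedding: Embedding =
        embedder.embed_search("my query".to_string(), deadline)?;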
@@ -660,31 +661,34 @@ impl Embedder {
     /// Embed multiple chunks of texts.
     ///
     /// Each chunk is composed of one or multiple texts.
-    pub fn embed_chunks(
+    pub fn embed_index(
         &self,
         text_chunks: Vec<Vec<String>>,
         threads: &ThreadPoolNoAbort,
     ) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
         match self {
-            Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks),
-            Embedder::OpenAi(embedder) => embedder.embed_chunks(text_chunks, threads),
-            Embedder::Ollama(embedder) => embedder.embed_chunks(text_chunks, threads),
-            Embedder::UserProvided(embedder) => embedder.embed_chunks(text_chunks),
-            Embedder::Rest(embedder) => embedder.embed_chunks(text_chunks, threads),
+            Embedder::HuggingFace(embedder) => embedder.embed_index(text_chunks),
+            Embedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads),
+            Embedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads),
+            Embedder::UserProvided(embedder) => embedder.embed_index(text_chunks),
+            Embedder::Rest(embedder) => embedder.embed_index(text_chunks, threads),
+            Embedder::Composite(embedder) => embedder.index.embed_index(text_chunks, threads),
         }
     }
 
-    pub fn embed_chunks_ref(
+    /// Non-owning variant of [`Self::embed_index`].
+    pub fn embed_index_ref(
         &self,
         texts: &[&str],
         threads: &ThreadPoolNoAbort,
     ) -> std::result::Result<Vec<Embedding>, EmbedError> {
         match self {
-            Embedder::HuggingFace(embedder) => embedder.embed_chunks_ref(texts),
-            Embedder::OpenAi(embedder) => embedder.embed_chunks_ref(texts, threads),
-            Embedder::Ollama(embedder) => embedder.embed_chunks_ref(texts, threads),
-            Embedder::UserProvided(embedder) => embedder.embed_chunks_ref(texts),
-            Embedder::Rest(embedder) => embedder.embed_chunks_ref(texts, threads),
+            Embedder::HuggingFace(embedder) => embedder.embed_index_ref(texts),
+            Embedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads),
+            Embedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads),
+            Embedder::UserProvided(embedder) => embedder.embed_index_ref(texts),
+            Embedder::Rest(embedder) => embedder.embed_index_ref(texts, threads),
+            Embedder::Composite(embedder) => embedder.index.embed_index_ref(texts, threads),
         }
     }
 
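Both indexing entry points dispatch per backend, and the new `Composite` arms make the split explicit: `embed_index` and `embed_index_ref` go through the `index` sub-embedder, while `embed_search` above goes through `search`. A usage sketch, assuming `embedder` and a thread pool `threads: &ThreadPoolNoAbort` exist in the caller:

    // Chunks of rendered prompts in, the same chunk structure of embeddings out.
    let chunks: Vec<Vec<String>> = vec![
        vec!["doc A, chunk 1".to_string(), "doc B, chunk 1".to_string()],
        vec!["doc C, chunk 1".to_string()],
    ];
    let embeddings: Vec<Vec<Embedding>> = embedder.embed_index(chunks, threads)?;
    assert_eq!(embeddings.len(), 2);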
@@ -696,6 +700,7 @@ impl Embedder {
             Embedder::Ollama(embedder) => embedder.chunk_count_hint(),
             Embedder::UserProvided(_) => 100,
             Embedder::Rest(embedder) => embedder.chunk_count_hint(),
+            Embedder::Composite(embedder) => embedder.index.chunk_count_hint(),
         }
     }
@@ -707,6 +712,7 @@ impl Embedder {
             Embedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(),
             Embedder::UserProvided(_) => 1,
             Embedder::Rest(embedder) => embedder.prompt_count_in_chunk_hint(),
+            Embedder::Composite(embedder) => embedder.index.prompt_count_in_chunk_hint(),
         }
     }
@@ -718,6 +724,7 @@ impl Embedder {
             Embedder::Ollama(embedder) => embedder.dimensions(),
             Embedder::UserProvided(embedder) => embedder.dimensions(),
             Embedder::Rest(embedder) => embedder.dimensions(),
+            Embedder::Composite(embedder) => embedder.dimensions(),
         }
     }
@@ -729,6 +736,7 @@ impl Embedder {
             Embedder::Ollama(embedder) => embedder.distribution(),
             Embedder::UserProvided(embedder) => embedder.distribution(),
             Embedder::Rest(embedder) => embedder.distribution(),
+            Embedder::Composite(embedder) => embedder.distribution(),
         }
     }
@@ -739,6 +747,7 @@ impl Embedder {
             | Embedder::Ollama(_)
             | Embedder::Rest(_) => true,
             Embedder::UserProvided(_) => false,
+            Embedder::Composite(embedder) => embedder.index.uses_document_template(),
         }
     }
 }
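Note the delegation pattern across these hunks: hints that only matter while indexing (`chunk_count_hint`, `prompt_count_in_chunk_hint`, `uses_document_template`) are answered by the indexing sub-embedder, while `dimensions` and `distribution` are answered by the composite embedder itself, which it can do consistently because the `CompositeDimensionsMismatch` check requires both sides to agree.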
@@ -113,7 +113,7 @@ impl Embedder {
         }
     }
 
-    pub fn embed_chunks(
+    pub fn embed_index(
         &self,
         text_chunks: Vec<Vec<String>>,
         threads: &ThreadPoolNoAbort,
@@ -134,7 +134,7 @@ impl Embedder {
         }
     }
 
-    pub(crate) fn embed_chunks_ref(
+    pub(crate) fn embed_index_ref(
         &self,
         texts: &[&str],
         threads: &ThreadPoolNoAbort,
@@ -250,7 +250,7 @@ impl Embedder {
         Ok(all_embeddings)
     }
 
-    pub fn embed_chunks(
+    pub fn embed_index(
         &self,
         text_chunks: Vec<Vec<String>>,
         threads: &ThreadPoolNoAbort,
@@ -271,7 +271,7 @@ impl Embedder {
         }
     }
 
-    pub(crate) fn embed_chunks_ref(
+    pub(crate) fn embed_index_ref(
         &self,
         texts: &[&str],
         threads: &ThreadPoolNoAbort,
@@ -184,7 +184,7 @@ impl Embedder {
         Ok(embeddings.pop().unwrap())
     }
 
-    pub fn embed_chunks(
+    pub fn embed_index(
         &self,
         text_chunks: Vec<Vec<String>>,
         threads: &ThreadPoolNoAbort,
@@ -205,7 +205,7 @@ impl Embedder {
         }
     }
 
-    pub(crate) fn embed_chunks_ref(
+    pub(crate) fn embed_index_ref(
         &self,
         texts: &[&str],
         threads: &ThreadPoolNoAbort,
File diff suppressed because it is too large