mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 12:27:13 +02:00
Merge branch 'main' into tmp-release-v1.7.4
This commit is contained in:
commit
796213af9a
68 changed files with 4625 additions and 1492 deletions
|
@ -339,6 +339,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
|
|||
prompt_reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedder: Arc<Embedder>,
|
||||
request_threads: &rayon::ThreadPool,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
|
||||
|
@ -376,7 +377,10 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
|
|||
|
||||
if chunks.len() == chunks.capacity() {
|
||||
let chunked_embeds = embedder
|
||||
.embed_chunks(std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)))
|
||||
.embed_chunks(
|
||||
std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)),
|
||||
request_threads,
|
||||
)
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::Error::from)?;
|
||||
|
||||
|
@ -394,7 +398,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
|
|||
// send last chunk
|
||||
if !chunks.is_empty() {
|
||||
let chunked_embeds = embedder
|
||||
.embed_chunks(std::mem::take(&mut chunks))
|
||||
.embed_chunks(std::mem::take(&mut chunks), request_threads)
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::Error::from)?;
|
||||
for (docid, embeddings) in chunks_ids
|
||||
|
@ -408,7 +412,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
|
|||
|
||||
if !current_chunk.is_empty() {
|
||||
let embeds = embedder
|
||||
.embed_chunks(vec![std::mem::take(&mut current_chunk)])
|
||||
.embed_chunks(vec![std::mem::take(&mut current_chunk)], request_threads)
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::Error::from)?;
|
||||
|
||||
|
|
|
@ -238,6 +238,12 @@ fn send_original_documents_data(
|
|||
|
||||
let documents_chunk_cloned = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
|
||||
let request_threads = rayon::ThreadPoolBuilder::new()
|
||||
.num_threads(crate::vector::REQUEST_PARALLELISM)
|
||||
.thread_name(|index| format!("embedding-request-{index}"))
|
||||
.build()?;
|
||||
|
||||
rayon::spawn(move || {
|
||||
for (name, (embedder, prompt)) in embedders {
|
||||
let result = extract_vector_points(
|
||||
|
@ -249,7 +255,12 @@ fn send_original_documents_data(
|
|||
);
|
||||
match result {
|
||||
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
|
||||
let embeddings = match extract_embeddings(prompts, indexer, embedder.clone()) {
|
||||
let embeddings = match extract_embeddings(
|
||||
prompts,
|
||||
indexer,
|
||||
embedder.clone(),
|
||||
&request_threads,
|
||||
) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
|
|
|
@ -2646,6 +2646,12 @@ mod tests {
|
|||
api_key: Setting::NotSet,
|
||||
dimensions: Setting::Set(3),
|
||||
document_template: Setting::NotSet,
|
||||
url: Setting::NotSet,
|
||||
query: Setting::NotSet,
|
||||
input_field: Setting::NotSet,
|
||||
path_to_embeddings: Setting::NotSet,
|
||||
embedding_object: Setting::NotSet,
|
||||
input_type: Setting::NotSet,
|
||||
}),
|
||||
);
|
||||
settings.set_embedder_settings(embedders);
|
||||
|
|
|
@ -14,12 +14,13 @@ use super::IndexerConfig;
|
|||
use crate::criterion::Criterion;
|
||||
use crate::error::UserError;
|
||||
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
||||
use crate::order_by_map::OrderByMap;
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::index_documents::IndexDocumentsMethod;
|
||||
use crate::update::{IndexDocuments, UpdateIndexingStep};
|
||||
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
|
||||
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
|
||||
use crate::{FieldsIdsMap, Index, OrderBy, Result};
|
||||
use crate::{FieldsIdsMap, Index, Result};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
|
||||
pub enum Setting<T> {
|
||||
|
@ -145,10 +146,11 @@ pub struct Settings<'a, 't, 'i> {
|
|||
/// Attributes on which typo tolerance is disabled.
|
||||
exact_attributes: Setting<HashSet<String>>,
|
||||
max_values_per_facet: Setting<usize>,
|
||||
sort_facet_values_by: Setting<HashMap<String, OrderBy>>,
|
||||
sort_facet_values_by: Setting<OrderByMap>,
|
||||
pagination_max_total_hits: Setting<usize>,
|
||||
proximity_precision: Setting<ProximityPrecision>,
|
||||
embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>,
|
||||
search_cutoff: Setting<u64>,
|
||||
}
|
||||
|
||||
impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
|
@ -182,6 +184,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
pagination_max_total_hits: Setting::NotSet,
|
||||
proximity_precision: Setting::NotSet,
|
||||
embedder_settings: Setting::NotSet,
|
||||
search_cutoff: Setting::NotSet,
|
||||
indexer_config,
|
||||
}
|
||||
}
|
||||
|
@ -340,7 +343,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
self.max_values_per_facet = Setting::Reset;
|
||||
}
|
||||
|
||||
pub fn set_sort_facet_values_by(&mut self, value: HashMap<String, OrderBy>) {
|
||||
pub fn set_sort_facet_values_by(&mut self, value: OrderByMap) {
|
||||
self.sort_facet_values_by = Setting::Set(value);
|
||||
}
|
||||
|
||||
|
@ -372,6 +375,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
self.embedder_settings = Setting::Reset;
|
||||
}
|
||||
|
||||
pub fn set_search_cutoff(&mut self, value: u64) {
|
||||
self.search_cutoff = Setting::Set(value);
|
||||
}
|
||||
|
||||
pub fn reset_search_cutoff(&mut self) {
|
||||
self.search_cutoff = Setting::Reset;
|
||||
}
|
||||
|
||||
#[tracing::instrument(
|
||||
level = "trace"
|
||||
skip(self, progress_callback, should_abort, old_fields_ids_map),
|
||||
|
@ -1025,6 +1036,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
Ok(update)
|
||||
}
|
||||
|
||||
fn update_search_cutoff(&mut self) -> Result<bool> {
|
||||
let changed = match self.search_cutoff {
|
||||
Setting::Set(new) => {
|
||||
let old = self.index.search_cutoff(self.wtxn)?;
|
||||
if old == Some(new) {
|
||||
false
|
||||
} else {
|
||||
self.index.put_search_cutoff(self.wtxn, new)?;
|
||||
true
|
||||
}
|
||||
}
|
||||
Setting::Reset => self.index.delete_search_cutoff(self.wtxn)?,
|
||||
Setting::NotSet => false,
|
||||
};
|
||||
|
||||
Ok(changed)
|
||||
}
|
||||
|
||||
pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
|
||||
where
|
||||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
|
@ -1073,6 +1102,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
|
||||
let embedding_configs_updated = self.update_embedding_configs()?;
|
||||
|
||||
// never trigger re-indexing
|
||||
self.update_search_cutoff()?;
|
||||
|
||||
if stop_words_updated
|
||||
|| non_separator_tokens_updated
|
||||
|| separator_tokens_updated
|
||||
|
@ -1131,6 +1163,12 @@ fn validate_prompt(
|
|||
api_key,
|
||||
dimensions,
|
||||
document_template: Setting::Set(template),
|
||||
url,
|
||||
query,
|
||||
input_field,
|
||||
path_to_embeddings,
|
||||
embedding_object,
|
||||
input_type,
|
||||
}) => {
|
||||
// validate
|
||||
let template = crate::prompt::Prompt::new(template)
|
||||
|
@ -1144,6 +1182,12 @@ fn validate_prompt(
|
|||
api_key,
|
||||
dimensions,
|
||||
document_template: Setting::Set(template),
|
||||
url,
|
||||
query,
|
||||
input_field,
|
||||
path_to_embeddings,
|
||||
embedding_object,
|
||||
input_type,
|
||||
}))
|
||||
}
|
||||
new => Ok(new),
|
||||
|
@ -1156,8 +1200,20 @@ pub fn validate_embedding_settings(
|
|||
) -> Result<Setting<EmbeddingSettings>> {
|
||||
let settings = validate_prompt(name, settings)?;
|
||||
let Setting::Set(settings) = settings else { return Ok(settings) };
|
||||
let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } =
|
||||
settings;
|
||||
let EmbeddingSettings {
|
||||
source,
|
||||
model,
|
||||
revision,
|
||||
api_key,
|
||||
dimensions,
|
||||
document_template,
|
||||
url,
|
||||
query,
|
||||
input_field,
|
||||
path_to_embeddings,
|
||||
embedding_object,
|
||||
input_type,
|
||||
} = settings;
|
||||
|
||||
if let Some(0) = dimensions.set() {
|
||||
return Err(crate::error::UserError::InvalidSettingsDimensions {
|
||||
|
@ -1166,6 +1222,14 @@ pub fn validate_embedding_settings(
|
|||
.into());
|
||||
}
|
||||
|
||||
if let Some(url) = url.as_ref().set() {
|
||||
url::Url::parse(url).map_err(|error| crate::error::UserError::InvalidUrl {
|
||||
embedder_name: name.to_owned(),
|
||||
inner_error: error,
|
||||
url: url.to_owned(),
|
||||
})?;
|
||||
}
|
||||
|
||||
let Some(inferred_source) = source.set() else {
|
||||
return Ok(Setting::Set(EmbeddingSettings {
|
||||
source,
|
||||
|
@ -1174,11 +1238,25 @@ pub fn validate_embedding_settings(
|
|||
api_key,
|
||||
dimensions,
|
||||
document_template,
|
||||
url,
|
||||
query,
|
||||
input_field,
|
||||
path_to_embeddings,
|
||||
embedding_object,
|
||||
input_type,
|
||||
}));
|
||||
};
|
||||
match inferred_source {
|
||||
EmbedderSource::OpenAi => {
|
||||
check_unset(&revision, "revision", inferred_source, name)?;
|
||||
|
||||
check_unset(&url, "url", inferred_source, name)?;
|
||||
check_unset(&query, "query", inferred_source, name)?;
|
||||
check_unset(&input_field, "inputField", inferred_source, name)?;
|
||||
check_unset(&path_to_embeddings, "pathToEmbeddings", inferred_source, name)?;
|
||||
check_unset(&embedding_object, "embeddingObject", inferred_source, name)?;
|
||||
check_unset(&input_type, "inputType", inferred_source, name)?;
|
||||
|
||||
if let Setting::Set(model) = &model {
|
||||
let model = crate::vector::openai::EmbeddingModel::from_name(model.as_str())
|
||||
.ok_or(crate::error::UserError::InvalidOpenAiModel {
|
||||
|
@ -1209,9 +1287,30 @@ pub fn validate_embedding_settings(
|
|||
}
|
||||
}
|
||||
}
|
||||
EmbedderSource::Ollama => {
|
||||
// Dimensions get inferred, only model name is required
|
||||
check_unset(&dimensions, "dimensions", inferred_source, name)?;
|
||||
check_set(&model, "model", inferred_source, name)?;
|
||||
check_unset(&api_key, "apiKey", inferred_source, name)?;
|
||||
check_unset(&revision, "revision", inferred_source, name)?;
|
||||
|
||||
check_unset(&url, "url", inferred_source, name)?;
|
||||
check_unset(&query, "query", inferred_source, name)?;
|
||||
check_unset(&input_field, "inputField", inferred_source, name)?;
|
||||
check_unset(&path_to_embeddings, "pathToEmbeddings", inferred_source, name)?;
|
||||
check_unset(&embedding_object, "embeddingObject", inferred_source, name)?;
|
||||
check_unset(&input_type, "inputType", inferred_source, name)?;
|
||||
}
|
||||
EmbedderSource::HuggingFace => {
|
||||
check_unset(&api_key, "apiKey", inferred_source, name)?;
|
||||
check_unset(&dimensions, "dimensions", inferred_source, name)?;
|
||||
|
||||
check_unset(&url, "url", inferred_source, name)?;
|
||||
check_unset(&query, "query", inferred_source, name)?;
|
||||
check_unset(&input_field, "inputField", inferred_source, name)?;
|
||||
check_unset(&path_to_embeddings, "pathToEmbeddings", inferred_source, name)?;
|
||||
check_unset(&embedding_object, "embeddingObject", inferred_source, name)?;
|
||||
check_unset(&input_type, "inputType", inferred_source, name)?;
|
||||
}
|
||||
EmbedderSource::UserProvided => {
|
||||
check_unset(&model, "model", inferred_source, name)?;
|
||||
|
@ -1219,6 +1318,18 @@ pub fn validate_embedding_settings(
|
|||
check_unset(&api_key, "apiKey", inferred_source, name)?;
|
||||
check_unset(&document_template, "documentTemplate", inferred_source, name)?;
|
||||
check_set(&dimensions, "dimensions", inferred_source, name)?;
|
||||
|
||||
check_unset(&url, "url", inferred_source, name)?;
|
||||
check_unset(&query, "query", inferred_source, name)?;
|
||||
check_unset(&input_field, "inputField", inferred_source, name)?;
|
||||
check_unset(&path_to_embeddings, "pathToEmbeddings", inferred_source, name)?;
|
||||
check_unset(&embedding_object, "embeddingObject", inferred_source, name)?;
|
||||
check_unset(&input_type, "inputType", inferred_source, name)?;
|
||||
}
|
||||
EmbedderSource::Rest => {
|
||||
check_unset(&model, "model", inferred_source, name)?;
|
||||
check_unset(&revision, "revision", inferred_source, name)?;
|
||||
check_set(&url, "url", inferred_source, name)?;
|
||||
}
|
||||
}
|
||||
Ok(Setting::Set(EmbeddingSettings {
|
||||
|
@ -1228,6 +1339,12 @@ pub fn validate_embedding_settings(
|
|||
api_key,
|
||||
dimensions,
|
||||
document_template,
|
||||
url,
|
||||
query,
|
||||
input_field,
|
||||
path_to_embeddings,
|
||||
embedding_object,
|
||||
input_type,
|
||||
}))
|
||||
}
|
||||
|
||||
|
@ -2050,6 +2167,7 @@ mod tests {
|
|||
pagination_max_total_hits,
|
||||
proximity_precision,
|
||||
embedder_settings,
|
||||
search_cutoff,
|
||||
} = settings;
|
||||
assert!(matches!(searchable_fields, Setting::NotSet));
|
||||
assert!(matches!(displayed_fields, Setting::NotSet));
|
||||
|
@ -2073,6 +2191,7 @@ mod tests {
|
|||
assert!(matches!(pagination_max_total_hits, Setting::NotSet));
|
||||
assert!(matches!(proximity_precision, Setting::NotSet));
|
||||
assert!(matches!(embedder_settings, Setting::NotSet));
|
||||
assert!(matches!(search_cutoff, Setting::NotSet));
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue