From 138d20b277c5490669f317ab90f6b8897fcc59d0 Mon Sep 17 00:00:00 2001
From: Mubelotix
Date: Wed, 18 Jun 2025 16:46:20 +0200
Subject: [PATCH 001/150] Remove old dependencies

---
 Cargo.lock                    | 21 ---------------------
 crates/meilisearch/Cargo.toml |  1 -
 crates/milli/Cargo.toml       |  2 --
 3 files changed, 24 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 7bf27bce4..7455ff1b4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3744,7 +3744,6 @@ dependencies = [
  "actix-web-lab",
  "anyhow",
  "async-openai",
- "async-trait",
  "brotli",
  "bstr",
  "build-info",
@@ -3969,7 +3968,6 @@ dependencies = [
  "ordered-float 5.0.0",
  "rand 0.8.5",
  "rayon",
- "rayon-par-bridge",
  "rhai",
  "roaring",
  "rstar",
@@ -3987,7 +3985,6 @@ dependencies = [
  "time",
  "tokenizers",
  "tracing",
- "uell",
  "ureq",
  "url",
  "utoipa",
@@ -5002,15 +4999,6 @@ dependencies = [
  "crossbeam-utils",
 ]
 
-[[package]]
-name = "rayon-par-bridge"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb6a14d8f65834aca6b0fe4cbbd7a27e639cd3efb1f2a32de9942368f1991de8"
-dependencies = [
- "rayon",
-]
-
 [[package]]
 name = "reborrow"
 version = "0.5.5"
@@ -6457,15 +6445,6 @@ version = "0.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"
 
-[[package]]
-name = "uell"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40de5982e28612e20330e77d81f1559b74f66caf3c7fc10b19ada4843f4b4fd7"
-dependencies = [
- "bumpalo",
-]
-
 [[package]]
 name = "ug"
 version = "0.4.0"
diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml
index 28ecb0147..fe00d9fee 100644
--- a/crates/meilisearch/Cargo.toml
+++ b/crates/meilisearch/Cargo.toml
@@ -28,7 +28,6 @@ actix-web = { version = "4.11.0", default-features = false, features = [
     "rustls-0_23",
 ] }
 anyhow = { version = "1.0.98", features = ["backtrace"] }
-async-trait = "0.1.88"
 bstr = "1.12.0"
 byte-unit = { version = "5.1.6", features = ["serde"] }
 bytes = "1.10.1"
diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml
index b98725a8f..3d08252ac 100644
--- a/crates/milli/Cargo.toml
+++ b/crates/milli/Cargo.toml
@@ -92,14 +92,12 @@ rand = "0.8.5"
 tracing = "0.1.41"
 ureq = { version = "2.12.1", features = ["json"] }
 url = "2.5.4"
-rayon-par-bridge = "0.1.0"
 hashbrown = "0.15.4"
 bumpalo = "3.18.1"
 bumparaw-collections = "0.1.4"
 thread_local = "1.1.9"
 allocator-api2 = "0.3.0"
 rustc-hash = "2.1.1"
-uell = "0.1.0"
 enum-iterator = "2.1.0"
 bbqueue = { git = "https://github.com/meilisearch/bbqueue" }
 flume = { version = "0.11.1", default-features = false }

From c17031d3de2d2011c4371e172796b4bde805d907 Mon Sep 17 00:00:00 2001
From: Dijana Pavlovic
Date: Thu, 19 Jun 2025 15:11:37 +0200
Subject: [PATCH 002/150] Fix Gemini base_url when used with OpenAI clients

---
 crates/meilisearch-types/src/features.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/meilisearch-types/src/features.rs b/crates/meilisearch-types/src/features.rs
index 83054e784..49bee8d97 100644
--- a/crates/meilisearch-types/src/features.rs
+++ b/crates/meilisearch-types/src/features.rs
@@ -154,7 +154,7 @@ impl ChatCompletionSource {
         match self {
             OpenAi => Some("https://api.openai.com/v1/"),
             Mistral => Some("https://api.mistral.ai/v1/"),
-            Gemini => Some("https://generativelanguage.googleapis.com/v1beta/openai/"),
+            Gemini => Some("https://generativelanguage.googleapis.com/v1beta/openai"),
             AzureOpenAi | VLlm => None,
         }
     }

From 4cadc8113b2d93be6b59333e5ba49e0e6f4d906a Mon Sep 17 00:00:00 2001
From: Mubelotix
Date: Fri, 20 Jun 2025 12:42:22 +0200
Subject: [PATCH 003/150] Add embedder stats in batches

---
 crates/benchmarks/benches/indexing.rs         |  2 +-
 crates/benchmarks/benches/utils.rs            |  2 +-
 crates/dump/src/lib.rs                        |  1 +
 .../src/scheduler/process_batch.rs            |  1 +
 .../src/scheduler/process_index_operation.rs  |  3 ++
 crates/meilisearch-types/src/batches.rs       | 10 ++++
 crates/meilisearch/src/lib.rs                 |  3 +-
 crates/milli/src/progress.rs                  | 34 +++++++++++-
 .../milli/src/search/new/tests/integration.rs |  2 +-
 crates/milli/src/test_index.rs                |  2 +-
 .../extract/extract_vector_points.rs          |  8 ++-
 .../src/update/index_documents/extract/mod.rs |  5 ++
 .../milli/src/update/index_documents/mod.rs   |  9 +++-
 .../src/update/new/extract/vectors/mod.rs     |  2 +-
 crates/milli/src/update/settings.rs           |  9 ++--
 crates/milli/src/vector/composite.rs          | 32 ++++++-----
 crates/milli/src/vector/mod.rs                | 28 +++++-----
 crates/milli/src/vector/ollama.rs             | 15 ++++--
 crates/milli/src/vector/openai.rs             | 17 +++---
 crates/milli/src/vector/rest.rs               | 54 ++++++++++++++-----
 crates/milli/tests/search/distinct.rs         |  2 +-
 .../milli/tests/search/facet_distribution.rs  |  2 +-
 crates/milli/tests/search/mod.rs              |  2 +-
 crates/milli/tests/search/phrase_search.rs    |  2 +-
 crates/milli/tests/search/query_criteria.rs   |  6 +--
 crates/milli/tests/search/typo_tolerance.rs   |  8 +--
 26 files changed, 188 insertions(+), 73 deletions(-)

diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs
index 9199c3877..b882b598d 100644
--- a/crates/benchmarks/benches/indexing.rs
+++ b/crates/benchmarks/benches/indexing.rs
@@ -65,7 +65,7 @@ fn setup_settings<'t>(
     let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect();
     builder.set_sortable_fields(sortable_fields);
 
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(|_| (), || false, None).unwrap();
 }
 
 fn setup_index_with_settings(
diff --git a/crates/benchmarks/benches/utils.rs b/crates/benchmarks/benches/utils.rs
index aaa2d50a0..913807b45 100644
--- a/crates/benchmarks/benches/utils.rs
+++ b/crates/benchmarks/benches/utils.rs
@@ -90,7 +90,7 @@ pub fn base_setup(conf: &Conf) -> Index {
 
     (conf.configure)(&mut builder);
 
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(|_| (), || false, None).unwrap();
     wtxn.commit().unwrap();
 
     let config = IndexerConfig::default();
diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs
index 285818a87..c48c68f62 100644
--- a/crates/dump/src/lib.rs
+++ b/crates/dump/src/lib.rs
@@ -328,6 +328,7 @@ pub(crate) mod test {
                 progress_trace: Default::default(),
                 write_channel_congestion: None,
                 internal_database_sizes: Default::default(),
+                embeddings: Default::default(),
             },
             enqueued_at: Some(BatchEnqueuedAt {
                 earliest: datetime!(2022-11-11 0:00 UTC),
diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs
index c349f90ad..71e423a58 100644
--- a/crates/index-scheduler/src/scheduler/process_batch.rs
+++ b/crates/index-scheduler/src/scheduler/process_batch.rs
@@ -242,6 +242,7 @@ impl IndexScheduler {
                     .execute(
                         |indexing_step| tracing::debug!(update = ?indexing_step),
                         || must_stop_processing.get(),
+                        Some(progress.embedder_stats),
                     )
                     .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?;
                 index_wtxn.commit()?;
diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs
index 093c6209d..092d13e7e7 100644
--- a/crates/index-scheduler/src/scheduler/process_index_operation.rs
+++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs
@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use bumpalo::collections::CollectIn;
 use bumpalo::Bump;
 use meilisearch_types::heed::RwTxn;
@@ -472,6 +474,7 @@ impl IndexScheduler {
                     .execute(
                         |indexing_step| tracing::debug!(update = ?indexing_step),
                         || must_stop_processing.get(),
+                        Some(Arc::clone(&progress.embedder_stats))
                     )
                     .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
 
diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs
index 4d40189db..2ef373eac 100644
--- a/crates/meilisearch-types/src/batches.rs
+++ b/crates/meilisearch-types/src/batches.rs
@@ -82,4 +82,14 @@ pub struct BatchStats {
     pub write_channel_congestion: Option<serde_json::Map<String, serde_json::Value>>,
     #[serde(default, skip_serializing_if = "serde_json::Map::is_empty")]
     pub internal_database_sizes: serde_json::Map<String, serde_json::Value>,
+    pub embeddings: BatchEmbeddingStats
 }
+
+#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)]
+#[serde(rename_all = "camelCase")]
+#[schema(rename_all = "camelCase")]
+pub struct BatchEmbeddingStats {
+    pub total_count: usize,
+    pub error_count: usize,
+    pub last_error: Option<String>,
+}
diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs
index 1e0c205d0..782d6172f 100644
--- a/crates/meilisearch/src/lib.rs
+++ b/crates/meilisearch/src/lib.rs
@@ -543,7 +543,7 @@ fn import_dump(
             let settings = index_reader.settings()?;
             apply_settings_to_builder(&settings, &mut builder);
             builder
-                .execute(|indexing_step| tracing::debug!("update: {:?}", indexing_step), || false)?;
+                .execute(|indexing_step| tracing::debug!("update: {:?}", indexing_step), || false, None)?;
 
             // 4.3 Import the documents.
             // 4.3.1 We need to recreate the grenad+obkv format accepted by the index.
@@ -574,6 +574,7 @@ fn import_dump(
                 },
                 |indexing_step| tracing::trace!("update: {:?}", indexing_step),
                 || false,
+                None,
             )?;
 
             let builder = builder.with_embedders(embedders);
diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs
index fa651e17f..ff795b220 100644
--- a/crates/milli/src/progress.rs
+++ b/crates/milli/src/progress.rs
@@ -1,7 +1,7 @@
 use std::any::TypeId;
 use std::borrow::Cow;
 use std::marker::PhantomData;
-use std::sync::atomic::{AtomicU32, Ordering};
+use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering};
 use std::sync::{Arc, RwLock};
 use std::time::{Duration, Instant};
 
@@ -20,6 +20,13 @@ pub trait Step: 'static + Send + Sync {
 #[derive(Clone, Default)]
 pub struct Progress {
     steps: Arc<RwLock<InnerProgress>>,
+    pub embedder_stats: Arc<EmbedderStats>,
 }
+
+#[derive(Default)]
+pub struct EmbedderStats {
+    pub errors: Arc<RwLock<(Option<String>, u32)>>,
+    pub total_requests: AtomicUsize
+}
 
 #[derive(Default)]
@@ -65,7 +72,19 @@ impl Progress {
             });
         }
 
-        ProgressView { steps: step_view, percentage: percentage * 100.0 }
+        let embedder_view = {
+            let (last_error, error_count) = match self.embedder_stats.errors.read() {
+                Ok(guard) => (guard.0.clone(), guard.1),
+                Err(_) => (None, 0),
+            };
+            EmbedderStatsView {
+                last_error,
+                request_count: self.embedder_stats.total_requests.load(Ordering::Relaxed) as u32,
+                error_count,
+            }
+        };
+
+        ProgressView { steps: step_view, percentage: percentage * 100.0, embedder: embedder_view }
     }
 
     pub fn accumulated_durations(&self) -> IndexMap<String, String> {
@@ -209,6 +228,7 @@ make_enum_progress! {
 pub struct ProgressView {
     pub steps: Vec<ProgressStepView>,
     pub percentage: f32,
+    pub embedder: EmbedderStatsView,
 }
 
 #[derive(Debug, Serialize, Clone, ToSchema)]
@@ -220,6 +240,16 @@ pub struct ProgressStepView {
     pub total: u32,
 }
 
+#[derive(Debug, Serialize, Clone, ToSchema)]
+#[serde(rename_all = "camelCase")]
+#[schema(rename_all = "camelCase")]
+pub struct EmbedderStatsView {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub last_error: Option<String>,
+    pub request_count: u32,
+    pub error_count: u32,
+}
+
 /// Used when the name can change but it's still the same step.
 /// To avoid conflicts on the `TypeId`, create a unique type every time you use this step:
 /// ```text
diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs
index 4a6cc9b90..e7634a4eb 100644
--- a/crates/milli/src/search/new/tests/integration.rs
+++ b/crates/milli/src/search/new/tests/integration.rs
@@ -44,7 +44,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
         S("america") => vec![S("the united states")],
     });
     builder.set_searchable_fields(vec![S("title"), S("description")]);
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(|_| (), || false, None).unwrap();
     wtxn.commit().unwrap();
 
     // index documents
diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs
index dfd570b96..634d45195 100644
--- a/crates/milli/src/test_index.rs
+++ b/crates/milli/src/test_index.rs
@@ -134,7 +134,7 @@ impl TempIndex {
     ) -> Result<(), crate::error::Error> {
         let mut builder = update::Settings::new(wtxn, &self.inner, &self.indexer_config);
         update(&mut builder);
-        builder.execute(drop, || false)?;
+        builder.execute(drop, || false, None)?;
         Ok(())
     }
 
diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs
index cb8c121ce..5e6bde53d 100644
--- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -17,6 +17,7 @@ use crate::constants::RESERVED_VECTORS_FIELD_NAME;
 use crate::error::FaultSource;
 use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
 use crate::index::IndexEmbeddingConfig;
+use crate::progress::EmbedderStats;
 use crate::prompt::Prompt;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::settings::InnerIndexSettingsDiff;
@@ -682,6 +683,7 @@ pub fn extract_embeddings(
     embedder: Arc<Embedder>,
     embedder_name: &str,
     possible_embedding_mistakes: &PossibleEmbeddingMistakes,
+    embedder_stats: Option<Arc<EmbedderStats>>,
     unused_vectors_distribution: &UnusedVectorsDistribution,
     request_threads: &ThreadPoolNoAbort,
 ) -> Result<grenad::Reader<BufReader<File>>> {
@@ -724,6 +726,7 @@ pub fn extract_embeddings(
                 std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)),
                 embedder_name,
                 possible_embedding_mistakes,
+                embedder_stats.clone(),
                 unused_vectors_distribution,
                 request_threads,
             )?;
@@ -746,6 +749,7 @@ pub fn extract_embeddings(
             std::mem::take(&mut chunks),
             embedder_name,
             possible_embedding_mistakes,
+            embedder_stats.clone(),
             unused_vectors_distribution,
             request_threads,
         )?;
@@ -764,6 +768,7 @@ pub fn extract_embeddings(
             vec![std::mem::take(&mut current_chunk)],
             embedder_name,
             possible_embedding_mistakes,
+            embedder_stats,
             unused_vectors_distribution,
             request_threads,
         )?;
@@ -783,10 +788,11 @@ fn embed_chunks(
     text_chunks: Vec<Vec<String>>,
     embedder_name: &str,
     possible_embedding_mistakes: &PossibleEmbeddingMistakes,
+    embedder_stats: Option<Arc<EmbedderStats>>,
     unused_vectors_distribution: &UnusedVectorsDistribution,
     request_threads: &ThreadPoolNoAbort,
 ) -> Result<Vec<Vec<Embedding>>> {
-    match embedder.embed_index(text_chunks, request_threads) {
+    match embedder.embed_index(text_chunks, request_threads, embedder_stats) {
         Ok(chunks) => Ok(chunks),
         Err(error) => {
             if let FaultSource::Bug = error.fault {
diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs
index 8cd664a2f..020b48f2c 100644
--- a/crates/milli/src/update/index_documents/extract/mod.rs
+++ b/crates/milli/src/update/index_documents/extract/mod.rs
@@ -31,6 +31,7 @@ use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
 use super::{helpers, TypedChunk};
 use crate::index::IndexEmbeddingConfig;
+use crate::progress::EmbedderStats;
 use crate::update::settings::InnerIndexSettingsDiff;
 use crate::vector::error::PossibleEmbeddingMistakes;
 use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
@@ -49,6 +50,7 @@ pub(crate) fn data_from_obkv_documents(
     settings_diff: Arc<InnerIndexSettingsDiff>,
     max_positions_per_attributes: Option<u32>,
     possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
+    embedder_stats: Option<Arc<EmbedderStats>>,
 ) -> Result<()> {
     let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
         || {
@@ -62,6 +64,7 @@ pub(crate) fn data_from_obkv_documents(
                         embedders_configs.clone(),
                         settings_diff.clone(),
                         possible_embedding_mistakes.clone(),
+                        embedder_stats.clone(),
                     )
                 })
                 .collect::<Result<()>>()
@@ -231,6 +234,7 @@ fn send_original_documents_data(
     embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
     settings_diff: Arc<InnerIndexSettingsDiff>,
     possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
+    embedder_stats: Option<Arc<EmbedderStats>>,
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@@ -270,6 +274,7 @@ fn send_original_documents_data(
                             embedder.clone(),
                             &embedder_name,
                             &possible_embedding_mistakes,
+                            embedder_stats.clone(),
                             &unused_vectors_distribution,
                             request_threads(),
                         ) {
diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs
index f547c68d4..fad43bd30 100644
--- a/crates/milli/src/update/index_documents/mod.rs
+++ b/crates/milli/src/update/index_documents/mod.rs
@@ -32,7 +32,7 @@ use crate::database_stats::DatabaseStats;
 use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError};
 use crate::index::{PrefixSearch, PrefixSettings};
-use crate::progress::Progress;
+use crate::progress::{EmbedderStats, Progress};
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
@@ -81,6 +81,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
     added_documents: u64,
     deleted_documents: u64,
     embedders: EmbeddingConfigs,
+    embedder_stats: Option<Arc<EmbedderStats>>,
 }
 
 #[derive(Default, Debug, Clone)]
@@ -103,6 +104,7 @@ where
         config: IndexDocumentsConfig,
         progress: FP,
         should_abort: FA,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> Result<IndexDocuments<'t, 'i, 'a, FP, FA>> {
         let transform = Some(Transform::new(
             wtxn,
@@ -123,6 +125,7 @@ where
             added_documents: 0,
             deleted_documents: 0,
             embedders: Default::default(),
+            embedder_stats,
         })
     }
 
@@ -292,6 +295,7 @@ where
 
         // Run extraction pipeline in parallel.
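        // Annotation added in this review, not a line of the original patch: the
        // `self.embedder_stats.clone()` introduced just below is a cheap refcount
        // bump on an `Option<Arc<EmbedderStats>>`. It is needed because the
        // `rayon::spawn` closure further down must be `'static` and cannot borrow
        // `self`, and it means every parallel extraction job reports into the
        // same shared request counter and error slot.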
         let mut modified_docids = RoaringBitmap::new();
+        let embedder_stats = self.embedder_stats.clone();
         pool.install(|| {
             let settings_diff_cloned = settings_diff.clone();
             rayon::spawn(move || {
@@ -326,7 +330,8 @@ where
                         embedders_configs.clone(),
                         settings_diff_cloned,
                         max_positions_per_attributes,
-                        Arc::new(possible_embedding_mistakes)
+                        Arc::new(possible_embedding_mistakes),
+                        embedder_stats.clone()
                     )
                 });
 
diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs
index 43647e786..5b6559d74 100644
--- a/crates/milli/src/update/new/extract/vectors/mod.rs
+++ b/crates/milli/src/update/new/extract/vectors/mod.rs
@@ -450,7 +450,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
             return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)));
         }
 
-        let res = match embedder.embed_index_ref(texts.as_slice(), threads) {
+        let res = match embedder.embed_index_ref(texts.as_slice(), threads, None) {
             Ok(embeddings) => {
                 for (docid, embedding) in ids.into_iter().zip(embeddings) {
                     sender.set_vector(*docid, embedder_id, embedding).unwrap();
diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs
index f396cd079..7c5a70aa3 100644
--- a/crates/milli/src/update/settings.rs
+++ b/crates/milli/src/update/settings.rs
@@ -27,6 +27,7 @@ use crate::index::{
     DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
 };
 use crate::order_by_map::OrderByMap;
+use crate::progress::EmbedderStats;
 use crate::prompt::{default_max_bytes, default_template_text, PromptData};
 use crate::proximity::ProximityPrecision;
 use crate::update::index_documents::IndexDocumentsMethod;
@@ -466,7 +467,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
 
     #[tracing::instrument(
         level = "trace",
-        skip(self, progress_callback, should_abort, settings_diff),
+        skip(self, progress_callback, should_abort, settings_diff, embedder_stats),
         target = "indexing::documents"
     )]
     fn reindex<FP, FA>(
@@ -474,6 +475,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         progress_callback: &FP,
         should_abort: &FA,
         settings_diff: InnerIndexSettingsDiff,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> Result<()>
     where
         FP: Fn(UpdateIndexingStep) + Sync,
@@ -505,6 +507,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             IndexDocumentsConfig::default(),
             &progress_callback,
             &should_abort,
+            embedder_stats,
         )?;
 
         indexing_builder.execute_raw(output)?;
@@ -1355,7 +1358,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         }
     }
 
-    pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
+    pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA, embedder_stats: Option<Arc<EmbedderStats>>) -> Result<()>
     where
         FP: Fn(UpdateIndexingStep) + Sync,
         FA: Fn() -> bool + Sync,
@@ -1413,7 +1416,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         );
 
         if inner_settings_diff.any_reindexing_needed() {
-            self.reindex(&progress_callback, &should_abort, inner_settings_diff)?;
+            self.reindex(&progress_callback, &should_abort, inner_settings_diff, embedder_stats)?;
         }
 
         Ok(())
diff --git a/crates/milli/src/vector/composite.rs b/crates/milli/src/vector/composite.rs
index 9c5992bd3..daec50e4b 100644
--- a/crates/milli/src/vector/composite.rs
+++ b/crates/milli/src/vector/composite.rs
@@ -1,3 +1,4 @@
+use std::sync::Arc;
 use std::time::Instant;
 
 use arroy::Distance;
@@ -7,6 +8,7 @@ use super::{
     hf, manual, ollama, openai, rest, DistributionShift, EmbedError, Embedding, EmbeddingCache,
     NewEmbedderError,
 };
+use crate::progress::EmbedderStats;
 use crate::ThreadPoolNoAbort;
 
 #[derive(Debug)]
@@ -81,6 +83,7 @@ impl Embedder {
                 "This is a sample text. It is meant to compare similarity.".into(),
             ],
             None,
+            None,
         )
         .map_err(|error| NewEmbedderError::composite_test_embedding_failed(error, "search"))?;
 
@@ -92,6 +95,7 @@ impl Embedder {
                 "This is a sample text. It is meant to compare similarity.".into(),
             ],
             None,
+            None,
         )
         .map_err(|error| {
             NewEmbedderError::composite_test_embedding_failed(error, "indexing")
@@ -150,13 +154,14 @@ impl SubEmbedder {
         &self,
         texts: Vec<String>,
         deadline: Option<Instant>,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> std::result::Result<Vec<Embedding>, EmbedError> {
         match self {
             SubEmbedder::HuggingFace(embedder) => embedder.embed(texts),
-            SubEmbedder::OpenAi(embedder) => embedder.embed(&texts, deadline),
-            SubEmbedder::Ollama(embedder) => embedder.embed(&texts, deadline),
+            SubEmbedder::OpenAi(embedder) => embedder.embed(&texts, deadline, embedder_stats),
+            SubEmbedder::Ollama(embedder) => embedder.embed(&texts, deadline, embedder_stats),
             SubEmbedder::UserProvided(embedder) => embedder.embed(&texts),
-            SubEmbedder::Rest(embedder) => embedder.embed(texts, deadline),
+            SubEmbedder::Rest(embedder) => embedder.embed(texts, deadline, embedder_stats),
         }
     }
 
@@ -164,18 +169,19 @@ impl SubEmbedder {
         &self,
         text: &str,
         deadline: Option<Instant>,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> std::result::Result<Embedding, EmbedError> {
         match self {
             SubEmbedder::HuggingFace(embedder) => embedder.embed_one(text),
             SubEmbedder::OpenAi(embedder) => {
-                embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding)
+                embedder.embed(&[text], deadline, embedder_stats)?.pop().ok_or_else(EmbedError::missing_embedding)
             }
             SubEmbedder::Ollama(embedder) => {
-                embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding)
+                embedder.embed(&[text], deadline, embedder_stats)?.pop().ok_or_else(EmbedError::missing_embedding)
             }
             SubEmbedder::UserProvided(embedder) => embedder.embed_one(text),
             SubEmbedder::Rest(embedder) => embedder
-                .embed_ref(&[text], deadline)?
+                .embed_ref(&[text], deadline, embedder_stats)?
                 .pop()
                 .ok_or_else(EmbedError::missing_embedding),
         }
     }
 
@@ -188,13 +194,14 @@ impl SubEmbedder {
         &self,
         text_chunks: Vec<Vec<String>>,
         threads: &ThreadPoolNoAbort,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
         match self {
             SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks),
-            SubEmbedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads),
-            SubEmbedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads),
+            SubEmbedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats),
+            SubEmbedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats),
             SubEmbedder::UserProvided(embedder) => embedder.embed_index(text_chunks),
-            SubEmbedder::Rest(embedder) => embedder.embed_index(text_chunks, threads),
+            SubEmbedder::Rest(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats),
         }
     }
 
@@ -203,13 +210,14 @@ impl SubEmbedder {
         &self,
         texts: &[&str],
         threads: &ThreadPoolNoAbort,
+        embedder_stats: Option<Arc<EmbedderStats>>,
    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
         match self {
             SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts),
-            SubEmbedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads),
-            SubEmbedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads),
+            SubEmbedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats),
+            SubEmbedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats),
             SubEmbedder::UserProvided(embedder) => embedder.embed_index_ref(texts),
-            SubEmbedder::Rest(embedder) => embedder.embed_index_ref(texts, threads),
+            SubEmbedder::Rest(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats),
         }
     }
 
diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs
index c2978f5db..124e17cff 100644
--- a/crates/milli/src/vector/mod.rs
+++ b/crates/milli/src/vector/mod.rs
@@ -13,7 +13,7 @@ use serde::{Deserialize, Serialize};
 use utoipa::ToSchema;
 
 use self::error::{EmbedError, NewEmbedderError};
-use crate::progress::Progress;
+use crate::progress::{EmbedderStats, Progress};
 use crate::prompt::{Prompt, PromptData};
 use crate::ThreadPoolNoAbort;
 
@@ -720,17 +720,17 @@ impl Embedder {
         let embedding = match self {
             Embedder::HuggingFace(embedder) => embedder.embed_one(text),
             Embedder::OpenAi(embedder) => {
-                embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding)
+                embedder.embed(&[text], deadline, None)?.pop().ok_or_else(EmbedError::missing_embedding)
             }
             Embedder::Ollama(embedder) => {
-                embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding)
+                embedder.embed(&[text], deadline, None)?.pop().ok_or_else(EmbedError::missing_embedding)
             }
             Embedder::UserProvided(embedder) => embedder.embed_one(text),
             Embedder::Rest(embedder) => embedder
-                .embed_ref(&[text], deadline)?
+                .embed_ref(&[text], deadline, None)?
                 .pop()
                 .ok_or_else(EmbedError::missing_embedding),
-            Embedder::Composite(embedder) => embedder.search.embed_one(text, deadline),
+            Embedder::Composite(embedder) => embedder.search.embed_one(text, deadline, None),
         }?;
 
         if let Some(cache) = self.cache() {
@@ -747,14 +747,15 @@ impl Embedder {
         &self,
         text_chunks: Vec<Vec<String>>,
         threads: &ThreadPoolNoAbort,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
         match self {
             Embedder::HuggingFace(embedder) => embedder.embed_index(text_chunks),
-            Embedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads),
-            Embedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads),
+            Embedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats),
+            Embedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats),
             Embedder::UserProvided(embedder) => embedder.embed_index(text_chunks),
-            Embedder::Rest(embedder) => embedder.embed_index(text_chunks, threads),
-            Embedder::Composite(embedder) => embedder.index.embed_index(text_chunks, threads),
+            Embedder::Rest(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats),
+            Embedder::Composite(embedder) => embedder.index.embed_index(text_chunks, threads, embedder_stats),
         }
     }
 
@@ -763,14 +764,15 @@ impl Embedder {
         &self,
         texts: &[&str],
         threads: &ThreadPoolNoAbort,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> std::result::Result<Vec<Embedding>, EmbedError> {
         match self {
             Embedder::HuggingFace(embedder) => embedder.embed_index_ref(texts),
-            Embedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads),
-            Embedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads),
+            Embedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats),
+            Embedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats),
             Embedder::UserProvided(embedder) => embedder.embed_index_ref(texts),
-            Embedder::Rest(embedder) => embedder.embed_index_ref(texts, threads),
-            Embedder::Composite(embedder) => embedder.index.embed_index_ref(texts, threads),
+            Embedder::Rest(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats),
+            Embedder::Composite(embedder) => embedder.index.embed_index_ref(texts, threads, embedder_stats),
         }
     }
 
diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/ollama.rs
index 8beae6205..b3ee925e6 100644
--- a/crates/milli/src/vector/ollama.rs
+++ b/crates/milli/src/vector/ollama.rs
@@ -1,3 +1,4 @@
+use std::sync::Arc;
 use std::time::Instant;
 
 use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _};
@@ -7,6 +8,7 @@ use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErro
 use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
 use super::{DistributionShift, EmbeddingCache, REQUEST_PARALLELISM};
 use crate::error::FaultSource;
+use crate::progress::EmbedderStats;
 use crate::vector::Embedding;
 use crate::ThreadPoolNoAbort;
 
@@ -104,8 +106,9 @@ impl Embedder {
         &self,
         texts: &[S],
         deadline: Option<Instant>,
+        embedder_stats: Option<Arc<EmbedderStats>>
     ) -> Result<Vec<Embedding>, EmbedError> {
-        match self.rest_embedder.embed_ref(texts, deadline) {
+        match self.rest_embedder.embed_ref(texts, deadline, embedder_stats) {
             Ok(embeddings) => Ok(embeddings),
             Err(EmbedError { kind: EmbedErrorKind::RestOtherStatusCode(404, error), fault: _ }) => {
                 Err(EmbedError::ollama_model_not_found(error))
@@ -118,15 +121,16 @@ impl Embedder {
         &self,
         text_chunks: Vec<Vec<String>>,
         threads: &ThreadPoolNoAbort,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> Result<Vec<Vec<Embedding>>, EmbedError> {
         // This condition helps reduce the number of active rayon jobs
         // so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
         if threads.active_operations() >= REQUEST_PARALLELISM {
-            text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None)).collect()
+            text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())).collect()
         } else {
             threads
                 .install(move || {
-                    text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect()
+                    text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())).collect()
                 })
                 .map_err(|error| EmbedError {
                     kind: EmbedErrorKind::PanicInThreadPool(error),
@@ -139,13 +143,14 @@ impl Embedder {
         &self,
         texts: &[&str],
         threads: &ThreadPoolNoAbort,
+        embedder_stats: Option<Arc<EmbedderStats>>
     ) -> Result<Vec<Embedding>, EmbedError> {
         // This condition helps reduce the number of active rayon jobs
         // so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
         if threads.active_operations() >= REQUEST_PARALLELISM {
             let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
                 .chunks(self.prompt_count_in_chunk_hint())
-                .map(move |chunk| self.embed(chunk, None))
+                .map(move |chunk| self.embed(chunk, None, embedder_stats.clone()))
                 .collect();
 
             let embeddings = embeddings?;
@@ -155,7 +160,7 @@ impl Embedder {
                 .install(move || {
                     let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
                         .par_chunks(self.prompt_count_in_chunk_hint())
-                        .map(move |chunk| self.embed(chunk, None))
+                        .map(move |chunk| self.embed(chunk, None, embedder_stats.clone()))
                         .collect();
 
                     let embeddings = embeddings?;
diff --git a/crates/milli/src/vector/openai.rs b/crates/milli/src/vector/openai.rs
index df29f6916..384abe880 100644
--- a/crates/milli/src/vector/openai.rs
+++ b/crates/milli/src/vector/openai.rs
@@ -1,4 +1,5 @@
 use std::fmt;
+use std::sync::Arc;
 use std::time::Instant;
 
 use ordered_float::OrderedFloat;
@@ -9,6 +10,7 @@ use super::error::{EmbedError, NewEmbedderError};
 use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
 use super::{DistributionShift, EmbeddingCache, REQUEST_PARALLELISM};
 use crate::error::FaultSource;
+use crate::progress::EmbedderStats;
 use crate::vector::error::EmbedErrorKind;
 use crate::vector::Embedding;
 use crate::ThreadPoolNoAbort;
@@ -215,8 +217,9 @@ impl Embedder {
         &self,
         texts: &[S],
         deadline: Option<Instant>,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> Result<Vec<Embedding>, EmbedError> {
-        match self.rest_embedder.embed_ref(texts, deadline) {
+        match self.rest_embedder.embed_ref(texts, deadline, embedder_stats) {
             Ok(embeddings) => Ok(embeddings),
             Err(EmbedError { kind: EmbedErrorKind::RestBadRequest(error, _), fault: _ }) => {
                 tracing::warn!(error=?error, "OpenAI: received `BAD_REQUEST`. Input was maybe too long, retrying on tokenized version. For best performance, limit the size of your document template.");
@@ -238,7 +241,7 @@ impl Embedder {
             let encoded = self.tokenizer.encode_ordinary(text);
             let len = encoded.len();
             if len < max_token_count {
-                all_embeddings.append(&mut self.rest_embedder.embed_ref(&[text], deadline)?);
+                all_embeddings.append(&mut self.rest_embedder.embed_ref(&[text], deadline, None)?);
                 continue;
             }
 
@@ -255,15 +258,16 @@ impl Embedder {
         &self,
         text_chunks: Vec<Vec<String>>,
         threads: &ThreadPoolNoAbort,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> Result<Vec<Vec<Embedding>>, EmbedError> {
         // This condition helps reduce the number of active rayon jobs
         // so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
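        // Review sketch, not part of the original patch: `embedder_stats` is an
        // `Option<Arc<EmbedderStats>>`, so the `embedder_stats.clone()` calls in
        // both branches below hand every chunk job a handle to one shared stats
        // object; `rest::embed()` then bumps `total_requests` once per HTTP
        // attempt and records the latest failure under the `errors` RwLock.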
         if threads.active_operations() >= REQUEST_PARALLELISM {
-            text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None)).collect()
+            text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())).collect()
         } else {
             threads
                 .install(move || {
-                    text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect()
+                    text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())).collect()
                 })
                 .map_err(|error| EmbedError {
                     kind: EmbedErrorKind::PanicInThreadPool(error),
@@ -276,13 +280,14 @@ impl Embedder {
         &self,
         texts: &[&str],
         threads: &ThreadPoolNoAbort,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> Result<Vec<Embedding>, EmbedError> {
         // This condition helps reduce the number of active rayon jobs
         // so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
         if threads.active_operations() >= REQUEST_PARALLELISM {
             let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
                 .chunks(self.prompt_count_in_chunk_hint())
-                .map(move |chunk| self.embed(chunk, None))
+                .map(move |chunk| self.embed(chunk, None, embedder_stats.clone()))
                 .collect();
             let embeddings = embeddings?;
             Ok(embeddings.into_iter().flatten().collect())
         } else {
             threads
                 .install(move || {
                     let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
                         .par_chunks(self.prompt_count_in_chunk_hint())
-                        .map(move |chunk| self.embed(chunk, None))
+                        .map(move |chunk| self.embed(chunk, None, embedder_stats.clone()))
                         .collect();
 
                     let embeddings = embeddings?;
diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs
index b87ac9f77..fc0ff308b 100644
--- a/crates/milli/src/vector/rest.rs
+++ b/crates/milli/src/vector/rest.rs
@@ -1,4 +1,5 @@
 use std::collections::BTreeMap;
+use std::sync::Arc;
 use std::time::Instant;
 
 use deserr::Deserr;
@@ -14,6 +15,7 @@ use super::{
 };
 use crate::error::FaultSource;
 use crate::ThreadPoolNoAbort;
+use crate::progress::EmbedderStats;
 
 // retrying in case of failure
 pub struct Retry {
@@ -168,19 +170,21 @@ impl Embedder {
         &self,
         texts: Vec<String>,
         deadline: Option<Instant>,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> Result<Vec<Embedding>, EmbedError> {
-        embed(&self.data, texts.as_slice(), texts.len(), Some(self.dimensions), deadline)
+        embed(&self.data, texts.as_slice(), texts.len(), Some(self.dimensions), deadline, embedder_stats)
     }
 
     pub fn embed_ref<S>(
         &self,
         texts: &[S],
         deadline: Option<Instant>,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> Result<Vec<Embedding>, EmbedError>
     where
         S: AsRef<str> + Serialize,
     {
-        embed(&self.data, texts, texts.len(), Some(self.dimensions), deadline)
+        embed(&self.data, texts, texts.len(), Some(self.dimensions), deadline, embedder_stats)
     }
 
     pub fn embed_tokens(
@@ -188,7 +192,7 @@ impl Embedder {
         tokens: &[u32],
         deadline: Option<Instant>,
     ) -> Result<Embedding, EmbedError> {
-        let mut embeddings = embed(&self.data, tokens, 1, Some(self.dimensions), deadline)?;
+        let mut embeddings = embed(&self.data, tokens, 1, Some(self.dimensions), deadline, None)?;
         // unwrap: guaranteed that embeddings.len() == 1, otherwise the previous line terminated in error
         Ok(embeddings.pop().unwrap())
     }
@@ -197,15 +201,16 @@ impl Embedder {
         &self,
         text_chunks: Vec<Vec<String>>,
         threads: &ThreadPoolNoAbort,
+        embedder_stats: Option<Arc<EmbedderStats>>,
     ) -> Result<Vec<Vec<Embedding>>, EmbedError> {
         // This condition helps reduce the number of active rayon jobs
         // so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
         if threads.active_operations() >= REQUEST_PARALLELISM {
-            text_chunks.into_iter().map(move |chunk| self.embed(chunk, None)).collect()
+            text_chunks.into_iter().map(move |chunk| self.embed(chunk, None, embedder_stats.clone())).collect()
         } else {
             threads
                 .install(move || {
-                    text_chunks.into_par_iter().map(move |chunk| self.embed(chunk, None)).collect()
+                    text_chunks.into_par_iter().map(move |chunk| self.embed(chunk, None, embedder_stats.clone())).collect()
                 })
                 .map_err(|error| EmbedError {
                     kind: EmbedErrorKind::PanicInThreadPool(error),
@@ -218,13 +223,14 @@ impl Embedder {
         &self,
         texts: &[&str],
         threads: &ThreadPoolNoAbort,
+        embedder_stats: Option<Arc<EmbedderStats>>
     ) -> Result<Vec<Embedding>, EmbedError> {
         // This condition helps reduce the number of active rayon jobs
         // so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
         if threads.active_operations() >= REQUEST_PARALLELISM {
             let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
                 .chunks(self.prompt_count_in_chunk_hint())
-                .map(move |chunk| self.embed_ref(chunk, None))
+                .map(move |chunk| self.embed_ref(chunk, None, embedder_stats.clone()))
                 .collect();
 
             let embeddings = embeddings?;
@@ -234,7 +240,7 @@ impl Embedder {
                 .install(move || {
                     let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
                         .par_chunks(self.prompt_count_in_chunk_hint())
-                        .map(move |chunk| self.embed_ref(chunk, None))
+                        .map(move |chunk| self.embed_ref(chunk, None, embedder_stats.clone()))
                         .collect();
 
                     let embeddings = embeddings?;
@@ -272,7 +278,7 @@ impl Embedder {
 }
 
 fn infer_dimensions(data: &EmbedderData) -> Result<usize, NewEmbedderError> {
-    let v = embed(data, ["test"].as_slice(), 1, None, None)
+    let v = embed(data, ["test"].as_slice(), 1, None, None, None)
         .map_err(NewEmbedderError::could_not_determine_dimension)?;
     // unwrap: guaranteed that v.len() == 1, otherwise the previous line terminated in error
     Ok(v.first().unwrap().len())
@@ -284,6 +290,7 @@ fn embed<S>(
     expected_count: usize,
     expected_dimension: Option<usize>,
     deadline: Option<Instant>,
+    embedder_stats: Option<Arc<EmbedderStats>>,
 ) -> Result<Vec<Embedding>, EmbedError>
 where
     S: Serialize,
@@ -302,6 +309,9 @@ where
     let body = data.request.inject_texts(inputs);
 
     for attempt in 0..10 {
+        if let Some(embedder_stats) = &embedder_stats {
+            embedder_stats.as_ref().total_requests.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        }
         let response = request.clone().send_json(&body);
         let result = check_response(response, data.configuration_source).and_then(|response| {
             response_to_embedding(response, data, expected_count, expected_dimension)
@@ -311,6 +321,12 @@ where
             Ok(response) => return Ok(response),
             Err(retry) => {
                 tracing::warn!("Failed: {}", retry.error);
+                if let Some(embedder_stats) = &embedder_stats {
+                    if let Ok(mut errors) = embedder_stats.errors.write() {
+                        errors.0 = Some(retry.error.to_string());
+                        errors.1 += 1;
+                    }
+                }
                 if let Some(deadline) = deadline {
                     let now = std::time::Instant::now();
                     if now > deadline {
@@ -336,12 +352,26 @@ where
         std::thread::sleep(retry_duration);
     }
 
+    if let Some(embedder_stats) = &embedder_stats {
+        embedder_stats.as_ref().total_requests.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+    }
     let response = request.send_json(&body);
-    let result = check_response(response, data.configuration_source);
-    result.map_err(Retry::into_error).and_then(|response| {
+    let result = check_response(response, data.configuration_source).and_then(|response| {
         response_to_embedding(response, data, expected_count, expected_dimension)
-            .map_err(Retry::into_error)
-    })
+    });
+
+    match result {
+        Ok(response) => Ok(response),
+        Err(retry) => {
+            if let Some(embedder_stats) = &embedder_stats {
+                if let Ok(mut errors) = embedder_stats.errors.write() {
+                    errors.0 = Some(retry.error.to_string());
+                    errors.1 += 1;
+                }
+            }
+            Err(retry.into_error())
+        }
+    }
 }
 
 fn check_response(
diff --git a/crates/milli/tests/search/distinct.rs b/crates/milli/tests/search/distinct.rs
index fc890dfe8..55e43c8fa 100644
--- a/crates/milli/tests/search/distinct.rs
+++ b/crates/milli/tests/search/distinct.rs
@@ -19,7 +19,7 @@ macro_rules! test_distinct {
             let config = milli::update::IndexerConfig::default();
             let mut builder = Settings::new(&mut wtxn, &index, &config);
             builder.set_distinct_field(S(stringify!($distinct)));
-            builder.execute(|_| (), || false).unwrap();
+            builder.execute(|_| (), || false, None).unwrap();
             wtxn.commit().unwrap();
 
             let rtxn = index.read_txn().unwrap();
diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs
index 8934cbea4..588662735 100644
--- a/crates/milli/tests/search/facet_distribution.rs
+++ b/crates/milli/tests/search/facet_distribution.rs
@@ -25,7 +25,7 @@ fn test_facet_distribution_with_no_facet_values() {
         FilterableAttributesRule::Field(S("genres")),
         FilterableAttributesRule::Field(S("tags")),
     ]);
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(|_| (), || false, None).unwrap();
    wtxn.commit().unwrap();
 
     // index documents
diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs
index 906956716..1e0c24608 100644
--- a/crates/milli/tests/search/mod.rs
+++ b/crates/milli/tests/search/mod.rs
@@ -63,7 +63,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
         S("america") => vec![S("the united states")],
     });
     builder.set_searchable_fields(vec![S("title"), S("description")]);
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(|_| (), || false, None).unwrap();
     wtxn.commit().unwrap();
 
     // index documents
diff --git a/crates/milli/tests/search/phrase_search.rs b/crates/milli/tests/search/phrase_search.rs
index b7f792bfc..c5a95f7cd 100644
--- a/crates/milli/tests/search/phrase_search.rs
+++ b/crates/milli/tests/search/phrase_search.rs
@@ -10,7 +10,7 @@ fn set_stop_words(index: &Index, stop_words: &[&str]) {
     let mut builder = Settings::new(&mut wtxn, index, &config);
     let stop_words = stop_words.iter().map(|s| s.to_string()).collect();
     builder.set_stop_words(stop_words);
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(|_| (), || false, None).unwrap();
     wtxn.commit().unwrap();
 }
 
diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs
index 1acc89484..b7614c215 100644
--- a/crates/milli/tests/search/query_criteria.rs
+++ b/crates/milli/tests/search/query_criteria.rs
@@ -236,7 +236,7 @@ fn criteria_mixup() {
         let mut wtxn = index.write_txn().unwrap();
         let mut builder = Settings::new(&mut wtxn, &index, &config);
         builder.set_criteria(criteria.clone());
-        builder.execute(|_| (), || false).unwrap();
+        builder.execute(|_| (), || false, None).unwrap();
         wtxn.commit().unwrap();
 
         let rtxn = index.read_txn().unwrap();
@@ -276,7 +276,7 @@ fn criteria_ascdesc() {
         S("name"),
         S("age"),
     });
    builder.execute(|_| (), || false, None).unwrap();
     wtxn.commit().unwrap();
 
     let mut wtxn = index.write_txn().unwrap();
@@ -358,7 +358,7 @@ fn criteria_ascdesc() {
         let mut wtxn = index.write_txn().unwrap();
         let mut builder = Settings::new(&mut wtxn, &index, &config);
         builder.set_criteria(vec![criterion.clone()]);
-        builder.execute(|_| (), || false).unwrap();
+        builder.execute(|_| (), || false, None).unwrap();
         wtxn.commit().unwrap();
 
         let rtxn = index.read_txn().unwrap();
diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs
index 3c0717063..bf9a730c9 100644
--- a/crates/milli/tests/search/typo_tolerance.rs
+++ b/crates/milli/tests/search/typo_tolerance.rs
@@ -46,7 +46,7 @@ fn test_typo_tolerance_one_typo() {
     let config = IndexerConfig::default();
     let mut builder = Settings::new(&mut txn, &index, &config);
     builder.set_min_word_len_one_typo(4);
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(|_| (), || false, None).unwrap();
 
     // typo is now supported for 4 letters words
     let mut search = Search::new(&txn, &index);
@@ -92,7 +92,7 @@ fn test_typo_tolerance_two_typo() {
     let config = IndexerConfig::default();
     let mut builder = Settings::new(&mut txn, &index, &config);
     builder.set_min_word_len_two_typos(7);
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(|_| (), || false, None).unwrap();
 
     // typo is now supported for 4 letters words
     let mut search = Search::new(&txn, &index);
@@ -180,7 +180,7 @@ fn test_typo_disabled_on_word() {
     // `zealand` doesn't allow typos anymore
     exact_words.insert("zealand".to_string());
     builder.set_exact_words(exact_words);
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(|_| (), || false, None).unwrap();
 
     let mut search = Search::new(&txn, &index);
     search.query("zealand");
@@ -218,7 +218,7 @@ fn test_disable_typo_on_attribute() {
     let mut builder = Settings::new(&mut txn, &index, &config);
     // disable typos on `description`
     builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect());
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(|_| (), || false, None).unwrap();
 
     let mut search = Search::new(&txn, &index);
     search.query("antebelum");

From 5c46dc702aa7c19016913b81291c48039876ed97 Mon Sep 17 00:00:00 2001
From: Martin Tzvetanov Grigorov
Date: Sun, 22 Jun 2025 14:22:59 +0300
Subject: [PATCH 004/150] tests: Use Server::wait_task() instead of
 Index::wait_task()

The code is mostly duplicated.
Server::wait_task() has better handling for errors and more retries.
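
As an illustration (not part of the diff below; `task` stands for whatever
the previous call returned), the mechanical change applied throughout is:

    // before: wait through the index handle
    index.wait_task(task.uid()).await.succeeded();
    // after: wait through the server handle, which retries harder
    server.wait_task(task.uid()).await.succeeded();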
Signed-off-by: Martin Tzvetanov Grigorov
---
 .../tests/documents/add_documents.rs          | 90 +++++++++----------
 .../tests/documents/delete_documents.rs       | 41 ++++-----
 .../tests/documents/get_documents.rs          | 22 ++---
 .../tests/documents/update_documents.rs       | 22 ++---
 4 files changed, 88 insertions(+), 87 deletions(-)

diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs
index 1cf492fc0..b69d289e1 100644
--- a/crates/meilisearch/tests/documents/add_documents.rs
+++ b/crates/meilisearch/tests/documents/add_documents.rs
@@ -293,7 +293,7 @@ async fn add_csv_document() {
       "enqueuedAt": "[date]"
     }
     "#);
-    let response = index.wait_task(response.uid()).await.succeeded();
+    let response = server.wait_task(response.uid()).await.succeeded();
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }),
         @r###"
     {
       "uid": "[uid]",
@@ -358,7 +358,7 @@ async fn add_csv_document_with_types() {
       "enqueuedAt": "[date]"
     }
     "#);
-    let response = index.wait_task(response.uid()).await.succeeded();
+    let response = server.wait_task(response.uid()).await.succeeded();
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }),
         @r###"
    {
      "uid": "[uid]",
@@ -434,7 +434,7 @@ async fn add_csv_document_with_custom_delimiter() {
      "enqueuedAt": "[date]"
    }
    "#);
-    let response = index.wait_task(response.uid()).await.succeeded();
+    let response = server.wait_task(response.uid()).await.succeeded();
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }),
         @r###"
    {
      "uid": "[uid]",
@@ -991,7 +991,7 @@ async fn add_documents_no_index_creation() {
     let (response, code) = index.add_documents(documents, None).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.succeeded();
+    let response = server.wait_task(response.uid()).await.succeeded();
     snapshot!(code, @"202 Accepted");
 
     snapshot!(response,
        @r###"
@@ -1068,7 +1068,7 @@ async fn document_addition_with_primary_key() {
    }
    "#);
 
-    index.wait_task(response.uid()).await.succeeded();
+    server.wait_task(response.uid()).await.succeeded();
 
     let (response, code) = index.get_task(response.uid()).await;
     snapshot!(code, @"200 OK");
@@ -1120,7 +1120,7 @@ async fn document_addition_with_huge_int_primary_key() {
     let (response, code) = index.add_documents(documents, Some("primary")).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.succeeded();
+    let response = server.wait_task(response.uid()).await.succeeded();
     snapshot!(response,
        @r###"
@@ -1178,7 +1178,7 @@ async fn replace_document() {
    }
    "#);
 
-    index.wait_task(response.uid()).await.succeeded();
+    server.wait_task(response.uid()).await.succeeded();
 
     let documents = json!([
         {
@@ -1190,7 +1190,7 @@ async fn replace_document() {
     let (task, code) = index.add_documents(documents, None).await;
     snapshot!(code,@"202 Accepted");
 
-    index.wait_task(task.uid()).await.succeeded();
+    server.wait_task(task.uid()).await.succeeded();
 
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
@@ -1362,7 +1362,7 @@ async fn error_add_documents_bad_document_id() {
        }
    ]);
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -1399,7 +1399,7 @@ async fn error_add_documents_bad_document_id() {
        }
    ]);
     let (value, _code) = index.add_documents(documents, None).await;
-    index.wait_task(value.uid()).await.failed();
+    server.wait_task(value.uid()).await.failed();
     let (response, code) = index.get_task(value.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -1436,7 +1436,7 @@ async fn error_add_documents_bad_document_id() {
        }
    ]);
     let (value, _code) = index.add_documents(documents, None).await;
-    index.wait_task(value.uid()).await.failed();
+    server.wait_task(value.uid()).await.failed();
     let (response, code) = index.get_task(value.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -1478,7 +1478,7 @@ async fn error_add_documents_missing_document_id() {
        }
    ]);
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -1527,7 +1527,7 @@ async fn error_document_field_limit_reached_in_one_document() {
     let (response, code) = index.update_documents(documents, Some("id")).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.failed();
+    let response = server.wait_task(response.uid()).await.failed();
     snapshot!(code, @"202 Accepted");
     // Documents without a primary key are not accepted.
     snapshot!(response,
@@ -1576,7 +1576,7 @@ async fn error_document_field_limit_reached_over_multiple_documents() {
     let (response, code) = index.update_documents(documents, Some("id")).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.succeeded();
+    let response = server.wait_task(response.uid()).await.succeeded();
     snapshot!(code, @"202 Accepted");
     snapshot!(response,
        @r###"
    {
      "uid": "[uid]",
@@ -1611,7 +1611,7 @@ async fn error_document_field_limit_reached_over_multiple_documents() {
     let (response, code) = index.update_documents(documents, Some("id")).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.failed();
+    let response = server.wait_task(response.uid()).await.failed();
     snapshot!(code, @"202 Accepted");
     snapshot!(response,
        @r###"
    {
      "uid": "[uid]",
@@ -1660,7 +1660,7 @@ async fn error_document_field_limit_reached_in_one_nested_document() {
     let (response, code) = index.update_documents(documents, Some("id")).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.succeeded();
+    let response = server.wait_task(response.uid()).await.succeeded();
     snapshot!(code, @"202 Accepted");
     // Documents without a primary key are not accepted.
     snapshot!(response,
@@ -1705,7 +1705,7 @@ async fn error_document_field_limit_reached_over_multiple_documents_with_nested_
     let (response, code) = index.update_documents(documents, Some("id")).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.succeeded();
+    let response = server.wait_task(response.uid()).await.succeeded();
     snapshot!(code, @"202 Accepted");
     snapshot!(response,
        @r###"
    {
      "uid": "[uid]",
@@ -1741,7 +1741,7 @@ async fn error_document_field_limit_reached_over_multiple_documents_with_nested_
     let (response, code) = index.update_documents(documents, Some("id")).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.succeeded();
+    let response = server.wait_task(response.uid()).await.succeeded();
     snapshot!(code, @"202 Accepted");
     snapshot!(response,
        @r###"
    {
      "uid": "[uid]",
@@ -1790,7 +1790,7 @@ async fn add_documents_with_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    let response = index.wait_task(task.uid()).await.succeeded();
+    let response = server.wait_task(task.uid()).await.succeeded();
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
        @r#"
    {
@@ -1914,7 +1914,7 @@ async fn update_documents_with_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    let response = index.wait_task(task.uid()).await.succeeded();
+    let response = server.wait_task(task.uid()).await.succeeded();
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
        @r#"
    {
@@ -1983,7 +1983,7 @@ async fn update_documents_with_geo_field() {
        }
    ]);
     let (task, _status_code) = index.update_documents(updated_documents, None).await;
-    let response = index.wait_task(task.uid()).await.succeeded();
+    let response = server.wait_task(task.uid()).await.succeeded();
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
        @r###"
    {
@@ -2097,7 +2097,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".indexUid" => "[uuid]" }),
@@ -2135,7 +2135,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2173,7 +2173,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2211,7 +2211,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2249,7 +2249,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2287,7 +2287,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2325,7 +2325,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2363,7 +2363,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2401,7 +2401,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2439,7 +2439,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2477,7 +2477,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2515,7 +2515,7 @@ async fn add_documents_invalid_geo_field() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
     let (response, code) = index.get_task(task.uid()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@@ -2556,7 +2556,7 @@ async fn add_documents_invalid_geo_field() {
     let (response, code) = index.add_documents(documents, None).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.failed();
+    let response = server.wait_task(response.uid()).await.failed();
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
        @r###"
    {
@@ -2593,7 +2593,7 @@ async fn add_documents_invalid_geo_field() {
     let (response, code) = index.add_documents(documents, None).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.failed();
+    let response = server.wait_task(response.uid()).await.failed();
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
        @r###"
    {
@@ -2630,7 +2630,7 @@ async fn add_documents_invalid_geo_field() {
     let (response, code) = index.add_documents(documents, None).await;
     snapshot!(code, @"202 Accepted");
 
-    let response = index.wait_task(response.uid()).await.failed();
+    let response = server.wait_task(response.uid()).await.failed();
     snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
        @r###"
    {
@@ -2674,7 +2674,7 @@ async fn add_invalid_geo_and_then_settings() {
    ]);
     let (ret, code) = index.add_documents(documents, None).await;
     snapshot!(code, @"202 Accepted");
-    let ret = index.wait_task(ret.uid()).await.succeeded();
+    let ret = server.wait_task(ret.uid()).await.succeeded();
     snapshot!(ret, @r###"
    {
      "uid": "[uid]",
@@ -2697,7 +2697,7 @@ async fn add_invalid_geo_and_then_settings() {
 
     let (ret, code) = index.update_settings(json!({ "sortableAttributes": ["_geo"] })).await;
     snapshot!(code, @"202 Accepted");
-    let ret = index.wait_task(ret.uid()).await.failed();
+    let ret = server.wait_task(ret.uid()).await.failed();
     snapshot!(ret, @r###"
    {
      "uid": "[uid]",
@@ -2765,7 +2765,7 @@ async fn error_primary_key_inference() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
 
     let (response, code) = index.get_task(task.uid()).await;
     assert_eq!(code, 200);
@@ -2806,7 +2806,7 @@ async fn error_primary_key_inference() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.failed();
+    server.wait_task(task.uid()).await.failed();
 
     let (response, code) = index.get_task(task.uid()).await;
     assert_eq!(code, 200);
@@ -2845,7 +2845,7 @@ async fn error_primary_key_inference() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents, None).await;
-    index.wait_task(task.uid()).await.succeeded();
+    server.wait_task(task.uid()).await.succeeded();
 
     let (response, code) = index.get_task(task.uid()).await;
     assert_eq!(code, 200);
@@ -2884,12 +2884,12 @@ async fn add_documents_with_primary_key_twice() {
    ]);
 
     let (task, _status_code) = index.add_documents(documents.clone(), Some("title")).await;
-    index.wait_task(task.uid()).await.succeeded();
+    server.wait_task(task.uid()).await.succeeded();
     let (response, _code) = index.get_task(task.uid()).await;
     assert_eq!(response["status"], "succeeded");
 
     let (task, _status_code) = index.add_documents(documents, Some("title")).await;
-    index.wait_task(task.uid()).await.succeeded();
+    server.wait_task(task.uid()).await.succeeded();
     let (response, _code) = index.get_task(task.uid()).await;
     assert_eq!(response["status"], "succeeded");
 }
@@ -2922,7 +2922,7 @@ async fn batch_several_documents_addition() {
     // wait first batch of documents to finish
     let finished_tasks = futures::future::join_all(waiter).await;
     for (task, _code) in finished_tasks {
-        index.wait_task(task.uid()).await;
+        server.wait_task(task.uid()).await;
     }
 
     // run a second completely failing batch
@@ -2936,7 +2936,7 @@ async fn batch_several_documents_addition() {
     // wait second batch of documents to finish
     let finished_tasks = futures::future::join_all(waiter).await;
     for (task, _code) in finished_tasks {
-        index.wait_task(task.uid()).await;
+        server.wait_task(task.uid()).await;
     }
 
     let (response, _code) = index.filtered_tasks(&[], &["failed"], &[]).await;
diff
--git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index 5ea122bd0..9c367cb51 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -5,11 +5,12 @@ use crate::json; #[actix_rt::test] async fn delete_one_document_unexisting_index() { + let server = Server::new_shared(); let index = shared_does_not_exists_index().await; let (task, code) = index.delete_document_by_filter_fail(json!({"filter": "a = b"})).await; assert_eq!(code, 202); - index.wait_task(task.uid()).await.failed(); + server.wait_task(task.uid()).await.failed(); } #[actix_rt::test] @@ -19,7 +20,7 @@ async fn delete_one_unexisting_document() { index.create(None).await; let (response, code) = index.delete_document(0).await; assert_eq!(code, 202, "{response}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); } #[actix_rt::test] @@ -28,10 +29,10 @@ async fn delete_one_document() { let index = server.unique_index(); let (task, _status_code) = index.add_documents(json!([{ "id": 0, "content": "foobar" }]), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, status_code) = index.delete_document(0).await; assert_eq!(status_code, 202); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (_response, code) = index.get_document(0, None).await; assert_eq!(code, 404); @@ -44,7 +45,7 @@ async fn clear_all_documents_unexisting_index() { let (task, code) = index.clear_all_documents().await; assert_eq!(code, 202); - index.wait_task(task.uid()).await.failed(); + server.wait_task(task.uid()).await.failed(); } #[actix_rt::test] @@ -57,11 +58,11 @@ async fn clear_all_documents() { None, ) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, code) = index.clear_all_documents().await; assert_eq!(code, 202); - let _update = index.wait_task(task.uid()).await.succeeded(); + let _update = server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; assert_eq!(code, 200); assert!(response["results"].as_array().unwrap().is_empty()); @@ -72,11 +73,11 @@ async fn clear_all_documents_empty_index() { let server = Server::new_shared(); let index = server.unique_index(); let (task, _status_code) = index.create(None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, code) = index.clear_all_documents().await; assert_eq!(code, 202); - let _update = index.wait_task(task.uid()).await.succeeded(); + let _update = server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; assert_eq!(code, 200); assert!(response["results"].as_array().unwrap().is_empty()); @@ -95,7 +96,7 @@ async fn error_delete_batch_unexisting_index() { }); assert_eq!(code, 202); - let response = index.wait_task(task.uid()).await.failed(); + let response = server.wait_task(task.uid()).await.failed(); assert_eq!(response["error"], expected_response); } @@ -104,11 +105,11 @@ async fn delete_batch() { let server = Server::new_shared(); let index = server.unique_index(); let (task,_status_code) = index.add_documents(json!([{ "id": 1, "content": "foobar" }, { "id": 0, "content": "foobar" }, { "id": 3, 
"content": "foobar" }]), Some("id")).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, code) = index.delete_batch(vec![1, 0]).await; assert_eq!(code, 202); - let _update = index.wait_task(task.uid()).await.succeeded(); + let _update = server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 1); @@ -120,11 +121,11 @@ async fn delete_no_document_batch() { let server = Server::new_shared(); let index = server.unique_index(); let (task,_status_code) = index.add_documents(json!([{ "id": 1, "content": "foobar" }, { "id": 0, "content": "foobar" }, { "id": 3, "content": "foobar" }]), Some("id")).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.delete_batch(vec![]).await; assert_eq!(code, 202, "{response}"); - let _update = index.wait_task(response.uid()).await.succeeded(); + let _update = server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 3); @@ -146,7 +147,7 @@ async fn delete_document_by_filter() { Some("id"), ) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (stats, _) = index.stats().await; snapshot!(json_string!(stats, { @@ -180,7 +181,7 @@ async fn delete_document_by_filter() { } "###); - let response = index.wait_task(response.uid()).await.succeeded(); + let response = server.wait_task(response.uid()).await.succeeded(); snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }), @r###" { "uid": "[uid]", @@ -253,7 +254,7 @@ async fn delete_document_by_filter() { } "###); - let response = index.wait_task(response.uid()).await.succeeded(); + let response = server.wait_task(response.uid()).await.succeeded(); snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }), @r###" { "uid": "[uid]", @@ -328,7 +329,7 @@ async fn delete_document_by_complex_filter() { Some("id"), ) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index .delete_document_by_filter( json!({ "filter": ["color != red", "color != green", "color EXISTS"] }), @@ -345,7 +346,7 @@ async fn delete_document_by_complex_filter() { } "###); - let response = index.wait_task(response.uid()).await.succeeded(); + let response = server.wait_task(response.uid()).await.succeeded(); snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }), @r###" { "uid": "[uid]", @@ -404,7 +405,7 @@ async fn delete_document_by_complex_filter() { } "###); - let response = index.wait_task(response.uid()).await.succeeded(); + let response = server.wait_task(response.uid()).await.succeeded(); snapshot!(json_string!(response, { ".uid" => "[uid]", ".batchUid" => "[batch_uid]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => 
"[date]", ".duration" => "[duration]" }), @r###" { "uid": "[uid]", diff --git a/crates/meilisearch/tests/documents/get_documents.rs b/crates/meilisearch/tests/documents/get_documents.rs index 4f82faf99..63dc224c2 100644 --- a/crates/meilisearch/tests/documents/get_documents.rs +++ b/crates/meilisearch/tests/documents/get_documents.rs @@ -23,7 +23,7 @@ async fn error_get_unexisting_document() { let server = Server::new_shared(); let index = server.unique_index(); let (task, _code) = index.create(None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.get_document(1, None).await; @@ -43,7 +43,7 @@ async fn get_document() { let server = Server::new_shared(); let index = server.unique_index(); let (task, _code) = index.create(None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let documents = json!([ { "id": 0, @@ -52,7 +52,7 @@ async fn get_document() { ]); let (task, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.get_document(0, None).await; assert_eq!(code, 200); assert_eq!( @@ -276,7 +276,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { let server = Server::new_shared(); let index = server.unique_index(); let (task, _code) = index.create(None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let documents = json!([ { @@ -293,7 +293,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { ]); let (task, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.get_document(0, Some(json!({ "fields": ["content"] }))).await; assert_eq!(code, 200); @@ -369,7 +369,7 @@ async fn get_document_by_filter() { Some("id"), ) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.fetch_documents(json!({})).await; let (response2, code2) = index.get_all_documents_raw("").await; @@ -525,7 +525,7 @@ async fn get_document_by_ids() { Some("id"), ) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index .fetch_documents(json!({ @@ -651,7 +651,7 @@ async fn get_document_invalid_ids() { Some("id"), ) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.fetch_documents(json!({"ids": ["0", "illegal/docid"] })).await; let (response2, code2) = index.get_all_documents_raw("?ids=0,illegal/docid").await; @@ -683,7 +683,7 @@ async fn get_document_not_found_ids() { Some("id"), ) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.fetch_documents(json!({"ids": ["0", 3, 42] })).await; let (response2, code2) = index.get_all_documents_raw("?ids=0,3,42").await; @@ -726,7 +726,7 @@ async fn get_document_by_ids_and_filter() { Some("id"), ) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.fetch_documents(json!({"ids": [2], "filter": "color = blue" })).await; @@ -854,7 +854,7 @@ async fn 
get_document_with_vectors() { ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // by default you shouldn't see the `_vectors` object let (documents, _code) = index.get_all_documents(Default::default()).await; diff --git a/crates/meilisearch/tests/documents/update_documents.rs b/crates/meilisearch/tests/documents/update_documents.rs index 534be1fe6..b74d91506 100644 --- a/crates/meilisearch/tests/documents/update_documents.rs +++ b/crates/meilisearch/tests/documents/update_documents.rs @@ -34,7 +34,7 @@ async fn document_update_with_primary_key() { let (response, code) = index.update_documents(documents, Some("primary")).await; assert_eq!(code, 202); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.get_task(response.uid()).await; assert_eq!(code, 200); @@ -63,7 +63,7 @@ async fn update_document() { let (response, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let documents = json!([ { @@ -75,7 +75,7 @@ async fn update_document() { let (response, code) = index.update_documents(documents, None).await; assert_eq!(code, 202, "response: {}", response); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.get_task(response.uid()).await; assert_eq!(code, 200); @@ -107,7 +107,7 @@ async fn update_document_gzip_encoded() { let (response, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let documents = json!([ { @@ -119,7 +119,7 @@ async fn update_document_gzip_encoded() { let (response, code) = index.update_documents(documents, None).await; assert_eq!(code, 202, "response: {}", response); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.get_task(response.uid()).await; assert_eq!(code, 200); @@ -142,7 +142,7 @@ async fn update_larger_dataset() { let index = server.unique_index(); let documents = serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(); let (task, _code) = index.update_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.get_task(task.uid()).await; assert_eq!(code, 200); assert_eq!(response["type"], "documentAdditionOrUpdate"); @@ -166,7 +166,7 @@ async fn error_update_documents_bad_document_id() { } ]); let (task, _code) = index.update_documents(documents, None).await; - let response = index.wait_task(task.uid()).await; + let response = server.wait_task(task.uid()).await; assert_eq!(response["status"], json!("failed")); assert_eq!( response["error"]["message"], @@ -194,7 +194,7 @@ async fn error_update_documents_missing_document_id() { } ]); let (task, _code) = index.update_documents(documents, None).await; - let response = index.wait_task(task.uid()).await; + let response = server.wait_task(task.uid()).await; assert_eq!(response["status"], "failed"); assert_eq!( response["error"]["message"], @@ -219,7 +219,7 @@ async fn update_faceted_document() { })) .await; assert_eq!("202", code.as_str(), 
"{:?}", response); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let documents: Vec<_> = (0..1000) .map(|id| { @@ -233,7 +233,7 @@ async fn update_faceted_document() { let (response, code) = index.add_documents(documents.into(), None).await; assert_eq!(code, 202); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let documents = json!([ { @@ -245,7 +245,7 @@ async fn update_faceted_document() { let (response, code) = index.update_documents(documents, None).await; assert_eq!(code, 202, "response: {}", response); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); index .search(json!({"limit": 10}), |response, code| { From a237c0797a075fbd750287da9f7f373d4eec42e1 Mon Sep 17 00:00:00 2001 From: Martin Tzvetanov Grigorov Date: Sun, 22 Jun 2025 14:32:45 +0300 Subject: [PATCH 005/150] tests: Use Server::wait_task() instead of Index::wait_task() in settings:: The code is mostly duplicated. Server::wait_task() has better handling for errors and more retries. Signed-off-by: Martin Tzvetanov Grigorov --- crates/meilisearch/tests/settings/distinct.rs | 8 ++--- .../tests/settings/get_settings.rs | 30 +++++++++---------- .../tests/settings/proximity_settings.rs | 18 +++++------ .../tests/settings/tokenizer_customization.rs | 14 ++++----- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/crates/meilisearch/tests/settings/distinct.rs b/crates/meilisearch/tests/settings/distinct.rs index a3b1b5276..a704ab3da 100644 --- a/crates/meilisearch/tests/settings/distinct.rs +++ b/crates/meilisearch/tests/settings/distinct.rs @@ -7,7 +7,7 @@ async fn set_and_reset_distinct_attribute() { let index = server.unique_index(); let (task1, _code) = index.update_settings(json!({ "distinctAttribute": "test"})).await; - index.wait_task(task1.uid()).await.succeeded(); + server.wait_task(task1.uid()).await.succeeded(); let (response, _) = index.settings().await; @@ -15,7 +15,7 @@ async fn set_and_reset_distinct_attribute() { let (task2, _status_code) = index.update_settings(json!({ "distinctAttribute": null })).await; - index.wait_task(task2.uid()).await.succeeded(); + server.wait_task(task2.uid()).await.succeeded(); let (response, _) = index.settings().await; @@ -28,7 +28,7 @@ async fn set_and_reset_distinct_attribute_with_dedicated_route() { let index = server.unique_index(); let (update_task1, _code) = index.update_distinct_attribute(json!("test")).await; - index.wait_task(update_task1.uid()).await.succeeded(); + server.wait_task(update_task1.uid()).await.succeeded(); let (response, _) = index.get_distinct_attribute().await; @@ -36,7 +36,7 @@ async fn set_and_reset_distinct_attribute_with_dedicated_route() { let (update_task2, _status_code) = index.update_distinct_attribute(json!(null)).await; - index.wait_task(update_task2.uid()).await.succeeded(); + server.wait_task(update_task2.uid()).await.succeeded(); let (response, _) = index.get_distinct_attribute().await; diff --git a/crates/meilisearch/tests/settings/get_settings.rs b/crates/meilisearch/tests/settings/get_settings.rs index 941533dda..cdb803e8b 100644 --- a/crates/meilisearch/tests/settings/get_settings.rs +++ b/crates/meilisearch/tests/settings/get_settings.rs @@ -58,7 +58,7 @@ macro_rules! 
test_setting_routes { let index = server.unique_index(); let (response, code) = index.create(None).await; assert_eq!(code, 202, "{response}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let url = format!("/indexes/{}/settings/{}", index.uid, stringify!($setting) @@ -209,7 +209,7 @@ async fn get_settings() { let server = Server::new_shared(); let index = server.unique_index(); let (response, _code) = index.create(None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200); let settings = response.as_object().unwrap(); @@ -254,7 +254,7 @@ async fn secrets_are_hidden_in_settings() { let server = Server::new_shared(); let index = server.unique_index(); let (response, _code) = index.create(None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index .update_settings(json!({ @@ -285,7 +285,7 @@ async fn secrets_are_hidden_in_settings() { let settings_update_uid = response.uid(); - index.wait_task(settings_update_uid).await.succeeded(); + server.wait_task(settings_update_uid).await.succeeded(); let (response, code) = index.settings().await; meili_snap::snapshot!(code, @"200 OK"); @@ -384,14 +384,14 @@ async fn test_partial_update() { let server = Server::new_shared(); let index = server.unique_index(); let (task, _code) = index.update_settings(json!({"displayedAttributes": ["foo"]})).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200); assert_eq!(response["displayedAttributes"], json!(["foo"])); assert_eq!(response["searchableAttributes"], json!(["*"])); let (task, _) = index.update_settings(json!({"searchableAttributes": ["bar"]})).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200); @@ -406,7 +406,7 @@ async fn error_delete_settings_unexisting_index() { let (task, code) = index.delete_settings().await; assert_eq!(code, 202); - index.wait_task(task.uid()).await.failed(); + server.wait_task(task.uid()).await.failed(); } #[actix_rt::test] @@ -424,12 +424,12 @@ async fn reset_all_settings() { let (response, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (update_task,_status_code) = index .update_settings(json!({"displayedAttributes": ["name", "age"], "searchableAttributes": ["name"], "stopWords": ["the"], "filterableAttributes": ["age"], "synonyms": {"puppy": ["dog", "doggo", "potat"] }})) .await; - index.wait_task(update_task.uid()).await.succeeded(); + server.wait_task(update_task.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200); assert_eq!(response["displayedAttributes"], json!(["name", "age"])); @@ -439,7 +439,7 @@ async fn reset_all_settings() { assert_eq!(response["filterableAttributes"], json!(["age"])); let (delete_task, _status_code) = index.delete_settings().await; - index.wait_task(delete_task.uid()).await.succeeded(); + server.wait_task(delete_task.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200); @@ -460,11 +460,11 @@ async fn 
update_setting_unexisting_index() { let index = server.unique_index(); let (task, code) = index.update_settings(json!({})).await; assert_eq!(code, 202); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (_response, code) = index.get().await; assert_eq!(code, 200); let (task, _status_code) = index.delete_settings().await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); } #[actix_rt::test] @@ -507,7 +507,7 @@ async fn set_and_reset_distinct_attribute_with_dedicated_route() { let index = server.unique_index(); let (task, _code) = index.update_distinct_attribute(json!("test")).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, _) = index.get_distinct_attribute().await; @@ -515,7 +515,7 @@ async fn set_and_reset_distinct_attribute_with_dedicated_route() { let (task, _status_code) = index.update_distinct_attribute(json!(null)).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, _) = index.get_distinct_attribute().await; @@ -540,7 +540,7 @@ async fn granular_filterable_attributes() { { "attributePatterns": ["default-facet-search"], "features": { "filter": {"equality": true, "comparison": true} } }, ] })).await; assert_eq!(code, 202); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200, "{response}"); diff --git a/crates/meilisearch/tests/settings/proximity_settings.rs b/crates/meilisearch/tests/settings/proximity_settings.rs index 6de1ffe0e..555c13b58 100644 --- a/crates/meilisearch/tests/settings/proximity_settings.rs +++ b/crates/meilisearch/tests/settings/proximity_settings.rs @@ -30,7 +30,7 @@ async fn attribute_scale_search() { let index = server.unique_index(); let (task, _status_code) = index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index .update_settings(json!({ @@ -39,7 +39,7 @@ async fn attribute_scale_search() { })) .await; assert_eq!("202", code.as_str(), "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); // the expected order is [1, 3, 2] instead of [3, 1, 2] // because the attribute scale doesn't make the difference between 1 and 3. @@ -103,7 +103,7 @@ async fn attribute_scale_phrase_search() { let index = server.unique_index(); let (task, _status_code) = index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, _code) = index .update_settings(json!({ @@ -111,7 +111,7 @@ async fn attribute_scale_phrase_search() { "rankingRules": ["words", "typo", "proximity"], })) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // the expected order is [1, 3] instead of [3, 1] // because the attribute scale doesn't make the difference between 1 and 3. 
@@ -171,7 +171,7 @@ async fn word_scale_set_and_reset() { let index = server.unique_index(); let (task, _status_code) = index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // Set and reset the setting ensuring the swap between the 2 settings is applied. let (update_task1, _code) = index @@ -180,7 +180,7 @@ async fn word_scale_set_and_reset() { "rankingRules": ["words", "typo", "proximity"], })) .await; - index.wait_task(update_task1.uid()).await.succeeded(); + server.wait_task(update_task1.uid()).await.succeeded(); let (update_task2, _code) = index .update_settings(json!({ @@ -188,7 +188,7 @@ async fn word_scale_set_and_reset() { "rankingRules": ["words", "typo", "proximity"], })) .await; - index.wait_task(update_task2.uid()).await.succeeded(); + server.wait_task(update_task2.uid()).await.succeeded(); // [3, 1, 2] index @@ -286,7 +286,7 @@ async fn attribute_scale_default_ranking_rules() { let index = server.unique_index(); let (task, _status_code) = index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index .update_settings(json!({ @@ -294,7 +294,7 @@ async fn attribute_scale_default_ranking_rules() { })) .await; assert_eq!("202", code.as_str(), "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); // the expected order is [3, 1, 2] index diff --git a/crates/meilisearch/tests/settings/tokenizer_customization.rs b/crates/meilisearch/tests/settings/tokenizer_customization.rs index 7c58368f7..a0631418f 100644 --- a/crates/meilisearch/tests/settings/tokenizer_customization.rs +++ b/crates/meilisearch/tests/settings/tokenizer_customization.rs @@ -15,7 +15,7 @@ async fn set_and_reset() { "dictionary": ["J.R.R.", "J. R. 
R."], })) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, _) = index.settings().await; snapshot!(json_string!(response["nonSeparatorTokens"]), @r###" @@ -45,7 +45,7 @@ async fn set_and_reset() { })) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, _) = index.settings().await; snapshot!(json_string!(response["nonSeparatorTokens"]), @"[]"); @@ -74,7 +74,7 @@ async fn set_and_search() { let index = server.unique_index(); let (add_task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(add_task.uid()).await.succeeded(); + server.wait_task(add_task.uid()).await.succeeded(); let (update_task, _code) = index .update_settings(json!({ @@ -83,7 +83,7 @@ async fn set_and_search() { "dictionary": ["#", "A#", "B#", "C#", "D#", "E#", "F#", "G#"], })) .await; - index.wait_task(update_task.uid()).await.succeeded(); + server.wait_task(update_task.uid()).await.succeeded(); index .search(json!({"q": "&", "attributesToHighlight": ["content"]}), |response, code| { @@ -228,7 +228,7 @@ async fn advanced_synergies() { let index = server.unique_index(); let (add_task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(add_task.uid()).await.succeeded(); + server.wait_task(add_task.uid()).await.succeeded(); let (update_task, _code) = index .update_settings(json!({ @@ -243,7 +243,7 @@ async fn advanced_synergies() { } })) .await; - index.wait_task(update_task.uid()).await.succeeded(); + server.wait_task(update_task.uid()).await.succeeded(); index .search(json!({"q": "J.R.R.", "attributesToHighlight": ["content"]}), |response, code| { @@ -353,7 +353,7 @@ async fn advanced_synergies() { "dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."], })) .await; - index.wait_task(_response.uid()).await.succeeded(); + server.wait_task(_response.uid()).await.succeeded(); index .search(json!({"q": "jk", "attributesToHighlight": ["content"]}), |response, code| { From 855fa555a3973d0521cb896d274b534150d68a8a Mon Sep 17 00:00:00 2001 From: Martin Tzvetanov Grigorov Date: Sun, 22 Jun 2025 14:37:54 +0300 Subject: [PATCH 006/150] tests: Use Server::wait_task() instead of Index::wait_task() in search:: The code is mostly duplicated. Server::wait_task() has better handling for errors and more retries. 
Signed-off-by: Martin Tzvetanov Grigorov --- crates/meilisearch/tests/search/distinct.rs | 6 +- crates/meilisearch/tests/search/errors.rs | 24 +++--- .../meilisearch/tests/search/facet_search.rs | 54 ++++++------- crates/meilisearch/tests/search/filters.rs | 12 +-- crates/meilisearch/tests/search/formatted.rs | 8 +- crates/meilisearch/tests/search/hybrid.rs | 14 ++-- crates/meilisearch/tests/search/locales.rs | 32 ++++---- .../tests/search/matching_strategy.rs | 2 +- crates/meilisearch/tests/search/mod.rs | 66 +++++++-------- crates/meilisearch/tests/search/multi/mod.rs | 80 +++++++++---------- crates/meilisearch/tests/search/pagination.rs | 4 +- .../tests/search/restrict_searchable.rs | 18 ++--- 12 files changed, 160 insertions(+), 160 deletions(-) diff --git a/crates/meilisearch/tests/search/distinct.rs b/crates/meilisearch/tests/search/distinct.rs index bdc5875e0..33a4c5453 100644 --- a/crates/meilisearch/tests/search/distinct.rs +++ b/crates/meilisearch/tests/search/distinct.rs @@ -152,7 +152,7 @@ async fn distinct_search_with_offset_no_ranking() { let documents = DOCUMENTS.clone(); index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await; let (task, _status_code) = index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); fn get_hits(response: &Value) -> Vec<&str> { let hits_array = response["hits"].as_array().unwrap(); @@ -211,7 +211,7 @@ async fn distinct_search_with_pagination_no_ranking() { let documents = DOCUMENTS.clone(); index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await; let (task, _status_code) = index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); fn get_hits(response: &Value) -> Vec<&str> { let hits_array = response["hits"].as_array().unwrap(); @@ -281,7 +281,7 @@ async fn distinct_at_search_time() { let documents = NESTED_DOCUMENTS.clone(); index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await; let (task, _) = index.update_settings_filterable_attributes(json!(["color.main"])).await; - let task = index.wait_task(task.uid()).await.succeeded(); + let task = server.wait_task(task.uid()).await.succeeded(); snapshot!(task, name: "succeed"); fn get_hits(response: &Value) -> Vec { diff --git a/crates/meilisearch/tests/search/errors.rs b/crates/meilisearch/tests/search/errors.rs index eca4a8cfb..363ece067 100644 --- a/crates/meilisearch/tests/search/errors.rs +++ b/crates/meilisearch/tests/search/errors.rs @@ -425,7 +425,7 @@ async fn search_non_filterable_facets() { let index = server.unique_index(); let (response, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; // Wait for the settings update to complete - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"facets": ["doggo"]})).await; snapshot!(code, @"400 Bad Request"); @@ -456,7 +456,7 @@ async fn search_non_filterable_facets_multiple_filterable() { let index = server.unique_index(); let (response, _code) = index.update_settings(json!({"filterableAttributes": ["title", "genres"]})).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"facets": ["doggo"]})).await; snapshot!(code, @"400 Bad Request"); @@ -486,7 +486,7 @@ async fn 
search_non_filterable_facets_no_filterable() { let server = Server::new_shared(); let index = server.unique_index(); let (response, _code) = index.update_settings(json!({"filterableAttributes": []})).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"facets": ["doggo"]})).await; snapshot!(code, @"400 Bad Request"); @@ -517,7 +517,7 @@ async fn search_non_filterable_facets_multiple_facets() { let index = server.unique_index(); let (response, _uid) = index.update_settings(json!({"filterableAttributes": ["title", "genres"]})).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"facets": ["doggo", "neko"]})).await; snapshot!(code, @"400 Bad Request"); @@ -1001,7 +1001,7 @@ async fn sort_geo_reserved_attribute() { let index = server.unique_index(); let (task, _code) = index.update_settings(json!({"sortableAttributes": ["id"]})).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let expected_response = json!({ "message": "`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.", @@ -1028,7 +1028,7 @@ async fn sort_reserved_attribute() { let index = server.unique_index(); let (task, _code) = index.update_settings(json!({"sortableAttributes": ["id"]})).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let expected_response = json!({ "message": "`_geoDistance` is a reserved keyword and thus can't be used as a sort expression.", @@ -1054,7 +1054,7 @@ async fn sort_unsortable_attribute() { let server = Server::new_shared(); let index = server.unique_index(); let (response, _code) = index.update_settings(json!({"sortableAttributes": ["id"]})).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let expected_response = json!({ "message": format!("Index `{}`: Attribute `title` is not sortable. 
Available sortable attributes are: `id`.", index.uid), @@ -1081,7 +1081,7 @@ async fn sort_invalid_syntax() { let index = server.unique_index(); let (response, _code) = index.update_settings(json!({"sortableAttributes": ["id"]})).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let expected_response = json!({ "message": "Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `title`.", @@ -1112,7 +1112,7 @@ async fn sort_unset_ranking_rule() { json!({"sortableAttributes": ["title"], "rankingRules": ["proximity", "exactness"]}), ) .await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let expected_response = json!({ "message": format!("Index `{}`: You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.", index.uid), @@ -1199,7 +1199,7 @@ async fn distinct_at_search_time() { let index = server.unique_index(); let (response, _code) = index.add_documents(json!([{"id": 1, "color": "Doggo", "machin": "Action"}]), None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; @@ -1214,7 +1214,7 @@ async fn distinct_at_search_time() { "###); let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; @@ -1229,7 +1229,7 @@ async fn distinct_at_search_time() { "###); let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; diff --git a/crates/meilisearch/tests/search/facet_search.rs b/crates/meilisearch/tests/search/facet_search.rs index 57d2cfcd2..da713fc22 100644 --- a/crates/meilisearch/tests/search/facet_search.rs +++ b/crates/meilisearch/tests/search/facet_search.rs @@ -50,11 +50,11 @@ async fn test_settings_documents_indexing_swapping_and_facet_search( let (task, code) = index.add_documents(documents.clone(), None).await; assert_eq!(code, 202, "{}", task); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, code) = index.update_settings(settings.clone()).await; assert_eq!(code, 202, "{}", task); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(query.clone()).await; insta::allow_duplicates! { @@ -70,11 +70,11 @@ async fn test_settings_documents_indexing_swapping_and_facet_search( let (task, code) = index.update_settings(settings.clone()).await; assert_eq!(code, 202, "{}", task); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, code) = index.add_documents(documents.clone(), None).await; assert_eq!(code, 202, "{}", task); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(query.clone()).await; insta::allow_duplicates! 
{ @@ -94,7 +94,7 @@ async fn simple_facet_search() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -207,10 +207,10 @@ async fn simple_facet_search_on_movies() { let (response, code) = index.update_settings_filterable_attributes(json!(["genres", "color"])).await; assert_eq!(202, code, "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, _code) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetQuery": "", "facetName": "genres", "q": "" })).await; @@ -228,7 +228,7 @@ async fn advanced_facet_search() { index.update_settings_filterable_attributes(json!(["genres"])).await; index.update_settings_typo_tolerance(json!({ "enabled": false })).await; let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "adventre"})).await; @@ -252,7 +252,7 @@ async fn more_advanced_facet_search() { index.update_settings_filterable_attributes(json!(["genres"])).await; index.update_settings_typo_tolerance(json!({ "disableOnWords": ["adventre"] })).await; let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "adventre"})).await; @@ -276,7 +276,7 @@ async fn simple_facet_search_with_max_values() { index.update_settings_faceting(json!({ "maxValuesPerFacet": 1 })).await; index.update_settings_filterable_attributes(json!(["genres"])).await; let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -298,7 +298,7 @@ async fn simple_facet_search_by_count_with_max_values() { .await; index.update_settings_filterable_attributes(json!(["genres"])).await; let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -314,7 +314,7 @@ async fn non_filterable_facet_search_error() { let documents = DOCUMENTS.clone(); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -333,7 +333,7 @@ async fn facet_search_dont_support_words() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; let (task, _status_code) = index.add_documents(documents, None).await; - 
index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "words"})).await; @@ -351,7 +351,7 @@ async fn simple_facet_search_with_sort_by_count() { index.update_settings_faceting(json!({ "sortFacetValuesBy": { "*": "count" } })).await; index.update_settings_filterable_attributes(json!(["genres"])).await; let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -370,7 +370,7 @@ async fn add_documents_and_deactivate_facet_search() { let documents = DOCUMENTS.clone(); let (response, _code) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index .update_settings(json!({ "facetSearch": false, @@ -378,7 +378,7 @@ async fn add_documents_and_deactivate_facet_search() { })) .await; assert_eq!("202", code.as_str(), "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -406,10 +406,10 @@ async fn deactivate_facet_search_and_add_documents() { })) .await; assert_eq!("202", code.as_str(), "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let documents = DOCUMENTS.clone(); let (response, _code) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -437,10 +437,10 @@ async fn deactivate_facet_search_add_documents_and_activate_facet_search() { })) .await; assert_eq!("202", code.as_str(), "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let documents = DOCUMENTS.clone(); let (response, _code) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index .update_settings(json!({ @@ -448,7 +448,7 @@ async fn deactivate_facet_search_add_documents_and_activate_facet_search() { })) .await; assert_eq!("202", code.as_str(), "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -469,10 +469,10 @@ async fn deactivate_facet_search_add_documents_and_reset_facet_search() { })) .await; assert_eq!("202", code.as_str(), "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let documents = DOCUMENTS.clone(); let (response, _code) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index .update_settings(json!({ @@ -480,7 +480,7 @@ async fn deactivate_facet_search_add_documents_and_reset_facet_search() { })) .await; assert_eq!("202", code.as_str(), "{response:?}"); - 
index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -920,13 +920,13 @@ async fn distinct_facet_search_on_movies() { let (response, code) = index.update_settings_filterable_attributes(json!(["genres", "color"])).await; assert_eq!(202, code, "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.update_settings_distinct_attribute(json!("color")).await; assert_eq!(202, code, "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, _code) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetQuery": "blob", "facetName": "genres", "q": "" })).await; diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 564cae5a5..ffa025f5c 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -90,7 +90,7 @@ async fn search_with_contains_filter() { let documents = DOCUMENTS.clone(); let (request, _code) = index.add_documents(documents, None).await; - index.wait_task(request.uid()).await.succeeded(); + server.wait_task(request.uid()).await.succeeded(); let (response, code) = index .search_post(json!({ @@ -257,7 +257,7 @@ async fn search_with_pattern_filter_settings_scenario_1() { let (task, code) = index.add_documents(NESTED_DOCUMENTS.clone(), None).await; assert_eq!(code, 202, "{task}"); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, code) = index .update_settings(json!({"filterableAttributes": [{ @@ -269,7 +269,7 @@ async fn search_with_pattern_filter_settings_scenario_1() { }]})) .await; assert_eq!(code, 202, "{task}"); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // Check if the Equality filter works index @@ -334,7 +334,7 @@ async fn search_with_pattern_filter_settings_scenario_1() { }]})) .await; assert_eq!(code, 202, "{task}"); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // Check if the Equality filter works index @@ -445,7 +445,7 @@ async fn search_with_pattern_filter_settings_scenario_1() { }]})) .await; assert_eq!(code, 202, "{task}"); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // Check if the Equality filter returns an error index @@ -544,7 +544,7 @@ async fn search_with_pattern_filter_settings_scenario_1() { }]})) .await; assert_eq!(code, 202, "{task}"); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // Check if the Equality filter works index diff --git a/crates/meilisearch/tests/search/formatted.rs b/crates/meilisearch/tests/search/formatted.rs index 2b9383034..43a59e823 100644 --- a/crates/meilisearch/tests/search/formatted.rs +++ b/crates/meilisearch/tests/search/formatted.rs @@ -26,7 +26,7 @@ async fn search_formatted_from_sdk() { { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy" } ]); let (response, _) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await; + 
server.wait_task(response.uid()).await; index .search( @@ -65,7 +65,7 @@ async fn formatted_contain_wildcard() { let documents = NESTED_DOCUMENTS.clone(); let (response, _) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); index.search(json!({ "q": "pésti", "attributesToRetrieve": ["father", "mother"], "attributesToHighlight": ["father", "mother", "*"], "attributesToCrop": ["doggos"], "showMatchesPosition": true }), |response, code| @@ -398,7 +398,7 @@ async fn displayedattr_2_smol() { let documents = NESTED_DOCUMENTS.clone(); let (response, _) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); index .search(json!({ "attributesToRetrieve": ["father", "id"], "attributesToHighlight": ["mother"], "attributesToCrop": ["cattos"] }), @@ -596,7 +596,7 @@ async fn test_cjk_highlight() { { "id": 1, "title": "大卫到了扫罗那里" }, ]); let (response, _) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); index .search(json!({"q": "で", "attributesToHighlight": ["title"]}), |response, code| { diff --git a/crates/meilisearch/tests/search/hybrid.rs b/crates/meilisearch/tests/search/hybrid.rs index c6eb39a3a..be2a724b0 100644 --- a/crates/meilisearch/tests/search/hybrid.rs +++ b/crates/meilisearch/tests/search/hybrid.rs @@ -17,11 +17,11 @@ async fn index_with_documents_user_provided<'a>( "dimensions": 2}}} )) .await; assert_eq!(202, code, "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.add_documents(documents.clone(), None).await; assert_eq!(202, code, "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); index } @@ -37,11 +37,11 @@ async fn index_with_documents_hf<'a>(server: &'a Server, documents: &Val }}} )) .await; assert_eq!(202, code, "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.add_documents(documents.clone(), None).await; assert_eq!(202, code, "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); index } @@ -543,7 +543,7 @@ async fn distinct_is_applied() { let (response, code) = index.update_settings(json!({ "distinctAttribute": "distinct" } )).await; assert_eq!(202, code, "{:?}", response); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); // pure keyword let (response, code) = index @@ -633,7 +633,7 @@ async fn retrieve_vectors() { .update_settings(json!({ "displayedAttributes": ["id", "title", "desc", "_vectors"]} )) .await; assert_eq!(202, code, "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index .search_post( @@ -683,7 +683,7 @@ async fn retrieve_vectors() { let (response, code) = index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await; assert_eq!(202, code, "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index .search_post( diff --git 
a/crates/meilisearch/tests/search/locales.rs b/crates/meilisearch/tests/search/locales.rs index f45554d41..96c7fc7f5 100644 --- a/crates/meilisearch/tests/search/locales.rs +++ b/crates/meilisearch/tests/search/locales.rs @@ -99,7 +99,7 @@ async fn simple_search() { ) .await; let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // english index @@ -215,7 +215,7 @@ async fn force_locales() { } "###); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // chinese detection index @@ -293,7 +293,7 @@ async fn force_locales_with_pattern() { } "###); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // chinese detection index @@ -369,7 +369,7 @@ async fn force_locales_with_pattern_nested() { } "###); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // chinese index @@ -444,7 +444,7 @@ async fn force_different_locales_with_pattern() { } "###); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // force chinese index @@ -522,7 +522,7 @@ async fn auto_infer_locales_at_search_with_attributes_to_search_on() { } "###); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // auto infer any language index @@ -596,7 +596,7 @@ async fn auto_infer_locales_at_search() { } "###); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -695,7 +695,7 @@ async fn force_different_locales_with_pattern_nested() { } "###); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // chinese index @@ -773,7 +773,7 @@ async fn settings_change() { let documents = NESTED_DOCUMENTS.clone(); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, _) = index .update_settings(json!({ "searchableAttributes": ["document_en", "document_ja", "document_zh"], @@ -792,7 +792,7 @@ async fn settings_change() { "enqueuedAt": "[date]" } "###); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); // chinese index @@ -855,7 +855,7 @@ async fn settings_change() { "enqueuedAt": "[date]" } "###); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); // chinese index @@ -910,7 +910,7 @@ async fn invalid_locales() { ) .await; let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"q": "Atta", "locales": ["invalid"]})).await; snapshot!(code, @"400 Bad Request"); @@ -1028,7 +1028,7 @@ async fn 
simple_facet_search() { } "###); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, _) = index .facet_search(json!({"facetName": "name_zh", "facetQuery": "進撃", "locales": ["cmn"]})) @@ -1090,7 +1090,7 @@ async fn facet_search_with_localized_attributes() { } "###); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, _) = index .facet_search(json!({"facetName": "name_zh", "facetQuery": "进击", "locales": ["cmn"]})) @@ -1159,7 +1159,7 @@ async fn swedish_search() { ] })) .await; - index.wait_task(_response.uid()).await.succeeded(); + server.wait_task(_response.uid()).await.succeeded(); // infer swedish index @@ -1280,7 +1280,7 @@ async fn german_search() { ] })) .await; - index.wait_task(_response.uid()).await.succeeded(); + server.wait_task(_response.uid()).await.succeeded(); // infer swedish index diff --git a/crates/meilisearch/tests/search/matching_strategy.rs b/crates/meilisearch/tests/search/matching_strategy.rs index ece320b2a..10b93be76 100644 --- a/crates/meilisearch/tests/search/matching_strategy.rs +++ b/crates/meilisearch/tests/search/matching_strategy.rs @@ -9,7 +9,7 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) let index = server.unique_index(); let (task, _status_code) = index.add_documents(documents.clone(), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index } diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index f547ce281..3f70e1ba9 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -38,11 +38,11 @@ async fn test_settings_documents_indexing_swapping_and_search( let (task, code) = index.add_documents(documents.clone(), None).await; assert_eq!(code, 202, "{task}"); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, code) = index.update_settings(settings.clone()).await; assert_eq!(code, 202, "{task}"); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index.search(query.clone(), test.clone()).await; @@ -51,11 +51,11 @@ async fn test_settings_documents_indexing_swapping_and_search( let (task, code) = index.update_settings(settings.clone()).await; assert_eq!(code, 202, "{task}"); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, code) = index.add_documents(documents.clone(), None).await; assert_eq!(code, 202, "{task}"); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index.search(query.clone(), test.clone()).await; } @@ -104,7 +104,7 @@ async fn bug_5547() { let server = Server::new_shared(); let index = server.unique_index(); let (response, _code) = index.create(None).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let mut documents = Vec::new(); for i in 0..65_535 { @@ -112,7 +112,7 @@ async fn bug_5547() { } let (response, _code) = index.add_documents(json!(documents), Some("id")).await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"q": 
"title"})).await; assert_eq!(code, 200); snapshot!(response["hits"], @r###"[{"id":0,"title":"title0"},{"id":1,"title":"title1"},{"id":10,"title":"title10"},{"id":100,"title":"title100"},{"id":101,"title":"title101"},{"id":102,"title":"title102"},{"id":103,"title":"title103"},{"id":104,"title":"title104"},{"id":105,"title":"title105"},{"id":106,"title":"title106"},{"id":107,"title":"title107"},{"id":108,"title":"title108"},{"id":1000,"title":"title1000"},{"id":1001,"title":"title1001"},{"id":1002,"title":"title1002"},{"id":1003,"title":"title1003"},{"id":1004,"title":"title1004"},{"id":1005,"title":"title1005"},{"id":1006,"title":"title1006"},{"id":1007,"title":"title1007"}]"###); @@ -131,7 +131,7 @@ async fn search_with_stop_word() { let documents = DOCUMENTS.clone(); let (task, _code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // prefix search index @@ -196,7 +196,7 @@ async fn search_with_typo_settings() { let documents = DOCUMENTS.clone(); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "287947" }), |response, code| { @@ -228,7 +228,7 @@ async fn phrase_search_with_stop_word() { let documents = DOCUMENTS.clone(); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "how \"to\" train \"the" }), |response, code| { @@ -308,11 +308,11 @@ async fn negative_special_cases_search() { let documents = DOCUMENTS.clone(); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, _status_code) = index.update_settings(json!({"synonyms": { "escape": ["gläss"] }})).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // There is a synonym for escape -> glass but we don't want "escape", only the derivates: glass index @@ -338,7 +338,7 @@ async fn test_kanji_language_detection() { { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" 
} ]); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "東京"}), |response, code| { @@ -361,10 +361,10 @@ async fn test_thai_language() { { "id": 2, "title": "สบู่สมุนไพรฝางแดงผสมว่านหางจรเข้ 100 กรัม จำนวน 6 ก้อน" } ]); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, _status_code) = index.update_settings(json!({"rankingRules": ["exactness"]})).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "สบู"}), |response, code| { @@ -586,7 +586,7 @@ async fn displayed_attributes() { let documents = DOCUMENTS.clone(); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.search_post(json!({ "attributesToRetrieve": ["title", "id"] })).await; @@ -601,7 +601,7 @@ async fn placeholder_search_is_hard_limited() { let documents: Vec<_> = (0..1200).map(|i| json!({ "id": i, "text": "I am unique!" })).collect(); let (task, _status_code) = index.add_documents(documents.into(), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -630,7 +630,7 @@ async fn placeholder_search_is_hard_limited() { let (task, _status_code) = index.update_settings(json!({ "pagination": { "maxTotalHits": 10_000 } })).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -665,7 +665,7 @@ async fn search_is_hard_limited() { let documents: Vec<_> = (0..1200).map(|i| json!({ "id": i, "text": "I am unique!" })).collect(); let (task, _status_code) = index.add_documents(documents.into(), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -696,7 +696,7 @@ async fn search_is_hard_limited() { let (task, _status_code) = index.update_settings(json!({ "pagination": { "maxTotalHits": 10_000 } })).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -735,7 +735,7 @@ async fn faceting_max_values_per_facet() { let documents: Vec<_> = (0..10_000).map(|id| json!({ "id": id, "number": id * 10 })).collect(); let (task, _status_code) = index.add_documents(json!(documents), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -752,7 +752,7 @@ async fn faceting_max_values_per_facet() { let (task, _status_code) = index.update_settings(json!({ "faceting": { "maxValuesPerFacet": 10_000 } })).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -1033,7 +1033,7 @@ async fn test_degraded_score_details() { index.add_documents(json!(documents), None).await; // We can't really use anything else than 0ms here; otherwise, the test will get flaky. 
let (res, _code) = index.update_settings(json!({ "searchCutoffMs": 0 })).await; - index.wait_task(res.uid()).await.succeeded(); + server.wait_task(res.uid()).await.succeeded(); index .search( @@ -1126,7 +1126,7 @@ async fn camelcased_words() { { "id": 4, "title": "testab" }, ]); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "deLonghi"}), |response, code| { @@ -1345,12 +1345,12 @@ async fn simple_search_with_strange_synonyms() { let (task, _status_code) = index.update_settings(json!({ "synonyms": {"&": ["to"], "to": ["&"]} })).await; - let r = index.wait_task(task.uid()).await.succeeded(); + let r = server.wait_task(task.uid()).await.succeeded(); snapshot!(r["status"], @r###""succeeded""###); let documents = DOCUMENTS.clone(); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "How to train"}), |response, code| { @@ -1416,11 +1416,11 @@ async fn change_attributes_settings() { let documents = NESTED_DOCUMENTS.clone(); let (task, _status_code) = index.add_documents(json!(documents), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task,_status_code) = index.update_settings(json!({ "searchableAttributes": ["father", "mother", "doggos"], "filterableAttributes": ["doggos"] })).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // search index @@ -1923,7 +1923,7 @@ async fn change_facet_casing() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, _code) = index .add_documents( @@ -1936,7 +1936,7 @@ async fn change_facet_casing() { None, ) .await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, _code) = index .add_documents( @@ -1949,7 +1949,7 @@ async fn change_facet_casing() { None, ) .await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); index .search(json!({ "facets": ["dog"] }), |response, code| { @@ -2062,7 +2062,7 @@ async fn simple_search_changing_unrelated_settings() { let documents = DOCUMENTS.clone(); let (task, _status_code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "Dragon"}), |response, code| { @@ -2084,7 +2084,7 @@ async fn simple_search_changing_unrelated_settings() { let (task, _status_code) = index.update_settings(json!({ "filterableAttributes": ["title"] })).await; - let r = index.wait_task(task.uid()).await.succeeded(); + let r = server.wait_task(task.uid()).await.succeeded(); snapshot!(r["status"], @r###""succeeded""###); index @@ -2106,7 +2106,7 @@ async fn simple_search_changing_unrelated_settings() { .await; let (task, _status_code) = index.update_settings(json!({ "filterableAttributes": [] })).await; - let r = index.wait_task(task.uid()).await.succeeded(); + let r = server.wait_task(task.uid()).await.succeeded(); snapshot!(r["status"], @r###""succeeded""###); index diff --git a/crates/meilisearch/tests/search/multi/mod.rs b/crates/meilisearch/tests/search/multi/mod.rs 
index cf98baa10..b9eed56da 100644 --- a/crates/meilisearch/tests/search/multi/mod.rs +++ b/crates/meilisearch/tests/search/multi/mod.rs @@ -21,7 +21,7 @@ pub async fn shared_movies_index() -> &'static Index<'static, Shared> { let documents = DOCUMENTS.clone(); let (response, _code) = movies_index.add_documents(documents, None).await; - movies_index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (value, _) = movies_index .update_settings(json!({ @@ -37,7 +37,7 @@ pub async fn shared_movies_index() -> &'static Index<'static, Shared> { ] })) .await; - movies_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); movies_index.to_shared() }) .await @@ -52,7 +52,7 @@ pub async fn shared_batman_index() -> &'static Index<'static, Shared> { let documents = SCORE_DOCUMENTS.clone(); let (response, _code) = batman_index.add_documents(documents, None).await; - batman_index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (value, _) = batman_index .update_settings(json!({ @@ -68,7 +68,7 @@ pub async fn shared_batman_index() -> &'static Index<'static, Shared> { ] })) .await; - batman_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); batman_index.to_shared() }) .await @@ -1085,14 +1085,14 @@ async fn federation_filter() { let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings( json!({"searchableAttributes": ["name"], "filterableAttributes": ["BOOST"]}), ) .await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -1152,7 +1152,7 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { let documents = NESTED_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -1167,7 +1167,7 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { ] })) .await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // two identical placeholder searches should have all results from the first query let (response, code) = server @@ -1365,7 +1365,7 @@ async fn federation_sort_same_indexes_same_criterion_opposite_direction() { let documents = NESTED_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -1380,7 +1380,7 @@ async fn federation_sort_same_indexes_same_criterion_opposite_direction() { ] })) .await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // two identical placeholder searches should have all results from the first query let (response, code) = server @@ -1424,7 +1424,7 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { let documents = NESTED_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await.succeeded(); + 
server.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -1439,7 +1439,7 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { ] })) .await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // return mothers and fathers ordered across fields. let (response, code) = server @@ -1638,7 +1638,7 @@ async fn federation_sort_same_indexes_different_criterion_opposite_direction() { let documents = NESTED_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -1653,7 +1653,7 @@ async fn federation_sort_same_indexes_different_criterion_opposite_direction() { ] })) .await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // two identical placeholder searches should have all results from the first query let (response, code) = server @@ -3048,14 +3048,14 @@ async fn federation_invalid_weight() { let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings( json!({"searchableAttributes": ["name"], "filterableAttributes": ["BOOST"]}), ) .await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -3082,14 +3082,14 @@ async fn federation_null_weight() { let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings( json!({"searchableAttributes": ["name"], "filterableAttributes": ["BOOST"]}), ) .await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -3150,7 +3150,7 @@ async fn federation_federated_contains_pagination() { let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // fail when a federated query contains "limit" let (response, code) = server @@ -3230,11 +3230,11 @@ async fn federation_federated_contains_facets() { ) .await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // empty facets are actually OK let (response, code) = server @@ -3314,7 +3314,7 @@ async fn federation_non_faceted_for_an_index() { ) .await; - fruits_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let fruits_no_name_index = server.unique_index_with_prefix("fruits-no-name"); @@ -3324,18 +3324,18 @@ async fn federation_non_faceted_for_an_index() { ) .await; - fruits_no_name_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let fruits_no_facets_index = 
server.unique_index_with_prefix("fruits-no-facets"); let (value, _) = fruits_no_facets_index.update_settings(json!({"searchableAttributes": ["name"]})).await; - fruits_no_facets_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = fruits_no_facets_index.add_documents(documents, None).await; - fruits_no_facets_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // fails let (response, code) = server @@ -3435,7 +3435,7 @@ async fn federation_non_federated_contains_federation_option() { let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // fail when a non-federated query contains "federationOptions" let (response, code) = server @@ -3473,12 +3473,12 @@ async fn federation_vector_single_index() { } }})) .await; - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let documents = VECTOR_DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // same embedder let (response, code) = server @@ -3670,12 +3670,12 @@ async fn federation_vector_two_indexes() { }, }})) .await; - vectors_animal_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let documents = VECTOR_DOCUMENTS.clone(); let (value, code) = vectors_animal_index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - vectors_animal_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let vectors_sentiment_index = server.unique_index_with_prefix("vectors-sentiment"); @@ -3687,12 +3687,12 @@ async fn federation_vector_two_indexes() { } }})) .await; - vectors_sentiment_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let documents = VECTOR_DOCUMENTS.clone(); let (value, code) = vectors_sentiment_index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - vectors_sentiment_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -4154,7 +4154,7 @@ async fn federation_facets_different_indexes_same_facet() { let documents = SCORE_DOCUMENTS.clone(); let (value, _) = batman_2_index.add_documents(documents, None).await; - batman_2_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _) = batman_2_index .update_settings(json!({ @@ -4170,7 +4170,7 @@ async fn federation_facets_different_indexes_same_facet() { ] })) .await; - batman_2_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); // return titles ordered across indexes let (response, code) = server @@ -4677,7 +4677,7 @@ async fn federation_facets_same_indexes() { let documents = NESTED_DOCUMENTS.clone(); let (value, _) = doggos_index.add_documents(documents, None).await; - doggos_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _) = doggos_index .update_settings(json!({ @@ -4692,13 +4692,13 @@ async fn 
federation_facets_same_indexes() { ] })) .await; - doggos_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let doggos2_index = server.unique_index_with_prefix("doggos_2"); let documents = NESTED_DOCUMENTS.clone(); let (value, _) = doggos2_index.add_documents(documents, None).await; - doggos2_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _) = doggos2_index .update_settings(json!({ @@ -4713,7 +4713,7 @@ async fn federation_facets_same_indexes() { ] })) .await; - doggos2_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": { @@ -4980,7 +4980,7 @@ async fn federation_inconsistent_merge_order() { let documents = DOCUMENTS.clone(); let (value, _) = movies2_index.add_documents(documents, None).await; - movies2_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _) = movies2_index .update_settings(json!({ @@ -4999,7 +4999,7 @@ async fn federation_inconsistent_merge_order() { } })) .await; - movies2_index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let batman_index = shared_batman_index().await; diff --git a/crates/meilisearch/tests/search/pagination.rs b/crates/meilisearch/tests/search/pagination.rs index f8b698a95..c0752e7ec 100644 --- a/crates/meilisearch/tests/search/pagination.rs +++ b/crates/meilisearch/tests/search/pagination.rs @@ -114,14 +114,14 @@ async fn ensure_placeholder_search_hit_count_valid() { } ]); let (task, _code) = index.add_documents(documents, None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, _code) = index .update_settings( json!({ "rankingRules": ["distinct:asc"], "distinctAttribute": "distinct"}), ) .await; - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); for page in 0..=4 { index diff --git a/crates/meilisearch/tests/search/restrict_searchable.rs b/crates/meilisearch/tests/search/restrict_searchable.rs index e5408a210..bbd2a4ee3 100644 --- a/crates/meilisearch/tests/search/restrict_searchable.rs +++ b/crates/meilisearch/tests/search/restrict_searchable.rs @@ -9,7 +9,7 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) let index = server.unique_index(); let (task, _code) = index.add_documents(documents.clone(), None).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index } @@ -65,7 +65,7 @@ async fn search_no_searchable_attribute_set() { .await; let (task, _status_code) = index.update_settings_searchable_attributes(json!(["*"])).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -78,7 +78,7 @@ async fn search_no_searchable_attribute_set() { .await; let (task, _status_code) = index.update_settings_searchable_attributes(json!(["*"])).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -109,7 +109,7 @@ async fn search_on_all_attributes_restricted_set() { let server = Server::new_shared(); let index = index_with_documents(server, &SIMPLE_SEARCH_DOCUMENTS).await; let (task, _status_code) = index.update_settings_searchable_attributes(json!(["title"])).await; - 
index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "Captain Marvel", "attributesToSearchOn": ["*"]}), |response, code| { @@ -194,7 +194,7 @@ async fn word_ranking_rule_order_exact_words() { let (task, _status_code) = index .update_settings_typo_tolerance(json!({"disableOnWords": ["Captain", "Marvel"]})) .await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); // simple search should return 2 documents (ids: 2 and 3). index @@ -360,7 +360,7 @@ async fn search_on_exact_field() { let (response, code) = index.update_settings_typo_tolerance(json!({ "disableOnAttributes": ["exact"] })).await; assert_eq!(202, code, "{response:?}"); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); // Searching on an exact attribute should only return the document matching without typo. index .search(json!({"q": "Marvel", "attributesToSearchOn": ["exact"]}), |response, code| { @@ -557,7 +557,7 @@ async fn nested_search_on_title_restricted_set_with_suffix_wildcard() { let index = index_with_documents(server, &NESTED_SEARCH_DOCUMENTS).await; let (task, _status_code) = index.update_settings_searchable_attributes(json!(["details.title"])).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -595,7 +595,7 @@ async fn nested_search_no_searchable_attribute_set_with_any_wildcard() { .await; let (task, _status_code) = index.update_settings_searchable_attributes(json!(["*"])).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( @@ -608,7 +608,7 @@ async fn nested_search_no_searchable_attribute_set_with_any_wildcard() { .await; let (task, _status_code) = index.update_settings_searchable_attributes(json!(["*"])).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); index .search( From ca112a8b95ef8a40ab1d836a4565131b856aa2bc Mon Sep 17 00:00:00 2001 From: Martin Tzvetanov Grigorov Date: Sun, 22 Jun 2025 14:29:33 +0300 Subject: [PATCH 007/150] tests: Use Server::wait_task() instead of Index::wait_task() in index:: The code is mostly duplicated. Server::wait_task() has better handling for errors and more retries. 
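For illustration, a minimal before/after sketch of the pattern this change applies
across the test suite (the helper names are the ones these tests already use; treat
the exact signatures as an assumption of this note rather than a spec):

    // Before: each Index polls for task completion through its own helper,
    // duplicating the polling loop per index.
    let (task, _code) = index.create(None).await;
    index.wait_task(task.uid()).await.succeeded();

    // After: polling is centralized on the Server, so retry logic and
    // error reporting live in one shared implementation.
    let (task, _code) = index.create(None).await;
    server.wait_task(task.uid()).await.succeeded();

The observable behavior of each test is unchanged; only the owner of the
polling loop moves.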
Signed-off-by: Martin Tzvetanov Grigorov --- crates/meilisearch/tests/index/create_index.rs | 16 ++++++++-------- crates/meilisearch/tests/index/delete_index.rs | 11 ++++++----- crates/meilisearch/tests/index/get_index.rs | 2 +- crates/meilisearch/tests/index/stats.rs | 4 ++-- crates/meilisearch/tests/index/update_index.rs | 18 ++++++++++-------- 5 files changed, 27 insertions(+), 24 deletions(-) diff --git a/crates/meilisearch/tests/index/create_index.rs b/crates/meilisearch/tests/index/create_index.rs index 3422e8b3f..dc178919e 100644 --- a/crates/meilisearch/tests/index/create_index.rs +++ b/crates/meilisearch/tests/index/create_index.rs @@ -17,7 +17,7 @@ async fn create_index_no_primary_key() { assert_eq!(response["status"], "enqueued"); - let response = index.wait_task(response.uid()).await; + let response = server.wait_task(response.uid()).await; assert_eq!(response["status"], "succeeded"); assert_eq!(response["type"], "indexCreation"); @@ -34,7 +34,7 @@ async fn create_index_with_gzip_encoded_request() { assert_eq!(response["status"], "enqueued"); - let response = index.wait_task(response.uid()).await; + let response = server.wait_task(response.uid()).await; assert_eq!(response["status"], "succeeded"); assert_eq!(response["type"], "indexCreation"); @@ -83,7 +83,7 @@ async fn create_index_with_zlib_encoded_request() { assert_eq!(response["status"], "enqueued"); - let response = index.wait_task(response.uid()).await; + let response = server.wait_task(response.uid()).await; assert_eq!(response["status"], "succeeded"); assert_eq!(response["type"], "indexCreation"); @@ -100,7 +100,7 @@ async fn create_index_with_brotli_encoded_request() { assert_eq!(response["status"], "enqueued"); - let response = index.wait_task(response.uid()).await; + let response = server.wait_task(response.uid()).await; assert_eq!(response["status"], "succeeded"); assert_eq!(response["type"], "indexCreation"); @@ -117,7 +117,7 @@ async fn create_index_with_primary_key() { assert_eq!(response["status"], "enqueued"); - let response = index.wait_task(response.uid()).await.succeeded(); + let response = server.wait_task(response.uid()).await.succeeded(); assert_eq!(response["status"], "succeeded"); assert_eq!(response["type"], "indexCreation"); @@ -132,7 +132,7 @@ async fn create_index_with_invalid_primary_key() { let index = server.unique_index(); let (response, code) = index.add_documents(documents, Some("title")).await; assert_eq!(code, 202); - index.wait_task(response.uid()).await.failed(); + server.wait_task(response.uid()).await.failed(); let (response, code) = index.get().await; assert_eq!(code, 200); @@ -142,7 +142,7 @@ async fn create_index_with_invalid_primary_key() { let (response, code) = index.add_documents(documents, Some("id")).await; assert_eq!(code, 202); - index.wait_task(response.uid()).await.failed(); + server.wait_task(response.uid()).await.failed(); let (response, code) = index.get().await; assert_eq!(code, 200); @@ -181,7 +181,7 @@ async fn error_create_existing_index() { let (task, _) = index.create(Some("primary")).await; - let response = index.wait_task(task.uid()).await; + let response = server.wait_task(task.uid()).await; let msg = format!( "Index `{}` already exists.", task["indexUid"].as_str().expect("indexUid should exist").trim_matches('"') diff --git a/crates/meilisearch/tests/index/delete_index.rs b/crates/meilisearch/tests/index/delete_index.rs index 713891420..085b47294 100644 --- a/crates/meilisearch/tests/index/delete_index.rs +++ b/crates/meilisearch/tests/index/delete_index.rs @@ 
-9,7 +9,7 @@ async fn create_and_delete_index() { assert_eq!(code, 202); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); assert_eq!(index.get().await.1, 200); @@ -17,18 +17,19 @@ async fn create_and_delete_index() { assert_eq!(code, 202); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); assert_eq!(index.get().await.1, 404); } #[actix_rt::test] async fn error_delete_unexisting_index() { + let server = Server::new_shared(); let index = shared_does_not_exists_index().await; let (task, code) = index.delete_index_fail().await; assert_eq!(code, 202); - index.wait_task(task.uid()).await.failed(); + server.wait_task(task.uid()).await.failed(); let expected_response = json!({ "message": "Index `DOES_NOT_EXISTS` not found.", @@ -37,7 +38,7 @@ async fn error_delete_unexisting_index() { "link": "https://docs.meilisearch.com/errors#index_not_found" }); - let response = index.wait_task(task.uid()).await; + let response = server.wait_task(task.uid()).await; assert_eq!(response["status"], "failed"); assert_eq!(response["error"], expected_response); } @@ -58,7 +59,7 @@ async fn loop_delete_add_documents() { } for task in tasks { - let response = index.wait_task(task).await.succeeded(); + let response = server.wait_task(task).await.succeeded(); assert_eq!(response["status"], "succeeded", "{}", response); } } diff --git a/crates/meilisearch/tests/index/get_index.rs b/crates/meilisearch/tests/index/get_index.rs index 11b1817f0..ece479513 100644 --- a/crates/meilisearch/tests/index/get_index.rs +++ b/crates/meilisearch/tests/index/get_index.rs @@ -12,7 +12,7 @@ async fn create_and_get_index() { assert_eq!(code, 202); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.get().await; diff --git a/crates/meilisearch/tests/index/stats.rs b/crates/meilisearch/tests/index/stats.rs index 90c77cec8..610601318 100644 --- a/crates/meilisearch/tests/index/stats.rs +++ b/crates/meilisearch/tests/index/stats.rs @@ -10,7 +10,7 @@ async fn stats() { assert_eq!(code, 202); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.stats().await; @@ -33,7 +33,7 @@ async fn stats() { let (response, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(response.uid()).await.succeeded(); + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index.stats().await; diff --git a/crates/meilisearch/tests/index/update_index.rs b/crates/meilisearch/tests/index/update_index.rs index 291700728..1c781c386 100644 --- a/crates/meilisearch/tests/index/update_index.rs +++ b/crates/meilisearch/tests/index/update_index.rs @@ -12,10 +12,10 @@ async fn update_primary_key() { let (task, code) = index.create(None).await; assert_eq!(code, 202); - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (task, _status_code) = index.update(Some("primary")).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); let (response, code) = index.get().await; @@ -42,12 +42,12 @@ async fn create_and_update_with_different_encoding() { let (create_task, code) = index.create(None).await; assert_eq!(code, 202); - index.wait_task(create_task.uid()).await.succeeded(); + server.wait_task(create_task.uid()).await.succeeded(); let index = 
index.with_encoder(Encoder::Brotli); let (task, _status_code) = index.update(Some("primary")).await; - index.wait_task(task.uid()).await.succeeded(); + server.wait_task(task.uid()).await.succeeded(); } #[actix_rt::test] @@ -58,23 +58,24 @@ async fn update_nothing() { assert_eq!(code, 202); - index.wait_task(task1.uid()).await.succeeded(); + server.wait_task(task1.uid()).await.succeeded(); let (task2, code) = index.update(None).await; assert_eq!(code, 202); - index.wait_task(task2.uid()).await.succeeded(); + server.wait_task(task2.uid()).await.succeeded(); } #[actix_rt::test] async fn error_update_existing_primary_key() { + let server = Server::new_shared(); let index = shared_index_with_documents().await; let (update_task, code) = index.update_index_fail(Some("primary")).await; assert_eq!(code, 202); - let response = index.wait_task(update_task.uid()).await.failed(); + let response = server.wait_task(update_task.uid()).await.failed(); let expected_response = json!({ "message": format!("Index `{}`: Index already has a primary key: `id`.", index.uid), @@ -88,12 +89,13 @@ async fn error_update_existing_primary_key() { #[actix_rt::test] async fn error_update_unexisting_index() { + let server = Server::new_shared(); let index = shared_does_not_exists_index().await; let (task, code) = index.update_index_fail(Some("my-primary-key")).await; assert_eq!(code, 202); - let response = index.wait_task(task.uid()).await.failed(); + let response = server.wait_task(task.uid()).await.failed(); let expected_response = json!({ "message": format!("Index `{}` not found.", index.uid), From 4925b3019640107759c33fa99521b99c6c202277 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 23 Jun 2025 15:24:14 +0200 Subject: [PATCH 008/150] Move embedder stats out of progress --- crates/benchmarks/benches/indexing.rs | 33 ++++++++++++++- crates/benchmarks/benches/utils.rs | 3 +- crates/dump/src/lib.rs | 2 +- crates/fuzzers/src/bin/fuzz-indexing.rs | 1 + crates/index-scheduler/src/insta_snapshot.rs | 4 +- crates/index-scheduler/src/queue/batches.rs | 10 ++++- .../src/scheduler/process_batch.rs | 17 ++++++-- .../src/scheduler/process_index_operation.rs | 12 ++++-- crates/index-scheduler/src/utils.rs | 19 ++++++++- crates/meilisearch-types/src/batch_view.rs | 19 +++++++-- crates/meilisearch-types/src/batches.rs | 28 ++++++++++++- crates/meilisearch/src/lib.rs | 6 ++- crates/meilisearch/tests/vector/rest.rs | 40 +++++++++++++++++++ crates/milli/src/progress.rs | 37 ++++++----------- .../milli/src/search/new/tests/integration.rs | 3 +- crates/milli/src/test_index.rs | 5 ++- .../extract/extract_vector_points.rs | 2 + .../src/update/index_documents/extract/mod.rs | 6 +-- .../milli/src/update/index_documents/mod.rs | 17 +++++++- .../src/update/new/extract/vectors/mod.rs | 17 ++++++-- .../milli/src/update/new/indexer/extract.rs | 4 ++ crates/milli/src/update/new/indexer/mod.rs | 5 ++- crates/milli/src/update/settings.rs | 4 +- crates/milli/src/vector/rest.rs | 4 ++ crates/milli/tests/search/distinct.rs | 2 +- .../milli/tests/search/facet_distribution.rs | 3 +- crates/milli/tests/search/mod.rs | 3 +- crates/milli/tests/search/phrase_search.rs | 2 +- crates/milli/tests/search/query_criteria.rs | 7 ++-- crates/milli/tests/search/typo_tolerance.rs | 9 +++-- 30 files changed, 255 insertions(+), 69 deletions(-) diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index b882b598d..8241da9d2 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -65,7 +65,7 
@@ fn setup_settings<'t>( let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); builder.set_sortable_fields(sortable_fields); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); } fn setup_index_with_settings( @@ -169,6 +169,7 @@ fn indexing_songs_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -235,6 +236,7 @@ fn reindexing_songs_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -279,6 +281,7 @@ fn reindexing_songs_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -347,6 +350,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -423,6 +427,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -467,6 +472,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -507,6 +513,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -574,6 +581,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -640,6 +648,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -706,6 +715,7 @@ fn indexing_wiki(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -771,6 +781,7 @@ fn reindexing_wiki(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -815,6 +826,7 @@ fn reindexing_wiki(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -882,6 +894,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -958,6 +971,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -1003,6 +1017,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -1044,6 +1059,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -1110,6 +1126,7 @@ fn indexing_movies_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -1175,6 +1192,7 @@ fn reindexing_movies_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -1219,6 +1237,7 @@ fn reindexing_movies_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -1286,6 
+1305,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
                     EmbeddingConfigs::default(),
                     &|| false,
                     &Progress::default(),
+                    Default::default(),
                 )
                 .unwrap();

@@ -1334,6 +1354,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) {
                     EmbeddingConfigs::default(),
                     &|| false,
                     &Progress::default(),
+                    Default::default(),
                 )
                 .unwrap();

diff --git a/crates/benchmarks/benches/utils.rs b/crates/benchmarks/benches/utils.rs
--- a/crates/benchmarks/benches/utils.rs
+++ b/crates/benchmarks/benches/utils.rs
@@ -90,7 +90,7 @@ pub fn base_setup(conf: &Conf) -> Index {
     (conf.configure)(&mut builder);

-    builder.execute(|_| (), || false, None).unwrap();
+    builder.execute(|_| (), || false, Default::default()).unwrap();
     wtxn.commit().unwrap();

     let config = IndexerConfig::default();
@@ -128,6 +128,7 @@ pub fn base_setup(conf: &Conf) -> Index {
             EmbeddingConfigs::default(),
             &|| false,
             &Progress::default(),
+            Default::default(),
         )
         .unwrap();

diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs
index c48c68f62..b7a35ad5c 100644
--- a/crates/dump/src/lib.rs
+++ b/crates/dump/src/lib.rs
@@ -328,8 +328,8 @@ pub(crate) mod test {
                 progress_trace: Default::default(),
                 write_channel_congestion: None,
                 internal_database_sizes: Default::default(),
-                embeddings: Default::default(),
             },
+            embedder_stats: None,
             enqueued_at: Some(BatchEnqueuedAt {
                 earliest: datetime!(2022-11-11 0:00 UTC),
                 oldest: datetime!(2022-11-11 0:00 UTC),

diff --git a/crates/fuzzers/src/bin/fuzz-indexing.rs b/crates/fuzzers/src/bin/fuzz-indexing.rs
index 4df989b51..23c4cb9c2 100644
--- a/crates/fuzzers/src/bin/fuzz-indexing.rs
+++ b/crates/fuzzers/src/bin/fuzz-indexing.rs
@@ -144,6 +144,7 @@ fn main() {
                     embedders,
                     &|| false,
                     &Progress::default(),
+                    Default::default(),
                 )
                 .unwrap();

diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs
index d01548319..06ec01b5e 100644
--- a/crates/index-scheduler/src/insta_snapshot.rs
+++ b/crates/index-scheduler/src/insta_snapshot.rs
@@ -1,7 +1,7 @@
 use std::collections::BTreeSet;
 use std::fmt::Write;

-use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchStats};
+use meilisearch_types::batches::{Batch, BatchEmbeddingStats, BatchEnqueuedAt, BatchStats};
 use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str};
 use meilisearch_types::heed::{Database, RoTxn};
 use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
@@ -343,6 +343,7 @@ pub fn snapshot_batch(batch: &Batch) -> String {
         uid,
         details,
         stats,
+        embedder_stats,
         started_at,
         finished_at,
         progress: _,
@@ -366,6 +367,7 @@ pub fn snapshot_batch(batch: &Batch) -> String {
     snap.push_str(&format!("uid: {uid}, "));
     snap.push_str(&format!("details: {}, ", serde_json::to_string(details).unwrap()));
     snap.push_str(&format!("stats: {}, ", serde_json::to_string(&stats).unwrap()));
+    snap.push_str(&format!("embedder_stats: {}, ", serde_json::to_string(&embedder_stats).unwrap()));
     snap.push_str(&format!("stop reason: {}, ", serde_json::to_string(&stop_reason).unwrap()));
     snap.push('}');
     snap

diff --git a/crates/index-scheduler/src/queue/batches.rs b/crates/index-scheduler/src/queue/batches.rs
index b5b63e1d7..b14601733 100644
--- a/crates/index-scheduler/src/queue/batches.rs
+++ b/crates/index-scheduler/src/queue/batches.rs
@@ -1,7 +1,7 @@
 use std::collections::HashSet;
 use std::ops::{Bound, RangeBounds};

-use meilisearch_types::batches::{Batch, BatchId};
+use meilisearch_types::batches::{Batch, BatchEmbeddingStats, BatchId};
 use meilisearch_types::heed::types::{DecodeIgnore, SerdeBincode, SerdeJson, Str};
 use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls};
 use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
@@ -92,7 +92,10 @@ impl BatchQueue {
     }

     pub(crate) fn get_batch(&self, rtxn: &RoTxn, batch_id: BatchId) -> Result<Option<Batch>> {
{ - Ok(self.all_batches.get(rtxn, &batch_id)?) + println!("Got batch from db {batch_id:?}"); + let r = Ok(self.all_batches.get(rtxn, &batch_id)?); + println!("Got batch from db => {:?}", r); + r } /// Returns the whole set of batches that belongs to this index. @@ -171,6 +174,8 @@ impl BatchQueue { pub(crate) fn write_batch(&self, wtxn: &mut RwTxn, batch: ProcessingBatch) -> Result<()> { let old_batch = self.all_batches.get(wtxn, &batch.uid)?; + println!("Saving batch: {}", batch.embedder_stats.is_some()); + self.all_batches.put( wtxn, &batch.uid, @@ -179,6 +184,7 @@ impl BatchQueue { progress: None, details: batch.details, stats: batch.stats, + embedder_stats: batch.embedder_stats.as_ref().map(|s| BatchEmbeddingStats::from(s.as_ref())), started_at: batch.started_at, finished_at: batch.finished_at, enqueued_at: batch.enqueued_at, diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 71e423a58..4e36b65b6 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -1,10 +1,11 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::Ordering; +use std::sync::Arc; use meilisearch_types::batches::{BatchEnqueuedAt, BatchId}; use meilisearch_types::heed::{RoTxn, RwTxn}; -use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::progress::{EmbedderStats, Progress, VariableNameStep}; use meilisearch_types::milli::{self, ChannelCongestion}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::versioning::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; @@ -163,7 +164,7 @@ impl IndexScheduler { let pre_commit_dabases_sizes = index.database_sizes(&index_wtxn)?; let (tasks, congestion) = - self.apply_index_operation(&mut index_wtxn, &index, op, &progress)?; + self.apply_index_operation(&mut index_wtxn, &index, op, &progress, current_batch.clone_embedder_stats())?; { progress.update_progress(FinalizingIndexStep::Committing); @@ -238,11 +239,21 @@ impl IndexScheduler { ); builder.set_primary_key(primary_key); let must_stop_processing = self.scheduler.must_stop_processing.clone(); + + let embedder_stats = match current_batch.embedder_stats { + Some(ref stats) => stats.clone(), + None => { + let embedder_stats: Arc = Default::default(); + current_batch.embedder_stats = Some(embedder_stats.clone()); + embedder_stats + }, + }; + builder .execute( |indexing_step| tracing::debug!(update = ?indexing_step), || must_stop_processing.get(), - Some(progress.embedder_stats), + embedder_stats, ) .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; index_wtxn.commit()?; diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs index 92d13e7e7..b5338e511 100644 --- a/crates/index-scheduler/src/scheduler/process_index_operation.rs +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -4,7 +4,7 @@ use bumpalo::collections::CollectIn; use bumpalo::Bump; use meilisearch_types::heed::RwTxn; use meilisearch_types::milli::documents::PrimaryKey; -use meilisearch_types::milli::progress::Progress; +use meilisearch_types::milli::progress::{EmbedderStats, Progress}; use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction}; use meilisearch_types::milli::update::DocumentAdditionResult; use 
meilisearch_types::milli::{self, ChannelCongestion, Filter};
@@ -26,7 +26,7 @@ impl IndexScheduler {
     /// The list of processed tasks.
     #[tracing::instrument(
         level = "trace",
-        skip(self, index_wtxn, index, progress),
+        skip(self, index_wtxn, index, progress, embedder_stats),
         target = "indexing::scheduler"
     )]
     pub(crate) fn apply_index_operation<'i>(
@@ -35,6 +35,7 @@
         index: &'i Index,
         operation: IndexOperation,
         progress: &Progress,
+        embedder_stats: Arc<EmbedderStats>,
     ) -> Result<(Vec<Task>, Option<ChannelCongestion>)> {
         let indexer_alloc = Bump::new();
         let started_processing_at = std::time::Instant::now();
@@ -179,6 +180,7 @@
                         embedders,
                         &|| must_stop_processing.get(),
                         progress,
+                        embedder_stats,
                     )
                     .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?,
                 );
@@ -290,6 +292,7 @@
                         embedders,
                         &|| must_stop_processing.get(),
                         progress,
+                        embedder_stats,
                     )
                     .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
                 );
@@ -438,6 +441,7 @@
                         embedders,
                         &|| must_stop_processing.get(),
                         progress,
+                        embedder_stats,
                     )
                     .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
                 );
@@ -474,7 +478,7 @@
                     .execute(
                         |indexing_step| tracing::debug!(update = ?indexing_step),
                         || must_stop_processing.get(),
-                        Some(Arc::clone(&progress.embedder_stats))
+                        embedder_stats,
                     )
                     .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
@@ -494,6 +498,7 @@
                         tasks: cleared_tasks,
                     },
                     progress,
+                    embedder_stats.clone(),
                 )?;

                 let (settings_tasks, _congestion) = self.apply_index_operation(
@@ -501,6 +506,7 @@
                     index,
                     IndexOperation::Settings { index_uid, settings, tasks: settings_tasks },
                     progress,
+                    embedder_stats,
                 )?;

                 let mut tasks = settings_tasks;
diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs
index 67e8fc090..22e319580 100644
--- a/crates/index-scheduler/src/utils.rs
+++ b/crates/index-scheduler/src/utils.rs
@@ -2,8 +2,10 @@
 use std::collections::{BTreeSet, HashSet};
 use std::ops::Bound;
+use std::sync::Arc;
+use crate::milli::progress::EmbedderStats;

-use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchId, BatchStats};
+use meilisearch_types::batches::{Batch, BatchEmbeddingStats, BatchEnqueuedAt, BatchId, BatchStats};
 use meilisearch_types::heed::{Database, RoTxn, RwTxn};
 use meilisearch_types::milli::CboRoaringBitmapCodec;
 use meilisearch_types::task_view::DetailsView;
@@ -27,6 +29,7 @@ pub struct ProcessingBatch {
     pub uid: BatchId,
     pub details: DetailsView,
     pub stats: BatchStats,
+    pub embedder_stats: Option<Arc<EmbedderStats>>,

     pub statuses: HashSet<Status>,
     pub kinds: HashSet<Kind>,
@@ -48,6 +51,7 @@
             uid,
             details: DetailsView::default(),
             stats: BatchStats::default(),
+            embedder_stats: None,

             statuses,
             kinds: HashSet::default(),
@@ -60,6 +64,17 @@
         }
     }

+    pub fn clone_embedder_stats(&mut self) -> Arc<EmbedderStats> {
+        match self.embedder_stats {
+            Some(ref stats) => stats.clone(),
+            None => {
+                let embedder_stats: Arc<EmbedderStats> = Default::default();
+                self.embedder_stats = Some(embedder_stats.clone());
+                embedder_stats
+            },
+        }
+    }
+
     /// Update itself with the content of the task and update the batch id in the task.
     pub fn processing<'a>(&mut self, tasks: impl IntoIterator<Item = &'a mut Task>) {
         for task in tasks.into_iter() {
@@ -141,11 +156,13 @@ impl ProcessingBatch {
     }

     pub fn to_batch(&self) -> Batch {
+        println!("Converting to batch: {:?}", self.embedder_stats);
         Batch {
             uid: self.uid,
             progress: None,
             details: self.details.clone(),
             stats: self.stats.clone(),
+            embedder_stats: self.embedder_stats.as_ref().map(|s| BatchEmbeddingStats::from(s.as_ref())),
             started_at: self.started_at,
             finished_at: self.finished_at,
             enqueued_at: self.enqueued_at,
diff --git a/crates/meilisearch-types/src/batch_view.rs b/crates/meilisearch-types/src/batch_view.rs
index f0a5f364b..0a9b80f4e 100644
--- a/crates/meilisearch-types/src/batch_view.rs
+++ b/crates/meilisearch-types/src/batch_view.rs
@@ -3,7 +3,7 @@ use serde::Serialize;
 use time::{Duration, OffsetDateTime};
 use utoipa::ToSchema;

-use crate::batches::{Batch, BatchId, BatchStats};
+use crate::batches::{Batch, BatchEmbeddingStats, BatchId, BatchStats};
 use crate::task_view::DetailsView;
 use crate::tasks::serialize_duration;

@@ -14,7 +14,7 @@ pub struct BatchView {
     pub uid: BatchId,
     pub progress: Option<ProgressView>,
     pub details: DetailsView,
-    pub stats: BatchStats,
+    pub stats: BatchStatsView,
     #[serde(serialize_with = "serialize_duration", default)]
     pub duration: Option<Duration>,
     #[serde(with = "time::serde::rfc3339", default)]
@@ -25,13 +25,26 @@
     pub batch_strategy: String,
 }

+#[derive(Debug, Clone, Serialize, ToSchema)]
+#[serde(rename_all = "camelCase")]
+#[schema(rename_all = "camelCase")]
+pub struct BatchStatsView {
+    #[serde(flatten)]
+    pub stats: BatchStats,
+    #[serde(skip_serializing_if = "BatchEmbeddingStats::skip_serializing")]
+    pub embedder: Option<BatchEmbeddingStats>,
+}
+
 impl BatchView {
     pub fn from_batch(batch: &Batch) -> Self {
         Self {
             uid: batch.uid,
             progress: batch.progress.clone(),
             details: batch.details.clone(),
-            stats: batch.stats.clone(),
+            stats: BatchStatsView {
+                stats: batch.stats.clone(),
+                embedder: batch.embedder_stats.clone(),
+            },
             duration: batch.finished_at.map(|finished_at| finished_at - batch.started_at),
             started_at: batch.started_at,
             finished_at: batch.finished_at,
diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs
index 2ef373eac..24be75d1c 100644
--- a/crates/meilisearch-types/src/batches.rs
+++ b/crates/meilisearch-types/src/batches.rs
@@ -1,6 +1,7 @@
 use std::collections::BTreeMap;
+use std::sync::Arc;

-use milli::progress::ProgressView;
+use milli::progress::{EmbedderStats, ProgressView};
 use serde::{Deserialize, Serialize};
 use time::OffsetDateTime;
 use utoipa::ToSchema;
@@ -19,6 +20,7 @@ pub struct Batch {
     pub progress: Option<ProgressView>,
     pub details: DetailsView,
     pub stats: BatchStats,
+    pub embedder_stats: Option<BatchEmbeddingStats>,

     #[serde(with = "time::serde::rfc3339")]
     pub started_at: OffsetDateTime,
@@ -43,6 +45,7 @@ impl PartialEq for Batch {
             progress,
             details,
             stats,
+            embedder_stats,
             started_at,
             finished_at,
             enqueued_at,
@@ -53,6 +56,7 @@
             && progress.is_none() == other.progress.is_none()
             && details == &other.details
             && stats == &other.stats
+            && embedder_stats == &other.embedder_stats
             && started_at == &other.started_at
             && finished_at == &other.finished_at
             && enqueued_at == &other.enqueued_at
@@ -82,7 +86,6 @@ pub struct BatchStats {
     pub write_channel_congestion: Option<serde_json::Map<String, serde_json::Value>>,
     #[serde(default, skip_serializing_if = "serde_json::Map::is_empty")]
     pub internal_database_sizes: serde_json::Map<String, serde_json::Value>,
-    pub embeddings: BatchEmbeddingStats
 }

 #[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)]
@@ -91,5 +94,26 @@ pub struct BatchStats {
 pub struct BatchEmbeddingStats {
     pub total_count: usize,
     pub error_count: usize,
+    #[serde(skip_serializing_if = "Option::is_none")]
     pub last_error: Option<String>,
 }
+
+impl From<&EmbedderStats> for BatchEmbeddingStats {
+    fn from(stats: &EmbedderStats) -> Self {
+        let errors = stats.errors.read().unwrap();
+        Self {
+            total_count: stats.total_requests.load(std::sync::atomic::Ordering::Relaxed),
+            error_count: errors.1 as usize,
+            last_error: errors.0.clone(),
+        }
+    }
+}
+
+impl BatchEmbeddingStats {
+    pub fn skip_serializing(this: &Option<BatchEmbeddingStats>) -> bool {
+        match this {
+            Some(stats) => stats.total_count == 0 && stats.error_count == 0 && stats.last_error.is_none(),
+            None => true,
+        }
+    }
+}
diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs
index 782d6172f..72be6aec9 100644
--- a/crates/meilisearch/src/lib.rs
+++ b/crates/meilisearch/src/lib.rs
@@ -37,6 +37,7 @@ use index_scheduler::{IndexScheduler, IndexSchedulerOptions};
 use meilisearch_auth::{open_auth_store_env, AuthController};
 use meilisearch_types::milli::constants::VERSION_MAJOR;
 use meilisearch_types::milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
+use meilisearch_types::milli::progress::EmbedderStats;
 use meilisearch_types::milli::update::{
     default_thread_pool_and_threads, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig,
 };
@@ -542,8 +543,9 @@ fn import_dump(
     tracing::info!("Importing the settings.");
     let settings = index_reader.settings()?;
     apply_settings_to_builder(&settings, &mut builder);
+    let embedder_stats: Arc<EmbedderStats> = Default::default(); // FIXME: this isn't linked to anything
     builder
-        .execute(|indexing_step| tracing::debug!("update: {:?}", indexing_step), || false, None)?;
+        .execute(|indexing_step| tracing::debug!("update: {:?}", indexing_step), || false, embedder_stats.clone())?;

     // 4.3 Import the documents.
     // 4.3.1 We need to recreate the grenad+obkv format accepted by the index.
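The From impl above is the entire bridge between the live counters and the persisted batch: the RwLock is read once and the atomic counter loaded once, so the snapshot is cheap and never holds up the embedder threads. A minimal self-contained sketch of the same pattern, with stand-in types whose field shapes follow EmbedderStats and BatchEmbeddingStats (this is not the actual Meilisearch code):

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, RwLock};

// Live counters, shared with the embedder threads.
#[derive(Default)]
struct Stats {
    errors: Arc<RwLock<(Option<String>, u32)>>, // (last error message, error count)
    total_requests: AtomicUsize,
}

// Plain snapshot stored on the persisted batch.
#[derive(Debug)]
struct Snapshot {
    total_count: usize,
    error_count: usize,
    last_error: Option<String>,
}

impl From<&Stats> for Snapshot {
    fn from(stats: &Stats) -> Self {
        // Read the lock once and copy everything out, mirroring the impl above.
        let errors = stats.errors.read().unwrap();
        Snapshot {
            total_count: stats.total_requests.load(Ordering::Relaxed),
            error_count: errors.1 as usize,
            last_error: errors.0.clone(),
        }
    }
}

fn main() {
    let stats = Stats::default();
    stats.total_requests.fetch_add(3, Ordering::Relaxed);
    *stats.errors.write().unwrap() = (Some("503 from the embedder".into()), 1);
    println!("{:?}", Snapshot::from(&stats));
}

One consequence of this shape, visible in skip_serializing above, is that a batch that never touched an embedder serializes without an embedder block at all, rather than with zeroed counters.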
@@ -574,7 +576,7 @@ fn import_dump( }, |indexing_step| tracing::trace!("update: {:?}", indexing_step), || false, - None, + embedder_stats, )?; let builder = builder.with_embedders(embedders); diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 82fc71b26..1ff2dd9fe 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -4,6 +4,8 @@ use meili_snap::{json_string, snapshot}; use reqwest::IntoUrl; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, Request, ResponseTemplate}; +use std::thread::sleep; +use std::time::Duration; use crate::common::Value; use crate::json; @@ -305,6 +307,7 @@ async fn create_mock_raw() -> (MockServer, Value) { Mock::given(method("POST")) .and(path("/")) .respond_with(move |req: &Request| { + println!("Sent!"); let req: String = match req.body_json() { Ok(req) => req, Err(error) => { @@ -2111,3 +2114,40 @@ async fn searchable_reindex() { } "###); } + + +#[actix_rt::test] +async fn observability() { + let (_mock, setting) = create_mock_raw().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel"}, + {"id": 3, "name": "missing"}, // Stuff that doesn't exist + {"id": 4, "name": "invalid"}, + {"id": 5, "name": "foobar"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + + let batches = index.filtered_batches(&[], &[], &[]).await; + println!("Batches: {batches:?}"); + + let task = index.wait_task(value.uid()).await; + let batches = index.filtered_batches(&[], &[], &[]).await; + println!("Batches: {batches:?}"); + +} diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index ff795b220..7026f0c11 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -20,7 +20,6 @@ pub trait Step: 'static + Send + Sync { #[derive(Clone, Default)] pub struct Progress { steps: Arc>, - pub embedder_stats: Arc, } #[derive(Default)] @@ -29,6 +28,17 @@ pub struct EmbedderStats { pub total_requests: AtomicUsize } +impl std::fmt::Debug for EmbedderStats { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let (error, count) = self.errors.read().unwrap().clone(); + f.debug_struct("EmbedderStats") + .field("errors", &error) + .field("total_requests", &self.total_requests.load(Ordering::Relaxed)) + .field("error_count", &count) + .finish() + } +} + #[derive(Default)] struct InnerProgress { /// The hierarchy of steps. @@ -72,19 +82,7 @@ impl Progress { }); } - let embedder_view = { - let (last_error, error_count) = match self.embedder_stats.errors.read() { - Ok(guard) => (guard.0.clone(), guard.1), - Err(_) => (None, 0), - }; - EmbedderStatsView { - last_error, - request_count: self.embedder_stats.total_requests.load(Ordering::Relaxed) as u32, - error_count, - } - }; - - ProgressView { steps: step_view, percentage: percentage * 100.0, embedder: embedder_view } + ProgressView { steps: step_view, percentage: percentage * 100.0 } } pub fn accumulated_durations(&self) -> IndexMap { @@ -228,7 +226,6 @@ make_enum_progress! 
{ pub struct ProgressView { pub steps: Vec, pub percentage: f32, - pub embedder: EmbedderStatsView, } #[derive(Debug, Serialize, Clone, ToSchema)] @@ -240,16 +237,6 @@ pub struct ProgressStepView { pub total: u32, } -#[derive(Debug, Serialize, Clone, ToSchema)] -#[serde(rename_all = "camelCase")] -#[schema(rename_all = "camelCase")] -pub struct EmbedderStatsView { - #[serde(skip_serializing_if = "Option::is_none")] - pub last_error: Option, - pub request_count: u32, - pub error_count: u32, -} - /// Used when the name can change but it's still the same step. /// To avoid conflicts on the `TypeId`, create a unique type every time you use this step: /// ```text diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index e7634a4eb..0b7e1a292 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -44,7 +44,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("america") => vec![S("the united states")], }); builder.set_searchable_fields(vec![S("title"), S("description")]); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); wtxn.commit().unwrap(); // index documents @@ -95,6 +95,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index 634d45195..3546660b0 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -103,6 +103,7 @@ impl TempIndex { embedders, &|| false, &Progress::default(), + Default::default(), ) }) .unwrap()?; @@ -134,7 +135,7 @@ impl TempIndex { ) -> Result<(), crate::error::Error> { let mut builder = update::Settings::new(wtxn, &self.inner, &self.indexer_config); update(&mut builder); - builder.execute(drop, || false, None)?; + builder.execute(drop, || false, Default::default())?; Ok(()) } @@ -185,6 +186,7 @@ impl TempIndex { embedders, &|| false, &Progress::default(), + Default::default(), ) }) .unwrap()?; @@ -259,6 +261,7 @@ fn aborting_indexation() { embedders, &|| should_abort.load(Relaxed), &Progress::default(), + Default::default(), ) }) .unwrap() diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 5e6bde53d..de91e9f10 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -687,6 +687,8 @@ pub fn extract_embeddings( unused_vectors_distribution: &UnusedVectorsDistribution, request_threads: &ThreadPoolNoAbort, ) -> Result>> { + println!("Extract embedder stats {}:", embedder_stats.is_some()); + let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs index 020b48f2c..f4f3ad22e 100644 --- a/crates/milli/src/update/index_documents/extract/mod.rs +++ b/crates/milli/src/update/index_documents/extract/mod.rs @@ -50,7 +50,7 @@ pub(crate) fn data_from_obkv_documents( settings_diff: Arc, max_positions_per_attributes: Option, possible_embedding_mistakes: Arc, - embedder_stats: Option>, + 
embedder_stats: Arc, ) -> Result<()> { let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join( || { @@ -234,7 +234,7 @@ fn send_original_documents_data( embedders_configs: Arc>, settings_diff: Arc, possible_embedding_mistakes: Arc, - embedder_stats: Option>, + embedder_stats: Arc, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -274,7 +274,7 @@ fn send_original_documents_data( embedder.clone(), &embedder_name, &possible_embedding_mistakes, - embedder_stats.clone(), + Some(embedder_stats.clone()), &unused_vectors_distribution, request_threads(), ) { diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index fad43bd30..f2e1783e4 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -81,7 +81,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> { added_documents: u64, deleted_documents: u64, embedders: EmbeddingConfigs, - embedder_stats: Option>, + embedder_stats: Arc, } #[derive(Default, Debug, Clone)] @@ -104,7 +104,7 @@ where config: IndexDocumentsConfig, progress: FP, should_abort: FA, - embedder_stats: Option>, + embedder_stats: Arc, ) -> Result> { let transform = Some(Transform::new( wtxn, @@ -2030,6 +2030,7 @@ mod tests { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2117,6 +2118,7 @@ mod tests { EmbeddingConfigs::default(), &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2302,6 +2304,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2364,6 +2367,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2417,6 +2421,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2469,6 +2474,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2523,6 +2529,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2582,6 +2589,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2634,6 +2642,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2686,6 +2695,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2884,6 +2894,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2943,6 +2954,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2999,6 +3011,7 @@ mod tests { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); wtxn.commit().unwrap(); diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 5b6559d74..f720b81e2 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -1,4 +1,5 @@ -use std::cell::RefCell; +use std::f32::consts::E; +use std::{cell::RefCell, sync::Arc}; use 
bumpalo::collections::Vec as BVec; use bumpalo::Bump; @@ -6,6 +7,7 @@ use hashbrown::{DefaultHashBuilder, HashMap}; use super::cache::DelAddRoaringBitmap; use crate::error::FaultSource; +use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; @@ -22,6 +24,7 @@ pub struct EmbeddingExtractor<'a, 'b> { embedders: &'a EmbeddingConfigs, sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, + embedder_stats: Option>, threads: &'a ThreadPoolNoAbort, } @@ -30,10 +33,11 @@ impl<'a, 'b> EmbeddingExtractor<'a, 'b> { embedders: &'a EmbeddingConfigs, sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, + embedder_stats: Option>, threads: &'a ThreadPoolNoAbort, ) -> Self { let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution); - Self { embedders, sender, threads, possible_embedding_mistakes } + Self { embedders, sender, threads, possible_embedding_mistakes, embedder_stats } } } @@ -75,6 +79,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { prompt, context.data, &self.possible_embedding_mistakes, + self.embedder_stats.clone(), self.threads, self.sender, &context.doc_alloc, @@ -307,6 +312,7 @@ struct Chunks<'a, 'b, 'extractor> { dimensions: usize, prompt: &'a Prompt, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, + embedder_stats: Option>, user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, sender: EmbeddingSender<'a, 'b>, @@ -322,6 +328,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { prompt: &'a Prompt, user_provided: &'a RefCell>, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, + embedder_stats: Option>, threads: &'a ThreadPoolNoAbort, sender: EmbeddingSender<'a, 'b>, doc_alloc: &'a Bump, @@ -336,6 +343,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { embedder, prompt, possible_embedding_mistakes, + embedder_stats, threads, sender, embedder_id, @@ -371,6 +379,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { self.embedder_id, self.embedder_name, self.possible_embedding_mistakes, + self.embedder_stats.clone(), unused_vectors_distribution, self.threads, self.sender, @@ -389,6 +398,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { self.embedder_id, self.embedder_name, self.possible_embedding_mistakes, + self.embedder_stats.clone(), unused_vectors_distribution, self.threads, self.sender, @@ -407,6 +417,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { embedder_id: u8, embedder_name: &str, possible_embedding_mistakes: &PossibleEmbeddingMistakes, + embedder_stats: Option>, unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, sender: EmbeddingSender<'a, 'b>, @@ -450,7 +461,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))); } - let res = match embedder.embed_index_ref(texts.as_slice(), threads, None) { + let res = match embedder.embed_index_ref(texts.as_slice(), threads, embedder_stats) { Ok(embeddings) => { for (docid, embedding) in ids.into_iter().zip(embeddings) { sender.set_vector(*docid, embedder_id, embedding).unwrap(); diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index bb36ddc37..72c63b605 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ 
b/crates/milli/src/update/new/indexer/extract.rs @@ -1,6 +1,7 @@ use std::collections::BTreeMap; use std::sync::atomic::AtomicBool; use std::sync::OnceLock; +use std::sync::Arc; use bumpalo::Bump; use roaring::RoaringBitmap; @@ -13,6 +14,7 @@ use super::super::thread_local::{FullySend, ThreadLocal}; use super::super::FacetFieldIdsDelta; use super::document_changes::{extract, DocumentChanges, IndexingContext}; use crate::index::IndexEmbeddingConfig; +use crate::progress::EmbedderStats; use crate::progress::MergingWordCache; use crate::proximity::ProximityPrecision; use crate::update::new::extract::EmbeddingExtractor; @@ -34,6 +36,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( mut index_embeddings: Vec, document_ids: &mut RoaringBitmap, modified_docids: &mut RoaringBitmap, + embedder_stats: Arc, ) -> Result<(FacetFieldIdsDelta, Vec)> where DC: DocumentChanges<'pl>, @@ -245,6 +248,7 @@ where embedders, embedding_sender, field_distribution, + Some(embedder_stats), request_threads(), ); let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 2ea3c787e..52fd6cd0b 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -1,6 +1,7 @@ use std::sync::atomic::AtomicBool; use std::sync::{Once, RwLock}; use std::thread::{self, Builder}; +use std::sync::Arc; use big_s::S; use document_changes::{DocumentChanges, IndexingContext}; @@ -19,7 +20,7 @@ use super::steps::IndexingStep; use super::thread_local::ThreadLocal; use crate::documents::PrimaryKey; use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; -use crate::progress::Progress; +use crate::progress::{EmbedderStats, Progress}; use crate::update::GrenadParameters; use crate::vector::{ArroyWrapper, EmbeddingConfigs}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort}; @@ -55,6 +56,7 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>( embedders: EmbeddingConfigs, must_stop_processing: &'indexer MSP, progress: &'indexer Progress, + embedder_stats: Arc, ) -> Result where DC: DocumentChanges<'pl>, @@ -158,6 +160,7 @@ where index_embeddings, document_ids, modified_docids, + embedder_stats, ) }) .unwrap() diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 7c5a70aa3..98ee86978 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -475,7 +475,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { progress_callback: &FP, should_abort: &FA, settings_diff: InnerIndexSettingsDiff, - embedder_stats: Option>, + embedder_stats: Arc, ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -1358,7 +1358,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } } - pub fn execute(mut self, progress_callback: FP, should_abort: FA, embedder_stats: Option>) -> Result<()> + pub fn execute(mut self, progress_callback: FP, should_abort: FA, embedder_stats: Arc) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index fc0ff308b..9aeb73f42 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -295,6 +295,10 @@ fn embed( where S: Serialize, { + use std::backtrace::Backtrace; + + println!("Embedder stats? 
{}", embedder_stats.is_some()); + let request = data.client.post(&data.url); let request = if let Some(bearer) = &data.bearer { request.set("Authorization", bearer) diff --git a/crates/milli/tests/search/distinct.rs b/crates/milli/tests/search/distinct.rs index 55e43c8fa..15fcf70a2 100644 --- a/crates/milli/tests/search/distinct.rs +++ b/crates/milli/tests/search/distinct.rs @@ -19,7 +19,7 @@ macro_rules! test_distinct { let config = milli::update::IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_distinct_field(S(stringify!($distinct))); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index 588662735..5ed223400 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -25,7 +25,7 @@ fn test_facet_distribution_with_no_facet_values() { FilterableAttributesRule::Field(S("genres")), FilterableAttributesRule::Field(S("tags")), ]); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); wtxn.commit().unwrap(); // index documents @@ -74,6 +74,7 @@ fn test_facet_distribution_with_no_facet_values() { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 1e0c24608..beee4ac54 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -63,7 +63,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("america") => vec![S("the united states")], }); builder.set_searchable_fields(vec![S("title"), S("description")]); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); wtxn.commit().unwrap(); // index documents @@ -114,6 +114,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); diff --git a/crates/milli/tests/search/phrase_search.rs b/crates/milli/tests/search/phrase_search.rs index c5a95f7cd..180fcd176 100644 --- a/crates/milli/tests/search/phrase_search.rs +++ b/crates/milli/tests/search/phrase_search.rs @@ -10,7 +10,7 @@ fn set_stop_words(index: &Index, stop_words: &[&str]) { let mut builder = Settings::new(&mut wtxn, index, &config); let stop_words = stop_words.iter().map(|s| s.to_string()).collect(); builder.set_stop_words(stop_words); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); wtxn.commit().unwrap(); } diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index b7614c215..04b8374de 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -236,7 +236,7 @@ fn criteria_mixup() { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.clone()); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -276,7 +276,7 @@ fn criteria_ascdesc() { S("name"), S("age"), }); - builder.execute(|_| (), 
|| false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); wtxn.commit().unwrap(); let mut wtxn = index.write_txn().unwrap(); @@ -344,6 +344,7 @@ fn criteria_ascdesc() { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -358,7 +359,7 @@ fn criteria_ascdesc() { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(vec![criterion.clone()]); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs index bf9a730c9..e2cdab550 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -46,7 +46,7 @@ fn test_typo_tolerance_one_typo() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); builder.set_min_word_len_one_typo(4); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); @@ -92,7 +92,7 @@ fn test_typo_tolerance_two_typo() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); builder.set_min_word_len_two_typos(7); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); @@ -153,6 +153,7 @@ fn test_typo_disabled_on_word() { embedders, &|| false, &Progress::default(), + Default::default(), ) .unwrap(); @@ -180,7 +181,7 @@ fn test_typo_disabled_on_word() { // `zealand` doesn't allow typos anymore exact_words.insert("zealand".to_string()); builder.set_exact_words(exact_words); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); let mut search = Search::new(&txn, &index); search.query("zealand"); @@ -218,7 +219,7 @@ fn test_disable_typo_on_attribute() { let mut builder = Settings::new(&mut txn, &index, &config); // disable typos on `description` builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); - builder.execute(|_| (), || false, None).unwrap(); + builder.execute(|_| (), || false, Default::default()).unwrap(); let mut search = Search::new(&txn, &index); search.query("antebelum"); From 2f82d945028fce52efaf704c2b0a4060093369fa Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 23 Jun 2025 18:55:23 +0200 Subject: [PATCH 009/150] Fix the test and simplify types --- crates/dump/src/lib.rs | 2 +- crates/index-scheduler/src/queue/batches.rs | 4 +- .../src/scheduler/create_batch.rs | 7 +- .../src/scheduler/process_batch.rs | 13 +--- crates/index-scheduler/src/utils.rs | 26 +++---- crates/meilisearch-types/src/batch_view.rs | 4 +- crates/meilisearch-types/src/batches.rs | 10 ++- crates/meilisearch/tests/vector/rest.rs | 71 +++++++++++++++---- crates/milli/src/vector/rest.rs | 1 + 9 files changed, 87 insertions(+), 51 deletions(-) diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index b7a35ad5c..a84ec4ba5 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -329,7 +329,7 @@ pub(crate) mod test { write_channel_congestion: None, internal_database_sizes: Default::default(), }, - 
embedder_stats: None, + embedder_stats: Default::default(), enqueued_at: Some(BatchEnqueuedAt { earliest: datetime!(2022-11-11 0:00 UTC), oldest: datetime!(2022-11-11 0:00 UTC), diff --git a/crates/index-scheduler/src/queue/batches.rs b/crates/index-scheduler/src/queue/batches.rs index b14601733..c82d5acd2 100644 --- a/crates/index-scheduler/src/queue/batches.rs +++ b/crates/index-scheduler/src/queue/batches.rs @@ -174,7 +174,7 @@ impl BatchQueue { pub(crate) fn write_batch(&self, wtxn: &mut RwTxn, batch: ProcessingBatch) -> Result<()> { let old_batch = self.all_batches.get(wtxn, &batch.uid)?; - println!("Saving batch: {}", batch.embedder_stats.is_some()); + println!("Saving batch: {:?}", batch.embedder_stats); self.all_batches.put( wtxn, @@ -184,7 +184,7 @@ impl BatchQueue { progress: None, details: batch.details, stats: batch.stats, - embedder_stats: batch.embedder_stats.as_ref().map(|s| BatchEmbeddingStats::from(s.as_ref())), + embedder_stats: batch.embedder_stats.as_ref().into(), started_at: batch.started_at, finished_at: batch.finished_at, enqueued_at: batch.enqueued_at, diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index e3763881b..fc20b6fd5 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -437,8 +437,10 @@ impl IndexScheduler { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::InsideCreateBatch)?; + println!("create next batch"); let batch_id = self.queue.batches.next_batch_id(rtxn)?; let mut current_batch = ProcessingBatch::new(batch_id); + println!("over"); let enqueued = &self.queue.tasks.get_status(rtxn, Status::Enqueued)?; let count_total_enqueued = enqueued.len(); @@ -454,6 +456,7 @@ impl IndexScheduler { kind: Kind::TaskCancelation, id: task_id, }); + println!("task cancelled"); return Ok(Some((Batch::TaskCancelation { task }, current_batch))); } @@ -524,7 +527,7 @@ impl IndexScheduler { } // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. - let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) }; + let task_id = if let Some(task_id) = enqueued.min() { task_id } else { println!("return"); return Ok(None) }; let mut task = self.queue.tasks.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; @@ -602,6 +605,7 @@ impl IndexScheduler { autobatcher::autobatch(enqueued, index_already_exists, primary_key.as_deref()) { current_batch.reason(autobatch_stop_reason.unwrap_or(stop_reason)); + println!("autobatch"); return Ok(self .create_next_batch_index( rtxn, @@ -615,6 +619,7 @@ impl IndexScheduler { // If we found no tasks then we were notified for something that got autobatched // somehow and there is nothing to do. 
+ println!("nothing to do"); Ok(None) } } diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 4e36b65b6..c5305cf21 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -164,7 +164,7 @@ impl IndexScheduler { let pre_commit_dabases_sizes = index.database_sizes(&index_wtxn)?; let (tasks, congestion) = - self.apply_index_operation(&mut index_wtxn, &index, op, &progress, current_batch.clone_embedder_stats())?; + self.apply_index_operation(&mut index_wtxn, &index, op, &progress, current_batch.embedder_stats.clone())?; { progress.update_progress(FinalizingIndexStep::Committing); @@ -240,20 +240,11 @@ impl IndexScheduler { builder.set_primary_key(primary_key); let must_stop_processing = self.scheduler.must_stop_processing.clone(); - let embedder_stats = match current_batch.embedder_stats { - Some(ref stats) => stats.clone(), - None => { - let embedder_stats: Arc = Default::default(); - current_batch.embedder_stats = Some(embedder_stats.clone()); - embedder_stats - }, - }; - builder .execute( |indexing_step| tracing::debug!(update = ?indexing_step), || must_stop_processing.get(), - embedder_stats, + current_batch.embedder_stats.clone(), ) .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; index_wtxn.commit()?; diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 22e319580..455b6a2e7 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -29,7 +29,7 @@ pub struct ProcessingBatch { pub uid: BatchId, pub details: DetailsView, pub stats: BatchStats, - pub embedder_stats: Option>, + pub embedder_stats: Arc, pub statuses: HashSet, pub kinds: HashSet, @@ -47,11 +47,13 @@ impl ProcessingBatch { let mut statuses = HashSet::default(); statuses.insert(Status::Processing); + println!("Processing batch created: {}", uid); + Self { uid, details: DetailsView::default(), stats: BatchStats::default(), - embedder_stats: None, + embedder_stats: Default::default(), statuses, kinds: HashSet::default(), @@ -64,17 +66,6 @@ impl ProcessingBatch { } } - pub fn clone_embedder_stats(&mut self) -> Arc { - match self.embedder_stats { - Some(ref stats) => stats.clone(), - None => { - let embedder_stats: Arc = Default::default(); - self.embedder_stats = Some(embedder_stats.clone()); - embedder_stats - }, - } - } - /// Update itself with the content of the task and update the batch id in the task. pub fn processing<'a>(&mut self, tasks: impl IntoIterator) { for task in tasks.into_iter() { @@ -113,11 +104,14 @@ impl ProcessingBatch { } pub fn reason(&mut self, reason: BatchStopReason) { + println!("batch stopped: {:?}", reason); self.reason = reason; } /// Must be called once the batch has finished processing. pub fn finished(&mut self) { + println!("Batch finished: {}", self.uid); + self.details = DetailsView::default(); self.stats = BatchStats::default(); self.finished_at = Some(OffsetDateTime::now_utc()); @@ -132,6 +126,8 @@ impl ProcessingBatch { /// Update the timestamp of the tasks and the inner structure of this structure. 
pub fn update(&mut self, task: &mut Task) { + println!("Updating task: {} in batch: {}", task.uid, self.uid); + // We must re-set this value in case we're dealing with a task that has been added between // the `processing` and `finished` state // We must re-set this value in case we're dealing with a task that has been added between @@ -156,13 +152,13 @@ impl ProcessingBatch { } pub fn to_batch(&self) -> Batch { - println!("Converting to batch: {:?}", self.embedder_stats); + println!("Converting to batch: {:?} {:?}", self.uid, self.embedder_stats); Batch { uid: self.uid, progress: None, details: self.details.clone(), stats: self.stats.clone(), - embedder_stats: self.embedder_stats.as_ref().map(|s| BatchEmbeddingStats::from(s.as_ref())), + embedder_stats: self.embedder_stats.as_ref().into(), started_at: self.started_at, finished_at: self.finished_at, enqueued_at: self.enqueued_at, diff --git a/crates/meilisearch-types/src/batch_view.rs b/crates/meilisearch-types/src/batch_view.rs index 0a9b80f4e..bd56f5b1a 100644 --- a/crates/meilisearch-types/src/batch_view.rs +++ b/crates/meilisearch-types/src/batch_view.rs @@ -31,8 +31,8 @@ pub struct BatchView { pub struct BatchStatsView { #[serde(flatten)] pub stats: BatchStats, - #[serde(skip_serializing_if = "BatchEmbeddingStats::skip_serializing")] - pub embedder: Option, + #[serde(skip_serializing_if = "BatchEmbeddingStats::skip_serializing", default)] + pub embedder: BatchEmbeddingStats, } impl BatchView { diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index 24be75d1c..e1c9411b6 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -20,7 +20,8 @@ pub struct Batch { pub progress: Option, pub details: DetailsView, pub stats: BatchStats, - pub embedder_stats: Option, + #[serde(skip_serializing_if = "BatchEmbeddingStats::skip_serializing", default)] + pub embedder_stats: BatchEmbeddingStats, #[serde(with = "time::serde::rfc3339")] pub started_at: OffsetDateTime, @@ -110,10 +111,7 @@ impl From<&EmbedderStats> for BatchEmbeddingStats { } impl BatchEmbeddingStats { - pub fn skip_serializing(this: &Option) -> bool { - match this { - Some(stats) => stats.total_count == 0 && stats.error_count == 0 && stats.last_error.is_none(), - None => true, - } + pub fn skip_serializing(&self) -> bool { + self.total_count == 0 && self.error_count == 0 && self.last_error.is_none() } } diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 1ff2dd9fe..156a2f07b 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -1,10 +1,12 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; use meili_snap::{json_string, snapshot}; use reqwest::IntoUrl; +use tokio::spawn; +use tokio::sync::mpsc; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, Request, ResponseTemplate}; -use std::thread::sleep; +use tokio::time::sleep; use std::time::Duration; use crate::common::Value; @@ -307,7 +309,6 @@ async fn create_mock_raw() -> (MockServer, Value) { Mock::given(method("POST")) .and(path("/")) .respond_with(move |req: &Request| { - println!("Sent!"); let req: String = match req.body_json() { Ok(req) => req, Err(error) => { @@ -337,6 +338,50 @@ async fn create_mock_raw() -> (MockServer, Value) { (mock_server, embedder_settings) } +/// A mock server that returns 500 errors, and sends a message once 5 requests are received +async fn create_faulty_mock_raw(mut 
sender: mpsc::Sender<()>) -> (MockServer, Value) { + let mock_server = MockServer::start().await; + + Mock::given(method("POST")) + .and(path("/")) + .respond_with(move |req: &Request| { + let req: String = match req.body_json() { + Ok(req) => req, + Err(error) => { + return ResponseTemplate::new(400).set_body_json(json!({ + "error": format!("Invalid request: {error}") + })); + } + }; + + let sender = sender.clone(); + spawn(async move { + sender.send(()).await; + }); + + ResponseTemplate::new(500) + .set_delay(Duration::from_millis(500)) + .set_body_json(json!({ + "error": "Service Unavailable", + "text": req + })) + }) + .mount(&mock_server) + .await; + let url = mock_server.uri(); + + let embedder_settings = json!({ + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{text}}", + "response": "{{embedding}}", + "documentTemplate": "{{doc.name}}" + }); + + (mock_server, embedder_settings) +} + pub async fn post(url: T, text: &str) -> reqwest::Result { reqwest::Client::builder().build()?.post(url).json(&json!(text)).send().await } @@ -2118,7 +2163,8 @@ async fn searchable_reindex() { #[actix_rt::test] async fn observability() { - let (_mock, setting) = create_mock_raw().await; + let (sender, mut receiver) = mpsc::channel(10); + let (_mock, setting) = create_faulty_mock_raw(sender).await; let server = get_server_vector().await; let index = server.index("doggo"); @@ -2133,20 +2179,19 @@ async fn observability() { let task = server.wait_task(response.uid()).await; snapshot!(task["status"], @r###""succeeded""###); let documents = json!([ - {"id": 0, "name": "kefir"}, - {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, - {"id": 2, "name": "intel"}, - {"id": 3, "name": "missing"}, // Stuff that doesn't exist - {"id": 4, "name": "invalid"}, - {"id": 5, "name": "foobar"}, + {"id": 0, "name": "will_return_500"}, // Stuff that doesn't exist + {"id": 1, "name": "will_error"}, + {"id": 2, "name": "must_error"}, ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - let batches = index.filtered_batches(&[], &[], &[]).await; - println!("Batches: {batches:?}"); + // The task will eventually fail, so let's not wait for it. + // Let's just wait for 5 errors from the mock server. 
+ for _errors in 0..5 { + receiver.recv().await; + } - let task = index.wait_task(value.uid()).await; let batches = index.filtered_batches(&[], &[], &[]).await; println!("Batches: {batches:?}"); diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index 9aeb73f42..706a411fb 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -316,6 +316,7 @@ where if let Some(embedder_stats) = &embedder_stats { embedder_stats.as_ref().total_requests.fetch_add(1, std::sync::atomic::Ordering::Relaxed); } + // TODO: also catch 403 errors let response = request.clone().send_json(&body); let result = check_response(response, data.configuration_source).and_then(|response| { response_to_embedding(response, data, expected_count, expected_dimension) From 59a1c5d9a7f25fc4b7113eb99c8fd11e7a19ce95 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 24 Jun 2025 11:08:06 +0200 Subject: [PATCH 010/150] Make test more reproducible --- crates/meilisearch/tests/vector/rest.rs | 45 ++++++++++++++----------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 156a2f07b..54ed52213 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -1,4 +1,5 @@ use std::collections::{BTreeMap, BTreeSet}; +use std::sync::atomic::AtomicUsize; use meili_snap::{json_string, snapshot}; use reqwest::IntoUrl; @@ -338,36 +339,43 @@ async fn create_mock_raw() -> (MockServer, Value) { (mock_server, embedder_settings) } -/// A mock server that returns 500 errors, and sends a message once 5 requests are received -async fn create_faulty_mock_raw(mut sender: mpsc::Sender<()>) -> (MockServer, Value) { +async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (MockServer, Value) { let mock_server = MockServer::start().await; - + let count = AtomicUsize::new(0); + Mock::given(method("POST")) .and(path("/")) .respond_with(move |req: &Request| { - let req: String = match req.body_json() { - Ok(req) => req, + let count = count.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + + let req_body = match req.body_json::() { + Ok(body) => body, Err(error) => { return ResponseTemplate::new(400).set_body_json(json!({ - "error": format!("Invalid request: {error}") + "error": format!("Invalid request: {error}") })); } }; - let sender = sender.clone(); - spawn(async move { - sender.send(()).await; - }); + if count >= 5 { + let _ = sender.try_send(()); + ResponseTemplate::new(500) + .set_delay(Duration::from_secs(u64::MAX)) + .set_body_json(json!({ + "error": "Service Unavailable", + "text": req_body + })) + } else { - ResponseTemplate::new(500) - .set_delay(Duration::from_millis(500)) - .set_body_json(json!({ + ResponseTemplate::new(500).set_body_json(json!({ "error": "Service Unavailable", - "text": req + "text": req_body })) + } }) .mount(&mock_server) .await; + let url = mock_server.uri(); let embedder_settings = json!({ @@ -2187,12 +2195,9 @@ async fn observability() { snapshot!(code, @"202 Accepted"); // The task will eventually fail, so let's not wait for it. - // Let's just wait for 5 errors from the mock server. 
- for _errors in 0..5 { - receiver.recv().await; - } + // Let's just wait for the server to block + receiver.recv().await; let batches = index.filtered_batches(&[], &[], &[]).await; - println!("Batches: {batches:?}"); - + snapshot!(task, @r###""###); } From 4a179fb3c064aed5320e6a0e0011447e1c17b125 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 24 Jun 2025 11:38:11 +0200 Subject: [PATCH 011/150] Improve code quality --- crates/index-scheduler/src/insta_snapshot.rs | 2 +- crates/index-scheduler/src/queue/batches.rs | 9 ++------- .../index-scheduler/src/scheduler/create_batch.rs | 7 +------ crates/index-scheduler/src/utils.rs | 10 +--------- crates/meilisearch-types/src/batch_view.rs | 6 +++--- crates/meilisearch-types/src/batches.rs | 14 +++++++------- crates/meilisearch/tests/vector/rest.rs | 10 +++++----- crates/milli/src/progress.rs | 6 +++--- crates/milli/src/vector/rest.rs | 9 ++------- 9 files changed, 25 insertions(+), 48 deletions(-) diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 06ec01b5e..8e1fb1c2c 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -1,7 +1,7 @@ use std::collections::BTreeSet; use std::fmt::Write; -use meilisearch_types::batches::{Batch, BatchEmbeddingStats, BatchEnqueuedAt, BatchStats}; +use meilisearch_types::batches::{Batch, EmbedderStatsView, BatchEnqueuedAt, BatchStats}; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str}; use meilisearch_types::heed::{Database, RoTxn}; use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32}; diff --git a/crates/index-scheduler/src/queue/batches.rs b/crates/index-scheduler/src/queue/batches.rs index c82d5acd2..96a3940a5 100644 --- a/crates/index-scheduler/src/queue/batches.rs +++ b/crates/index-scheduler/src/queue/batches.rs @@ -1,7 +1,7 @@ use std::collections::HashSet; use std::ops::{Bound, RangeBounds}; -use meilisearch_types::batches::{Batch, BatchEmbeddingStats, BatchId}; +use meilisearch_types::batches::{Batch, EmbedderStatsView, BatchId}; use meilisearch_types::heed::types::{DecodeIgnore, SerdeBincode, SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls}; use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32}; @@ -92,10 +92,7 @@ impl BatchQueue { } pub(crate) fn get_batch(&self, rtxn: &RoTxn, batch_id: BatchId) -> Result> { - println!("Got batch from db {batch_id:?}"); - let r = Ok(self.all_batches.get(rtxn, &batch_id)?); - println!("Got batch from db => {:?}", r); - r + Ok(self.all_batches.get(rtxn, &batch_id)?) } /// Returns the whole set of batches that belongs to this index. 
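A side note on the test technique that stabilizes here: instead of waiting on a task that is designed to fail, the test blocks on a bounded tokio channel, and the mock signals it once its request counter crosses the threshold. A reduced sketch of that pattern outside wiremock (names are illustrative; only tokio is assumed):

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (sender, mut receiver) = mpsc::channel::<()>(10);
    let count = Arc::new(AtomicUsize::new(0));

    // Stand-in for the mock's respond_with closure: it is synchronous, so it
    // must not await; try_send either delivers the signal or drops it.
    let handler = {
        let count = Arc::clone(&count);
        move || {
            if count.fetch_add(1, Ordering::SeqCst) + 1 >= 5 {
                let _ = sender.try_send(());
            }
        }
    };

    for _ in 0..6 {
        handler();
    }

    // Test side: block here until the mock has seen enough requests.
    receiver.recv().await;
    println!("mock saw {} requests", count.load(Ordering::SeqCst));
}

The switch from a spawned sender.send(()).await to try_send matters because the wiremock responder runs synchronously; a bounded channel with try_send delivers the signal without spawning anything, and redundant signals are harmlessly discarded once the buffer is full.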
@@ -174,8 +171,6 @@ impl BatchQueue { pub(crate) fn write_batch(&self, wtxn: &mut RwTxn, batch: ProcessingBatch) -> Result<()> { let old_batch = self.all_batches.get(wtxn, &batch.uid)?; - println!("Saving batch: {:?}", batch.embedder_stats); - self.all_batches.put( wtxn, &batch.uid, diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index fc20b6fd5..e3763881b 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -437,10 +437,8 @@ impl IndexScheduler { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::InsideCreateBatch)?; - println!("create next batch"); let batch_id = self.queue.batches.next_batch_id(rtxn)?; let mut current_batch = ProcessingBatch::new(batch_id); - println!("over"); let enqueued = &self.queue.tasks.get_status(rtxn, Status::Enqueued)?; let count_total_enqueued = enqueued.len(); @@ -456,7 +454,6 @@ impl IndexScheduler { kind: Kind::TaskCancelation, id: task_id, }); - println!("task cancelled"); return Ok(Some((Batch::TaskCancelation { task }, current_batch))); } @@ -527,7 +524,7 @@ impl IndexScheduler { } // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. - let task_id = if let Some(task_id) = enqueued.min() { task_id } else { println!("return"); return Ok(None) }; + let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) }; let mut task = self.queue.tasks.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; @@ -605,7 +602,6 @@ impl IndexScheduler { autobatcher::autobatch(enqueued, index_already_exists, primary_key.as_deref()) { current_batch.reason(autobatch_stop_reason.unwrap_or(stop_reason)); - println!("autobatch"); return Ok(self .create_next_batch_index( rtxn, @@ -619,7 +615,6 @@ impl IndexScheduler { // If we found no tasks then we were notified for something that got autobatched // somehow and there is nothing to do. - println!("nothing to do"); Ok(None) } } diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 455b6a2e7..226ef9f06 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -5,7 +5,7 @@ use std::ops::Bound; use std::sync::Arc; use crate::milli::progress::EmbedderStats; -use meilisearch_types::batches::{Batch, BatchEmbeddingStats, BatchEnqueuedAt, BatchId, BatchStats}; +use meilisearch_types::batches::{Batch, EmbedderStatsView, BatchEnqueuedAt, BatchId, BatchStats}; use meilisearch_types::heed::{Database, RoTxn, RwTxn}; use meilisearch_types::milli::CboRoaringBitmapCodec; use meilisearch_types::task_view::DetailsView; @@ -47,8 +47,6 @@ impl ProcessingBatch { let mut statuses = HashSet::default(); statuses.insert(Status::Processing); - println!("Processing batch created: {}", uid); - Self { uid, details: DetailsView::default(), @@ -104,14 +102,11 @@ impl ProcessingBatch { } pub fn reason(&mut self, reason: BatchStopReason) { - println!("batch stopped: {:?}", reason); self.reason = reason; } /// Must be called once the batch has finished processing. pub fn finished(&mut self) { - println!("Batch finished: {}", self.uid); - self.details = DetailsView::default(); self.stats = BatchStats::default(); self.finished_at = Some(OffsetDateTime::now_utc()); @@ -126,8 +121,6 @@ impl ProcessingBatch { /// Update the timestamp of the tasks and the inner structure of this structure. 
pub fn update(&mut self, task: &mut Task) { - println!("Updating task: {} in batch: {}", task.uid, self.uid); - // We must re-set this value in case we're dealing with a task that has been added between // the `processing` and `finished` state // We must re-set this value in case we're dealing with a task that has been added between @@ -152,7 +145,6 @@ impl ProcessingBatch { } pub fn to_batch(&self) -> Batch { - println!("Converting to batch: {:?} {:?}", self.uid, self.embedder_stats); Batch { uid: self.uid, progress: None, diff --git a/crates/meilisearch-types/src/batch_view.rs b/crates/meilisearch-types/src/batch_view.rs index bd56f5b1a..aced97d7a 100644 --- a/crates/meilisearch-types/src/batch_view.rs +++ b/crates/meilisearch-types/src/batch_view.rs @@ -3,7 +3,7 @@ use serde::Serialize; use time::{Duration, OffsetDateTime}; use utoipa::ToSchema; -use crate::batches::{Batch, BatchEmbeddingStats, BatchId, BatchStats}; +use crate::batches::{Batch, EmbedderStatsView, BatchId, BatchStats}; use crate::task_view::DetailsView; use crate::tasks::serialize_duration; @@ -31,8 +31,8 @@ pub struct BatchView { pub struct BatchStatsView { #[serde(flatten)] pub stats: BatchStats, - #[serde(skip_serializing_if = "BatchEmbeddingStats::skip_serializing", default)] - pub embedder: BatchEmbeddingStats, + #[serde(skip_serializing_if = "EmbedderStatsView::skip_serializing", default)] + pub embedder: EmbedderStatsView, } impl BatchView { diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index e1c9411b6..45cc2d9f4 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -20,8 +20,8 @@ pub struct Batch { pub progress: Option, pub details: DetailsView, pub stats: BatchStats, - #[serde(skip_serializing_if = "BatchEmbeddingStats::skip_serializing", default)] - pub embedder_stats: BatchEmbeddingStats, + #[serde(skip_serializing_if = "EmbedderStatsView::skip_serializing", default)] + pub embedder_stats: EmbedderStatsView, #[serde(with = "time::serde::rfc3339")] pub started_at: OffsetDateTime, @@ -92,25 +92,25 @@ pub struct BatchStats { #[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] #[schema(rename_all = "camelCase")] -pub struct BatchEmbeddingStats { +pub struct EmbedderStatsView { pub total_count: usize, pub error_count: usize, - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(skip_serializing_if = "Option::is_none", default)] pub last_error: Option, } -impl From<&EmbedderStats> for BatchEmbeddingStats { +impl From<&EmbedderStats> for EmbedderStatsView { fn from(stats: &EmbedderStats) -> Self { let errors = stats.errors.read().unwrap(); Self { - total_count: stats.total_requests.load(std::sync::atomic::Ordering::Relaxed), + total_count: stats.total_count.load(std::sync::atomic::Ordering::Relaxed), error_count: errors.1 as usize, last_error: errors.0.clone(), } } } -impl BatchEmbeddingStats { +impl EmbedderStatsView { pub fn skip_serializing(&self) -> bool { self.total_count == 0 && self.error_count == 0 && self.last_error.is_none() } diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 54ed52213..1fdd18d28 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -2170,7 +2170,7 @@ async fn searchable_reindex() { #[actix_rt::test] -async fn observability() { +async fn last_error_stats() { let (sender, mut receiver) = mpsc::channel(10); let (_mock, setting) 
= create_faulty_mock_raw(sender).await; let server = get_server_vector().await; @@ -2187,7 +2187,7 @@ async fn observability() { let task = server.wait_task(response.uid()).await; snapshot!(task["status"], @r###""succeeded""###); let documents = json!([ - {"id": 0, "name": "will_return_500"}, // Stuff that doesn't exist + {"id": 0, "name": "will_return_500"}, {"id": 1, "name": "will_error"}, {"id": 2, "name": "must_error"}, ]); @@ -2195,9 +2195,9 @@ async fn observability() { snapshot!(code, @"202 Accepted"); // The task will eventually fail, so let's not wait for it. - // Let's just wait for the server to block + // Let's just wait for the server's signal receiver.recv().await; - let batches = index.filtered_batches(&[], &[], &[]).await; - snapshot!(task, @r###""###); + let (response, _code) = index.filtered_batches(&[], &[], &[]).await; + snapshot!(response["results"][0], @r###""###); } diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 7026f0c11..8cd2c9336 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -25,15 +25,15 @@ pub struct Progress { #[derive(Default)] pub struct EmbedderStats { pub errors: Arc, u32)>>, - pub total_requests: AtomicUsize + pub total_count: AtomicUsize } impl std::fmt::Debug for EmbedderStats { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let (error, count) = self.errors.read().unwrap().clone(); f.debug_struct("EmbedderStats") - .field("errors", &error) - .field("total_requests", &self.total_requests.load(Ordering::Relaxed)) + .field("last_error", &error) + .field("total_count", &self.total_count.load(Ordering::Relaxed)) .field("error_count", &count) .finish() } diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index 706a411fb..d8de89c6a 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -295,10 +295,6 @@ fn embed( where S: Serialize, { - use std::backtrace::Backtrace; - - println!("Embedder stats? 
{}", embedder_stats.is_some()); - let request = data.client.post(&data.url); let request = if let Some(bearer) = &data.bearer { request.set("Authorization", bearer) @@ -314,9 +310,8 @@ where for attempt in 0..10 { if let Some(embedder_stats) = &embedder_stats { - embedder_stats.as_ref().total_requests.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + embedder_stats.as_ref().total_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); } - // TODO: also catch 403 errors let response = request.clone().send_json(&body); let result = check_response(response, data.configuration_source).and_then(|response| { response_to_embedding(response, data, expected_count, expected_dimension) @@ -358,7 +353,7 @@ where } if let Some(embedder_stats) = &embedder_stats { - embedder_stats.as_ref().total_requests.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + embedder_stats.as_ref().total_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); } let response = request.send_json(&body); let result = check_response(response, data.configuration_source).and_then(|response| { From d7721fe607f4645bc805423d06fddf1f39f63f8d Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 24 Jun 2025 12:20:22 +0200 Subject: [PATCH 012/150] Format --- crates/index-scheduler/src/insta_snapshot.rs | 7 ++-- crates/index-scheduler/src/queue/batches.rs | 2 +- .../src/scheduler/process_batch.rs | 12 ++++--- crates/index-scheduler/src/utils.rs | 4 +-- crates/meilisearch-types/src/batch_view.rs | 2 +- crates/meilisearch-types/src/batches.rs | 1 - crates/meilisearch/src/lib.rs | 7 ++-- crates/meilisearch/tests/vector/rest.rs | 20 +++++------ crates/milli/src/progress.rs | 2 +- .../milli/src/search/new/tests/integration.rs | 2 +- .../milli/src/update/new/indexer/extract.rs | 2 +- crates/milli/src/update/new/indexer/mod.rs | 2 +- crates/milli/src/update/settings.rs | 7 +++- crates/milli/src/vector/composite.rs | 34 +++++++++++++------ crates/milli/src/vector/mod.rs | 30 ++++++++++------ crates/milli/src/vector/ollama.rs | 14 +++++--- crates/milli/src/vector/openai.rs | 16 +++++++-- crates/milli/src/vector/rest.rs | 23 ++++++++++--- 18 files changed, 124 insertions(+), 63 deletions(-) diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 8e1fb1c2c..d4504c246 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -1,7 +1,7 @@ use std::collections::BTreeSet; use std::fmt::Write; -use meilisearch_types::batches::{Batch, EmbedderStatsView, BatchEnqueuedAt, BatchStats}; +use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchStats}; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str}; use meilisearch_types::heed::{Database, RoTxn}; use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32}; @@ -367,7 +367,10 @@ pub fn snapshot_batch(batch: &Batch) -> String { snap.push_str(&format!("uid: {uid}, ")); snap.push_str(&format!("details: {}, ", serde_json::to_string(details).unwrap())); snap.push_str(&format!("stats: {}, ", serde_json::to_string(&stats).unwrap())); - snap.push_str(&format!("embedder_stats: {}, ", serde_json::to_string(&embedder_stats).unwrap())); + snap.push_str(&format!( + "embedder_stats: {}, ", + serde_json::to_string(&embedder_stats).unwrap() + )); snap.push_str(&format!("stop reason: {}, ", serde_json::to_string(&stop_reason).unwrap())); snap.push('}'); snap diff --git a/crates/index-scheduler/src/queue/batches.rs b/crates/index-scheduler/src/queue/batches.rs index 
96a3940a5..b96f65836 100644 --- a/crates/index-scheduler/src/queue/batches.rs +++ b/crates/index-scheduler/src/queue/batches.rs @@ -1,7 +1,7 @@ use std::collections::HashSet; use std::ops::{Bound, RangeBounds}; -use meilisearch_types::batches::{Batch, EmbedderStatsView, BatchId}; +use meilisearch_types::batches::{Batch, BatchId}; use meilisearch_types::heed::types::{DecodeIgnore, SerdeBincode, SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls}; use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32}; diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index c5305cf21..5261692b6 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -1,11 +1,10 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::Ordering; -use std::sync::Arc; use meilisearch_types::batches::{BatchEnqueuedAt, BatchId}; use meilisearch_types::heed::{RoTxn, RwTxn}; -use meilisearch_types::milli::progress::{EmbedderStats, Progress, VariableNameStep}; +use meilisearch_types::milli::progress::{Progress, VariableNameStep}; use meilisearch_types::milli::{self, ChannelCongestion}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::versioning::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; @@ -163,8 +162,13 @@ impl IndexScheduler { .set_currently_updating_index(Some((index_uid.clone(), index.clone()))); let pre_commit_dabases_sizes = index.database_sizes(&index_wtxn)?; - let (tasks, congestion) = - self.apply_index_operation(&mut index_wtxn, &index, op, &progress, current_batch.embedder_stats.clone())?; + let (tasks, congestion) = self.apply_index_operation( + &mut index_wtxn, + &index, + op, + &progress, + current_batch.embedder_stats.clone(), + )?; { progress.update_progress(FinalizingIndexStep::Committing); diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 226ef9f06..ca37065ec 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -1,11 +1,11 @@ //! Utility functions on the DBs. Mainly getter and setters. 
+use crate::milli::progress::EmbedderStats; use std::collections::{BTreeSet, HashSet}; use std::ops::Bound; use std::sync::Arc; -use crate::milli::progress::EmbedderStats; -use meilisearch_types::batches::{Batch, EmbedderStatsView, BatchEnqueuedAt, BatchId, BatchStats}; +use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchId, BatchStats}; use meilisearch_types::heed::{Database, RoTxn, RwTxn}; use meilisearch_types::milli::CboRoaringBitmapCodec; use meilisearch_types::task_view::DetailsView; diff --git a/crates/meilisearch-types/src/batch_view.rs b/crates/meilisearch-types/src/batch_view.rs index aced97d7a..ea027b74e 100644 --- a/crates/meilisearch-types/src/batch_view.rs +++ b/crates/meilisearch-types/src/batch_view.rs @@ -3,7 +3,7 @@ use serde::Serialize; use time::{Duration, OffsetDateTime}; use utoipa::ToSchema; -use crate::batches::{Batch, EmbedderStatsView, BatchId, BatchStats}; +use crate::batches::{Batch, BatchId, BatchStats, EmbedderStatsView}; use crate::task_view::DetailsView; use crate::tasks::serialize_duration; diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index 45cc2d9f4..cec74fb75 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -1,5 +1,4 @@ use std::collections::BTreeMap; -use std::sync::Arc; use milli::progress::{EmbedderStats, ProgressView}; use serde::{Deserialize, Serialize}; diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 72be6aec9..cdecd520c 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -544,8 +544,11 @@ fn import_dump( let settings = index_reader.settings()?; apply_settings_to_builder(&settings, &mut builder); let embedder_stats: Arc = Default::default(); // FIXME: this isn't linked to anything - builder - .execute(|indexing_step| tracing::debug!("update: {:?}", indexing_step), || false, embedder_stats.clone())?; + builder.execute( + |indexing_step| tracing::debug!("update: {:?}", indexing_step), + || false, + embedder_stats.clone(), + )?; // 4.3 Import the documents. // 4.3.1 We need to recreate the grenad+obkv format accepted by the index. 
diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 1fdd18d28..363931a86 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -1,14 +1,12 @@ -use std::collections::{BTreeMap, BTreeSet}; +use std::collections::BTreeMap; use std::sync::atomic::AtomicUsize; use meili_snap::{json_string, snapshot}; use reqwest::IntoUrl; -use tokio::spawn; +use std::time::Duration; use tokio::sync::mpsc; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, Request, ResponseTemplate}; -use tokio::time::sleep; -use std::time::Duration; use crate::common::Value; use crate::json; @@ -342,7 +340,7 @@ async fn create_mock_raw() -> (MockServer, Value) { async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (MockServer, Value) { let mock_server = MockServer::start().await; let count = AtomicUsize::new(0); - + Mock::given(method("POST")) .and(path("/")) .respond_with(move |req: &Request| { @@ -359,14 +357,13 @@ async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (MockServer, Value) if count >= 5 { let _ = sender.try_send(()); - ResponseTemplate::new(500) - .set_delay(Duration::from_secs(u64::MAX)) - .set_body_json(json!({ + ResponseTemplate::new(500).set_delay(Duration::from_secs(u64::MAX)).set_body_json( + json!({ "error": "Service Unavailable", "text": req_body - })) + }), + ) } else { - ResponseTemplate::new(500).set_body_json(json!({ "error": "Service Unavailable", "text": req_body @@ -2168,7 +2165,6 @@ async fn searchable_reindex() { "###); } - #[actix_rt::test] async fn last_error_stats() { let (sender, mut receiver) = mpsc::channel(10); @@ -2191,7 +2187,7 @@ async fn last_error_stats() { {"id": 1, "name": "will_error"}, {"id": 2, "name": "must_error"}, ]); - let (value, code) = index.add_documents(documents, None).await; + let (_value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); // The task will eventually fail, so let's not wait for it. 
diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 8cd2c9336..7ecfcc095 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -25,7 +25,7 @@ pub struct Progress { #[derive(Default)] pub struct EmbedderStats { pub errors: Arc, u32)>>, - pub total_count: AtomicUsize + pub total_count: AtomicUsize, } impl std::fmt::Debug for EmbedderStats { diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index 0b7e1a292..c4e521a88 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -95,7 +95,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { embedders, &|| false, &Progress::default(), - Default::default(), + Default::default(), ) .unwrap(); diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index 72c63b605..040886236 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -1,7 +1,7 @@ use std::collections::BTreeMap; use std::sync::atomic::AtomicBool; -use std::sync::OnceLock; use std::sync::Arc; +use std::sync::OnceLock; use bumpalo::Bump; use roaring::RoaringBitmap; diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 52fd6cd0b..33774f892 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -1,7 +1,7 @@ use std::sync::atomic::AtomicBool; +use std::sync::Arc; use std::sync::{Once, RwLock}; use std::thread::{self, Builder}; -use std::sync::Arc; use big_s::S; use document_changes::{DocumentChanges, IndexingContext}; diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 98ee86978..b3f70d1b6 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1358,7 +1358,12 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } } - pub fn execute(mut self, progress_callback: FP, should_abort: FA, embedder_stats: Arc) -> Result<()> + pub fn execute( + mut self, + progress_callback: FP, + should_abort: FA, + embedder_stats: Arc, + ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, diff --git a/crates/milli/src/vector/composite.rs b/crates/milli/src/vector/composite.rs index daec50e4b..7d9497165 100644 --- a/crates/milli/src/vector/composite.rs +++ b/crates/milli/src/vector/composite.rs @@ -173,12 +173,14 @@ impl SubEmbedder { ) -> std::result::Result { match self { SubEmbedder::HuggingFace(embedder) => embedder.embed_one(text), - SubEmbedder::OpenAi(embedder) => { - embedder.embed(&[text], deadline, embedder_stats)?.pop().ok_or_else(EmbedError::missing_embedding) - } - SubEmbedder::Ollama(embedder) => { - embedder.embed(&[text], deadline, embedder_stats)?.pop().ok_or_else(EmbedError::missing_embedding) - } + SubEmbedder::OpenAi(embedder) => embedder + .embed(&[text], deadline, embedder_stats)? + .pop() + .ok_or_else(EmbedError::missing_embedding), + SubEmbedder::Ollama(embedder) => embedder + .embed(&[text], deadline, embedder_stats)? + .pop() + .ok_or_else(EmbedError::missing_embedding), SubEmbedder::UserProvided(embedder) => embedder.embed_one(text), SubEmbedder::Rest(embedder) => embedder .embed_ref(&[text], deadline, embedder_stats)? 
@@ -198,10 +200,16 @@ impl SubEmbedder { ) -> std::result::Result>, EmbedError> { match self { SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), - SubEmbedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats), - SubEmbedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats), + SubEmbedder::OpenAi(embedder) => { + embedder.embed_index(text_chunks, threads, embedder_stats) + } + SubEmbedder::Ollama(embedder) => { + embedder.embed_index(text_chunks, threads, embedder_stats) + } SubEmbedder::UserProvided(embedder) => embedder.embed_index(text_chunks), - SubEmbedder::Rest(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats), + SubEmbedder::Rest(embedder) => { + embedder.embed_index(text_chunks, threads, embedder_stats) + } } } @@ -214,8 +222,12 @@ impl SubEmbedder { ) -> std::result::Result, EmbedError> { match self { SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts), - SubEmbedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), - SubEmbedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), + SubEmbedder::OpenAi(embedder) => { + embedder.embed_index_ref(texts, threads, embedder_stats) + } + SubEmbedder::Ollama(embedder) => { + embedder.embed_index_ref(texts, threads, embedder_stats) + } SubEmbedder::UserProvided(embedder) => embedder.embed_index_ref(texts), SubEmbedder::Rest(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), } diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 124e17cff..efa981694 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -719,12 +719,14 @@ impl Embedder { } let embedding = match self { Embedder::HuggingFace(embedder) => embedder.embed_one(text), - Embedder::OpenAi(embedder) => { - embedder.embed(&[text], deadline, None)?.pop().ok_or_else(EmbedError::missing_embedding) - } - Embedder::Ollama(embedder) => { - embedder.embed(&[text], deadline, None)?.pop().ok_or_else(EmbedError::missing_embedding) - } + Embedder::OpenAi(embedder) => embedder + .embed(&[text], deadline, None)? + .pop() + .ok_or_else(EmbedError::missing_embedding), + Embedder::Ollama(embedder) => embedder + .embed(&[text], deadline, None)? + .pop() + .ok_or_else(EmbedError::missing_embedding), Embedder::UserProvided(embedder) => embedder.embed_one(text), Embedder::Rest(embedder) => embedder .embed_ref(&[text], deadline, None)? 
@@ -751,11 +753,17 @@ impl Embedder { ) -> std::result::Result>, EmbedError> { match self { Embedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), - Embedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats), - Embedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats), + Embedder::OpenAi(embedder) => { + embedder.embed_index(text_chunks, threads, embedder_stats) + } + Embedder::Ollama(embedder) => { + embedder.embed_index(text_chunks, threads, embedder_stats) + } Embedder::UserProvided(embedder) => embedder.embed_index(text_chunks), Embedder::Rest(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats), - Embedder::Composite(embedder) => embedder.index.embed_index(text_chunks, threads, embedder_stats), + Embedder::Composite(embedder) => { + embedder.index.embed_index(text_chunks, threads, embedder_stats) + } } } @@ -772,7 +780,9 @@ impl Embedder { Embedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), Embedder::UserProvided(embedder) => embedder.embed_index_ref(texts), Embedder::Rest(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), - Embedder::Composite(embedder) => embedder.index.embed_index_ref(texts, threads, embedder_stats), + Embedder::Composite(embedder) => { + embedder.index.embed_index_ref(texts, threads, embedder_stats) + } } } diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/ollama.rs index b3ee925e6..e26b7e1ea 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/ollama.rs @@ -106,7 +106,7 @@ impl Embedder { &self, texts: &[S], deadline: Option, - embedder_stats: Option> + embedder_stats: Option>, ) -> Result, EmbedError> { match self.rest_embedder.embed_ref(texts, deadline, embedder_stats) { Ok(embeddings) => Ok(embeddings), @@ -126,11 +126,17 @@ impl Embedder { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. if threads.active_operations() >= REQUEST_PARALLELISM { - text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())).collect() + text_chunks + .into_iter() + .map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())) + .collect() } else { threads .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())).collect() + text_chunks + .into_par_iter() + .map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())) + .collect() }) .map_err(|error| EmbedError { kind: EmbedErrorKind::PanicInThreadPool(error), @@ -143,7 +149,7 @@ impl Embedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Option> + embedder_stats: Option>, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. 
diff --git a/crates/milli/src/vector/openai.rs b/crates/milli/src/vector/openai.rs index 384abe880..ca072d6e5 100644 --- a/crates/milli/src/vector/openai.rs +++ b/crates/milli/src/vector/openai.rs @@ -241,7 +241,11 @@ impl Embedder { let encoded = self.tokenizer.encode_ordinary(text); let len = encoded.len(); if len < max_token_count { - all_embeddings.append(&mut self.rest_embedder.embed_ref(&[text], deadline, None)?); + all_embeddings.append(&mut self.rest_embedder.embed_ref( + &[text], + deadline, + None, + )?); continue; } @@ -263,11 +267,17 @@ impl Embedder { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. if threads.active_operations() >= REQUEST_PARALLELISM { - text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())).collect() + text_chunks + .into_iter() + .map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())) + .collect() } else { threads .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())).collect() + text_chunks + .into_par_iter() + .map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())) + .collect() }) .map_err(|error| EmbedError { kind: EmbedErrorKind::PanicInThreadPool(error), diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index d8de89c6a..294b0ceda 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -14,8 +14,8 @@ use super::{ DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, REQUEST_PARALLELISM, }; use crate::error::FaultSource; -use crate::ThreadPoolNoAbort; use crate::progress::EmbedderStats; +use crate::ThreadPoolNoAbort; // retrying in case of failure pub struct Retry { @@ -172,7 +172,14 @@ impl Embedder { deadline: Option, embedder_stats: Option>, ) -> Result, EmbedError> { - embed(&self.data, texts.as_slice(), texts.len(), Some(self.dimensions), deadline, embedder_stats) + embed( + &self.data, + texts.as_slice(), + texts.len(), + Some(self.dimensions), + deadline, + embedder_stats, + ) } pub fn embed_ref( @@ -206,11 +213,17 @@ impl Embedder { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. if threads.active_operations() >= REQUEST_PARALLELISM { - text_chunks.into_iter().map(move |chunk| self.embed(chunk, None, embedder_stats.clone())).collect() + text_chunks + .into_iter() + .map(move |chunk| self.embed(chunk, None, embedder_stats.clone())) + .collect() } else { threads .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(chunk, None, embedder_stats.clone())).collect() + text_chunks + .into_par_iter() + .map(move |chunk| self.embed(chunk, None, embedder_stats.clone())) + .collect() }) .map_err(|error| EmbedError { kind: EmbedErrorKind::PanicInThreadPool(error), @@ -223,7 +236,7 @@ impl Embedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Option> + embedder_stats: Option>, ) -> Result, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. 
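The patches above all thread the same shared-statistics value through the embedding call paths: an `Arc<EmbedderStats>` whose `total_count` is bumped once per request attempt and whose `errors` lock records the latest failure message plus a running error count. What follows is a minimal, self-contained sketch of that pattern; the `record_attempt` and `record_error` helpers and the small `main` harness are illustrative assumptions, not code from this series, though the struct shape mirrors `milli::progress::EmbedderStats` as defined in the diffs.

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, RwLock};

#[derive(Default)]
pub struct EmbedderStats {
    // (last error message, error count) behind a lock, as in the diffs above.
    pub errors: Arc<RwLock<(Option<String>, u32)>>,
    // Total embedding requests attempted, retries included.
    pub total_count: AtomicUsize,
}

impl EmbedderStats {
    // Illustrative helper: called once per request attempt.
    pub fn record_attempt(&self) {
        self.total_count.fetch_add(1, Ordering::Relaxed);
    }

    // Illustrative helper: remember the most recent failure.
    pub fn record_error(&self, message: String) {
        let mut errors = self.errors.write().unwrap();
        errors.0 = Some(message);
        errors.1 += 1;
    }
}

fn main() {
    let stats = Arc::new(EmbedderStats::default());

    // Worker threads share the stats through `Arc`, like the rayon jobs above.
    let handles: Vec<_> = (0..4)
        .map(|i| {
            let stats = Arc::clone(&stats);
            std::thread::spawn(move || {
                stats.record_attempt();
                if i % 2 == 0 {
                    stats.record_error(format!("HTTP 500 from embedding server ({i})"));
                }
            })
        })
        .collect();
    for handle in handles {
        handle.join().unwrap();
    }

    // Same read path the EmbedderStatsView conversion uses when a batch is
    // serialized: load the counter, clone the (Option<String>, u32) tuple.
    let (last_error, error_count) = stats.errors.read().unwrap().clone();
    println!(
        "total: {}, errors: {}, last: {:?}",
        stats.total_count.load(Ordering::Relaxed),
        error_count,
        last_error
    );
}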
From bc4d1530ee3ffbd8434946f26ac582ff46011c61 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 24 Jun 2025 14:50:23 +0200 Subject: [PATCH 013/150] Fix tests --- .gitignore | 3 ++ crates/index-scheduler/src/insta_snapshot.rs | 10 +++--- crates/meilisearch/tests/vector/rest.rs | 32 +++++++++++++++++++- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 07453a58f..fc24b8306 100644 --- a/.gitignore +++ b/.gitignore @@ -18,5 +18,8 @@ ## ... unreviewed *.snap.new +# Database snapshot +crates/meilisearch/db.snapshot + # Fuzzcheck data for the facet indexing fuzz test crates/milli/fuzz/update::facet::incremental::fuzz::fuzz/ diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index d4504c246..a5bb1ea56 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -367,10 +367,12 @@ pub fn snapshot_batch(batch: &Batch) -> String { snap.push_str(&format!("uid: {uid}, ")); snap.push_str(&format!("details: {}, ", serde_json::to_string(details).unwrap())); snap.push_str(&format!("stats: {}, ", serde_json::to_string(&stats).unwrap())); - snap.push_str(&format!( - "embedder_stats: {}, ", - serde_json::to_string(&embedder_stats).unwrap() - )); + if !embedder_stats.skip_serializing() { + snap.push_str(&format!( + "embedder stats: {}, ", + serde_json::to_string(&embedder_stats).unwrap() + )); + } snap.push_str(&format!("stop reason: {}, ", serde_json::to_string(&stop_reason).unwrap())); snap.push('}'); snap diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 363931a86..2c8d3ed7c 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -2195,5 +2195,35 @@ async fn last_error_stats() { receiver.recv().await; let (response, _code) = index.filtered_batches(&[], &[], &[]).await; - snapshot!(response["results"][0], @r###""###); + snapshot!(json_string!(response["results"][0], { ".progress" => "[ignored]", ".stats.embedder.totalCount" => "[ignored]", ".startedAt" => "[ignored]" }), @r#" + { + "uid": 1, + "progress": "[ignored]", + "details": { + "receivedDocuments": 3, + "indexedDocuments": null + }, + "stats": { + "totalNbTasks": 1, + "status": { + "processing": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "doggo": 1 + }, + "embedder": { + "totalCount": "[ignored]", + "errorCount": 5, + "lastError": "runtime error: received internal error HTTP 500 from embedding server\n - server replied with `{\"error\":\"Service Unavailable\",\"text\":\"will_error\"}`" + } + }, + "duration": null, + "startedAt": "[ignored]", + "finishedAt": null, + "batchStrategy": "batched all enqueued tasks" + } + "#); } From 695877043ae6af463915447a9319771c0ee2974c Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 24 Jun 2025 14:53:39 +0200 Subject: [PATCH 014/150] Fix warnings --- .../src/update/index_documents/extract/extract_vector_points.rs | 1 + crates/milli/src/update/new/extract/vectors/mod.rs | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index de91e9f10..e940e743b 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -675,6 +675,7 @@ fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering { 
a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat)) } +#[allow(clippy::too_many_arguments)] #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] pub fn extract_embeddings( // docid, prompt diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index f720b81e2..946fb00b5 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -1,4 +1,3 @@ -use std::f32::consts::E; use std::{cell::RefCell, sync::Arc}; use bumpalo::collections::Vec as BVec; From d08e89ea3d36026d4dcb554f7fb45170bc398d17 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 24 Jun 2025 15:10:15 +0200 Subject: [PATCH 015/150] Remove options --- .../index_documents/extract/extract_vector_points.rs | 6 ++---- .../milli/src/update/index_documents/extract/mod.rs | 2 +- crates/milli/src/update/new/extract/vectors/mod.rs | 10 +++++----- crates/milli/src/update/new/indexer/extract.rs | 2 +- crates/milli/src/vector/composite.rs | 4 ++-- crates/milli/src/vector/mod.rs | 4 ++-- crates/milli/src/vector/ollama.rs | 12 ++++++------ crates/milli/src/vector/openai.rs | 12 ++++++------ crates/milli/src/vector/rest.rs | 12 ++++++------ 9 files changed, 31 insertions(+), 33 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index e940e743b..e6d874a69 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -684,12 +684,10 @@ pub fn extract_embeddings( embedder: Arc, embedder_name: &str, possible_embedding_mistakes: &PossibleEmbeddingMistakes, - embedder_stats: Option>, + embedder_stats: Arc, unused_vectors_distribution: &UnusedVectorsDistribution, request_threads: &ThreadPoolNoAbort, ) -> Result>> { - println!("Extract embedder stats {}:", embedder_stats.is_some()); - let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk @@ -791,7 +789,7 @@ fn embed_chunks( text_chunks: Vec>, embedder_name: &str, possible_embedding_mistakes: &PossibleEmbeddingMistakes, - embedder_stats: Option>, + embedder_stats: Arc, unused_vectors_distribution: &UnusedVectorsDistribution, request_threads: &ThreadPoolNoAbort, ) -> Result>> { diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs index f4f3ad22e..1eeddcccb 100644 --- a/crates/milli/src/update/index_documents/extract/mod.rs +++ b/crates/milli/src/update/index_documents/extract/mod.rs @@ -274,7 +274,7 @@ fn send_original_documents_data( embedder.clone(), &embedder_name, &possible_embedding_mistakes, - Some(embedder_stats.clone()), + embedder_stats.clone(), &unused_vectors_distribution, request_threads(), ) { diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 946fb00b5..c21dabf74 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -23,7 +23,7 @@ pub struct EmbeddingExtractor<'a, 'b> { embedders: &'a EmbeddingConfigs, sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, - embedder_stats: Option>, + embedder_stats: Arc, threads: &'a 
ThreadPoolNoAbort, } @@ -32,7 +32,7 @@ impl<'a, 'b> EmbeddingExtractor<'a, 'b> { embedders: &'a EmbeddingConfigs, sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, - embedder_stats: Option>, + embedder_stats: Arc, threads: &'a ThreadPoolNoAbort, ) -> Self { let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution); @@ -311,7 +311,7 @@ struct Chunks<'a, 'b, 'extractor> { dimensions: usize, prompt: &'a Prompt, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - embedder_stats: Option>, + embedder_stats: Arc, user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, sender: EmbeddingSender<'a, 'b>, @@ -327,7 +327,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { prompt: &'a Prompt, user_provided: &'a RefCell>, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - embedder_stats: Option>, + embedder_stats: Arc, threads: &'a ThreadPoolNoAbort, sender: EmbeddingSender<'a, 'b>, doc_alloc: &'a Bump, @@ -416,7 +416,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { embedder_id: u8, embedder_name: &str, possible_embedding_mistakes: &PossibleEmbeddingMistakes, - embedder_stats: Option>, + embedder_stats: Arc, unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, sender: EmbeddingSender<'a, 'b>, diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index 040886236..c721a2563 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -248,7 +248,7 @@ where embedders, embedding_sender, field_distribution, - Some(embedder_stats), + embedder_stats, request_threads(), ); let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); diff --git a/crates/milli/src/vector/composite.rs b/crates/milli/src/vector/composite.rs index 7d9497165..87f05d4fe 100644 --- a/crates/milli/src/vector/composite.rs +++ b/crates/milli/src/vector/composite.rs @@ -196,7 +196,7 @@ impl SubEmbedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - embedder_stats: Option>, + embedder_stats: Arc, ) -> std::result::Result>, EmbedError> { match self { SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), @@ -218,7 +218,7 @@ impl SubEmbedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Option>, + embedder_stats: Arc, ) -> std::result::Result, EmbedError> { match self { SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts), diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index efa981694..481eb6c99 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -749,7 +749,7 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - embedder_stats: Option>, + embedder_stats: Arc, ) -> std::result::Result>, EmbedError> { match self { Embedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), @@ -772,7 +772,7 @@ impl Embedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Option>, + embedder_stats: Arc, ) -> std::result::Result, EmbedError> { match self { Embedder::HuggingFace(embedder) => embedder.embed_index_ref(texts), diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/ollama.rs index e26b7e1ea..045b65b72 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/ollama.rs @@ -121,21 +121,21 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - 
embedder_stats: Option>, + embedder_stats: Arc, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. if threads.active_operations() >= REQUEST_PARALLELISM { text_chunks .into_iter() - .map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone()))) .collect() } else { threads .install(move || { text_chunks .into_par_iter() - .map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone()))) .collect() }) .map_err(|error| EmbedError { @@ -149,14 +149,14 @@ impl Embedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Option>, + embedder_stats: Arc, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. if threads.active_operations() >= REQUEST_PARALLELISM { let embeddings: Result>, _> = texts .chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) .collect(); let embeddings = embeddings?; @@ -166,7 +166,7 @@ impl Embedder { .install(move || { let embeddings: Result>, _> = texts .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) .collect(); let embeddings = embeddings?; diff --git a/crates/milli/src/vector/openai.rs b/crates/milli/src/vector/openai.rs index ca072d6e5..b64e3d467 100644 --- a/crates/milli/src/vector/openai.rs +++ b/crates/milli/src/vector/openai.rs @@ -262,21 +262,21 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - embedder_stats: Option>, + embedder_stats: Arc, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. if threads.active_operations() >= REQUEST_PARALLELISM { text_chunks .into_iter() - .map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone()))) .collect() } else { threads .install(move || { text_chunks .into_par_iter() - .map(move |chunk| self.embed(&chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone()))) .collect() }) .map_err(|error| EmbedError { @@ -290,14 +290,14 @@ impl Embedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Option>, + embedder_stats: Arc, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. 
if threads.active_operations() >= REQUEST_PARALLELISM { let embeddings: Result>, _> = texts .chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) .collect(); let embeddings = embeddings?; Ok(embeddings.into_iter().flatten().collect()) @@ -306,7 +306,7 @@ impl Embedder { .install(move || { let embeddings: Result>, _> = texts .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) .collect(); let embeddings = embeddings?; diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index 294b0ceda..409284b65 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -208,21 +208,21 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - embedder_stats: Option>, + embedder_stats: Arc, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. if threads.active_operations() >= REQUEST_PARALLELISM { text_chunks .into_iter() - .map(move |chunk| self.embed(chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) .collect() } else { threads .install(move || { text_chunks .into_par_iter() - .map(move |chunk| self.embed(chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) .collect() }) .map_err(|error| EmbedError { @@ -236,14 +236,14 @@ impl Embedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Option>, + embedder_stats: Arc, ) -> Result, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. 
if threads.active_operations() >= REQUEST_PARALLELISM { let embeddings: Result>, _> = texts .chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed_ref(chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed_ref(chunk, None, Some(embedder_stats.clone()))) .collect(); let embeddings = embeddings?; @@ -253,7 +253,7 @@ impl Embedder { .install(move || { let embeddings: Result>, _> = texts .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed_ref(chunk, None, embedder_stats.clone())) + .map(move |chunk| self.embed_ref(chunk, None, Some(embedder_stats.clone()))) .collect(); let embeddings = embeddings?; From 211c1b753f24b79a575f4c5e5d9617e10d0a47db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 24 Jun 2025 15:27:39 +0200 Subject: [PATCH 016/150] Fix the env variable name --- crates/meilisearch/src/option.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 35ce71cf4..5b7d1e52f 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -62,7 +62,7 @@ const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str = "MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS"; const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE: &str = - "MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_SIZE"; + "MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE"; const MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES: &str = "MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES"; const MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION: &str = "MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION"; From 89498a2beaf12616fc586078ea1629642303bd6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 24 Jun 2025 15:58:39 +0200 Subject: [PATCH 017/150] Remove Gemini from the LLM-providers list --- crates/meilisearch-types/src/features.rs | 3 --- crates/meilisearch/src/routes/chats/config.rs | 2 +- crates/meilisearch/src/routes/chats/settings.rs | 2 -- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/crates/meilisearch-types/src/features.rs b/crates/meilisearch-types/src/features.rs index 49bee8d97..9ec2d321f 100644 --- a/crates/meilisearch-types/src/features.rs +++ b/crates/meilisearch-types/src/features.rs @@ -114,7 +114,6 @@ pub enum ChatCompletionSource { OpenAi, AzureOpenAi, Mistral, - Gemini, VLlm, } @@ -134,7 +133,6 @@ impl ChatCompletionSource { AzureOpenAi if Self::old_openai_model(model) => System, AzureOpenAi => Developer, Mistral => System, - Gemini => System, VLlm => System, } } @@ -154,7 +152,6 @@ impl ChatCompletionSource { match self { OpenAi => Some("https://api.openai.com/v1/"), Mistral => Some("https://api.mistral.ai/v1/"), - Gemini => Some("https://generativelanguage.googleapis.com/v1beta/openai"), AzureOpenAi | VLlm => None, } } diff --git a/crates/meilisearch/src/routes/chats/config.rs b/crates/meilisearch/src/routes/chats/config.rs index 24ba6bd07..d4426a97a 100644 --- a/crates/meilisearch/src/routes/chats/config.rs +++ b/crates/meilisearch/src/routes/chats/config.rs @@ -13,7 +13,7 @@ impl Config { pub fn new(chat_settings: &DbChatSettings) -> Self { use meilisearch_types::features::ChatCompletionSource::*; match chat_settings.source { - OpenAi | Mistral | Gemini | VLlm => { + OpenAi | Mistral | VLlm => { let mut config = OpenAIConfig::default(); if let Some(org_id) = chat_settings.org_id.as_ref() { config = config.with_org_id(org_id); diff --git 
a/crates/meilisearch/src/routes/chats/settings.rs b/crates/meilisearch/src/routes/chats/settings.rs index 28611ee98..38eb0d3c5 100644 --- a/crates/meilisearch/src/routes/chats/settings.rs +++ b/crates/meilisearch/src/routes/chats/settings.rs @@ -218,7 +218,6 @@ pub enum ChatCompletionSource { #[default] OpenAi, Mistral, - Gemini, AzureOpenAi, VLlm, } @@ -229,7 +228,6 @@ impl From for DbChatCompletionSource { match source { OpenAi => DbChatCompletionSource::OpenAi, Mistral => DbChatCompletionSource::Mistral, - Gemini => DbChatCompletionSource::Gemini, AzureOpenAi => DbChatCompletionSource::AzureOpenAi, VLlm => DbChatCompletionSource::VLlm, } From 5f50fc946442e8b308cbfa7c3a9f3138a84f6705 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 24 Jun 2025 17:05:49 +0200 Subject: [PATCH 018/150] Add new analytics to the chat completions route --- .../routes/chats/chat_completion_analytics.rs | 139 ++++++++++++++++++ .../src/routes/chats/chat_completions.rs | 32 ++++ crates/meilisearch/src/routes/chats/mod.rs | 1 + 3 files changed, 172 insertions(+) create mode 100644 crates/meilisearch/src/routes/chats/chat_completion_analytics.rs diff --git a/crates/meilisearch/src/routes/chats/chat_completion_analytics.rs b/crates/meilisearch/src/routes/chats/chat_completion_analytics.rs new file mode 100644 index 000000000..4fde81653 --- /dev/null +++ b/crates/meilisearch/src/routes/chats/chat_completion_analytics.rs @@ -0,0 +1,139 @@ +use std::collections::BinaryHeap; + +use serde_json::{json, Value}; + +use crate::aggregate_methods; +use crate::analytics::{Aggregate, AggregateMethod}; + +aggregate_methods!( + ChatCompletionPOST => "Chat Completion POST", +); + +#[derive(Default)] +pub struct ChatCompletionAggregator { + // requests + total_received: usize, + total_succeeded: usize, + time_spent: BinaryHeap, + + // chat completion specific metrics + total_messages: usize, + total_streamed_requests: usize, + total_non_streamed_requests: usize, + + // model usage tracking + models_used: std::collections::HashMap, + + _method: std::marker::PhantomData, +} + +impl ChatCompletionAggregator { + pub fn from_request(model: &str, message_count: usize, is_stream: bool) -> Self { + let mut models_used = std::collections::HashMap::new(); + models_used.insert(model.to_string(), 1); + + Self { + total_received: 1, + total_messages: message_count, + total_streamed_requests: if is_stream { 1 } else { 0 }, + total_non_streamed_requests: if is_stream { 0 } else { 1 }, + models_used, + ..Default::default() + } + } + + pub fn succeed(&mut self, time_spent: std::time::Duration) { + self.total_succeeded += 1; + self.time_spent.push(time_spent.as_millis() as usize); + } +} + +impl Aggregate for ChatCompletionAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(mut self: Box, new: Box) -> Box { + let Self { + total_received, + total_succeeded, + mut time_spent, + total_messages, + total_streamed_requests, + total_non_streamed_requests, + models_used, + .. 
+ } = *new; + + // Aggregate time spent + self.time_spent.append(&mut time_spent); + + // Aggregate counters + self.total_received = self.total_received.saturating_add(total_received); + self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); + self.total_messages = self.total_messages.saturating_add(total_messages); + self.total_streamed_requests = + self.total_streamed_requests.saturating_add(total_streamed_requests); + self.total_non_streamed_requests = + self.total_non_streamed_requests.saturating_add(total_non_streamed_requests); + + // Aggregate model usage + for (model, count) in models_used { + *self.models_used.entry(model).or_insert(0) += count; + } + + self + } + + fn into_event(self: Box) -> Value { + let Self { + total_received, + total_succeeded, + time_spent, + total_messages, + total_streamed_requests, + total_non_streamed_requests, + models_used, + .. + } = *self; + + // Compute time statistics + let time_spent: Vec = time_spent.into_sorted_vec(); + let (max_time, min_time, avg_time) = if time_spent.is_empty() { + (0, 0, 0) + } else { + let max_time = time_spent.last().unwrap_or(&0); + let min_time = time_spent.first().unwrap_or(&0); + let sum: usize = time_spent.iter().sum(); + let avg_time = sum / time_spent.len(); + (*max_time, *min_time, avg_time) + }; + + // Compute average messages per request + let avg_messages_per_request = + if total_received > 0 { total_messages as f64 / total_received as f64 } else { 0.0 }; + + // Compute streaming vs non-streaming proportions + let streaming_ratio = if total_received > 0 { + total_streamed_requests as f64 / total_received as f64 + } else { + 0.0 + }; + + json!({ + "total_received": total_received, + "total_succeeded": total_succeeded, + "time_spent": { + "max": max_time, + "min": min_time, + "avg": avg_time + }, + "total_messages": total_messages, + "avg_messages_per_request": avg_messages_per_request, + "total_streamed_requests": total_streamed_requests, + "total_non_streamed_requests": total_non_streamed_requests, + "streaming_ratio": streaming_ratio, + "models_used": models_used, + }) + } +} diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs index 8108e24dc..552a627b1 100644 --- a/crates/meilisearch/src/routes/chats/chat_completions.rs +++ b/crates/meilisearch/src/routes/chats/chat_completions.rs @@ -36,6 +36,7 @@ use serde_json::json; use tokio::runtime::Handle; use tokio::sync::mpsc::error::SendError; +use super::chat_completion_analytics::{ChatCompletionAggregator, ChatCompletionPOST}; use super::config::Config; use super::errors::{MistralError, OpenAiOutsideError, StreamErrorEvent}; use super::utils::format_documents; @@ -43,6 +44,7 @@ use super::{ ChatsParam, MEILI_APPEND_CONVERSATION_MESSAGE_NAME, MEILI_SEARCH_IN_INDEX_FUNCTION_NAME, MEILI_SEARCH_PROGRESS_NAME, MEILI_SEARCH_SOURCES_NAME, }; +use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _}; @@ -64,6 +66,7 @@ async fn chat( req: HttpRequest, search_queue: web::Data, web::Json(chat_completion): web::Json, + analytics: web::Data, ) -> impl Responder { let ChatsParam { workspace_uid } = chats_param.into_inner(); @@ -76,6 +79,7 @@ async fn chat( &workspace_uid, req, chat_completion, + analytics, ) .await, ) @@ -88,6 +92,7 @@ async fn chat( &workspace_uid, req, chat_completion, + analytics, ) .await, ) 
@@ -315,9 +320,18 @@ async fn non_streamed_chat( workspace_uid: &str, req: HttpRequest, chat_completion: CreateChatCompletionRequest, + analytics: web::Data, ) -> Result { index_scheduler.features().check_chat_completions("using the /chats chat completions route")?; + // Create analytics aggregator + let aggregate = ChatCompletionAggregator::::from_request( + &chat_completion.model, + chat_completion.messages.len(), + false, // non_streamed_chat is not streaming + ); + let start_time = std::time::Instant::now(); + if let Some(n) = chat_completion.n.filter(|&n| n != 1) { return Err(ResponseError::from_msg( format!("You tried to specify n = {n} but only single choices are supported (n = 1)."), @@ -414,6 +428,11 @@ async fn non_streamed_chat( } } + // Record success in analytics + let mut aggregate = aggregate; + aggregate.succeed(start_time.elapsed()); + analytics.publish(aggregate, &req); + Ok(HttpResponse::Ok().json(response)) } @@ -424,6 +443,7 @@ async fn streamed_chat( workspace_uid: &str, req: HttpRequest, mut chat_completion: CreateChatCompletionRequest, + analytics: web::Data, ) -> Result { index_scheduler.features().check_chat_completions("using the /chats chat completions route")?; let filters = index_scheduler.filters(); @@ -445,6 +465,14 @@ async fn streamed_chat( } }; + // Create analytics aggregator + let mut aggregate = ChatCompletionAggregator::::from_request( + &chat_completion.model, + chat_completion.messages.len(), + true, // streamed_chat is always streaming + ); + let start_time = std::time::Instant::now(); + let config = Config::new(&chat_settings); let auth_token = extract_token_from_request(&req)?.unwrap().to_string(); let system_role = chat_settings.source.system_role(&chat_completion.model); @@ -490,6 +518,10 @@ async fn streamed_chat( let _ = tx.stop().await; }); + // Record success in analytics after the stream is set up + aggregate.succeed(start_time.elapsed()); + analytics.publish(aggregate, &req); + Ok(Sse::from_infallible_receiver(rx).with_retry_duration(Duration::from_secs(10))) } diff --git a/crates/meilisearch/src/routes/chats/mod.rs b/crates/meilisearch/src/routes/chats/mod.rs index a8a93e6a4..8633bd496 100644 --- a/crates/meilisearch/src/routes/chats/mod.rs +++ b/crates/meilisearch/src/routes/chats/mod.rs @@ -19,6 +19,7 @@ use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::GuardedData; use crate::routes::PAGINATION_DEFAULT_LIMIT; +mod chat_completion_analytics; pub mod chat_completions; mod config; mod errors; From 5f62274f2143728d8961ca9881fcd70e281bbcda Mon Sep 17 00:00:00 2001 From: Nymuxyzo Date: Mon, 23 Jun 2025 21:44:26 +0200 Subject: [PATCH 019/150] Add disableOnNumbers to settings reset --- crates/meilisearch-types/src/settings.rs | 1 + .../tests/settings/get_settings.rs | 51 ++++++++++++++++++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index 1c225b355..5e5f3b5b3 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -751,6 +751,7 @@ pub fn apply_settings_to_builder( builder.reset_min_word_len_two_typos(); builder.reset_exact_words(); builder.reset_exact_attributes(); + builder.reset_disable_on_numbers(); } Setting::NotSet => (), } diff --git a/crates/meilisearch/tests/settings/get_settings.rs b/crates/meilisearch/tests/settings/get_settings.rs index cdb803e8b..47e699380 100644 --- a/crates/meilisearch/tests/settings/get_settings.rs +++ 
b/crates/meilisearch/tests/settings/get_settings.rs @@ -247,6 +247,20 @@ async fn get_settings() { assert_eq!(settings["prefixSearch"], json!("indexingTime")); assert_eq!(settings["facetSearch"], json!(true)); assert_eq!(settings["embedders"], json!({})); + assert_eq!(settings["synonyms"], json!({})); + assert_eq!( + settings["typoTolerance"], + json!({ + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 5, + "twoTypos": 9 + }, + "disableOnWords": [], + "disableOnAttributes": [], + "disableOnNumbers": false + }) + ); } #[actix_rt::test] @@ -426,8 +440,15 @@ async fn reset_all_settings() { assert_eq!(code, 202); server.wait_task(response.uid()).await.succeeded(); - let (update_task,_status_code) = index - .update_settings(json!({"displayedAttributes": ["name", "age"], "searchableAttributes": ["name"], "stopWords": ["the"], "filterableAttributes": ["age"], "synonyms": {"puppy": ["dog", "doggo", "potat"] }})) + let (update_task, _status_code) = index + .update_settings(json!({ + "displayedAttributes": ["name", "age"], + "searchableAttributes": ["name"], + "stopWords": ["the"], + "filterableAttributes": ["age"], + "synonyms": {"puppy": ["dog", "doggo", "potat"] }, + "typoTolerance": {"disableOnNumbers": true} + })) .await; server.wait_task(update_task.uid()).await.succeeded(); let (response, code) = index.settings().await; @@ -437,6 +458,19 @@ assert_eq!(response["stopWords"], json!(["the"])); assert_eq!(response["synonyms"], json!({"puppy": ["dog", "doggo", "potat"] })); assert_eq!(response["filterableAttributes"], json!(["age"])); + assert_eq!( + response["typoTolerance"], + json!({ + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 5, + "twoTypos": 9 + }, + "disableOnWords": [], + "disableOnAttributes": [], + "disableOnNumbers": true + }) + ); let (delete_task, _status_code) = index.delete_settings().await; server.wait_task(delete_task.uid()).await.succeeded(); @@ -448,6 +482,19 @@ assert_eq!(response["stopWords"], json!([])); assert_eq!(response["filterableAttributes"], json!([])); assert_eq!(response["synonyms"], json!({})); + assert_eq!( + response["typoTolerance"], + json!({ + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 5, + "twoTypos": 9 + }, + "disableOnWords": [], + "disableOnAttributes": [], + "disableOnNumbers": false + }) + ); let (response, code) = index.get_document(1, None).await; assert_eq!(code, 200); From adc9976615eea6b1d8d9fbfacef14fb79c6c3a78 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 25 Jun 2025 11:50:26 +0200 Subject: [PATCH 020/150] Simplify the analytics chat completions aggregator --- .../routes/chats/chat_completion_analytics.rs | 22 ++++++++----------- .../src/routes/chats/chat_completions.rs | 6 ++--- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/crates/meilisearch/src/routes/chats/chat_completion_analytics.rs b/crates/meilisearch/src/routes/chats/chat_completion_analytics.rs index 4fde81653..c700894ca 100644 --- a/crates/meilisearch/src/routes/chats/chat_completion_analytics.rs +++ b/crates/meilisearch/src/routes/chats/chat_completion_analytics.rs @@ -2,15 +2,10 @@ use std::collections::BinaryHeap; use serde_json::{json, Value}; -use crate::aggregate_methods; -use crate::analytics::{Aggregate, AggregateMethod}; - -aggregate_methods!( - ChatCompletionPOST => "Chat Completion POST", -); +use crate::analytics::Aggregate; #[derive(Default)] -pub struct ChatCompletionAggregator { +pub struct ChatCompletionAggregator { // requests total_received:
usize, total_succeeded: usize, @@ -23,22 +18,23 @@ pub struct ChatCompletionAggregator { // model usage tracking models_used: std::collections::HashMap, - - _method: std::marker::PhantomData, } -impl ChatCompletionAggregator { +impl ChatCompletionAggregator { pub fn from_request(model: &str, message_count: usize, is_stream: bool) -> Self { let mut models_used = std::collections::HashMap::new(); models_used.insert(model.to_string(), 1); Self { total_received: 1, + total_succeeded: 0, + time_spent: BinaryHeap::new(), + total_messages: message_count, total_streamed_requests: if is_stream { 1 } else { 0 }, total_non_streamed_requests: if is_stream { 0 } else { 1 }, + models_used, - ..Default::default() } } @@ -48,9 +44,9 @@ impl ChatCompletionAggregator { } } -impl Aggregate for ChatCompletionAggregator { +impl Aggregate for ChatCompletionAggregator { fn event_name(&self) -> &'static str { - Method::event_name() + "Chat Completion POST" } fn aggregate(mut self: Box, new: Box) -> Box { diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs index 552a627b1..ccbdccbbc 100644 --- a/crates/meilisearch/src/routes/chats/chat_completions.rs +++ b/crates/meilisearch/src/routes/chats/chat_completions.rs @@ -36,7 +36,7 @@ use serde_json::json; use tokio::runtime::Handle; use tokio::sync::mpsc::error::SendError; -use super::chat_completion_analytics::{ChatCompletionAggregator, ChatCompletionPOST}; +use super::chat_completion_analytics::ChatCompletionAggregator; use super::config::Config; use super::errors::{MistralError, OpenAiOutsideError, StreamErrorEvent}; use super::utils::format_documents; @@ -325,7 +325,7 @@ async fn non_streamed_chat( index_scheduler.features().check_chat_completions("using the /chats chat completions route")?; // Create analytics aggregator - let aggregate = ChatCompletionAggregator::::from_request( + let aggregate = ChatCompletionAggregator::from_request( &chat_completion.model, chat_completion.messages.len(), false, // non_streamed_chat is not streaming @@ -466,7 +466,7 @@ async fn streamed_chat( }; // Create analytics aggregator - let mut aggregate = ChatCompletionAggregator::::from_request( + let mut aggregate = ChatCompletionAggregator::from_request( &chat_completion.model, chat_completion.messages.len(), true, // streamed_chat is always streaming From 1d3b18f774029fe8c3710ae632aa981b25ababa9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 25 Jun 2025 14:58:21 +0200 Subject: [PATCH 021/150] Update test to be more reproducible --- crates/meilisearch/tests/vector/rest.rs | 33 +++++++++---------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 2c8d3ed7c..7e2245223 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -343,31 +343,16 @@ async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (MockServer, Value) Mock::given(method("POST")) .and(path("/")) - .respond_with(move |req: &Request| { + .respond_with(move |_req: &Request| { let count = count.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - let req_body = match req.body_json::() { - Ok(body) => body, - Err(error) => { - return ResponseTemplate::new(400).set_body_json(json!({ - "error": format!("Invalid request: {error}") - })); - } - }; - if count >= 5 { let _ = sender.try_send(()); - ResponseTemplate::new(500).set_delay(Duration::from_secs(u64::MAX)).set_body_json( - json!({ - 
"error": "Service Unavailable", - "text": req_body - }), - ) + ResponseTemplate::new(500) + .set_delay(Duration::from_secs(u64::MAX)) + .set_body_string("Service Unavailable") } else { - ResponseTemplate::new(500).set_body_json(json!({ - "error": "Service Unavailable", - "text": req_body - })) + ResponseTemplate::new(500).set_body_string("Service Unavailable") } }) .mount(&mock_server) @@ -2195,7 +2180,11 @@ async fn last_error_stats() { receiver.recv().await; let (response, _code) = index.filtered_batches(&[], &[], &[]).await; - snapshot!(json_string!(response["results"][0], { ".progress" => "[ignored]", ".stats.embedder.totalCount" => "[ignored]", ".startedAt" => "[ignored]" }), @r#" + snapshot!(json_string!(response["results"][0], { + ".progress" => "[ignored]", + ".stats.embedder.totalCount" => "[ignored]", + ".startedAt" => "[ignored]" + }), @r#" { "uid": 1, "progress": "[ignored]", @@ -2217,7 +2206,7 @@ async fn last_error_stats() { "embedder": { "totalCount": "[ignored]", "errorCount": 5, - "lastError": "runtime error: received internal error HTTP 500 from embedding server\n - server replied with `{\"error\":\"Service Unavailable\",\"text\":\"will_error\"}`" + "lastError": "runtime error: received internal error HTTP 500 from embedding server\n - server replied with `Service Unavailable`" } }, "duration": null, From e74c3b692abbd64531bf11dd997f28dfe053d4e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 12 Jun 2025 16:23:48 +0200 Subject: [PATCH 022/150] Introduce a new route to export documents and enqueue the export task --- crates/dump/src/lib.rs | 14 +++ crates/index-scheduler/src/dump.rs | 14 +++ crates/index-scheduler/src/insta_snapshot.rs | 3 + crates/index-scheduler/src/processing.rs | 9 ++ .../src/scheduler/autobatcher.rs | 1 + .../src/scheduler/create_batch.rs | 29 ++++- .../src/scheduler/process_batch.rs | 24 +++- crates/index-scheduler/src/utils.rs | 9 ++ crates/meilisearch-types/src/error.rs | 5 + crates/meilisearch-types/src/keys.rs | 5 + crates/meilisearch-types/src/task_view.rs | 45 ++++++++ crates/meilisearch-types/src/tasks.rs | 47 +++++++- crates/meilisearch/src/routes/export.rs | 105 ++++++++++++++++++ crates/meilisearch/src/routes/mod.rs | 3 + 14 files changed, 303 insertions(+), 10 deletions(-) create mode 100644 crates/meilisearch/src/routes/export.rs diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 285818a87..29007e9ce 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -141,6 +141,12 @@ pub enum KindDump { instance_uid: Option, }, SnapshotCreation, + Export { + url: String, + indexes: Vec, + skip_embeddings: bool, + api_key: Option, + }, UpgradeDatabase { from: (u32, u32, u32), }, @@ -213,6 +219,14 @@ impl From for KindDump { KindDump::DumpCreation { keys, instance_uid } } KindWithContent::SnapshotCreation => KindDump::SnapshotCreation, + KindWithContent::Export { url, indexes, skip_embeddings, api_key } => { + KindDump::Export { + url, + indexes: indexes.into_iter().map(|pattern| pattern.to_string()).collect(), + skip_embeddings, + api_key, + } + } KindWithContent::UpgradeDatabase { from: version } => { KindDump::UpgradeDatabase { from: version } } diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index ca26e50c8..457d80597 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -4,6 +4,7 @@ use std::io; use dump::{KindDump, TaskDump, UpdateFile}; use meilisearch_types::batches::{Batch, BatchId}; use 
meilisearch_types::heed::RwTxn; +use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use roaring::RoaringBitmap; @@ -211,6 +212,19 @@ impl<'a> Dump<'a> { KindWithContent::DumpCreation { keys, instance_uid } } KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, + KindDump::Export { url, indexes, skip_embeddings, api_key } => { + KindWithContent::Export { + url, + indexes: indexes + .into_iter() + .map(|index| { + IndexUidPattern::try_from(index).map_err(|_| Error::CorruptedDump) + }) + .collect::, Error>>()?, + skip_embeddings, + api_key, + } + } KindDump::UpgradeDatabase { from } => KindWithContent::UpgradeDatabase { from }, }, }; diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index d01548319..d1db77b2f 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -289,6 +289,9 @@ fn snapshot_details(d: &Details) -> String { Details::IndexSwap { swaps } => { format!("{{ swaps: {swaps:?} }}") } + Details::Export { url, api_key, exported_documents, skip_embeddings } => { + format!("{{ url: {url:?}, api_key: {api_key:?}, exported_documents: {exported_documents:?}, skip_embeddings: {skip_embeddings:?} }}") + } Details::UpgradeDatabase { from, to } => { format!("{{ from: {from:?}, to: {to:?} }}") } diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index f23b811e5..5d4ac11c3 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -175,8 +175,17 @@ make_enum_progress! { } } +make_enum_progress! { + pub enum Export { + EnsuringCorrectnessOfTheTarget, + ExportTheSettings, + ExportTheDocuments, + } +} + make_atomic_progress!(Task alias AtomicTaskStep => "task" ); make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); +make_atomic_progress!(Index alias AtomicIndexStep => "index" ); make_atomic_progress!(Batch alias AtomicBatchStep => "batch" ); make_atomic_progress!(UpdateFile alias AtomicUpdateFileStep => "update file" ); diff --git a/crates/index-scheduler/src/scheduler/autobatcher.rs b/crates/index-scheduler/src/scheduler/autobatcher.rs index b57983291..b3f7d2743 100644 --- a/crates/index-scheduler/src/scheduler/autobatcher.rs +++ b/crates/index-scheduler/src/scheduler/autobatcher.rs @@ -71,6 +71,7 @@ impl From for AutobatchKind { KindWithContent::TaskCancelation { .. } | KindWithContent::TaskDeletion { .. } | KindWithContent::DumpCreation { .. } + | KindWithContent::Export { .. } | KindWithContent::UpgradeDatabase { .. } | KindWithContent::SnapshotCreation => { panic!("The autobatcher should never be called with tasks that don't apply to an index.") diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index e3763881b..7a6fa4a9b 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -47,6 +47,9 @@ pub(crate) enum Batch { IndexSwap { task: Task, }, + Export { + task: Task, + }, UpgradeDatabase { tasks: Vec, }, @@ -103,6 +106,7 @@ impl Batch { Batch::TaskCancelation { task, .. } | Batch::Dump(task) | Batch::IndexCreation { task, .. } + | Batch::Export { task } | Batch::IndexUpdate { task, .. 
} => { RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() } @@ -142,6 +146,7 @@ impl Batch { | TaskDeletions(_) | SnapshotCreation(_) | Dump(_) + | Export { .. } | UpgradeDatabase { .. } | IndexSwap { .. } => None, IndexOperation { op, .. } => Some(op.index_uid()), @@ -167,6 +172,7 @@ impl fmt::Display for Batch { Batch::IndexUpdate { .. } => f.write_str("IndexUpdate")?, Batch::IndexDeletion { .. } => f.write_str("IndexDeletion")?, Batch::IndexSwap { .. } => f.write_str("IndexSwap")?, + Batch::Export { .. } => f.write_str("Export")?, Batch::UpgradeDatabase { .. } => f.write_str("UpgradeDatabase")?, }; match index_uid { @@ -426,9 +432,10 @@ impl IndexScheduler { /// 0. We get the *last* task to cancel. /// 1. We get the tasks to upgrade. /// 2. We get the *next* task to delete. - /// 3. We get the *next* snapshot to process. - /// 4. We get the *next* dump to process. - /// 5. We get the *next* tasks to process for a specific index. + /// 3. We get the *next* export to process. + /// 4. We get the *next* snapshot to process. + /// 5. We get the *next* dump to process. + /// 6. We get the *next* tasks to process for a specific index. #[tracing::instrument(level = "trace", skip(self, rtxn), target = "indexing::scheduler")] pub(crate) fn create_next_batch( &self, @@ -500,7 +507,17 @@ impl IndexScheduler { return Ok(Some((Batch::TaskDeletions(tasks), current_batch))); } - // 3. we batch the snapshot. + // 3. we batch the export. + let to_export = self.queue.tasks.get_kind(rtxn, Kind::Export)? & enqueued; + if !to_export.is_empty() { + let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_export)?; + current_batch.processing(&mut tasks); + let task = tasks.pop().expect("There must be only one export task"); + current_batch.reason(BatchStopReason::TaskKindCannotBeBatched { kind: Kind::Export }); + return Ok(Some((Batch::Export { task }, current_batch))); + } + + // 4. we batch the snapshot. let to_snapshot = self.queue.tasks.get_kind(rtxn, Kind::SnapshotCreation)? & enqueued; if !to_snapshot.is_empty() { let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_snapshot)?; @@ -510,7 +527,7 @@ impl IndexScheduler { return Ok(Some((Batch::SnapshotCreation(tasks), current_batch))); } - // 4. we batch the dumps. + // 5. we batch the dumps. let to_dump = self.queue.tasks.get_kind(rtxn, Kind::DumpCreation)? & enqueued; if let Some(to_dump) = to_dump.min() { let mut task = @@ -523,7 +540,7 @@ impl IndexScheduler { return Ok(Some((Batch::Dump(task), current_batch))); } - // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. + // 6. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. 
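A minimal, self-contained sketch of the bitmap arithmetic behind the prioritised arms above (task ids invented; the scheduler's real per-kind bitmaps come from `queue.tasks.get_kind`):

use roaring::RoaringBitmap;

fn main() {
    // Tasks 1, 5 and 9 are enqueued; tasks 5 and 9 are `export` tasks.
    let enqueued = RoaringBitmap::from_sorted_iter([1, 5, 9]).unwrap();
    let export_kind = RoaringBitmap::from_sorted_iter([5, 9]).unwrap();

    // Intersecting the per-kind bitmap with the enqueued set yields the
    // candidates for the next batch, exactly like `to_export` above.
    let to_export = &export_kind & &enqueued;
    assert_eq!(to_export.iter().collect::<Vec<u32>>(), vec![5, 9]);
}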
let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) }; let mut task = self.queue.tasks.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index c349f90ad..1f6c4eb2c 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -1,6 +1,7 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::Ordering; +use std::time::Duration; use meilisearch_types::batches::{BatchEnqueuedAt, BatchId}; use meilisearch_types::heed::{RoTxn, RwTxn}; @@ -13,9 +14,9 @@ use roaring::RoaringBitmap; use super::create_batch::Batch; use crate::processing::{ - AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, FinalizingIndexStep, - InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, - UpdateIndexProgress, + AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, Export, + FinalizingIndexStep, InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, + TaskDeletionProgress, UpdateIndexProgress, }; use crate::utils::{ self, remove_n_tasks_datetime_earlier_than, remove_task_datetime, swap_index_uid_in_task, @@ -361,6 +362,23 @@ impl IndexScheduler { task.status = Status::Succeeded; Ok((vec![task], ProcessBatchInfo::default())) } + Batch::Export { mut task } => { + progress.update_progress(Export::EnsuringCorrectnessOfTheTarget); + + // TODO send check requests with the API Key + + let mut wtxn = self.env.write_txn()?; + let KindWithContent::Export { url, indexes, skip_embeddings, api_key } = &task.kind + else { + unreachable!() + }; + + eprintln!("Exporting data to {}...", url); + std::thread::sleep(Duration::from_secs(30)); + + task.status = Status::Succeeded; + Ok((vec![task], ProcessBatchInfo::default())) + } Batch::UpgradeDatabase { mut tasks } => { let KindWithContent::UpgradeDatabase { from } = tasks.last().unwrap().kind else { unreachable!(); diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 67e8fc090..7fe44d1c1 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -273,6 +273,7 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) { K::TaskCancelation { .. } | K::TaskDeletion { .. } | K::DumpCreation { .. } + | K::Export { .. } // TODO I have patterns, not index uids | K::UpgradeDatabase { .. 
} | K::SnapshotCreation => (), }; @@ -600,6 +601,14 @@ impl crate::IndexScheduler { Details::Dump { dump_uid: _ } => { assert_eq!(kind.as_kind(), Kind::DumpCreation); } + Details::Export { + url: _, + api_key: _, + exported_documents: _, + skip_embeddings: _, + } => { + assert_eq!(kind.as_kind(), Kind::Export); + } Details::UpgradeDatabase { from: _, to: _ } => { assert_eq!(kind.as_kind(), Kind::UpgradeDatabase); } diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index d2500b7e1..22c668d59 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -389,6 +389,11 @@ InvalidDocumentEditionContext , InvalidRequest , BAD_REQU InvalidDocumentEditionFunctionFilter , InvalidRequest , BAD_REQUEST ; EditDocumentsByFunctionError , InvalidRequest , BAD_REQUEST ; InvalidSettingsIndexChat , InvalidRequest , BAD_REQUEST ; +// Export +InvalidExportUrl , InvalidRequest , BAD_REQUEST ; +InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; +InvalidExportSkipEmbeddings , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ; UnimplementedNonStreamingChatCompletions , InvalidRequest , NOT_IMPLEMENTED ; diff --git a/crates/meilisearch-types/src/keys.rs b/crates/meilisearch-types/src/keys.rs index df2810727..3ba31c2cb 100644 --- a/crates/meilisearch-types/src/keys.rs +++ b/crates/meilisearch-types/src/keys.rs @@ -317,6 +317,9 @@ pub enum Action { #[serde(rename = "experimental.update")] #[deserr(rename = "experimental.update")] ExperimentalFeaturesUpdate, + #[serde(rename = "export")] + #[deserr(rename = "export")] + Export, #[serde(rename = "network.get")] #[deserr(rename = "network.get")] NetworkGet, @@ -438,6 +441,8 @@ pub mod actions { pub const EXPERIMENTAL_FEATURES_GET: u8 = ExperimentalFeaturesGet.repr(); pub const EXPERIMENTAL_FEATURES_UPDATE: u8 = ExperimentalFeaturesUpdate.repr(); + pub const EXPORT: u8 = Export.repr(); + pub const NETWORK_GET: u8 = NetworkGet.repr(); pub const NETWORK_UPDATE: u8 = NetworkUpdate.repr(); diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 86a00426b..06fda0835 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeMap; + use milli::Object; use serde::{Deserialize, Serialize}; use time::{Duration, OffsetDateTime}; @@ -118,6 +120,15 @@ pub struct DetailsView { pub upgrade_from: Option, #[serde(skip_serializing_if = "Option::is_none")] pub upgrade_to: Option, + // exporting + #[serde(skip_serializing_if = "Option::is_none")] + pub url: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub api_key: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub exported_documents: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub skip_embeddings: Option, } impl DetailsView { @@ -238,6 +249,37 @@ impl DetailsView { Some(left) } }, + url: match (self.url.clone(), other.url.clone()) { + (None, None) => None, + (None, Some(url)) | (Some(url), None) => Some(url), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. 
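Each accumulation arm here reduces to the same Option-combining shape; a tiny standalone illustration of the pattern (values invented):

fn merge<T>(left: Option<T>, right: Option<T>) -> Option<T> {
    match (left, right) {
        (None, None) => None,
        (None, Some(v)) | (Some(v), None) => Some(v),
        // Two exports can never end up in one batch, so keeping the
        // left value on a conflict is harmless.
        (Some(l), Some(_)) => Some(l),
    }
}

fn main() {
    assert_eq!(merge(None, Some("https://remote.example")), Some("https://remote.example"));
    assert_eq!(merge(Some("left"), Some("right")), Some("left"));
}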
+ (Some(left), Some(_right)) => Some(left), + }, + api_key: match (self.api_key.clone(), other.api_key.clone()) { + (None, None) => None, + (None, Some(key)) | (Some(key), None) => Some(key), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, + exported_documents: match ( + self.exported_documents.clone(), + other.exported_documents.clone(), + ) { + (None, None) => None, + (None, Some(exp)) | (Some(exp), None) => Some(exp), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, + skip_embeddings: match (self.skip_embeddings, other.skip_embeddings) { + (None, None) => None, + (None, Some(skip)) | (Some(skip), None) => Some(skip), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, // We want the earliest version upgrade_from: match (self.upgrade_from.clone(), other.upgrade_from.clone()) { (None, None) => None, @@ -327,6 +369,9 @@ impl From
for DetailsView { Details::IndexSwap { swaps } => { DetailsView { swaps: Some(swaps), ..Default::default() } } + Details::Export { url, api_key, exported_documents, skip_embeddings } => { + DetailsView { exported_documents: Some(exported_documents), ..Default::default() } + } Details::UpgradeDatabase { from, to } => DetailsView { upgrade_from: Some(format!("v{}.{}.{}", from.0, from.1, from.2)), upgrade_to: Some(format!("v{}.{}.{}", to.0, to.1, to.2)), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 95c52d9a6..e31e6062b 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -1,5 +1,5 @@ use core::fmt; -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::fmt::{Display, Write}; use std::str::FromStr; @@ -14,6 +14,7 @@ use uuid::Uuid; use crate::batches::BatchId; use crate::error::ResponseError; +use crate::index_uid_pattern::IndexUidPattern; use crate::keys::Key; use crate::settings::{Settings, Unchecked}; use crate::{versioning, InstanceUid}; @@ -50,6 +51,7 @@ impl Task { | SnapshotCreation | TaskCancelation { .. } | TaskDeletion { .. } + | Export { .. } | UpgradeDatabase { .. } | IndexSwap { .. } => None, DocumentAdditionOrUpdate { index_uid, .. } @@ -86,6 +88,7 @@ impl Task { | KindWithContent::TaskDeletion { .. } | KindWithContent::DumpCreation { .. } | KindWithContent::SnapshotCreation + | KindWithContent::Export { .. } | KindWithContent::UpgradeDatabase { .. } => None, } } @@ -152,6 +155,12 @@ pub enum KindWithContent { instance_uid: Option, }, SnapshotCreation, + Export { + url: String, + api_key: Option, + indexes: Vec, + skip_embeddings: bool, + }, UpgradeDatabase { from: (u32, u32, u32), }, @@ -180,6 +189,7 @@ impl KindWithContent { KindWithContent::TaskDeletion { .. } => Kind::TaskDeletion, KindWithContent::DumpCreation { .. } => Kind::DumpCreation, KindWithContent::SnapshotCreation => Kind::SnapshotCreation, + KindWithContent::Export { .. } => Kind::Export, KindWithContent::UpgradeDatabase { .. } => Kind::UpgradeDatabase, } } @@ -192,6 +202,7 @@ impl KindWithContent { | SnapshotCreation | TaskCancelation { .. } | TaskDeletion { .. } + | Export { .. } // TODO Should I resolve the index names? | UpgradeDatabase { .. } => vec![], DocumentAdditionOrUpdate { index_uid, .. } | DocumentEdition { index_uid, .. } @@ -269,6 +280,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + exported_documents: Default::default(), + skip_embeddings: *skip_embeddings, + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), to: ( @@ -335,6 +354,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + exported_documents: Default::default(), + skip_embeddings: skip_embeddings.clone(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -383,6 +410,14 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + exported_documents: BTreeMap::default(), + skip_embeddings: skip_embeddings.clone(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -499,6 +534,7 @@ pub enum Kind { TaskDeletion, DumpCreation, SnapshotCreation, + Export, UpgradeDatabase, } @@ -516,6 +552,7 @@ impl Kind { | Kind::TaskCancelation | Kind::TaskDeletion | Kind::DumpCreation + | Kind::Export | Kind::UpgradeDatabase | Kind::SnapshotCreation => false, } @@ -536,6 +573,7 @@ impl Display for Kind { Kind::TaskDeletion => write!(f, "taskDeletion"), Kind::DumpCreation => write!(f, "dumpCreation"), Kind::SnapshotCreation => write!(f, "snapshotCreation"), + Kind::Export => write!(f, "export"), Kind::UpgradeDatabase => write!(f, "upgradeDatabase"), } } @@ -643,6 +681,12 @@ pub enum Details { IndexSwap { swaps: Vec, }, + Export { + url: String, + api_key: Option, + exported_documents: BTreeMap, + skip_embeddings: bool, + }, UpgradeDatabase { from: (u32, u32, u32), to: (u32, u32, u32), @@ -667,6 +711,7 @@ impl Details { Self::SettingsUpdate { .. } | Self::IndexInfo { .. } | Self::Dump { .. } + | Self::Export { .. } | Self::UpgradeDatabase { .. } | Self::IndexSwap { .. } => (), } diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs new file mode 100644 index 000000000..666799273 --- /dev/null +++ b/crates/meilisearch/src/routes/export.rs @@ -0,0 +1,105 @@ +use actix_web::web::{self, Data}; +use actix_web::{HttpRequest, HttpResponse}; +use deserr::actix_web::AwebJson; +use deserr::Deserr; +use index_scheduler::IndexScheduler; +use meilisearch_types::deserr::DeserrJsonError; +use meilisearch_types::error::deserr_codes::*; +use meilisearch_types::error::ResponseError; +use meilisearch_types::index_uid_pattern::IndexUidPattern; +use meilisearch_types::keys::actions; +use meilisearch_types::tasks::KindWithContent; +use serde::Serialize; +use tracing::debug; +use utoipa::{OpenApi, ToSchema}; + +use crate::analytics::Analytics; +use crate::extractors::authentication::policies::ActionPolicy; +use crate::extractors::authentication::GuardedData; +use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; +use crate::Opt; + +#[derive(OpenApi)] +#[openapi( + paths(export), + tags(( + name = "Export", + description = "The `/export` route allows you to trigger an export process to a remote Meilisearch instance.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/export"), + )), +)] +pub struct ExportApi; + +pub fn configure(cfg: &mut web::ServiceConfig) { + cfg.service(web::resource("").route(web::post().to(export))); +} + +#[utoipa::path( + get, + path = "", + tag = "Export", + security(("Bearer" = ["export", "*"])), + responses( + (status = OK, description = "Known nodes are returned", body = Export, content_type = "application/json", example = json!( + { + "indexes": ["movie", "steam-*"], + "skip_embeddings": true, + "apiKey": "meilisearch-api-key" + })), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] +async fn export( + index_scheduler: GuardedData, Data>, + export: AwebJson, + req: HttpRequest, + opt: web::Data, + _analytics: Data, +) -> Result { + // TODO make it experimental? + // index_scheduler.features().check_network("Using the /network route")?; + + let export = export.into_inner(); + debug!(returns = ?export, "Trigger export"); + + let Export { url, api_key, indexes, skip_embeddings } = export; + let task = KindWithContent::Export { url, api_key, indexes, skip_embeddings }; + let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; + let task: SummarizedTaskView = + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); + + Ok(HttpResponse::Ok().json(task)) +} + +#[derive(Debug, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct Export { + #[schema(value_type = Option, example = json!("https://ms-1234.heaven.meilisearch.com"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub url: String, + #[schema(value_type = Option, example = json!("1234abcd"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub api_key: Option, + #[schema(value_type = Option>, example = json!(["movies", "steam-*"]))] + #[deserr(default, error = DeserrJsonError)] + #[serde(default)] + pub indexes: Vec, + #[schema(value_type = Option, example = json!("true"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub skip_embeddings: bool, +} diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index cc62e43c3..748cd5d83 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -54,6 +54,7 @@ mod api_key; pub mod batches; pub mod chats; mod dump; +mod export; pub mod features; pub mod indexes; mod logs; @@ -84,6 +85,7 @@ mod tasks_test; (path = "/multi-search", api = multi_search::MultiSearchApi), (path = "/swap-indexes", api = swap_indexes::SwapIndexesApi), (path = "/experimental-features", api = features::ExperimentalFeaturesApi), + (path = "/export", api = export::ExportApi), (path = "/network", api = network::NetworkApi), ), paths(get_health, get_version, get_stats), @@ -115,6 +117,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::scope("/metrics").configure(metrics::configure)) .service(web::scope("/experimental-features").configure(features::configure)) .service(web::scope("/network").configure(network::configure)) + .service(web::scope("/export").configure(export::configure)) .service(web::scope("/chats").configure(chats::configure)); #[cfg(feature = "swagger")] From e023ee4b6b1a5f2a87f245579742dde43300f117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 14 Jun 2025 11:39:53 +0200 Subject: [PATCH 023/150] Working first implementation --- crates/dump/src/lib.rs | 25 ++-- crates/index-scheduler/src/dump.rs | 27 ++-- crates/index-scheduler/src/error.rs | 4 + crates/index-scheduler/src/insta_snapshot.rs | 4 +- crates/index-scheduler/src/scheduler/mod.rs | 1 + .../src/scheduler/process_batch.rs | 45 ++++-- .../src/scheduler/process_export.rs | 141 ++++++++++++++++++ .../mod.rs => process_upgrade.rs} | 0 
crates/index-scheduler/src/test_utils.rs | 1 + crates/index-scheduler/src/utils.rs | 7 +- crates/meilisearch-types/src/error.rs | 3 +- .../src/index_uid_pattern.rs | 2 +- crates/meilisearch-types/src/task_view.rs | 36 +++-- crates/meilisearch-types/src/tasks.rs | 71 +++++---- crates/meilisearch/src/routes/export.rs | 34 ++++- 15 files changed, 298 insertions(+), 103 deletions(-) create mode 100644 crates/index-scheduler/src/scheduler/process_export.rs rename crates/index-scheduler/src/scheduler/{process_upgrade/mod.rs => process_upgrade.rs} (100%) diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 29007e9ce..5c67d7a94 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -1,12 +1,16 @@ #![allow(clippy::type_complexity)] #![allow(clippy::wrong_self_convention)] +use std::collections::BTreeMap; + use meilisearch_types::batches::BatchId; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::Key; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::settings::Unchecked; -use meilisearch_types::tasks::{Details, IndexSwap, KindWithContent, Status, Task, TaskId}; +use meilisearch_types::tasks::{ + Details, ExportIndexSettings, IndexSwap, KindWithContent, Status, Task, TaskId, +}; use meilisearch_types::InstanceUid; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -143,9 +147,8 @@ pub enum KindDump { SnapshotCreation, Export { url: String, - indexes: Vec, - skip_embeddings: bool, api_key: Option, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), @@ -219,14 +222,14 @@ impl From for KindDump { KindDump::DumpCreation { keys, instance_uid } } KindWithContent::SnapshotCreation => KindDump::SnapshotCreation, - KindWithContent::Export { url, indexes, skip_embeddings, api_key } => { - KindDump::Export { - url, - indexes: indexes.into_iter().map(|pattern| pattern.to_string()).collect(), - skip_embeddings, - api_key, - } - } + KindWithContent::Export { url, api_key, indexes } => KindDump::Export { + url, + api_key, + indexes: indexes + .into_iter() + .map(|(pattern, settings)| (pattern.to_string(), settings)) + .collect(), + }, KindWithContent::UpgradeDatabase { from: version } => { KindDump::UpgradeDatabase { from: version } } diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index 457d80597..2a99a74aa 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -212,19 +212,20 @@ impl<'a> Dump<'a> { KindWithContent::DumpCreation { keys, instance_uid } } KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, - KindDump::Export { url, indexes, skip_embeddings, api_key } => { - KindWithContent::Export { - url, - indexes: indexes - .into_iter() - .map(|index| { - IndexUidPattern::try_from(index).map_err(|_| Error::CorruptedDump) - }) - .collect::, Error>>()?, - skip_embeddings, - api_key, - } - } + KindDump::Export { url, indexes, api_key } => KindWithContent::Export { + url, + api_key, + indexes: indexes + .into_iter() + .map(|(pattern, settings)| { + Ok(( + IndexUidPattern::try_from(pattern) + .map_err(|_| Error::CorruptedDump)?, + settings, + )) + }) + .collect::>()?, + }, KindDump::UpgradeDatabase { from } => KindWithContent::UpgradeDatabase { from }, }, }; diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index cb798b385..2020ac597 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -151,6 +151,8 @@ pub enum Error { CorruptedTaskQueue, 
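The dump import above re-validates every stored pattern and aborts on the first corrupted entry by collecting an iterator of Results into a single Result; a self-contained sketch of that idiom (data invented):

use std::collections::BTreeMap;

fn main() {
    let raw = vec![("movies".to_string(), 1u32), ("steam-*".to_string(), 2)];

    // An iterator of `Result<(K, V), E>` collects into
    // `Result<BTreeMap<K, V>, E>` and stops at the first `Err`.
    let validated: Result<BTreeMap<String, u32>, String> = raw
        .into_iter()
        .map(|(pattern, settings)| {
            if pattern.is_empty() {
                Err("corrupted dump".to_string())
            } else {
                Ok((pattern, settings))
            }
        })
        .collect();

    assert_eq!(validated.unwrap().len(), 2);
}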
#[error(transparent)] DatabaseUpgrade(Box), + #[error(transparent)] + Export(Box), #[error("Failed to rollback for index `{index}`: {rollback_outcome} ")] RollbackFailed { index: String, rollback_outcome: RollbackOutcome }, #[error(transparent)] @@ -221,6 +223,7 @@ impl Error { | Error::IoError(_) | Error::Persist(_) | Error::FeatureNotEnabled(_) + | Error::Export(_) | Error::Anyhow(_) => true, Error::CreateBatch(_) | Error::CorruptedTaskQueue @@ -294,6 +297,7 @@ impl ErrorCode for Error { Error::CorruptedTaskQueue => Code::Internal, Error::CorruptedDump => Code::Internal, Error::DatabaseUpgrade(_) => Code::Internal, + Error::Export(_) => Code::Internal, Error::RollbackFailed { .. } => Code::Internal, Error::UnrecoverableError(_) => Code::Internal, Error::IndexSchedulerVersionMismatch { .. } => Code::Internal, diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index d1db77b2f..138b591ff 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -289,8 +289,8 @@ fn snapshot_details(d: &Details) -> String { Details::IndexSwap { swaps } => { format!("{{ swaps: {swaps:?} }}") } - Details::Export { url, api_key, exported_documents, skip_embeddings } => { - format!("{{ url: {url:?}, api_key: {api_key:?}, exported_documents: {exported_documents:?}, skip_embeddings: {skip_embeddings:?} }}") + Details::Export { url, api_key, indexes } => { + format!("{{ url: {url:?}, api_key: {api_key:?}, indexes: {indexes:?} }}") } Details::UpgradeDatabase { from, to } => { format!("{{ from: {from:?}, to: {to:?} }}") diff --git a/crates/index-scheduler/src/scheduler/mod.rs b/crates/index-scheduler/src/scheduler/mod.rs index 0e258e27b..5ac591143 100644 --- a/crates/index-scheduler/src/scheduler/mod.rs +++ b/crates/index-scheduler/src/scheduler/mod.rs @@ -4,6 +4,7 @@ mod autobatcher_test; mod create_batch; mod process_batch; mod process_dump_creation; +mod process_export; mod process_index_operation; mod process_snapshot_creation; mod process_upgrade; diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 1f6c4eb2c..99278756d 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -1,7 +1,6 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::Ordering; -use std::time::Duration; use meilisearch_types::batches::{BatchEnqueuedAt, BatchId}; use meilisearch_types::heed::{RoTxn, RwTxn}; @@ -14,9 +13,9 @@ use roaring::RoaringBitmap; use super::create_batch::Batch; use crate::processing::{ - AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, Export, - FinalizingIndexStep, InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, - TaskDeletionProgress, UpdateIndexProgress, + AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, FinalizingIndexStep, + InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, + UpdateIndexProgress, }; use crate::utils::{ self, remove_n_tasks_datetime_earlier_than, remove_task_datetime, swap_index_uid_in_task, @@ -363,18 +362,32 @@ impl IndexScheduler { Ok((vec![task], ProcessBatchInfo::default())) } Batch::Export { mut task } => { - progress.update_progress(Export::EnsuringCorrectnessOfTheTarget); - - // TODO send check requests with the API Key - - let mut wtxn = 
self.env.write_txn()?;
-                let KindWithContent::Export { url, indexes, skip_embeddings, api_key } = &task.kind
-                else {
+                let KindWithContent::Export { url, indexes, api_key } = &task.kind else {
                     unreachable!()
                 };
 
-                eprintln!("Exporting data to {}...", url);
-                std::thread::sleep(Duration::from_secs(30));
+                let ret = catch_unwind(AssertUnwindSafe(|| {
+                    self.process_export(url, indexes, api_key.as_deref(), progress)
+                }));
+
+                match ret {
+                    // TODO return the matched and exported documents
+                    Ok(Ok(())) => (),
+                    Ok(Err(Error::AbortedTask)) => return Err(Error::AbortedTask),
+                    Ok(Err(e)) => return Err(Error::Export(Box::new(e))),
+                    Err(e) => {
+                        let msg = match e.downcast_ref::<&'static str>() {
+                            Some(s) => *s,
+                            None => match e.downcast_ref::<String>() {
+                                Some(s) => &s[..],
+                                None => "Box<dyn Any>",
+                            },
+                        };
+                        return Err(Error::Export(Box::new(Error::ProcessBatchPanicked(
+                            msg.to_string(),
+                        ))));
+                    }
+                }
 
                 task.status = Status::Succeeded;
                 Ok((vec![task], ProcessBatchInfo::default()))
             }
             Batch::UpgradeDatabase { mut tasks } => {
                 let KindWithContent::UpgradeDatabase { from } = tasks.last().unwrap().kind else {
                     unreachable!();
@@ -726,9 +739,11 @@ impl IndexScheduler {
                     from.1,
                     from.2
                 );
-                match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+                let ret = catch_unwind(std::panic::AssertUnwindSafe(|| {
                     self.process_rollback(from, progress)
-                })) {
+                }));
+
+                match ret {
                     Ok(Ok(())) => {}
                     Ok(Err(err)) => return Err(Error::DatabaseUpgrade(Box::new(err))),
                     Err(e) => {
diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs
new file mode 100644
index 000000000..e01ddf2e4
--- /dev/null
+++ b/crates/index-scheduler/src/scheduler/process_export.rs
@@ -0,0 +1,141 @@
+use std::collections::BTreeMap;
+use std::time::Duration;
+
+use meilisearch_types::index_uid_pattern::IndexUidPattern;
+use meilisearch_types::milli::progress::{Progress, VariableNameStep};
+use meilisearch_types::milli::{obkv_to_json, Filter};
+use meilisearch_types::settings::{self, SecretPolicy};
+use meilisearch_types::tasks::ExportIndexSettings;
+use ureq::{json, Agent};
+
+use crate::{Error, IndexScheduler, Result};
+
+impl IndexScheduler {
+    pub(super) fn process_export(
+        &self,
+        url: &str,
+        indexes: &BTreeMap<IndexUidPattern, ExportIndexSettings>,
+        api_key: Option<&str>,
+        progress: Progress,
+    ) -> Result<()> {
+        #[cfg(test)]
+        self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?;
+
+        let indexes: Vec<_> = self
+            .index_names()?
+ .into_iter() + .flat_map(|uid| { + indexes + .iter() + .find(|(pattern, _)| pattern.matches_str(&uid)) + .map(|(_pattern, settings)| (uid, settings)) + }) + .collect(); + + let agent: Agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); + + for (i, (uid, settings)) in indexes.iter().enumerate() { + let must_stop_processing = self.scheduler.must_stop_processing.clone(); + if must_stop_processing.get() { + return Err(Error::AbortedTask); + } + + progress.update_progress(VariableNameStep::::new( + format!("Exporting index `{uid}`"), + i as u32, + indexes.len() as u32, + )); + + let ExportIndexSettings { skip_embeddings, filter } = settings; + let index = self.index(uid)?; + let index_rtxn = index.read_txn()?; + + // Send the primary key + let primary_key = index.primary_key(&index_rtxn).unwrap(); + // TODO implement retry logic + let mut request = agent.post(&format!("{url}/indexes")); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_json(&json!({ "uid": uid, "primaryKey": primary_key })).unwrap(); + + // Send the index settings + let settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // TODO implement retry logic + // improve error reporting (get error message) + let mut request = agent.patch(&format!("{url}/indexes/{uid}/settings")); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_json(settings).unwrap(); + + let filter = filter + .as_deref() + .map(Filter::from_str) + .transpose() + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))? + .flatten(); + + let filter_universe = filter + .map(|f| f.evaluate(&index_rtxn, &index)) + .transpose() + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let whole_universe = index + .documents_ids(&index_rtxn) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + let universe = filter_universe.unwrap_or(whole_universe); + + let fields_ids_map = index.fields_ids_map(&index_rtxn)?; + let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index + .embedding_configs(&index_rtxn) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + + let limit = 50 * 1024 * 1024; // 50 MiB + let mut buffer = Vec::new(); + let mut tmp_buffer = Vec::new(); + for docid in universe { + let document = index + .document(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + let value = obkv_to_json(&all_fields, &fields_ids_map, document) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + tmp_buffer.clear(); + serde_json::to_writer(&mut tmp_buffer, &value) + .map_err(meilisearch_types::milli::InternalError::from) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + + if buffer.len() + tmp_buffer.len() > limit { + // TODO implement retry logic + post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + buffer.clear(); + } + buffer.extend_from_slice(&tmp_buffer); + } + + post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + } + + Ok(()) + } +} + +fn post_serialized_documents( + agent: &Agent, + url: &str, + uid: &str, + api_key: Option<&str>, + buffer: &[u8], +) -> Result { + let mut request = agent.post(&format!("{url}/indexes/{uid}/documents")); + request = request.set("Content-Type", "application/x-ndjson"); + if let 
Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_bytes(buffer) +} + +enum ExportIndex {} diff --git a/crates/index-scheduler/src/scheduler/process_upgrade/mod.rs b/crates/index-scheduler/src/scheduler/process_upgrade.rs similarity index 100% rename from crates/index-scheduler/src/scheduler/process_upgrade/mod.rs rename to crates/index-scheduler/src/scheduler/process_upgrade.rs diff --git a/crates/index-scheduler/src/test_utils.rs b/crates/index-scheduler/src/test_utils.rs index 5f206b55c..bfed7f53a 100644 --- a/crates/index-scheduler/src/test_utils.rs +++ b/crates/index-scheduler/src/test_utils.rs @@ -37,6 +37,7 @@ pub(crate) enum FailureLocation { InsideCreateBatch, InsideProcessBatch, PanicInsideProcessBatch, + ProcessExport, ProcessUpgrade, AcquiringWtxn, UpdatingTaskAfterProcessBatchSuccess { task_uid: u32 }, diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 7fe44d1c1..79571745b 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -601,12 +601,7 @@ impl crate::IndexScheduler { Details::Dump { dump_uid: _ } => { assert_eq!(kind.as_kind(), Kind::DumpCreation); } - Details::Export { - url: _, - api_key: _, - exported_documents: _, - skip_embeddings: _, - } => { + Details::Export { url: _, api_key: _, indexes: _ } => { assert_eq!(kind.as_kind(), Kind::Export); } Details::UpgradeDatabase { from: _, to: _ } => { diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 22c668d59..08ee803ef 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -393,7 +393,8 @@ InvalidSettingsIndexChat , InvalidRequest , BAD_REQU InvalidExportUrl , InvalidRequest , BAD_REQUEST ; InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; -InvalidExportSkipEmbeddings , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexSkipEmbeddings , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ; UnimplementedNonStreamingChatCompletions , InvalidRequest , NOT_IMPLEMENTED ; diff --git a/crates/meilisearch-types/src/index_uid_pattern.rs b/crates/meilisearch-types/src/index_uid_pattern.rs index baf0249e2..f90fc7aee 100644 --- a/crates/meilisearch-types/src/index_uid_pattern.rs +++ b/crates/meilisearch-types/src/index_uid_pattern.rs @@ -12,7 +12,7 @@ use crate::index_uid::{IndexUid, IndexUidFormatError}; /// An index uid pattern is composed of only ascii alphanumeric characters, - and _, between 1 and 400 /// bytes long and optionally ending with a *. 
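The derive change below exists because `IndexUidPattern` becomes the key of the export `indexes` map, and `BTreeMap` keys must be totally ordered; a minimal illustration with a stand-in newtype:

use std::collections::BTreeMap;

// PartialOrd/Ord are exactly what BTreeMap requires of its keys.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
struct Pattern(String);

fn main() {
    let mut indexes = BTreeMap::new();
    indexes.insert(Pattern("steam-*".into()), "export with a filter");
    indexes.insert(Pattern("movies".into()), "export everything");

    // Iteration follows the lexicographic order of the patterns.
    assert_eq!(indexes.keys().next(), Some(&Pattern("movies".into())));
}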
-#[derive(Serialize, Deserialize, Deserr, Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Serialize, Deserialize, Deserr, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] #[deserr(try_from(&String) = FromStr::from_str -> IndexUidPatternFormatError)] pub struct IndexUidPattern(String); diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 06fda0835..0a8d7b8fe 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -8,7 +8,9 @@ use utoipa::ToSchema; use crate::batches::BatchId; use crate::error::ResponseError; use crate::settings::{Settings, Unchecked}; -use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId}; +use crate::tasks::{ + serialize_duration, Details, DetailsExportIndexSettings, IndexSwap, Kind, Status, Task, TaskId, +}; #[derive(Debug, Clone, PartialEq, Serialize, ToSchema)] #[serde(rename_all = "camelCase")] @@ -126,9 +128,7 @@ pub struct DetailsView { #[serde(skip_serializing_if = "Option::is_none")] pub api_key: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub exported_documents: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub skip_embeddings: Option, + pub indexes: Option>, } impl DetailsView { @@ -263,19 +263,9 @@ impl DetailsView { // So we return the first one we encounter but that shouldn't be an issue anyway. (Some(left), Some(_right)) => Some(left), }, - exported_documents: match ( - self.exported_documents.clone(), - other.exported_documents.clone(), - ) { + indexes: match (self.indexes.clone(), other.indexes.clone()) { (None, None) => None, - (None, Some(exp)) | (Some(exp), None) => Some(exp), - // We should never be able to batch multiple exports at the same time. - // So we return the first one we encounter but that shouldn't be an issue anyway. - (Some(left), Some(_right)) => Some(left), - }, - skip_embeddings: match (self.skip_embeddings, other.skip_embeddings) { - (None, None) => None, - (None, Some(skip)) | (Some(skip), None) => Some(skip), + (None, Some(indexes)) | (Some(indexes), None) => Some(indexes), // We should never be able to batch multiple exports at the same time. // So we return the first one we encounter but that shouldn't be an issue anyway. (Some(left), Some(_right)) => Some(left), @@ -369,9 +359,17 @@ impl From
for DetailsView { Details::IndexSwap { swaps } => { DetailsView { swaps: Some(swaps), ..Default::default() } } - Details::Export { url, api_key, exported_documents, skip_embeddings } => { - DetailsView { exported_documents: Some(exported_documents), ..Default::default() } - } + Details::Export { url, api_key, indexes } => DetailsView { + url: Some(url), + api_key, + indexes: Some( + indexes + .into_iter() + .map(|(pattern, settings)| (pattern.to_string(), settings)) + .collect(), + ), + ..Default::default() + }, Details::UpgradeDatabase { from, to } => DetailsView { upgrade_from: Some(format!("v{}.{}.{}", from.0, from.1, from.2)), upgrade_to: Some(format!("v{}.{}.{}", to.0, to.1, to.2)), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index e31e6062b..1f8f7e7cb 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -9,7 +9,7 @@ use milli::Object; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize, Serializer}; use time::{Duration, OffsetDateTime}; -use utoipa::ToSchema; +use utoipa::{schema, ToSchema}; use uuid::Uuid; use crate::batches::BatchId; @@ -158,8 +158,7 @@ pub enum KindWithContent { Export { url: String, api_key: Option, - indexes: Vec, - skip_embeddings: bool, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), @@ -172,6 +171,13 @@ pub struct IndexSwap { pub indexes: (String, String), } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct ExportIndexSettings { + pub skip_embeddings: bool, + pub filter: Option, +} + impl KindWithContent { pub fn as_kind(&self) -> Kind { match self { @@ -280,14 +286,11 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { - Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - exported_documents: Default::default(), - skip_embeddings: *skip_embeddings, - }) - } + KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), to: ( @@ -354,14 +357,11 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { - Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - exported_documents: Default::default(), - skip_embeddings: skip_embeddings.clone(), - }) - } + KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -410,14 +410,11 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { - Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - exported_documents: BTreeMap::default(), - skip_embeddings: skip_embeddings.clone(), - }) - } + KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -684,8 +681,7 @@ pub enum Details { Export { url: String, api_key: Option, - exported_documents: BTreeMap, - skip_embeddings: bool, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), @@ -693,6 +689,23 @@ pub enum Details { }, } +#[derive(Debug, PartialEq, Clone, Serialize, Deserialize, ToSchema)] +#[schema(rename_all = "camelCase")] +pub struct DetailsExportIndexSettings { + #[serde(flatten)] + settings: ExportIndexSettings, + #[serde(skip_serializing_if = "Option::is_none")] + matched_documents: Option, + #[serde(skip_serializing_if = "Option::is_none")] + exported_documents: Option, +} + +impl From for DetailsExportIndexSettings { + fn from(settings: ExportIndexSettings) -> Self { + DetailsExportIndexSettings { settings, matched_documents: None, exported_documents: None } + } +} + impl Details { pub fn to_failed(&self) -> Self { let mut details = self.clone(); diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 666799273..7029f0ebf 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeMap; + use actix_web::web::{self, Data}; use actix_web::{HttpRequest, HttpResponse}; use deserr::actix_web::AwebJson; @@ -8,7 +10,7 @@ use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::actions; -use meilisearch_types::tasks::KindWithContent; +use meilisearch_types::tasks::{ExportIndexSettings as DbExportIndexSettings, KindWithContent}; use serde::Serialize; use tracing::debug; use utoipa::{OpenApi, ToSchema}; @@ -69,8 +71,17 @@ async fn export( let export = export.into_inner(); debug!(returns = ?export, "Trigger export"); - let Export { url, api_key, indexes, skip_embeddings } = export; - let task = KindWithContent::Export { url, api_key, indexes, skip_embeddings }; + let Export { url, api_key, indexes } = export; + let task = KindWithContent::Export { + url, + api_key, + indexes: indexes + .into_iter() + .map(|(pattern, ExportIndexSettings { skip_embeddings, filter })| { + (pattern, DbExportIndexSettings { skip_embeddings, filter }) + }) + .collect(), + }; let uid = get_task_id(&req, &opt)?; let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = @@ -95,11 +106,22 @@ pub struct Export { #[deserr(default, error = DeserrJsonError)] pub api_key: Option, #[schema(value_type = Option>, example = json!(["movies", "steam-*"]))] - #[deserr(default, error = DeserrJsonError)] + #[deserr(default)] #[serde(default)] - pub indexes: Vec, + pub indexes: BTreeMap, +} + +#[derive(Debug, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = 
"camelCase")] +pub struct ExportIndexSettings { #[schema(value_type = Option, example = json!("true"))] #[serde(default)] - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub skip_embeddings: bool, + #[schema(value_type = Option, example = json!("genres = action"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub filter: Option, } From e8795d2608326dff111098d64ea25b646ff4361c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 14 Jun 2025 12:43:24 +0200 Subject: [PATCH 024/150] Export embeddings --- .../src/scheduler/process_export.rs | 73 ++++++++++++++++++- 1 file changed, 70 insertions(+), 3 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e01ddf2e4..1686472ab 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -1,13 +1,17 @@ use std::collections::BTreeMap; +use std::sync::atomic; use std::time::Duration; use meilisearch_types::index_uid_pattern::IndexUidPattern; +use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, Filter}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; use ureq::{json, Agent}; +use crate::processing::AtomicDocumentStep; use crate::{Error, IndexScheduler, Result}; impl IndexScheduler { @@ -92,19 +96,77 @@ impl IndexScheduler { .embedding_configs(&index_rtxn) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + let total_documents = universe.len() as u32; + let (step, progress_step) = AtomicDocumentStep::new(total_documents); + progress.update_progress(progress_step); + let limit = 50 * 1024 * 1024; // 50 MiB let mut buffer = Vec::new(); let mut tmp_buffer = Vec::new(); - for docid in universe { + for (i, docid) in universe.into_iter().enumerate() { let document = index .document(&index_rtxn, docid) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - let value = obkv_to_json(&all_fields, &fields_ids_map, document) + let mut document = obkv_to_json(&all_fields, &fields_ids_map, document) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // TODO definitely factorize this code + if !*skip_embeddings { + 'inject_vectors: { + let embeddings = index + .embeddings(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(serde_json::Value::Object(Default::default())); + + let serde_json::Value::Object(vectors) = vectors else { + return Err(Error::from_milli( + meilisearch_types::milli::Error::UserError( + meilisearch_types::milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&index_rtxn, std::iter::once(docid)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + } + }, + value: vectors.clone(), + }, + ), + Some(uid.to_string()), + )); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| 
conf.user_provided.contains(docid)); + + let embeddings = ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + )), + regenerate: !user_provided, + }; + vectors + .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); + } + } + } + tmp_buffer.clear(); - serde_json::to_writer(&mut tmp_buffer, &value) + serde_json::to_writer(&mut tmp_buffer, &document) .map_err(meilisearch_types::milli::InternalError::from) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; @@ -114,9 +176,14 @@ impl IndexScheduler { buffer.clear(); } buffer.extend_from_slice(&tmp_buffer); + + if i % 100 == 0 { + step.fetch_add(100, atomic::Ordering::Relaxed); + } } post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + step.store(total_documents, atomic::Ordering::Relaxed); } Ok(()) From acb7c0a449462d682448d5362cc189ad6410d155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 11:35:47 +0200 Subject: [PATCH 025/150] Implement a retry strategy --- Cargo.lock | 1 + crates/index-scheduler/Cargo.toml | 1 + crates/index-scheduler/src/error.rs | 4 + .../src/scheduler/process_export.rs | 108 ++++++++++++++---- crates/meilisearch-types/src/settings.rs | 1 + 5 files changed, 91 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7455ff1b4..a883b749f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2997,6 +2997,7 @@ name = "index-scheduler" version = "1.15.2" dependencies = [ "anyhow", + "backoff", "big_s", "bincode", "bumpalo", diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index f4901b2f2..de0d01935 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -44,6 +44,7 @@ time = { version = "0.3.41", features = [ tracing = "0.1.41" ureq = "2.12.1" uuid = { version = "1.17.0", features = ["serde", "v4"] } +backoff = "0.4.0" [dev-dependencies] big_s = "1.0.2" diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index 2020ac597..60669ff2d 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -153,6 +153,8 @@ pub enum Error { DatabaseUpgrade(Box), #[error(transparent)] Export(Box), + #[error("Failed to export documents to remote server {code} ({type}): {message} <{link}>")] + FromRemoteWhenExporting { message: String, code: String, r#type: String, link: String }, #[error("Failed to rollback for index `{index}`: {rollback_outcome} ")] RollbackFailed { index: String, rollback_outcome: RollbackOutcome }, #[error(transparent)] @@ -214,6 +216,7 @@ impl Error { | Error::BatchNotFound(_) | Error::TaskDeletionWithEmptyQuery | Error::TaskCancelationWithEmptyQuery + | Error::FromRemoteWhenExporting { .. } | Error::AbortedTask | Error::Dump(_) | Error::Heed(_) @@ -285,6 +288,7 @@ impl ErrorCode for Error { Error::Dump(e) => e.error_code(), Error::Milli { error, .. } => error.error_code(), Error::ProcessBatchPanicked(_) => Code::Internal, + Error::FromRemoteWhenExporting { .. 
} => Code::Internal, Error::Heed(e) => e.error_code(), Error::HeedTransaction(e) => e.error_code(), Error::FileStore(e) => e.error_code(), diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 1686472ab..7501c260e 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -1,14 +1,18 @@ use std::collections::BTreeMap; +use std::io; use std::sync::atomic; use std::time::Duration; +use backoff::ExponentialBackoff; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, Filter}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; +use serde::Deserialize; use ureq::{json, Agent}; use crate::processing::AtomicDocumentStep; @@ -17,7 +21,7 @@ use crate::{Error, IndexScheduler, Result}; impl IndexScheduler { pub(super) fn process_export( &self, - url: &str, + base_url: &str, indexes: &BTreeMap, api_key: Option<&str>, progress: Progress, @@ -56,24 +60,34 @@ impl IndexScheduler { // Send the primary key let primary_key = index.primary_key(&index_rtxn).unwrap(); - // TODO implement retry logic - let mut request = agent.post(&format!("{url}/indexes")); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); - } - request.send_json(&json!({ "uid": uid, "primaryKey": primary_key })).unwrap(); + let url = format!("{base_url}/indexes"); + retry(|| { + let mut request = agent.post(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + let index_param = json!({ "uid": uid, "primaryKey": primary_key }); + request.send_json(&index_param).map_err(into_backoff_error) + })?; // Send the index settings - let settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) + let mut settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - // TODO implement retry logic - // improve error reporting (get error message) - let mut request = agent.patch(&format!("{url}/indexes/{uid}/settings")); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); + // Remove the experimental chat setting if not enabled + if self.features().check_chat_completions("exporting chat settings").is_err() { + settings.chat = Setting::NotSet; } - request.send_json(settings).unwrap(); + // Retry logic for sending settings + let url = format!("{base_url}/indexes/{uid}/settings"); + retry(|| { + let mut request = agent.patch(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_json(settings.clone()).map_err(into_backoff_error) + })?; + // TODO support JSON Value objects let filter = filter .as_deref() .map(Filter::from_str) @@ -171,8 +185,7 @@ impl IndexScheduler { .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; if buffer.len() + tmp_buffer.len() > limit { - // TODO implement retry logic - post_serialized_documents(&agent, url, uid, api_key, 
&buffer).unwrap();
+                post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap();
                 buffer.clear();
             }
             buffer.extend_from_slice(&tmp_buffer);
@@ -182,7 +195,7 @@ impl IndexScheduler {
             }
         }
 
-        post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap();
+        post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap();
         step.store(total_documents, atomic::Ordering::Relaxed);
     }
 
@@ -190,19 +203,66 @@ impl IndexScheduler {
     }
 }
 
+fn retry<F>(send_request: F) -> Result<ureq::Response>
+where
+    F: Fn() -> Result<ureq::Response, backoff::Error<ureq::Error>>,
+{
+    match backoff::retry(ExponentialBackoff::default(), || send_request()) {
+        Ok(response) => Ok(response),
+        Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)),
+        Err(backoff::Error::Transient { err, retry_after: _ }) => Err(ureq_error_into_error(err)),
+    }
+}
+
 fn post_serialized_documents(
     agent: &Agent,
-    url: &str,
+    base_url: &str,
     uid: &str,
     api_key: Option<&str>,
     buffer: &[u8],
-) -> Result<ureq::Response, ureq::Error> {
-    let mut request = agent.post(&format!("{url}/indexes/{uid}/documents"));
-    request = request.set("Content-Type", "application/x-ndjson");
-    if let Some(api_key) = api_key {
-        request = request.set("Authorization", &format!("Bearer {api_key}"));
+) -> Result<ureq::Response> {
+    let url = format!("{base_url}/indexes/{uid}/documents");
+    retry(|| {
+        let mut request = agent.post(&url);
+        request = request.set("Content-Type", "application/x-ndjson");
+        if let Some(api_key) = api_key {
+            request = request.set("Authorization", &(format!("Bearer {api_key}")));
+        }
+        request.send_bytes(buffer).map_err(into_backoff_error)
+    })
+}
+
+fn into_backoff_error(err: ureq::Error) -> backoff::Error<ureq::Error> {
+    match err {
+        // Those status codes must trigger an automatic retry
+        //
+        ureq::Error::Status(408 | 429 | 500 | 502 | 503 | 504, _) => {
+            backoff::Error::Transient { err, retry_after: None }
+        }
+        ureq::Error::Status(_, _) => backoff::Error::Permanent(err),
+        ureq::Error::Transport(_) => backoff::Error::Transient { err, retry_after: None },
+    }
+}
+
+/// Converts a `ureq::Error` into an `Error`.
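+/// If the remote answers with a Meilisearch-style JSON error payload, it is surfaced
+/// as `Error::FromRemoteWhenExporting`; any other failure is wrapped into an I/O error.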
+fn ureq_error_into_error(error: ureq::Error) -> Error { + #[derive(Deserialize)] + struct MeiliError { + message: String, + code: String, + r#type: String, + link: String, + } + + match error { + ureq::Error::Status(_, response) => match response.into_json() { + Ok(MeiliError { message, code, r#type, link }) => { + Error::FromRemoteWhenExporting { message, code, r#type, link } + } + Err(e) => io::Error::from(e).into(), + }, + ureq::Error::Transport(transport) => io::Error::new(io::ErrorKind::Other, transport).into(), } - request.send_bytes(buffer) } enum ExportIndex {} diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index 1c225b355..295318f4b 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -968,6 +968,7 @@ pub fn settings( if let SecretPolicy::HideSecrets = secret_policy { settings.hide_secrets() } + Ok(settings) } From 7c448bcc003c99f125ad8e75dca590b71c984187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 14:53:50 +0200 Subject: [PATCH 026/150] Make clippy happy --- crates/meilisearch-types/src/tasks.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 1f8f7e7cb..3ef60cacf 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -289,7 +289,7 @@ impl KindWithContent { KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), @@ -360,7 +360,7 @@ impl KindWithContent { KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, @@ -413,7 +413,7 @@ impl From<&KindWithContent> for Option
{ KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, From 3e2f4682137159745848bee46d637dbd35cc9cc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:34:05 +0200 Subject: [PATCH 027/150] Support task cancelation --- .../src/scheduler/process_export.rs | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 7501c260e..ceac18632 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -15,6 +15,7 @@ use meilisearch_types::tasks::ExportIndexSettings; use serde::Deserialize; use ureq::{json, Agent}; +use super::MustStopProcessing; use crate::processing::AtomicDocumentStep; use crate::{Error, IndexScheduler, Result}; @@ -41,9 +42,8 @@ impl IndexScheduler { .collect(); let agent: Agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); - + let must_stop_processing = self.scheduler.must_stop_processing.clone(); for (i, (uid, settings)) in indexes.iter().enumerate() { - let must_stop_processing = self.scheduler.must_stop_processing.clone(); if must_stop_processing.get() { return Err(Error::AbortedTask); } @@ -59,9 +59,9 @@ impl IndexScheduler { let index_rtxn = index.read_txn()?; // Send the primary key - let primary_key = index.primary_key(&index_rtxn).unwrap(); + let primary_key = index.primary_key(&index_rtxn)?; let url = format!("{base_url}/indexes"); - retry(|| { + retry(&must_stop_processing, || { let mut request = agent.post(&url); if let Some(api_key) = api_key { request = request.set("Authorization", &format!("Bearer {api_key}")); @@ -79,7 +79,7 @@ impl IndexScheduler { } // Retry logic for sending settings let url = format!("{base_url}/indexes/{uid}/settings"); - retry(|| { + retry(&must_stop_processing, || { let mut request = agent.patch(&url); if let Some(api_key) = api_key { request = request.set("Authorization", &format!("Bearer {api_key}")); @@ -115,6 +115,8 @@ impl IndexScheduler { progress.update_progress(progress_step); let limit = 50 * 1024 * 1024; // 50 MiB + let documents_url = format!("{base_url}/indexes/{uid}/documents"); + let mut buffer = Vec::new(); let mut tmp_buffer = Vec::new(); for (i, docid) in universe.into_iter().enumerate() { @@ -185,7 +187,14 @@ impl IndexScheduler { .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; if buffer.len() + tmp_buffer.len() > limit { - post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap(); + retry(&must_stop_processing, || { + let mut request = agent.post(&documents_url); + request = request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request.set("Authorization", &(format!("Bearer {api_key}"))); + } + request.send_bytes(&buffer).map_err(into_backoff_error) + })?; buffer.clear(); } buffer.extend_from_slice(&tmp_buffer); @@ -195,7 +204,14 @@ impl IndexScheduler { } } - post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap(); + retry(&must_stop_processing, || { + let mut request = agent.post(&documents_url); + request = 
request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request.set("Authorization", &(format!("Bearer {api_key}"))); + } + request.send_bytes(&buffer).map_err(into_backoff_error) + })?; step.store(total_documents, atomic::Ordering::Relaxed); } @@ -203,10 +219,14 @@ impl IndexScheduler { } } -fn retry(send_request: F) -> Result +fn retry(must_stop_processing: &MustStopProcessing, send_request: F) -> Result where F: Fn() -> Result>, { + if must_stop_processing.get() { + return Err(Error::AbortedTask); + } + match backoff::retry(ExponentialBackoff::default(), || send_request()) { Ok(response) => Ok(response), Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)), @@ -214,24 +234,6 @@ where } } -fn post_serialized_documents( - agent: &Agent, - base_url: &str, - uid: &str, - api_key: Option<&str>, - buffer: &[u8], -) -> Result { - let url = format!("{base_url}/indexes/{uid}/documents"); - retry(|| { - let mut request = agent.post(&url); - request = request.set("Content-Type", "application/x-ndjson"); - if let Some(api_key) = api_key { - request = request.set("Authorization", &(format!("Bearer {api_key}"))); - } - request.send_bytes(buffer).map_err(into_backoff_error) - }) -} - fn into_backoff_error(err: ureq::Error) -> backoff::Error { match err { // Those code status must trigger an automatic retry From bc08cd0deb8805b126c64dc384b18d2ee203f508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:37:15 +0200 Subject: [PATCH 028/150] Make clippy happy again --- .../index-scheduler/src/scheduler/process_export.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index ceac18632..e10c468fc 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -59,7 +59,10 @@ impl IndexScheduler { let index_rtxn = index.read_txn()?; // Send the primary key - let primary_key = index.primary_key(&index_rtxn)?; + let primary_key = index + .primary_key(&index_rtxn) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + let url = format!("{base_url}/indexes"); retry(&must_stop_processing, || { let mut request = agent.post(&url); @@ -108,7 +111,7 @@ impl IndexScheduler { let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); let embedding_configs = index .embedding_configs(&index_rtxn) - .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; let total_documents = universe.len() as u32; let (step, progress_step) = AtomicDocumentStep::new(total_documents); @@ -227,7 +230,7 @@ where return Err(Error::AbortedTask); } - match backoff::retry(ExponentialBackoff::default(), || send_request()) { + match backoff::retry(ExponentialBackoff::default(), send_request) { Ok(response) => Ok(response), Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)), Err(backoff::Error::Transient { err, retry_after: _ }) => Err(ureq_error_into_error(err)), @@ -261,7 +264,7 @@ fn ureq_error_into_error(error: ureq::Error) -> Error { Ok(MeiliError { message, code, r#type, link }) => { Error::FromRemoteWhenExporting { message, code, r#type, link } } - Err(e) => io::Error::from(e).into(), + Err(e) => e.into(), }, ureq::Error::Transport(transport) => io::Error::new(io::ErrorKind::Other, transport).into(), } From 
3329248a8448cc1ea8b2356dac803f38b8972287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:50:32 +0200 Subject: [PATCH 029/150] Support no pattern when exporting --- .../src/scheduler/process_export.rs | 89 +++++++++---------- crates/meilisearch-types/src/tasks.rs | 3 +- crates/meilisearch/src/routes/export.rs | 21 +++-- 3 files changed, 54 insertions(+), 59 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e10c468fc..5c65ca51e 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -54,7 +54,7 @@ impl IndexScheduler { indexes.len() as u32, )); - let ExportIndexSettings { skip_embeddings, filter } = settings; + let ExportIndexSettings { filter } = settings; let index = self.index(uid)?; let index_rtxn = index.read_txn()?; @@ -131,56 +131,53 @@ impl IndexScheduler { .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; // TODO definitely factorize this code - if !*skip_embeddings { - 'inject_vectors: { - let embeddings = index - .embeddings(&index_rtxn, docid) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + 'inject_vectors: { + let embeddings = index + .embeddings(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - if embeddings.is_empty() { - break 'inject_vectors; - } + if embeddings.is_empty() { + break 'inject_vectors; + } - let vectors = document - .entry(RESERVED_VECTORS_FIELD_NAME) - .or_insert(serde_json::Value::Object(Default::default())); + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(serde_json::Value::Object(Default::default())); - let serde_json::Value::Object(vectors) = vectors else { - return Err(Error::from_milli( - meilisearch_types::milli::Error::UserError( - meilisearch_types::milli::UserError::InvalidVectorsMapType { - document_id: { - if let Ok(Some(Ok(index))) = index - .external_id_of(&index_rtxn, std::iter::once(docid)) - .map(|it| it.into_iter().next()) - { - index - } else { - format!("internal docid={docid}") - } - }, - value: vectors.clone(), + let serde_json::Value::Object(vectors) = vectors else { + return Err(Error::from_milli( + meilisearch_types::milli::Error::UserError( + meilisearch_types::milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&index_rtxn, std::iter::once(docid)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + } }, - ), - Some(uid.to_string()), - )); + value: vectors.clone(), + }, + ), + Some(uid.to_string()), + )); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(docid)); + + let embeddings = ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + )), + regenerate: !user_provided, }; - - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(docid)); - - let embeddings = ExplicitVectors { - embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, - )), - regenerate: !user_provided, - }; - vectors - .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); - } + vectors.insert(embedder_name, 
serde_json::to_value(embeddings).unwrap()); } } diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 3ef60cacf..b5e2581fc 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -171,10 +171,9 @@ pub struct IndexSwap { pub indexes: (String, String), } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct ExportIndexSettings { - pub skip_embeddings: bool, pub filter: Option, } diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 7029f0ebf..40ef20008 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -72,16 +72,19 @@ async fn export( debug!(returns = ?export, "Trigger export"); let Export { url, api_key, indexes } = export; - let task = KindWithContent::Export { - url, - api_key, - indexes: indexes + + let indexes = if indexes.is_empty() { + BTreeMap::from([(IndexUidPattern::new_unchecked("*"), DbExportIndexSettings::default())]) + } else { + indexes .into_iter() - .map(|(pattern, ExportIndexSettings { skip_embeddings, filter })| { - (pattern, DbExportIndexSettings { skip_embeddings, filter }) + .map(|(pattern, ExportIndexSettings { filter })| { + (pattern, DbExportIndexSettings { filter }) }) - .collect(), + .collect() }; + + let task = KindWithContent::Export { url, api_key, indexes }; let uid = get_task_id(&req, &opt)?; let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = @@ -116,10 +119,6 @@ pub struct Export { #[serde(rename_all = "camelCase")] #[schema(rename_all = "camelCase")] pub struct ExportIndexSettings { - #[schema(value_type = Option, example = json!("true"))] - #[serde(default)] - #[deserr(default, error = DeserrJsonError)] - pub skip_embeddings: bool, #[schema(value_type = Option, example = json!("genres = action"))] #[serde(default)] #[deserr(default, error = DeserrJsonError)] From ee812b31c4ef73305fb417869e6ca0d89b856642 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:56:26 +0200 Subject: [PATCH 030/150] Support JSON value as filters --- crates/index-scheduler/src/scheduler/process_export.rs | 5 ++--- crates/meilisearch-types/src/tasks.rs | 7 ++++--- crates/meilisearch/src/routes/export.rs | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 5c65ca51e..e6c09e58a 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -90,10 +90,9 @@ impl IndexScheduler { request.send_json(settings.clone()).map_err(into_backoff_error) })?; - // TODO support JSON Value objects let filter = filter - .as_deref() - .map(Filter::from_str) + .as_ref() + .map(Filter::from_json) .transpose() .map_err(|e| Error::from_milli(e, Some(uid.to_string())))? 
.flatten(); diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index b5e2581fc..86951192c 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -8,6 +8,7 @@ use milli::update::IndexDocumentsMethod; use milli::Object; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize, Serializer}; +use serde_json::Value; use time::{Duration, OffsetDateTime}; use utoipa::{schema, ToSchema}; use uuid::Uuid; @@ -111,11 +112,11 @@ pub enum KindWithContent { }, DocumentDeletionByFilter { index_uid: String, - filter_expr: serde_json::Value, + filter_expr: Value, }, DocumentEdition { index_uid: String, - filter_expr: Option, + filter_expr: Option, context: Option, function: String, }, @@ -174,7 +175,7 @@ pub struct IndexSwap { #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct ExportIndexSettings { - pub filter: Option, + pub filter: Option, } impl KindWithContent { diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 40ef20008..de1fe2c38 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -12,6 +12,7 @@ use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::actions; use meilisearch_types::tasks::{ExportIndexSettings as DbExportIndexSettings, KindWithContent}; use serde::Serialize; +use serde_json::Value; use tracing::debug; use utoipa::{OpenApi, ToSchema}; @@ -122,5 +123,5 @@ pub struct ExportIndexSettings { #[schema(value_type = Option, example = json!("genres = action"))] #[serde(default)] #[deserr(default, error = DeserrJsonError)] - pub filter: Option, + pub filter: Option, } From 2d4f7c635eedc00e3ecf4c07cb5c14f300379103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 16:18:31 +0200 Subject: [PATCH 031/150] Make tests happy --- crates/index-scheduler/src/scheduler/test.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index 06bc14051..fb309f882 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -732,6 +732,7 @@ fn basic_get_stats() { "documentDeletion": 0, "documentEdition": 0, "dumpCreation": 0, + "export": 0, "indexCreation": 3, "indexDeletion": 0, "indexSwap": 0, From c6216517c7243809ae7b886eb8e07cecf34ab5b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 16:30:35 +0200 Subject: [PATCH 032/150] Parallelize document upload --- .../src/scheduler/process_export.rs | 189 ++++++++++-------- crates/index-scheduler/src/scheduler/test.rs | 3 + crates/milli/src/thread_pool_no_abort.rs | 18 +- .../src/update/index_documents/extract/mod.rs | 2 +- .../milli/src/update/index_documents/mod.rs | 1 + crates/milli/src/update/mod.rs | 2 +- 6 files changed, 133 insertions(+), 82 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e6c09e58a..3054c919b 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -7,9 +7,9 @@ use backoff::ExponentialBackoff; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, 
VariableNameStep}; -use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::update::{request_threads, Setting}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; -use meilisearch_types::milli::{obkv_to_json, Filter}; +use meilisearch_types::milli::{self, obkv_to_json, Filter, InternalError}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; use serde::Deserialize; @@ -112,6 +112,10 @@ impl IndexScheduler { .embedding_configs(&index_rtxn) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // We don't need to keep this one alive as we will + // spawn many threads to process the documents + drop(index_rtxn); + let total_documents = universe.len() as u32; let (step, progress_step) = AtomicDocumentStep::new(total_documents); progress.update_progress(progress_step); @@ -119,73 +123,107 @@ impl IndexScheduler { let limit = 50 * 1024 * 1024; // 50 MiB let documents_url = format!("{base_url}/indexes/{uid}/documents"); - let mut buffer = Vec::new(); - let mut tmp_buffer = Vec::new(); - for (i, docid) in universe.into_iter().enumerate() { - let document = index - .document(&index_rtxn, docid) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + request_threads() + .broadcast(|ctx| { + let index_rtxn = index + .read_txn() + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - let mut document = obkv_to_json(&all_fields, &fields_ids_map, document) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let mut buffer = Vec::new(); + let mut tmp_buffer = Vec::new(); + for (i, docid) in universe.iter().enumerate() { + if i % ctx.num_threads() != ctx.index() { + continue; + } - // TODO definitely factorize this code - 'inject_vectors: { - let embeddings = index - .embeddings(&index_rtxn, docid) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let document = index + .document(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - if embeddings.is_empty() { - break 'inject_vectors; + let mut document = obkv_to_json(&all_fields, &fields_ids_map, document) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + // TODO definitely factorize this code + 'inject_vectors: { + let embeddings = index + .embeddings(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(serde_json::Value::Object(Default::default())); + + let serde_json::Value::Object(vectors) = vectors else { + return Err(Error::from_milli( + milli::Error::UserError( + milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of( + &index_rtxn, + std::iter::once(docid), + ) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + } + }, + value: vectors.clone(), + }, + ), + Some(uid.to_string()), + )); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(docid)); + + let embeddings = ExplicitVectors { + embeddings: Some( + VectorOrArrayOfVectors::from_array_of_vectors(embeddings), + ), + regenerate: !user_provided, + }; + vectors.insert( + embedder_name, + serde_json::to_value(embeddings).unwrap(), + ); + } + } + + 
tmp_buffer.clear(); + serde_json::to_writer(&mut tmp_buffer, &document) + .map_err(milli::InternalError::from) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + + if buffer.len() + tmp_buffer.len() > limit { + retry(&must_stop_processing, || { + let mut request = agent.post(&documents_url); + request = request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request + .set("Authorization", &(format!("Bearer {api_key}"))); + } + request.send_bytes(&buffer).map_err(into_backoff_error) + })?; + buffer.clear(); + } + buffer.extend_from_slice(&tmp_buffer); + + if i % 100 == 0 { + step.fetch_add(100, atomic::Ordering::Relaxed); + } } - let vectors = document - .entry(RESERVED_VECTORS_FIELD_NAME) - .or_insert(serde_json::Value::Object(Default::default())); - - let serde_json::Value::Object(vectors) = vectors else { - return Err(Error::from_milli( - meilisearch_types::milli::Error::UserError( - meilisearch_types::milli::UserError::InvalidVectorsMapType { - document_id: { - if let Ok(Some(Ok(index))) = index - .external_id_of(&index_rtxn, std::iter::once(docid)) - .map(|it| it.into_iter().next()) - { - index - } else { - format!("internal docid={docid}") - } - }, - value: vectors.clone(), - }, - ), - Some(uid.to_string()), - )); - }; - - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(docid)); - - let embeddings = ExplicitVectors { - embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, - )), - regenerate: !user_provided, - }; - vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); - } - } - - tmp_buffer.clear(); - serde_json::to_writer(&mut tmp_buffer, &document) - .map_err(meilisearch_types::milli::InternalError::from) - .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - - if buffer.len() + tmp_buffer.len() > limit { retry(&must_stop_processing, || { let mut request = agent.post(&documents_url); request = request.set("Content-Type", "application/x-ndjson"); @@ -194,23 +232,16 @@ impl IndexScheduler { } request.send_bytes(&buffer).map_err(into_backoff_error) })?; - buffer.clear(); - } - buffer.extend_from_slice(&tmp_buffer); - if i % 100 == 0 { - step.fetch_add(100, atomic::Ordering::Relaxed); - } - } + Ok(()) + }) + .map_err(|e| { + Error::from_milli( + milli::Error::InternalError(InternalError::PanicInThreadPool(e)), + Some(uid.to_string()), + ) + })?; - retry(&must_stop_processing, || { - let mut request = agent.post(&documents_url); - request = request.set("Content-Type", "application/x-ndjson"); - if let Some(api_key) = api_key { - request = request.set("Authorization", &(format!("Bearer {api_key}"))); - } - request.send_bytes(&buffer).map_err(into_backoff_error) - })?; step.store(total_documents, atomic::Ordering::Relaxed); } diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index fb309f882..ee26165c7 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -766,6 +766,7 @@ fn basic_get_stats() { "documentDeletion": 0, "documentEdition": 0, "dumpCreation": 0, + "export": 0, "indexCreation": 3, "indexDeletion": 0, "indexSwap": 0, @@ -806,6 +807,7 @@ fn basic_get_stats() { "documentDeletion": 0, "documentEdition": 0, "dumpCreation": 0, + "export": 0, "indexCreation": 3, "indexDeletion": 0, "indexSwap": 0, @@ -847,6 +849,7 
@@ fn basic_get_stats() {
         "documentDeletion": 0,
         "documentEdition": 0,
         "dumpCreation": 0,
+        "export": 0,
         "indexCreation": 3,
         "indexDeletion": 0,
         "indexSwap": 0,
diff --git a/crates/milli/src/thread_pool_no_abort.rs b/crates/milli/src/thread_pool_no_abort.rs
index 0c2fbb30d..66380ff36 100644
--- a/crates/milli/src/thread_pool_no_abort.rs
+++ b/crates/milli/src/thread_pool_no_abort.rs
@@ -1,7 +1,7 @@
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::Arc;
 
-use rayon::{ThreadPool, ThreadPoolBuilder};
+use rayon::{BroadcastContext, ThreadPool, ThreadPoolBuilder};
 use thiserror::Error;
 
 /// A rayon ThreadPool wrapper that can catch panics in the pool
@@ -32,6 +32,22 @@ impl ThreadPoolNoAbort {
         }
     }
 
+    pub fn broadcast<OP, R>(&self, op: OP) -> Result<Vec<R>, PanicCatched>
+    where
+        OP: Fn(BroadcastContext<'_>) -> R + Sync,
+        R: Send,
+    {
+        self.active_operations.fetch_add(1, Ordering::Relaxed);
+        let output = self.thread_pool.broadcast(op);
+        self.active_operations.fetch_sub(1, Ordering::Relaxed);
+        // While resetting the pool panic catcher, we return an error if we caught one.
+        if self.pool_catched_panic.swap(false, Ordering::SeqCst) {
+            Err(PanicCatched)
+        } else {
+            Ok(output)
+        }
+    }
+
     pub fn current_num_threads(&self) -> usize {
         self.thread_pool.current_num_threads()
     }
diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs
index 8cd664a2f..cb4ac03a6 100644
--- a/crates/milli/src/update/index_documents/extract/mod.rs
+++ b/crates/milli/src/update/index_documents/extract/mod.rs
@@ -210,7 +210,7 @@ fn run_extraction_task(
     })
 }
 
-fn request_threads() -> &'static ThreadPoolNoAbort {
+pub fn request_threads() -> &'static ThreadPoolNoAbort {
     static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
 
     REQUEST_THREADS.get_or_init(|| {
diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs
index f547c68d4..dd0238fcb 100644
@@ -12,6 +12,7 @@ use std::sync::Arc;
 
 use crossbeam_channel::{Receiver, Sender};
 use enrich::enrich_documents_batch;
+pub use extract::request_threads;
 use grenad::{Merger, MergerBuilder};
 use hashbrown::HashMap;
 use heed::types::Str;
diff --git a/crates/milli/src/update/mod.rs b/crates/milli/src/update/mod.rs
index 04ce68fc7..64eb9f1d3 100644
--- a/crates/milli/src/update/mod.rs
+++
b/crates/index-scheduler/src/scheduler/process_export.rs @@ -1,9 +1,11 @@ use std::collections::BTreeMap; -use std::io; +use std::io::{self, Write as _}; use std::sync::atomic; use std::time::Duration; use backoff::ExponentialBackoff; +use flate2::write::GzEncoder; +use flate2::Compression; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; @@ -131,6 +133,7 @@ impl IndexScheduler { let mut buffer = Vec::new(); let mut tmp_buffer = Vec::new(); + let mut compressed_buffer = Vec::new(); for (i, docid) in universe.iter().enumerate() { if i % ctx.num_threads() != ctx.index() { continue; @@ -205,17 +208,31 @@ impl IndexScheduler { .map_err(milli::InternalError::from) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - if buffer.len() + tmp_buffer.len() > limit { + // Make sure we put at least one document in the buffer even + // though we might go above the buffer limit before sending + if !buffer.is_empty() && buffer.len() + tmp_buffer.len() > limit { + // We compress the documents before sending them + let mut encoder = + GzEncoder::new(&mut compressed_buffer, Compression::default()); + encoder + .write_all(&buffer) + .map_err(|e| Error::from_milli(e.into(), Some(uid.clone())))?; + encoder + .finish() + .map_err(|e| Error::from_milli(e.into(), Some(uid.clone())))?; + retry(&must_stop_processing, || { let mut request = agent.post(&documents_url); request = request.set("Content-Type", "application/x-ndjson"); + request = request.set("Content-Encoding", "gzip"); if let Some(api_key) = api_key { request = request .set("Authorization", &(format!("Bearer {api_key}"))); } - request.send_bytes(&buffer).map_err(into_backoff_error) + request.send_bytes(&compressed_buffer).map_err(into_backoff_error) })?; buffer.clear(); + compressed_buffer.clear(); } buffer.extend_from_slice(&tmp_buffer); From 9422b6d654cf44a60d540444310e9c96b173b18a Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 26 Jun 2025 10:58:27 +0200 Subject: [PATCH 034/150] Update crates/meilisearch/src/lib.rs Co-authored-by: Louis Dureuil --- crates/meilisearch/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index cdecd520c..5acfb4bc9 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -543,7 +543,7 @@ fn import_dump( tracing::info!("Importing the settings."); let settings = index_reader.settings()?; apply_settings_to_builder(&settings, &mut builder); - let embedder_stats: Arc = Default::default(); // FIXME: this isn't linked to anything + let embedder_stats: Arc = Default::default(); builder.execute( |indexing_step| tracing::debug!("update: {:?}", indexing_step), || false, From 3fc16c627dfdf033f9521264d2517108532d9b4c Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 26 Jun 2025 11:11:38 +0200 Subject: [PATCH 035/150] Comment the delay --- crates/meilisearch/tests/vector/rest.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 7e2245223..e80dfeb0a 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -349,7 +349,7 @@ async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (MockServer, Value) if count >= 5 { let _ = sender.try_send(()); ResponseTemplate::new(500) - 
.set_delay(Duration::from_secs(u64::MAX)) + .set_delay(Duration::from_secs(u64::MAX)) // Make the response hang forever .set_body_string("Service Unavailable") } else { ResponseTemplate::new(500).set_body_string("Service Unavailable") From ef007d547df6fcc48480a8593ce135f46863a293 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 26 Jun 2025 11:12:01 +0200 Subject: [PATCH 036/150] Remove panics --- crates/meilisearch-types/src/batches.rs | 2 +- crates/milli/src/progress.rs | 4 +++- crates/milli/src/vector/rest.rs | 19 ++++++++++--------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index cec74fb75..c8d98655f 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -100,7 +100,7 @@ pub struct EmbedderStatsView { impl From<&EmbedderStats> for EmbedderStatsView { fn from(stats: &EmbedderStats) -> Self { - let errors = stats.errors.read().unwrap(); + let errors = stats.errors.read().unwrap_or_else(|p| p.into_inner()); Self { total_count: stats.total_count.load(std::sync::atomic::Ordering::Relaxed), error_count: errors.1 as usize, diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 7ecfcc095..61c61cd49 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -30,7 +30,9 @@ pub struct EmbedderStats { impl std::fmt::Debug for EmbedderStats { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let (error, count) = self.errors.read().unwrap().clone(); + let guard = self.errors.read().unwrap_or_else(|p| p.into_inner()); + let (error, count) = (guard.0.clone(), guard.1); + std::mem::drop(guard); f.debug_struct("EmbedderStats") .field("last_error", &error) .field("total_count", &self.total_count.load(Ordering::Relaxed)) diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index 409284b65..dd08c6a5e 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -335,10 +335,11 @@ where Err(retry) => { tracing::warn!("Failed: {}", retry.error); if let Some(embedder_stats) = &embedder_stats { - if let Ok(mut errors) = embedder_stats.errors.write() { - errors.0 = Some(retry.error.to_string()); - errors.1 += 1; - } + let stringified_error = retry.error.to_string(); + let mut errors = + embedder_stats.errors.write().unwrap_or_else(|p| p.into_inner()); + errors.0 = Some(stringified_error); + errors.1 += 1; } if let Some(deadline) = deadline { let now = std::time::Instant::now(); @@ -377,11 +378,11 @@ where Ok(response) => Ok(response), Err(retry) => { if let Some(embedder_stats) = &embedder_stats { - if let Ok(mut errors) = embedder_stats.errors.write() { - errors.0 = Some(retry.error.to_string()); - errors.1 += 1; - } - } + let stringified_error = retry.error.to_string(); + let mut errors = embedder_stats.errors.write().unwrap_or_else(|p| p.into_inner()); + errors.0 = Some(stringified_error); + errors.1 += 1; + }; Err(retry.into_error()) } } From 29f6eeff8fc82b55799e1c958249cb53349e603e Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 26 Jun 2025 12:07:48 +0200 Subject: [PATCH 037/150] Remove lots of Arcs --- crates/benchmarks/benches/indexing.rs | 62 +++++++++---------- crates/benchmarks/benches/utils.rs | 2 +- crates/fuzzers/src/bin/fuzz-indexing.rs | 2 +- .../src/scheduler/process_index_operation.rs | 10 +-- .../milli/src/search/new/tests/integration.rs | 2 +- crates/milli/src/test_index.rs | 6 +- .../extract/extract_vector_points.rs | 8 +-- 
.../src/update/index_documents/extract/mod.rs | 6 +- .../milli/src/update/index_documents/mod.rs | 30 ++++----- .../src/update/new/extract/vectors/mod.rs | 18 +++--- .../milli/src/update/new/indexer/extract.rs | 3 +- crates/milli/src/update/new/indexer/mod.rs | 3 +- crates/milli/src/update/settings.rs | 4 +- crates/milli/src/vector/composite.rs | 9 ++- crates/milli/src/vector/mod.rs | 4 +- crates/milli/src/vector/ollama.rs | 15 +++-- crates/milli/src/vector/openai.rs | 15 +++-- crates/milli/src/vector/rest.rs | 23 ++++--- .../milli/tests/search/facet_distribution.rs | 2 +- crates/milli/tests/search/mod.rs | 2 +- crates/milli/tests/search/query_criteria.rs | 2 +- crates/milli/tests/search/typo_tolerance.rs | 2 +- 22 files changed, 112 insertions(+), 118 deletions(-) diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 8241da9d2..3afad8ee5 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -169,7 +169,7 @@ fn indexing_songs_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -236,7 +236,7 @@ fn reindexing_songs_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -281,7 +281,7 @@ fn reindexing_songs_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -350,7 +350,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -427,7 +427,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -472,7 +472,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -513,7 +513,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -581,7 +581,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -648,7 +648,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -715,7 +715,7 @@ fn indexing_wiki(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -781,7 +781,7 @@ fn reindexing_wiki(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -826,7 +826,7 @@ fn reindexing_wiki(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -894,7 +894,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { EmbeddingConfigs::default(), &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); @@ -971,7 +971,7 @@ fn 
indexing_wiki_in_three_batches(c: &mut Criterion) {
                 EmbeddingConfigs::default(),
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
@@ -1017,7 +1017,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
                 EmbeddingConfigs::default(),
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
@@ -1059,7 +1059,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
                 EmbeddingConfigs::default(),
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
@@ -1126,7 +1126,7 @@ fn indexing_movies_default(c: &mut Criterion) {
                 EmbeddingConfigs::default(),
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
@@ -1192,7 +1192,7 @@ fn reindexing_movies_default(c: &mut Criterion) {
                 EmbeddingConfigs::default(),
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
@@ -1237,7 +1237,7 @@ fn reindexing_movies_default(c: &mut Criterion) {
                 EmbeddingConfigs::default(),
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
@@ -1305,7 +1305,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
                 EmbeddingConfigs::default(),
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
@@ -1354,7 +1354,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec Index {
                 EmbeddingConfigs::default(),
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
diff --git a/crates/fuzzers/src/bin/fuzz-indexing.rs b/crates/fuzzers/src/bin/fuzz-indexing.rs
index 23c4cb9c2..0632b7846 100644
--- a/crates/fuzzers/src/bin/fuzz-indexing.rs
+++ b/crates/fuzzers/src/bin/fuzz-indexing.rs
@@ -144,7 +144,7 @@ fn main() {
                         embedders,
                         &|| false,
                         &Progress::default(),
-                        Default::default(),
+                        &Default::default(),
                     )
                     .unwrap();
diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs
index b5338e511..14b07aea0 100644
--- a/crates/index-scheduler/src/scheduler/process_index_operation.rs
+++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs
@@ -35,7 +35,7 @@ impl IndexScheduler {
         index: &'i Index,
         operation: IndexOperation,
         progress: &Progress,
-        embedder_stats: Arc<EmbedderStats>,
+        embedder_stats: Arc<EmbedderStats>, // Can't change
     ) -> Result<(Vec<Task>, Option<ChannelCongestion>)> {
         let indexer_alloc = Bump::new();
         let started_processing_at = std::time::Instant::now();
@@ -180,7 +180,7 @@ impl IndexScheduler {
                         embedders,
                         &|| must_stop_processing.get(),
                         progress,
-                        embedder_stats,
+                        &embedder_stats,
                     )
                     .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?,
                 );
@@ -292,7 +292,7 @@ impl IndexScheduler {
                         embedders,
                         &|| must_stop_processing.get(),
                         progress,
-                        embedder_stats,
+                        &embedder_stats,
                     )
                     .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
                 );
@@ -441,7 +441,7 @@ impl IndexScheduler {
                         embedders,
                         &|| must_stop_processing.get(),
                         progress,
-                        embedder_stats,
+                        &embedder_stats,
                     )
                     .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
                 );
@@ -478,7 +478,7 @@ impl IndexScheduler {
                 .execute(
                     |indexing_step| tracing::debug!(update = ?indexing_step),
                     || must_stop_processing.get(),
-                    embedder_stats,
+                    embedder_stats.clone(),
                 )
                 .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs
index c4e521a88..36917c10e 100644
---
a/crates/milli/src/search/new/tests/integration.rs
+++ b/crates/milli/src/search/new/tests/integration.rs
@@ -95,7 +95,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
         embedders,
         &|| false,
         &Progress::default(),
-        Default::default(),
+        &Default::default(),
     )
     .unwrap();
diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs
index 3546660b0..d218bb3a6 100644
--- a/crates/milli/src/test_index.rs
+++ b/crates/milli/src/test_index.rs
@@ -103,7 +103,7 @@ impl TempIndex {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
         })
         .unwrap()?;
@@ -186,7 +186,7 @@ impl TempIndex {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
         })
         .unwrap()?;
@@ -261,7 +261,7 @@ fn aborting_indexation() {
                 embedders,
                 &|| should_abort.load(Relaxed),
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
         })
         .unwrap()
diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs
index e6d874a69..e1981a615 100644
--- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -684,7 +684,7 @@ pub fn extract_embeddings(
     embedder: Arc<Embedder>,
     embedder_name: &str,
     possible_embedding_mistakes: &PossibleEmbeddingMistakes,
-    embedder_stats: Arc<EmbedderStats>,
+    embedder_stats: &EmbedderStats,
     unused_vectors_distribution: &UnusedVectorsDistribution,
     request_threads: &ThreadPoolNoAbort,
 ) -> Result<Vec<Vec<Embedding>>> {
@@ -727,7 +727,7 @@ pub fn extract_embeddings(
                 std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)),
                 embedder_name,
                 possible_embedding_mistakes,
-                embedder_stats.clone(),
+                embedder_stats,
                 unused_vectors_distribution,
                 request_threads,
             )?;
@@ -750,7 +750,7 @@ pub fn extract_embeddings(
             std::mem::take(&mut chunks),
             embedder_name,
             possible_embedding_mistakes,
-            embedder_stats.clone(),
+            embedder_stats,
             unused_vectors_distribution,
             request_threads,
         )?;
@@ -789,7 +789,7 @@ fn embed_chunks(
     text_chunks: Vec<Vec<String>>,
     embedder_name: &str,
     possible_embedding_mistakes: &PossibleEmbeddingMistakes,
-    embedder_stats: Arc<EmbedderStats>,
+    embedder_stats: &EmbedderStats,
     unused_vectors_distribution: &UnusedVectorsDistribution,
     request_threads: &ThreadPoolNoAbort,
 ) -> Result<Vec<Vec<Embedding>>> {
diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs
index 1eeddcccb..3af665c67 100644
--- a/crates/milli/src/update/index_documents/extract/mod.rs
+++ b/crates/milli/src/update/index_documents/extract/mod.rs
@@ -50,7 +50,7 @@ pub(crate) fn data_from_obkv_documents(
     settings_diff: Arc<InnerIndexSettingsDiff>,
     max_positions_per_attributes: Option<u32>,
     possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
-    embedder_stats: Arc<EmbedderStats>,
+    embedder_stats: Arc<EmbedderStats>, // Can't change
 ) -> Result<()> {
     let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
         || {
@@ -234,7 +234,7 @@ fn send_original_documents_data(
     embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
     settings_diff: Arc<InnerIndexSettingsDiff>,
     possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
-    embedder_stats: Arc<EmbedderStats>,
+    embedder_stats: Arc<EmbedderStats>, // Can't change
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@@ -274,7 +274,7 @@ fn send_original_documents_data(
                     embedder.clone(),
                     &embedder_name,
                     &possible_embedding_mistakes,
-                    embedder_stats.clone(),
+                    &embedder_stats,
                     &unused_vectors_distribution,
                     request_threads(),
                 ) {
diff --git
a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs
index f2e1783e4..2bddf1b17 100644
--- a/crates/milli/src/update/index_documents/mod.rs
+++ b/crates/milli/src/update/index_documents/mod.rs
@@ -81,7 +81,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
     added_documents: u64,
     deleted_documents: u64,
     embedders: EmbeddingConfigs,
-    embedder_stats: Arc<EmbedderStats>,
+    embedder_stats: Arc<EmbedderStats>, // Can't change
 }
 
 #[derive(Default, Debug, Clone)]
@@ -104,7 +104,7 @@ where
         config: IndexDocumentsConfig,
         progress: FP,
         should_abort: FA,
-        embedder_stats: Arc<EmbedderStats>,
+        embedder_stats: Arc<EmbedderStats>, // Can't change
     ) -> Result<IndexDocuments<'t, 'i, 'a, FP, FA>> {
         let transform = Some(Transform::new(
             wtxn,
@@ -2030,7 +2030,7 @@ mod tests {
                 EmbeddingConfigs::default(),
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2118,7 +2118,7 @@ mod tests {
                 EmbeddingConfigs::default(),
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2304,7 +2304,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2367,7 +2367,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2421,7 +2421,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2474,7 +2474,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2529,7 +2529,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2589,7 +2589,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2642,7 +2642,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2695,7 +2695,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2894,7 +2894,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2954,7 +2954,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -3011,7 +3011,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
-                Default::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs
index c21dabf74..85398aa99 100644
--- a/crates/milli/src/update/new/extract/vectors/mod.rs
+++ b/crates/milli/src/update/new/extract/vectors/mod.rs
@@ -1,4 +1,4 @@
-use std::{cell::RefCell, sync::Arc};
+use std::cell::RefCell;
 
 use bumpalo::collections::Vec as BVec;
 use bumpalo::Bump;
@@ -23,7 +23,7 @@ pub struct EmbeddingExtractor<'a, 'b> {
     embedders: &'a EmbeddingConfigs,
     sender: EmbeddingSender<'a, 'b>,
     possible_embedding_mistakes: PossibleEmbeddingMistakes,
-    embedder_stats: Arc<EmbedderStats>,
+    embedder_stats: &'a EmbedderStats,
     threads: &'a ThreadPoolNoAbort,
 }
 
@@ -32,7 +32,7 @@ impl<'a, 'b> EmbeddingExtractor<'a, 'b> {
         embedders: &'a
EmbeddingConfigs, sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, - embedder_stats: Arc, + embedder_stats: &'a EmbedderStats, threads: &'a ThreadPoolNoAbort, ) -> Self { let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution); @@ -78,7 +78,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { prompt, context.data, &self.possible_embedding_mistakes, - self.embedder_stats.clone(), + self.embedder_stats, self.threads, self.sender, &context.doc_alloc, @@ -311,7 +311,7 @@ struct Chunks<'a, 'b, 'extractor> { dimensions: usize, prompt: &'a Prompt, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - embedder_stats: Arc, + embedder_stats: &'a EmbedderStats, user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, sender: EmbeddingSender<'a, 'b>, @@ -327,7 +327,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { prompt: &'a Prompt, user_provided: &'a RefCell>, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - embedder_stats: Arc, + embedder_stats: &'a EmbedderStats, threads: &'a ThreadPoolNoAbort, sender: EmbeddingSender<'a, 'b>, doc_alloc: &'a Bump, @@ -378,7 +378,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { self.embedder_id, self.embedder_name, self.possible_embedding_mistakes, - self.embedder_stats.clone(), + self.embedder_stats, unused_vectors_distribution, self.threads, self.sender, @@ -397,7 +397,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { self.embedder_id, self.embedder_name, self.possible_embedding_mistakes, - self.embedder_stats.clone(), + self.embedder_stats, unused_vectors_distribution, self.threads, self.sender, @@ -416,7 +416,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { embedder_id: u8, embedder_name: &str, possible_embedding_mistakes: &PossibleEmbeddingMistakes, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, sender: EmbeddingSender<'a, 'b>, diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index c721a2563..97ffc8624 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -1,6 +1,5 @@ use std::collections::BTreeMap; use std::sync::atomic::AtomicBool; -use std::sync::Arc; use std::sync::OnceLock; use bumpalo::Bump; @@ -36,7 +35,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( mut index_embeddings: Vec, document_ids: &mut RoaringBitmap, modified_docids: &mut RoaringBitmap, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> Result<(FacetFieldIdsDelta, Vec)> where DC: DocumentChanges<'pl>, diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 33774f892..bb6ba0102 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -1,5 +1,4 @@ use std::sync::atomic::AtomicBool; -use std::sync::Arc; use std::sync::{Once, RwLock}; use std::thread::{self, Builder}; @@ -56,7 +55,7 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>( embedders: EmbeddingConfigs, must_stop_processing: &'indexer MSP, progress: &'indexer Progress, - embedder_stats: Arc, + embedder_stats: &'indexer EmbedderStats, ) -> Result where DC: DocumentChanges<'pl>, diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index b3f70d1b6..71cedf456 100644 --- a/crates/milli/src/update/settings.rs +++ 
b/crates/milli/src/update/settings.rs @@ -475,7 +475,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { progress_callback: &FP, should_abort: &FA, settings_diff: InnerIndexSettingsDiff, - embedder_stats: Arc, + embedder_stats: Arc, // Cant change ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -1362,7 +1362,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { mut self, progress_callback: FP, should_abort: FA, - embedder_stats: Arc, + embedder_stats: Arc, // Cant change ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, diff --git a/crates/milli/src/vector/composite.rs b/crates/milli/src/vector/composite.rs index 87f05d4fe..8314b8649 100644 --- a/crates/milli/src/vector/composite.rs +++ b/crates/milli/src/vector/composite.rs @@ -1,4 +1,3 @@ -use std::sync::Arc; use std::time::Instant; use arroy::Distance; @@ -154,7 +153,7 @@ impl SubEmbedder { &self, texts: Vec, deadline: Option, - embedder_stats: Option>, + embedder_stats: Option<&EmbedderStats>, ) -> std::result::Result, EmbedError> { match self { SubEmbedder::HuggingFace(embedder) => embedder.embed(texts), @@ -169,7 +168,7 @@ impl SubEmbedder { &self, text: &str, deadline: Option, - embedder_stats: Option>, + embedder_stats: Option<&EmbedderStats>, ) -> std::result::Result { match self { SubEmbedder::HuggingFace(embedder) => embedder.embed_one(text), @@ -196,7 +195,7 @@ impl SubEmbedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> std::result::Result>, EmbedError> { match self { SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), @@ -218,7 +217,7 @@ impl SubEmbedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> std::result::Result, EmbedError> { match self { SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts), diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 481eb6c99..065beb5fb 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -749,7 +749,7 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> std::result::Result>, EmbedError> { match self { Embedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), @@ -772,7 +772,7 @@ impl Embedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> std::result::Result, EmbedError> { match self { Embedder::HuggingFace(embedder) => embedder.embed_index_ref(texts), diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/ollama.rs index 045b65b72..d4329a2de 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/ollama.rs @@ -1,4 +1,3 @@ -use std::sync::Arc; use std::time::Instant; use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; @@ -106,7 +105,7 @@ impl Embedder { &self, texts: &[S], deadline: Option, - embedder_stats: Option>, + embedder_stats: Option<&EmbedderStats>, ) -> Result, EmbedError> { match self.rest_embedder.embed_ref(texts, deadline, embedder_stats) { Ok(embeddings) => Ok(embeddings), @@ -121,21 +120,21 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. 
if threads.active_operations() >= REQUEST_PARALLELISM { text_chunks .into_iter() - .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats))) .collect() } else { threads .install(move || { text_chunks .into_par_iter() - .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats))) .collect() }) .map_err(|error| EmbedError { @@ -149,14 +148,14 @@ impl Embedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. if threads.active_operations() >= REQUEST_PARALLELISM { let embeddings: Result>, _> = texts .chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats))) .collect(); let embeddings = embeddings?; @@ -166,7 +165,7 @@ impl Embedder { .install(move || { let embeddings: Result>, _> = texts .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats))) .collect(); let embeddings = embeddings?; diff --git a/crates/milli/src/vector/openai.rs b/crates/milli/src/vector/openai.rs index b64e3d467..0159d5c76 100644 --- a/crates/milli/src/vector/openai.rs +++ b/crates/milli/src/vector/openai.rs @@ -1,5 +1,4 @@ use std::fmt; -use std::sync::Arc; use std::time::Instant; use ordered_float::OrderedFloat; @@ -217,7 +216,7 @@ impl Embedder { &self, texts: &[S], deadline: Option, - embedder_stats: Option>, + embedder_stats: Option<&EmbedderStats>, ) -> Result, EmbedError> { match self.rest_embedder.embed_ref(texts, deadline, embedder_stats) { Ok(embeddings) => Ok(embeddings), @@ -262,21 +261,21 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. if threads.active_operations() >= REQUEST_PARALLELISM { text_chunks .into_iter() - .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats))) .collect() } else { threads .install(move || { text_chunks .into_par_iter() - .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats))) .collect() }) .map_err(|error| EmbedError { @@ -290,14 +289,14 @@ impl Embedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. 
if threads.active_operations() >= REQUEST_PARALLELISM { let embeddings: Result>, _> = texts .chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats))) .collect(); let embeddings = embeddings?; Ok(embeddings.into_iter().flatten().collect()) @@ -306,7 +305,7 @@ impl Embedder { .install(move || { let embeddings: Result>, _> = texts .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats))) .collect(); let embeddings = embeddings?; diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index dd08c6a5e..fbe3c1129 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -1,5 +1,4 @@ use std::collections::BTreeMap; -use std::sync::Arc; use std::time::Instant; use deserr::Deserr; @@ -170,7 +169,7 @@ impl Embedder { &self, texts: Vec, deadline: Option, - embedder_stats: Option>, + embedder_stats: Option<&EmbedderStats>, ) -> Result, EmbedError> { embed( &self.data, @@ -186,7 +185,7 @@ impl Embedder { &self, texts: &[S], deadline: Option, - embedder_stats: Option>, + embedder_stats: Option<&EmbedderStats>, ) -> Result, EmbedError> where S: AsRef + Serialize, @@ -208,21 +207,21 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> Result>, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. if threads.active_operations() >= REQUEST_PARALLELISM { text_chunks .into_iter() - .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats))) .collect() } else { threads .install(move || { text_chunks .into_par_iter() - .map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed(chunk, None, Some(embedder_stats))) .collect() }) .map_err(|error| EmbedError { @@ -236,14 +235,14 @@ impl Embedder { &self, texts: &[&str], threads: &ThreadPoolNoAbort, - embedder_stats: Arc, + embedder_stats: &EmbedderStats, ) -> Result, EmbedError> { // This condition helps reduce the number of active rayon jobs // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. 
if threads.active_operations() >= REQUEST_PARALLELISM { let embeddings: Result>, _> = texts .chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed_ref(chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed_ref(chunk, None, Some(embedder_stats))) .collect(); let embeddings = embeddings?; @@ -253,7 +252,7 @@ impl Embedder { .install(move || { let embeddings: Result>, _> = texts .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed_ref(chunk, None, Some(embedder_stats.clone()))) + .map(move |chunk| self.embed_ref(chunk, None, Some(embedder_stats))) .collect(); let embeddings = embeddings?; @@ -303,7 +302,7 @@ fn embed( expected_count: usize, expected_dimension: Option, deadline: Option, - embedder_stats: Option>, + embedder_stats: Option<&EmbedderStats>, ) -> Result, EmbedError> where S: Serialize, @@ -323,7 +322,7 @@ where for attempt in 0..10 { if let Some(embedder_stats) = &embedder_stats { - embedder_stats.as_ref().total_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + embedder_stats.total_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); } let response = request.clone().send_json(&body); let result = check_response(response, data.configuration_source).and_then(|response| { @@ -367,7 +366,7 @@ where } if let Some(embedder_stats) = &embedder_stats { - embedder_stats.as_ref().total_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + embedder_stats.total_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); } let response = request.send_json(&body); let result = check_response(response, data.configuration_source).and_then(|response| { diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index 5ed223400..8548f0d01 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -74,7 +74,7 @@ fn test_facet_distribution_with_no_facet_values() { embedders, &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index beee4ac54..4098af736 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -114,7 +114,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { embedders, &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index 04b8374de..b72978330 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -344,7 +344,7 @@ fn criteria_ascdesc() { embedders, &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs index e2cdab550..9aacbf82a 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -153,7 +153,7 @@ fn test_typo_disabled_on_word() { embedders, &|| false, &Progress::default(), - Default::default(), + &Default::default(), ) .unwrap(); From 0f6dd133b2bcb54d1489fa748f8d8dedadca4125 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 26 Jun 2025 12:11:43 +0200 Subject: [PATCH 038/150] Turn to references --- crates/meilisearch/src/lib.rs | 2 +- crates/milli/src/update/index_documents/extract/mod.rs | 4 ++-- 
crates/milli/src/update/index_documents/mod.rs | 6 +++--- crates/milli/src/update/settings.rs | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 5acfb4bc9..c902f4e60 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -579,7 +579,7 @@ fn import_dump( }, |indexing_step| tracing::trace!("update: {:?}", indexing_step), || false, - embedder_stats, + &embedder_stats, )?; let builder = builder.with_embedders(embedders); diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs index 3af665c67..9c1971356 100644 --- a/crates/milli/src/update/index_documents/extract/mod.rs +++ b/crates/milli/src/update/index_documents/extract/mod.rs @@ -50,7 +50,7 @@ pub(crate) fn data_from_obkv_documents( settings_diff: Arc, max_positions_per_attributes: Option, possible_embedding_mistakes: Arc, - embedder_stats: Arc, // Cant change + embedder_stats: &Arc, ) -> Result<()> { let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join( || { @@ -234,7 +234,7 @@ fn send_original_documents_data( embedders_configs: Arc>, settings_diff: Arc, possible_embedding_mistakes: Arc, - embedder_stats: Arc, // Cant change + embedder_stats: Arc, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 2bddf1b17..6e56ad155 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -81,7 +81,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> { added_documents: u64, deleted_documents: u64, embedders: EmbeddingConfigs, - embedder_stats: Arc, // Cant change + embedder_stats: &'t Arc, } #[derive(Default, Debug, Clone)] @@ -104,7 +104,7 @@ where config: IndexDocumentsConfig, progress: FP, should_abort: FA, - embedder_stats: Arc, // Cant change + embedder_stats: &'t Arc, ) -> Result> { let transform = Some(Transform::new( wtxn, @@ -331,7 +331,7 @@ where settings_diff_cloned, max_positions_per_attributes, Arc::new(possible_embedding_mistakes), - embedder_stats.clone() + &embedder_stats ) }); diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 71cedf456..06c2d0cc2 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -475,7 +475,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { progress_callback: &FP, should_abort: &FA, settings_diff: InnerIndexSettingsDiff, - embedder_stats: Arc, // Cant change + embedder_stats: &Arc, // Cant change ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -507,7 +507,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { IndexDocumentsConfig::default(), &progress_callback, &should_abort, - embedder_stats, + &embedder_stats, )?; indexing_builder.execute_raw(output)?; @@ -1421,7 +1421,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { ); if inner_settings_diff.any_reindexing_needed() { - self.reindex(&progress_callback, &should_abort, inner_settings_diff, embedder_stats)?; + self.reindex(&progress_callback, &should_abort, inner_settings_diff, &embedder_stats)?; } Ok(()) From 2ff382c023c25d65fb255bc388223789b395be0a Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 26 Jun 2025 12:14:56 +0200 Subject: [PATCH 039/150] Remove useless clone --- 
crates/index-scheduler/src/scheduler/process_index_operation.rs | 2 +- crates/milli/src/update/settings.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs index 14b07aea0..84554849f 100644 --- a/crates/index-scheduler/src/scheduler/process_index_operation.rs +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -478,7 +478,7 @@ impl IndexScheduler { .execute( |indexing_step| tracing::debug!(update = ?indexing_step), || must_stop_processing.get(), - embedder_stats.clone(), + embedder_stats, ) .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 06c2d0cc2..99736f971 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -507,7 +507,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { IndexDocumentsConfig::default(), &progress_callback, &should_abort, - &embedder_stats, + embedder_stats, )?; indexing_builder.execute_raw(output)?; From 4d26e9c6f2b64f0b1f5afbeafdad242271cecc69 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 26 Jun 2025 12:21:34 +0200 Subject: [PATCH 040/150] Remove my comments --- .../index-scheduler/src/scheduler/process_index_operation.rs | 2 +- crates/milli/src/update/settings.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs index 84554849f..4c0db9ce4 100644 --- a/crates/index-scheduler/src/scheduler/process_index_operation.rs +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -35,7 +35,7 @@ impl IndexScheduler { index: &'i Index, operation: IndexOperation, progress: &Progress, - embedder_stats: Arc, // Cant change + embedder_stats: Arc, ) -> Result<(Vec, Option)> { let indexer_alloc = Bump::new(); let started_processing_at = std::time::Instant::now(); diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 99736f971..05dbb4784 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -475,7 +475,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { progress_callback: &FP, should_abort: &FA, settings_diff: InnerIndexSettingsDiff, - embedder_stats: &Arc, // Cant change + embedder_stats: &Arc, ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -1362,7 +1362,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { mut self, progress_callback: FP, should_abort: FA, - embedder_stats: Arc, // Cant change + embedder_stats: Arc, ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, From 44d6430bae887c11bc9f866684bf857204137d57 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 26 Jun 2025 12:30:08 +0200 Subject: [PATCH 041/150] Rename fields --- crates/meilisearch-types/src/batch_view.rs | 4 ++-- crates/meilisearch-types/src/batches.rs | 10 +++++----- crates/meilisearch/tests/vector/rest.rs | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/meilisearch-types/src/batch_view.rs b/crates/meilisearch-types/src/batch_view.rs index ea027b74e..297b10ba1 100644 --- a/crates/meilisearch-types/src/batch_view.rs +++ b/crates/meilisearch-types/src/batch_view.rs @@ -32,7 +32,7 @@ pub struct BatchStatsView { #[serde(flatten)] pub stats: BatchStats, #[serde(skip_serializing_if = "EmbedderStatsView::skip_serializing", 
default)] - pub embedder: EmbedderStatsView, + pub embedder_requests: EmbedderStatsView, } impl BatchView { @@ -43,7 +43,7 @@ impl BatchView { details: batch.details.clone(), stats: BatchStatsView { stats: batch.stats.clone(), - embedder: batch.embedder_stats.clone(), + embedder_requests: batch.embedder_stats.clone(), }, duration: batch.finished_at.map(|finished_at| finished_at - batch.started_at), started_at: batch.started_at, diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index c8d98655f..e1cc2b7c7 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -92,8 +92,8 @@ pub struct BatchStats { #[serde(rename_all = "camelCase")] #[schema(rename_all = "camelCase")] pub struct EmbedderStatsView { - pub total_count: usize, - pub error_count: usize, + pub total: usize, + pub failed: usize, #[serde(skip_serializing_if = "Option::is_none", default)] pub last_error: Option, } @@ -102,8 +102,8 @@ impl From<&EmbedderStats> for EmbedderStatsView { fn from(stats: &EmbedderStats) -> Self { let errors = stats.errors.read().unwrap_or_else(|p| p.into_inner()); Self { - total_count: stats.total_count.load(std::sync::atomic::Ordering::Relaxed), - error_count: errors.1 as usize, + total: stats.total_count.load(std::sync::atomic::Ordering::Relaxed), + failed: errors.1 as usize, last_error: errors.0.clone(), } } @@ -111,6 +111,6 @@ impl From<&EmbedderStats> for EmbedderStatsView { impl EmbedderStatsView { pub fn skip_serializing(&self) -> bool { - self.total_count == 0 && self.error_count == 0 && self.last_error.is_none() + self.total == 0 && self.failed == 0 && self.last_error.is_none() } } diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index e80dfeb0a..6e781e525 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -2182,7 +2182,7 @@ async fn last_error_stats() { let (response, _code) = index.filtered_batches(&[], &[], &[]).await; snapshot!(json_string!(response["results"][0], { ".progress" => "[ignored]", - ".stats.embedder.totalCount" => "[ignored]", + ".stats.embedderRequests.total" => "[ignored]", ".startedAt" => "[ignored]" }), @r#" { @@ -2203,9 +2203,9 @@ async fn last_error_stats() { "indexUids": { "doggo": 1 }, - "embedder": { - "totalCount": "[ignored]", - "errorCount": 5, + "embedderRequests": { + "total": "[ignored]", + "failed": 5, "lastError": "runtime error: received internal error HTTP 500 from embedding server\n - server replied with `Service Unavailable`" } }, From 63031219c543318258aaf4bb268b9e29bebf4968 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 25 Jun 2025 18:24:50 +0200 Subject: [PATCH 042/150] Add the payload size to the parameters --- Cargo.lock | 1 + crates/dump/src/lib.rs | 5 +- crates/index-scheduler/src/dump.rs | 31 ++++++----- crates/index-scheduler/src/insta_snapshot.rs | 4 +- .../src/scheduler/process_batch.rs | 11 +++- .../src/scheduler/process_export.rs | 6 ++- crates/index-scheduler/src/utils.rs | 2 +- crates/meilisearch-types/Cargo.toml | 1 + crates/meilisearch-types/src/error.rs | 1 + crates/meilisearch-types/src/lib.rs | 2 +- crates/meilisearch-types/src/task_view.rs | 14 ++++- crates/meilisearch-types/src/tasks.rs | 42 +++++++++------ crates/meilisearch/src/routes/export.rs | 51 ++++++++++++++++++- 13 files changed, 130 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a883b749f..be6aa4b21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3855,6 +3855,7 @@ 
dependencies = [ "anyhow", "bumpalo", "bumparaw-collections", + "byte-unit", "convert_case 0.8.0", "csv", "deserr", diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 5c67d7a94..7fd0ea376 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -4,6 +4,7 @@ use std::collections::BTreeMap; use meilisearch_types::batches::BatchId; +use meilisearch_types::byte_unit::Byte; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::Key; use meilisearch_types::milli::update::IndexDocumentsMethod; @@ -148,6 +149,7 @@ pub enum KindDump { Export { url: String, api_key: Option, + payload_size: Option, indexes: BTreeMap, }, UpgradeDatabase { @@ -222,9 +224,10 @@ impl From for KindDump { KindDump::DumpCreation { keys, instance_uid } } KindWithContent::SnapshotCreation => KindDump::SnapshotCreation, - KindWithContent::Export { url, api_key, indexes } => KindDump::Export { + KindWithContent::Export { url, api_key, payload_size, indexes } => KindDump::Export { url, api_key, + payload_size, indexes: indexes .into_iter() .map(|(pattern, settings)| (pattern.to_string(), settings)) diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index 2a99a74aa..1e681c8e8 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -212,20 +212,23 @@ impl<'a> Dump<'a> { KindWithContent::DumpCreation { keys, instance_uid } } KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, - KindDump::Export { url, indexes, api_key } => KindWithContent::Export { - url, - api_key, - indexes: indexes - .into_iter() - .map(|(pattern, settings)| { - Ok(( - IndexUidPattern::try_from(pattern) - .map_err(|_| Error::CorruptedDump)?, - settings, - )) - }) - .collect::>()?, - }, + KindDump::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { + url, + api_key, + payload_size, + indexes: indexes + .into_iter() + .map(|(pattern, settings)| { + Ok(( + IndexUidPattern::try_from(pattern) + .map_err(|_| Error::CorruptedDump)?, + settings, + )) + }) + .collect::>()?, + } + } KindDump::UpgradeDatabase { from } => KindWithContent::UpgradeDatabase { from }, }, }; diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 138b591ff..f48821520 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -289,8 +289,8 @@ fn snapshot_details(d: &Details) -> String { Details::IndexSwap { swaps } => { format!("{{ swaps: {swaps:?} }}") } - Details::Export { url, api_key, indexes } => { - format!("{{ url: {url:?}, api_key: {api_key:?}, indexes: {indexes:?} }}") + Details::Export { url, api_key, payload_size, indexes } => { + format!("{{ url: {url:?}, api_key: {api_key:?}, payload_size: {payload_size:?}, indexes: {indexes:?} }}") } Details::UpgradeDatabase { from, to } => { format!("{{ from: {from:?}, to: {to:?} }}") diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 99278756d..e56b8e13a 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -362,12 +362,19 @@ impl IndexScheduler { Ok((vec![task], ProcessBatchInfo::default())) } Batch::Export { mut task } => { - let KindWithContent::Export { url, indexes, api_key } = &task.kind else { + let KindWithContent::Export { url, api_key, payload_size, indexes } = &task.kind + else { unreachable!() }; let ret = 
catch_unwind(AssertUnwindSafe(|| { - self.process_export(url, indexes, api_key.as_deref(), progress) + self.process_export( + url, + api_key.as_deref(), + payload_size.as_ref(), + indexes, + progress, + ) })); match ret { diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 180162eda..e777809fd 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -4,6 +4,7 @@ use std::sync::atomic; use std::time::Duration; use backoff::ExponentialBackoff; +use byte_unit::Byte; use flate2::write::GzEncoder; use flate2::Compression; use meilisearch_types::index_uid_pattern::IndexUidPattern; @@ -25,8 +26,9 @@ impl IndexScheduler { pub(super) fn process_export( &self, base_url: &str, - indexes: &BTreeMap, api_key: Option<&str>, + payload_size: Option<&Byte>, + indexes: &BTreeMap, progress: Progress, ) -> Result<()> { #[cfg(test)] @@ -122,7 +124,7 @@ impl IndexScheduler { let (step, progress_step) = AtomicDocumentStep::new(total_documents); progress.update_progress(progress_step); - let limit = 50 * 1024 * 1024; // 50 MiB + let limit = payload_size.map(|ps| ps.as_u64() as usize).unwrap_or(50 * 1024 * 1024); // defaults to 50 MiB let documents_url = format!("{base_url}/indexes/{uid}/documents"); request_threads() diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 79571745b..594023145 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -601,7 +601,7 @@ impl crate::IndexScheduler { Details::Dump { dump_uid: _ } => { assert_eq!(kind.as_kind(), Kind::DumpCreation); } - Details::Export { url: _, api_key: _, indexes: _ } => { + Details::Export { url: _, api_key: _, payload_size: _, indexes: _ } => { assert_eq!(kind.as_kind(), Kind::Export); } Details::UpgradeDatabase { from: _, to: _ } => { diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index f76044078..faf59643f 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -15,6 +15,7 @@ actix-web = { version = "4.11.0", default-features = false } anyhow = "1.0.98" bumpalo = "3.18.1" bumparaw-collections = "0.1.4" +byte-unit = { version = "5.1.6", features = ["serde"] } convert_case = "0.8.0" csv = "1.3.1" deserr = { version = "0.6.3", features = ["actix-web"] } diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 08ee803ef..a8f45b4ef 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -392,6 +392,7 @@ InvalidSettingsIndexChat , InvalidRequest , BAD_REQU // Export InvalidExportUrl , InvalidRequest , BAD_REQUEST ; InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; +InvalidExportPayloadSize , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; InvalidExportIndexSkipEmbeddings , InvalidRequest , BAD_REQUEST ; InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ; diff --git a/crates/meilisearch-types/src/lib.rs b/crates/meilisearch-types/src/lib.rs index a1a57b7e6..fe69da526 100644 --- a/crates/meilisearch-types/src/lib.rs +++ b/crates/meilisearch-types/src/lib.rs @@ -18,7 +18,7 @@ pub mod versioning; pub use milli::{heed, Index}; use uuid::Uuid; pub use versioning::VERSION_FILE_NAME; -pub use {milli, serde_cs}; +pub use {byte_unit, milli, serde_cs}; pub type Document = serde_json::Map; pub type InstanceUid = Uuid; diff --git 
a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 0a8d7b8fe..1dbd5637b 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -1,5 +1,6 @@ use std::collections::BTreeMap; +use byte_unit::UnitType; use milli::Object; use serde::{Deserialize, Serialize}; use time::{Duration, OffsetDateTime}; @@ -128,6 +129,8 @@ pub struct DetailsView { #[serde(skip_serializing_if = "Option::is_none")] pub api_key: Option, #[serde(skip_serializing_if = "Option::is_none")] + pub payload_size: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub indexes: Option>, } @@ -263,6 +266,13 @@ impl DetailsView { // So we return the first one we encounter but that shouldn't be an issue anyway. (Some(left), Some(_right)) => Some(left), }, + payload_size: match (self.payload_size.clone(), other.payload_size.clone()) { + (None, None) => None, + (None, Some(size)) | (Some(size), None) => Some(size), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, indexes: match (self.indexes.clone(), other.indexes.clone()) { (None, None) => None, (None, Some(indexes)) | (Some(indexes), None) => Some(indexes), @@ -359,9 +369,11 @@ impl From
for DetailsView { Details::IndexSwap { swaps } => { DetailsView { swaps: Some(swaps), ..Default::default() } } - Details::Export { url, api_key, indexes } => DetailsView { + Details::Export { url, api_key, payload_size, indexes } => DetailsView { url: Some(url), api_key, + payload_size: payload_size + .map(|ps| ps.get_appropriate_unit(UnitType::Both).to_string()), indexes: Some( indexes .into_iter() diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 86951192c..508035bb7 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet}; use std::fmt::{Display, Write}; use std::str::FromStr; +use byte_unit::Byte; use enum_iterator::Sequence; use milli::update::IndexDocumentsMethod; use milli::Object; @@ -159,6 +160,7 @@ pub enum KindWithContent { Export { url: String, api_key: Option, + payload_size: Option, indexes: BTreeMap, }, UpgradeDatabase { @@ -286,11 +288,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), - }), + KindWithContent::Export { url, api_key, payload_size, indexes } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + payload_size: payload_size.clone(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), to: ( @@ -357,11 +362,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), - }), + KindWithContent::Export { url, api_key, payload_size, indexes } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + payload_size: payload_size.clone(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -410,11 +418,14 @@ impl From<&KindWithContent> for Option
{
             }),
             KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }),
             KindWithContent::SnapshotCreation => None,
-            KindWithContent::Export { url, api_key, indexes } => Some(Details::Export {
-                url: url.clone(),
-                api_key: api_key.clone(),
-                indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(),
-            }),
+            KindWithContent::Export { url, api_key, payload_size, indexes } => {
+                Some(Details::Export {
+                    url: url.clone(),
+                    api_key: api_key.clone(),
+                    payload_size: payload_size.clone(),
+                    indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(),
+                })
+            }
             KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase {
                 from: *from,
                 to: (
@@ -681,6 +692,7 @@ pub enum Details {
     Export {
         url: String,
         api_key: Option<String>,
+        payload_size: Option<Byte>,
         indexes: BTreeMap<IndexUidPattern, DetailsExportIndexSettings>,
     },
     UpgradeDatabase {
diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs
index de1fe2c38..1c519224c 100644
--- a/crates/meilisearch/src/routes/export.rs
+++ b/crates/meilisearch/src/routes/export.rs
@@ -1,7 +1,10 @@
 use std::collections::BTreeMap;
+use std::convert::Infallible;
+use std::str::FromStr as _;

 use actix_web::web::{self, Data};
 use actix_web::{HttpRequest, HttpResponse};
+use byte_unit::Byte;
 use deserr::actix_web::AwebJson;
 use deserr::Deserr;
 use index_scheduler::IndexScheduler;
@@ -72,7 +75,7 @@ async fn export(
     let export = export.into_inner();
     debug!(returns = ?export, "Trigger export");

-    let Export { url, api_key, indexes } = export;
+    let Export { url, api_key, payload_size, indexes } = export;

     let indexes = if indexes.is_empty() {
         BTreeMap::from([(IndexUidPattern::new_unchecked("*"), DbExportIndexSettings::default())])
@@ -85,7 +88,12 @@ async fn export(
         .collect()
     };

-    let task = KindWithContent::Export { url, api_key, indexes };
+    let task = KindWithContent::Export {
+        url,
+        api_key,
+        payload_size: payload_size.map(|ByteWithDeserr(bytes)| bytes),
+        indexes,
+    };
     let uid = get_task_id(&req, &opt)?;
     let dry_run = is_dry_run(&req, &opt)?;
     let task: SummarizedTaskView =
@@ -109,12 +117,51 @@ pub struct Export {
     #[serde(default)]
     #[deserr(default, error = DeserrJsonError<InvalidExportApiKey>)]
     pub api_key: Option<String>,
+    #[schema(value_type = Option<String>, example = json!("24MiB"))]
+    #[serde(default)]
+    #[deserr(default, error = DeserrJsonError<InvalidExportPayloadSize>)]
+    pub payload_size: Option<ByteWithDeserr>,
     #[schema(value_type = Option<Vec<String>>, example = json!(["movies", "steam-*"]))]
     #[deserr(default)]
     #[serde(default)]
     pub indexes: BTreeMap<IndexUidPattern, ExportIndexSettings>,
 }
+
+/// A wrapper around the `Byte` type that implements `Deserr`.
+#[derive(Debug, Serialize)]
+#[serde(transparent)]
+pub struct ByteWithDeserr(pub Byte);
+
+impl<E> deserr::Deserr<E> for ByteWithDeserr
+where
+    E: deserr::DeserializeError,
+{
+    fn deserialize_from_value<V: deserr::IntoValue>(
+        value: deserr::Value<V>,
+        location: deserr::ValuePointerRef,
+    ) -> Result<Self, E> {
+        use deserr::{ErrorKind, Value, ValueKind};
+        match value {
+            Value::Integer(integer) => Ok(ByteWithDeserr(Byte::from_u64(integer))),
+            Value::String(string) => Byte::from_str(&string).map(ByteWithDeserr).map_err(|e| {
+                deserr::take_cf_content(E::error::<Infallible>(
+                    None,
+                    ErrorKind::Unexpected { msg: e.to_string() },
+                    location,
+                ))
+            }),
+            actual => Err(deserr::take_cf_content(E::error(
+                None,
+                ErrorKind::IncorrectValueKind {
+                    actual,
+                    accepted: &[ValueKind::Integer, ValueKind::String],
+                },
+                location,
+            ))),
+        }
+    }
+}
+
 #[derive(Debug, Deserr, ToSchema, Serialize)]
 #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
 #[serde(rename_all = "camelCase")]

From e6e9a033aa153250b9fe96addb13701d49feccd9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 26 Jun 2025 15:45:24 +0200
Subject: [PATCH 043/150] Introduce new analytics to the export route
---
 crates/meilisearch/src/routes/export.rs | 7 +-
 .../src/routes/export_analytics.rs | 67 +++++++++++++++++++
 crates/meilisearch/src/routes/mod.rs | 1 +
 3 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 crates/meilisearch/src/routes/export_analytics.rs

diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs
index 1c519224c..21a77ae32 100644
--- a/crates/meilisearch/src/routes/export.rs
+++ b/crates/meilisearch/src/routes/export.rs
@@ -22,6 +22,7 @@ use utoipa::{OpenApi, ToSchema};
 use crate::analytics::Analytics;
 use crate::extractors::authentication::policies::ActionPolicy;
 use crate::extractors::authentication::GuardedData;
+use crate::routes::export_analytics::ExportAnalytics;
 use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView};
 use crate::Opt;

@@ -67,7 +68,7 @@ async fn export(
     export: AwebJson<Export, DeserrJsonError>,
     req: HttpRequest,
     opt: web::Data<Opt>,
-    _analytics: Data<Analytics>,
+    analytics: Data<Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
     // TODO make it experimental?
     // index_scheduler.features().check_network("Using the /network route")?;
@@ -75,6 +76,8 @@ async fn export(
     let export = export.into_inner();
     debug!(returns = ?export, "Trigger export");

+    let analytics_aggregate = ExportAnalytics::from_export(&export);
+
     let Export { url, api_key, payload_size, indexes } = export;

     let indexes = if indexes.is_empty() {
@@ -101,6 +104,8 @@ async fn export(
         .await??
        .into();

+    analytics.publish(analytics_aggregate, &req);
+
     Ok(HttpResponse::Ok().json(task))
 }

diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs
new file mode 100644
index 000000000..7299dba8d
--- /dev/null
+++ b/crates/meilisearch/src/routes/export_analytics.rs
@@ -0,0 +1,67 @@
+use crate::analytics::Aggregate;
+use crate::routes::export::Export;
+
+#[derive(Default)]
+pub struct ExportAnalytics {
+    total_received: usize,
+    has_api_key: bool,
+    total_index_patterns: usize,
+    total_patterns_with_filter: usize,
+    payload_sizes: Vec<u64>,
+}
+
+impl ExportAnalytics {
+    pub fn from_export(export: &Export) -> Self {
+        let Export { url: _, api_key, payload_size, indexes } = export;
+
+        let has_api_key = api_key.is_some();
+        let total_index_patterns = indexes.len();
+        let total_patterns_with_filter =
+            indexes.values().filter(|settings| settings.filter.is_some()).count();
+        let payload_sizes =
+            if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size {
+                vec![byte_size.as_u64()]
+            } else {
+                vec![]
+            };
+
+        Self {
+            total_received: 1,
+            has_api_key,
+            total_index_patterns,
+            total_patterns_with_filter,
+            payload_sizes,
+        }
+    }
+}
+
+impl Aggregate for ExportAnalytics {
+    fn event_name(&self) -> &'static str {
+        "Export Triggered"
+    }
+
+    fn aggregate(mut self: Box<Self>, other: Box<Self>) -> Box<Self> {
+        self.total_received += other.total_received;
+        self.has_api_key |= other.has_api_key;
+        self.total_index_patterns += other.total_index_patterns;
+        self.total_patterns_with_filter += other.total_patterns_with_filter;
+        self.payload_sizes.extend(other.payload_sizes);
+        self
+    }
+
+    fn into_event(self: Box<Self>) -> serde_json::Value {
+        let avg_payload_size = if self.payload_sizes.is_empty() {
+            None
+        } else {
+            Some(self.payload_sizes.iter().sum::<u64>() / self.payload_sizes.len() as u64)
+        };
+
+        serde_json::json!({
+            "total_received": self.total_received,
+            "has_api_key": self.has_api_key,
+            "total_index_patterns": self.total_index_patterns,
+            "total_patterns_with_filter": self.total_patterns_with_filter,
+            "avg_payload_size": avg_payload_size,
+        })
+    }
+}
diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs
index 748cd5d83..08583d20f 100644
--- a/crates/meilisearch/src/routes/mod.rs
+++ b/crates/meilisearch/src/routes/mod.rs
@@ -55,6 +55,7 @@ pub mod batches;
 pub mod chats;
 mod dump;
 mod export;
+mod export_analytics;
 pub mod features;
 pub mod indexes;
 mod logs;

From 0bb7866f1e549c8791ac752f90af0dfcbd5fd6a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 26 Jun 2025 15:48:21 +0200
Subject: [PATCH 044/150] Remove the skip embeddings boolean in the settings
---
 crates/meilisearch-types/src/error.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs
index a8f45b4ef..1c2840084 100644
--- a/crates/meilisearch-types/src/error.rs
+++ b/crates/meilisearch-types/src/error.rs
@@ -394,7 +394,6 @@ InvalidExportUrl , InvalidRequest , BAD_REQUEST ;
 InvalidExportApiKey , InvalidRequest , BAD_REQUEST ;
 InvalidExportPayloadSize , InvalidRequest , BAD_REQUEST ;
 InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ;
-InvalidExportIndexSkipEmbeddings , InvalidRequest , BAD_REQUEST ;
 InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ;
 // Experimental features - Chat Completions
 UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ;

From bf13268649343ad2a410ca1411b5dce4f5b0fcf9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 26 Jun 2025 16:03:13 +0200
Subject: [PATCH 045/150] Better compute aggregates
---
 .../src/routes/export_analytics.rs | 32 +++++++++++++------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs
index 7299dba8d..44dba2c9b 100644
--- a/crates/meilisearch/src/routes/export_analytics.rs
+++ b/crates/meilisearch/src/routes/export_analytics.rs
@@ -5,8 +5,8 @@ use crate::routes::export::Export;
 pub struct ExportAnalytics {
     total_received: usize,
     has_api_key: bool,
-    total_index_patterns: usize,
-    total_patterns_with_filter: usize,
+    sum_index_patterns: usize,
+    sum_patterns_with_filter: usize,
     payload_sizes: Vec<u64>,
 }

@@ -15,8 +15,8 @@ impl ExportAnalytics {
         let Export { url: _, api_key, payload_size, indexes } = export;

         let has_api_key = api_key.is_some();
-        let total_index_patterns = indexes.len();
-        let total_patterns_with_filter =
+        let index_patterns_count = indexes.len();
+        let patterns_with_filter_count =
             indexes.values().filter(|settings| settings.filter.is_some()).count();
         let payload_sizes =
             if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size {
@@ -28,8 +28,8 @@ impl ExportAnalytics {
         Self {
             total_received: 1,
             has_api_key,
-            total_index_patterns,
-            total_patterns_with_filter,
+            sum_index_patterns: index_patterns_count,
+            sum_patterns_with_filter: patterns_with_filter_count,
             payload_sizes,
         }
     }
@@ -43,8 +43,8 @@ impl Aggregate for ExportAnalytics {
     fn aggregate(mut self: Box<Self>, other: Box<Self>) -> Box<Self> {
         self.total_received += other.total_received;
         self.has_api_key |= other.has_api_key;
-        self.total_index_patterns += other.total_index_patterns;
-        self.total_patterns_with_filter += other.total_patterns_with_filter;
+        self.sum_index_patterns += other.sum_index_patterns;
+        self.sum_patterns_with_filter += other.sum_patterns_with_filter;
         self.payload_sizes.extend(other.payload_sizes);
         self
     }
@@ -56,11 +56,23 @@ impl Aggregate for ExportAnalytics {
             Some(self.payload_sizes.iter().sum::<u64>() / self.payload_sizes.len() as u64)
         };

+        let avg_index_patterns = if self.total_received == 0 {
+            None
+        } else {
+            Some(self.sum_index_patterns as f64 / self.total_received as f64)
+        };
+
+        let avg_patterns_with_filter = if self.total_received == 0 {
+            None
+        } else {
+            Some(self.sum_patterns_with_filter as f64 / self.total_received as f64)
+        };
+
         serde_json::json!({
             "total_received": self.total_received,
             "has_api_key": self.has_api_key,
-            "total_index_patterns": self.total_index_patterns,
-            "total_patterns_with_filter": self.total_patterns_with_filter,
+            "avg_index_patterns": avg_index_patterns,
+            "avg_patterns_with_filter": avg_patterns_with_filter,
             "avg_payload_size": avg_payload_size,
         })
     }

From e3003c1609fda6e0a2af649b8fc7bd3bff429d74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 26 Jun 2025 16:05:12 +0200
Subject: [PATCH 046/150] Improve OpenAPI schema
---
 crates/meilisearch/src/routes/mod.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs
index 08583d20f..51298411a 100644
--- a/crates/meilisearch/src/routes/mod.rs
+++ b/crates/meilisearch/src/routes/mod.rs
@@ -2,6 +2,7 @@ use std::collections::BTreeMap;

 use actix_web::web::Data;
 use actix_web::{web, HttpRequest, HttpResponse};
+use export::Export;
 use index_scheduler::IndexScheduler;
 use meilisearch_auth::AuthController;
 use
meilisearch_types::batch_view::BatchView; @@ -98,7 +99,7 @@ mod tasks_test; url = "/", description = "Local server", )), - components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures)) + components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, ExportApi, Export)) )] pub struct MeilisearchApi; From b956918c11bd66a02ca9abda1ab905aa178a0ccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 16:31:35 +0200 Subject: [PATCH 047/150] Fix clippy and more utoipa issues --- crates/meilisearch-types/src/tasks.rs | 6 +++--- crates/meilisearch/src/routes/mod.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 508035bb7..3301b4320 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -292,7 +292,7 @@ impl KindWithContent { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - payload_size: payload_size.clone(), + payload_size: *payload_size, indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } @@ -366,7 +366,7 @@ impl KindWithContent { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - payload_size: payload_size.clone(), + payload_size: *payload_size, indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } @@ -422,7 +422,7 @@ impl From<&KindWithContent> for Option
{ Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - payload_size: payload_size.clone(), + payload_size: *payload_size, indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 51298411a..260d973a1 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -99,7 +99,7 @@ mod tasks_test; url = "/", description = "Local server", )), - components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, ExportApi, Export)) + components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, Export)) )] pub struct MeilisearchApi; From 0f1dd3614cc86753ca26dc10ebd2cc659659c55a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 17:51:57 +0200 Subject: [PATCH 048/150] Update tasks tests --- crates/meilisearch/src/routes/tasks_test.rs | 2 +- crates/meilisearch/tests/batches/errors.rs | 2 +- crates/meilisearch/tests/tasks/errors.rs | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/meilisearch/src/routes/tasks_test.rs b/crates/meilisearch/src/routes/tasks_test.rs index a17b80c82..b09eb0fb3 100644 --- a/crates/meilisearch/src/routes/tasks_test.rs +++ b/crates/meilisearch/src/routes/tasks_test.rs @@ -228,7 +228,7 @@ mod tests { let err = deserr_query_params::(params).unwrap_err(); snapshot!(meili_snap::json_string!(err), @r#" { - "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. 
Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" diff --git a/crates/meilisearch/tests/batches/errors.rs b/crates/meilisearch/tests/batches/errors.rs index 7f5fedb6a..bfc0d9251 100644 --- a/crates/meilisearch/tests/batches/errors.rs +++ b/crates/meilisearch/tests/batches/errors.rs @@ -42,7 +42,7 @@ async fn batch_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" diff --git a/crates/meilisearch/tests/tasks/errors.rs b/crates/meilisearch/tests/tasks/errors.rs index 759531d42..9970bafa4 100644 --- a/crates/meilisearch/tests/tasks/errors.rs +++ b/crates/meilisearch/tests/tasks/errors.rs @@ -97,7 +97,7 @@ async fn task_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" @@ -108,7 +108,7 @@ async fn task_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. 
Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" @@ -119,7 +119,7 @@ async fn task_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" From 82fe80b360ffce8e6b5ba6fa38f6c694411b2cce Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Jun 2025 11:48:58 +0200 Subject: [PATCH 049/150] Replace the legacy Settings::execute by the new one --- crates/benchmarks/benches/indexing.rs | 2 +- crates/benchmarks/benches/utils.rs | 2 +- .../src/scheduler/process_batch.rs | 6 +- .../src/scheduler/process_index_operation.rs | 10 +- crates/meilisearch/src/lib.rs | 10 +- .../milli/src/search/new/tests/integration.rs | 2 +- crates/milli/src/test_index.rs | 2 +- crates/milli/src/update/settings.rs | 120 +++++++++++++++++- crates/milli/tests/search/distinct.rs | 3 +- .../milli/tests/search/facet_distribution.rs | 2 +- crates/milli/tests/search/mod.rs | 2 +- crates/milli/tests/search/phrase_search.rs | 3 +- crates/milli/tests/search/query_criteria.rs | 6 +- crates/milli/tests/search/typo_tolerance.rs | 8 +- 14 files changed, 137 insertions(+), 41 deletions(-) diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 3afad8ee5..610fa4a00 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -65,7 +65,7 @@ fn setup_settings<'t>( let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); builder.set_sortable_fields(sortable_fields); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); } fn setup_index_with_settings( diff --git a/crates/benchmarks/benches/utils.rs b/crates/benchmarks/benches/utils.rs index 32e844a0b..2cacc5477 100644 --- a/crates/benchmarks/benches/utils.rs +++ b/crates/benchmarks/benches/utils.rs @@ -90,7 +90,7 @@ pub fn base_setup(conf: &Conf) -> Index { (conf.configure)(&mut builder); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, 
&Progress::default()).unwrap(); wtxn.commit().unwrap(); let config = IndexerConfig::default(); diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 5261692b6..237608648 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -245,11 +245,7 @@ impl IndexScheduler { let must_stop_processing = self.scheduler.must_stop_processing.clone(); builder - .execute( - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.get(), - current_batch.embedder_stats.clone(), - ) + .execute(&|| must_stop_processing.get(), &progress) .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; index_wtxn.commit()?; } diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs index 4c0db9ce4..c302d6983 100644 --- a/crates/index-scheduler/src/scheduler/process_index_operation.rs +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -474,15 +474,11 @@ impl IndexScheduler { } progress.update_progress(SettingsProgress::ApplyTheSettings); - builder - .execute( - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.get(), - embedder_stats, - ) + let congestion = builder + .execute(&|| must_stop_processing.get(), progress) .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; - Ok((tasks, None)) + Ok((tasks, congestion)) } IndexOperation::DocumentClearAndSetting { index_uid, diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index c902f4e60..4bfce17f8 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -37,7 +37,7 @@ use index_scheduler::{IndexScheduler, IndexSchedulerOptions}; use meilisearch_auth::{open_auth_store_env, AuthController}; use meilisearch_types::milli::constants::VERSION_MAJOR; use meilisearch_types::milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; -use meilisearch_types::milli::progress::EmbedderStats; +use meilisearch_types::milli::progress::Progress; use meilisearch_types::milli::update::{ default_thread_pool_and_threads, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, }; @@ -464,6 +464,7 @@ fn import_dump( index_scheduler: &mut IndexScheduler, auth: &mut AuthController, ) -> Result<(), anyhow::Error> { + let progress = Progress::default(); let reader = File::open(dump_path)?; let mut dump_reader = dump::DumpReader::open(reader)?; @@ -543,12 +544,7 @@ fn import_dump( tracing::info!("Importing the settings."); let settings = index_reader.settings()?; apply_settings_to_builder(&settings, &mut builder); - let embedder_stats: Arc = Default::default(); - builder.execute( - |indexing_step| tracing::debug!("update: {:?}", indexing_step), - || false, - embedder_stats.clone(), - )?; + builder.execute(&|| false, &progress)?; // 4.3 Import the documents. // 4.3.1 We need to recreate the grenad+obkv format accepted by the index. 
diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index 36917c10e..700a527ac 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -44,7 +44,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("america") => vec![S("the united states")], }); builder.set_searchable_fields(vec![S("title"), S("description")]); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); wtxn.commit().unwrap(); // index documents diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index d218bb3a6..03bef5838 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -135,7 +135,7 @@ impl TempIndex { ) -> Result<(), crate::error::Error> { let mut builder = update::Settings::new(wtxn, &self.inner, &self.indexer_config); update(&mut builder); - builder.execute(drop, || false, Default::default())?; + builder.execute(&|| false, &Progress::default())?; Ok(()) } diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 05dbb4784..3e08f8d75 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -27,17 +27,21 @@ use crate::index::{ DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, }; use crate::order_by_map::OrderByMap; +use crate::progress::Progress; use crate::progress::EmbedderStats; use crate::prompt::{default_max_bytes, default_template_text, PromptData}; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; +use crate::update::new::indexer::reindex; use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::vector::settings::{ EmbedderAction, EmbedderSource, EmbeddingSettings, NestingContext, ReindexAction, SubEmbeddingSettings, WriteBackToDocuments, }; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; -use crate::{FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result}; +use crate::{ + ChannelCongestion, FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result, +}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum Setting { @@ -1358,12 +1362,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } } - pub fn execute( - mut self, - progress_callback: FP, - should_abort: FA, - embedder_stats: Arc, - ) -> Result<()> + pub fn legacy_execute(mut self, progress_callback: FP, should_abort: FA) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, @@ -1426,6 +1425,106 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(()) } + + pub fn execute<'indexer, MSP>( + mut self, + must_stop_processing: &'indexer MSP, + progress: &'indexer Progress, + ) -> Result> + where + MSP: Fn() -> bool + Sync, + { + // force the old indexer if the environment says so + if std::env::var_os("MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS").is_some() { + return self + .legacy_execute( + |indexing_step| tracing::debug!(update = ?indexing_step), + must_stop_processing, + ) + .map(|_| None); + } + + // only use the new indexer when only the embedder possibly changed + if let Self { + searchable_fields: Setting::NotSet, + displayed_fields: Setting::NotSet, + filterable_fields: Setting::NotSet, + sortable_fields: Setting::NotSet, + criteria: Setting::NotSet, + stop_words: Setting::NotSet, + non_separator_tokens: Setting::NotSet, + separator_tokens: Setting::NotSet, + 
dictionary: Setting::NotSet, + distinct_field: Setting::NotSet, + synonyms: Setting::NotSet, + primary_key: Setting::NotSet, + authorize_typos: Setting::NotSet, + min_word_len_two_typos: Setting::NotSet, + min_word_len_one_typo: Setting::NotSet, + exact_words: Setting::NotSet, + exact_attributes: Setting::NotSet, + max_values_per_facet: Setting::NotSet, + sort_facet_values_by: Setting::NotSet, + pagination_max_total_hits: Setting::NotSet, + proximity_precision: Setting::NotSet, + embedder_settings: _, + search_cutoff: Setting::NotSet, + localized_attributes_rules: Setting::NotSet, + prefix_search: Setting::NotSet, + facet_search: Setting::NotSet, + disable_on_numbers: Setting::NotSet, + chat: Setting::NotSet, + wtxn: _, + index: _, + indexer_config: _, // TODO: this is not used + } = &self + { + self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; + + let old_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?; + + // Update index settings + let embedding_config_updates = self.update_embedding_configs()?; + + let mut new_inner_settings = + InnerIndexSettings::from_index(self.index, self.wtxn, None)?; + new_inner_settings.recompute_searchables(self.wtxn, self.index)?; + + let primary_key_id = self + .index + .primary_key(self.wtxn)? + .and_then(|name| new_inner_settings.fields_ids_map.id(name)); + let settings_update_only = true; + let inner_settings_diff = InnerIndexSettingsDiff::new( + old_inner_settings, + new_inner_settings, + primary_key_id, + embedding_config_updates, + settings_update_only, + ); + + if self.index.number_of_documents(self.wtxn)? > 0 { + reindex( + self.wtxn, + self.index, + &self.indexer_config.thread_pool, + self.indexer_config.grenad_parameters(), + &inner_settings_diff, + must_stop_processing, + progress, + ) + .map(Some) + } else { + Ok(None) + } + } else { + self.legacy_execute( + |indexing_step| tracing::debug!(update = ?indexing_step), + must_stop_processing, + ) + .map(|_| None) + } + } } pub struct InnerIndexSettingsDiff { @@ -1685,6 +1784,7 @@ pub(crate) struct InnerIndexSettings { pub disabled_typos_terms: DisabledTyposTerms, pub proximity_precision: ProximityPrecision, pub embedding_configs: EmbeddingConfigs, + pub embedder_category_id: HashMap, pub geo_fields_ids: Option<(FieldId, FieldId)>, pub prefix_search: PrefixSearch, pub facet_search: bool, @@ -1707,6 +1807,11 @@ impl InnerIndexSettings { Some(embedding_configs) => embedding_configs, None => embedders(index.embedding_configs(rtxn)?)?, }; + let embedder_category_id = index + .embedder_category_id + .iter(rtxn)? + .map(|r| r.map(|(k, v)| (k.to_string(), v))) + .collect::>()?; let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default(); let facet_search = index.facet_search(rtxn)?; let geo_fields_ids = match fields_ids_map.id(RESERVED_GEO_FIELD_NAME) { @@ -1746,6 +1851,7 @@ impl InnerIndexSettings { exact_attributes, proximity_precision, embedding_configs, + embedder_category_id, geo_fields_ids, prefix_search, facet_search, diff --git a/crates/milli/tests/search/distinct.rs b/crates/milli/tests/search/distinct.rs index 15fcf70a2..c22755751 100644 --- a/crates/milli/tests/search/distinct.rs +++ b/crates/milli/tests/search/distinct.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use big_s::S; +use milli::progress::Progress; use milli::update::Settings; use milli::{Criterion, Search, SearchResult, TermsMatchingStrategy}; use Criterion::*; @@ -19,7 +20,7 @@ macro_rules! 
test_distinct { let config = milli::update::IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_distinct_field(S(stringify!($distinct))); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index 8548f0d01..ff939ec47 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -25,7 +25,7 @@ fn test_facet_distribution_with_no_facet_values() { FilterableAttributesRule::Field(S("genres")), FilterableAttributesRule::Field(S("tags")), ]); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); wtxn.commit().unwrap(); // index documents diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 4098af736..0515ece66 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -63,7 +63,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("america") => vec![S("the united states")], }); builder.set_searchable_fields(vec![S("title"), S("description")]); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); wtxn.commit().unwrap(); // index documents diff --git a/crates/milli/tests/search/phrase_search.rs b/crates/milli/tests/search/phrase_search.rs index 180fcd176..da519c6f6 100644 --- a/crates/milli/tests/search/phrase_search.rs +++ b/crates/milli/tests/search/phrase_search.rs @@ -1,3 +1,4 @@ +use milli::progress::Progress; use milli::update::{IndexerConfig, Settings}; use milli::{Criterion, Index, Search, TermsMatchingStrategy}; @@ -10,7 +11,7 @@ fn set_stop_words(index: &Index, stop_words: &[&str]) { let mut builder = Settings::new(&mut wtxn, index, &config); let stop_words = stop_words.iter().map(|s| s.to_string()).collect(); builder.set_stop_words(stop_words); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); wtxn.commit().unwrap(); } diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index b72978330..113c8bc03 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -236,7 +236,7 @@ fn criteria_mixup() { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.clone()); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -276,7 +276,7 @@ fn criteria_ascdesc() { S("name"), S("age"), }); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); wtxn.commit().unwrap(); let mut wtxn = index.write_txn().unwrap(); @@ -359,7 +359,7 @@ fn criteria_ascdesc() { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(vec![criterion.clone()]); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); wtxn.commit().unwrap(); let rtxn = 
index.read_txn().unwrap(); diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs index 9aacbf82a..f8e688215 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -46,7 +46,7 @@ fn test_typo_tolerance_one_typo() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); builder.set_min_word_len_one_typo(4); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); @@ -92,7 +92,7 @@ fn test_typo_tolerance_two_typo() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); builder.set_min_word_len_two_typos(7); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); @@ -181,7 +181,7 @@ fn test_typo_disabled_on_word() { // `zealand` doesn't allow typos anymore exact_words.insert("zealand".to_string()); builder.set_exact_words(exact_words); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); let mut search = Search::new(&txn, &index); search.query("zealand"); @@ -219,7 +219,7 @@ fn test_disable_typo_on_attribute() { let mut builder = Settings::new(&mut txn, &index, &config); // disable typos on `description` builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); - builder.execute(|_| (), || false, Default::default()).unwrap(); + builder.execute(&|| false, &Progress::default()).unwrap(); let mut search = Search::new(&txn, &index); search.query("antebelum"); From e704f4d1ec0a68cb113c1240739bd977a086f421 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Jun 2025 14:00:00 +0200 Subject: [PATCH 050/150] Reimplement reindexing shell --- .../milli/src/update/new/indexer/extract.rs | 27 ++ crates/milli/src/update/new/indexer/mod.rs | 268 +++++++++++++++--- crates/milli/src/update/settings.rs | 32 +++ 3 files changed, 282 insertions(+), 45 deletions(-) diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index 97ffc8624..6f2479373 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -12,6 +12,8 @@ use super::super::steps::IndexingStep; use super::super::thread_local::{FullySend, ThreadLocal}; use super::super::FacetFieldIdsDelta; use super::document_changes::{extract, DocumentChanges, IndexingContext}; +use crate::documents::FieldIdMapper; +use crate::documents::PrimaryKey; use crate::index::IndexEmbeddingConfig; use crate::progress::EmbedderStats; use crate::progress::MergingWordCache; @@ -19,7 +21,10 @@ use crate::proximity::ProximityPrecision; use crate::update::new::extract::EmbeddingExtractor; use crate::update::new::merger::merge_and_send_rtree; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; +use crate::update::settings::SettingsDelta; use crate::vector::EmbeddingConfigs; +use crate::Index; +use crate::InternalError; use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; #[allow(clippy::too_many_arguments)] @@ -315,6 +320,28 @@ where Result::Ok((facet_field_ids_delta, index_embeddings)) } +pub(super) fn 
extract_all_settings_changes<'extractor, MSP, SD>(
+    indexing_context: IndexingContext<MSP>,
+    indexer_span: Span,
+    extractor_sender: ExtractorBbqueueSender,
+    settings_delta: &SD,
+    extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+    finished_extraction: &AtomicBool,
+    field_distribution: &mut BTreeMap<String, u64>,
+    mut index_embeddings: Vec<IndexEmbeddingConfig>,
+    modified_docids: &mut RoaringBitmap,
+) -> Result<Vec<IndexEmbeddingConfig>>
+where
+    MSP: Fn() -> bool + Sync,
+    SD: SettingsDelta,
+{
+
+    indexing_context.progress.update_progress(IndexingStep::WaitingForDatabaseWrites);
+    finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed);
+
+    Result::Ok(index_embeddings)
+}
+
 fn request_threads() -> &'static ThreadPoolNoAbort {
     static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
 
diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs
index bb6ba0102..ccfe20617 100644
--- a/crates/milli/src/update/new/indexer/mod.rs
+++ b/crates/milli/src/update/new/indexer/mod.rs
@@ -1,3 +1,4 @@
+use std::collections::BTreeMap;
 use std::sync::atomic::AtomicBool;
 use std::sync::{Once, RwLock};
 use std::thread::{self, Builder};
@@ -19,9 +20,11 @@ use super::steps::IndexingStep;
 use super::thread_local::ThreadLocal;
 use crate::documents::PrimaryKey;
 use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
+use crate::update::settings::SettingsDelta;
 use crate::progress::{EmbedderStats, Progress};
 use crate::update::GrenadParameters;
-use crate::vector::{ArroyWrapper, EmbeddingConfigs};
+use crate::vector::settings::{EmbedderAction, WriteBackToDocuments};
+use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs};
 use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};
 
 pub(crate) mod de;
@@ -32,6 +35,7 @@ mod extract;
 mod guess_primary_key;
 mod partial_dump;
 mod post_processing;
+pub mod settings_changes;
 mod update_by_function;
 mod write;
 
@@ -40,8 +44,6 @@ static LOG_MEMORY_METRICS_ONCE: Once = Once::new();
 /// This is the main function of this crate.
 ///
 /// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
-///
-/// TODO return stats
 #[allow(clippy::too_many_arguments)] // clippy: 😝
 pub fn index<'pl, 'indexer, 'index, DC, MSP>(
     wtxn: &mut RwTxn,
@@ -66,48 +68,8 @@ where
 
     let arroy_memory = grenad_parameters.max_memory;
 
-    // We reduce the actual memory used to 5%. The reason we do this here and not in Meilisearch
-    // is because we still use the old indexer for the settings and it is highly impacted by the
-    // max memory. So we keep the changes here and will remove these changes once we use the new
-    // indexer to also index settings. Related to #5125 and #5141.
- let grenad_parameters = GrenadParameters { - max_memory: grenad_parameters.max_memory.map(|mm| mm * 5 / 100), - ..grenad_parameters - }; - - // 5% percent of the allocated memory for the extractors, or min 100MiB - // 5% percent of the allocated memory for the bbqueues, or min 50MiB - // - // Minimum capacity for bbqueues - let minimum_total_bbbuffer_capacity = 50 * 1024 * 1024 * pool.current_num_threads(); // 50 MiB - let minimum_total_extractors_capacity = minimum_total_bbbuffer_capacity * 2; - - let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or( - ( - GrenadParameters { - max_memory: Some(minimum_total_extractors_capacity), - ..grenad_parameters - }, - minimum_total_bbbuffer_capacity, - ), // 100 MiB by thread by default - |max_memory| { - let total_bbbuffer_capacity = max_memory.max(minimum_total_bbbuffer_capacity); - let new_grenad_parameters = GrenadParameters { - max_memory: Some(max_memory.max(minimum_total_extractors_capacity)), - ..grenad_parameters - }; - (new_grenad_parameters, total_bbbuffer_capacity) - }, - ); - - LOG_MEMORY_METRICS_ONCE.call_once(|| { - tracing::debug!( - "Indexation allocated memory metrics - \ - Total BBQueue size: {total_bbbuffer_capacity}, \ - Total extractor memory: {:?}", - grenad_parameters.max_memory, - ); - }); + let (grenad_parameters, total_bbbuffer_capacity) = + indexer_memory_settings(pool.current_num_threads(), grenad_parameters); let (extractor_sender, writer_receiver) = pool .install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000)) @@ -241,3 +203,219 @@ where Ok(congestion) } + +#[allow(clippy::too_many_arguments)] // clippy: 😝 +pub fn reindex<'pl, 'indexer, 'index, MSP, SD>( + wtxn: &mut RwTxn<'index>, + index: &'index Index, + pool: &ThreadPoolNoAbort, + grenad_parameters: GrenadParameters, + settings_delta: &'indexer SD, + must_stop_processing: &'indexer MSP, + progress: &'indexer Progress, +) -> Result +where + MSP: Fn() -> bool + Sync, + SD: SettingsDelta + Sync, +{ + let mut bbbuffers = Vec::new(); + let finished_extraction = AtomicBool::new(false); + + let arroy_memory = grenad_parameters.max_memory; + + let (grenad_parameters, total_bbbuffer_capacity) = + indexer_memory_settings(pool.current_num_threads(), grenad_parameters); + + let (extractor_sender, writer_receiver) = pool + .install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000)) + .unwrap(); + + let mut extractor_allocs = ThreadLocal::with_capacity(rayon::current_num_threads()); + + let db_fields_ids_map = index.fields_ids_map(wtxn)?; + let new_fields_ids_map = settings_delta.new_fields_ids_map().clone(); + let new_fields_ids_map = RwLock::new(new_fields_ids_map); + let fields_ids_map_store = ThreadLocal::with_capacity(rayon::current_num_threads()); + let doc_allocs = ThreadLocal::with_capacity(rayon::current_num_threads()); + + let indexing_context = IndexingContext { + index, + db_fields_ids_map: &db_fields_ids_map, + new_fields_ids_map: &new_fields_ids_map, + doc_allocs: &doc_allocs, + fields_ids_map_store: &fields_ids_map_store, + must_stop_processing, + progress, + grenad_parameters: &grenad_parameters, + }; + + let index_embeddings = index.embedding_configs(wtxn)?; + let mut field_distribution = index.field_distribution(wtxn)?; + let mut modified_docids = roaring::RoaringBitmap::new(); + + let congestion = thread::scope(|s| -> Result { + let indexer_span = tracing::Span::current(); + let finished_extraction = &finished_extraction; + // prevent moving the field_distribution and 
document_ids in the inner closure... + let field_distribution = &mut field_distribution; + let modified_docids = &mut modified_docids; + let extractor_handle = + Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { + pool.install(move || { + extract::extract_all_settings_changes( + indexing_context, + indexer_span, + extractor_sender, + settings_delta, + &mut extractor_allocs, + finished_extraction, + field_distribution, + index_embeddings, + modified_docids, + ) + }) + .unwrap() + })?; + + let new_embedders = settings_delta.new_embedders(); + let embedder_actions = settings_delta.embedder_actions(); + let index_embedder_category_ids = settings_delta.new_embedder_category_id(); + let mut arroy_writers = arroy_writers_from_embedder_actions( + index, + embedder_actions, + new_embedders, + index_embedder_category_ids, + )?; + + let congestion = + write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?; + + indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors); + + let index_embeddings = extractor_handle.join().unwrap()?; + + indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase); + + pool.install(|| { + build_vectors( + index, + wtxn, + indexing_context.progress, + index_embeddings, + arroy_memory, + &mut arroy_writers, + Some(&embedder_actions), + &indexing_context.must_stop_processing, + ) + }) + .unwrap()?; + + indexing_context.progress.update_progress(IndexingStep::Finalizing); + + Ok(congestion) as Result<_> + })?; + + // required to into_inner the new_fields_ids_map + drop(fields_ids_map_store); + + let new_fields_ids_map = new_fields_ids_map.into_inner().unwrap(); + let document_ids = index.documents_ids(wtxn)?; + update_index( + index, + wtxn, + new_fields_ids_map, + None, + settings_delta.new_embedders().clone(), + field_distribution, + document_ids, + )?; + + Ok(congestion) +} + +fn arroy_writers_from_embedder_actions<'indexer, 'index>( + index: &'index Index, + embedder_actions: &'indexer BTreeMap, + embedders: &'indexer EmbeddingConfigs, + index_embedder_category_ids: &'indexer std::collections::HashMap, +) -> Result> { + let vector_arroy = index.vector_arroy; + + embedders + .inner_as_ref() + .iter() + .filter_map(|(embedder_name, (embedder, _, _))| match embedder_actions.get(embedder_name) { + None => None, + Some(action) if action.write_back().is_some() => None, + Some(action) => { + let Some(&embedder_category_id) = index_embedder_category_ids.get(embedder_name) + else { + return Some(Err(crate::error::Error::InternalError( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + ))); + }; + let writer = + ArroyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized); + let dimensions = embedder.dimensions(); + Some(Ok(( + embedder_category_id, + (embedder_name.as_str(), embedder.as_ref(), writer, dimensions), + ))) + } + }) + .collect() +} + +fn indexer_memory_settings( + current_num_threads: usize, + grenad_parameters: GrenadParameters, +) -> (GrenadParameters, usize) { + // We reduce the actual memory used to 5%. The reason we do this here and not in Meilisearch + // is because we still use the old indexer for the settings and it is highly impacted by the + // max memory. So we keep the changes here and will remove these changes once we use the new + // indexer to also index settings. Related to #5125 and #5141. 
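+    //
+    // The computation below then enforces per-thread floors: at least 50 MiB of
+    // bbqueue capacity per thread and twice that for the extractors, so that very
+    // small `max_memory` budgets still leave the pipeline operational.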
+    let grenad_parameters = GrenadParameters {
+        max_memory: grenad_parameters.max_memory.map(|mm| mm * 5 / 100),
+        ..grenad_parameters
+    };
+
+    // 5% of the allocated memory for the extractors, or min 100MiB
+    // 5% of the allocated memory for the bbqueues, or min 50MiB
+    //
+    // Minimum capacity for bbqueues
+    let minimum_total_bbbuffer_capacity = 50 * 1024 * 1024 * current_num_threads; // 50 MiB
+    let minimum_total_extractors_capacity = minimum_total_bbbuffer_capacity * 2;
+
+    let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or(
+        (
+            GrenadParameters {
+                max_memory: Some(minimum_total_extractors_capacity),
+                ..grenad_parameters
+            },
+            minimum_total_bbbuffer_capacity,
+        ), // 100 MiB per thread by default
+        |max_memory| {
+            let total_bbbuffer_capacity = max_memory.max(minimum_total_bbbuffer_capacity);
+            let new_grenad_parameters = GrenadParameters {
+                max_memory: Some(max_memory.max(minimum_total_extractors_capacity)),
+                ..grenad_parameters
+            };
+            (new_grenad_parameters, total_bbbuffer_capacity)
+        },
+    );
+
+    LOG_MEMORY_METRICS_ONCE.call_once(|| {
+        tracing::debug!(
+            "Indexation allocated memory metrics - \
+            Total BBQueue size: {total_bbbuffer_capacity}, \
+            Total extractor memory: {:?}",
+            grenad_parameters.max_memory,
+        );
+    });
+
+    (grenad_parameters, total_bbbuffer_capacity)
+}
diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs
index 3e08f8d75..d21afdd28 100644
--- a/crates/milli/src/update/settings.rs
+++ b/crates/milli/src/update/settings.rs
@@ -2221,6 +2221,38 @@ fn deserialize_sub_embedder(
     }
 }
 
+/// Implement this trait for the settings delta type.
+/// This is used in the new settings update flow and will make it easy to replace the old settings delta type: `InnerIndexSettingsDiff`.
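+///
+/// Implementors expose everything a reindex needs after a settings change: the
+/// old and new embedder configurations, the embedder actions to apply, the new
+/// embedder category ids, and the new fields-ids map.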
+pub trait SettingsDelta {
+    fn new_embedders(&self) -> &EmbeddingConfigs;
+    fn old_embedders(&self) -> &EmbeddingConfigs;
+    fn new_embedder_category_id(&self) -> &HashMap<String, u8>;
+    fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction>;
+    fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
+}
+
+impl SettingsDelta for InnerIndexSettingsDiff {
+    fn new_embedders(&self) -> &EmbeddingConfigs {
+        &self.new.embedding_configs
+    }
+
+    fn old_embedders(&self) -> &EmbeddingConfigs {
+        &self.old.embedding_configs
+    }
+
+    fn new_embedder_category_id(&self) -> &HashMap<String, u8> {
+        &self.new.embedder_category_id
+    }
+
+    fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction> {
+        &self.embedding_config_updates
+    }
+
+    fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
+        &self.new.fields_ids_map
+    }
+}
+
 #[cfg(test)]
 #[path = "test_settings.rs"]
 mod tests;
From 510a4b91be80dffe3f52a4b2e88070774a30431f Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Wed, 25 Jun 2025 14:37:24 +0200
Subject: [PATCH 051/150] Introduce DatabaseDocument type

---
 .../milli/src/update/new/document_change.rs   | 74 +++++++++++--------
 .../update/new/indexer/document_deletion.rs   |  4 +-
 .../update/new/indexer/document_operation.rs  |  4 +-
 .../milli/src/update/new/indexer/extract.rs   | 26 +++++++
 .../update/new/indexer/settings_changes.rs    | 55 ++++++++++++++
 .../update/new/indexer/update_by_function.rs  |  9 +--
 crates/milli/src/update/new/mod.rs            |  2 +-
 7 files changed, 133 insertions(+), 41 deletions(-)
 create mode 100644 crates/milli/src/update/new/indexer/settings_changes.rs

diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs
index 8a8ac4bb3..2ff96fd24 100644
--- a/crates/milli/src/update/new/document_change.rs
+++ b/crates/milli/src/update/new/document_change.rs
@@ -14,16 +14,11 @@ use crate::vector::EmbeddingConfigs;
 use crate::{DocumentId, Index, InternalError, Result};
 
 pub enum DocumentChange<'doc> {
-    Deletion(Deletion<'doc>),
+    Deletion(DatabaseDocument<'doc>),
     Update(Update<'doc>),
     Insertion(Insertion<'doc>),
 }
 
-pub struct Deletion<'doc> {
-    docid: DocumentId,
-    external_document_id: &'doc str,
-}
-
 pub struct Update<'doc> {
     docid: DocumentId,
     external_document_id: &'doc str,
@@ -37,6 +32,11 @@ pub struct Insertion<'doc> {
     new: Versions<'doc>,
 }
 
+pub struct DatabaseDocument<'doc> {
+    docid: DocumentId,
+    external_document_id: &'doc str,
+}
+
 impl<'doc> DocumentChange<'doc> {
     pub fn docid(&self) -> DocumentId {
         match &self {
@@ -55,31 +55,6 @@ impl<'doc> DocumentChange<'doc> {
     }
 }
 
-impl<'doc> Deletion<'doc> {
-    pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self {
-        Self { docid, external_document_id }
-    }
-
-    pub fn docid(&self) -> DocumentId {
-        self.docid
-    }
-
-    pub fn external_document_id(&self) -> &'doc str {
-        self.external_document_id
-    }
-
-    pub fn current<'a, Mapper: FieldIdMapper>(
-        &self,
-        rtxn: &'a RoTxn,
-        index: &'a Index,
-        mapper: &'a Mapper,
-    ) -> Result<DocumentFromDb<'a, Mapper>> {
-        Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
-            crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
-        )?)
- } -} - impl<'doc> Insertion<'doc> { pub fn create(docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>) -> Self { Insertion { docid, external_document_id, new } @@ -304,3 +279,40 @@ impl<'doc> Update<'doc> { } } } + +impl<'doc> DatabaseDocument<'doc> { + pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self { + Self { docid, external_document_id } + } + + pub fn docid(&self) -> DocumentId { + self.docid + } + + pub fn external_document_id(&self) -> &'doc str { + self.external_document_id + } + + pub fn current<'a, Mapper: FieldIdMapper>( + &self, + rtxn: &'a RoTxn, + index: &'a Index, + mapper: &'a Mapper, + ) -> Result> { + Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or( + crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid }, + )?) + } + + pub fn current_vectors<'a, Mapper: FieldIdMapper>( + &self, + rtxn: &'a RoTxn, + index: &'a Index, + mapper: &'a Mapper, + doc_alloc: &'a Bump, + ) -> Result> { + Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or( + crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid }, + )?) + } +} diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs index c4a72a2a1..114ce0a69 100644 --- a/crates/milli/src/update/new/indexer/document_deletion.rs +++ b/crates/milli/src/update/new/indexer/document_deletion.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::document_changes::{DocumentChangeContext, DocumentChanges}; use crate::documents::PrimaryKey; use crate::update::new::thread_local::MostlySend; -use crate::update::new::{Deletion, DocumentChange}; +use crate::update::new::{DatabaseDocument, DocumentChange}; use crate::{DocumentId, Result}; #[derive(Default)] @@ -74,7 +74,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { let external_document_id = external_document_id.to_bump(&context.doc_alloc); - Ok(Some(DocumentChange::Deletion(Deletion::create(*docid, external_document_id)))) + Ok(Some(DocumentChange::Deletion(DatabaseDocument::create(*docid, external_document_id)))) } fn len(&self) -> usize { diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index ca433c043..70dc5f35c 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -19,7 +19,7 @@ use crate::progress::{AtomicPayloadStep, Progress}; use crate::update::new::document::Versions; use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::MostlySend; -use crate::update::new::{Deletion, Insertion, Update}; +use crate::update::new::{DatabaseDocument, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, InternalError, Result, UserError}; @@ -577,7 +577,7 @@ impl<'pl> PayloadOperations<'pl> { if self.is_new { Ok(None) } else { - let deletion = Deletion::create(self.docid, external_doc); + let deletion = DatabaseDocument::create(self.docid, external_doc); Ok(Some(DocumentChange::Deletion(deletion))) } } diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index 6f2479373..c79c23c1c 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -19,6 +19,7 @@ use crate::progress::EmbedderStats; use 
crate::progress::MergingWordCache; use crate::proximity::ProximityPrecision; use crate::update::new::extract::EmbeddingExtractor; +use crate::update::new::indexer::settings_changes::DatabaseDocuments; use crate::update::new::merger::merge_and_send_rtree; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::SettingsDelta; @@ -335,6 +336,13 @@ where MSP: Fn() -> bool + Sync, SD: SettingsDelta, { + // Create the list of document ids to extract + let rtxn = indexing_context.index.read_txn()?; + let all_document_ids = + indexing_context.index.documents_ids(&rtxn)?.into_iter().collect::>(); + let primary_key = + primary_key_from_db(&indexing_context.index, &rtxn, &indexing_context.db_fields_ids_map)?; + let documents = DatabaseDocuments::new(&all_document_ids, primary_key); indexing_context.progress.update_progress(IndexingStep::WaitingForDatabaseWrites); finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); @@ -342,6 +350,24 @@ where Result::Ok(index_embeddings) } +fn primary_key_from_db<'indexer, 'index>( + index: &'indexer Index, + rtxn: &'indexer heed::RoTxn<'index>, + fields: &'indexer impl FieldIdMapper, +) -> Result> { + let Some(primary_key) = index.primary_key(rtxn)? else { + return Err(InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::MAIN, + key: Some(crate::index::main_key::PRIMARY_KEY_KEY), + } + .into()); + }; + let Some(primary_key) = PrimaryKey::new(primary_key, fields) else { + unreachable!("Primary key must exist at this point"); + }; + Ok(primary_key) +} + fn request_threads() -> &'static ThreadPoolNoAbort { static REQUEST_THREADS: OnceLock = OnceLock::new(); diff --git a/crates/milli/src/update/new/indexer/settings_changes.rs b/crates/milli/src/update/new/indexer/settings_changes.rs new file mode 100644 index 000000000..2e3d9c917 --- /dev/null +++ b/crates/milli/src/update/new/indexer/settings_changes.rs @@ -0,0 +1,55 @@ +use std::sync::atomic::Ordering; +use std::sync::Arc; + +use bumpalo::Bump; +use rayon::iter::IndexedParallelIterator; +use rayon::slice::ParallelSlice; + +use super::document_changes::IndexingContext; +use crate::documents::PrimaryKey; +use crate::progress::AtomicDocumentStep; +use crate::update::new::document_change::DatabaseDocument; +use crate::update::new::indexer::document_changes::DocumentChangeContext; +use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; +use crate::update::new::steps::IndexingStep; +use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; +use crate::{DocumentId, InternalError, Result}; +pub struct DatabaseDocuments<'indexer> { + documents: &'indexer [DocumentId], + primary_key: PrimaryKey<'indexer>, +} + +impl<'indexer> DatabaseDocuments<'indexer> { + pub fn new(documents: &'indexer [DocumentId], primary_key: PrimaryKey<'indexer>) -> Self { + Self { documents, primary_key } + } + + fn iter(&self, chunk_size: usize) -> impl IndexedParallelIterator { + self.documents.par_chunks(chunk_size) + } + + fn item_to_database_document< + 'doc, // lifetime of a single `process` call + T: MostlySend, + >( + &'doc self, + context: &'doc DocumentChangeContext, + docid: &'doc DocumentId, + ) -> Result>> { + let current = context.index.document(&context.rtxn, *docid)?; + + let external_document_id = self.primary_key.extract_docid_from_db( + current, + &context.db_fields_ids_map, + &context.doc_alloc, + )?; + + let external_document_id = external_document_id.to_bump(&context.doc_alloc); + + 
Ok(Some(DatabaseDocument::create(*docid, external_document_id))) + } + + fn len(&self) -> usize { + self.documents.len() + } +} diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index 3001648e6..694645d28 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -13,7 +13,7 @@ use crate::error::{FieldIdMapMissingEntry, InternalError}; use crate::update::new::document::Versions; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::thread_local::MostlySend; -use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update}; +use crate::update::new::{DatabaseDocument, DocumentChange, KvReaderFieldId, Update}; use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; pub struct UpdateByFunction { @@ -128,10 +128,9 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { match scope.remove::("doc") { // If the "doc" variable has been set to (), we effectively delete the document. - Some(doc) if doc.is_unit() => Ok(Some(DocumentChange::Deletion(Deletion::create( - docid, - doc_alloc.alloc_str(&document_id), - )))), + Some(doc) if doc.is_unit() => Ok(Some(DocumentChange::Deletion( + DatabaseDocument::create(docid, doc_alloc.alloc_str(&document_id)), + ))), None => unreachable!("missing doc variable from the Rhai scope"), Some(new_document) => match new_document.try_cast() { Some(new_rhai_document) => { diff --git a/crates/milli/src/update/new/mod.rs b/crates/milli/src/update/new/mod.rs index 81ff93e54..e3adc5bde 100644 --- a/crates/milli/src/update/new/mod.rs +++ b/crates/milli/src/update/new/mod.rs @@ -1,4 +1,4 @@ -pub use document_change::{Deletion, DocumentChange, Insertion, Update}; +pub use document_change::{DatabaseDocument, DocumentChange, Insertion, Update}; pub use indexer::ChannelCongestion; pub use merger::{ merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases, FacetFieldIdsDelta, From e60b855a5458da57097e5d87a105fde618b86944 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Jun 2025 14:38:48 +0200 Subject: [PATCH 052/150] Delete embedders from arroy --- crates/milli/src/update/new/indexer/mod.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index ccfe20617..576362c89 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -218,6 +218,8 @@ where MSP: Fn() -> bool + Sync, SD: SettingsDelta + Sync, { + delete_old_embedders(wtxn, index, settings_delta)?; + let mut bbbuffers = Vec::new(); let finished_extraction = AtomicBool::new(false); @@ -369,6 +371,25 @@ fn arroy_writers_from_embedder_actions<'indexer, 'index>( .collect() } +fn delete_old_embedders<'indexer, 'index, SD>( + wtxn: &mut RwTxn<'_>, + index: &'index Index, + settings_delta: &'indexer SD, +) -> Result<()> +where + SD: SettingsDelta, +{ + for (_name, action) in settings_delta.embedder_actions() { + if let Some(WriteBackToDocuments { embedder_id, .. 
}) = action.write_back() { + let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized); + let dimensions = reader.dimensions(wtxn)?; + reader.clear(wtxn, dimensions)?; + } + } + + Ok(()) +} + fn indexer_memory_settings( current_num_threads: usize, grenad_parameters: GrenadParameters, From 31142b36635ba062184b7d42f2e86021b11e911c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Jun 2025 14:42:07 +0200 Subject: [PATCH 053/150] Introduce extractor for setting changes --- .../update/new/indexer/document_changes.rs | 2 +- .../update/new/indexer/settings_changes.rs | 93 +++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index 5302c9d05..ca5bc8dc5 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -43,7 +43,7 @@ pub struct DocumentChangeContext< pub extractor_alloc: &'extractor Bump, /// Pool of doc allocators, used to retrieve the doc allocator we provided for the documents - doc_allocs: &'doc ThreadLocal>>, + pub doc_allocs: &'doc ThreadLocal>>, /// Extractor-specific data pub data: &'doc T, diff --git a/crates/milli/src/update/new/indexer/settings_changes.rs b/crates/milli/src/update/new/indexer/settings_changes.rs index 2e3d9c917..f92935399 100644 --- a/crates/milli/src/update/new/indexer/settings_changes.rs +++ b/crates/milli/src/update/new/indexer/settings_changes.rs @@ -14,6 +14,19 @@ use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::{DocumentId, InternalError, Result}; + +/// An internal iterator (i.e. 
using `foreach`) of `DatabaseDocument`s
+pub trait SettingsChangeExtractor<'extractor>: Sync {
+    type Data: MostlySend;
+
+    fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> Result<Self::Data>;
+
+    fn process<'doc>(
+        &'doc self,
+        changes: impl Iterator<Item = Result<DatabaseDocument<'doc>>>,
+        context: &'doc DocumentChangeContext<Self::Data>,
+    ) -> Result<()>;
+}
 pub struct DatabaseDocuments<'indexer> {
     documents: &'indexer [DocumentId],
     primary_key: PrimaryKey<'indexer>,
 }
@@ -53,3 +66,83 @@ impl<'indexer> DatabaseDocuments<'indexer> {
         self.documents.len()
     }
 }
+
+const CHUNK_SIZE: usize = 100;
+
+pub fn settings_change_extract<
+    'extractor, // invariant lifetime of extractor_alloc
+    'fid, // invariant lifetime of fields ids map
+    'indexer, // covariant lifetime of objects that are borrowed during the entire indexing
+    'data, // invariant on EX::Data lifetime of datastore
+    'index, // covariant lifetime of the index
+    EX: SettingsChangeExtractor<'extractor>,
+    MSP: Fn() -> bool + Sync,
+>(
+    documents: &'indexer DatabaseDocuments<'indexer>,
+    extractor: &EX,
+    IndexingContext {
+        index,
+        db_fields_ids_map,
+        new_fields_ids_map,
+        doc_allocs,
+        fields_ids_map_store,
+        must_stop_processing,
+        progress,
+        grenad_parameters: _,
+    }: IndexingContext<'fid, 'indexer, 'index, MSP>,
+    extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+    datastore: &'data ThreadLocal<EX::Data>,
+    step: IndexingStep,
+) -> Result<()> {
+    tracing::trace!("We are resetting the extractor allocators");
+    progress.update_progress(step);
+    // Clean up and reuse the extractor allocs
+    for extractor_alloc in extractor_allocs.iter_mut() {
+        tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes());
+        extractor_alloc.0.reset();
+    }
+
+    let total_documents = documents.len() as u32;
+    let (step, progress_step) = AtomicDocumentStep::new(total_documents);
+    progress.update_progress(progress_step);
+
+    let pi = documents.iter(CHUNK_SIZE);
+    pi.try_arc_for_each_try_init(
+        || {
+            DocumentChangeContext::new(
+                index,
+                db_fields_ids_map,
+                new_fields_ids_map,
+                extractor_allocs,
+                doc_allocs,
+                datastore,
+                fields_ids_map_store,
+                move |index_alloc| extractor.init_data(index_alloc),
+            )
+        },
+        |context, items| {
+            if (must_stop_processing)() {
+                return Err(Arc::new(InternalError::AbortedIndexation.into()));
+            }
+
+            // Clean up and reuse the document-specific allocator
+            context.doc_alloc.reset();
+
+            let items = items.as_ref();
+            let changes = items
+                .iter()
+                .filter_map(|item| documents.item_to_database_document(context, item).transpose());
+
+            let res = extractor.process(changes, context).map_err(Arc::new);
+            step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);
+
+            // send back the doc_alloc in the pool
+            context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
+
+            res
+        },
+    )?;
+    step.store(total_documents, Ordering::Relaxed);
+
+    Ok(())
+}
From 51a087b764d0f453d613cd631f95eeef58c8a4fe Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Wed, 25 Jun 2025 15:56:38 +0200
Subject: [PATCH 054/150] Write back user provided vectors from deleted embedders

---
 crates/milli/src/update/new/document.rs       | 47 +++++--
 .../milli/src/update/new/extract/documents.rs | 117 +++++++++++++++++-
 .../milli/src/update/new/indexer/extract.rs   | 12 ++
 3 files changed, 160 insertions(+), 16 deletions(-)

diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs
index 1ef44fc8d..c7156c120 100644
--- a/crates/milli/src/update/new/document.rs
+++ b/crates/milli/src/update/new/document.rs
@@ -9,6 +9,7 @@ use 
super::vector_document::VectorDocument; use super::{KvReaderFieldId, KvWriterFieldId}; use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::documents::FieldIdMapper; +use crate::vector::settings::EmbedderAction; use crate::{DocumentId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError}; /// A view into a document that can represent either the current version from the DB, @@ -309,6 +310,7 @@ where pub fn write_to_obkv<'s, 'a, 'map, 'buffer>( document: &'s impl Document<'s>, vector_document: Option<&'s impl VectorDocument<'s>>, + embedder_actions: &'a BTreeMap, fields_ids_map: &'a mut GlobalFieldsIdsMap<'map>, mut document_buffer: &'a mut bumpalo::collections::Vec<'buffer, u8>, ) -> Result<&'a KvReaderFieldId> @@ -338,20 +340,39 @@ where for res in vector_document.iter_vectors() { let (name, entry) = res?; if entry.has_configured_embedder { - continue; // we don't write vectors with configured embedder in documents + if let Some(action) = embedder_actions.get(name) { + if action.write_back().is_some() && !entry.regenerate { + vectors.insert( + name, + serde_json::json!({ + "regenerate": entry.regenerate, + // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object + "embeddings": entry.embeddings, + }), + ); + } + } + } else { + match embedder_actions.get(name) { + Some(action) if action.write_back().is_none() => { + continue; + } + _ => { + vectors.insert( + name, + if entry.implicit { + serde_json::json!(entry.embeddings) + } else { + serde_json::json!({ + "regenerate": entry.regenerate, + // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object + "embeddings": entry.embeddings, + }) + }, + ); + } + } } - vectors.insert( - name, - if entry.implicit { - serde_json::json!(entry.embeddings) - } else { - serde_json::json!({ - "regenerate": entry.regenerate, - // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object - "embeddings": entry.embeddings, - }) - }, - ); } if vectors.is_empty() { diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index d1c92919b..37d867b31 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -1,16 +1,25 @@ use std::cell::RefCell; +use std::collections::BTreeMap; use bumpalo::Bump; use hashbrown::HashMap; use super::DelAddRoaringBitmap; use crate::constants::RESERVED_GEO_FIELD_NAME; -use crate::update::new::channel::DocumentsSender; +use crate::update::new::channel::{DocumentsSender, ExtractorBbqueueSender}; use crate::update::new::document::{write_to_obkv, Document as _}; -use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; +use crate::update::new::document_change::DatabaseDocument; +use crate::update::new::indexer::document_changes::{ + DocumentChangeContext, Extractor, IndexingContext, +}; +use crate::update::new::indexer::settings_changes::{ + settings_change_extract, DatabaseDocuments, SettingsChangeExtractor, +}; use crate::update::new::ref_cell_ext::RefCellExt as _; -use crate::update::new::thread_local::FullySend; +use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; +use crate::update::settings::SettingsDelta; +use crate::vector::settings::EmbedderAction; use crate::vector::EmbeddingConfigs; use crate::Result; @@ -45,6 +54,7 @@ impl<'extractor> 
Extractor<'extractor> for DocumentsExtractor<'_, '_> {
     ) -> Result<()> {
         let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc);
         let mut document_extractor_data = context.data.0.borrow_mut_or_yield();
+        let embedder_actions = &Default::default();
 
         for change in changes {
             let change = change?;
@@ -121,9 +131,11 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> {
                     let content = write_to_obkv(
                         &content,
                         vector_content.as_ref(),
+                        embedder_actions,
                         &mut new_fields_ids_map,
                         &mut document_buffer,
                     )?;
+
                     self.document_sender.uncompressed(docid, external_docid, content).unwrap();
                 }
                 DocumentChange::Insertion(insertion) => {
@@ -146,6 +158,7 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> {
                     let content = write_to_obkv(
                         &content,
                         inserted_vectors.as_ref(),
+                        embedder_actions,
                         &mut new_fields_ids_map,
                         &mut document_buffer,
                     )?;
@@ -158,3 +171,101 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> {
         Ok(())
     }
 }
+
+pub struct SettingsChangeDocumentExtractor<'a, 'b> {
+    document_sender: DocumentsSender<'a, 'b>,
+    embedder_actions: &'a BTreeMap<String, EmbedderAction>,
+}
+
+impl<'a, 'b> SettingsChangeDocumentExtractor<'a, 'b> {
+    pub fn new(
+        document_sender: DocumentsSender<'a, 'b>,
+        embedder_actions: &'a BTreeMap<String, EmbedderAction>,
+    ) -> Self {
+        Self { document_sender, embedder_actions }
+    }
+}
+
+impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeDocumentExtractor<'_, '_> {
+    type Data = FullySend>;
+
+    fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
+        Ok(FullySend(Default::default()))
+    }
+
+    fn process<'doc>(
+        &self,
+        documents: impl Iterator<Item = Result<DatabaseDocument<'doc>>>,
+        context: &DocumentChangeContext<Self::Data>,
+    ) -> Result<()> {
+        let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc);
+
+        for document in documents {
+            let document = document?;
+            // **WARNING**: the exclusive borrow on `new_fields_ids_map` needs to be taken **inside** of the `for document in documents` loop
+            // Otherwise, a `BorrowMutError` will occur for any other code that also needs the `new_fields_ids_map`
+            let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
+
+            let external_docid = document.external_document_id().to_owned();
+            let content =
+                document.current(&context.rtxn, context.index, &context.db_fields_ids_map)?;
+            let vector_content = document.current_vectors(
+                &context.rtxn,
+                context.index,
+                &context.db_fields_ids_map,
+                &context.doc_alloc,
+            )?;
+
+            let content = write_to_obkv(
+                &content,
+                Some(&vector_content),
+                self.embedder_actions,
+                &mut new_fields_ids_map,
+                &mut document_buffer,
+            )?;
+
+            self.document_sender.uncompressed(document.docid(), external_docid, content).unwrap();
+        }
+
+        Ok(())
+    }
+}
+
+/// Modify the database documents based on the settings changes.
+///
+/// This function extracts the documents from the database,
+/// modifies them by adding or removing vector fields based on embedder actions,
+/// and then updates the database.
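+///
+/// Note that this is a no-op when the settings delta contains no embedder actions.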
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents::extract")] +pub fn update_database_documents<'indexer, 'extractor, MSP, SD>( + documents: &'indexer DatabaseDocuments<'indexer>, + indexing_context: IndexingContext, + extractor_sender: &ExtractorBbqueueSender, + settings_delta: &SD, + extractor_allocs: &'extractor mut ThreadLocal>, +) -> Result<()> +where + MSP: Fn() -> bool + Sync, + SD: SettingsDelta, +{ + // skip if no embedder_actions + if settings_delta.embedder_actions().is_empty() { + return Ok(()); + } + + let document_sender = extractor_sender.documents(); + let document_extractor = + SettingsChangeDocumentExtractor::new(document_sender, settings_delta.embedder_actions()); + let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + + settings_change_extract( + documents, + &document_extractor, + indexing_context, + extractor_allocs, + &datastore, + crate::update::new::steps::IndexingStep::ExtractingDocuments, + )?; + + Ok(()) +} diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index c79c23c1c..bb426c330 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -344,6 +344,18 @@ where primary_key_from_db(&indexing_context.index, &rtxn, &indexing_context.db_fields_ids_map)?; let documents = DatabaseDocuments::new(&all_document_ids, primary_key); + let span = + tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); + let _entered = span.enter(); + + update_database_documents( + &documents, + indexing_context, + &extractor_sender, + settings_delta, + extractor_allocs, + )?; + indexing_context.progress.update_progress(IndexingStep::WaitingForDatabaseWrites); finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); From 900be0ccad5ac59f66dc106dbc935cd65d4a7bf3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Jun 2025 14:46:45 +0200 Subject: [PATCH 055/150] Extract or regenerate vectors related to settings changes --- crates/milli/src/update/new/extract/mod.rs | 2 +- .../src/update/new/extract/vectors/mod.rs | 198 ++++++++++++++++++ .../milli/src/update/new/indexer/extract.rs | 48 +++++ crates/milli/src/update/new/indexer/mod.rs | 1 + crates/milli/src/update/new/indexer/write.rs | 10 +- 5 files changed, 256 insertions(+), 3 deletions(-) diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index 2abefb098..05c90d8f8 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -12,7 +12,7 @@ pub use documents::*; pub use faceted::*; pub use geo::*; pub use searchable::*; -pub use vectors::EmbeddingExtractor; +pub use vectors::{EmbeddingExtractor, SettingsChangeEmbeddingExtractor}; /// TODO move in permissive json pointer pub mod perm_json_p { diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 85398aa99..73a418b19 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -1,4 +1,5 @@ use std::cell::RefCell; +use std::collections::BTreeMap; use bumpalo::collections::Vec as BVec; use bumpalo::Bump; @@ -9,13 +10,16 @@ use crate::error::FaultSource; use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; +use crate::update::new::document_change::DatabaseDocument; use 
crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; +use crate::update::new::indexer::settings_changes::SettingsChangeExtractor; use crate::update::new::thread_local::MostlySend; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; use crate::vector::error::{ EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistributionBump, }; +use crate::vector::settings::{EmbedderAction, ReindexAction}; use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; @@ -294,6 +298,200 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { } } +pub struct SettingsChangeEmbeddingExtractor<'a, 'b> { + embedders: &'a EmbeddingConfigs, + old_embedders: &'a EmbeddingConfigs, + embedder_actions: &'a BTreeMap<String, EmbedderAction>, + embedder_category_id: &'a std::collections::HashMap<String, u8>, + sender: EmbeddingSender<'a, 'b>, + possible_embedding_mistakes: PossibleEmbeddingMistakes, + threads: &'a ThreadPoolNoAbort, +} + +impl<'a, 'b> SettingsChangeEmbeddingExtractor<'a, 'b> { + pub fn new( + embedders: &'a EmbeddingConfigs, + old_embedders: &'a EmbeddingConfigs, + embedder_actions: &'a BTreeMap<String, EmbedderAction>, + embedder_category_id: &'a std::collections::HashMap<String, u8>, + sender: EmbeddingSender<'a, 'b>, + field_distribution: &'a FieldDistribution, + threads: &'a ThreadPoolNoAbort, + ) -> Self { + let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution); + Self { + embedders, + old_embedders, + embedder_actions, + embedder_category_id, + sender, + threads, + possible_embedding_mistakes, + } + } +} + +impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbeddingExtractor<'_, '_> { + type Data = RefCell<EmbeddingExtractorData<'extractor>>; + + fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result<Self::Data> { + Ok(RefCell::new(EmbeddingExtractorData(HashMap::new_in(extractor_alloc)))) + } + + fn process<'doc>( + &'doc self, + documents: impl Iterator<Item = Result<DatabaseDocument<'doc>>>, + context: &'doc DocumentChangeContext<Self::Data>, + ) -> crate::Result<()> { + let embedders = self.embedders.inner_as_ref(); + let old_embedders = self.old_embedders.inner_as_ref(); + let unused_vectors_distribution = UnusedVectorsDistributionBump::new_in(&context.doc_alloc); + + let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); + for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { + // if the embedder is not in the embedder_actions, we don't need to reindex. + if let Some((embedder_id, reindex_action)) = + self.embedder_actions.get(embedder_name).and_then(|action| { + let embedder_id = self + .embedder_category_id + .get(embedder_name) + .expect("embedder_category_id should be present"); + action.reindex().map(|reindex| (*embedder_id, reindex)) + }) + { + all_chunks.push(( + Chunks::new( + embedder, + embedder_id, + embedder_name, + prompt, + context.data, + &self.possible_embedding_mistakes, + self.threads, + self.sender, + &context.doc_alloc, + ), + reindex_action, + )) + } + } + + for document in documents { + let document = document?; + + let current_vectors = document.current_vectors( + &context.rtxn, + context.index, + context.db_fields_ids_map, + &context.doc_alloc, + )?; + + for (chunks, reindex_action) in &mut all_chunks { + let embedder_name = chunks.embedder_name(); + let current_vectors = current_vectors.vectors_for_key(embedder_name)?; + + // if the vectors for this document have already been provided, we don't need to reindex.
+ let (is_new_embedder, must_regenerate) = + current_vectors.as_ref().map_or((true, true), |vectors| { + (!vectors.has_configured_embedder, vectors.regenerate) + }); + + match reindex_action { + ReindexAction::RegeneratePrompts => { + if !must_regenerate { + continue; + } + // we need to regenerate the prompts for the document + + // Get the old prompt and render the document with it + let Some((_, old_prompt, _)) = old_embedders.get(embedder_name) else { + unreachable!("ReindexAction::RegeneratePrompts implies that the embedder {embedder_name} is in the old_embedders") + }; + let old_rendered = old_prompt.render_document( + document.external_document_id(), + document.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + )?; + + // Get the new prompt and render the document with it + let new_prompt = chunks.prompt(); + let new_rendered = new_prompt.render_document( + document.external_document_id(), + document.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + )?; + + // Compare the rendered documents + // if they are different, regenerate the vectors + if new_rendered != old_rendered { + chunks.set_autogenerated( + document.docid(), + document.external_document_id(), + new_rendered, + &unused_vectors_distribution, + )?; + } + } + ReindexAction::FullReindex => { + let prompt = chunks.prompt(); + // if no inserted vectors, then regenerate: true + no embeddings => autogenerate + if let Some(embeddings) = current_vectors + .and_then(|vectors| vectors.embeddings) + // insert the embeddings only for new embedders + .filter(|_| is_new_embedder) + { + chunks.set_regenerate(document.docid(), must_regenerate); + chunks.set_vectors( + document.external_document_id(), + document.docid(), + embeddings.into_vec(&context.doc_alloc, embedder_name).map_err( + |error| UserError::InvalidVectorsEmbedderConf { + document_id: document.external_document_id().to_string(), + error: error.to_string(), + }, + )?, + )?; + } else if must_regenerate { + let rendered = prompt.render_document( + document.external_document_id(), + document.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + )?; + chunks.set_autogenerated( + document.docid(), + document.external_document_id(), + rendered, + &unused_vectors_distribution, + )?; + } + } + } + } + } + + for (chunk, _) in all_chunks { + chunk.drain(&unused_vectors_distribution)?; + } + + Ok(()) + } +} + // **Warning**: the destructor of this struct is not normally run, make sure that all its fields: // 1. don't have side effects tied to they destructors // 2. 
if allocated, are allocated inside of the bumpalo diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index bb426c330..c94da9629 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -12,6 +12,7 @@ use super::super::steps::IndexingStep; use super::super::thread_local::{FullySend, ThreadLocal}; use super::super::FacetFieldIdsDelta; use super::document_changes::{extract, DocumentChanges, IndexingContext}; +use super::settings_changes::settings_change_extract; use crate::documents::FieldIdMapper; use crate::documents::PrimaryKey; use crate::index::IndexEmbeddingConfig; @@ -356,6 +357,53 @@ where extractor_allocs, )?; + 'vectors: { + if settings_delta.embedder_actions().is_empty() { + break 'vectors; + } + + let embedding_sender = extractor_sender.embeddings(); + + // extract the remaining embedders + let extractor = SettingsChangeEmbeddingExtractor::new( + settings_delta.new_embedders(), + settings_delta.old_embedders(), + settings_delta.embedder_actions(), + settings_delta.new_embedder_category_id(), + embedding_sender, + field_distribution, + request_threads(), + ); + let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + { + let span = tracing::debug_span!(target: "indexing::documents::extract", "vectors"); + let _entered = span.enter(); + + settings_change_extract( + &documents, + &extractor, + indexing_context, + extractor_allocs, + &datastore, + IndexingStep::ExtractingEmbeddings, + )?; + } + { + let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors"); + let _entered = span.enter(); + + for config in &mut index_embeddings { + 'data: for data in datastore.iter_mut() { + let data = &mut data.get_mut().0; + let Some(deladd) = data.remove(&config.name) else { + continue 'data; + }; + deladd.apply_to(&mut config.user_provided, modified_docids); + } + } + } + } + indexing_context.progress.update_progress(IndexingStep::WaitingForDatabaseWrites); finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 576362c89..6dbfc5433 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -170,6 +170,7 @@ where index_embeddings, arroy_memory, &mut arroy_writers, + None, &indexing_context.must_stop_processing, ) }) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 5a600eeb3..19696f169 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -1,3 +1,4 @@ +use std::collections::BTreeMap; use std::sync::atomic::AtomicBool; use bstr::ByteSlice as _; @@ -13,6 +14,7 @@ use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::index::IndexEmbeddingConfig; use crate::progress::Progress; use crate::update::settings::InnerIndexSettings; +use crate::vector::settings::EmbedderAction; use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings}; use crate::{Error, Index, InternalError, Result, UserError}; @@ -106,6 +108,7 @@ pub fn build_vectors( index_embeddings: Vec, arroy_memory: Option, arroy_writers: &mut HashMap, + embeder_actions: Option<&BTreeMap>, must_stop_processing: &MSP, ) -> Result<()> where @@ -117,14 +120,17 @@ where let seed = rand::random(); let mut rng = rand::rngs::StdRng::seed_from_u64(seed); - for (_index, 
(_embedder_name, _embedder, writer, dimensions)) in arroy_writers { + for (_index, (embedder_name, _embedder, writer, dimensions)) in arroy_writers { let dimensions = *dimensions; + let is_being_quantized = embeder_actions + .and_then(|actions| actions.get(*embedder_name).map(|action| action.is_being_quantized)) + .unwrap_or(false); writer.build_and_quantize( wtxn, progress, &mut rng, dimensions, - false, + is_being_quantized, arroy_memory, must_stop_processing, )?; From f16e6f7c370d854d54e56d4648ee9de6e9830781 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Jun 2025 14:47:08 +0200 Subject: [PATCH 056/150] Update snapshots --- crates/index-scheduler/src/scheduler/test_embedders.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/test_embedders.rs b/crates/index-scheduler/src/scheduler/test_embedders.rs index 772aa1520..dab7a2ad9 100644 --- a/crates/index-scheduler/src/scheduler/test_embedders.rs +++ b/crates/index-scheduler/src/scheduler/test_embedders.rs @@ -800,7 +800,7 @@ fn delete_embedder_with_user_provided_vectors() { .unwrap() .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); - snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"regenerate":false,"embeddings":[[0.0,0.0,0.0]]}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"regenerate":false,"embeddings":[[1.0,1.0,1.0]]}}}]"###); } { @@ -835,6 +835,6 @@ fn delete_embedder_with_user_provided_vectors() { .collect::>(); // FIXME: redaction - snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), 
@r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"regenerate\":false},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"regenerate\":false}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"regenerate\":false}}}]""###); + snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), 
@r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"regenerate\":false,\"embeddings\":[[0.0,0.0,0.0]]},\"my_doggo_embedder\":{\"regenerate\":false,\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]]}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"regenerate\":false,\"embeddings\":[[1.0,1.0,1.0]]}}}]""###); } } From a685eeafeb8620e66a5cdf71672d5942fcef29be Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Jun 2025 14:47:24 +0200 Subject: [PATCH 057/150] wierd snapshot update --- crates/index-scheduler/src/scheduler/test_embedders.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/scheduler/test_embedders.rs b/crates/index-scheduler/src/scheduler/test_embedders.rs index dab7a2ad9..305894d0a 100644 --- a/crates/index-scheduler/src/scheduler/test_embedders.rs +++ b/crates/index-scheduler/src/scheduler/test_embedders.rs @@ -399,7 +399,7 @@ fn import_vectors_first_and_embedder_later() { .collect::>(); // the all the vectors linked to the new specified embedder have been removed // Only the unknown embedders stays in the document DB - snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1,2,3]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4,5]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); let conf = index.embedding_configs(&rtxn).unwrap(); // even though we specified the vector for the ID 3, it shouldn't be marked // as user provided since we explicitely marked it as NOT user provided. 
From 77802dabf6006d7b3d59d6e03f2f9ca2317b3350 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 26 Jun 2025 15:19:44 +0200 Subject: [PATCH 058/150] rename DocumentChangeContext into DocumentContext --- crates/milli/src/update/new/extract/documents.rs | 8 +++----- .../src/update/new/extract/faceted/extract_facets.rs | 6 +++--- crates/milli/src/update/new/extract/geo/mod.rs | 4 ++-- .../new/extract/searchable/extract_word_docids.rs | 6 +++--- .../searchable/extract_word_pair_proximity_docids.rs | 6 +++--- crates/milli/src/update/new/extract/vectors/mod.rs | 6 +++--- .../milli/src/update/new/indexer/document_changes.rs | 12 ++++++------ .../src/update/new/indexer/document_deletion.rs | 8 ++++---- .../src/update/new/indexer/document_operation.rs | 4 ++-- crates/milli/src/update/new/indexer/partial_dump.rs | 4 ++-- .../milli/src/update/new/indexer/settings_changes.rs | 8 ++++---- .../src/update/new/indexer/update_by_function.rs | 6 +++--- 12 files changed, 38 insertions(+), 40 deletions(-) diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 37d867b31..f5dcb3eff 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -9,9 +9,7 @@ use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::update::new::channel::{DocumentsSender, ExtractorBbqueueSender}; use crate::update::new::document::{write_to_obkv, Document as _}; use crate::update::new::document_change::DatabaseDocument; -use crate::update::new::indexer::document_changes::{ - DocumentChangeContext, Extractor, IndexingContext, -}; +use crate::update::new::indexer::document_changes::{DocumentContext, Extractor, IndexingContext}; use crate::update::new::indexer::settings_changes::{ settings_change_extract, DatabaseDocuments, SettingsChangeExtractor, }; @@ -50,7 +48,7 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> { fn process<'doc>( &self, changes: impl Iterator>>, - context: &DocumentChangeContext, + context: &DocumentContext, ) -> Result<()> { let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc); let mut document_extractor_data = context.data.0.borrow_mut_or_yield(); @@ -196,7 +194,7 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeDocumentE fn process<'doc>( &self, documents: impl Iterator>>, - context: &DocumentChangeContext, + context: &DocumentContext, ) -> Result<()> { let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc); diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 517ef3f2d..39e68fb6d 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -17,7 +17,7 @@ use crate::update::del_add::DelAdd; use crate::update::new::channel::FieldIdDocidFacetSender; use crate::update::new::extract::perm_json_p; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, + extract, DocumentContext, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::steps::IndexingStep; @@ -51,7 +51,7 @@ impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'_, '_> { fn process<'doc>( &self, changes: impl Iterator>>, - context: &DocumentChangeContext, + context: &DocumentContext, ) -> Result<()> { 
for change in changes { let change = change?; @@ -75,7 +75,7 @@ pub struct FacetedDocidsExtractor; impl FacetedDocidsExtractor { #[allow(clippy::too_many_arguments)] fn extract_document_change( - context: &DocumentChangeContext>, + context: &DocumentContext>, filterable_attributes: &[FilterableAttributesRule], sortable_fields: &HashSet, asc_desc_fields: &HashSet, diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index b2ccc1b2b..927434ff6 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -11,7 +11,7 @@ use serde_json::Value; use crate::error::GeoError; use crate::update::new::document::Document; -use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; +use crate::update::new::indexer::document_changes::{DocumentContext, Extractor}; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::thread_local::MostlySend; use crate::update::new::DocumentChange; @@ -150,7 +150,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { fn process<'doc>( &'doc self, changes: impl Iterator>>, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, ) -> Result<()> { let rtxn = &context.rtxn; let index = context.index; diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 046116939..35bc9f063 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -11,7 +11,7 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, + extract, DocumentContext, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::steps::IndexingStep; @@ -226,7 +226,7 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'_> { fn process<'doc>( &self, changes: impl Iterator>>, - context: &DocumentChangeContext, + context: &DocumentContext, ) -> Result<()> { for change in changes { let change = change?; @@ -305,7 +305,7 @@ impl WordDocidsExtractors { } fn extract_document_change( - context: &DocumentChangeContext>>, + context: &DocumentContext>>, document_tokenizer: &DocumentTokenizer, searchable_attributes: Option<&[&str]>, document_change: DocumentChange, diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 3b358800f..dffde06c7 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -10,7 +10,7 @@ use crate::proximity::{index_proximity, MAX_DISTANCE}; use crate::update::new::document::Document; use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, + extract, DocumentContext, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; use 
crate::update::new::steps::IndexingStep; @@ -39,7 +39,7 @@ impl<'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractorData< fn process<'doc>( &self, changes: impl Iterator>>, - context: &DocumentChangeContext, + context: &DocumentContext, ) -> Result<()> { for change in changes { let change = change?; @@ -116,7 +116,7 @@ impl WordPairProximityDocidsExtractor { // and to store the docids of the documents that have a number of words in a given field // equal to or under than MAX_COUNTED_WORDS. fn extract_document_change( - context: &DocumentChangeContext>, + context: &DocumentContext>, document_tokenizer: &DocumentTokenizer, searchable_attributes: Option<&[&str]>, document_change: DocumentChange, diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 73a418b19..c5e56356d 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -11,7 +11,7 @@ use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; use crate::update::new::document_change::DatabaseDocument; -use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; +use crate::update::new::indexer::document_changes::{DocumentContext, Extractor}; use crate::update::new::indexer::settings_changes::SettingsChangeExtractor; use crate::update::new::thread_local::MostlySend; use crate::update::new::vector_document::VectorDocument; @@ -60,7 +60,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { fn process<'doc>( &'doc self, changes: impl Iterator>>, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, ) -> crate::Result<()> { let embedders = self.embedders.inner_as_ref(); let mut unused_vectors_distribution = @@ -341,7 +341,7 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding fn process<'doc>( &'doc self, documents: impl Iterator>>, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, ) -> crate::Result<()> { let embedders = self.embedders.inner_as_ref(); let old_embedders = self.old_embedders.inner_as_ref(); diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index ca5bc8dc5..3069ab29b 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -15,7 +15,7 @@ use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::GrenadParameters; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result}; -pub struct DocumentChangeContext< +pub struct DocumentContext< 'doc, // covariant lifetime of a single `process` call 'extractor: 'doc, // invariant lifetime of the extractor_allocs 'fid: 'doc, // invariant lifetime of the new_fields_ids_map @@ -56,7 +56,7 @@ impl< 'fid: 'doc, // invariant lifetime of fields ids map 'indexer: 'doc, // covariant lifetime of objects that survive a `process` call T: MostlySend, - > DocumentChangeContext<'doc, 'extractor, 'fid, 'indexer, T> + > DocumentContext<'doc, 'extractor, 'fid, 'indexer, T> { #[allow(clippy::too_many_arguments)] pub fn new( @@ -84,7 +84,7 @@ impl< let data = datastore.get_or_try(move || init_data(&extractor_alloc.0))?; let txn = index.read_txn()?; - Ok(DocumentChangeContext { + Ok(DocumentContext { index, rtxn: txn, db_fields_ids_map, @@ -106,7 +106,7 @@ pub trait 
Extractor<'extractor>: Sync { fn process<'doc>( &'doc self, changes: impl Iterator>>, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, ) -> Result<()>; } @@ -125,7 +125,7 @@ pub trait DocumentChanges<'pl // lifetime of the underlying payload fn item_to_document_change<'doc, // lifetime of a single `process` call T: MostlySend>( &'doc self, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, item: &'doc Self::Item, ) -> Result>> where 'pl: 'doc // the payload must survive the process calls ; @@ -224,7 +224,7 @@ where let pi = document_changes.iter(CHUNK_SIZE); pi.try_arc_for_each_try_init( || { - DocumentChangeContext::new( + DocumentContext::new( index, db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs index 114ce0a69..292cdc36e 100644 --- a/crates/milli/src/update/new/indexer/document_deletion.rs +++ b/crates/milli/src/update/new/indexer/document_deletion.rs @@ -4,7 +4,7 @@ use rayon::iter::IndexedParallelIterator; use rayon::slice::ParallelSlice as _; use roaring::RoaringBitmap; -use super::document_changes::{DocumentChangeContext, DocumentChanges}; +use super::document_changes::{DocumentContext, DocumentChanges}; use crate::documents::PrimaryKey; use crate::update::new::thread_local::MostlySend; use crate::update::new::{DatabaseDocument, DocumentChange}; @@ -58,7 +58,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { T: MostlySend, >( &'doc self, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, docid: &'doc Self::Item, ) -> Result>> where @@ -94,7 +94,7 @@ mod test { use crate::index::tests::TempIndex; use crate::progress::Progress; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, Extractor, IndexingContext, + extract, DocumentContext, Extractor, IndexingContext, }; use crate::update::new::indexer::DocumentDeletion; use crate::update::new::steps::IndexingStep; @@ -125,7 +125,7 @@ mod test { fn process<'doc>( &self, changes: impl Iterator>>, - context: &DocumentChangeContext, + context: &DocumentContext, ) -> crate::Result<()> { for change in changes { let change = change?; diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 70dc5f35c..4bcfb2d47 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -12,7 +12,7 @@ use serde_json::value::RawValue; use serde_json::Deserializer; use super::super::document_change::DocumentChange; -use super::document_changes::{DocumentChangeContext, DocumentChanges}; +use super::document_changes::{DocumentContext, DocumentChanges}; use super::guess_primary_key::retrieve_or_guess_primary_key; use crate::documents::PrimaryKey; use crate::progress::{AtomicPayloadStep, Progress}; @@ -411,7 +411,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> { fn item_to_document_change<'doc, T: MostlySend + 'doc>( &'doc self, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, item: &'doc Self::Item, ) -> Result>> where diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index 6e4abd898..614c61353 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -5,7 +5,7 @@ use 
rayon::iter::IndexedParallelIterator; use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; -use super::document_changes::{DocumentChangeContext, DocumentChanges}; +use super::document_changes::{DocumentContext, DocumentChanges}; use crate::documents::PrimaryKey; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; use crate::update::new::document::Versions; @@ -55,7 +55,7 @@ where fn item_to_document_change<'doc, T: MostlySend + 'doc>( &'doc self, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, document: &'doc Self::Item, ) -> Result>> where diff --git a/crates/milli/src/update/new/indexer/settings_changes.rs b/crates/milli/src/update/new/indexer/settings_changes.rs index f92935399..90c451534 100644 --- a/crates/milli/src/update/new/indexer/settings_changes.rs +++ b/crates/milli/src/update/new/indexer/settings_changes.rs @@ -9,7 +9,7 @@ use super::document_changes::IndexingContext; use crate::documents::PrimaryKey; use crate::progress::AtomicDocumentStep; use crate::update::new::document_change::DatabaseDocument; -use crate::update::new::indexer::document_changes::DocumentChangeContext; +use crate::update::new::indexer::document_changes::DocumentContext; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; @@ -24,7 +24,7 @@ pub trait SettingsChangeExtractor<'extractor>: Sync { fn process<'doc>( &'doc self, changes: impl Iterator>>, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, ) -> Result<()>; } pub struct DatabaseDocuments<'indexer> { @@ -46,7 +46,7 @@ impl<'indexer> DatabaseDocuments<'indexer> { T: MostlySend, >( &'doc self, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, docid: &'doc DocumentId, ) -> Result>> { let current = context.index.document(&context.rtxn, *docid)?; @@ -109,7 +109,7 @@ pub fn settings_change_extract< let pi = documents.iter(CHUNK_SIZE); pi.try_arc_for_each_try_init( || { - DocumentChangeContext::new( + DocumentContext::new( index, db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index 694645d28..b394757d1 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -5,7 +5,7 @@ use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use roaring::RoaringBitmap; use rustc_hash::FxBuildHasher; -use super::document_changes::DocumentChangeContext; +use super::document_changes::DocumentContext; use super::DocumentChanges; use crate::documents::Error::InvalidDocumentFormat; use crate::documents::PrimaryKey; @@ -86,13 +86,13 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { fn item_to_document_change<'doc, T: MostlySend + 'doc>( &self, - context: &'doc DocumentChangeContext, + context: &'doc DocumentContext, docid: &'doc Self::Item, ) -> Result>> where 'index: 'doc, { - let DocumentChangeContext { + let DocumentContext { index, db_fields_ids_map, rtxn: txn, From 7fa1c41190620506bd31bcd54c5e4c713903b948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 18:25:49 +0200 Subject: [PATCH 059/150] Fix some api key errors --- crates/meilisearch/tests/auth/api_keys.rs | 2 +- crates/meilisearch/tests/auth/errors.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff 
--git a/crates/meilisearch/tests/auth/api_keys.rs b/crates/meilisearch/tests/auth/api_keys.rs index 5a18b4dbf..2688dd918 100644 --- a/crates/meilisearch/tests/auth/api_keys.rs +++ b/crates/meilisearch/tests/auth/api_keys.rs @@ -421,7 +421,7 @@ async fn error_add_api_key_invalid_parameters_actions() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r###" { - "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", + "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" diff --git a/crates/meilisearch/tests/auth/errors.rs b/crates/meilisearch/tests/auth/errors.rs index ebe2e53fa..687cb67a0 100644 --- a/crates/meilisearch/tests/auth/errors.rs +++ b/crates/meilisearch/tests/auth/errors.rs @@ -93,7 +93,7 @@ async fn create_api_key_bad_actions() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", + "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, 
`settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" From 657bbf5d1e4f4dba0c816d94ff3ee9002fe0b880 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 27 Jun 2025 10:14:26 +0200 Subject: [PATCH 060/150] Fix more tests --- crates/meilisearch-types/src/tasks.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 3301b4320..a6ed593db 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -614,6 +614,8 @@ impl FromStr for Kind { Ok(Kind::DumpCreation) } else if kind.eq_ignore_ascii_case("snapshotCreation") { Ok(Kind::SnapshotCreation) + } else if kind.eq_ignore_ascii_case("export") { + Ok(Kind::Export) } else if kind.eq_ignore_ascii_case("upgradeDatabase") { Ok(Kind::UpgradeDatabase) } else { From 72192994363c8fc4060014eecb1905dd88cb979f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 27 Jun 2025 12:23:55 +0200 Subject: [PATCH 061/150] Better handle task abortion --- .../src/scheduler/process_export.rs | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e777809fd..57f79c83f 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -16,7 +16,7 @@ use meilisearch_types::milli::{self, obkv_to_json, Filter, InternalError}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; use serde::Deserialize; -use ureq::{json, Agent}; +use ureq::{json, Response}; use super::MustStopProcessing; use crate::processing::AtomicDocumentStep; @@ -45,7 +45,7 @@ impl IndexScheduler { }) .collect(); - let agent: Agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); + let agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); let must_stop_processing = self.scheduler.must_stop_processing.clone(); for (i, (uid, settings)) in indexes.iter().enumerate() { if must_stop_processing.get() { @@ -272,11 +272,16 @@ fn retry<F>(must_stop_processing: &MustStopProcessing, send_request: F) -> Result<ureq::Response> where F: Fn() -> Result<ureq::Response, backoff::Error<ureq::Error>>, { - if must_stop_processing.get() { - return Err(Error::AbortedTask); - } - - match backoff::retry(ExponentialBackoff::default(), send_request) { + match backoff::retry(ExponentialBackoff::default(), || { + if must_stop_processing.get() { + return Err(backoff::Error::Permanent(ureq::Error::Status( + u16::MAX, + // 444: Connection Closed Without Response + Response::new(444, "Abort", "Aborted task").unwrap(), + ))); + } + send_request() + }) { Ok(response) => Ok(response), Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)), Err(backoff::Error::Transient { err, retry_after: _ }) => Err(ureq_error_into_error(err)), @@ -306,6 +311,9 @@ fn ureq_error_into_error(error: ureq::Error) -> Error { } match error { + // This is a workaround to handle task abortion - the error propagation
path + // makes it difficult to cleanly surface the abortion at this level. + ureq::Error::Status(u16::MAX, _) => Error::AbortedTask, ureq::Error::Status(_, response) => match response.into_json() { Ok(MeiliError { message, code, r#type, link }) => { Error::FromRemoteWhenExporting { message, code, r#type, link } From 0687cf058ac402943f9ea5861061bc345823a48e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 30 Jun 2025 09:47:30 +0200 Subject: [PATCH 062/150] Avoid rewriting documents that don't change Ensure being on a reindex action before getting embedder_category_id Fix document skip function --- .../milli/src/update/new/extract/documents.rs | 50 +++++++++++++++++-- .../src/update/new/extract/vectors/mod.rs | 18 ++++--- 2 files changed, 58 insertions(+), 10 deletions(-) diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index f5dcb3eff..1514de72f 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -7,7 +7,7 @@ use hashbrown::HashMap; use super::DelAddRoaringBitmap; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::update::new::channel::{DocumentsSender, ExtractorBbqueueSender}; -use crate::update::new::document::{write_to_obkv, Document as _}; +use crate::update::new::document::{write_to_obkv, Document}; use crate::update::new::document_change::DatabaseDocument; use crate::update::new::indexer::document_changes::{DocumentContext, Extractor, IndexingContext}; use crate::update::new::indexer::settings_changes::{ @@ -15,6 +15,7 @@ use crate::update::new::indexer::settings_changes::{ }; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::thread_local::{FullySend, ThreadLocal}; +use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; use crate::update::settings::SettingsDelta; use crate::vector::settings::EmbedderAction; @@ -214,6 +215,11 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeDocumentE &context.doc_alloc, )?; + // if the document doesn't need to be updated, we skip it + if !must_update_document(&vector_content, self.embedder_actions)? { + continue; + } + let content = write_to_obkv( &content, Some(&vector_content), @@ -246,8 +252,7 @@ where MSP: Fn() -> bool + Sync, SD: SettingsDelta, { - // skip if no embedder_actions - if settings_delta.embedder_actions().is_empty() { + if !must_update_database(settings_delta) { return Ok(()); } @@ -267,3 +272,42 @@ where Ok(()) } + +fn must_update_database<SD: SettingsDelta>(settings_delta: &SD) -> bool { + settings_delta.embedder_actions().iter().any(|(name, action)| { + if action.reindex().is_some() { + // if action has a reindex, we need to update the documents database if the embedder is a new one + settings_delta.old_embedders().get(name).is_none() + } else { + // if action has a write_back, we need to update the documents database + action.write_back().is_some() + } + }) +} + +fn must_update_document<'s, 'a>( + vector_document: &'s impl VectorDocument<'s>, + embedder_actions: &'a BTreeMap<String, EmbedderAction>, +) -> Result<bool> +where + 's: 'a, +{ + // Check if any vector needs to be written back for the document + for (name, action) in embedder_actions { + // if the vector entry is not found, we don't need to update the document + let Some(vector_entry) = vector_document.vectors_for_key(name)? else { + continue; + }; + + // if the vector entry is user provided, we need to update the document by writing back vectors.
+ let write_back = action.write_back().is_some() && !vector_entry.regenerate; + // if the vector entry is a new embedder, we need to update the document removing the vectors from the document. + let new_embedder = action.reindex().is_some() && !vector_entry.has_configured_embedder; + + if write_back || new_embedder { + return Ok(true); + } + } + + Ok(false) +} diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index c5e56356d..252e136fd 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -351,13 +351,17 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { // if the embedder is not in the embedder_actions, we don't need to reindex. if let Some((embedder_id, reindex_action)) = - self.embedder_actions.get(embedder_name).and_then(|action| { - let embedder_id = self - .embedder_category_id - .get(embedder_name) - .expect("embedder_category_id should be present"); - action.reindex().map(|reindex| (*embedder_id, reindex)) - }) + self.embedder_actions + .get(embedder_name) + // keep only the reindex actions + .and_then(EmbedderAction::reindex) + // map the reindex action to the embedder_id + .map(|reindex| { + let embedder_id = self.embedder_category_id.get(embedder_name).expect( + "An embedder_category_id must exist for all reindexed embedders", + ); + (*embedder_id, reindex) + }) { all_chunks.push(( Chunks::new( From d35b2d8d334989faa3fae2b469e7c48242aa641e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 26 Jun 2025 18:07:08 +0200 Subject: [PATCH 063/150] minor fixes --- .../milli/src/update/new/extract/documents.rs | 4 ++-- crates/milli/src/update/new/indexer/extract.rs | 13 +++++++------ crates/milli/src/update/new/indexer/mod.rs | 17 ++++++----------- .../src/update/new/indexer/settings_changes.rs | 7 +++---- crates/milli/src/update/new/indexer/write.rs | 1 + crates/milli/src/update/settings.rs | 6 ++---- 6 files changed, 21 insertions(+), 27 deletions(-) diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 1514de72f..4d9a72715 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -186,10 +186,10 @@ impl<'a, 'b> SettingsChangeDocumentExtractor<'a, 'b> { } impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeDocumentExtractor<'_, '_> { - type Data = FullySend>; + type Data = FullySend<()>; fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result { - Ok(FullySend(Default::default())) + Ok(FullySend(())) } fn process<'doc>( diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index c94da9629..8ed9dc37a 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -322,12 +322,13 @@ where Result::Ok((facet_field_ids_delta, index_embeddings)) } -pub(super) fn extract_all_settings_changes<'extractor, MSP, SD>( +#[allow(clippy::too_many_arguments)] +pub(super) fn extract_all_settings_changes( indexing_context: IndexingContext, indexer_span: Span, extractor_sender: ExtractorBbqueueSender, settings_delta: &SD, - extractor_allocs: &'extractor mut ThreadLocal>, + extractor_allocs: &mut ThreadLocal>, finished_extraction: &AtomicBool, field_distribution: &mut BTreeMap, mut index_embeddings: Vec, @@ 
-342,7 +343,7 @@ where let all_document_ids = indexing_context.index.documents_ids(&rtxn)?.into_iter().collect::>(); let primary_key = - primary_key_from_db(&indexing_context.index, &rtxn, &indexing_context.db_fields_ids_map)?; + primary_key_from_db(indexing_context.index, &rtxn, &indexing_context.db_fields_ids_map)?; let documents = DatabaseDocuments::new(&all_document_ids, primary_key); let span = @@ -364,7 +365,7 @@ where let embedding_sender = extractor_sender.embeddings(); - // extract the remaining embedders + // extract the remaining embeddings let extractor = SettingsChangeEmbeddingExtractor::new( settings_delta.new_embedders(), settings_delta.old_embedders(), @@ -410,9 +411,9 @@ where Result::Ok(index_embeddings) } -fn primary_key_from_db<'indexer, 'index>( +fn primary_key_from_db<'indexer>( index: &'indexer Index, - rtxn: &'indexer heed::RoTxn<'index>, + rtxn: &'indexer heed::RoTxn<'_>, fields: &'indexer impl FieldIdMapper, ) -> Result> { let Some(primary_key) = index.primary_key(rtxn)? else { diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 6dbfc5433..2398b5f09 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -205,8 +205,7 @@ where Ok(congestion) } -#[allow(clippy::too_many_arguments)] // clippy: 😝 -pub fn reindex<'pl, 'indexer, 'index, MSP, SD>( +pub fn reindex<'indexer, 'index, MSP, SD>( wtxn: &mut RwTxn<'index>, index: &'index Index, pool: &ThreadPoolNoAbort, @@ -307,7 +306,7 @@ where index_embeddings, arroy_memory, &mut arroy_writers, - Some(&embedder_actions), + Some(embedder_actions), &indexing_context.must_stop_processing, ) }) @@ -336,8 +335,8 @@ where Ok(congestion) } -fn arroy_writers_from_embedder_actions<'indexer, 'index>( - index: &'index Index, +fn arroy_writers_from_embedder_actions<'indexer>( + index: &Index, embedder_actions: &'indexer BTreeMap, embedders: &'indexer EmbeddingConfigs, index_embedder_category_ids: &'indexer std::collections::HashMap, @@ -372,15 +371,11 @@ fn arroy_writers_from_embedder_actions<'indexer, 'index>( .collect() } -fn delete_old_embedders<'indexer, 'index, SD>( - wtxn: &mut RwTxn<'_>, - index: &'index Index, - settings_delta: &'indexer SD, -) -> Result<()> +fn delete_old_embedders(wtxn: &mut RwTxn<'_>, index: &Index, settings_delta: &SD) -> Result<()> where SD: SettingsDelta, { - for (_name, action) in settings_delta.embedder_actions() { + for action in settings_delta.embedder_actions().values() { if let Some(WriteBackToDocuments { embedder_id, .. 
}) = action.write_back() { let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized); let dimensions = reader.dimensions(wtxn)?; diff --git a/crates/milli/src/update/new/indexer/settings_changes.rs b/crates/milli/src/update/new/indexer/settings_changes.rs index 90c451534..99e303f16 100644 --- a/crates/milli/src/update/new/indexer/settings_changes.rs +++ b/crates/milli/src/update/new/indexer/settings_changes.rs @@ -23,7 +23,7 @@ pub trait SettingsChangeExtractor<'extractor>: Sync { fn process<'doc>( &'doc self, - changes: impl Iterator>>, + documents: impl Iterator>>, context: &'doc DocumentContext, ) -> Result<()>; } @@ -128,12 +128,11 @@ pub fn settings_change_extract< // Clean up and reuse the document-specific allocator context.doc_alloc.reset(); - let items = items.as_ref(); - let changes = items + let documents = items .iter() .filter_map(|item| documents.item_to_database_document(context, item).transpose()); - let res = extractor.process(changes, context).map_err(Arc::new); + let res = extractor.process(documents, context).map_err(Arc::new); step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed); // send back the doc_alloc in the pool diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 19696f169..fa48ff589 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -101,6 +101,7 @@ impl ChannelCongestion { } #[tracing::instrument(level = "debug", skip_all, target = "indexing::vectors")] +#[allow(clippy::too_many_arguments)] pub fn build_vectors( index: &Index, wtxn: &mut RwTxn<'_>, diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index d21afdd28..32e3b17f9 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1476,7 +1476,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { chat: Setting::NotSet, wtxn: _, index: _, - indexer_config: _, // TODO: this is not used + indexer_config: _, } = &self { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; @@ -1486,9 +1486,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // Update index settings let embedding_config_updates = self.update_embedding_configs()?; - let mut new_inner_settings = - InnerIndexSettings::from_index(self.index, self.wtxn, None)?; - new_inner_settings.recompute_searchables(self.wtxn, self.index)?; + let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?; let primary_key_id = self .index From 6db5939f8409bb831ca79a76e5d708e7500aa698 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 30 Jun 2025 09:46:19 +0200 Subject: [PATCH 064/150] Re-integrate embedder stats --- crates/benchmarks/benches/indexing.rs | 2 +- crates/benchmarks/benches/utils.rs | 2 +- .../index-scheduler/src/scheduler/process_batch.rs | 6 +++++- .../src/scheduler/process_index_operation.rs | 2 +- crates/meilisearch/src/lib.rs | 5 +++-- crates/milli/src/search/new/tests/integration.rs | 2 +- crates/milli/src/test_index.rs | 2 +- crates/milli/src/update/new/extract/vectors/mod.rs | 4 ++++ crates/milli/src/update/new/indexer/extract.rs | 2 ++ crates/milli/src/update/new/indexer/mod.rs | 6 ++++-- crates/milli/src/update/settings.rs | 13 +++++++++++-- crates/milli/tests/search/distinct.rs | 2 +- crates/milli/tests/search/facet_distribution.rs | 2 +- crates/milli/tests/search/mod.rs | 2 +- crates/milli/tests/search/phrase_search.rs | 2 +- crates/milli/tests/search/query_criteria.rs | 6 +++--- 
crates/milli/tests/search/typo_tolerance.rs | 8 ++++---- 17 files changed, 45 insertions(+), 23 deletions(-) diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 610fa4a00..16e7a2f81 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -65,7 +65,7 @@ fn setup_settings<'t>( let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); builder.set_sortable_fields(sortable_fields); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); } fn setup_index_with_settings( diff --git a/crates/benchmarks/benches/utils.rs b/crates/benchmarks/benches/utils.rs index 2cacc5477..54bb7e51b 100644 --- a/crates/benchmarks/benches/utils.rs +++ b/crates/benchmarks/benches/utils.rs @@ -90,7 +90,7 @@ pub fn base_setup(conf: &Conf) -> Index { (conf.configure)(&mut builder); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); wtxn.commit().unwrap(); let config = IndexerConfig::default(); diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 237608648..e6bf6f713 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -245,7 +245,11 @@ impl IndexScheduler { let must_stop_processing = self.scheduler.must_stop_processing.clone(); builder - .execute(&|| must_stop_processing.get(), &progress) + .execute( + &|| must_stop_processing.get(), + &progress, + current_batch.embedder_stats.clone(), + ) .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; index_wtxn.commit()?; } diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs index c302d6983..04aaf9a84 100644 --- a/crates/index-scheduler/src/scheduler/process_index_operation.rs +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -475,7 +475,7 @@ impl IndexScheduler { progress.update_progress(SettingsProgress::ApplyTheSettings); let congestion = builder - .execute(&|| must_stop_processing.get(), progress) + .execute(&|| must_stop_processing.get(), progress, embedder_stats) .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; Ok((tasks, congestion)) diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 4bfce17f8..871bd688e 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -37,7 +37,7 @@ use index_scheduler::{IndexScheduler, IndexSchedulerOptions}; use meilisearch_auth::{open_auth_store_env, AuthController}; use meilisearch_types::milli::constants::VERSION_MAJOR; use meilisearch_types::milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; -use meilisearch_types::milli::progress::Progress; +use meilisearch_types::milli::progress::{EmbedderStats, Progress}; use meilisearch_types::milli::update::{ default_thread_pool_and_threads, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, }; @@ -544,7 +544,8 @@ fn import_dump( tracing::info!("Importing the settings."); let settings = index_reader.settings()?; apply_settings_to_builder(&settings, &mut builder); - builder.execute(&|| false, &progress)?; + let embedder_stats: Arc = Default::default(); + builder.execute(&|| false, &progress, embedder_stats.clone())?; // 4.3 Import the documents. 
    // 4.3.1 We need to recreate the grenad+obkv format accepted by the index.
diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs
index 700a527ac..9e2afca97 100644
--- a/crates/milli/src/search/new/tests/integration.rs
+++ b/crates/milli/src/search/new/tests/integration.rs
@@ -44,7 +44,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
         S("america") => vec![S("the united states")],
     });
     builder.set_searchable_fields(vec![S("title"), S("description")]);
-    builder.execute(&|| false, &Progress::default()).unwrap();
+    builder.execute(&|| false, &Progress::default(), Default::default()).unwrap();
     wtxn.commit().unwrap();

     // index documents
diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs
index 03bef5838..f2e34c615 100644
--- a/crates/milli/src/test_index.rs
+++ b/crates/milli/src/test_index.rs
@@ -135,7 +135,7 @@ impl TempIndex {
     ) -> Result<(), crate::error::Error> {
         let mut builder = update::Settings::new(wtxn, &self.inner, &self.indexer_config);
         update(&mut builder);
-        builder.execute(&|| false, &Progress::default())?;
+        builder.execute(&|| false, &Progress::default(), Default::default())?;
         Ok(())
     }
diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs
index 252e136fd..edb68b6db 100644
--- a/crates/milli/src/update/new/extract/vectors/mod.rs
+++ b/crates/milli/src/update/new/extract/vectors/mod.rs
@@ -303,6 +303,7 @@ pub struct SettingsChangeEmbeddingExtractor<'a, 'b> {
     old_embedders: &'a EmbeddingConfigs,
     embedder_actions: &'a BTreeMap<String, EmbedderAction>,
     embedder_category_id: &'a std::collections::HashMap<String, u8>,
+    embedder_stats: &'a EmbedderStats,
     sender: EmbeddingSender<'a, 'b>,
     possible_embedding_mistakes: PossibleEmbeddingMistakes,
     threads: &'a ThreadPoolNoAbort,
@@ -314,6 +315,7 @@ impl<'a, 'b> SettingsChangeEmbeddingExtractor<'a, 'b> {
         old_embedders: &'a EmbeddingConfigs,
         embedder_actions: &'a BTreeMap<String, EmbedderAction>,
         embedder_category_id: &'a std::collections::HashMap<String, u8>,
+        embedder_stats: &'a EmbedderStats,
         sender: EmbeddingSender<'a, 'b>,
         field_distribution: &'a FieldDistribution,
         threads: &'a ThreadPoolNoAbort,
@@ -324,6 +326,7 @@ impl<'a, 'b> SettingsChangeEmbeddingExtractor<'a, 'b> {
             old_embedders,
             embedder_actions,
             embedder_category_id,
+            embedder_stats,
             sender,
             threads,
             possible_embedding_mistakes,
@@ -371,6 +374,7 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding
                             prompt,
                             context.data,
                             &self.possible_embedding_mistakes,
+                            self.embedder_stats,
                             self.threads,
                             self.sender,
                             &context.doc_alloc,
diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs
index 8ed9dc37a..2986d5d57 100644
--- a/crates/milli/src/update/new/indexer/extract.rs
+++ b/crates/milli/src/update/new/indexer/extract.rs
@@ -333,6 +333,7 @@ pub(super) fn extract_all_settings_changes(
     field_distribution: &mut BTreeMap<String, u64>,
     mut index_embeddings: Vec<IndexEmbeddingConfig>,
     modified_docids: &mut RoaringBitmap,
+    embedder_stats: &EmbedderStats,
 ) -> Result<Vec<IndexEmbeddingConfig>>
 where
     MSP: Fn() -> bool + Sync,
@@ -371,6 +372,7 @@ where
             settings_delta.old_embedders(),
             settings_delta.embedder_actions(),
             settings_delta.new_embedder_category_id(),
+            embedder_stats,
             embedding_sender,
             field_distribution,
             request_threads(),
diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs
index 2398b5f09..7d1ad6df5 100644
--- a/crates/milli/src/update/new/indexer/mod.rs
+++ b/crates/milli/src/update/new/indexer/mod.rs
@@ -1,6 +1,6 @@
 use std::collections::BTreeMap;
 use std::sync::atomic::AtomicBool;
-use std::sync::{Once, RwLock};
+use std::sync::{Arc, Once, RwLock};
 use std::thread::{self, Builder};

 use big_s::S;
@@ -20,8 +20,8 @@ use super::steps::IndexingStep;
 use super::thread_local::ThreadLocal;
 use crate::documents::PrimaryKey;
 use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
-use crate::update::settings::SettingsDelta;
 use crate::progress::{EmbedderStats, Progress};
+use crate::update::settings::SettingsDelta;
 use crate::update::GrenadParameters;
 use crate::vector::settings::{EmbedderAction, WriteBackToDocuments};
 use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs};
@@ -213,6 +213,7 @@ pub fn reindex<'indexer, 'index, MSP, SD>(
     settings_delta: &'indexer SD,
     must_stop_processing: &'indexer MSP,
     progress: &'indexer Progress,
+    embedder_stats: Arc<EmbedderStats>,
 ) -> Result<ChannelCongestion>
 where
     MSP: Fn() -> bool + Sync,
@@ -274,6 +275,7 @@ where
                     field_distribution,
                     index_embeddings,
                     modified_docids,
+                    &embedder_stats,
                 )
             })
             .unwrap()
diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs
index 32e3b17f9..834b85978 100644
--- a/crates/milli/src/update/settings.rs
+++ b/crates/milli/src/update/settings.rs
@@ -27,8 +27,8 @@ use crate::index::{
     DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
 };
 use crate::order_by_map::OrderByMap;
-use crate::progress::Progress;
 use crate::progress::EmbedderStats;
+use crate::progress::Progress;
 use crate::prompt::{default_max_bytes, default_template_text, PromptData};
 use crate::proximity::ProximityPrecision;
 use crate::update::index_documents::IndexDocumentsMethod;
@@ -1362,7 +1362,12 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         }
     }

-    pub fn legacy_execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
+    pub fn legacy_execute<FP, FA>(
+        mut self,
+        progress_callback: FP,
+        should_abort: FA,
+        embedder_stats: Arc<EmbedderStats>,
+    ) -> Result<()>
     where
         FP: Fn(UpdateIndexingStep) + Sync,
         FA: Fn() -> bool + Sync,
@@ -1430,6 +1435,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         mut self,
         must_stop_processing: &'indexer MSP,
         progress: &'indexer Progress,
+        embedder_stats: Arc<EmbedderStats>,
     ) -> Result<Option<ChannelCongestion>>
     where
         MSP: Fn() -> bool + Sync,
@@ -1440,6 +1446,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
                 .legacy_execute(
                     |indexing_step| tracing::debug!(update = ?indexing_step),
                     must_stop_processing,
+                    embedder_stats,
                 )
                 .map(|_| None);
         }
@@ -1510,6 +1517,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
                 &inner_settings_diff,
                 must_stop_processing,
                 progress,
+                embedder_stats,
             )
             .map(Some)
         } else {
@@ -1519,6 +1527,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             self.legacy_execute(
                 |indexing_step| tracing::debug!(update = ?indexing_step),
                 must_stop_processing,
+                embedder_stats,
             )
             .map(|_| None)
         }
diff --git a/crates/milli/tests/search/distinct.rs b/crates/milli/tests/search/distinct.rs
index c22755751..c7fa9befa 100644
--- a/crates/milli/tests/search/distinct.rs
+++ b/crates/milli/tests/search/distinct.rs
@@ -20,7 +20,7 @@ macro_rules!
test_distinct { let config = milli::update::IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_distinct_field(S(stringify!($distinct))); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index ff939ec47..d04db425e 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -25,7 +25,7 @@ fn test_facet_distribution_with_no_facet_values() { FilterableAttributesRule::Field(S("genres")), FilterableAttributesRule::Field(S("tags")), ]); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); wtxn.commit().unwrap(); // index documents diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 0515ece66..3ee78561d 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -63,7 +63,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("america") => vec![S("the united states")], }); builder.set_searchable_fields(vec![S("title"), S("description")]); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); wtxn.commit().unwrap(); // index documents diff --git a/crates/milli/tests/search/phrase_search.rs b/crates/milli/tests/search/phrase_search.rs index da519c6f6..397729c20 100644 --- a/crates/milli/tests/search/phrase_search.rs +++ b/crates/milli/tests/search/phrase_search.rs @@ -11,7 +11,7 @@ fn set_stop_words(index: &Index, stop_words: &[&str]) { let mut builder = Settings::new(&mut wtxn, index, &config); let stop_words = stop_words.iter().map(|s| s.to_string()).collect(); builder.set_stop_words(stop_words); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); wtxn.commit().unwrap(); } diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index 113c8bc03..cb0c23e42 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -236,7 +236,7 @@ fn criteria_mixup() { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.clone()); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -276,7 +276,7 @@ fn criteria_ascdesc() { S("name"), S("age"), }); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); wtxn.commit().unwrap(); let mut wtxn = index.write_txn().unwrap(); @@ -359,7 +359,7 @@ fn criteria_ascdesc() { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(vec![criterion.clone()]); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git 
a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs index f8e688215..49c9c7b5d 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -46,7 +46,7 @@ fn test_typo_tolerance_one_typo() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); builder.set_min_word_len_one_typo(4); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); @@ -92,7 +92,7 @@ fn test_typo_tolerance_two_typo() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); builder.set_min_word_len_two_typos(7); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); @@ -181,7 +181,7 @@ fn test_typo_disabled_on_word() { // `zealand` doesn't allow typos anymore exact_words.insert("zealand".to_string()); builder.set_exact_words(exact_words); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); let mut search = Search::new(&txn, &index); search.query("zealand"); @@ -219,7 +219,7 @@ fn test_disable_typo_on_attribute() { let mut builder = Settings::new(&mut txn, &index, &config); // disable typos on `description` builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); - builder.execute(&|| false, &Progress::default()).unwrap(); + builder.execute(&|| false, &Progress::default(), Default::default()).unwrap(); let mut search = Search::new(&txn, &index); search.query("antebelum"); From 6b2b8ed676c27eb4047797831dd70065ad6b734a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 30 Jun 2025 11:49:03 +0200 Subject: [PATCH 065/150] Transform experimental_no_edition_2024_for_settings into a config --- .../src/analytics/segment_analytics.rs | 10 +++++-- crates/meilisearch/src/option.rs | 29 +++++++++++++++++-- crates/meilisearch/tests/common/server.rs | 1 + crates/milli/src/update/indexer_config.rs | 2 ++ crates/milli/src/update/settings.rs | 2 +- 5 files changed, 39 insertions(+), 5 deletions(-) diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index c7e0634f4..668a7fded 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -202,6 +202,7 @@ struct Infos { experimental_composite_embedders: bool, experimental_embedding_cache_entries: usize, experimental_no_snapshot_compaction: bool, + experimental_no_edition_2024_for_settings: bool, gpu_enabled: bool, db_path: bool, import_dump: bool, @@ -286,8 +287,12 @@ impl Infos { ScheduleSnapshot::Enabled(interval) => Some(interval), }; - let IndexerOpts { max_indexing_memory, max_indexing_threads, skip_index_budget: _ } = - indexer_options; + let IndexerOpts { + max_indexing_memory, + max_indexing_threads, + skip_index_budget: _, + experimental_no_edition_2024_for_settings, + } = indexer_options; let RuntimeTogglableFeatures { metrics, @@ -350,6 +355,7 @@ impl Infos { ssl_require_auth, ssl_resumption, ssl_tickets, + experimental_no_edition_2024_for_settings, } } } diff --git a/crates/meilisearch/src/option.rs 
b/crates/meilisearch/src/option.rs index 5b7d1e52f..9ebf502d9 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -53,6 +53,8 @@ const MEILI_EXPERIMENTAL_DUMPLESS_UPGRADE: &str = "MEILI_EXPERIMENTAL_DUMPLESS_U const MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS: &str = "MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS"; const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE"; const MEILI_EXPERIMENTAL_CONTAINS_FILTER: &str = "MEILI_EXPERIMENTAL_CONTAINS_FILTER"; +const MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS: &str = + "MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE: &str = "MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE"; const MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER: &str = "MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER"; @@ -749,12 +751,24 @@ pub struct IndexerOpts { #[clap(skip)] #[serde(skip)] pub skip_index_budget: bool, + + /// Experimental no edition 2024 for settings feature. For more information, + /// + /// Enables the experimental no edition 2024 for settings feature. + #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS)] + #[serde(default)] + pub experimental_no_edition_2024_for_settings: bool, } impl IndexerOpts { /// Exports the values to their corresponding env vars if they are not set. pub fn export_to_env(self) { - let IndexerOpts { max_indexing_memory, max_indexing_threads, skip_index_budget: _ } = self; + let IndexerOpts { + max_indexing_memory, + max_indexing_threads, + skip_index_budget: _, + experimental_no_edition_2024_for_settings, + } = self; if let Some(max_indexing_memory) = max_indexing_memory.0 { export_to_env_if_not_present( MEILI_MAX_INDEXING_MEMORY, @@ -767,6 +781,12 @@ impl IndexerOpts { max_indexing_threads.to_string(), ); } + if experimental_no_edition_2024_for_settings { + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS, + experimental_no_edition_2024_for_settings.to_string(), + ); + } } } @@ -785,7 +805,12 @@ impl TryFrom<&IndexerOpts> for IndexerConfig { max_threads: *other.max_indexing_threads, max_positions_per_attributes: None, skip_index_budget: other.skip_index_budget, - ..Default::default() + experimental_no_edition_2024_for_settings: other + .experimental_no_edition_2024_for_settings, + chunk_compression_type: Default::default(), + chunk_compression_level: Default::default(), + documents_chunk_size: Default::default(), + max_nb_chunks: Default::default(), }) } } diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs index 1f5688a02..4367650c5 100644 --- a/crates/meilisearch/tests/common/server.rs +++ b/crates/meilisearch/tests/common/server.rs @@ -464,6 +464,7 @@ pub fn default_settings(dir: impl AsRef) -> Opt { skip_index_budget: true, // Having 2 threads makes the tests way faster max_indexing_threads: MaxThreads::from_str("2").unwrap(), + experimental_no_edition_2024_for_settings: false, }, experimental_enable_metrics: false, ..Parser::parse_from(None as Option<&str>) diff --git a/crates/milli/src/update/indexer_config.rs b/crates/milli/src/update/indexer_config.rs index eb7fbd4d5..a0f901818 100644 --- a/crates/milli/src/update/indexer_config.rs +++ b/crates/milli/src/update/indexer_config.rs @@ -15,6 +15,7 @@ pub struct IndexerConfig { pub thread_pool: ThreadPoolNoAbort, pub max_positions_per_attributes: Option, pub skip_index_budget: bool, + pub 
experimental_no_edition_2024_for_settings: bool, } impl IndexerConfig { @@ -63,6 +64,7 @@ impl Default for IndexerConfig { chunk_compression_level: None, max_positions_per_attributes: None, skip_index_budget: false, + experimental_no_edition_2024_for_settings: false, } } } diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 834b85978..c6ede7a1d 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1441,7 +1441,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { MSP: Fn() -> bool + Sync, { // force the old indexer if the environment says so - if std::env::var_os("MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS").is_some() { + if self.indexer_config.experimental_no_edition_2024_for_settings { return self .legacy_execute( |indexing_step| tracing::debug!(update = ?indexing_step), From 7a204609fea7577eb5ea73dd4791516afe73b1eb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 30 Jun 2025 14:21:46 +0200 Subject: [PATCH 066/150] Move document context and identifiers in document.rs --- crates/milli/src/update/new/document.rs | 136 +++++++++++++++++- .../milli/src/update/new/document_change.rs | 45 +----- .../milli/src/update/new/extract/documents.rs | 10 +- .../new/extract/faceted/extract_facets.rs | 3 +- .../milli/src/update/new/extract/geo/mod.rs | 4 +- .../extract/searchable/extract_word_docids.rs | 3 +- .../extract_word_pair_proximity_docids.rs | 4 +- .../src/update/new/extract/vectors/mod.rs | 6 +- .../update/new/indexer/document_changes.rs | 84 +---------- .../update/new/indexer/document_deletion.rs | 15 +- .../update/new/indexer/document_operation.rs | 8 +- .../milli/src/update/new/indexer/extract.rs | 4 +- .../src/update/new/indexer/partial_dump.rs | 4 +- .../update/new/indexer/settings_changes.rs | 15 +- .../update/new/indexer/update_by_function.rs | 7 +- crates/milli/src/update/new/mod.rs | 3 +- 16 files changed, 182 insertions(+), 169 deletions(-) diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index c7156c120..b07cc0298 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -1,7 +1,10 @@ +use std::cell::{Cell, RefCell}; use std::collections::{BTreeMap, BTreeSet}; +use std::sync::RwLock; +use bumpalo::Bump; use bumparaw_collections::RawMap; -use heed::RoTxn; +use heed::{RoTxn, WithoutTls}; use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; @@ -9,8 +12,13 @@ use super::vector_document::VectorDocument; use super::{KvReaderFieldId, KvWriterFieldId}; use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::documents::FieldIdMapper; +use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; +use crate::update::new::vector_document::VectorDocumentFromDb; use crate::vector::settings::EmbedderAction; -use crate::{DocumentId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError}; +use crate::{ + DocumentId, FieldIdMapWithMetadata, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, + Result, UserError, +}; /// A view into a document that can represent either the current version from the DB, /// the update data from payload or other means, or the merged updated version. 
@@ -460,3 +468,127 @@ impl<'doc> Versions<'doc> {
         self.data.get(k)
     }
 }
+
+pub struct DocumentIdentifiers<'doc> {
+    docid: DocumentId,
+    external_document_id: &'doc str,
+}
+
+impl<'doc> DocumentIdentifiers<'doc> {
+    pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self {
+        Self { docid, external_document_id }
+    }
+
+    pub fn docid(&self) -> DocumentId {
+        self.docid
+    }
+
+    pub fn external_document_id(&self) -> &'doc str {
+        self.external_document_id
+    }
+
+    pub fn current<'a, Mapper: FieldIdMapper>(
+        &self,
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+        mapper: &'a Mapper,
+    ) -> Result<DocumentFromDb<'a, Mapper>> {
+        Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
+            crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
+        )?)
+    }
+
+    pub fn current_vectors<'a, Mapper: FieldIdMapper>(
+        &self,
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+        mapper: &'a Mapper,
+        doc_alloc: &'a Bump,
+    ) -> Result<VectorDocumentFromDb<'a>> {
+        Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or(
+            crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
+        )?)
+    }
+}
+
+pub struct DocumentContext<
+    'doc,             // covariant lifetime of a single `process` call
+    'extractor: 'doc, // invariant lifetime of the extractor_allocs
+    'fid: 'doc,       // invariant lifetime of the new_fields_ids_map
+    'indexer: 'doc,   // covariant lifetime of objects that outlive a single `process` call
+    T: MostlySend,
+> {
+    /// The index we're indexing in
+    pub index: &'indexer Index,
+    /// The fields ids map as it was at the start of this indexing process. Contains at least all top-level fields from documents
+    /// inside of the DB.
+    pub db_fields_ids_map: &'indexer FieldsIdsMap,
+    /// A transaction providing data from the DB before all indexing operations
+    pub rtxn: RoTxn<'indexer, WithoutTls>,
+
+    /// Global field id map that is up to date with the current state of the indexing process.
+    ///
+    /// - Inserting a field will take a lock
+    /// - Retrieving a field may take a lock as well
+    pub new_fields_ids_map: &'doc std::cell::RefCell<GlobalFieldsIdsMap<'fid>>,
+
+    /// Data allocated in this allocator is cleared between each call to `process`.
+    pub doc_alloc: Bump,
+
+    /// Data allocated in this allocator is not cleared between each call to `process`, unless the data spills.
+    pub extractor_alloc: &'extractor Bump,
+
+    /// Pool of doc allocators, used to retrieve the doc allocator we provided for the documents
+    pub doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>,
+
+    /// Extractor-specific data
+    pub data: &'doc T,
+}
+
+impl<
+        'doc,             // covariant lifetime of a single `process` call
+        'data: 'doc,      // invariant on T lifetime of the datastore
+        'extractor: 'doc, // invariant lifetime of extractor_allocs
+        'fid: 'doc,       // invariant lifetime of fields ids map
+        'indexer: 'doc,   // covariant lifetime of objects that survive a `process` call
+        T: MostlySend,
+    > DocumentContext<'doc, 'extractor, 'fid, 'indexer, T>
+{
+    #[allow(clippy::too_many_arguments)]
+    pub fn new<F>(
+        index: &'indexer Index,
+        db_fields_ids_map: &'indexer FieldsIdsMap,
+        new_fields_ids_map: &'fid RwLock<FieldIdMapWithMetadata>,
+        extractor_allocs: &'extractor ThreadLocal<FullySend<Bump>>,
+        doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>,
+        datastore: &'data ThreadLocal<T>,
+        fields_ids_map_store: &'doc ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
+        init_data: F,
+    ) -> Result<Self>
+    where
+        F: FnOnce(&'extractor Bump) -> Result<T>,
+    {
+        let doc_alloc =
+            doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024))));
+        let doc_alloc = doc_alloc.0.take();
+        let fields_ids_map = fields_ids_map_store
+            .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into());
+
+        let fields_ids_map = &fields_ids_map.0;
+        let extractor_alloc = extractor_allocs.get_or_default();
+
+        let data = datastore.get_or_try(move || init_data(&extractor_alloc.0))?;
+
+        let txn = index.read_txn()?;
+        Ok(DocumentContext {
+            index,
+            rtxn: txn,
+            db_fields_ids_map,
+            new_fields_ids_map: fields_ids_map,
+            doc_alloc,
+            extractor_alloc: &extractor_alloc.0,
+            data,
+            doc_allocs,
+        })
+    }
+}
diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs
index 2ff96fd24..2b9161319 100644
--- a/crates/milli/src/update/new/document_change.rs
+++ b/crates/milli/src/update/new/document_change.rs
@@ -10,11 +10,12 @@ use super::vector_document::{
 };
 use crate::attribute_patterns::PatternMatch;
 use crate::documents::FieldIdMapper;
+use crate::update::new::document::DocumentIdentifiers;
 use crate::vector::EmbeddingConfigs;
 use crate::{DocumentId, Index, InternalError, Result};

 pub enum DocumentChange<'doc> {
-    Deletion(DatabaseDocument<'doc>),
+    Deletion(DocumentIdentifiers<'doc>),
     Update(Update<'doc>),
     Insertion(Insertion<'doc>),
 }
@@ -32,11 +33,6 @@ pub struct Insertion<'doc> {
     new: Versions<'doc>,
 }

-pub struct DatabaseDocument<'doc> {
-    docid: DocumentId,
-    external_document_id: &'doc str,
-}
-
 impl<'doc> DocumentChange<'doc> {
     pub fn docid(&self) -> DocumentId {
         match &self {
@@ -279,40 +275,3 @@ impl<'doc> Update<'doc> {
         }
     }
 }
-
-impl<'doc> DatabaseDocument<'doc> {
-    pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self {
-        Self { docid, external_document_id }
-    }
-
-    pub fn docid(&self) -> DocumentId {
-        self.docid
-    }
-
-    pub fn external_document_id(&self) -> &'doc str {
-        self.external_document_id
-    }
-
-    pub fn current<'a, Mapper: FieldIdMapper>(
-        &self,
-        rtxn: &'a RoTxn,
-        index: &'a Index,
-        mapper: &'a Mapper,
-    ) -> Result<DocumentFromDb<'a, Mapper>> {
-        Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
-            crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
-        )?)
- } - - pub fn current_vectors<'a, Mapper: FieldIdMapper>( - &self, - rtxn: &'a RoTxn, - index: &'a Index, - mapper: &'a Mapper, - doc_alloc: &'a Bump, - ) -> Result> { - Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or( - crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid }, - )?) - } -} diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 4d9a72715..5c1a1927a 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -8,10 +8,10 @@ use super::DelAddRoaringBitmap; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::update::new::channel::{DocumentsSender, ExtractorBbqueueSender}; use crate::update::new::document::{write_to_obkv, Document}; -use crate::update::new::document_change::DatabaseDocument; -use crate::update::new::indexer::document_changes::{DocumentContext, Extractor, IndexingContext}; +use crate::update::new::document::{DocumentContext, DocumentIdentifiers}; +use crate::update::new::indexer::document_changes::{Extractor, IndexingContext}; use crate::update::new::indexer::settings_changes::{ - settings_change_extract, DatabaseDocuments, SettingsChangeExtractor, + settings_change_extract, DocumentsIndentifiers, SettingsChangeExtractor, }; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::thread_local::{FullySend, ThreadLocal}; @@ -194,7 +194,7 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeDocumentE fn process<'doc>( &self, - documents: impl Iterator>>, + documents: impl Iterator>>, context: &DocumentContext, ) -> Result<()> { let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc); @@ -242,7 +242,7 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeDocumentE /// and then updates the database. 
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents::extract")] pub fn update_database_documents<'indexer, 'extractor, MSP, SD>( - documents: &'indexer DatabaseDocuments<'indexer>, + documents: &'indexer DocumentsIndentifiers<'indexer>, indexing_context: IndexingContext, extractor_sender: &ExtractorBbqueueSender, settings_delta: &SD, diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 39e68fb6d..6e9ae7ee4 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -15,9 +15,10 @@ use crate::filterable_attributes_rules::match_faceted_field; use crate::heed_codec::facet::OrderedF64Codec; use crate::update::del_add::DelAdd; use crate::update::new::channel::FieldIdDocidFacetSender; +use crate::update::new::document::DocumentContext; use crate::update::new::extract::perm_json_p; use crate::update::new::indexer::document_changes::{ - extract, DocumentContext, DocumentChanges, Extractor, IndexingContext, + extract, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::steps::IndexingStep; diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index 927434ff6..8e164b48f 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -10,8 +10,8 @@ use serde_json::value::RawValue; use serde_json::Value; use crate::error::GeoError; -use crate::update::new::document::Document; -use crate::update::new::indexer::document_changes::{DocumentContext, Extractor}; +use crate::update::new::document::{Document, DocumentContext}; +use crate::update::new::indexer::document_changes::Extractor; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::thread_local::MostlySend; use crate::update::new::DocumentChange; diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 35bc9f063..5daf34ca4 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -8,10 +8,11 @@ use bumpalo::Bump; use super::match_searchable_field; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; +use crate::update::new::document::DocumentContext; use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ - extract, DocumentContext, DocumentChanges, Extractor, IndexingContext, + extract, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::steps::IndexingStep; diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index dffde06c7..c9acb9734 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -7,10 +7,10 @@ use bumpalo::Bump; use super::match_searchable_field; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::proximity::{index_proximity, 
MAX_DISTANCE}; -use crate::update::new::document::Document; +use crate::update::new::document::{Document, DocumentContext}; use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::indexer::document_changes::{ - extract, DocumentContext, DocumentChanges, Extractor, IndexingContext, + extract, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::steps::IndexingStep; diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index edb68b6db..6d5052ac8 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -10,8 +10,8 @@ use crate::error::FaultSource; use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; -use crate::update::new::document_change::DatabaseDocument; -use crate::update::new::indexer::document_changes::{DocumentContext, Extractor}; +use crate::update::new::document::{DocumentContext, DocumentIdentifiers}; +use crate::update::new::indexer::document_changes::Extractor; use crate::update::new::indexer::settings_changes::SettingsChangeExtractor; use crate::update::new::thread_local::MostlySend; use crate::update::new::vector_document::VectorDocument; @@ -343,7 +343,7 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding fn process<'doc>( &'doc self, - documents: impl Iterator>>, + documents: impl Iterator>>, context: &'doc DocumentContext, ) -> crate::Result<()> { let embedders = self.embedders.inner_as_ref(); diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index 3069ab29b..c88751ee3 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -3,100 +3,18 @@ use std::sync::atomic::Ordering; use std::sync::{Arc, RwLock}; use bumpalo::Bump; -use heed::{RoTxn, WithoutTls}; use rayon::iter::IndexedParallelIterator; use super::super::document_change::DocumentChange; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::progress::{AtomicDocumentStep, Progress}; +use crate::update::new::document::DocumentContext; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::GrenadParameters; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result}; -pub struct DocumentContext< - 'doc, // covariant lifetime of a single `process` call - 'extractor: 'doc, // invariant lifetime of the extractor_allocs - 'fid: 'doc, // invariant lifetime of the new_fields_ids_map - 'indexer: 'doc, // covariant lifetime of objects that outlive a single `process` call - T: MostlySend, -> { - /// The index we're indexing in - pub index: &'indexer Index, - /// The fields ids map as it was at the start of this indexing process. Contains at least all top-level fields from documents - /// inside of the DB. - pub db_fields_ids_map: &'indexer FieldsIdsMap, - /// A transaction providing data from the DB before all indexing operations - pub rtxn: RoTxn<'indexer, WithoutTls>, - - /// Global field id map that is up to date with the current state of the indexing process. 
- /// - /// - Inserting a field will take a lock - /// - Retrieving a field may take a lock as well - pub new_fields_ids_map: &'doc std::cell::RefCell>, - - /// Data allocated in this allocator is cleared between each call to `process`. - pub doc_alloc: Bump, - - /// Data allocated in this allocator is not cleared between each call to `process`, unless the data spills. - pub extractor_alloc: &'extractor Bump, - - /// Pool of doc allocators, used to retrieve the doc allocator we provided for the documents - pub doc_allocs: &'doc ThreadLocal>>, - - /// Extractor-specific data - pub data: &'doc T, -} - -impl< - 'doc, // covariant lifetime of a single `process` call - 'data: 'doc, // invariant on T lifetime of the datastore - 'extractor: 'doc, // invariant lifetime of extractor_allocs - 'fid: 'doc, // invariant lifetime of fields ids map - 'indexer: 'doc, // covariant lifetime of objects that survive a `process` call - T: MostlySend, - > DocumentContext<'doc, 'extractor, 'fid, 'indexer, T> -{ - #[allow(clippy::too_many_arguments)] - pub fn new( - index: &'indexer Index, - db_fields_ids_map: &'indexer FieldsIdsMap, - new_fields_ids_map: &'fid RwLock, - extractor_allocs: &'extractor ThreadLocal>, - doc_allocs: &'doc ThreadLocal>>, - datastore: &'data ThreadLocal, - fields_ids_map_store: &'doc ThreadLocal>>>, - init_data: F, - ) -> Result - where - F: FnOnce(&'extractor Bump) -> Result, - { - let doc_alloc = - doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024)))); - let doc_alloc = doc_alloc.0.take(); - let fields_ids_map = fields_ids_map_store - .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into()); - - let fields_ids_map = &fields_ids_map.0; - let extractor_alloc = extractor_allocs.get_or_default(); - - let data = datastore.get_or_try(move || init_data(&extractor_alloc.0))?; - - let txn = index.read_txn()?; - Ok(DocumentContext { - index, - rtxn: txn, - db_fields_ids_map, - new_fields_ids_map: fields_ids_map, - doc_alloc, - extractor_alloc: &extractor_alloc.0, - data, - doc_allocs, - }) - } -} - /// An internal iterator (i.e. 
using `foreach`) of `DocumentChange`s pub trait Extractor<'extractor>: Sync { type Data: MostlySend; diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs index 292cdc36e..157e20bb0 100644 --- a/crates/milli/src/update/new/indexer/document_deletion.rs +++ b/crates/milli/src/update/new/indexer/document_deletion.rs @@ -4,10 +4,11 @@ use rayon::iter::IndexedParallelIterator; use rayon::slice::ParallelSlice as _; use roaring::RoaringBitmap; -use super::document_changes::{DocumentContext, DocumentChanges}; +use super::document_changes::DocumentChanges; use crate::documents::PrimaryKey; +use crate::update::new::document::DocumentContext; use crate::update::new::thread_local::MostlySend; -use crate::update::new::{DatabaseDocument, DocumentChange}; +use crate::update::new::{DocumentChange, DocumentIdentifiers}; use crate::{DocumentId, Result}; #[derive(Default)] @@ -74,7 +75,10 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { let external_document_id = external_document_id.to_bump(&context.doc_alloc); - Ok(Some(DocumentChange::Deletion(DatabaseDocument::create(*docid, external_document_id)))) + Ok(Some(DocumentChange::Deletion(DocumentIdentifiers::create( + *docid, + external_document_id, + )))) } fn len(&self) -> usize { @@ -93,9 +97,8 @@ mod test { use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::tests::TempIndex; use crate::progress::Progress; - use crate::update::new::indexer::document_changes::{ - extract, DocumentContext, Extractor, IndexingContext, - }; + use crate::update::new::document::DocumentContext; + use crate::update::new::indexer::document_changes::{extract, Extractor, IndexingContext}; use crate::update::new::indexer::DocumentDeletion; use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{MostlySend, ThreadLocal}; diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 4bcfb2d47..98faaf145 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -12,14 +12,14 @@ use serde_json::value::RawValue; use serde_json::Deserializer; use super::super::document_change::DocumentChange; -use super::document_changes::{DocumentContext, DocumentChanges}; +use super::document_changes::DocumentChanges; use super::guess_primary_key::retrieve_or_guess_primary_key; use crate::documents::PrimaryKey; use crate::progress::{AtomicPayloadStep, Progress}; -use crate::update::new::document::Versions; +use crate::update::new::document::{DocumentContext, Versions}; use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::MostlySend; -use crate::update::new::{DatabaseDocument, Insertion, Update}; +use crate::update::new::{DocumentIdentifiers, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, InternalError, Result, UserError}; @@ -577,7 +577,7 @@ impl<'pl> PayloadOperations<'pl> { if self.is_new { Ok(None) } else { - let deletion = DatabaseDocument::create(self.docid, external_doc); + let deletion = DocumentIdentifiers::create(self.docid, external_doc); Ok(Some(DocumentChange::Deletion(deletion))) } } diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index 2986d5d57..bb275d8aa 100644 --- 
a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -20,7 +20,7 @@ use crate::progress::EmbedderStats; use crate::progress::MergingWordCache; use crate::proximity::ProximityPrecision; use crate::update::new::extract::EmbeddingExtractor; -use crate::update::new::indexer::settings_changes::DatabaseDocuments; +use crate::update::new::indexer::settings_changes::DocumentsIndentifiers; use crate::update::new::merger::merge_and_send_rtree; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::SettingsDelta; @@ -345,7 +345,7 @@ where indexing_context.index.documents_ids(&rtxn)?.into_iter().collect::>(); let primary_key = primary_key_from_db(indexing_context.index, &rtxn, &indexing_context.db_fields_ids_map)?; - let documents = DatabaseDocuments::new(&all_document_ids, primary_key); + let documents = DocumentsIndentifiers::new(&all_document_ids, primary_key); let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index 614c61353..33e72f532 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -5,10 +5,10 @@ use rayon::iter::IndexedParallelIterator; use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; -use super::document_changes::{DocumentContext, DocumentChanges}; +use super::document_changes::DocumentChanges; use crate::documents::PrimaryKey; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; -use crate::update::new::document::Versions; +use crate::update::new::document::{DocumentContext, Versions}; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::thread_local::MostlySend; use crate::update::new::{DocumentChange, Insertion}; diff --git a/crates/milli/src/update/new/indexer/settings_changes.rs b/crates/milli/src/update/new/indexer/settings_changes.rs index 99e303f16..984ab3a0b 100644 --- a/crates/milli/src/update/new/indexer/settings_changes.rs +++ b/crates/milli/src/update/new/indexer/settings_changes.rs @@ -8,8 +8,7 @@ use rayon::slice::ParallelSlice; use super::document_changes::IndexingContext; use crate::documents::PrimaryKey; use crate::progress::AtomicDocumentStep; -use crate::update::new::document_change::DatabaseDocument; -use crate::update::new::indexer::document_changes::DocumentContext; +use crate::update::new::document::{DocumentContext, DocumentIdentifiers}; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; @@ -23,16 +22,16 @@ pub trait SettingsChangeExtractor<'extractor>: Sync { fn process<'doc>( &'doc self, - documents: impl Iterator>>, + documents: impl Iterator>>, context: &'doc DocumentContext, ) -> Result<()>; } -pub struct DatabaseDocuments<'indexer> { +pub struct DocumentsIndentifiers<'indexer> { documents: &'indexer [DocumentId], primary_key: PrimaryKey<'indexer>, } -impl<'indexer> DatabaseDocuments<'indexer> { +impl<'indexer> DocumentsIndentifiers<'indexer> { pub fn new(documents: &'indexer [DocumentId], primary_key: PrimaryKey<'indexer>) -> Self { Self { documents, primary_key } } @@ -48,7 +47,7 @@ impl<'indexer> DatabaseDocuments<'indexer> { &'doc self, context: &'doc DocumentContext, docid: &'doc DocumentId, - ) -> 
Result>> { + ) -> Result>> { let current = context.index.document(&context.rtxn, *docid)?; let external_document_id = self.primary_key.extract_docid_from_db( @@ -59,7 +58,7 @@ impl<'indexer> DatabaseDocuments<'indexer> { let external_document_id = external_document_id.to_bump(&context.doc_alloc); - Ok(Some(DatabaseDocument::create(*docid, external_document_id))) + Ok(Some(DocumentIdentifiers::create(*docid, external_document_id))) } fn len(&self) -> usize { @@ -78,7 +77,7 @@ pub fn settings_change_extract< EX: SettingsChangeExtractor<'extractor>, MSP: Fn() -> bool + Sync, >( - documents: &'indexer DatabaseDocuments<'indexer>, + documents: &'indexer DocumentsIndentifiers<'indexer>, extractor: &EX, IndexingContext { index, diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index b394757d1..daffe42ed 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -5,15 +5,14 @@ use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use roaring::RoaringBitmap; use rustc_hash::FxBuildHasher; -use super::document_changes::DocumentContext; use super::DocumentChanges; use crate::documents::Error::InvalidDocumentFormat; use crate::documents::PrimaryKey; use crate::error::{FieldIdMapMissingEntry, InternalError}; -use crate::update::new::document::Versions; +use crate::update::new::document::{DocumentContext, Versions}; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::thread_local::MostlySend; -use crate::update::new::{DatabaseDocument, DocumentChange, KvReaderFieldId, Update}; +use crate::update::new::{DocumentChange, DocumentIdentifiers, KvReaderFieldId, Update}; use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; pub struct UpdateByFunction { @@ -129,7 +128,7 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { match scope.remove::("doc") { // If the "doc" variable has been set to (), we effectively delete the document. 
Some(doc) if doc.is_unit() => Ok(Some(DocumentChange::Deletion( - DatabaseDocument::create(docid, doc_alloc.alloc_str(&document_id)), + DocumentIdentifiers::create(docid, doc_alloc.alloc_str(&document_id)), ))), None => unreachable!("missing doc variable from the Rhai scope"), Some(new_document) => match new_document.try_cast() { diff --git a/crates/milli/src/update/new/mod.rs b/crates/milli/src/update/new/mod.rs index e3adc5bde..ffe27ffda 100644 --- a/crates/milli/src/update/new/mod.rs +++ b/crates/milli/src/update/new/mod.rs @@ -1,4 +1,5 @@ -pub use document_change::{DatabaseDocument, DocumentChange, Insertion, Update}; +pub use document::DocumentIdentifiers; +pub use document_change::{DocumentChange, Insertion, Update}; pub use indexer::ChannelCongestion; pub use merger::{ merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases, FacetFieldIdsDelta, From e414284335830ea3ec36d1344e57acdb57011581 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 30 Jun 2025 14:25:28 +0200 Subject: [PATCH 067/150] Clippy too many arguments --- crates/milli/src/update/new/extract/vectors/mod.rs | 1 + crates/milli/src/update/new/indexer/mod.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 6d5052ac8..4d308018a 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -310,6 +310,7 @@ pub struct SettingsChangeEmbeddingExtractor<'a, 'b> { } impl<'a, 'b> SettingsChangeEmbeddingExtractor<'a, 'b> { + #[allow(clippy::too_many_arguments)] pub fn new( embedders: &'a EmbeddingConfigs, old_embedders: &'a EmbeddingConfigs, diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 7d1ad6df5..0efef48fd 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -205,6 +205,7 @@ where Ok(congestion) } +#[allow(clippy::too_many_arguments)] pub fn reindex<'indexer, 'index, MSP, SD>( wtxn: &mut RwTxn<'index>, index: &'index Index, From 1b54c866e199c16f07f7f8882fd66849ec55d4d3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 30 Jun 2025 14:47:39 +0200 Subject: [PATCH 068/150] Link experimental feature discussion --- crates/meilisearch/src/option.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 9ebf502d9..9658352c8 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -753,6 +753,7 @@ pub struct IndexerOpts { pub skip_index_budget: bool, /// Experimental no edition 2024 for settings feature. For more information, + /// see: /// /// Enables the experimental no edition 2024 for settings feature. 
    #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS)]

From 85037352b95d947151692307c1f00371fed134a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 30 Jun 2025 18:31:32 +0200
Subject: [PATCH 069/150] Fix most of the easy issues

---
 crates/index-scheduler/src/processing.rs      |  4 ++--
 .../src/scheduler/create_batch.rs             |  6 ++---
 .../src/scheduler/process_export.rs           |  5 ++--
 crates/index-scheduler/src/utils.rs           |  2 +-
 crates/meilisearch-types/src/task_view.rs     | 23 ++++++++++++++++++-
 crates/meilisearch/src/routes/export.rs       | 15 ++++++------
 6 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs
index 5d4ac11c3..631719f73 100644
--- a/crates/index-scheduler/src/processing.rs
+++ b/crates/index-scheduler/src/processing.rs
@@ -178,8 +178,8 @@ make_enum_progress! {
 make_enum_progress! {
     pub enum Export {
         EnsuringCorrectnessOfTheTarget,
-        ExportTheSettings,
-        ExportTheDocuments,
+        ExportingTheSettings,
+        ExportingTheDocuments,
     }
 }

diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs
index 7a6fa4a9b..b08d27d48 100644
--- a/crates/index-scheduler/src/scheduler/create_batch.rs
+++ b/crates/index-scheduler/src/scheduler/create_batch.rs
@@ -510,9 +510,9 @@ impl IndexScheduler {
         // 3. we batch the export.
         let to_export = self.queue.tasks.get_kind(rtxn, Kind::Export)? & enqueued;
         if !to_export.is_empty() {
-            let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_export)?;
-            current_batch.processing(&mut tasks);
-            let task = tasks.pop().expect("There must be only one export task");
+            let task_id = to_export.iter().next().expect("There must be only one export task");
+            let mut task = self.queue.tasks.get_task(rtxn, task_id)?.unwrap();
+            current_batch.processing([&mut task]);
             current_batch.reason(BatchStopReason::TaskKindCannotBeBatched { kind: Kind::Export });
             return Ok(Some((Batch::Export { task }, current_batch)));
         }
diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs
index 57f79c83f..b81ff0b96 100644
--- a/crates/index-scheduler/src/scheduler/process_export.rs
+++ b/crates/index-scheduler/src/scheduler/process_export.rs
@@ -86,10 +86,11 @@ impl IndexScheduler {
             }
             // Retry logic for sending settings
             let url = format!("{base_url}/indexes/{uid}/settings");
+            let bearer = api_key.map(|api_key| format!("Bearer {api_key}"));
             retry(&must_stop_processing, || {
                 let mut request = agent.patch(&url);
-                if let Some(api_key) = api_key {
-                    request = request.set("Authorization", &format!("Bearer {api_key}"));
+                if let Some(bearer) = bearer.as_ref() {
+                    request = request.set("Authorization", bearer);
                 }
                 request.send_json(settings.clone()).map_err(into_backoff_error)
             })?;
diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs
index 594023145..2cfe63bff 100644
--- a/crates/index-scheduler/src/utils.rs
+++ b/crates/index-scheduler/src/utils.rs
@@ -273,7 +273,7 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) {
         K::TaskCancelation { .. }
         | K::TaskDeletion { .. }
         | K::DumpCreation { .. }
-        | K::Export { .. } // TODO I have patterns, not index uids
+        | K::Export { .. }
        | K::UpgradeDatabase { ..
} | K::SnapshotCreation => (), }; diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 1dbd5637b..7521137c0 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -371,7 +371,10 @@ impl From
for DetailsView { } Details::Export { url, api_key, payload_size, indexes } => DetailsView { url: Some(url), - api_key, + api_key: api_key.map(|mut api_key| { + hide_secret(&mut api_key); + api_key + }), payload_size: payload_size .map(|ps| ps.get_appropriate_unit(UnitType::Both).to_string()), indexes: Some( @@ -390,3 +393,21 @@ impl From
for DetailsView { } } } + +// We definitely need to factorize the code to hide the secret key +fn hide_secret(secret: &mut String) { + match secret.len() { + x if x < 10 => { + secret.replace_range(.., "XXX..."); + } + x if x < 20 => { + secret.replace_range(2.., "XXXX..."); + } + x if x < 30 => { + secret.replace_range(3.., "XXXXX..."); + } + _x => { + secret.replace_range(5.., "XXXXXX..."); + } + } +} diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 21a77ae32..1df2d271e 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -42,17 +42,18 @@ pub fn configure(cfg: &mut web::ServiceConfig) { } #[utoipa::path( - get, + post, path = "", tag = "Export", security(("Bearer" = ["export", "*"])), responses( (status = OK, description = "Known nodes are returned", body = Export, content_type = "application/json", example = json!( - { - "indexes": ["movie", "steam-*"], - "skip_embeddings": true, - "apiKey": "meilisearch-api-key" - })), + { + "taskUid": 1, + "status": "enqueued", + "type": "export", + "enqueuedAt": "2021-08-11T09:25:53.000000Z" + })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { "message": "The Authorization header is missing. It must use the bearer authorization method.", @@ -126,7 +127,7 @@ pub struct Export { #[serde(default)] #[deserr(default, error = DeserrJsonError)] pub payload_size: Option, - #[schema(value_type = Option>, example = json!(["movies", "steam-*"]))] + #[schema(value_type = Option>, example = json!({ "*": { "filter": null } }))] #[deserr(default)] #[serde(default)] pub indexes: BTreeMap, From ad03c86c4493cb1dec38897983bd0a4d6ec21631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 30 Jun 2025 18:46:47 +0200 Subject: [PATCH 070/150] Display an accurate number of uploaded documents --- .../src/scheduler/process_batch.rs | 10 +++++---- .../src/scheduler/process_export.rs | 21 +++++++++++++------ crates/meilisearch-types/src/tasks.rs | 8 +++---- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index e56b8e13a..090ff844d 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -377,9 +377,8 @@ impl IndexScheduler { ) })); - match ret { - // TODO return the matched and exported documents - Ok(Ok(())) => (), + let stats = match ret { + Ok(Ok(stats)) => stats, Ok(Err(Error::AbortedTask)) => return Err(Error::AbortedTask), Ok(Err(e)) => return Err(Error::Export(Box::new(e))), Err(e) => { @@ -394,9 +393,12 @@ impl IndexScheduler { msg.to_string(), )))); } - } + }; task.status = Status::Succeeded; + if let Some(Details::Export { indexes, .. 
}) = task.details.as_mut() { + *indexes = stats; + } Ok((vec![task], ProcessBatchInfo::default())) } Batch::UpgradeDatabase { mut tasks } => { diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index b81ff0b96..bf2917b73 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -14,7 +14,7 @@ use meilisearch_types::milli::update::{request_threads, Setting}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{self, obkv_to_json, Filter, InternalError}; use meilisearch_types::settings::{self, SecretPolicy}; -use meilisearch_types::tasks::ExportIndexSettings; +use meilisearch_types::tasks::{DetailsExportIndexSettings, ExportIndexSettings}; use serde::Deserialize; use ureq::{json, Response}; @@ -30,7 +30,7 @@ impl IndexScheduler { payload_size: Option<&Byte>, indexes: &BTreeMap, progress: Progress, - ) -> Result<()> { + ) -> Result> { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; @@ -41,13 +41,14 @@ impl IndexScheduler { indexes .iter() .find(|(pattern, _)| pattern.matches_str(&uid)) - .map(|(_pattern, settings)| (uid, settings)) + .map(|(pattern, settings)| (pattern, uid, settings)) }) .collect(); + let mut output = BTreeMap::new(); let agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); let must_stop_processing = self.scheduler.must_stop_processing.clone(); - for (i, (uid, settings)) in indexes.iter().enumerate() { + for (i, (pattern, uid, export_settings)) in indexes.iter().enumerate() { if must_stop_processing.get() { return Err(Error::AbortedTask); } @@ -58,7 +59,7 @@ impl IndexScheduler { indexes.len() as u32, )); - let ExportIndexSettings { filter } = settings; + let ExportIndexSettings { filter } = export_settings; let index = self.index(uid)?; let index_rtxn = index.read_txn()?; @@ -125,6 +126,14 @@ impl IndexScheduler { let (step, progress_step) = AtomicDocumentStep::new(total_documents); progress.update_progress(progress_step); + output.insert( + (*pattern).clone(), + DetailsExportIndexSettings { + settings: (*export_settings).clone(), + matched_documents: Some(total_documents as u64), + }, + ); + let limit = payload_size.map(|ps| ps.as_u64() as usize).unwrap_or(50 * 1024 * 1024); // defaults to 50 MiB let documents_url = format!("{base_url}/indexes/{uid}/documents"); @@ -265,7 +274,7 @@ impl IndexScheduler { step.store(total_documents, atomic::Ordering::Relaxed); } - Ok(()) + Ok(output) } } diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index a6ed593db..cdbf6d3aa 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -707,16 +707,14 @@ pub enum Details { #[schema(rename_all = "camelCase")] pub struct DetailsExportIndexSettings { #[serde(flatten)] - settings: ExportIndexSettings, + pub settings: ExportIndexSettings, #[serde(skip_serializing_if = "Option::is_none")] - matched_documents: Option, - #[serde(skip_serializing_if = "Option::is_none")] - exported_documents: Option, + pub matched_documents: Option, } impl From for DetailsExportIndexSettings { fn from(settings: ExportIndexSettings) -> Self { - DetailsExportIndexSettings { settings, matched_documents: None, exported_documents: None } + DetailsExportIndexSettings { settings, matched_documents: None } } } From f4bb6cbca894e690e9789a7945cbf1f4f2d5d800 Mon Sep 17 
00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 30 Jun 2025 18:59:16 +0200
Subject: [PATCH 071/150] Better behavior when `indexes` is null

---
 crates/meilisearch-types/src/tasks.rs | 2 +-
 crates/meilisearch/src/routes/export.rs | 14 ++++++++------
 crates/meilisearch/src/routes/export_analytics.rs | 7 ++++---
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs
index cdbf6d3aa..0618fa333 100644
--- a/crates/meilisearch-types/src/tasks.rs
+++ b/crates/meilisearch-types/src/tasks.rs
@@ -210,7 +210,7 @@ impl KindWithContent {
             | SnapshotCreation
             | TaskCancelation { .. }
             | TaskDeletion { .. }
-            | Export { .. } // TODO Should I resolve the index names?
+            | Export { .. }
             | UpgradeDatabase { .. } => vec![],
             DocumentAdditionOrUpdate { index_uid, .. }
             | DocumentEdition { index_uid, .. }
diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs
index 1df2d271e..31f8812c7 100644
--- a/crates/meilisearch/src/routes/export.rs
+++ b/crates/meilisearch/src/routes/export.rs
@@ -81,15 +81,17 @@ async fn export(

     let Export { url, api_key, payload_size, indexes } = export;

-    let indexes = if indexes.is_empty() {
-        BTreeMap::from([(IndexUidPattern::new_unchecked("*"), DbExportIndexSettings::default())])
-    } else {
-        indexes
+    let indexes = match indexes {
+        Some(indexes) => indexes
             .into_iter()
             .map(|(pattern, ExportIndexSettings { filter })| {
                 (pattern, DbExportIndexSettings { filter })
             })
-            .collect()
+            .collect(),
+        None => BTreeMap::from([(
+            IndexUidPattern::new_unchecked("*"),
+            DbExportIndexSettings::default(),
+        )]),
     };

     let task = KindWithContent::Export {
@@ -130,7 +132,7 @@ pub struct Export {
     #[schema(value_type = Option>, example = json!({ "*": { "filter": null } }))]
     #[deserr(default)]
     #[serde(default)]
-    pub indexes: BTreeMap,
+    pub indexes: Option>,
 }

 /// A wrapper around the `Byte` type that implements `Deserr`.
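The `match` above is the behavior change in this patch: `indexes` becomes optional, and omitting it falls back to a single `*` pattern that matches every index. A minimal sketch of that defaulting logic, with plain `String` patterns and a pared-down settings struct standing in for the crate's `IndexUidPattern` and `DbExportIndexSettings` (both stand-ins are assumptions, for illustration only):

use std::collections::BTreeMap;

// Simplified stand-in for the crate's per-index export settings.
#[derive(Debug, Default, Clone)]
struct ExportIndexSettings {
    filter: Option<String>,
}

fn resolve_index_patterns(
    indexes: Option<BTreeMap<String, ExportIndexSettings>>,
) -> BTreeMap<String, ExportIndexSettings> {
    match indexes {
        // Explicit patterns are forwarded as-is.
        Some(indexes) => indexes,
        // Omitting `indexes` entirely exports everything under the `*` pattern.
        None => BTreeMap::from([("*".to_string(), ExportIndexSettings::default())]),
    }
}

fn main() {
    let defaulted = resolve_index_patterns(None);
    assert!(defaulted.contains_key("*"));

    let explicit = resolve_index_patterns(Some(BTreeMap::from([(
        "movies".to_string(),
        ExportIndexSettings { filter: Some("genre = horror".into()) },
    )])));
    assert_eq!(explicit.len(), 1);
    println!("{defaulted:?}\n{explicit:?}");
}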
diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs index 44dba2c9b..7ac713e9b 100644 --- a/crates/meilisearch/src/routes/export_analytics.rs +++ b/crates/meilisearch/src/routes/export_analytics.rs @@ -15,9 +15,10 @@ impl ExportAnalytics { let Export { url: _, api_key, payload_size, indexes } = export; let has_api_key = api_key.is_some(); - let index_patterns_count = indexes.len(); - let patterns_with_filter_count = - indexes.values().filter(|settings| settings.filter.is_some()).count(); + let index_patterns_count = indexes.as_ref().map_or(0, |indexes| indexes.len()); + let patterns_with_filter_count = indexes.as_ref().map_or(0, |indexes| { + indexes.values().filter(|settings| settings.filter.is_some()).count() + }); let payload_sizes = if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size { vec![byte_size.as_u64()] From 0d85f8fcee106b3f18fd7389c41bf91d18b08837 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Jun 2025 16:34:31 +0200 Subject: [PATCH 072/150] Make sure to recover from missing update file --- crates/index-scheduler/src/scheduler/create_batch.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index e3763881b..a5bc1ec6f 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -1,4 +1,5 @@ use std::fmt; +use std::io::ErrorKind; use meilisearch_types::heed::RoTxn; use meilisearch_types::milli::update::IndexDocumentsMethod; @@ -577,7 +578,11 @@ impl IndexScheduler { .and_then(|task| task.ok_or(Error::CorruptedTaskQueue))?; if let Some(uuid) = task.content_uuid() { - let content_size = self.queue.file_store.compute_size(uuid)?; + let content_size = match self.queue.file_store.compute_size(uuid) { + Ok(content_size) => content_size, + Err(file_store::Error::IoError(err)) if err.kind() == ErrorKind::NotFound => 0, + Err(otherwise) => return Err(otherwise.into()), + }; total_size = total_size.saturating_add(content_size); } From efd5fd96ccc63a886005b0d42e79cd9a5aaa13f9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 11:02:42 +0200 Subject: [PATCH 073/150] Add the overrideSettings parameter --- .../src/scheduler/process_export.rs | 83 +++++++++++++------ crates/meilisearch-types/src/error.rs | 1 + crates/meilisearch-types/src/tasks.rs | 1 + crates/meilisearch/src/routes/export.rs | 8 +- 4 files changed, 65 insertions(+), 28 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index bf2917b73..19b2bf743 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -59,42 +59,73 @@ impl IndexScheduler { indexes.len() as u32, )); - let ExportIndexSettings { filter } = export_settings; + let ExportIndexSettings { filter, override_settings } = export_settings; let index = self.index(uid)?; let index_rtxn = index.read_txn()?; - // Send the primary key + let url = format!("{base_url}/indexes/{uid}"); + + // First, check if the index already exists + let response = retry(&must_stop_processing, || { + let mut request = agent.get(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + + request.send_string("").map_err(into_backoff_error) + })?; + let already_existed = 
response.status() == 200; + let primary_key = index .primary_key(&index_rtxn) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - let url = format!("{base_url}/indexes"); - retry(&must_stop_processing, || { - let mut request = agent.post(&url); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); - } - let index_param = json!({ "uid": uid, "primaryKey": primary_key }); - request.send_json(&index_param).map_err(into_backoff_error) - })?; + // Create the index + if !already_existed { + let url = format!("{base_url}/indexes"); + retry(&must_stop_processing, || { + let mut request = agent.post(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + let index_param = json!({ "uid": uid, "primaryKey": primary_key }); + request.send_json(&index_param).map_err(into_backoff_error) + })?; + } + + // Patch the index primary key + if already_existed && *override_settings { + let url = format!("{base_url}/indexes/{uid}"); + retry(&must_stop_processing, || { + let mut request = agent.patch(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + let index_param = json!({ "primaryKey": primary_key }); + request.send_json(&index_param).map_err(into_backoff_error) + })?; + } // Send the index settings - let mut settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - // Remove the experimental chat setting if not enabled - if self.features().check_chat_completions("exporting chat settings").is_err() { - settings.chat = Setting::NotSet; - } - // Retry logic for sending settings - let url = format!("{base_url}/indexes/{uid}/settings"); - let bearer = api_key.map(|api_key| format!("Bearer {api_key}")); - retry(&must_stop_processing, || { - let mut request = agent.patch(&url); - if let Some(bearer) = bearer.as_ref() { - request = request.set("Authorization", bearer); + if !already_existed || *override_settings { + let mut settings = + settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // Remove the experimental chat setting if not enabled + if self.features().check_chat_completions("exporting chat settings").is_err() { + settings.chat = Setting::NotSet; } - request.send_json(settings.clone()).map_err(into_backoff_error) - })?; + // Retry logic for sending settings + let url = format!("{base_url}/indexes/{uid}/settings"); + let bearer = api_key.map(|api_key| format!("Bearer {api_key}")); + retry(&must_stop_processing, || { + let mut request = agent.patch(&url); + if let Some(bearer) = bearer.as_ref() { + request = request.set("Authorization", bearer); + } + request.send_json(settings.clone()).map_err(into_backoff_error) + })?; + } let filter = filter .as_ref() diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 1c2840084..30f6868f6 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -395,6 +395,7 @@ InvalidExportApiKey , InvalidRequest , BAD_REQU InvalidExportPayloadSize , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexOverrideSettings , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions 
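Taken together, the `process_export.rs` hunks above reduce to a small decision table: create the target index only when it is missing, and touch the primary key and settings of an existing target only when `overrideSettings` is set. A compact sketch of that control flow (the `println!` stubs are placeholders for the retried HTTP calls, which are elided here):

// Sketch of the create-or-update decision table; the real code issues
// GET/POST/PATCH requests against the remote instead of printing.
fn export_index(index_exists: bool, override_settings: bool) {
    if !index_exists {
        // POST /indexes: create the target with the source's primary key.
        println!("create index");
    } else if override_settings {
        // PATCH the existing target to realign its primary key.
        println!("patch primary key");
    }
    // Settings are pushed on fresh targets, or when overriding was requested.
    if !index_exists || override_settings {
        println!("send settings");
    }
    println!("send documents");
}

fn main() {
    for (exists, overriding) in [(false, false), (true, false), (true, true)] {
        println!("-- index_exists: {exists}, overrideSettings: {overriding}");
        export_index(exists, overriding);
    }
}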
UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ;
UnimplementedNonStreamingChatCompletions , InvalidRequest , NOT_IMPLEMENTED ;
diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs
index 0618fa333..99b04f1e3 100644
--- a/crates/meilisearch-types/src/tasks.rs
+++ b/crates/meilisearch-types/src/tasks.rs
@@ -178,6 +178,7 @@ pub struct IndexSwap {
 #[serde(rename_all = "camelCase")]
 pub struct ExportIndexSettings {
     pub filter: Option,
+    pub override_settings: bool,
 }

 impl KindWithContent {
diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs
index 31f8812c7..172a162c6 100644
--- a/crates/meilisearch/src/routes/export.rs
+++ b/crates/meilisearch/src/routes/export.rs
@@ -84,8 +84,8 @@ async fn export(
     let indexes = match indexes {
         Some(indexes) => indexes
             .into_iter()
-            .map(|(pattern, ExportIndexSettings { filter })| {
-                (pattern, DbExportIndexSettings { filter })
+            .map(|(pattern, ExportIndexSettings { filter, override_settings })| {
+                (pattern, DbExportIndexSettings { filter, override_settings })
             })
             .collect(),
         None => BTreeMap::from([(
@@ -179,4 +179,8 @@ pub struct ExportIndexSettings {
     #[serde(default)]
     #[deserr(default, error = DeserrJsonError)]
     pub filter: Option,
+    #[schema(value_type = Option, example = json!(true))]
+    #[serde(default)]
+    #[deserr(default, error = DeserrJsonError)]
+    pub override_settings: bool,
 }
From 9cfbef478eb80258b1698c75abe80b5a0f92b85b Mon Sep 17 00:00:00 2001
From: Mubelotix
Date: Tue, 1 Jul 2025 11:04:59 +0200
Subject: [PATCH 074/150] Add override settings to analytics

---
 crates/meilisearch/src/routes/export_analytics.rs | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs
index 7ac713e9b..b66a5133b 100644
--- a/crates/meilisearch/src/routes/export_analytics.rs
+++ b/crates/meilisearch/src/routes/export_analytics.rs
@@ -7,6 +7,7 @@ pub struct ExportAnalytics {
     has_api_key: bool,
     sum_index_patterns: usize,
     sum_patterns_with_filter: usize,
+    sum_patterns_with_override_settings: usize,
    payload_sizes: Vec,
 }

@@ -19,6 +20,9 @@ impl ExportAnalytics {
         let patterns_with_filter_count = indexes.as_ref().map_or(0, |indexes| {
             indexes.values().filter(|settings| settings.filter.is_some()).count()
         });
+        let patterns_with_override_settings_count = indexes.as_ref().map_or(0, |indexes| {
+            indexes.values().filter(|settings| settings.override_settings).count()
+        });
         let payload_sizes =
             if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size {
                 vec![byte_size.as_u64()]
@@ -31,6 +35,7 @@ impl ExportAnalytics {
             has_api_key,
             sum_index_patterns: index_patterns_count,
             sum_patterns_with_filter: patterns_with_filter_count,
+            sum_patterns_with_override_settings: patterns_with_override_settings_count,
             payload_sizes,
         }
     }
@@ -46,6 +51,7 @@ impl Aggregate for ExportAnalytics {
         self.has_api_key |= other.has_api_key;
         self.sum_index_patterns += other.sum_index_patterns;
         self.sum_patterns_with_filter += other.sum_patterns_with_filter;
+        self.sum_patterns_with_override_settings += other.sum_patterns_with_override_settings;
         self.payload_sizes.extend(other.payload_sizes);
         self
     }
@@ -69,11 +75,18 @@ impl Aggregate for ExportAnalytics {
             Some(self.sum_patterns_with_filter as f64 / self.total_received as f64)
         };

+        let avg_patterns_with_override_settings = if self.total_received == 0 {
+            None
+        } else {
+            Some(self.sum_patterns_with_override_settings as f64 /
self.total_received as f64) + }; + serde_json::json!({ "total_received": self.total_received, "has_api_key": self.has_api_key, "avg_index_patterns": avg_index_patterns, "avg_patterns_with_filter": avg_patterns_with_filter, + "avg_patterns_with_override_settings": avg_patterns_with_override_settings, "avg_payload_size": avg_payload_size, }) } From 259fc067d33ff78593ae3b842ea2aabd169f7ac5 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 11:14:59 +0200 Subject: [PATCH 075/150] Count exported documents by index name, not pattern --- .../src/scheduler/process_export.rs | 9 ++++----- crates/meilisearch-types/src/tasks.rs | 14 +++++++------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 19b2bf743..d1f5616b7 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -30,7 +30,7 @@ impl IndexScheduler { payload_size: Option<&Byte>, indexes: &BTreeMap, progress: Progress, - ) -> Result> { + ) -> Result> { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; @@ -48,7 +48,7 @@ impl IndexScheduler { let mut output = BTreeMap::new(); let agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); let must_stop_processing = self.scheduler.must_stop_processing.clone(); - for (i, (pattern, uid, export_settings)) in indexes.iter().enumerate() { + for (i, (_pattern, uid, export_settings)) in indexes.iter().enumerate() { if must_stop_processing.get() { return Err(Error::AbortedTask); } @@ -63,9 +63,8 @@ impl IndexScheduler { let index = self.index(uid)?; let index_rtxn = index.read_txn()?; - let url = format!("{base_url}/indexes/{uid}"); - // First, check if the index already exists + let url = format!("{base_url}/indexes/{uid}"); let response = retry(&must_stop_processing, || { let mut request = agent.get(&url); if let Some(api_key) = api_key { @@ -158,7 +157,7 @@ impl IndexScheduler { progress.update_progress(progress_step); output.insert( - (*pattern).clone(), + uid.clone(), DetailsExportIndexSettings { settings: (*export_settings).clone(), matched_documents: Some(total_documents as u64), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 99b04f1e3..423cf539e 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -289,12 +289,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: BTreeMap::new(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -363,12 +363,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. 
} => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: BTreeMap::new(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -419,12 +419,12 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: BTreeMap::new(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -696,7 +696,7 @@ pub enum Details { url: String, api_key: Option, payload_size: Option, - indexes: BTreeMap, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), From d439a3cb9d05f6b69a41a7a1fd4370c0cd1ce128 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:39:24 +0200 Subject: [PATCH 076/150] Fix progress names --- crates/index-scheduler/src/processing.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 631719f73..2aa7cf859 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -178,8 +178,8 @@ make_enum_progress! { make_enum_progress! { pub enum Export { EnsuringCorrectnessOfTheTarget, - ExporingTheSettings, - ExporingTheDocuments, + ExportingTheSettings, + ExportingTheDocuments, } } From 074d509d9280cdc277b80950dec111737126c375 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:39:52 +0200 Subject: [PATCH 077/150] Fix expect message --- crates/index-scheduler/src/scheduler/create_batch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index b08d27d48..693275c32 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -510,7 +510,7 @@ impl IndexScheduler { // 3. we batch the export. let to_export = self.queue.tasks.get_kind(rtxn, Kind::Export)? 
& enqueued; if !to_export.is_empty() { - let task_id = to_export.iter().next().expect("There must be only one export task"); + let task_id = to_export.iter().next().expect("There must be at least one export task"); let mut task = self.queue.tasks.get_task(rtxn, task_id)?.unwrap(); current_batch.processing([&mut task]); current_batch.reason(BatchStopReason::TaskKindCannotBeBatched { kind: Kind::Export }); From 9dac91efe056d17eeabe18aaafdd1da401b44416 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:40:39 +0200 Subject: [PATCH 078/150] Fix utoipa response --- crates/meilisearch/src/routes/export.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 172a162c6..97356f7eb 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -47,7 +47,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { tag = "Export", security(("Bearer" = ["export", "*"])), responses( - (status = OK, description = "Known nodes are returned", body = Export, content_type = "application/json", example = json!( + (status = 202, description = "Export successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( { "taskUid": 1, "status": "enqueued", From c078efd730ffec4a4f2d9670437287d080269ca9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:40:59 +0200 Subject: [PATCH 079/150] Remove experimental todo --- crates/meilisearch/src/routes/export.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 97356f7eb..a4b6720d1 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -71,9 +71,6 @@ async fn export( opt: web::Data, analytics: Data, ) -> Result { - // TODO make it experimental? 
- // index_scheduler.features().check_network("Using the /network route")?; - let export = export.into_inner(); debug!(returns = ?export, "Trigger export"); From 25c19a306b1fa4967b013066c693012293347272 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:42:44 +0200 Subject: [PATCH 080/150] Rename variable Co-authored-by: Kero --- crates/index-scheduler/src/scheduler/process_export.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index d1f5616b7..b5134deb9 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -71,16 +71,16 @@ impl IndexScheduler { request = request.set("Authorization", &format!("Bearer {api_key}")); } - request.send_string("").map_err(into_backoff_error) + request.send_bytes(Default::default()).map_err(into_backoff_error) })?; - let already_existed = response.status() == 200; + let index_exists = response.status() == 200; let primary_key = index .primary_key(&index_rtxn) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; // Create the index - if !already_existed { + if !index_exists { let url = format!("{base_url}/indexes"); retry(&must_stop_processing, || { let mut request = agent.post(&url); @@ -93,7 +93,7 @@ impl IndexScheduler { } // Patch the index primary key - if already_existed && *override_settings { + if index_exists && *override_settings { let url = format!("{base_url}/indexes/{uid}"); retry(&must_stop_processing, || { let mut request = agent.patch(&url); @@ -106,7 +106,7 @@ impl IndexScheduler { } // Send the index settings - if !already_existed || *override_settings { + if !index_exists || *override_settings { let mut settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; From 37a692f942253c128980e31e6d3be75b94a12a0e Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:47:43 +0200 Subject: [PATCH 081/150] Keep `IndexUidPattern` --- .../src/scheduler/process_export.rs | 4 ++-- crates/meilisearch-types/src/tasks.rs | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index b5134deb9..eaad7aa34 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -30,7 +30,7 @@ impl IndexScheduler { payload_size: Option<&Byte>, indexes: &BTreeMap, progress: Progress, - ) -> Result> { + ) -> Result> { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; @@ -157,7 +157,7 @@ impl IndexScheduler { progress.update_progress(progress_step); output.insert( - uid.clone(), + IndexUidPattern::new_unchecked(uid.clone()), DetailsExportIndexSettings { settings: (*export_settings).clone(), matched_documents: Some(total_documents as u64), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 423cf539e..99b04f1e3 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -289,12 +289,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. 
} => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { + KindWithContent::Export { url, api_key, payload_size, indexes } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: BTreeMap::new(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -363,12 +363,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { + KindWithContent::Export { url, api_key, payload_size, indexes } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: BTreeMap::new(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -419,12 +419,12 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { + KindWithContent::Export { url, api_key, payload_size, indexes } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: BTreeMap::new(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -696,7 +696,7 @@ pub enum Details { url: String, api_key: Option, payload_size: Option, - indexes: BTreeMap, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), From b7bebe9bbb33b4ba87408362068f732281f609ea Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 15:03:04 +0200 Subject: [PATCH 082/150] Fix export when index already exists --- crates/index-scheduler/src/scheduler/process_export.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index eaad7aa34..676481319 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -72,8 +72,12 @@ impl IndexScheduler { } request.send_bytes(Default::default()).map_err(into_backoff_error) - })?; - let index_exists = response.status() == 200; + }); + let index_exists = match response { + Ok(response) => response.status() == 200, + Err(Error::FromRemoteWhenExporting { code, .. }) if code == "index_not_found" => false, + Err(e) => return Err(e), + }; let primary_key = index .primary_key(&index_rtxn) From 9211e94c4f019a890175a109b1ce78a43c10bb5f Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 15:03:20 +0200 Subject: [PATCH 083/150] Format --- crates/index-scheduler/src/scheduler/process_export.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 676481319..30721065e 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -75,7 +75,9 @@ impl IndexScheduler { }); let index_exists = match response { Ok(response) => response.status() == 200, - Err(Error::FromRemoteWhenExporting { code, .. }) if code == "index_not_found" => false, + Err(Error::FromRemoteWhenExporting { code, .. 
}) if code == "index_not_found" => { + false + } Err(e) => return Err(e), }; From d2776efb11f85f1df9501eb6079d98aa4013ba29 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 15:14:56 +0200 Subject: [PATCH 084/150] Fix flaky last_error test --- crates/meilisearch/tests/vector/rest.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 6e781e525..87296c36a 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -2183,6 +2183,7 @@ async fn last_error_stats() { snapshot!(json_string!(response["results"][0], { ".progress" => "[ignored]", ".stats.embedderRequests.total" => "[ignored]", + ".stats.embedderRequests.failed" => "[ignored]", ".startedAt" => "[ignored]" }), @r#" { @@ -2205,7 +2206,7 @@ async fn last_error_stats() { }, "embedderRequests": { "total": "[ignored]", - "failed": 5, + "failed": "[ignored]", "lastError": "runtime error: received internal error HTTP 500 from embedding server\n - server replied with `Service Unavailable`" } }, From c2d5b20a424a2b34fa19a14fc7654464c2c37e95 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 1 Jul 2025 17:23:08 +0000 Subject: [PATCH 085/150] Bump Swatinem/rust-cache from 2.7.8 to 2.8.0 Bumps [Swatinem/rust-cache](https://github.com/swatinem/rust-cache) from 2.7.8 to 2.8.0. - [Release notes](https://github.com/swatinem/rust-cache/releases) - [Changelog](https://github.com/Swatinem/rust-cache/blob/master/CHANGELOG.md) - [Commits](https://github.com/swatinem/rust-cache/compare/v2.7.8...v2.8.0) --- updated-dependencies: - dependency-name: Swatinem/rust-cache dependency-version: 2.8.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/test-suite.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 6cf8bfa0f..2924a07bc 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -29,7 +29,7 @@ jobs: - name: Setup test with Rust stable uses: dtolnay/rust-toolchain@1.85 - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.8 + uses: Swatinem/rust-cache@v2.8.0 - name: Run cargo check without any default features uses: actions-rs/cargo@v1 with: @@ -51,7 +51,7 @@ jobs: steps: - uses: actions/checkout@v3 - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.8 + uses: Swatinem/rust-cache@v2.8.0 - uses: dtolnay/rust-toolchain@1.85 - name: Run cargo check without any default features uses: actions-rs/cargo@v1 @@ -155,7 +155,7 @@ jobs: apt-get install build-essential -y - uses: dtolnay/rust-toolchain@1.85 - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.8 + uses: Swatinem/rust-cache@v2.8.0 - name: Run tests in debug uses: actions-rs/cargo@v1 with: @@ -172,7 +172,7 @@ jobs: profile: minimal components: clippy - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.8 + uses: Swatinem/rust-cache@v2.8.0 - name: Run cargo clippy uses: actions-rs/cargo@v1 with: @@ -191,7 +191,7 @@ jobs: override: true components: rustfmt - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.8 + uses: Swatinem/rust-cache@v2.8.0 - name: Run cargo fmt # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. 
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate From 879cf850373b8e4defddbcd81b10f8d7d7bb7542 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 1 Jul 2025 17:23:13 +0000 Subject: [PATCH 086/150] Bump svenstaro/upload-release-action from 2.7.0 to 2.11.1 Bumps [svenstaro/upload-release-action](https://github.com/svenstaro/upload-release-action) from 2.7.0 to 2.11.1. - [Release notes](https://github.com/svenstaro/upload-release-action/releases) - [Changelog](https://github.com/svenstaro/upload-release-action/blob/master/CHANGELOG.md) - [Commits](https://github.com/svenstaro/upload-release-action/compare/2.7.0...2.11.1) --- updated-dependencies: - dependency-name: svenstaro/upload-release-action dependency-version: 2.11.1 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/publish-apt-brew-pkg.yml | 2 +- .github/workflows/publish-binaries.yml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish-apt-brew-pkg.yml b/.github/workflows/publish-apt-brew-pkg.yml index e6adfca57..5b6994dcf 100644 --- a/.github/workflows/publish-apt-brew-pkg.yml +++ b/.github/workflows/publish-apt-brew-pkg.yml @@ -32,7 +32,7 @@ jobs: - name: Build deb package run: cargo deb -p meilisearch -o target/debian/meilisearch.deb - name: Upload debian pkg to release - uses: svenstaro/upload-release-action@2.7.0 + uses: svenstaro/upload-release-action@2.11.1 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/debian/meilisearch.deb diff --git a/.github/workflows/publish-binaries.yml b/.github/workflows/publish-binaries.yml index 885a04d0d..3200e778e 100644 --- a/.github/workflows/publish-binaries.yml +++ b/.github/workflows/publish-binaries.yml @@ -51,7 +51,7 @@ jobs: # No need to upload binaries for dry run (cron) - name: Upload binaries to release if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.7.0 + uses: svenstaro/upload-release-action@2.11.1 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/release/meilisearch @@ -81,7 +81,7 @@ jobs: # No need to upload binaries for dry run (cron) - name: Upload binaries to release if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.7.0 + uses: svenstaro/upload-release-action@2.11.1 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/release/${{ matrix.artifact_name }} @@ -113,7 +113,7 @@ jobs: - name: Upload the binary to release # No need to upload binaries for dry run (cron) if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.7.0 + uses: svenstaro/upload-release-action@2.11.1 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/${{ matrix.target }}/release/meilisearch @@ -178,7 +178,7 @@ jobs: - name: Upload the binary to release # No need to upload binaries for dry run (cron) if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.7.0 + uses: svenstaro/upload-release-action@2.11.1 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/${{ matrix.target }}/release/meilisearch From d2e4d6dd8ae78273fe7644262fbdf86116273276 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:11:40 +0200 Subject: [PATCH 087/150] prompt: Publishes some types --- crates/milli/src/prompt/mod.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git 
a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index a8288f83d..f1b4ddf89 100644 --- a/crates/milli/src/prompt/mod.rs +++ b/crates/milli/src/prompt/mod.rs @@ -9,12 +9,11 @@ use std::fmt::Debug; use std::num::NonZeroUsize; use bumpalo::Bump; -use document::ParseableDocument; +pub(crate) use document::{Document, ParseableDocument}; use error::{NewPromptError, RenderPromptError}; -use fields::{BorrowedFields, OwnedFields}; +pub use fields::{BorrowedFields, OwnedFields}; -use self::context::Context; -use self::document::Document; +pub use self::context::Context; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::del_add::DelAdd; use crate::GlobalFieldsIdsMap; From 76ca44b2141e18e7ba399e031ae3bb6b468cf36f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:47:49 +0200 Subject: [PATCH 088/150] Expand `json_template` module --- .../injectable_value.rs} | 180 ++++------- crates/milli/src/vector/json_template/mod.rs | 283 ++++++++++++++++++ 2 files changed, 342 insertions(+), 121 deletions(-) rename crates/milli/src/vector/{json_template.rs => json_template/injectable_value.rs} (84%) create mode 100644 crates/milli/src/vector/json_template/mod.rs diff --git a/crates/milli/src/vector/json_template.rs b/crates/milli/src/vector/json_template/injectable_value.rs similarity index 84% rename from crates/milli/src/vector/json_template.rs rename to crates/milli/src/vector/json_template/injectable_value.rs index 179cbe9af..ec7d900db 100644 --- a/crates/milli/src/vector/json_template.rs +++ b/crates/milli/src/vector/json_template/injectable_value.rs @@ -1,20 +1,17 @@ -//! Module to manipulate JSON templates. +//! Module to manipulate JSON values containing placeholder strings. //! //! This module allows two main operations: -//! 1. Render JSON values from a template and a context value. -//! 2. Retrieve data from a template and JSON values. - -#![warn(rustdoc::broken_intra_doc_links)] -#![warn(missing_docs)] +//! 1. Render JSON values from a template value containing placeholders and a value to inject. +//! 2. Extract data from a template value containing placeholders and a concrete JSON value that fits the template value. use serde::Deserialize; use serde_json::{Map, Value}; -type ValuePath = Vec; +use super::{format_value, inject_value, path_with_root, PathComponent, ValuePath}; /// Encapsulates a JSON template and allows injecting and extracting values from it. #[derive(Debug)] -pub struct ValueTemplate { +pub struct InjectableValue { template: Value, value_kind: ValueKind, } @@ -32,34 +29,13 @@ struct ArrayPath { value_path_in_array: ValuePath, } -/// Component of a path to a Value -#[derive(Debug, Clone)] -pub enum PathComponent { - /// A key inside of an object - MapKey(String), - /// An index inside of an array - ArrayIndex(usize), -} - -impl PartialEq for PathComponent { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (Self::MapKey(l0), Self::MapKey(r0)) => l0 == r0, - (Self::ArrayIndex(l0), Self::ArrayIndex(r0)) => l0 == r0, - _ => false, - } - } -} - -impl Eq for PathComponent {} - -/// Error that occurs when no few value was provided to a template for injection. +/// Error that occurs when no value was provided to a template for injection. 
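As context for the renames above: an `InjectableValue` is parsed from a JSON value containing a placeholder string and, optionally, a repeat string, and the error variants encode its shape rules — the repeat marker must be the second element of an array, and the element before it must contain the placeholder. A sketch of the two accepted template shapes, using the `{{text}}`/`{{..}}` markers from this module's tests (`serde_json` assumed as a dependency; the commented constructor call is crate-internal and shown for shape only):

use serde_json::json;

fn main() {
    // Single injection: the placeholder marks where the value is spliced in.
    let single = json!({ "input": "{{text}}" });

    // Repeated injection: the repeat marker must sit at position #1 of the
    // array, and the element before it must contain the placeholder.
    let repeated = json!({ "input": [{ "content": "{{text}}" }, "{{..}}"] });

    // With the renamed API, parsing would look roughly like:
    // let template = InjectableValue::new(single, "{{text}}", "{{..}}")?;
    println!("{single}\n{repeated}");
}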
#[derive(Debug)] pub struct MissingValue; -/// Error that occurs when trying to parse a template in [`ValueTemplate::new`] +/// Error that occurs when trying to parse a template in [`InjectableValue::new`] #[derive(Debug)] -pub enum TemplateParsingError { +pub enum InjectableParsingError { /// A repeat string appears inside a repeated value NestedRepeatString(ValuePath), /// A repeat string appears outside of an array @@ -85,42 +61,42 @@ pub enum TemplateParsingError { }, } -impl TemplateParsingError { +impl InjectableParsingError { /// Produce an error message from the error kind, the name of the root object, the placeholder string and the repeat string pub fn error_message(&self, root: &str, placeholder: &str, repeat: &str) -> String { match self { - TemplateParsingError::NestedRepeatString(path) => { + InjectableParsingError::NestedRepeatString(path) => { format!( r#"in {}: "{repeat}" appears nested inside of a value that is itself repeated"#, path_with_root(root, path) ) } - TemplateParsingError::RepeatStringNotInArray(path) => format!( + InjectableParsingError::RepeatStringNotInArray(path) => format!( r#"in {}: "{repeat}" appears outside of an array"#, path_with_root(root, path) ), - TemplateParsingError::BadIndexForRepeatString(path, index) => format!( + InjectableParsingError::BadIndexForRepeatString(path, index) => format!( r#"in {}: "{repeat}" expected at position #1, but found at position #{index}"#, path_with_root(root, path) ), - TemplateParsingError::MissingPlaceholderInRepeatedValue(path) => format!( + InjectableParsingError::MissingPlaceholderInRepeatedValue(path) => format!( r#"in {}: Expected "{placeholder}" inside of the repeated value"#, path_with_root(root, path) ), - TemplateParsingError::MultipleRepeatString(current, previous) => format!( + InjectableParsingError::MultipleRepeatString(current, previous) => format!( r#"in {}: Found "{repeat}", but it was already present in {}"#, path_with_root(root, current), path_with_root(root, previous) ), - TemplateParsingError::MultiplePlaceholderString(current, previous) => format!( + InjectableParsingError::MultiplePlaceholderString(current, previous) => format!( r#"in {}: Found "{placeholder}", but it was already present in {}"#, path_with_root(root, current), path_with_root(root, previous) ), - TemplateParsingError::MissingPlaceholderString => { + InjectableParsingError::MissingPlaceholderString => { format!(r#"in `{root}`: "{placeholder}" not found"#) } - TemplateParsingError::BothArrayAndSingle { + InjectableParsingError::BothArrayAndSingle { single_path, path_to_array, array_to_placeholder, @@ -140,41 +116,41 @@ impl TemplateParsingError { fn prepend_path(self, mut prepended_path: ValuePath) -> Self { match self { - TemplateParsingError::NestedRepeatString(mut path) => { + InjectableParsingError::NestedRepeatString(mut path) => { prepended_path.append(&mut path); - TemplateParsingError::NestedRepeatString(prepended_path) + InjectableParsingError::NestedRepeatString(prepended_path) } - TemplateParsingError::RepeatStringNotInArray(mut path) => { + InjectableParsingError::RepeatStringNotInArray(mut path) => { prepended_path.append(&mut path); - TemplateParsingError::RepeatStringNotInArray(prepended_path) + InjectableParsingError::RepeatStringNotInArray(prepended_path) } - TemplateParsingError::BadIndexForRepeatString(mut path, index) => { + InjectableParsingError::BadIndexForRepeatString(mut path, index) => { prepended_path.append(&mut path); - TemplateParsingError::BadIndexForRepeatString(prepended_path, index) + 
InjectableParsingError::BadIndexForRepeatString(prepended_path, index) } - TemplateParsingError::MissingPlaceholderInRepeatedValue(mut path) => { + InjectableParsingError::MissingPlaceholderInRepeatedValue(mut path) => { prepended_path.append(&mut path); - TemplateParsingError::MissingPlaceholderInRepeatedValue(prepended_path) + InjectableParsingError::MissingPlaceholderInRepeatedValue(prepended_path) } - TemplateParsingError::MultipleRepeatString(mut path, older_path) => { + InjectableParsingError::MultipleRepeatString(mut path, older_path) => { let older_prepended_path = prepended_path.iter().cloned().chain(older_path).collect(); prepended_path.append(&mut path); - TemplateParsingError::MultipleRepeatString(prepended_path, older_prepended_path) + InjectableParsingError::MultipleRepeatString(prepended_path, older_prepended_path) } - TemplateParsingError::MultiplePlaceholderString(mut path, older_path) => { + InjectableParsingError::MultiplePlaceholderString(mut path, older_path) => { let older_prepended_path = prepended_path.iter().cloned().chain(older_path).collect(); prepended_path.append(&mut path); - TemplateParsingError::MultiplePlaceholderString( + InjectableParsingError::MultiplePlaceholderString( prepended_path, older_prepended_path, ) } - TemplateParsingError::MissingPlaceholderString => { - TemplateParsingError::MissingPlaceholderString + InjectableParsingError::MissingPlaceholderString => { + InjectableParsingError::MissingPlaceholderString } - TemplateParsingError::BothArrayAndSingle { + InjectableParsingError::BothArrayAndSingle { single_path, mut path_to_array, array_to_placeholder, @@ -184,7 +160,7 @@ impl TemplateParsingError { prepended_path.iter().cloned().chain(single_path).collect(); prepended_path.append(&mut path_to_array); // we don't prepend the array_to_placeholder path as it is the array path that is prepended - TemplateParsingError::BothArrayAndSingle { + InjectableParsingError::BothArrayAndSingle { single_path: single_prepended_path, path_to_array: prepended_path, array_to_placeholder, @@ -194,7 +170,7 @@ impl TemplateParsingError { } } -/// Error that occurs when [`ValueTemplate::extract`] fails. +/// Error that occurs when [`InjectableValue::extract`] fails. #[derive(Debug)] pub struct ExtractionError { /// The cause of the failure @@ -336,27 +312,6 @@ enum LastNamedObject<'a> { NestedArrayInsideObject { object_name: &'a str, index: usize, nesting_level: usize }, } -/// Builds a string representation of a path, preprending the name of the root value. -pub fn path_with_root<'a>( - root: &str, - path: impl IntoIterator + 'a, -) -> String { - use std::fmt::Write as _; - let mut res = format!("`{root}"); - for component in path.into_iter() { - match component { - PathComponent::MapKey(key) => { - let _ = write!(&mut res, ".{key}"); - } - PathComponent::ArrayIndex(index) => { - let _ = write!(&mut res, "[{index}]"); - } - } - } - res.push('`'); - res -} - /// Context where an extraction failure happened /// /// The operation that failed @@ -405,7 +360,7 @@ enum ArrayParsingContext<'a> { NotNested(&'a mut Option), } -impl ValueTemplate { +impl InjectableValue { /// Prepare a template for injection or extraction. 
/// /// # Parameters @@ -419,12 +374,12 @@ impl ValueTemplate { /// /// # Errors /// - /// - [`TemplateParsingError`]: refer to the documentation of this type + /// - [`InjectableParsingError`]: refer to the documentation of this type pub fn new( template: Value, placeholder_string: &str, repeat_string: &str, - ) -> Result { + ) -> Result { let mut value_path = None; let mut array_path = None; let mut current_path = Vec::new(); @@ -438,11 +393,11 @@ impl ValueTemplate { )?; let value_kind = match (array_path, value_path) { - (None, None) => return Err(TemplateParsingError::MissingPlaceholderString), + (None, None) => return Err(InjectableParsingError::MissingPlaceholderString), (None, Some(value_path)) => ValueKind::Single(value_path), (Some(array_path), None) => ValueKind::Array(array_path), (Some(array_path), Some(value_path)) => { - return Err(TemplateParsingError::BothArrayAndSingle { + return Err(InjectableParsingError::BothArrayAndSingle { single_path: value_path, path_to_array: array_path.path_to_array, array_to_placeholder: array_path.value_path_in_array, @@ -564,29 +519,29 @@ impl ValueTemplate { value_path: &mut Option, mut array_path: &mut ArrayParsingContext, current_path: &mut ValuePath, - ) -> Result<(), TemplateParsingError> { + ) -> Result<(), InjectableParsingError> { // two modes for parsing array. match array { // 1. array contains a repeat string in second position [first, second, rest @ ..] if second == repeat_string => { let ArrayParsingContext::NotNested(array_path) = &mut array_path else { - return Err(TemplateParsingError::NestedRepeatString(current_path.clone())); + return Err(InjectableParsingError::NestedRepeatString(current_path.clone())); }; if let Some(array_path) = array_path { - return Err(TemplateParsingError::MultipleRepeatString( + return Err(InjectableParsingError::MultipleRepeatString( current_path.clone(), array_path.path_to_array.clone(), )); } if first == repeat_string { - return Err(TemplateParsingError::BadIndexForRepeatString( + return Err(InjectableParsingError::BadIndexForRepeatString( current_path.clone(), 0, )); } if let Some(position) = rest.iter().position(|value| value == repeat_string) { let position = position + 2; - return Err(TemplateParsingError::BadIndexForRepeatString( + return Err(InjectableParsingError::BadIndexForRepeatString( current_path.clone(), position, )); @@ -609,7 +564,9 @@ impl ValueTemplate { value_path.ok_or_else(|| { let mut repeated_value_path = current_path.clone(); repeated_value_path.push(PathComponent::ArrayIndex(0)); - TemplateParsingError::MissingPlaceholderInRepeatedValue(repeated_value_path) + InjectableParsingError::MissingPlaceholderInRepeatedValue( + repeated_value_path, + ) })? }; **array_path = Some(ArrayPath { @@ -621,7 +578,7 @@ impl ValueTemplate { // 2. 
array does not contain a repeat string array => { if let Some(position) = array.iter().position(|value| value == repeat_string) { - return Err(TemplateParsingError::BadIndexForRepeatString( + return Err(InjectableParsingError::BadIndexForRepeatString( current_path.clone(), position, )); @@ -650,7 +607,7 @@ impl ValueTemplate { value_path: &mut Option, array_path: &mut ArrayParsingContext, current_path: &mut ValuePath, - ) -> Result<(), TemplateParsingError> { + ) -> Result<(), InjectableParsingError> { for (key, value) in object.iter() { current_path.push(PathComponent::MapKey(key.to_owned())); Self::parse_value( @@ -673,12 +630,12 @@ impl ValueTemplate { value_path: &mut Option, array_path: &mut ArrayParsingContext, current_path: &mut ValuePath, - ) -> Result<(), TemplateParsingError> { + ) -> Result<(), InjectableParsingError> { match value { Value::String(str) => { if placeholder_string == str { if let Some(value_path) = value_path { - return Err(TemplateParsingError::MultiplePlaceholderString( + return Err(InjectableParsingError::MultiplePlaceholderString( current_path.clone(), value_path.clone(), )); @@ -687,7 +644,9 @@ impl ValueTemplate { *value_path = Some(current_path.clone()); } if repeat_string == str { - return Err(TemplateParsingError::RepeatStringNotInArray(current_path.clone())); + return Err(InjectableParsingError::RepeatStringNotInArray( + current_path.clone(), + )); } } Value::Null | Value::Bool(_) | Value::Number(_) => {} @@ -712,27 +671,6 @@ impl ValueTemplate { } } -fn inject_value(rendered: &mut Value, injection_path: &Vec, injected_value: Value) { - let mut current_value = rendered; - for injection_component in injection_path { - current_value = match injection_component { - PathComponent::MapKey(key) => current_value.get_mut(key).unwrap(), - PathComponent::ArrayIndex(index) => current_value.get_mut(index).unwrap(), - } - } - *current_value = injected_value; -} - -fn format_value(value: &Value) -> String { - match value { - Value::Array(array) => format!("an array of size {}", array.len()), - Value::Object(object) => { - format!("an object with {} field(s)", object.len()) - } - value => value.to_string(), - } -} - fn extract_value( extraction_path: &[PathComponent], initial_value: &mut Value, @@ -838,10 +776,10 @@ impl ExtractionResultErrorContext for Result { mod test { use serde_json::{json, Value}; - use super::{PathComponent, TemplateParsingError, ValueTemplate}; + use super::{InjectableParsingError, InjectableValue, PathComponent}; - fn new_template(template: Value) -> Result { - ValueTemplate::new(template, "{{text}}", "{{..}}") + fn new_template(template: Value) -> Result { + InjectableValue::new(template, "{{text}}", "{{..}}") } #[test] @@ -853,7 +791,7 @@ mod test { }); let error = new_template(template.clone()).unwrap_err(); - assert!(matches!(error, TemplateParsingError::MissingPlaceholderString)) + assert!(matches!(error, InjectableParsingError::MissingPlaceholderString)) } #[test] @@ -887,7 +825,7 @@ mod test { }); match new_template(template.clone()) { - Err(TemplateParsingError::MultiplePlaceholderString(left, right)) => { + Err(InjectableParsingError::MultiplePlaceholderString(left, right)) => { assert_eq!( left, vec![PathComponent::MapKey("titi".into()), PathComponent::ArrayIndex(3)] diff --git a/crates/milli/src/vector/json_template/mod.rs b/crates/milli/src/vector/json_template/mod.rs new file mode 100644 index 000000000..57a3b67b1 --- /dev/null +++ b/crates/milli/src/vector/json_template/mod.rs @@ -0,0 +1,283 @@ +//! 
Exposes types to manipulate JSON values +//! +//! - [`JsonTemplate`]: renders JSON values by rendering its strings as [`Template`]s. +//! - [`InjectableValue`]: Describes a JSON value containing placeholders, +//! then allows to inject values instead of the placeholder to produce new concrete JSON values, +//! or extract sub-values at the placeholder location from concrete JSON values. +//! +//! The module also exposes foundational types to work with JSON paths: +//! +//! - [`ValuePath`] is made of [`PathComponent`]s to indicate the location of a sub-value inside of a JSON value. +//! - [`inject_value`] is a primitive that replaces the sub-value at the described location by an injected value. + +#![warn(rustdoc::broken_intra_doc_links)] +#![warn(missing_docs)] + +use bumpalo::Bump; +use liquid::{Parser, Template}; +use serde_json::{Map, Value}; + +use crate::prompt::ParseableDocument; +use crate::update::new::document::Document; + +mod injectable_value; + +pub use injectable_value::InjectableValue; + +/// Represents a JSON [`Value`] where each string is rendered as a [`Template`]. +#[derive(Debug)] +pub struct JsonTemplate { + value: Value, + templates: Vec, +} + +impl Clone for JsonTemplate { + fn clone(&self) -> Self { + Self::new(self.value.clone()).unwrap() + } +} + +struct TemplateAtPath { + template: Template, + path: ValuePath, +} + +impl std::fmt::Debug for TemplateAtPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TemplateAtPath") + .field("template", &&"template") + .field("path", &self.path) + .finish() + } +} + +/// Error that can occur either when parsing the templates in the value, or when trying to render them. +#[derive(Debug)] +pub struct Error { + template_error: liquid::Error, + path: ValuePath, +} + +impl Error { + /// Produces an error message when the error happened at rendering time. + pub fn rendering_error(&self, root: &str) -> String { + format!( + "in `{}`, error while rendering template: {}", + path_with_root(root, self.path.iter()), + &self.template_error + ) + } + + /// Produces an error message when the error happened at parsing time. + pub fn parsing(&self, root: &str) -> String { + format!( + "in `{}`, error while parsing template: {}", + path_with_root(root, self.path.iter()), + &self.template_error + ) + } +} + +impl JsonTemplate { + /// Creates a new `JsonTemplate` by parsing all strings inside the value as templates. + /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be parsed. + pub fn new(value: Value) -> Result { + let templates = build_templates(&value)?; + Ok(Self { value, templates }) + } + + /// Renders this value by replacing all its strings with the rendered version of the template they represent from the given context. + /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be rendered with the given context. + pub fn render(&self, context: &dyn liquid::ObjectView) -> Result { + let mut rendered = self.value.clone(); + for TemplateAtPath { template, path } in &self.templates { + let injected_value = + template.render(context).map_err(|err| error_with_path(err, path.clone()))?; + inject_value(&mut rendered, path, Value::String(injected_value)); + } + Ok(rendered) + } + + /// Renders this value by replacing all its strings with the rendered version of the template they represent from the contents of the given document. 
+ /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be rendered with the given document. + pub fn render_document<'a, 'doc, D: Document<'a> + std::fmt::Debug>( + &self, + document: D, + doc_alloc: &'doc Bump, + ) -> Result { + let document = ParseableDocument::new(document, doc_alloc); + let v: Vec = vec![]; + let context = crate::prompt::Context::new(&document, &v); + self.render(&context) + } + + /// Renders this value by replacing all its strings with the rendered version of the template they represent from the contents of the search query. + /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be rendered from the contents of the search query + pub fn render_search(&self, q: Option<&str>, media: Option<&Value>) -> Result { + let search_data = match (q, media) { + (None, None) => liquid::object!({}), + (None, Some(media)) => liquid::object!({ "media": media }), + (Some(q), None) => liquid::object!({"q": q}), + (Some(q), Some(media)) => liquid::object!({"q": q, "media": media}), + }; + self.render(&search_data) + } + + /// The JSON value representing the underlying template + pub fn template(&self) -> &Value { + &self.value + } +} + +fn build_templates(value: &Value) -> Result, Error> { + let mut current_path = ValuePath::new(); + let mut templates = Vec::new(); + let compiler = liquid::ParserBuilder::with_stdlib().build().unwrap(); + parse_value(value, &mut current_path, &mut templates, &compiler)?; + Ok(templates) +} + +fn error_with_path(template_error: liquid::Error, path: ValuePath) -> Error { + Error { template_error, path } +} + +fn parse_value( + value: &Value, + current_path: &mut ValuePath, + templates: &mut Vec, + compiler: &Parser, +) -> Result<(), Error> { + match value { + Value::String(template) => { + let template = compiler + .parse(template) + .map_err(|err| error_with_path(err, current_path.clone()))?; + templates.push(TemplateAtPath { template, path: current_path.clone() }); + } + Value::Array(values) => { + parse_array(values, current_path, templates, compiler)?; + } + Value::Object(map) => { + parse_object(map, current_path, templates, compiler)?; + } + _ => {} + } + Ok(()) +} + +fn parse_object( + map: &Map, + current_path: &mut ValuePath, + templates: &mut Vec, + compiler: &Parser, +) -> Result<(), Error> { + for (key, value) in map { + current_path.push(PathComponent::MapKey(key.clone())); + parse_value(value, current_path, templates, compiler)?; + current_path.pop(); + } + Ok(()) +} + +fn parse_array( + values: &[Value], + current_path: &mut ValuePath, + templates: &mut Vec, + compiler: &Parser, +) -> Result<(), Error> { + for (index, value) in values.iter().enumerate() { + current_path.push(PathComponent::ArrayIndex(index)); + parse_value(value, current_path, templates, compiler)?; + current_path.pop(); + } + Ok(()) +} + +/// A list of [`PathComponent`]s describing a path to a value inside a JSON value. +/// +/// The empty list refers to the root value. 
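A small sketch of the path primitives defined just below, assuming they behave as documented; the JSON shape is illustrative:

    // Addressing `.data[0].text` inside a rendered request body:
    let path: ValuePath = vec![
        PathComponent::MapKey("data".to_string()),
        PathComponent::ArrayIndex(0),
        PathComponent::MapKey("text".to_string()),
    ];
    assert_eq!(path_with_root("request", path.iter()), "`request.data[0].text`");

    let mut value = serde_json::json!({ "data": [{ "text": "placeholder" }] });
    inject_value(&mut value, &path, serde_json::json!("rendered text"));
    assert_eq!(value["data"][0]["text"], "rendered text");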
+pub type ValuePath = Vec<PathComponent>;
+
+/// Component of a path to a Value
+#[derive(Debug, Clone)]
+pub enum PathComponent {
+    /// A key inside of an object
+    MapKey(String),
+    /// An index inside of an array
+    ArrayIndex(usize),
+}
+
+impl PartialEq for PathComponent {
+    fn eq(&self, other: &Self) -> bool {
+        match (self, other) {
+            (Self::MapKey(l0), Self::MapKey(r0)) => l0 == r0,
+            (Self::ArrayIndex(l0), Self::ArrayIndex(r0)) => l0 == r0,
+            _ => false,
+        }
+    }
+}
+
+impl Eq for PathComponent {}
+
+/// Builds a string representation of a path, prepending the name of the root value.
+pub fn path_with_root<'a>(
+    root: &str,
+    path: impl IntoIterator<Item = &'a PathComponent> + 'a,
+) -> String {
+    use std::fmt::Write as _;
+    let mut res = format!("`{root}");
+    for component in path.into_iter() {
+        match component {
+            PathComponent::MapKey(key) => {
+                let _ = write!(&mut res, ".{key}");
+            }
+            PathComponent::ArrayIndex(index) => {
+                let _ = write!(&mut res, "[{index}]");
+            }
+        }
+    }
+    res.push('`');
+    res
+}
+
+/// Modifies `rendered` to replace the sub-value at the `injection_path` location by the `injected_value`.
+///
+/// # Panics
+///
+/// - if the provided `injection_path` cannot be traversed in `rendered`.
+pub fn inject_value(
+    rendered: &mut Value,
+    injection_path: &Vec<PathComponent>,
+    injected_value: Value,
+) {
+    let mut current_value = rendered;
+    for injection_component in injection_path {
+        current_value = match injection_component {
+            PathComponent::MapKey(key) => current_value.get_mut(key).unwrap(),
+            PathComponent::ArrayIndex(index) => current_value.get_mut(index).unwrap(),
+        }
+    }
+    *current_value = injected_value;
+}
+
+fn format_value(value: &Value) -> String {
+    match value {
+        Value::Array(array) => format!("an array of size {}", array.len()),
+        Value::Object(object) => {
+            format!("an object with {} field(s)", object.len())
+        }
+        value => value.to_string(),
+    }
+}

From 17a94c40dc63c70bf66162a9fcd71fcad1d8ebfc Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Sun, 29 Jun 2025 23:48:38 +0200
Subject: [PATCH 089/150] Add `vector::db` module

---
 crates/milli/src/vector/db.rs  | 443 +++++++++++++++++++++++++++++++++
 crates/milli/src/vector/mod.rs |   1 +
 2 files changed, 444 insertions(+)
 create mode 100644 crates/milli/src/vector/db.rs

diff --git a/crates/milli/src/vector/db.rs b/crates/milli/src/vector/db.rs
new file mode 100644
index 000000000..0e890fac9
--- /dev/null
+++ b/crates/milli/src/vector/db.rs
@@ -0,0 +1,443 @@
+//!
Module containing types and methods to store meta-information about the embedders and fragments + +use std::borrow::Cow; + +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use heed::types::{SerdeJson, Str, U8}; +use heed::{BytesEncode, Database, RoTxn, RwTxn, Unspecified}; +use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; + +use crate::vector::settings::RemoveFragments; +use crate::vector::EmbeddingConfig; +use crate::{CboRoaringBitmapCodec, DocumentId, UserError}; + +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub config: EmbeddingConfig, + #[serde(default)] + pub fragments: FragmentConfigs, +} + +#[derive(Debug, Clone, Deserialize, Serialize, Default)] +pub struct FragmentConfigs(Vec); + +impl FragmentConfigs { + pub fn new() -> Self { + Default::default() + } + pub fn as_slice(&self) -> &[FragmentConfig] { + self.0.as_slice() + } + + pub fn into_inner(self) -> Vec { + self.0 + } + + pub fn remove_fragments<'a>( + &mut self, + fragments: impl IntoIterator, + ) -> Option { + let mut remove_fragments = Vec::new(); + for fragment in fragments { + let Ok(index_to_remove) = self.0.binary_search_by_key(&fragment, |f| &f.name) else { + continue; + }; + let fragment = self.0.swap_remove(index_to_remove); + remove_fragments.push(fragment.id); + } + (!remove_fragments.is_empty()).then_some(RemoveFragments { fragment_ids: remove_fragments }) + } + + pub fn add_new_fragments( + &mut self, + new_fragments: impl IntoIterator, + ) -> crate::Result<()> { + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + + for FragmentConfig { id, name: _ } in self.0.iter() { + free_indices[*id as usize] = false; + } + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + + let mut new_fragments = new_fragments.into_iter(); + + for name in &mut new_fragments { + let id = match find_free_index() { + Some(id) => id, + None => { + let more = (&mut new_fragments).count(); + return Err(UserError::TooManyFragments(u8::MAX as usize + more + 1).into()); + } + }; + self.0.push(FragmentConfig { id, name }); + } + Ok(()) + } +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct FragmentConfig { + pub id: u8, + pub name: String, +} + +pub struct IndexEmbeddingConfigs { + main: Database, + embedder_info: Database, +} + +pub struct EmbedderInfo { + pub embedder_id: u8, + pub embedding_status: EmbeddingStatus, +} + +impl EmbedderInfo { + pub fn to_bytes(&self) -> Result, heed::BoxedError> { + EmbedderInfoCodec::bytes_encode(self) + } +} + +/// Optimized struct to hold the list of documents that are `user_provided` and `must_regenerate`. +/// +/// Because most documents have the same value for `user_provided` and `must_regenerate`, we store only +/// the `user_provided` and a list of the documents for which `must_regenerate` assumes the other value +/// than `user_provided`. +#[derive(Default)] +pub struct EmbeddingStatus { + user_provided: RoaringBitmap, + skip_regenerate_different_from_user_provided: RoaringBitmap, +} + +impl EmbeddingStatus { + pub fn new() -> Self { + Default::default() + } + + /// Whether the document contains user-provided vectors for that embedder. + pub fn is_user_provided(&self, docid: DocumentId) -> bool { + self.user_provided.contains(docid) + } + /// Whether vectors should be regenerated for that document and that embedder. 
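The two-bitmap encoding above is compact but subtle; as a sketch (with `status: &EmbeddingStatus` and `docid` assumed bindings), the four reachable states are:

    // user_provided | in exception bitmap | must_regenerate
    // --------------+---------------------+----------------
    // false         | false               | true   (regular document, embedded automatically)
    // true          | false               | false  (user-provided vectors, left untouched)
    // false         | true                | false  (regeneration explicitly disabled)
    // true          | true                | true   (user-provided vectors, still regenerated)
    let invert = status.skip_regenerate_different_from_user_provided.contains(docid);
    let user_provided = status.user_provided.contains(docid);
    let must_regenerate = !(user_provided ^ invert);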
+ pub fn must_regenerate(&self, docid: DocumentId) -> bool { + let invert = self.skip_regenerate_different_from_user_provided.contains(docid); + let user_provided = self.user_provided.contains(docid); + !(user_provided ^ invert) + } + + pub fn is_user_provided_must_regenerate(&self, docid: DocumentId) -> (bool, bool) { + let invert = self.skip_regenerate_different_from_user_provided.contains(docid); + let user_provided = self.user_provided.contains(docid); + (user_provided, !(user_provided ^ invert)) + } + + pub fn user_provided_docids(&self) -> &RoaringBitmap { + &self.user_provided + } + + pub fn skip_regenerate_docids(&self) -> RoaringBitmap { + &self.user_provided ^ &self.skip_regenerate_different_from_user_provided + } + + pub(crate) fn into_user_provided(self) -> RoaringBitmap { + self.user_provided + } +} + +#[derive(Default)] +pub struct EmbeddingStatusDelta { + del_status: EmbeddingStatus, + add_status: EmbeddingStatus, +} + +impl EmbeddingStatusDelta { + pub fn new() -> Self { + Self::default() + } + + pub fn needs_change( + old_is_user_provided: bool, + old_must_regenerate: bool, + new_is_user_provided: bool, + new_must_regenerate: bool, + ) -> bool { + let old_skip_regenerate_different_user_provided = + old_is_user_provided == old_must_regenerate; + let new_skip_regenerate_different_user_provided = + new_is_user_provided == new_must_regenerate; + + old_is_user_provided != new_is_user_provided + || old_skip_regenerate_different_user_provided + != new_skip_regenerate_different_user_provided + } + + pub fn needs_clear(is_user_provided: bool, must_regenerate: bool) -> bool { + Self::needs_change(is_user_provided, must_regenerate, false, true) + } + + pub fn clear_docid( + &mut self, + docid: DocumentId, + is_user_provided: bool, + must_regenerate: bool, + ) { + self.push_delta(docid, is_user_provided, must_regenerate, false, true); + } + + pub fn push_delta( + &mut self, + docid: DocumentId, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_is_user_provided: bool, + new_must_regenerate: bool, + ) { + // must_regenerate == !skip_regenerate + let old_skip_regenerate_different_user_provided = + old_is_user_provided == old_must_regenerate; + let new_skip_regenerate_different_user_provided = + new_is_user_provided == new_must_regenerate; + + match (old_is_user_provided, new_is_user_provided) { + (true, true) | (false, false) => { /* no change */ } + (true, false) => { + self.del_status.user_provided.insert(docid); + } + (false, true) => { + self.add_status.user_provided.insert(docid); + } + } + + match ( + old_skip_regenerate_different_user_provided, + new_skip_regenerate_different_user_provided, + ) { + (true, true) | (false, false) => { /* no change */ } + (true, false) => { + self.del_status.skip_regenerate_different_from_user_provided.insert(docid); + } + (false, true) => { + self.add_status.skip_regenerate_different_from_user_provided.insert(docid); + } + } + } + + pub fn push_new(&mut self, docid: DocumentId, is_user_provided: bool, must_regenerate: bool) { + self.push_delta( + docid, + !is_user_provided, + !must_regenerate, + is_user_provided, + must_regenerate, + ); + } + + pub fn apply_to(&self, status: &mut EmbeddingStatus) { + status.user_provided -= &self.del_status.user_provided; + status.user_provided |= &self.add_status.user_provided; + + status.skip_regenerate_different_from_user_provided -= + &self.del_status.skip_regenerate_different_from_user_provided; + status.skip_regenerate_different_from_user_provided |= + 
&self.add_status.skip_regenerate_different_from_user_provided; + } +} + +struct EmbedderInfoCodec; + +impl<'a> heed::BytesDecode<'a> for EmbedderInfoCodec { + type DItem = EmbedderInfo; + + fn bytes_decode(mut bytes: &'a [u8]) -> Result { + let embedder_id = bytes.read_u8()?; + // Support all version that didn't store the embedding status + if bytes.is_empty() { + return Ok(EmbedderInfo { embedder_id, embedding_status: EmbeddingStatus::new() }); + } + let first_bitmap_size = bytes.read_u32::()?; + let first_bitmap_bytes = &bytes[..first_bitmap_size as usize]; + let user_provided = CboRoaringBitmapCodec::bytes_decode(first_bitmap_bytes)?; + let skip_regenerate_different_from_user_provided = + CboRoaringBitmapCodec::bytes_decode(&bytes[first_bitmap_size as usize..])?; + Ok(EmbedderInfo { + embedder_id, + embedding_status: EmbeddingStatus { + user_provided, + skip_regenerate_different_from_user_provided, + }, + }) + } +} + +impl<'a> heed::BytesEncode<'a> for EmbedderInfoCodec { + type EItem = EmbedderInfo; + + fn bytes_encode(item: &'a Self::EItem) -> Result, heed::BoxedError> { + let first_bitmap_size = + CboRoaringBitmapCodec::serialized_size(&item.embedding_status.user_provided); + let second_bitmap_size = CboRoaringBitmapCodec::serialized_size( + &item.embedding_status.skip_regenerate_different_from_user_provided, + ); + + let mut bytes = Vec::with_capacity(1 + 4 + first_bitmap_size + second_bitmap_size); + bytes.write_u8(item.embedder_id)?; + bytes.write_u32::(first_bitmap_size.try_into()?)?; + CboRoaringBitmapCodec::serialize_into_writer( + &item.embedding_status.user_provided, + &mut bytes, + )?; + CboRoaringBitmapCodec::serialize_into_writer( + &item.embedding_status.skip_regenerate_different_from_user_provided, + &mut bytes, + )?; + Ok(bytes.into()) + } +} + +impl IndexEmbeddingConfigs { + pub(crate) fn new( + main: Database, + embedder_info: Database, + ) -> Self { + Self { main, embedder_info: embedder_info.remap_types() } + } + + pub(crate) fn put_embedding_configs( + &self, + wtxn: &mut RwTxn<'_>, + configs: Vec, + ) -> heed::Result<()> { + self.main.remap_types::>>().put( + wtxn, + crate::index::main_key::EMBEDDING_CONFIGS, + &configs, + ) + } + + pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(wtxn, crate::index::main_key::EMBEDDING_CONFIGS) + } + + pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> heed::Result> { + Ok(self + .main + .remap_types::>>() + .get(rtxn, crate::index::main_key::EMBEDDING_CONFIGS)? + .unwrap_or_default()) + } + + pub fn embedder_id(&self, rtxn: &RoTxn<'_>, name: &str) -> heed::Result> { + self.embedder_info.remap_data_type::().get(rtxn, name) + } + + pub fn put_fresh_embedder_id( + &self, + wtxn: &mut RwTxn<'_>, + name: &str, + embedder_id: u8, + ) -> heed::Result<()> { + let info = EmbedderInfo { embedder_id, embedding_status: EmbeddingStatus::new() }; + self.put_embedder_info(wtxn, name, &info) + } + + /// Iterate through the passed list of embedder names, associating a fresh embedder id to any new names. + /// + /// Passing the name of a currently existing embedder is not an error, and will not modify its embedder id, + /// so it is not necessary to differentiate between new and existing embedders before calling this function. 
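A hypothetical call site for the id-allocation methods below, assuming an open write transaction; the embedder names are illustrative:

    let embedders = index.embedding_configs();
    // Ids are handed out first-free-slot, so ids freed by removed embedders are
    // recycled, and names that already have an id keep it.
    embedders.add_new_embedders(&mut wtxn, ["default", "multimodal"], 2)?;
    let id = embedders.embedder_id(&wtxn, "multimodal")?.expect("id was just assigned");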
+ pub fn add_new_embedders<'a>( + &self, + wtxn: &mut RwTxn<'_>, + embedder_names: impl IntoIterator, + total_embedder_count: usize, + ) -> crate::Result<()> { + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + + for res in self.embedder_info.iter(wtxn)? { + let (_name, EmbedderInfo { embedder_id, embedding_status: _ }) = res?; + free_indices[embedder_id as usize] = false; + } + + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + + for embedder_name in embedder_names { + if self.embedder_id(wtxn, embedder_name)?.is_some() { + continue; + } + let embedder_id = find_free_index() + .ok_or(crate::UserError::TooManyEmbedders(total_embedder_count))?; + tracing::debug!( + embedder = embedder_name, + embedder_id, + "assigning free id to new embedder" + ); + self.put_fresh_embedder_id(wtxn, embedder_name, embedder_id)?; + } + Ok(()) + } + + pub fn embedder_info( + &self, + rtxn: &RoTxn<'_>, + name: &str, + ) -> heed::Result> { + self.embedder_info.get(rtxn, name) + } + + /// Clear the list of docids that are `user_provided` or `must_regenerate` across all embedders. + pub fn clear_embedder_info_docids(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<()> { + let mut it = self.embedder_info.iter_mut(wtxn)?; + while let Some(res) = it.next() { + let (embedder_name, info) = res?; + let embedder_name = embedder_name.to_owned(); + // SAFETY: we copied the `embedder_name` so are not using the reference while using put + unsafe { + it.put_current( + &embedder_name, + &EmbedderInfo { + embedder_id: info.embedder_id, + embedding_status: EmbeddingStatus::new(), + }, + )?; + } + } + Ok(()) + } + + pub fn iter_embedder_info<'a>( + &self, + rtxn: &'a RoTxn<'_>, + ) -> heed::Result>> { + self.embedder_info.iter(rtxn) + } + + pub fn iter_embedder_id<'a>( + &self, + rtxn: &'a RoTxn<'_>, + ) -> heed::Result>> { + self.embedder_info.remap_data_type::().iter(rtxn) + } + + pub fn remove_embedder( + &self, + wtxn: &mut RwTxn<'_>, + name: &str, + ) -> heed::Result> { + let info = self.embedder_info.get(wtxn, name)?; + self.embedder_info.delete(wtxn, name)?; + Ok(info) + } + + pub fn put_embedder_info( + &self, + wtxn: &mut RwTxn<'_>, + name: &str, + info: &EmbedderInfo, + ) -> heed::Result<()> { + self.embedder_info.put(wtxn, name, info) + } +} diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 065beb5fb..ec4ee2ccd 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -18,6 +18,7 @@ use crate::prompt::{Prompt, PromptData}; use crate::ThreadPoolNoAbort; pub mod composite; +pub mod db; pub mod error; pub mod hf; pub mod json_template; From 0114796d2aaba9b638e188541dd1edba5ddd06e6 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:56:44 +0200 Subject: [PATCH 090/150] Index uses the vector::db stuff --- crates/milli/src/index.rs | 70 +++++++++++++-------------------------- 1 file changed, 23 insertions(+), 47 deletions(-) diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index e9e63a853..b2ec992ba 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -30,7 +30,8 @@ use crate::order_by_map::OrderByMap; use crate::prompt::PromptData; use crate::proximity::ProximityPrecision; use crate::update::new::StdResult; -use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig}; +use crate::vector::db::IndexEmbeddingConfigs; +use crate::vector::{ArroyStats, 
ArroyWrapper, Embedding}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, @@ -177,7 +178,7 @@ pub struct Index { pub field_id_docid_facet_strings: Database, /// Maps an embedder name to its id in the arroy store. - pub embedder_category_id: Database, + pub(crate) embedder_category_id: Database, /// Vector store based on arroy™. pub vector_arroy: arroy::Database, @@ -1745,34 +1746,6 @@ impl Index { self.main.remap_key_type::().delete(txn, main_key::LOCALIZED_ATTRIBUTES_RULES) } - /// Put the embedding configs: - /// 1. The name of the embedder - /// 2. The configuration option for this embedder - /// 3. The list of documents with a user provided embedding - pub(crate) fn put_embedding_configs( - &self, - wtxn: &mut RwTxn<'_>, - configs: Vec, - ) -> heed::Result<()> { - self.main.remap_types::>>().put( - wtxn, - main_key::EMBEDDING_CONFIGS, - &configs, - ) - } - - pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { - self.main.remap_key_type::().delete(wtxn, main_key::EMBEDDING_CONFIGS) - } - - pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result> { - Ok(self - .main - .remap_types::>>() - .get(rtxn, main_key::EMBEDDING_CONFIGS)? - .unwrap_or_default()) - } - pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> { self.main.remap_types::().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff) } @@ -1785,19 +1758,29 @@ impl Index { self.main.remap_key_type::().delete(wtxn, main_key::SEARCH_CUTOFF) } + pub fn embedding_configs(&self) -> IndexEmbeddingConfigs { + IndexEmbeddingConfigs::new(self.main, self.embedder_category_id) + } + pub fn embeddings( &self, rtxn: &RoTxn<'_>, docid: DocumentId, - ) -> Result>> { + ) -> Result, bool)>> { let mut res = BTreeMap::new(); - let embedding_configs = self.embedding_configs(rtxn)?; - for config in embedding_configs { - let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); - let reader = - ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); + let embedders = self.embedding_configs(); + for config in embedders.embedding_configs(rtxn)? { + let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap(); + let reader = ArroyWrapper::new( + self.vector_arroy, + embedder_info.embedder_id, + config.config.quantized(), + ); let embeddings = reader.item_vectors(rtxn, docid)?; - res.insert(config.name.to_owned(), embeddings); + res.insert( + config.name.to_owned(), + (embeddings, embedder_info.embedding_status.must_regenerate(docid)), + ); } Ok(res) } @@ -1809,9 +1792,9 @@ impl Index { pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result { let mut stats = ArroyStats::default(); - let embedding_configs = self.embedding_configs(rtxn)?; - for config in embedding_configs { - let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); + let embedding_configs = self.embedding_configs(); + for config in embedding_configs.embedding_configs(rtxn)? 
{ + let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap(); let reader = ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); reader.aggregate_stats(rtxn, &mut stats)?; @@ -1936,13 +1919,6 @@ impl Index { } } -#[derive(Debug, Deserialize, Serialize)] -pub struct IndexEmbeddingConfig { - pub name: String, - pub config: EmbeddingConfig, - pub user_provided: RoaringBitmap, -} - #[derive(Debug, Default, Deserialize, Serialize)] pub struct ChatConfig { pub description: String, From c16c60b5998e88a835d20505799b8f0c779d1922 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:48:53 +0200 Subject: [PATCH 091/150] Add `vector::extractor` module --- crates/milli/src/vector/extractor.rs | 214 +++++++++++++++++++++++++++ crates/milli/src/vector/mod.rs | 1 + 2 files changed, 215 insertions(+) create mode 100644 crates/milli/src/vector/extractor.rs diff --git a/crates/milli/src/vector/extractor.rs b/crates/milli/src/vector/extractor.rs new file mode 100644 index 000000000..cbfc62ee1 --- /dev/null +++ b/crates/milli/src/vector/extractor.rs @@ -0,0 +1,214 @@ +use std::cell::RefCell; +use std::collections::BTreeMap; +use std::fmt::Debug; + +use bumpalo::Bump; +use serde_json::Value; + +use super::json_template::{self, JsonTemplate}; +use crate::prompt::error::RenderPromptError; +use crate::prompt::Prompt; +use crate::update::new::document::Document; +use crate::vector::RuntimeFragment; +use crate::GlobalFieldsIdsMap; + +pub trait Extractor<'doc> { + type DocumentMetadata; + type Input: PartialEq; + type Error; + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + meta: &Self::DocumentMetadata, + ) -> Result, Self::Error>; + + fn extractor_id(&self) -> u8; + + fn diff_documents<'a, OD: Document<'a> + Debug, ND: Document<'a> + Debug>( + &self, + old: OD, + new: ND, + meta: &Self::DocumentMetadata, + ) -> Result, Self::Error> + where + 'doc: 'a, + { + let old_input = self.extract(old, meta); + let new_input = self.extract(new, meta); + to_diff(old_input, new_input) + } + + fn diff_settings<'a, D: Document<'a> + Debug>( + &self, + doc: D, + meta: &Self::DocumentMetadata, + old: Option<&Self>, + ) -> Result, Self::Error> { + let old_input = if let Some(old) = old { old.extract(&doc, meta) } else { Ok(None) }; + let new_input = self.extract(&doc, meta); + + to_diff(old_input, new_input) + } + + fn ignore_errors(self) -> IgnoreErrorExtractor + where + Self: Sized, + { + IgnoreErrorExtractor(self) + } +} + +fn to_diff( + old_input: Result, E>, + new_input: Result, E>, +) -> Result, E> { + let old_input = old_input.ok().unwrap_or(None); + let new_input = new_input?; + Ok(match (old_input, new_input) { + (Some(old), Some(new)) if old == new => ExtractorDiff::Unchanged, + (None, None) => ExtractorDiff::Unchanged, + (None, Some(input)) => ExtractorDiff::Added(input), + (Some(_), None) => ExtractorDiff::Removed, + (Some(_), Some(input)) => ExtractorDiff::Updated(input), + }) +} + +pub enum ExtractorDiff { + Removed, + Added(Input), + Updated(Input), + Unchanged, +} + +impl ExtractorDiff { + pub fn into_input(self) -> Option { + match self { + ExtractorDiff::Removed => None, + ExtractorDiff::Added(input) => Some(input), + ExtractorDiff::Updated(input) => Some(input), + ExtractorDiff::Unchanged => None, + } + } + + pub fn needs_change(&self) -> bool { + match self { + ExtractorDiff::Removed => true, + ExtractorDiff::Added(_) => true, + ExtractorDiff::Updated(_) => true, + ExtractorDiff::Unchanged => false, + } + } + + pub fn 
into_list_of_changes( + named_diffs: impl IntoIterator, + ) -> BTreeMap> { + named_diffs + .into_iter() + .filter(|(_, diff)| diff.needs_change()) + .map(|(name, diff)| (name, diff.into_input())) + .collect() + } +} + +pub struct DocumentTemplateExtractor<'a, 'b, 'c> { + doc_alloc: &'a Bump, + field_id_map: &'a RefCell>, + template: &'c Prompt, +} + +impl<'a, 'b, 'c> DocumentTemplateExtractor<'a, 'b, 'c> { + pub fn new( + template: &'c Prompt, + doc_alloc: &'a Bump, + field_id_map: &'a RefCell>, + ) -> Self { + Self { template, doc_alloc, field_id_map } + } +} + +impl<'doc> Extractor<'doc> for DocumentTemplateExtractor<'doc, '_, '_> { + type DocumentMetadata = &'doc str; + type Input = &'doc str; + type Error = RenderPromptError; + + fn extractor_id(&self) -> u8 { + 0 + } + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + external_docid: &Self::DocumentMetadata, + ) -> Result, Self::Error> { + Ok(Some(self.template.render_document( + external_docid, + doc, + self.field_id_map, + self.doc_alloc, + )?)) + } +} + +pub struct RequestFragmentExtractor<'a> { + fragment: &'a JsonTemplate, + extractor_id: u8, + doc_alloc: &'a Bump, +} + +impl<'a> RequestFragmentExtractor<'a> { + pub fn new(fragment: &'a RuntimeFragment, doc_alloc: &'a Bump) -> Self { + Self { fragment: &fragment.template, extractor_id: fragment.id, doc_alloc } + } +} + +impl<'doc> Extractor<'doc> for RequestFragmentExtractor<'doc> { + type DocumentMetadata = (); + type Input = Value; + type Error = json_template::Error; + + fn extractor_id(&self) -> u8 { + self.extractor_id + } + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + _meta: &Self::DocumentMetadata, + ) -> Result, Self::Error> { + Ok(Some(self.fragment.render_document(doc, self.doc_alloc)?)) + } +} + +pub struct IgnoreErrorExtractor(E); + +impl<'doc, E> Extractor<'doc> for IgnoreErrorExtractor +where + E: Extractor<'doc>, +{ + type DocumentMetadata = E::DocumentMetadata; + type Input = E::Input; + + type Error = Infallible; + + fn extractor_id(&self) -> u8 { + self.0.extractor_id() + } + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + meta: &Self::DocumentMetadata, + ) -> Result, Self::Error> { + Ok(self.0.extract(doc, meta).ok().flatten()) + } +} + +#[derive(Debug)] +pub enum Infallible {} + +impl From for crate::Error { + fn from(_: Infallible) -> Self { + unreachable!("Infallible values cannot be built") + } +} diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index ec4ee2ccd..246f824e1 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -20,6 +20,7 @@ use crate::ThreadPoolNoAbort; pub mod composite; pub mod db; pub mod error; +pub mod extractor; pub mod hf; pub mod json_template; pub mod manual; From b45059e8f202a714bf78a957cf6d1304b66325f6 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:49:10 +0200 Subject: [PATCH 092/150] Add `vector::session` module --- crates/milli/src/vector/mod.rs | 1 + crates/milli/src/vector/session.rs | 152 +++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 crates/milli/src/vector/session.rs diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 246f824e1..395c5d704 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -26,6 +26,7 @@ pub mod json_template; pub mod manual; pub mod openai; pub mod parsed_vectors; +pub mod session; pub mod settings; pub mod ollama; diff --git a/crates/milli/src/vector/session.rs 
b/crates/milli/src/vector/session.rs new file mode 100644 index 000000000..b6f229779 --- /dev/null +++ b/crates/milli/src/vector/session.rs @@ -0,0 +1,152 @@ +use bumpalo::collections::Vec as BVec; +use bumpalo::Bump; +use serde_json::Value; + +use super::{EmbedError, Embedder, Embedding}; +use crate::{DocumentId, Result, ThreadPoolNoAbort}; +type ExtractorId = u8; + +#[derive(Clone, Copy)] +pub struct Metadata<'doc> { + pub docid: DocumentId, + pub external_docid: &'doc str, + pub extractor_id: ExtractorId, +} + +pub struct EmbeddingResponse<'doc> { + pub metadata: Metadata<'doc>, + pub embedding: Option, +} + +pub trait OnEmbed<'doc> { + type ErrorMetadata; + + fn process_embedding_response(&mut self, response: EmbeddingResponse<'doc>); + fn process_embedding_error( + &mut self, + error: EmbedError, + embedder_name: &'doc str, + unused_vectors_distribution: &Self::ErrorMetadata, + metadata: &[Metadata<'doc>], + ) -> crate::Error; + + fn process_embeddings(&mut self, metadata: Metadata<'doc>, embeddings: Vec); +} + +pub struct EmbedSession<'doc, C, I> { + // requests + inputs: BVec<'doc, I>, + metadata: BVec<'doc, Metadata<'doc>>, + + threads: &'doc ThreadPoolNoAbort, + embedder: &'doc Embedder, + + embedder_name: &'doc str, + + on_embed: C, +} + +pub trait Input: Sized { + fn embed_ref( + inputs: &[Self], + embedder: &Embedder, + threads: &ThreadPoolNoAbort, + ) -> std::result::Result, EmbedError>; +} + +impl Input for &'_ str { + fn embed_ref( + inputs: &[Self], + embedder: &Embedder, + threads: &ThreadPoolNoAbort, + ) -> std::result::Result, EmbedError> { + embedder.embed_index_ref(inputs, threads) + } +} + +impl Input for Value { + fn embed_ref( + inputs: &[Value], + embedder: &Embedder, + threads: &ThreadPoolNoAbort, + ) -> std::result::Result, EmbedError> { + embedder.embed_index_ref_fragments(inputs, threads) + } +} + +impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> { + #[allow(clippy::too_many_arguments)] + pub fn new( + embedder: &'doc Embedder, + embedder_name: &'doc str, + threads: &'doc ThreadPoolNoAbort, + doc_alloc: &'doc Bump, + on_embed: C, + ) -> Self { + let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); + let texts = BVec::with_capacity_in(capacity, doc_alloc); + let ids = BVec::with_capacity_in(capacity, doc_alloc); + Self { inputs: texts, metadata: ids, embedder, threads, embedder_name, on_embed } + } + + pub fn request_embedding( + &mut self, + metadata: Metadata<'doc>, + rendered: I, + unused_vectors_distribution: &C::ErrorMetadata, + ) -> Result<()> { + if self.inputs.len() < self.inputs.capacity() { + self.inputs.push(rendered); + self.metadata.push(metadata); + return Ok(()); + } + + self.embed_chunks(unused_vectors_distribution) + } + + pub fn drain(mut self, unused_vectors_distribution: &C::ErrorMetadata) -> Result { + self.embed_chunks(unused_vectors_distribution)?; + Ok(self.on_embed) + } + + #[allow(clippy::too_many_arguments)] + fn embed_chunks(&mut self, unused_vectors_distribution: &C::ErrorMetadata) -> Result<()> { + if self.inputs.is_empty() { + return Ok(()); + } + let res = match I::embed_ref(self.inputs.as_slice(), self.embedder, self.threads) { + Ok(embeddings) => { + for (metadata, embedding) in self.metadata.iter().copied().zip(embeddings) { + self.on_embed.process_embedding_response(EmbeddingResponse { + metadata, + embedding: Some(embedding), + }); + } + Ok(()) + } + Err(error) => { + return Err(self.on_embed.process_embedding_error( + error, + self.embedder_name, + unused_vectors_distribution, + 
&self.metadata, + )) + } + }; + self.inputs.clear(); + self.metadata.clear(); + res + } + + pub(crate) fn embedder_name(&self) -> &'doc str { + self.embedder_name + } + + pub(crate) fn doc_alloc(&self) -> &'doc Bump { + self.inputs.bump() + } + + pub(crate) fn on_embed_mut(&mut self) -> &mut C { + &mut self.on_embed + } +} From 0b5bc41b792aec391ed293d9b903dfe007e06578 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:50:42 +0200 Subject: [PATCH 093/150] Add new vector errors --- crates/milli/src/vector/error.rs | 73 ++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index 685022de8..00d4221e5 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -101,6 +101,32 @@ pub enum EmbedErrorKind { MissingEmbedding, #[error(transparent)] PanicInThreadPool(#[from] PanicCatched), + #[error("`media` requested but the configuration doesn't have source `rest`")] + RestMediaNotARest, + #[error("`media` requested, and the configuration has source `rest`, but the configuration doesn't have `searchFragments`.")] + RestMediaNotAFragment, + + #[error("Query matches multiple search fragments.\n - Note: First matched fragment `{name}`.\n - Note: Second matched fragment `{second_name}`.\n - Note: {}", + { + serde_json::json!({ + "q": q, + "media": media + }) + })] + RestSearchMatchesMultipleFragments { + name: String, + second_name: String, + q: Option, + media: Option, + }, + #[error("Query matches no search fragment.\n - Note: {}", + { + serde_json::json!({ + "q": q, + "media": media + }) + })] + RestSearchMatchesNoFragment { q: Option, media: Option }, } fn option_info(info: Option<&str>, prefix: &str) -> String { @@ -210,6 +236,44 @@ impl EmbedError { pub(crate) fn rest_extraction_error(error: String) -> EmbedError { Self { kind: EmbedErrorKind::RestExtractionError(error), fault: FaultSource::Runtime } } + + pub(crate) fn rest_media_not_a_rest() -> EmbedError { + Self { kind: EmbedErrorKind::RestMediaNotARest, fault: FaultSource::User } + } + + pub(crate) fn rest_media_not_a_fragment() -> EmbedError { + Self { kind: EmbedErrorKind::RestMediaNotAFragment, fault: FaultSource::User } + } + + pub(crate) fn rest_search_matches_multiple_fragments( + name: &str, + second_name: &str, + q: Option<&str>, + media: Option<&serde_json::Value>, + ) -> EmbedError { + Self { + kind: EmbedErrorKind::RestSearchMatchesMultipleFragments { + name: name.to_string(), + second_name: second_name.to_string(), + q: q.map(String::from), + media: media.cloned(), + }, + fault: FaultSource::User, + } + } + + pub(crate) fn rest_search_matches_no_fragment( + q: Option<&str>, + media: Option<&serde_json::Value>, + ) -> EmbedError { + Self { + kind: EmbedErrorKind::RestSearchMatchesNoFragment { + q: q.map(String::from), + media: media.cloned(), + }, + fault: FaultSource::User, + } + } } #[derive(Debug, thiserror::Error)] @@ -382,6 +446,13 @@ impl NewEmbedderError { fault: FaultSource::User, } } + + pub(crate) fn rest_cannot_infer_dimensions_for_fragment() -> NewEmbedderError { + Self { + kind: NewEmbedderErrorKind::RestCannotInferDimensionsForFragment, + fault: FaultSource::User, + } + } } #[derive(Debug, Clone, Copy)] @@ -499,6 +570,8 @@ pub enum NewEmbedderErrorKind { CompositeEmbeddingCountMismatch { search_count: usize, index_count: usize }, #[error("error while generating test embeddings.\n - the embeddings produced at search time and indexing time are not similar enough.\n - angular 
distance {distance:.2}\n - Meilisearch requires a maximum distance of {MAX_COMPOSITE_DISTANCE}.\n - Note: check that both embedders produce similar embeddings.{hint}")] CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace }, + #[error("cannot infer `dimensions` for an embedder using `indexingFragments`.\n - Note: Specify `dimensions` explicitly or don't use `indexingFragments`.")] + RestCannotInferDimensionsForFragment, } pub struct PossibleEmbeddingMistakes { From 836ae19becebd5e3dbb81fc322a724159d3fa8d7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:52:05 +0200 Subject: [PATCH 094/150] ArroyWrapper changes --- crates/milli/src/vector/mod.rs | 241 ++++++++++++++++++++++++--------- 1 file changed, 180 insertions(+), 61 deletions(-) diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 395c5d704..3e7dc270d 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -15,6 +15,8 @@ use utoipa::ToSchema; use self::error::{EmbedError, NewEmbedderError}; use crate::progress::{EmbedderStats, Progress}; use crate::prompt::{Prompt, PromptData}; +use crate::vector::composite::SubEmbedderOptions; +use crate::vector::json_template::JsonTemplate; use crate::ThreadPoolNoAbort; pub mod composite; @@ -63,7 +65,7 @@ impl ArroyWrapper { rtxn: &'a RoTxn<'a>, db: arroy::Database, ) -> impl Iterator, arroy::Error>> + 'a { - arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| { + arroy_store_range_for_embedder(self.embedder_index).filter_map(move |index| { match arroy::Reader::open(rtxn, index, db) { Ok(reader) => match reader.is_empty(rtxn) { Ok(false) => Some(Ok(reader)), @@ -76,12 +78,57 @@ impl ArroyWrapper { }) } - pub fn dimensions(&self, rtxn: &RoTxn) -> Result { - let first_id = arroy_db_range_for_embedder(self.embedder_index).next().unwrap(); + /// The item ids that are present in the store specified by its id. + /// + /// The ids are accessed via a lambda to avoid lifetime shenanigans. + pub fn items_in_store( + &self, + rtxn: &RoTxn, + store_id: u8, + with_items: F, + ) -> Result + where + F: FnOnce(&RoaringBitmap) -> O, + { if self.quantized { - Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions()) + self._items_in_store(rtxn, self.quantized_db(), store_id, with_items) } else { - Ok(arroy::Reader::open(rtxn, first_id, self.angular_db())?.dimensions()) + self._items_in_store(rtxn, self.angular_db(), store_id, with_items) + } + } + + fn _items_in_store( + &self, + rtxn: &RoTxn, + db: arroy::Database, + store_id: u8, + with_items: F, + ) -> Result + where + F: FnOnce(&RoaringBitmap) -> O, + { + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let reader = arroy::Reader::open(rtxn, index, db); + match reader { + Ok(reader) => Ok(with_items(reader.item_ids())), + Err(arroy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())), + Err(err) => Err(err), + } + } + + pub fn dimensions(&self, rtxn: &RoTxn) -> Result, arroy::Error> { + if self.quantized { + Ok(self + .readers(rtxn, self.quantized_db()) + .next() + .transpose()? + .map(|reader| reader.dimensions())) + } else { + Ok(self + .readers(rtxn, self.angular_db()) + .next() + .transpose()? 
+ .map(|reader| reader.dimensions())) } } @@ -96,13 +143,13 @@ impl ArroyWrapper { arroy_memory: Option, cancel: &(impl Fn() -> bool + Sync + Send), ) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.need_build(wtxn)? { writer.builder(rng).build(wtxn)? } else if writer.is_empty(wtxn)? { - break; + continue; } } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); @@ -127,7 +174,7 @@ impl ArroyWrapper { .cancel(cancel) .build(wtxn)?; } else if writer.is_empty(wtxn)? { - break; + continue; } } } @@ -146,7 +193,7 @@ impl ArroyWrapper { ) -> Result<(), arroy::Error> { let dimension = embeddings.dimension(); for (index, vector) in - arroy_db_range_for_embedder(self.embedder_index).zip(embeddings.iter()) + arroy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter()) { if self.quantized { arroy::Writer::new(self.quantized_db(), index, dimension) @@ -182,7 +229,7 @@ impl ArroyWrapper { ) -> Result<(), arroy::Error> { let dimension = vector.len(); - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { let writer = arroy::Writer::new(db, index, dimension); if !writer.contains_item(wtxn, item_id)? { writer.add_item(wtxn, item_id, vector)?; @@ -192,6 +239,38 @@ impl ArroyWrapper { Ok(()) } + /// Add a vector associated with a document in store specified by its id. + /// + /// Any existing vector associated with the document in the store will be replaced by the new vector. + pub fn add_item_in_store( + &self, + wtxn: &mut RwTxn, + item_id: arroy::ItemId, + store_id: u8, + vector: &[f32], + ) -> Result<(), arroy::Error> { + if self.quantized { + self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector) + } else { + self._add_item_in_store(wtxn, self.angular_db(), item_id, store_id, vector) + } + } + + fn _add_item_in_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + store_id: u8, + vector: &[f32], + ) -> Result<(), arroy::Error> { + let dimension = vector.len(); + + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimension); + writer.add_item(wtxn, item_id, vector) + } + /// Delete all embeddings from a specific `item_id` pub fn del_items( &self, @@ -199,24 +278,84 @@ impl ArroyWrapper { dimension: usize, item_id: arroy::ItemId, ) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - if !writer.del_item(wtxn, item_id)? { - break; - } + writer.del_item(wtxn, item_id)?; } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); - if !writer.del_item(wtxn, item_id)? { - break; - } + writer.del_item(wtxn, item_id)?; } } Ok(()) } - /// Delete one item. + /// Removes the item specified by its id from the store specified by its id. + /// + /// Returns whether the item was removed. + /// + /// # Warning + /// + /// - This function will silently fail to remove the item if used against an arroy database that was never built. 
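The per-store methods below address one arroy index per (embedder, store) pair, where the store id corresponds to the extractor id from the `vector::extractor` module (0 for the document template, or the fragment's id). A sketch of the packing helper introduced at the end of this diff:

    // High byte: embedder id. Low byte: store id.
    fn arroy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 {
        ((embedder_id as u16) << 8) | (store_id as u16)
    }
    assert_eq!(arroy_store_for_embedder(0x02, 0x03), 0x0203);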
+ pub fn del_item_in_store( + &self, + wtxn: &mut RwTxn, + item_id: arroy::ItemId, + store_id: u8, + dimensions: usize, + ) -> Result { + if self.quantized { + self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions) + } else { + self._del_item_in_store(wtxn, self.angular_db(), item_id, store_id, dimensions) + } + } + + fn _del_item_in_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + store_id: u8, + dimensions: usize, + ) -> Result { + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimensions); + writer.del_item(wtxn, item_id) + } + + /// Removes all items from the store specified by its id. + /// + /// # Warning + /// + /// - This function will silently fail to remove the items if used against an arroy database that was never built. + pub fn clear_store( + &self, + wtxn: &mut RwTxn, + store_id: u8, + dimensions: usize, + ) -> Result<(), arroy::Error> { + if self.quantized { + self._clear_store(wtxn, self.quantized_db(), store_id, dimensions) + } else { + self._clear_store(wtxn, self.angular_db(), store_id, dimensions) + } + } + + fn _clear_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + store_id: u8, + dimensions: usize, + ) -> Result<(), arroy::Error> { + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimensions); + writer.clear(wtxn) + } + + /// Delete one item from its value. pub fn del_item( &self, wtxn: &mut RwTxn, @@ -238,54 +377,31 @@ impl ArroyWrapper { vector: &[f32], ) -> Result { let dimension = vector.len(); - let mut deleted_index = None; - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { let writer = arroy::Writer::new(db, index, dimension); let Some(candidate) = writer.item_vector(wtxn, item_id)? else { - // uses invariant: vectors are packed in the first writers. - break; + continue; }; if candidate == vector { - writer.del_item(wtxn, item_id)?; - deleted_index = Some(index); + return writer.del_item(wtxn, item_id); } } - - // 🥲 enforce invariant: vectors are packed in the first writers. - if let Some(deleted_index) = deleted_index { - let mut last_index_with_a_vector = None; - for index in - arroy_db_range_for_embedder(self.embedder_index).skip(deleted_index as usize) - { - let writer = arroy::Writer::new(db, index, dimension); - let Some(candidate) = writer.item_vector(wtxn, item_id)? else { - break; - }; - last_index_with_a_vector = Some((index, candidate)); - } - if let Some((last_index, vector)) = last_index_with_a_vector { - let writer = arroy::Writer::new(db, last_index, dimension); - writer.del_item(wtxn, item_id)?; - let writer = arroy::Writer::new(db, deleted_index, dimension); - writer.add_item(wtxn, item_id, &vector)?; - } - } - Ok(deleted_index.is_some()) + Ok(false) } pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.is_empty(wtxn)? { - break; + continue; } writer.clear(wtxn)?; } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); if writer.is_empty(wtxn)? 
{ - break; + continue; } writer.clear(wtxn)?; } @@ -299,17 +415,17 @@ impl ArroyWrapper { dimension: usize, item: arroy::ItemId, ) -> Result { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { let contains = if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.is_empty(rtxn)? { - break; + continue; } writer.contains_item(rtxn, item)? } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); if writer.is_empty(rtxn)? { - break; + continue; } writer.contains_item(rtxn, item)? }; @@ -348,13 +464,14 @@ impl ArroyWrapper { let reader = reader?; let mut searcher = reader.nns(limit); if let Some(filter) = filter { + if reader.item_ids().is_disjoint(filter) { + continue; + } searcher.candidates(filter); } if let Some(mut ret) = searcher.by_item(rtxn, item)? { results.append(&mut ret); - } else { - break; } } results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); @@ -389,6 +506,9 @@ impl ArroyWrapper { let reader = reader?; let mut searcher = reader.nns(limit); if let Some(filter) = filter { + if reader.item_ids().is_disjoint(filter) { + continue; + } searcher.candidates(filter); } @@ -407,16 +527,12 @@ impl ArroyWrapper { for reader in self.readers(rtxn, self.quantized_db()) { if let Some(vec) = reader?.item_vector(rtxn, item_id)? { vectors.push(vec); - } else { - break; } } } else { for reader in self.readers(rtxn, self.angular_db()) { if let Some(vec) = reader?.item_vector(rtxn, item_id)? { vectors.push(vec); - } else { - break; } } } @@ -989,8 +1105,11 @@ pub const fn is_cuda_enabled() -> bool { cfg!(feature = "cuda") } -pub fn arroy_db_range_for_embedder(embedder_id: u8) -> impl Iterator { - let embedder_id = (embedder_id as u16) << 8; - - (0..=u8::MAX).map(move |k| embedder_id | (k as u16)) +fn arroy_store_range_for_embedder(embedder_id: u8) -> impl Iterator { + (0..=u8::MAX).map(move |store_id| arroy_store_for_embedder(embedder_id, store_id)) +} + +fn arroy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 { + let embedder_id = (embedder_id as u16) << 8; + embedder_id | (store_id as u16) } From 422a786ffdaef07639191f9ec4fedc868be6c7ee Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:52:36 +0200 Subject: [PATCH 095/150] RuntimeEmbedder and RuntimeFragments --- crates/milli/src/vector/mod.rs | 37 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 3e7dc270d..37ade8f81 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -584,6 +584,7 @@ pub struct ArroyStats { pub documents: RoaringBitmap, } /// One or multiple embeddings stored consecutively in a flat vector. +#[derive(Debug, PartialEq)] pub struct Embeddings { data: Vec, dimension: usize, @@ -734,15 +735,26 @@ impl EmbeddingConfig { } } -/// Map of embedder configurations. -/// -/// Each configuration is mapped to a name. +/// Map of runtime embedder data. 
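A sketch of how the runtime map defined below is assembled; `embedder` and `document_template` are assumed to have been built elsewhere (for instance from the index settings):

    let mut map = HashMap::new();
    map.insert(
        "default".to_string(),
        Arc::new(RuntimeEmbedder {
            embedder: Arc::new(embedder),
            document_template,
            fragments: Vec::new(), // one `RuntimeFragment` per indexing fragment
            is_quantized: false,
        }),
    );
    let runtime_embedders = RuntimeEmbedders::new(map);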
 #[derive(Clone, Default)]
-pub struct EmbeddingConfigs(HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)>);
+pub struct RuntimeEmbedders(HashMap<String, Arc<RuntimeEmbedder>>);
 
-impl EmbeddingConfigs {
+pub struct RuntimeEmbedder {
+    pub embedder: Arc<Embedder>,
+    pub document_template: Prompt,
+    pub fragments: Vec<RuntimeFragment>,
+    pub is_quantized: bool,
+}
+
+pub struct RuntimeFragment {
+    pub name: String,
+    pub id: u8,
+    pub template: JsonTemplate,
+}
+
+impl RuntimeEmbedders {
     /// Create the map from its internal components.
-    pub fn new(data: HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)>) -> Self {
+    pub fn new(data: HashMap<String, Arc<RuntimeEmbedder>>) -> Self {
         Self(data)
     }
@@ -751,24 +763,23 @@ impl EmbeddingConfigs {
     }
 
     /// Get an embedder configuration and template from its name.
-    pub fn get(&self, name: &str) -> Option<(Arc<Embedder>, Arc<Prompt>, bool)> {
+    pub fn get(&self, name: &str) -> Option<Arc<RuntimeEmbedder>> {
         self.0.get(name).cloned()
     }
 
-    pub fn inner_as_ref(&self) -> &HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)> {
+    pub fn inner_as_ref(&self) -> &HashMap<String, Arc<RuntimeEmbedder>> {
         &self.0
     }
 
-    pub fn into_inner(self) -> HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)> {
+    pub fn into_inner(self) -> HashMap<String, Arc<RuntimeEmbedder>> {
         self.0
     }
 }
 
-impl IntoIterator for EmbeddingConfigs {
-    type Item = (String, (Arc<Embedder>, Arc<Prompt>, bool));
+impl IntoIterator for RuntimeEmbedders {
+    type Item = (String, Arc<RuntimeEmbedder>);
 
-    type IntoIter =
-        std::collections::hash_map::IntoIter<String, (Arc<Embedder>, Arc<Prompt>, bool)>;
+    type IntoIter = std::collections::hash_map::IntoIter<String, Arc<RuntimeEmbedder>>;
 
     fn into_iter(self) -> Self::IntoIter {
         self.0.into_iter()
     }
 }

From 5716ab70f38c521f768f98f64aa32399f0fedb54 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Mon, 30 Jun 2025 00:07:32 +0200
Subject: [PATCH 096/150] EmbeddingConfigs -> RuntimeEmbedders

---
 crates/benchmarks/benches/indexing.rs          | 64 +++++++++----------
 crates/benchmarks/benches/utils.rs             |  4 +-
 crates/fuzzers/src/bin/fuzz-indexing.rs        |  4 +-
 crates/index-scheduler/src/lib.rs              | 58 ++++++++++++-----
 .../src/scheduler/process_dump_creation.rs     | 11 +---
 .../src/scheduler/process_index_operation.rs   |  9 ++-
 crates/index-scheduler/src/scheduler/test.rs   |  2 +-
 crates/meilisearch/src/lib.rs                  |  2 +-
 .../src/routes/indexes/documents.rs            | 13 +---
 crates/meilisearch/src/search/mod.rs           | 19 ++----
 crates/meilitool/src/main.rs                   | 10 +--
 .../milli/src/search/new/tests/integration.rs  |  4 +-
 crates/milli/src/search/new/vector_sort.rs     |  4 +-
 crates/milli/src/search/similar.rs             | 11 ++--
 crates/milli/src/test_index.rs                 |  4 +-
 .../milli/tests/search/facet_distribution.rs   |  4 +-
 crates/milli/tests/search/mod.rs               |  4 +-
 crates/milli/tests/search/query_criteria.rs    |  4 +-
 crates/milli/tests/search/typo_tolerance.rs    |  4 +-
 19 files changed, 118 insertions(+), 117 deletions(-)

diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs
index 16e7a2f81..4083b69dd 100644
--- a/crates/benchmarks/benches/indexing.rs
+++ b/crates/benchmarks/benches/indexing.rs
@@ -11,7 +11,7 @@ use milli::heed::{EnvOpenOptions, RwTxn};
 use milli::progress::Progress;
 use milli::update::new::indexer;
 use milli::update::{IndexerConfig, Settings};
-use milli::vector::EmbeddingConfigs;
+use milli::vector::RuntimeEmbedders;
 use milli::{FilterableAttributesRule, Index};
 use rand::seq::SliceRandom;
 use rand_chacha::rand_core::SeedableRng;
@@ -166,7 +166,7 @@ fn indexing_songs_default(c: &mut Criterion) {
                 new_fields_ids_map,
                 primary_key,
                 &document_changes,
-                EmbeddingConfigs::default(),
+                RuntimeEmbedders::default(),
                 &|| false,
                 &Progress::default(),
                 &Default::default(),
@@ -233,7 +233,7 @@ fn reindexing_songs_default(c: &mut Criterion) {
                 new_fields_ids_map,
                 primary_key,
                 &document_changes,
-                EmbeddingConfigs::default(),
+                RuntimeEmbedders::default(),
                 &|| false,
                 &Progress::default(),
&Default::default(), @@ -278,7 +278,7 @@ fn reindexing_songs_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -347,7 +347,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -424,7 +424,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -469,7 +469,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -510,7 +510,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -578,7 +578,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -645,7 +645,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -712,7 +712,7 @@ fn indexing_wiki(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -778,7 +778,7 @@ fn reindexing_wiki(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -823,7 +823,7 @@ fn reindexing_wiki(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -891,7 +891,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -968,7 +968,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1014,7 +1014,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1056,7 +1056,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1123,7 +1123,7 @@ fn indexing_movies_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - 
EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1189,7 +1189,7 @@ fn reindexing_movies_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1234,7 +1234,7 @@ fn reindexing_movies_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1302,7 +1302,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1351,7 +1351,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) -> Index { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), diff --git a/crates/fuzzers/src/bin/fuzz-indexing.rs index 0632b7846..ec1f96fd5 100644 --- a/crates/fuzzers/src/bin/fuzz-indexing.rs +++ b/crates/fuzzers/src/bin/fuzz-indexing.rs @@ -13,7 +13,7 @@ use milli::heed::EnvOpenOptions; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::IndexerConfig; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::Index; use serde_json::Value; use tempfile::TempDir; @@ -89,7 +89,7 @@ fn main() { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut operations = Vec::new(); diff --git a/crates/index-scheduler/src/lib.rs index 505ce23f8..f551652c1 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -57,12 +57,15 @@ use meilisearch_types::features::{ use meilisearch_types::heed::byteorder::BE; use meilisearch_types::heed::types::{DecodeIgnore, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, RoTxn, WithoutTls}; -use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; -use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; +use meilisearch_types::milli::vector::json_template::JsonTemplate; +use meilisearch_types::milli::vector::{ + Embedder, EmbedderOptions, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment, +}; use meilisearch_types::milli::{self, Index}; use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{KindWithContent, Task}; +use milli::vector::db::IndexEmbeddingConfig; use processing::ProcessingTasks; pub use queue::Query; use queue::Queue; @@ -851,29 +854,42 @@ impl IndexScheduler { &self, index_uid: String, embedding_configs: Vec<IndexEmbeddingConfig>, - ) -> Result<EmbeddingConfigs> { + ) -> Result<RuntimeEmbedders> { let res: Result<_> = embedding_configs .into_iter() .map( |IndexEmbeddingConfig { name, config: milli::vector::EmbeddingConfig { embedder_options, prompt, quantized }, - ..
- }| { - let prompt = Arc::new( - prompt - .try_into() - .map_err(meilisearch_types::milli::Error::from) - .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, - ); + fragments, + }| + -> Result<(String, Arc<RuntimeEmbedder>)> { + let document_template = prompt + .try_into() + .map_err(meilisearch_types::milli::Error::from) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + + let fragments = fragments + .into_inner() + .into_iter() + .map(|fragment| { + let value = embedder_options.fragment(&fragment.name).unwrap(); + let template = JsonTemplate::new(value.clone()).unwrap(); + RuntimeFragment { name: fragment.name, id: fragment.id, template } + }) + .collect(); // optimistically return existing embedder { let embedders = self.embedders.read().unwrap(); if let Some(embedder) = embedders.get(&embedder_options) { - return Ok(( - name, - (embedder.clone(), prompt, quantized.unwrap_or_default()), - )); + let runtime = Arc::new(RuntimeEmbedder { + embedder: embedder.clone(), + document_template, + fragments, + is_quantized: quantized.unwrap_or_default(), + }); + + return Ok((name, runtime)); } } @@ -889,11 +905,19 @@ impl IndexScheduler { let mut embedders = self.embedders.write().unwrap(); embedders.insert(embedder_options, embedder.clone()); } - Ok((name, (embedder, prompt, quantized.unwrap_or_default()))) + + let runtime = Arc::new(RuntimeEmbedder { + embedder: embedder.clone(), + document_template, + fragments, + is_quantized: quantized.unwrap_or_default(), + }); + + Ok((name, runtime)) }, ) .collect(); - res.map(EmbeddingConfigs::new) + res.map(RuntimeEmbedders::new) } pub fn chat_settings(&self, uid: &str) -> Result<Option<ChatCompletionSettings>> { diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs index a6d785b2f..ec1be0e93 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -165,9 +165,6 @@ impl IndexScheduler { let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index - .embedding_configs(&rtxn) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; let nb_documents = index .number_of_documents(&rtxn) @@ -221,16 +218,12 @@ impl IndexScheduler { return Err(Error::from_milli(user_err, Some(uid.to_string()))); }; - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(id)); + for (embedder_name, (embeddings, regenerate)) in embeddings { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate: !user_provided, + regenerate, }; vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); } diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs index 04aaf9a84..62d0e6545 100644 --- a/crates/index-scheduler/src/scheduler/process_index_operation.rs +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -89,8 +89,9 @@ impl IndexScheduler { let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(); let embedders = index + .embedding_configs() .embedding_configs(index_wtxn) - .map_err(|e|
Error::from_milli(e.into(), Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; for operation in operations { match operation { @@ -274,8 +275,9 @@ impl IndexScheduler { }) .unwrap()?; let embedders = index + .embedding_configs() .embedding_configs(index_wtxn) - .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; progress.update_progress(DocumentEditionProgress::Indexing); @@ -423,8 +425,9 @@ impl IndexScheduler { indexer.delete_documents_by_docids(to_delete); let document_changes = indexer.into_changes(&indexer_alloc, primary_key); let embedders = index + .embedding_configs() .embedding_configs(index_wtxn) - .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; progress.update_progress(DocumentDeletionProgress::Indexing); diff --git a/crates/index-scheduler/src/scheduler/test.rs index ee26165c7..2c492525f 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -3,11 +3,11 @@ use std::collections::BTreeMap; use big_s::S; use meili_snap::{json_string, snapshot}; use meilisearch_auth::AuthFilter; -use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexDocumentsMethod::*; use meilisearch_types::milli::{self}; use meilisearch_types::settings::SettingEmbeddingSettings; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; +use milli::vector::db::IndexEmbeddingConfig; use roaring::RoaringBitmap; use crate::insta_snapshot::snapshot_index_scheduler; diff --git a/crates/meilisearch/src/lib.rs index 871bd688e..e1acef2ce 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -563,7 +563,7 @@ fn import_dump( let reader = BufReader::new(file); let reader = DocumentsBatchReader::from_reader(reader)?; - let embedder_configs = index.embedding_configs(&wtxn)?; + let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?; let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?; let builder = milli::update::IndexDocuments::new( diff --git a/crates/meilisearch/src/routes/indexes/documents.rs index 50eec46fe..a93d736f7 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1452,7 +1452,6 @@ fn some_documents<'a, 't: 'a>( ) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> { let fields_ids_map = index.fields_ids_map(rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index.embedding_configs(rtxn)?; Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { @@ -1468,15 +1467,9 @@ fn some_documents<'a, 't: 'a>( Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, vector) in index.embeddings(rtxn, key)?
{ - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == name) - .is_some_and(|conf| conf.user_provided.contains(key)); - let embeddings = ExplicitVectors { - embeddings: Some(vector.into()), - regenerate: !user_provided, - }; + for (name, (vector, regenerate)) in index.embeddings(rtxn, key)? { + let embeddings = + ExplicitVectors { embeddings: Some(vector.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, diff --git a/crates/meilisearch/src/search/mod.rs index 5e543c53f..61ef3f813 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -399,10 +399,10 @@ impl SearchKind { route: Route, ) -> Result<(String, Arc<Embedder>, bool), ResponseError> { let rtxn = index.read_txn()?; - let embedder_configs = index.embedding_configs(&rtxn)?; + let embedder_configs = index.embedding_configs().embedding_configs(&rtxn)?; let embedders = index_scheduler.embedders(index_uid, embedder_configs)?; - let (embedder, _, quantized) = embedders + let (embedder, quantized) = embedders .get(embedder_name) .ok_or(match route { Route::Search | Route::MultiSearch => { milli::UserError::InvalidSearchEmbedder(embedder_name.to_owned()) } milli::UserError::InvalidSimilarEmbedder(embedder_name.to_owned()) } }) + .map(|runtime| (runtime.embedder.clone(), runtime.is_quantized)) .map_err(milli::Error::from)?; if let Some(vector_len) = vector_len { @@ -1328,7 +1329,6 @@ struct HitMaker<'a> { vectors_fid: Option<FieldId>, retrieve_vectors: RetrieveVectors, to_retrieve_ids: BTreeSet<FieldId>, - embedding_configs: Vec<IndexEmbeddingConfig>, formatter_builder: MatcherBuilder<'a>, formatted_options: BTreeMap<FieldId, FormatOptions>, show_ranking_score: bool, @@ -1443,8 +1443,6 @@ impl<'a> HitMaker<'a> { &displayed_ids, ); - let embedding_configs = index.embedding_configs(rtxn)?; - Ok(Self { index, rtxn, @@ -1453,7 +1451,6 @@ impl<'a> HitMaker<'a> { vectors_fid, retrieve_vectors, to_retrieve_ids, - embedding_configs, formatter_builder, formatted_options, show_ranking_score: format.show_ranking_score, @@ -1499,14 +1496,8 @@ impl<'a> HitMaker<'a> { Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, vector) in self.index.embeddings(self.rtxn, id)? { - let user_provided = self - .embedding_configs - .iter() - .find(|conf| conf.name == name) - .is_some_and(|conf| conf.user_provided.contains(id)); - let embeddings = - ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; + for (name, (vector, regenerate)) in self.index.embeddings(self.rtxn, id)?
{ + let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?, diff --git a/crates/meilitool/src/main.rs index dd1213782..b967e620c 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -545,7 +545,6 @@ fn export_documents( let rtxn = index.read_txn()?; let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index.embedding_configs(&rtxn)?; if let Some(offset) = offset { eprintln!("Skipping {offset} documents"); @@ -592,17 +591,12 @@ fn export_documents( .into()); }; - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(id)); - + for (embedder_name, (embeddings, regenerate)) in embeddings { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate: !user_provided, + regenerate, }; vectors .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); diff --git a/crates/milli/src/search/new/tests/integration.rs index 9e2afca97..38f39e18b 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -8,7 +8,7 @@ use maplit::{btreemap, hashset}; use crate::progress::Progress; use crate::update::new::indexer; use crate::update::{IndexerConfig, Settings}; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::{db_snap, Criterion, FilterableAttributesRule, Index}; pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson"); use crate::constants::RESERVED_GEO_FIELD_NAME; @@ -55,7 +55,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); diff --git a/crates/milli/src/search/new/vector_sort.rs index 834f97384..2c201e899 100644 --- a/crates/milli/src/search/new/vector_sort.rs +++ b/crates/milli/src/search/new/vector_sort.rs @@ -32,8 +32,8 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> { ) -> Result<Self> { let embedder_index = ctx .index - .embedder_category_id - .get(ctx.txn, embedder_name)? + .embedding_configs() + .embedder_id(ctx.txn, embedder_name)? .ok_or_else(|| crate::UserError::InvalidSearchEmbedder(embedder_name.to_owned()))?; Ok(Self { diff --git a/crates/milli/src/search/similar.rs index 759940f9c..903b5fcf9 100644 --- a/crates/milli/src/search/similar.rs +++ b/crates/milli/src/search/similar.rs @@ -64,10 +64,13 @@ impl<'a> Similar<'a> { let universe = universe; - let embedder_index = - self.index.embedder_category_id.get(self.rtxn, &self.embedder_name)?.ok_or_else( - || crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()), - )?; + let embedder_index = self + .index + .embedding_configs() + .embedder_id(self.rtxn, &self.embedder_name)?
+ .ok_or_else(|| { + crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()) + })?; let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized); let results = reader.nns_by_item( diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index f2e34c615..cfd8c8492 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -18,7 +18,7 @@ use crate::update::{ self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, Settings, }; use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::{db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult}; pub(crate) struct TempIndex { @@ -223,7 +223,7 @@ fn aborting_indexation() { let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let payload = documents!([ { "id": 1, "name": "kevin" }, diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index d04db425e..cc1b85369 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -5,7 +5,7 @@ use milli::documents::mmap_from_objects; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{FacetDistribution, FilterableAttributesRule, Index, Object, OrderBy}; use serde_json::{from_value, json}; @@ -35,7 +35,7 @@ fn test_facet_distribution_with_no_facet_values() { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let doc1: Object = from_value( diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 3ee78561d..fa03f1cc1 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -10,7 +10,7 @@ use maplit::{btreemap, hashset}; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{ AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member, TermsMatchingStrategy, }; @@ -74,7 +74,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index cb0c23e42..3f8134085 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -8,7 +8,7 @@ use maplit::hashset; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use 
milli::vector::RuntimeEmbedders; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy}; use rand::Rng; use Criterion::*; @@ -288,7 +288,7 @@ fn criteria_ascdesc() { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); diff --git a/crates/milli/tests/search/typo_tolerance.rs index 49c9c7b5d..95ff85165 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -6,7 +6,7 @@ use milli::documents::mmap_from_objects; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{Criterion, Index, Object, Search, TermsMatchingStrategy}; use serde_json::from_value; use tempfile::tempdir; @@ -123,7 +123,7 @@ fn test_typo_disabled_on_word() { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.replace_documents(&documents).unwrap(); From e7b9b8f00230429831fe5467f3a1d5161465e6e2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:53:06 +0200 Subject: [PATCH 097/150] Change embedder API --- crates/milli/src/vector/mod.rs | 75 ++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/vector/mod.rs index 37ade8f81..87ecd2414 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -797,6 +797,27 @@ pub enum EmbedderOptions { Composite(composite::EmbedderOptions), } +impl EmbedderOptions { + pub fn fragment(&self, name: &str) -> Option<&serde_json::Value> { + match &self { + EmbedderOptions::HuggingFace(_) + | EmbedderOptions::OpenAi(_) + | EmbedderOptions::Ollama(_) + | EmbedderOptions::UserProvided(_) => None, + EmbedderOptions::Rest(embedder_options) => { + embedder_options.indexing_fragments.get(name) + } + EmbedderOptions::Composite(embedder_options) => { + if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index { + embedder_options.indexing_fragments.get(name) + } else { + None + } + } + } + } +} + impl Default for EmbedderOptions { fn default() -> Self { Self::HuggingFace(Default::default()) @@ -837,6 +858,17 @@ impl Embedder { #[tracing::instrument(level = "debug", skip_all, target = "search")] pub fn embed_search( + &self, + query: SearchQuery<'_>, + deadline: Option<Instant>, + ) -> std::result::Result<Embedding, EmbedError> { + match query { + SearchQuery::Text(text) => self.embed_search_text(text, deadline), + SearchQuery::Media { q, media } => self.embed_search_media(q, media, deadline), + } + } + + pub fn embed_search_text( &self, text: &str, deadline: Option<Instant>, @@ -858,10 +890,7 @@ impl Embedder { .pop() .ok_or_else(EmbedError::missing_embedding), Embedder::UserProvided(embedder) => embedder.embed_one(text), - Embedder::Rest(embedder) => embedder - .embed_ref(&[text], deadline, None)?
- .pop() - .ok_or_else(EmbedError::missing_embedding), + Embedder::Rest(embedder) => embedder.embed_one(SearchQuery::Text(text), deadline, None), Embedder::Composite(embedder) => embedder.search.embed_one(text, deadline, None), }?; @@ -872,6 +901,18 @@ impl Embedder { Ok(embedding) } + pub fn embed_search_media( + &self, + q: Option<&str>, + media: Option<&serde_json::Value>, + deadline: Option<Instant>, + ) -> std::result::Result<Embedding, EmbedError> { + let Embedder::Rest(embedder) = self else { + return Err(EmbedError::rest_media_not_a_rest()); + }; + embedder.embed_one(SearchQuery::Media { q, media }, deadline, None) + } + /// Embed multiple chunks of texts. /// /// Each chunk is composed of one or multiple texts. @@ -916,6 +957,26 @@ impl Embedder { } } + pub fn embed_index_ref_fragments( + &self, + fragments: &[serde_json::Value], + threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, + ) -> std::result::Result<Vec<Embedding>, EmbedError> { + if let Embedder::Rest(embedder) = self { + embedder.embed_index_ref(fragments, threads, embedder_stats) + } else { + let Embedder::Composite(embedder) = self else { + unimplemented!("embedding fragments is only available for rest embedders") + }; + let crate::vector::composite::SubEmbedder::Rest(embedder) = &embedder.index else { + unimplemented!("embedding fragments is only available for rest embedders") + }; + + embedder.embed_index_ref(fragments, threads, embedder_stats) + } + } + /// Indicates the preferred number of chunks to pass to [`Self::embed_chunks`] pub fn chunk_count_hint(&self) -> usize { match self { @@ -987,6 +1048,12 @@ impl Embedder { } } +#[derive(Clone, Copy)] +pub enum SearchQuery<'a> { + Text(&'a str), + Media { q: Option<&'a str>, media: Option<&'a serde_json::Value> }, +} + /// Describes the mean and sigma of distribution of embedding similarity in the embedding space. /// /// The intended use is to make the similarity score more comparable to the regular ranking score.
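A note on the API reshaped by the patch above: the search-side entry point now dispatches on the new `SearchQuery` enum instead of taking a bare `&str`, and only the REST embedder accepts the `Media` variant. Below is a minimal, illustrative sketch (not part of the series) of how a caller might drive both variants; the helper name, the import paths, and the media payload shape are assumptions, not code from the patches:

    use milli::vector::{Embedder, SearchQuery};

    // Sketch: embed a text query and a multimodal query through the new API.
    // `embedder` is assumed to be a configured REST embedder built elsewhere,
    // since only `Embedder::Rest` accepts `SearchQuery::Media`.
    fn embed_queries(embedder: &Embedder) -> Result<(), milli::vector::error::EmbedError> {
        // Text-only search follows the same path as the old `embed_search(&str, deadline)`.
        let text = embedder.embed_search(SearchQuery::Text("driving shoes"), None)?;

        // Media search: `q` and `media` are both optional; the embedder renders the
        // single search fragment whose template matches the provided inputs.
        let media = serde_json::json!({ "image_url": "https://example.com/shoe.jpg" });
        let multimodal = embedder
            .embed_search(SearchQuery::Media { q: Some("driving shoes"), media: Some(&media) }, None)?;

        // Both calls return an `Embedding` (a `Vec<f32>`) of the embedder's dimension.
        assert_eq!(text.len(), multimodal.len());
        Ok(())
    }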
From 4235a82dcfab23f5bab89cee819c49f91dd68712 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:54:06 +0200 Subject: [PATCH 098/150] REST embedder supports fragments --- crates/milli/src/vector/rest.rs | 231 +++++++++++++++++++++++++++----- 1 file changed, 197 insertions(+), 34 deletions(-) diff --git a/crates/milli/src/vector/rest.rs index fbe3c1129..9477959ad 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -6,11 +6,13 @@ use rand::Rng; use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; use rayon::slice::ParallelSlice as _; use serde::{Deserialize, Serialize}; +use serde_json::Value; use super::error::EmbedErrorKind; -use super::json_template::ValueTemplate; +use super::json_template::{InjectableValue, JsonTemplate}; use super::{ - DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, REQUEST_PARALLELISM, + DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, SearchQuery, + REQUEST_PARALLELISM, }; use crate::error::FaultSource; use crate::progress::EmbedderStats; @@ -88,19 +90,54 @@ struct EmbedderData { bearer: Option<String>, headers: BTreeMap<String, String>, url: String, - request: Request, + request: RequestData, response: Response, configuration_source: ConfigurationSource, } +#[derive(Debug)] +pub enum RequestData { + Single(Request), + FromFragments(RequestFromFragments), +} + +impl RequestData { + pub fn new( + request: Value, + indexing_fragments: BTreeMap<String, Value>, + search_fragments: BTreeMap<String, Value>, + ) -> Result<Self, NewEmbedderError> { + Ok(if indexing_fragments.is_empty() && search_fragments.is_empty() { + RequestData::Single(Request::new(request)?) + } else { + RequestData::FromFragments(RequestFromFragments::new(request, search_fragments)?)
+ }) + } + + fn input_type(&self) -> InputType { + match self { + RequestData::Single(request) => request.input_type(), + RequestData::FromFragments(request_from_fragments) => { + request_from_fragments.input_type() + } + } + } + + fn has_fragments(&self) -> bool { + matches!(self, RequestData::FromFragments(_)) + } +} + #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] pub struct EmbedderOptions { pub api_key: Option<String>, pub distribution: Option<DistributionShift>, pub dimensions: Option<usize>, pub url: String, - pub request: serde_json::Value, - pub response: serde_json::Value, + pub request: Value, + pub search_fragments: BTreeMap<String, Value>, + pub indexing_fragments: BTreeMap<String, Value>, + pub response: Value, pub headers: BTreeMap<String, String>, } @@ -138,7 +175,12 @@ impl Embedder { .timeout(std::time::Duration::from_secs(30)) .build(); - let request = Request::new(options.request)?; + let request = RequestData::new( + options.request, + options.indexing_fragments, + options.search_fragments, + )?; + let response = Response::new(options.response, &request)?; let data = EmbedderData { @@ -188,7 +230,7 @@ impl Embedder { embedder_stats: Option<&EmbedderStats>, ) -> Result<Vec<Embedding>, EmbedError> where - S: AsRef<str> + Serialize, + S: Serialize, { embed(&self.data, texts, texts.len(), Some(self.dimensions), deadline, embedder_stats) } @@ -231,9 +273,9 @@ impl Embedder { } } - pub(crate) fn embed_index_ref( + pub(crate) fn embed_index_ref<S: Serialize + Sync>( &self, - texts: &[&str], + texts: &[S], threads: &ThreadPoolNoAbort, embedder_stats: &EmbedderStats, ) -> Result<Vec<Embedding>, EmbedError> { @@ -287,9 +329,44 @@ impl Embedder { pub(super) fn cache(&self) -> &EmbeddingCache { &self.cache } + + pub(crate) fn embed_one( + &self, + query: SearchQuery, + deadline: Option<Instant>, + embedder_stats: Option<&EmbedderStats>, + ) -> Result<Embedding, EmbedError> { + let mut embeddings = match (&self.data.request, query) { + (RequestData::Single(_), SearchQuery::Text(text)) => { + embed(&self.data, &[text], 1, Some(self.dimensions), deadline, embedder_stats) + } + (RequestData::Single(_), SearchQuery::Media { q: _, media: _ }) => { + return Err(EmbedError::rest_media_not_a_fragment()) + } + (RequestData::FromFragments(request_from_fragments), SearchQuery::Text(q)) => { + let fragment = request_from_fragments.render_search_fragment(Some(q), None)?; + + embed(&self.data, &[fragment], 1, Some(self.dimensions), deadline, embedder_stats) + } + ( + RequestData::FromFragments(request_from_fragments), + SearchQuery::Media { q, media }, + ) => { + let fragment = request_from_fragments.render_search_fragment(q, media)?; + + embed(&self.data, &[fragment], 1, Some(self.dimensions), deadline, embedder_stats) + } + }?; + + // unwrap: checked by `expected_count` + Ok(embeddings.pop().unwrap()) + } } fn infer_dimensions(data: &EmbedderData) -> Result<usize, NewEmbedderError> { + if data.request.has_fragments() { + return Err(NewEmbedderError::rest_cannot_infer_dimensions_for_fragment()); + } let v = embed(data, ["test"].as_slice(), 1, None, None, None) .map_err(NewEmbedderError::could_not_determine_dimension)?; // unwrap: guaranteed that v.len() == 1, otherwise the previous line terminated in error @@ -307,6 +384,13 @@ fn embed<S>( where S: Serialize, { + if inputs.is_empty() { + if expected_count != 0 { + return Err(EmbedError::rest_response_embedding_count(expected_count, 0)); + } + return Ok(Vec::new()); + } + let request = data.client.post(&data.url); let request = if let Some(bearer) = &data.bearer { request.set("Authorization", bearer) @@ -318,7 +402,12 @@ where request = request.set(header.as_str(), value.as_str()); } - let body =
data.request.inject_texts(inputs); + let body = match &data.request { + RequestData::Single(request) => request.inject_texts(inputs), + RequestData::FromFragments(request_from_fragments) => { + request_from_fragments.request_from_fragments(inputs).expect("inputs was empty") + } + }; for attempt in 0..10 { if let Some(embedder_stats) = &embedder_stats { @@ -426,7 +515,7 @@ fn response_to_embedding( expected_count: usize, expected_dimensions: Option<usize>, ) -> Result<Vec<Embedding>, Retry> { - let response: serde_json::Value = response + let response: Value = response .into_json() .map_err(EmbedError::rest_response_deserialization) .map_err(Retry::retry_later)?; @@ -455,17 +544,19 @@ fn response_to_embedding( } pub(super) const REQUEST_PLACEHOLDER: &str = "{{text}}"; +pub(super) const REQUEST_FRAGMENT_PLACEHOLDER: &str = "{{fragment}}"; pub(super) const RESPONSE_PLACEHOLDER: &str = "{{embedding}}"; pub(super) const REPEAT_PLACEHOLDER: &str = "{{..}}"; #[derive(Debug)] pub struct Request { - template: ValueTemplate, + template: InjectableValue, } impl Request { - pub fn new(template: serde_json::Value) -> Result<Self, NewEmbedderError> { - let template = match ValueTemplate::new(template, REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER) { + pub fn new(template: Value) -> Result<Self, NewEmbedderError> { + let template = match InjectableValue::new(template, REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER) + { Ok(template) => template, Err(error) => { let message = error.error_message("request", REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER); return Err(NewEmbedderError::rest_could_not_parse_template(message)); } } } - pub fn inject_texts( - &self, - texts: impl IntoIterator<Item = impl Serialize>, - ) -> serde_json::Value { + pub fn inject_texts(&self, texts: impl IntoIterator<Item = impl Serialize>) -> Value { self.template.inject(texts.into_iter().map(|s| serde_json::json!(s))).unwrap() } } +#[derive(Debug)] +pub struct RequestFromFragments { + search_fragments: BTreeMap<String, JsonTemplate>, + request: InjectableValue, +} + +impl RequestFromFragments { + pub fn new( + request: Value, + search_fragments: impl IntoIterator<Item = (String, Value)>, + ) -> Result<Self, NewEmbedderError> { + let request = + match InjectableValue::new(request, REQUEST_FRAGMENT_PLACEHOLDER, REPEAT_PLACEHOLDER) { + Ok(template) => template, + Err(error) => { + let message = + error.error_message("request", REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER); + return Err(NewEmbedderError::rest_could_not_parse_template(message)); + } + }; + + let search_fragments: Result<_, NewEmbedderError> = search_fragments + .into_iter() + .map(|(name, value)| { + Ok(( + name, + JsonTemplate::new(value).map_err(|error| { + NewEmbedderError::rest_could_not_parse_template( + error.parsing("searchFragments"), + ) + })?, + )) + }) + .collect(); + + Ok(Self { request, search_fragments: search_fragments?
}) + } + + fn input_type(&self) -> InputType { + if self.request.has_array_value() { + InputType::TextArray + } else { + InputType::Text + } + } + + pub fn render_search_fragment( + &self, + q: Option<&str>, + media: Option<&Value>, + ) -> Result<Value, EmbedError> { + let mut it = self.search_fragments.iter().filter_map(|(name, template)| { + let render = template.render_search(q, media).ok()?; + Some((name, render)) + }); + let Some((name, fragment)) = it.next() else { + return Err(EmbedError::rest_search_matches_no_fragment(q, media)); + }; + if let Some((second_name, _)) = it.next() { + return Err(EmbedError::rest_search_matches_multiple_fragments( + name, + second_name, + q, + media, + )); + } + + Ok(fragment) + } + + pub fn request_from_fragments<'a, S: Serialize + 'a>( + &self, + fragments: impl IntoIterator<Item = &'a S>, + ) -> Option<Value> { + self.request.inject(fragments.into_iter().map(|fragment| serde_json::json!(fragment))).ok() + } +} + #[derive(Debug)] pub struct Response { - template: ValueTemplate, + template: InjectableValue, } impl Response { - pub fn new(template: serde_json::Value, request: &Request) -> Result<Self, NewEmbedderError> { - let template = match ValueTemplate::new(template, RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER) - { - Ok(template) => template, - Err(error) => { - let message = - error.error_message("response", RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER); - return Err(NewEmbedderError::rest_could_not_parse_template(message)); - } - }; + pub fn new(template: Value, request: &RequestData) -> Result<Self, NewEmbedderError> { + let template = + match InjectableValue::new(template, RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER) { + Ok(template) => template, + Err(error) => { + let message = + error.error_message("response", RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER); + return Err(NewEmbedderError::rest_could_not_parse_template(message)); + } + }; match (template.has_array_value(), request.input_type() == InputType::TextArray) { (true, true) | (false, false) => Ok(Self {template}), (true, false) => Err(NewEmbedderError::rest_could_not_parse_template("in `response`: `response` has multiple embeddings, but `request` has only one text to embed".to_string())), (false, true) => Err(NewEmbedderError::rest_could_not_parse_template("in `response`: `response` has a single embedding, but `request` has multiple texts to embed".to_string())), } } - pub fn extract_embeddings( - &self, - response: serde_json::Value, - ) -> Result<Vec<Embedding>, EmbedError> { + pub fn extract_embeddings(&self, response: Value) -> Result<Vec<Embedding>, EmbedError> { let extracted_values: Vec<Embedding> = match self.template.extract(response) { Ok(extracted_values) => extracted_values, Err(error) => { From c45ede44a80a75a03d20d624ef12c1d693381b5f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:01:55 +0200 Subject: [PATCH 099/150] Add new parameters to openai and rest embedders --- crates/milli/src/vector/ollama.rs | 2 ++ crates/milli/src/vector/openai.rs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/crates/milli/src/vector/ollama.rs index d4329a2de..feec92cc0 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/ollama.rs @@ -71,6 +71,8 @@ impl EmbedderOptions { request, response, headers: Default::default(), + indexing_fragments: Default::default(), + search_fragments: Default::default(), }) } } diff --git a/crates/milli/src/vector/openai.rs index 0159d5c76..bf6c92978 100644 --- a/crates/milli/src/vector/openai.rs +++
b/crates/milli/src/vector/openai.rs @@ -201,6 +201,8 @@ impl Embedder { ] }), headers: Default::default(), + indexing_fragments: Default::default(), + search_fragments: Default::default(), }, cache_cap, super::rest::ConfigurationSource::OpenAi, From d48baece51081434b879c35a72d6052a227599a9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:56:15 +0200 Subject: [PATCH 100/150] New error when too many fragments in settings --- crates/milli/src/error.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/milli/src/error.rs index 2136ec97e..f8886da8e 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -288,6 +288,8 @@ and can not be more than 511 bytes.", .document_id.to_string() InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError), #[error("Too many embedders in the configuration. Found {0}, but limited to 256.")] TooManyEmbedders(usize), + #[error("Too many fragments in the configuration. Found {0}, but limited to 256.")] + TooManyFragments(usize), #[error("Cannot find embedder with name `{0}`.")] InvalidSearchEmbedder(String), #[error("Cannot find embedder with name `{0}`.")] From f3d5c74c02ef8e82b17fa68c7ab833d0f33e20ca Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:55:28 +0200 Subject: [PATCH 101/150] Vector settings to add `indexingFragments` and `searchFragments` --- crates/milli/src/vector/settings.rs | 373 +++++++++++++++++++++++++++- 1 file changed, 361 insertions(+), 12 deletions(-) diff --git a/crates/milli/src/vector/settings.rs index 712c1faa5..93de37290 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -2,6 +2,8 @@ use std::collections::BTreeMap; use std::num::NonZeroUsize; use deserr::Deserr; +use either::Either; +use itertools::Itertools; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; @@ -229,6 +231,35 @@ pub struct EmbeddingSettings { /// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated pub url: Setting<String>, + /// Template fragments that will be reassembled and sent to the remote embedder at indexing time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. + /// + /// # 🔄 Reindexing + /// + /// - 🏗️ When a fragment is deleted by passing `null` to its name, the corresponding embeddings are removed from documents. + /// - 🏗️ When a fragment is modified, the corresponding embeddings are regenerated if their rendered version changes. + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option<BTreeMap<String, Option<Fragment>>>)] + pub indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>, + + /// Template fragments that will be reassembled and sent to the remote embedder at search time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`.
+ /// + /// # 🔄 Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option<BTreeMap<String, Option<Fragment>>>)] + pub search_fragments: Setting<BTreeMap<String, Option<Fragment>>>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] #[schema(value_type = Option<serde_json::Value>)] @@ -483,6 +514,36 @@ pub struct SubEmbeddingSettings { /// - 🌱 When modified for source `openAi`, embeddings are never regenerated /// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated pub url: Setting<String>, + + /// Template fragments that will be reassembled and sent to the remote embedder at indexing time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. + /// + /// # 🔄 Reindexing + /// + /// - 🏗️ When a fragment is deleted by passing `null` to its name, the corresponding embeddings are removed from documents. + /// - 🏗️ When a fragment is modified, the corresponding embeddings are regenerated if their rendered version changes. + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option<BTreeMap<String, Option<Fragment>>>)] + pub indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>, + + /// Template fragments that will be reassembled and sent to the remote embedder at search time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. + /// + /// # 🔄 Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option<BTreeMap<String, Option<Fragment>>>)] + pub search_fragments: Setting<BTreeMap<String, Option<Fragment>>>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] #[schema(value_type = Option<serde_json::Value>)] @@ -555,16 +616,24 @@ pub struct SubEmbeddingSettings { } /// Indicates what action should take place during a reindexing operation for an embedder -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum ReindexAction { /// An indexing operation should take place for this embedder, keeping existing vectors /// and checking whether the document template changed or not RegeneratePrompts, + RegenerateFragments(Vec<(String, RegenerateFragment)>), /// An indexing operation should take place for all documents for this embedder, removing existing vectors /// (except userProvided ones) FullReindex, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum RegenerateFragment { + Update, + Remove, + Add, +} + pub enum SettingsDiff { Remove, Reindex { action: ReindexAction, updated_settings: EmbeddingSettings, quantize: bool }, @@ -577,6 +646,12 @@ pub struct EmbedderAction { pub is_being_quantized: bool, pub write_back: Option<WriteBackToDocuments>, pub reindex: Option<ReindexAction>, + pub remove_fragments: Option<RemoveFragments>, +} + +#[derive(Debug)] +pub struct RemoveFragments { + pub fragment_ids: Vec<u8>, } impl EmbedderAction { @@ -592,6 +667,10 @@ impl EmbedderAction { self.reindex.as_ref() } + pub fn remove_fragments(&self) -> Option<&RemoveFragments> { + self.remove_fragments.as_ref() + } + pub fn with_is_being_quantized(mut self, quantize: bool) -> Self { self.is_being_quantized = quantize; self @@ -603,11 +682,23 @@ impl EmbedderAction { is_being_quantized: false, write_back: Some(write_back), reindex: None, + remove_fragments: None, } } pub fn with_reindex(reindex: ReindexAction, was_quantized: bool) -> Self { -
Self { + was_quantized, + is_being_quantized: false, + write_back: None, + reindex: Some(reindex), + remove_fragments: None, + } + } + + pub fn with_remove_fragments(mut self, remove_fragments: RemoveFragments) -> Self { + self.remove_fragments = Some(remove_fragments); + self } } @@ -634,6 +725,8 @@ impl SettingsDiff { mut dimensions, mut document_template, mut url, + mut indexing_fragments, + mut search_fragments, mut request, mut response, mut search_embedder, @@ -653,6 +746,8 @@ impl SettingsDiff { dimensions: new_dimensions, document_template: new_document_template, url: new_url, + indexing_fragments: new_indexing_fragments, + search_fragments: new_search_fragments, request: new_request, response: new_response, search_embedder: new_search_embedder, @@ -684,6 +779,8 @@ impl SettingsDiff { &mut document_template, &mut document_template_max_bytes, &mut url, + &mut indexing_fragments, + &mut search_fragments, &mut request, &mut response, &mut headers, @@ -696,6 +793,8 @@ impl SettingsDiff { new_document_template, new_document_template_max_bytes, new_url, + new_indexing_fragments, + new_search_fragments, new_request, new_response, new_headers, @@ -722,6 +821,8 @@ impl SettingsDiff { dimensions, document_template, url, + indexing_fragments, + search_fragments, request, response, search_embedder, @@ -769,6 +870,8 @@ impl SettingsDiff { mut document_template, mut document_template_max_bytes, mut url, + mut indexing_fragments, + mut search_fragments, mut request, mut response, mut headers, @@ -794,6 +897,8 @@ impl SettingsDiff { document_template: new_document_template, document_template_max_bytes: new_document_template_max_bytes, url: new_url, + indexing_fragments: new_indexing_fragments, + search_fragments: new_search_fragments, request: new_request, response: new_response, headers: new_headers, @@ -814,6 +919,8 @@ impl SettingsDiff { &mut document_template, &mut document_template_max_bytes, &mut url, + &mut indexing_fragments, + &mut search_fragments, &mut request, &mut response, &mut headers, @@ -826,6 +933,8 @@ impl SettingsDiff { new_document_template, new_document_template_max_bytes, new_url, + new_indexing_fragments, + new_search_fragments, new_request, new_response, new_headers, @@ -846,6 +955,8 @@ impl SettingsDiff { dimensions, document_template, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -875,6 +986,8 @@ impl SettingsDiff { document_template: &mut Setting<String>, document_template_max_bytes: &mut Setting<usize>, url: &mut Setting<String>, + indexing_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>, + search_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>, request: &mut Setting<serde_json::Value>, response: &mut Setting<serde_json::Value>, headers: &mut Setting<BTreeMap<String, String>>, @@ -887,6 +1000,8 @@ impl SettingsDiff { new_document_template: Setting<String>, new_document_template_max_bytes: Setting<usize>, new_url: Setting<String>, + new_indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>, + new_search_fragments: Setting<BTreeMap<String, Option<Fragment>>>, new_request: Setting<serde_json::Value>, new_response: Setting<serde_json::Value>, new_headers: Setting<BTreeMap<String, String>>, @@ -902,6 +1017,8 @@ impl SettingsDiff { pooling, dimensions, url, + indexing_fragments, + search_fragments, request, response, document_template, @@ -941,6 +1058,104 @@ impl SettingsDiff { } } } + + *search_fragments = match (std::mem::take(search_fragments), new_search_fragments) { + (Setting::Set(search_fragments), Setting::Set(new_search_fragments)) => { + Setting::Set( + search_fragments + .into_iter() + .merge_join_by(new_search_fragments, |(left, _), (right, _)| { + left.cmp(right) + }) + .map(|eob| { + match eob { + // merge fragments + itertools::EitherOrBoth::Both((name, _), (_, right)) => { + (name, right) + }
// unchanged fragment + itertools::EitherOrBoth::Left(left) => left, + // new fragment + itertools::EitherOrBoth::Right(right) => right, + } + }) + .collect(), + ) + } + (_, Setting::Reset) => Setting::Reset, + (left, Setting::NotSet) => left, + (Setting::NotSet | Setting::Reset, Setting::Set(new_search_fragments)) => { + Setting::Set(new_search_fragments) + } + }; + + let mut regenerate_fragments = Vec::new(); + *indexing_fragments = match (std::mem::take(indexing_fragments), new_indexing_fragments) { + (Setting::Set(fragments), Setting::Set(new_fragments)) => { + Setting::Set( + fragments + .into_iter() + .merge_join_by(new_fragments, |(left, _), (right, _)| left.cmp(right)) + .map(|eob| { + match eob { + // merge fragments + itertools::EitherOrBoth::Both( + (name, left), + (other_name, right), + ) => { + if left == right { + (name, left) + } else { + match right { + Some(right) => { + regenerate_fragments + .push((other_name, RegenerateFragment::Update)); + (name, Some(right)) + } + None => { + regenerate_fragments + .push((other_name, RegenerateFragment::Remove)); + (name, None) + } + } + } + } + // unchanged fragment + itertools::EitherOrBoth::Left(left) => left, + // new fragment + itertools::EitherOrBoth::Right((name, right)) => { + if right.is_some() { + regenerate_fragments + .push((name.clone(), RegenerateFragment::Add)); + } + (name, right) + } + } + }) + .collect(), + ) + } + // remove all fragments => move to document template + (_, Setting::Reset) => { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + Setting::Reset + } + // add all fragments + (Setting::NotSet | Setting::Reset, Setting::Set(new_fragments)) => { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + + Setting::Set(new_fragments) + } + // no change + (left, Setting::NotSet) => left, + }; + if !regenerate_fragments.is_empty() { + ReindexAction::push_action( + reindex_action, + ReindexAction::RegenerateFragments(regenerate_fragments), + ); + } + if request.apply(new_request) { ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); } @@ -972,10 +1187,16 @@ impl ReindexAction { fn push_action(this: &mut Option<Self>, other: Self) { - *this = match (*this, other) { - (_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex), - (Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex), - (_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts), + use ReindexAction::*; + *this = match (this.take(), other) { + (_, FullReindex) => Some(FullReindex), + (Some(FullReindex), _) => Some(FullReindex), + (_, RegenerateFragments(fragments)) => Some(RegenerateFragments(fragments)), + (Some(RegenerateFragments(fragments)), RegeneratePrompts) => { + Some(RegenerateFragments(fragments)) + } + (Some(RegeneratePrompts), RegeneratePrompts) => Some(RegeneratePrompts), + (None, RegeneratePrompts) => Some(RegeneratePrompts), } } } @@ -988,6 +1209,8 @@ fn apply_default_for_source( pooling: &mut Setting<OverridePooling>, dimensions: &mut Setting<usize>, url: &mut Setting<String>, + indexing_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>, + search_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>, request: &mut Setting<serde_json::Value>, response: &mut Setting<serde_json::Value>, document_template: &mut Setting<String>, @@ -1003,6 +1226,8 @@ fn apply_default_for_source( *pooling = Setting::Reset; *dimensions = Setting::NotSet; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; @@ -1015,6 +1240,8 @@
fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; @@ -1027,6 +1254,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::NotSet; *url = Setting::Reset; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; @@ -1039,6 +1268,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::Reset; + *indexing_fragments = Setting::Reset; + *search_fragments = Setting::Reset; *request = Setting::Reset; *response = Setting::Reset; *headers = Setting::Reset; @@ -1051,6 +1282,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *document_template = Setting::NotSet; @@ -1065,6 +1298,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::NotSet; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *document_template = Setting::NotSet; @@ -1131,6 +1366,8 @@ pub enum MetaEmbeddingSetting { DocumentTemplate, DocumentTemplateMaxBytes, Url, + IndexingFragments, + SearchFragments, Request, Response, Headers, @@ -1153,6 +1390,8 @@ impl MetaEmbeddingSetting { DocumentTemplate => "documentTemplate", DocumentTemplateMaxBytes => "documentTemplateMaxBytes", Url => "url", + IndexingFragments => "indexingFragments", + SearchFragments => "searchFragments", Request => "request", Response => "response", Headers => "headers", @@ -1176,6 +1415,8 @@ impl EmbeddingSettings { dimensions: &Setting<usize>, api_key: &Setting<String>, url: &Setting<String>, + indexing_fragments: &Setting<BTreeMap<String, Option<Fragment>>>, + search_fragments: &Setting<BTreeMap<String, Option<Fragment>>>, request: &Setting<serde_json::Value>, response: &Setting<serde_json::Value>, document_template: &Setting<String>, @@ -1210,6 +1451,20 @@ impl EmbeddingSettings { )?; Self::check_setting(embedder_name, source, MetaEmbeddingSetting::ApiKey, context, api_key)?; Self::check_setting(embedder_name, source, MetaEmbeddingSetting::Url, context, url)?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::IndexingFragments, + context, + indexing_fragments, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::SearchFragments, + context, + search_fragments, + )?; Self::check_setting( embedder_name, source, @@ -1348,8 +1603,8 @@ impl EmbeddingSettings { ) => FieldStatus::Allowed, ( OpenAi, - Revision | Pooling | Request | Response | Headers | SearchEmbedder - | IndexingEmbedder, + Revision | Pooling | IndexingFragments | SearchFragments | Request | Response + | Headers | SearchEmbedder | IndexingEmbedder, _, ) => FieldStatus::Disallowed, ( HuggingFace, Model | Revision | Pooling | DocumentTemplate | DocumentTemplateMaxBytes | Distribution | BinaryQuantized, _, ) => FieldStatus::Allowed, ( HuggingFace, - ApiKey | Dimensions | Url | Request | Response | Headers | SearchEmbedder - | IndexingEmbedder, + ApiKey | Dimensions | Url | IndexingFragments | SearchFragments | Request + | Response | Headers | SearchEmbedder | IndexingEmbedder, _, ) => FieldStatus::Disallowed, (Ollama, Model, _) => FieldStatus::Mandatory, ( Ollama, ApiKey | Dimensions | Url | DocumentTemplate | DocumentTemplateMaxBytes | Distribution | BinaryQuantized, _, ) => FieldStatus::Allowed, (
Ollama, - Revision | Pooling | Request | Response | Headers | SearchEmbedder - | IndexingEmbedder, + Revision | Pooling | IndexingFragments | SearchFragments | Request | Response + | Headers | SearchEmbedder | IndexingEmbedder, _, ) => FieldStatus::Disallowed, (UserProvided, Dimensions, _) => FieldStatus::Mandatory, @@ -1386,6 +1641,8 @@ impl EmbeddingSettings { | DocumentTemplate | DocumentTemplateMaxBytes | Url + | IndexingFragments + | SearchFragments | Request | Response | Headers @@ -1404,6 +1661,10 @@ impl EmbeddingSettings { | Headers, _, ) => FieldStatus::Allowed, + (Rest, IndexingFragments, NotNested | Indexing) => FieldStatus::Allowed, + (Rest, IndexingFragments, Search) => FieldStatus::Disallowed, + (Rest, SearchFragments, NotNested | Search) => FieldStatus::Allowed, + (Rest, SearchFragments, Indexing) => FieldStatus::Disallowed, (Rest, Model | Revision | Pooling | SearchEmbedder | IndexingEmbedder, _) => { FieldStatus::Disallowed } @@ -1419,6 +1680,8 @@ impl EmbeddingSettings { | DocumentTemplate | DocumentTemplateMaxBytes | Url + | IndexingFragments + | SearchFragments | Request | Response | Headers, @@ -1512,6 +1775,11 @@ impl std::fmt::Display for EmbedderSource { } } +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] +pub struct Fragment { + pub value: serde_json::Value, +} + impl EmbeddingSettings { fn from_hugging_face( super::hf::EmbedderOptions { @@ -1534,6 +1802,8 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1566,6 +1836,8 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::some_or_not_set(url), + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1598,6 +1870,8 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::some_or_not_set(url), + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1622,6 +1896,8 @@ impl EmbeddingSettings { document_template: Setting::NotSet, document_template_max_bytes: Setting::NotSet, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1638,6 +1914,8 @@ impl EmbeddingSettings { dimensions, url, request, + indexing_fragments, + search_fragments, response, distribution, headers, @@ -1656,6 +1934,26 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::Set(url), + indexing_fragments: if indexing_fragments.is_empty() { + Setting::NotSet + } else { + Setting::Set( + indexing_fragments + .into_iter() + .map(|(name, fragment)| (name, Some(Fragment { value: fragment }))) + .collect(), + ) + }, + search_fragments: if search_fragments.is_empty() { + Setting::NotSet + } else { + Setting::Set( + search_fragments + .into_iter() + .map(|(name, fragment)| (name, Some(Fragment { value: fragment }))) + .collect(), + ) + }, request: Setting::Set(request), response: Setting::Set(response), distribution: Setting::some_or_not_set(distribution), @@ -1714,6 +2012,8 @@ impl From for EmbeddingSettings { document_template: Setting::NotSet, document_template_max_bytes: 
Setting::NotSet, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1786,6 +2086,8 @@ impl From for SubEmbeddingSettings { document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -1804,6 +2106,8 @@ impl From for SubEmbeddingSettings { document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -1828,6 +2132,8 @@ impl From for EmbeddingConfig { document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, distribution, @@ -1879,6 +2185,8 @@ impl From for EmbeddingConfig { EmbedderSource::Rest => SubEmbedderOptions::rest( url.set().unwrap(), api_key, + indexing_fragments, + search_fragments, request.set().unwrap(), response.set().unwrap(), headers, @@ -1922,6 +2230,8 @@ impl SubEmbedderOptions { document_template: _, document_template_max_bytes: _, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -1944,6 +2254,8 @@ impl SubEmbedderOptions { EmbedderSource::Rest => Self::rest( url.set().unwrap(), api_key, + indexing_fragments, + search_fragments, request.set().unwrap(), response.set().unwrap(), headers, @@ -2010,9 +2322,13 @@ impl SubEmbedderOptions { distribution: distribution.set(), }) } + + #[allow(clippy::too_many_arguments)] fn rest( url: String, api_key: Setting, + indexing_fragments: Setting>>, + search_fragments: Setting>>, request: serde_json::Value, response: serde_json::Value, headers: Setting>, @@ -2027,6 +2343,22 @@ impl SubEmbedderOptions { response, distribution: distribution.set(), headers: headers.set().unwrap_or_default(), + search_fragments: search_fragments + .set() + .unwrap_or_default() + .into_iter() + .filter_map(|(name, fragment)| { + Some((name, fragment.map(|fragment| fragment.value)?)) + }) + .collect(), + indexing_fragments: indexing_fragments + .set() + .unwrap_or_default() + .into_iter() + .filter_map(|(name, fragment)| { + Some((name, fragment.map(|fragment| fragment.value)?)) + }) + .collect(), }) } fn ollama( @@ -2066,3 +2398,20 @@ impl From for EmbedderOptions { } } } + +pub(crate) fn fragments_from_settings( + setting: &Setting, +) -> impl Iterator + '_ { + let Some(setting) = setting.as_ref().set() else { return Either::Left(None.into_iter()) }; + if let Some(setting) = setting.indexing_fragments.as_ref().set() { + Either::Right(setting.keys().cloned()) + } else { + let Some(setting) = setting.indexing_embedder.as_ref().set() else { + return Either::Left(None.into_iter()); + }; + let Some(setting) = setting.indexing_fragments.as_ref().set() else { + return Either::Left(None.into_iter()); + }; + Either::Right(setting.keys().cloned()) + } +} From 41620d53254a46a58763592f310ce94ca1f567d4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:58:16 +0200 Subject: [PATCH 102/150] Support `indexingFragments` and `searchFragments` in settings --- crates/meilisearch-types/src/settings.rs | 4 +- crates/milli/src/update/settings.rs | 314 +++++++++++++++-------- 2 files changed, 213 insertions(+), 105 deletions(-) diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index 7d64440ce..d7b163448 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -9,10 +9,11 @@ use std::str::FromStr; use deserr::{DeserializeError, 
Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; use milli::disabled_typos_terms::DisabledTyposTerms; -use milli::index::{IndexEmbeddingConfig, PrefixSearch}; +use milli::index::PrefixSearch; use milli::proximity::ProximityPrecision; pub use milli::update::ChatSettings; use milli::update::Setting; +use milli::vector::db::IndexEmbeddingConfig; use milli::{Criterion, CriterionError, FilterableAttributesRule, Index, DEFAULT_VALUES_PER_FACET}; use serde::{Deserialize, Serialize, Serializer}; use utoipa::ToSchema; @@ -911,6 +912,7 @@ pub fn settings( }; let embedders: BTreeMap<_, _> = index + .embedding_configs() .embedding_configs(rtxn)? .into_iter() .map(|IndexEmbeddingConfig { name, config, .. }| { diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index c6ede7a1d..3dae4f57c 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -7,7 +7,6 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; use itertools::{merge_join_by, EitherOrBoth, Itertools}; -use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; @@ -23,22 +22,25 @@ use crate::error::UserError::{self, InvalidChatSettingsDocumentTemplateMaxBytes} use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::filterable_attributes_rules::match_faceted_field; use crate::index::{ - ChatConfig, IndexEmbeddingConfig, PrefixSearch, SearchParameters, - DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, + ChatConfig, PrefixSearch, SearchParameters, DEFAULT_MIN_WORD_LEN_ONE_TYPO, + DEFAULT_MIN_WORD_LEN_TWO_TYPOS, }; use crate::order_by_map::OrderByMap; -use crate::progress::EmbedderStats; -use crate::progress::Progress; +use crate::progress::{EmbedderStats, Progress}; use crate::prompt::{default_max_bytes, default_template_text, PromptData}; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::new::indexer::reindex; use crate::update::{IndexDocuments, UpdateIndexingStep}; +use crate::vector::db::{FragmentConfigs, IndexEmbeddingConfig}; +use crate::vector::json_template::JsonTemplate; use crate::vector::settings::{ EmbedderAction, EmbedderSource, EmbeddingSettings, NestingContext, ReindexAction, SubEmbeddingSettings, WriteBackToDocuments, }; -use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; +use crate::vector::{ + Embedder, EmbeddingConfig, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment, +}; use crate::{ ChannelCongestion, FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result, }; @@ -1044,22 +1046,27 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { match std::mem::take(&mut self.embedder_settings) { Setting::Set(configs) => self.update_embedding_configs_set(configs), Setting::Reset => { + let embedders = self.index.embedding_configs(); // all vectors should be written back to documents - let old_configs = self.index.embedding_configs(self.wtxn)?; + let old_configs = embedders.embedding_configs(self.wtxn)?; let remove_all: Result> = old_configs .into_iter() - .map(|IndexEmbeddingConfig { name, config, user_provided }| -> Result<_> { - let embedder_id = - self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( - crate::InternalError::DatabaseMissingEntry { - db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, - key: None, - }, - )?; + .map(|IndexEmbeddingConfig { name, 
config, fragments: _ }| -> Result<_> { + let embedder_info = embedders.embedder_info(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; Ok(( name, EmbedderAction::with_write_back( - WriteBackToDocuments { embedder_id, user_provided }, + WriteBackToDocuments { + embedder_id: embedder_info.embedder_id, + user_provided: embedder_info + .embedding_status + .into_user_provided(), + }, config.quantized(), ), )) @@ -1069,7 +1076,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let remove_all = remove_all?; self.index.embedder_category_id.clear(self.wtxn)?; - self.index.delete_embedding_configs(self.wtxn)?; + embedders.delete_embedding_configs(self.wtxn)?; Ok(remove_all) } Setting::NotSet => Ok(Default::default()), @@ -1081,12 +1088,12 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { configs: BTreeMap>, ) -> Result> { use crate::vector::settings::SettingsDiff; - - let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap = old_configs + let embedders = self.index.embedding_configs(); + let old_configs = embedders.embedding_configs(self.wtxn)?; + let old_configs: BTreeMap = old_configs .into_iter() - .map(|IndexEmbeddingConfig { name, config, user_provided }| { - (name, (config.into(), user_provided)) + .map(|IndexEmbeddingConfig { name, config, fragments }| { + (name, (config.into(), fragments)) }) .collect(); let mut updated_configs = BTreeMap::new(); @@ -1097,55 +1104,88 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { match joined { // updated config - EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => { + EitherOrBoth::Both((name, (old, mut fragments)), (_, new)) => { let was_quantized = old.binary_quantized.set().unwrap_or_default(); let settings_diff = SettingsDiff::from_settings(&name, old, new)?; match settings_diff { SettingsDiff::Remove => { + let info = embedders.remove_embedder(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; tracing::debug!( embedder = name, - user_provided = user_provided.len(), + user_provided = info.embedding_status.user_provided_docids().len(), "removing embedder" ); - let embedder_id = - self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( - crate::InternalError::DatabaseMissingEntry { - db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, - key: None, - }, - )?; - // free id immediately - self.index.embedder_category_id.delete(self.wtxn, &name)?; embedder_actions.insert( name, EmbedderAction::with_write_back( - WriteBackToDocuments { embedder_id, user_provided }, + WriteBackToDocuments { + embedder_id: info.embedder_id, + user_provided: info.embedding_status.into_user_provided(), + }, was_quantized, ), ); } SettingsDiff::Reindex { action, updated_settings, quantize } => { - tracing::debug!( - embedder = name, - user_provided = user_provided.len(), - ?action, - "reindex embedder" - ); - embedder_actions.insert( - name.clone(), + let mut remove_fragments = None; + let updated_settings = Setting::Set(updated_settings); + if let ReindexAction::RegenerateFragments(regenerate_fragments) = + &action + { + let it = regenerate_fragments + .iter() + .filter(|(_, action)| { + matches!( + action, + crate::vector::settings::RegenerateFragment::Remove + ) + }) + .map(|(name, _)| name.as_str()); + + remove_fragments = fragments.remove_fragments(it); + + let it = regenerate_fragments + .iter() + .filter(|(_, action)| { 
+ matches!( + action, + crate::vector::settings::RegenerateFragment::Add + ) + }) + .map(|(name, _)| name.clone()); + fragments.add_new_fragments(it)?; + } else { + // needs full reindex of fragments + fragments = FragmentConfigs::new(); + fragments.add_new_fragments( + crate::vector::settings::fragments_from_settings( + &updated_settings, + ), + )?; + } + tracing::debug!(embedder = name, ?action, "reindex embedder"); + + let embedder_action = EmbedderAction::with_reindex(action, was_quantized) - .with_is_being_quantized(quantize), - ); - let new = - validate_embedding_settings(Setting::Set(updated_settings), &name)?; - updated_configs.insert(name, (new, user_provided)); + .with_is_being_quantized(quantize); + + let embedder_action = if let Some(remove_fragments) = remove_fragments { + embedder_action.with_remove_fragments(remove_fragments) + } else { + embedder_action + }; + + embedder_actions.insert(name.clone(), embedder_action); + let new = validate_embedding_settings(updated_settings, &name)?; + updated_configs.insert(name, (new, fragments)); } SettingsDiff::UpdateWithoutReindex { updated_settings, quantize } => { - tracing::debug!( - embedder = name, - user_provided = user_provided.len(), - "update without reindex embedder" - ); + tracing::debug!(embedder = name, "update without reindex embedder"); let new = validate_embedding_settings(Setting::Set(updated_settings), &name)?; if quantize { @@ -1154,14 +1194,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { EmbedderAction::default().with_is_being_quantized(true), ); } - updated_configs.insert(name, (new, user_provided)); + updated_configs.insert(name, (new, fragments)); } } } // unchanged config - EitherOrBoth::Left((name, (setting, user_provided))) => { + EitherOrBoth::Left((name, (setting, fragments))) => { tracing::debug!(embedder = name, "unchanged embedder"); - updated_configs.insert(name, (Setting::Set(setting), user_provided)); + updated_configs.insert(name, (Setting::Set(setting), fragments)); } // new config EitherOrBoth::Right((name, mut setting)) => { @@ -1176,47 +1216,42 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { name.clone(), EmbedderAction::with_reindex(ReindexAction::FullReindex, false), ); - updated_configs.insert(name, (setting, RoaringBitmap::new())); + let mut fragments = FragmentConfigs::new(); + fragments.add_new_fragments( + crate::vector::settings::fragments_from_settings(&setting), + )?; + updated_configs.insert(name, (setting, fragments)); } } } - let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; - for res in self.index.embedder_category_id.iter(self.wtxn)? 
{ - let (_name, id) = res?; - free_indices[id as usize] = false; - } - let mut free_indices = free_indices.iter_mut().enumerate(); - let mut find_free_index = - move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); - for (name, action) in embedder_actions.iter() { - // ignore actions that are not possible for a new embedder - if matches!(action.reindex(), Some(ReindexAction::FullReindex)) - && self.index.embedder_category_id.get(self.wtxn, name)?.is_none() - { - let id = - find_free_index().ok_or(UserError::TooManyEmbedders(updated_configs.len()))?; - tracing::debug!(embedder = name, id, "assigning free id to new embedder"); - self.index.embedder_category_id.put(self.wtxn, name, &id)?; - } - } + embedders.add_new_embedders( + self.wtxn, + embedder_actions + .iter() + // ignore actions that are not possible for a new embedder, most critically deleted embedders + .filter(|(_, action)| matches!(action.reindex(), Some(ReindexAction::FullReindex))) + .map(|(name, _)| name.as_str()), + updated_configs.len(), + )?; + let updated_configs: Vec = updated_configs .into_iter() - .filter_map(|(name, (config, user_provided))| match config { + .filter_map(|(name, (config, fragments))| match config { Setting::Set(config) => { - Some(IndexEmbeddingConfig { name, config: config.into(), user_provided }) + Some(IndexEmbeddingConfig { name, config: config.into(), fragments }) } Setting::Reset => None, Setting::NotSet => Some(IndexEmbeddingConfig { name, config: EmbeddingSettings::default().into(), - user_provided, + fragments: Default::default(), }), }) .collect(); if updated_configs.is_empty() { - self.index.delete_embedding_configs(self.wtxn)?; + embedders.delete_embedding_configs(self.wtxn)?; } else { - self.index.put_embedding_configs(self.wtxn, updated_configs)?; + embedders.put_embedding_configs(self.wtxn, updated_configs)?; } Ok(embedder_actions) } @@ -1611,13 +1646,13 @@ impl InnerIndexSettingsDiff { // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { - for (embedder_name, (config, _, _quantized)) in - new_settings.embedding_configs.inner_as_ref() - { - let was_quantized = - old_settings.embedding_configs.get(embedder_name).is_some_and(|conf| conf.2); + for (embedder_name, runtime) in new_settings.embedding_configs.inner_as_ref() { + let was_quantized = old_settings + .embedding_configs + .get(embedder_name) + .is_some_and(|conf| conf.is_quantized); // skip embedders that don't use document templates - if !config.uses_document_template() { + if !runtime.embedder.uses_document_template() { continue; } @@ -1630,13 +1665,31 @@ impl InnerIndexSettingsDiff { was_quantized, )); } - std::collections::btree_map::Entry::Occupied(entry) => { + std::collections::btree_map::Entry::Occupied(mut entry) => { + // future-proofing, make sure to destructure here so that any new field is taken into account in this case + // case in point: adding `remove_fragments` was detected. 
let EmbedderAction { was_quantized: _, is_being_quantized: _, - write_back: _, // We are deleting this embedder, so no point in regeneration - reindex: _, // We are already fully reindexing - } = entry.get(); + write_back, // We are deleting this embedder, so no point in regeneration + reindex, + remove_fragments: _, + } = entry.get_mut(); + + // fixup reindex to make sure we regenerate all fragments + *reindex = match reindex.take() { + Some(ReindexAction::RegenerateFragments(_)) => { + Some(ReindexAction::RegeneratePrompts) + } + Some(reindex) => Some(reindex), // We are at least regenerating prompts + None => { + if write_back.is_none() { + Some(ReindexAction::RegeneratePrompts) // quantization case + } else { + None + } + } + }; } }; } @@ -1790,7 +1843,7 @@ pub(crate) struct InnerIndexSettings { pub exact_attributes: HashSet, pub disabled_typos_terms: DisabledTyposTerms, pub proximity_precision: ProximityPrecision, - pub embedding_configs: EmbeddingConfigs, + pub embedding_configs: RuntimeEmbedders, pub embedder_category_id: HashMap, pub geo_fields_ids: Option<(FieldId, FieldId)>, pub prefix_search: PrefixSearch, @@ -1801,7 +1854,7 @@ impl InnerIndexSettings { pub fn from_index( index: &Index, rtxn: &heed::RoTxn<'_>, - embedding_configs: Option, + embedding_configs: Option, ) -> Result { let stop_words = index.stop_words(rtxn)?; let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); @@ -1812,7 +1865,7 @@ impl InnerIndexSettings { let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = match embedding_configs { Some(embedding_configs) => embedding_configs, - None => embedders(index.embedding_configs(rtxn)?)?, + None => embedders(index.embedding_configs().embedding_configs(rtxn)?)?, }; let embedder_category_id = index .embedder_category_id @@ -1900,28 +1953,49 @@ impl InnerIndexSettings { } } -fn embedders(embedding_configs: Vec) -> Result { +fn embedders(embedding_configs: Vec) -> Result { let res: Result<_> = embedding_configs .into_iter() .map( |IndexEmbeddingConfig { name, config: EmbeddingConfig { embedder_options, prompt, quantized }, - .. 
+ fragments, }| { - let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); + let document_template = prompt.try_into().map_err(crate::Error::from)?; - let embedder = Arc::new( + let embedder = // cache_cap: no cache needed for indexing purposes - Embedder::new(embedder_options.clone(), 0) + Arc::new(Embedder::new(embedder_options.clone(), 0) .map_err(crate::vector::Error::from) - .map_err(crate::Error::from)?, - ); - Ok((name, (embedder, prompt, quantized.unwrap_or_default()))) + .map_err(crate::Error::from)?); + + let fragments = fragments + .into_inner() + .into_iter() + .map(|fragment| { + let template = JsonTemplate::new( + embedder_options.fragment(&fragment.name).unwrap().clone(), + ) + .unwrap(); + + RuntimeFragment { name: fragment.name, id: fragment.id, template } + }) + .collect(); + + Ok(( + name, + Arc::new(RuntimeEmbedder { + embedder, + document_template, + fragments, + is_quantized: quantized.unwrap_or_default(), + }), + )) }, ) .collect(); - res.map(EmbeddingConfigs::new) + res.map(RuntimeEmbedders::new) } fn validate_prompt( @@ -1970,6 +2044,8 @@ pub fn validate_embedding_settings( document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, search_embedder, @@ -1997,8 +2073,28 @@ pub fn validate_embedding_settings( } if let Some(request) = request.as_ref().set() { - let request = crate::vector::rest::Request::new(request.to_owned()) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + let request = crate::vector::rest::RequestData::new( + request.to_owned(), + indexing_fragments + .as_ref() + .set() + .iter() + .flat_map(|map| map.iter()) + .filter_map(|(name, fragment)| { + Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) + }) + .collect(), + search_fragments + .as_ref() + .set() + .iter() + .flat_map(|map| map.iter()) + .filter_map(|(name, fragment)| { + Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) + }) + .collect(), + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; if let Some(response) = response.as_ref().set() { crate::vector::rest::Response::new(response.to_owned(), &request) .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; @@ -2017,6 +2113,8 @@ pub fn validate_embedding_settings( document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, search_embedder, @@ -2036,6 +2134,8 @@ pub fn validate_embedding_settings( &dimensions, &api_key, &url, + &indexing_fragments, + &search_fragments, &request, &response, &document_template, @@ -2114,6 +2214,8 @@ pub fn validate_embedding_settings( &embedder.dimensions, &embedder.api_key, &embedder.url, + &embedder.indexing_fragments, + &embedder.search_fragments, &embedder.request, &embedder.response, &embedder.document_template, @@ -2169,6 +2271,8 @@ pub fn validate_embedding_settings( &embedder.dimensions, &embedder.api_key, &embedder.url, + &embedder.indexing_fragments, + &embedder.search_fragments, &embedder.request, &embedder.response, &embedder.document_template, @@ -2201,6 +2305,8 @@ pub fn validate_embedding_settings( document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, search_embedder, From 22d363c05ad44f68a24de047c832de67aae7d966 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:59:35 +0200 Subject: [PATCH 103/150] Clear DB on clear documents --- 
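Notes: with the embedder info database introduced earlier in this series, the
user-provided and regenerate bitmaps no longer live inside each
`IndexEmbeddingConfig`, so clearing all documents reduces to a single call
instead of the old load-mutate-rewrite loop over the configs. A minimal sketch
of the intent, assuming an open write transaction `wtxn` (variable names are
illustrative, the called methods appear in the diff below):

    // Reset the per-embedder docid bitmaps for every embedder at once.
    index.embedding_configs().clear_embedder_info_docids(wtxn)?;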
crates/milli/src/update/clear_documents.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/milli/src/update/clear_documents.rs b/crates/milli/src/update/clear_documents.rs index b0ae070de..01631e9a3 100644 --- a/crates/milli/src/update/clear_documents.rs +++ b/crates/milli/src/update/clear_documents.rs @@ -64,11 +64,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { self.index.delete_geo_faceted_documents_ids(self.wtxn)?; // Remove all user-provided bits from the configs - let mut configs = self.index.embedding_configs(self.wtxn)?; - for config in configs.iter_mut() { - config.user_provided.clear(); - } - self.index.put_embedding_configs(self.wtxn, configs)?; + self.index.embedding_configs().clear_embedder_info_docids(self.wtxn)?; // Clear the other databases. external_documents_ids.clear(self.wtxn)?; From f8232976eda21fa869dd1679e0c86c1126011c6c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:00:22 +0200 Subject: [PATCH 104/150] Implement in new document indexer --- crates/milli/src/update/new/channel.rs | 156 +++- crates/milli/src/update/new/document.rs | 105 +++ .../milli/src/update/new/document_change.rs | 8 +- .../milli/src/update/new/extract/documents.rs | 9 +- .../src/update/new/extract/vectors/mod.rs | 842 +++++++++++------- .../milli/src/update/new/indexer/extract.rs | 25 +- crates/milli/src/update/new/indexer/mod.rs | 23 +- crates/milli/src/update/new/indexer/write.rs | 52 +- .../milli/src/update/new/vector_document.rs | 29 +- crates/milli/src/vector/session.rs | 28 +- 10 files changed, 886 insertions(+), 391 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 4fff31a35..aec192ace 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -138,6 +138,7 @@ pub enum ReceiverAction { WakeUp, LargeEntry(LargeEntry), LargeVectors(LargeVectors), + LargeVector(LargeVector), } /// An entry that cannot fit in the BBQueue buffers has been @@ -174,6 +175,24 @@ impl LargeVectors { } } +#[derive(Debug)] +pub struct LargeVector { + /// The document id associated to the large embedding. + pub docid: DocumentId, + /// The embedder id in which to insert the large embedding. + pub embedder_id: u8, + /// The extractor id in which to insert the large embedding. + pub extractor_id: u8, + /// The large embedding that must be written. + pub embedding: Mmap, +} + +impl LargeVector { + pub fn read_embedding(&self, dimensions: usize) -> &[f32] { + self.embedding.chunks_exact(dimensions).map(bytemuck::cast_slice).next().unwrap() + } +} + impl<'a> WriterBbqueueReceiver<'a> { /// Tries to receive an action to do until the timeout occurs /// and if it does, consider it as a spurious wake up. 
@@ -238,6 +257,7 @@ pub enum EntryHeader { DbOperation(DbOperation), ArroyDeleteVector(ArroyDeleteVector), ArroySetVectors(ArroySetVectors), + ArroySetVector(ArroySetVector), } impl EntryHeader { @@ -250,6 +270,7 @@ impl EntryHeader { EntryHeader::DbOperation(_) => 0, EntryHeader::ArroyDeleteVector(_) => 1, EntryHeader::ArroySetVectors(_) => 2, + EntryHeader::ArroySetVector(_) => 3, } } @@ -274,11 +295,17 @@ impl EntryHeader { Self::variant_size() + mem::size_of::() + embedding_size * count } + fn total_set_vector_size(dimensions: usize) -> usize { + let embedding_size = dimensions * mem::size_of::(); + Self::variant_size() + mem::size_of::() + embedding_size + } + fn header_size(&self) -> usize { let payload_size = match self { EntryHeader::DbOperation(op) => mem::size_of_val(op), EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), + EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv), }; Self::variant_size() + payload_size } @@ -301,6 +328,11 @@ impl EntryHeader { let header = checked::pod_read_unaligned(header_bytes); EntryHeader::ArroySetVectors(header) } + 3 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroySetVector(header) + } id => panic!("invalid variant id: {id}"), } } @@ -311,6 +343,7 @@ impl EntryHeader { EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), + EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv), }; *first = self.variant_id(); remaining.copy_from_slice(payload_bytes); @@ -379,6 +412,37 @@ impl ArroySetVectors { } } +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// The embeddings are in the remaining space and represents +/// non-aligned [f32] each with dimensions f32s. +pub struct ArroySetVector { + pub docid: DocumentId, + pub embedder_id: u8, + pub extractor_id: u8, + _padding: [u8; 2], +} + +impl ArroySetVector { + fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { + let skip = EntryHeader::variant_size() + mem::size_of::(); + &frame[skip..] + } + + /// Read the embedding and write it into an aligned `f32` Vec. + pub fn read_all_embeddings_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + vec: &'v mut Vec, + ) -> &'v [f32] { + let embeddings_bytes = Self::embeddings_bytes(frame); + let embeddings_count = embeddings_bytes.len() / mem::size_of::(); + vec.resize(embeddings_count, 0.0); + bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes); + &vec[..] 
+ } +} + #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] #[repr(u16)] pub enum Database { @@ -398,6 +462,7 @@ pub enum Database { FacetIdStringDocids, FieldIdDocidFacetStrings, FieldIdDocidFacetF64s, + VectorEmbedderCategoryId, } impl Database { @@ -419,6 +484,7 @@ impl Database { Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(), Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(), + Database::VectorEmbedderCategoryId => index.embedder_category_id.remap_types(), } } @@ -440,6 +506,7 @@ impl Database { Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS, Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS, Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S, + Database::VectorEmbedderCategoryId => db_name::VECTOR_EMBEDDER_CATEGORY_ID, } } } @@ -568,6 +635,82 @@ impl<'b> ExtractorBbqueueSender<'b> { Ok(()) } + fn set_vector_for_extractor( + &self, + docid: u32, + embedder_id: u8, + extractor_id: u8, + embedding: Option, + ) -> crate::Result<()> { + let max_grant = self.max_grant; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + // If there are no vectors we specify the dimensions + // to zero to allocate no extra space at all + let dimensions = embedding.as_ref().map_or(0, |emb| emb.len()); + + let arroy_set_vector = + ArroySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] }; + let payload_header = EntryHeader::ArroySetVector(arroy_set_vector); + let total_length = EntryHeader::total_set_vector_size(dimensions); + if total_length > max_grant { + let mut value_file = tempfile::tempfile().map(BufWriter::new)?; + let embedding = embedding.expect("set_vector without a vector does not fit in RAM"); + + let mut embedding_bytes = bytemuck::cast_slice(&embedding); + io::copy(&mut embedding_bytes, &mut value_file)?; + + let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; + let embedding = unsafe { Mmap::map(&value_file)? }; + + let large_vectors = LargeVector { docid, embedder_id, extractor_id, embedding }; + self.sender.send(ReceiverAction::LargeVector(large_vectors)).unwrap(); + + return Ok(()); + } + + // Spin loop to have a frame the size we requested. 
+ reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + + if dimensions != 0 { + let output_iter = + remaining.chunks_exact_mut(dimensions * mem::size_of::()); + + for (embedding, output) in embedding.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } + } + + Ok(()) + }, + )?; + + Ok(()) + } + + fn embedding_status( + &self, + name: &str, + infos: crate::vector::db::EmbedderInfo, + ) -> crate::Result<()> { + let bytes = infos.to_bytes().map_err(|_| { + InternalError::Serialization(crate::SerializationError::Encoding { + db_name: Some(Database::VectorEmbedderCategoryId.database_name()), + }) + })?; + self.write_key_value(Database::VectorEmbedderCategoryId, name.as_bytes(), &bytes) + } + fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { InternalError::StorePut { @@ -942,9 +1085,18 @@ impl EmbeddingSender<'_, '_> { &self, docid: DocumentId, embedder_id: u8, - embedding: Embedding, + extractor_id: u8, + embedding: Option, ) -> crate::Result<()> { - self.0.set_vectors(docid, embedder_id, &[embedding]) + self.0.set_vector_for_extractor(docid, embedder_id, extractor_id, embedding) + } + + pub(crate) fn embedding_status( + &self, + name: &str, + infos: crate::vector::db::EmbedderInfo, + ) -> crate::Result<()> { + self.0.embedding_status(name, infos) } } diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index b07cc0298..d520bb952 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -12,6 +12,7 @@ use super::vector_document::VectorDocument; use super::{KvReaderFieldId, KvWriterFieldId}; use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::documents::FieldIdMapper; +use crate::update::del_add::KvReaderDelAdd; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::new::vector_document::VectorDocumentFromDb; use crate::vector::settings::EmbedderAction; @@ -469,6 +470,110 @@ impl<'doc> Versions<'doc> { } } +#[derive(Debug)] +pub struct KvDelAddDocument<'a, Mapper: FieldIdMapper> { + document: &'a obkv::KvReaderU16, + side: crate::update::del_add::DelAdd, + fields_ids_map: &'a Mapper, +} + +impl<'a, Mapper: FieldIdMapper> KvDelAddDocument<'a, Mapper> { + pub fn new( + document: &'a obkv::KvReaderU16, + side: crate::update::del_add::DelAdd, + fields_ids_map: &'a Mapper, + ) -> Self { + Self { document, side, fields_ids_map } + } + + fn get(&self, k: &str) -> Result> { + let Some(id) = self.fields_ids_map.id(k) else { return Ok(None) }; + let Some(value) = self.document.get(id) else { return Ok(None) }; + let Some(value) = KvReaderDelAdd::from_slice(value).get(self.side) else { return Ok(None) }; + + let value = serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?; + + Ok(Some(value)) + } +} + +impl<'a, Mapper: FieldIdMapper> Document<'a> for KvDelAddDocument<'a, Mapper> { + fn iter_top_level_fields(&self) -> impl Iterator> { + let mut it = self.document.iter(); + + std::iter::from_fn(move || loop { + let (fid, value) = it.next()?; + let Some(value) = 
KvReaderDelAdd::from_slice(value).get(self.side) else { + continue; + }; + let name = match self.fields_ids_map.name(fid).ok_or( + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "getting current document", + }), + ) { + Ok(name) => name, + Err(error) => return Some(Err(error.into())), + }; + + if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME { + continue; + } + + let res = (|| { + let value = + serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?; + + Ok((name, value)) + })(); + + return Some(res); + }) + } + + fn top_level_fields_count(&self) -> usize { + let mut it = self.document.iter(); + + std::iter::from_fn(move || loop { + let (fid, value) = it.next()?; + let Some(_) = KvReaderDelAdd::from_slice(value).get(self.side) else { + continue; + }; + let name = match self.fields_ids_map.name(fid).ok_or( + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "getting current document", + }), + ) { + Ok(name) => name, + Err(_) => return Some(()), + }; + + if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME { + continue; + } + + return Some(()); + }) + .count() + } + + fn top_level_field(&self, k: &str) -> Result> { + if k == RESERVED_VECTORS_FIELD_NAME || k == RESERVED_GEO_FIELD_NAME { + return Ok(None); + } + self.get(k) + } + + fn vectors_field(&self) -> Result> { + self.get(RESERVED_VECTORS_FIELD_NAME) + } + + fn geo_field(&self) -> Result> { + self.get(RESERVED_GEO_FIELD_NAME) + } +} + pub struct DocumentIdentifiers<'doc> { docid: DocumentId, external_document_id: &'doc str, diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 2b9161319..1a40615e7 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -11,7 +11,7 @@ use super::vector_document::{ use crate::attribute_patterns::PatternMatch; use crate::documents::FieldIdMapper; use crate::update::new::document::DocumentIdentifiers; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::{DocumentId, Index, InternalError, Result}; pub enum DocumentChange<'doc> { @@ -70,7 +70,7 @@ impl<'doc> Insertion<'doc> { pub fn inserted_vectors( &self, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result>> { VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders) } @@ -241,7 +241,7 @@ impl<'doc> Update<'doc> { pub fn only_changed_vectors( &self, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result>> { VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders) } @@ -252,7 +252,7 @@ impl<'doc> Update<'doc> { index: &'doc Index, mapper: &'doc Mapper, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result>> { if self.from_scratch { MergedVectorDocument::without_db( diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 5c1a1927a..31d2ada0f 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -7,8 +7,7 @@ use hashbrown::HashMap; use super::DelAddRoaringBitmap; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::update::new::channel::{DocumentsSender, 
ExtractorBbqueueSender}; -use crate::update::new::document::{write_to_obkv, Document}; -use crate::update::new::document::{DocumentContext, DocumentIdentifiers}; +use crate::update::new::document::{write_to_obkv, Document, DocumentContext, DocumentIdentifiers}; use crate::update::new::indexer::document_changes::{Extractor, IndexingContext}; use crate::update::new::indexer::settings_changes::{ settings_change_extract, DocumentsIndentifiers, SettingsChangeExtractor, @@ -19,16 +18,16 @@ use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; use crate::update::settings::SettingsDelta; use crate::vector::settings::EmbedderAction; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::Result; pub struct DocumentsExtractor<'a, 'b> { document_sender: DocumentsSender<'a, 'b>, - embedders: &'a EmbeddingConfigs, + embedders: &'a RuntimeEmbedders, } impl<'a, 'b> DocumentsExtractor<'a, 'b> { - pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self { + pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a RuntimeEmbedders) -> Self { Self { document_sender, embedders } } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 4d308018a..3b8f5fa58 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -1,30 +1,35 @@ use std::cell::RefCell; use std::collections::BTreeMap; +use std::fmt::Debug; use bumpalo::collections::Vec as BVec; use bumpalo::Bump; use hashbrown::{DefaultHashBuilder, HashMap}; -use super::cache::DelAddRoaringBitmap; use crate::error::FaultSource; use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; -use crate::update::new::document::{DocumentContext, DocumentIdentifiers}; +use crate::update::new::document::{Document, DocumentContext, DocumentIdentifiers}; use crate::update::new::indexer::document_changes::Extractor; use crate::update::new::indexer::settings_changes::SettingsChangeExtractor; use crate::update::new::thread_local::MostlySend; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; +use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta}; use crate::vector::error::{ EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistributionBump, }; +use crate::vector::extractor::{ + DocumentTemplateExtractor, Extractor as VectorExtractor, RequestFragmentExtractor, +}; +use crate::vector::session::{EmbedSession, Input, Metadata, OnEmbed}; use crate::vector::settings::{EmbedderAction, ReindexAction}; -use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; +use crate::vector::{Embedding, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment}; use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; pub struct EmbeddingExtractor<'a, 'b> { - embedders: &'a EmbeddingConfigs, + embedders: &'a RuntimeEmbedders, sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, embedder_stats: &'a EmbedderStats, @@ -33,7 +38,7 @@ pub struct EmbeddingExtractor<'a, 'b> { impl<'a, 'b> EmbeddingExtractor<'a, 'b> { pub fn new( - embedders: &'a EmbeddingConfigs, + embedders: &'a RuntimeEmbedders, sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, embedder_stats: &'a EmbedderStats, @@ -45,7 +50,7 @@ impl<'a, 'b> 
EmbeddingExtractor<'a, 'b> {
 }
 
 pub struct EmbeddingExtractorData<'extractor>(
-    pub HashMap<String, DelAddRoaringBitmap, DefaultHashBuilder>,
+    pub HashMap<String, EmbeddingStatusDelta, DefaultHashBuilder>,
 );
 
 unsafe impl MostlySend for EmbeddingExtractorData<'_> {}
@@ -67,19 +72,18 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
             UnusedVectorsDistributionBump::new_in(&context.doc_alloc);
 
         let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc);
-        for (embedder_name, (embedder, prompt, _is_quantized)) in embedders {
-            let embedder_id =
-                context.index.embedder_category_id.get(&context.rtxn, embedder_name)?.ok_or_else(
-                    || InternalError::DatabaseMissingEntry {
-                        db_name: "embedder_category_id",
-                        key: None,
-                    },
-                )?;
+        let embedder_db = context.index.embedding_configs();
+        for (embedder_name, runtime) in embedders {
+            let embedder_info = embedder_db
+                .embedder_info(&context.rtxn, embedder_name)?
+                .ok_or_else(|| InternalError::DatabaseMissingEntry {
+                    db_name: "embedder_category_id",
+                    key: None,
+                })?;
             all_chunks.push(Chunks::new(
-                embedder,
-                embedder_id,
+                runtime,
+                embedder_info,
                 embedder_name,
-                prompt,
                 context.data,
                 &self.possible_embedding_mistakes,
                 self.embedder_stats,
@@ -94,19 +98,14 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
             match change {
                 DocumentChange::Deletion(deletion) => {
                     // vector deletion is handled by document sender,
-                    // we still need to accomodate deletion from user_provided
+                    // we still need to accommodate deletion from embedding_status
                     for chunks in &mut all_chunks {
-                        // regenerate: true means we delete from user_provided
-                        chunks.set_regenerate(deletion.docid(), true);
+                        let (is_user_provided, must_regenerate) =
+                            chunks.is_user_provided_must_regenerate(deletion.docid());
+                        chunks.clear_status(deletion.docid(), is_user_provided, must_regenerate);
                     }
                 }
                 DocumentChange::Update(update) => {
-                    let old_vectors = update.current_vectors(
-                        &context.rtxn,
-                        context.index,
-                        context.db_fields_ids_map,
-                        &context.doc_alloc,
-                    )?;
                     let new_vectors =
                         update.only_changed_vectors(&context.doc_alloc, self.embedders)?;
 
                     if let Some(new_vectors) = &new_vectors {
                     }
 
                     for chunks in &mut all_chunks {
-                        let embedder_name = chunks.embedder_name();
-                        let prompt = chunks.prompt();
+                        let (old_is_user_provided, old_must_regenerate) =
+                            chunks.is_user_provided_must_regenerate(update.docid());
 
-                        let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap();
+                        let embedder_name = chunks.embedder_name();
 
                         // case where we have a `_vectors` field in the updated document
                         if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
                             new_vectors.vectors_for_key(embedder_name).transpose()
                         }) {
                             let new_vectors = new_vectors?;
-                            if old_vectors.regenerate != new_vectors.regenerate {
-                                chunks.set_regenerate(update.docid(), new_vectors.regenerate);
-                            }
                             // do we have set embeddings?
                             if let Some(embeddings) = new_vectors.embeddings {
                                 chunks.set_vectors(
@@ -139,97 +135,62 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
                                         document_id: update.external_document_id().to_string(),
                                         error: error.to_string(),
                                     })?,
+                                    old_is_user_provided,
+                                    old_must_regenerate,
+                                    new_vectors.regenerate,
                                 )?;
                             // regenerate if the new `_vectors` fields is set to.
} else if new_vectors.regenerate { - let new_rendered = prompt.render_document( - update.external_document_id(), - update.merged( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - let must_regenerate = if !old_vectors.regenerate { - // we just enabled `regenerate` - true - } else { - let old_rendered = prompt.render_document( - update.external_document_id(), - update.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - ); - - if let Ok(old_rendered) = old_rendered { - // must regenerate if the rendered changed - new_rendered != old_rendered - } else { - // cannot check previous rendered, better regenerate - true - } - }; - - if must_regenerate { - chunks.set_autogenerated( - update.docid(), - update.external_document_id(), - new_rendered, - &unused_vectors_distribution, - )?; - } - } - // no `_vectors` field, so only regenerate if the document is already set to in the DB. - } else if old_vectors.regenerate { - let new_rendered = prompt.render_document( - update.external_document_id(), - update.merged( + let new_document = update.merged( &context.rtxn, context.index, context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - - let must_regenerate = { - let old_rendered = prompt.render_document( - update.external_document_id(), - update.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - ); - if let Ok(old_rendered) = old_rendered { - // regenerate if the rendered version changed - new_rendered != old_rendered - } else { - // if we cannot render the previous version of the documents, let's regenerate - true - } - }; - - if must_regenerate { - chunks.set_autogenerated( + )?; + let old_document = update.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?; + chunks.update_autogenerated( update.docid(), update.external_document_id(), - new_rendered, + old_document, + new_document, + context.new_fields_ids_map, &unused_vectors_distribution, + old_is_user_provided, + old_must_regenerate, + true, )?; } + // no `_vectors` field, so only regenerate if the document is already set to in the DB. 
+ } else if old_must_regenerate { + let new_document = update.merged( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?; + let old_document = update.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?; + chunks.update_autogenerated( + update.docid(), + update.external_document_id(), + old_document, + new_document, + context.new_fields_ids_map, + &unused_vectors_distribution, + old_is_user_provided, + old_must_regenerate, + true, + )?; } } } DocumentChange::Insertion(insertion) => { + let (default_is_user_provided, default_must_regenerate) = (false, true); let new_vectors = insertion.inserted_vectors(&context.doc_alloc, self.embedders)?; if let Some(new_vectors) = &new_vectors { @@ -238,13 +199,11 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { for chunks in &mut all_chunks { let embedder_name = chunks.embedder_name(); - let prompt = chunks.prompt(); // if no inserted vectors, then regenerate: true + no embeddings => autogenerate if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { new_vectors.vectors_for_key(embedder_name).transpose() }) { let new_vectors = new_vectors?; - chunks.set_regenerate(insertion.docid(), new_vectors.regenerate); if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( insertion.external_document_id(), @@ -257,33 +216,36 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { .to_string(), error: error.to_string(), })?, + default_is_user_provided, + default_must_regenerate, + new_vectors.regenerate, )?; } else if new_vectors.regenerate { - let rendered = prompt.render_document( + chunks.insert_autogenerated( + insertion.docid(), insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, - &context.doc_alloc, - )?; - chunks.set_autogenerated( - insertion.docid(), - insertion.external_document_id(), - rendered, &unused_vectors_distribution, + true, )?; + } else { + chunks.set_status( + insertion.docid(), + default_is_user_provided, + default_must_regenerate, + false, + false, + ); } } else { - let rendered = prompt.render_document( + chunks.insert_autogenerated( + insertion.docid(), insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, - &context.doc_alloc, - )?; - chunks.set_autogenerated( - insertion.docid(), - insertion.external_document_id(), - rendered, &unused_vectors_distribution, + true, )?; } } @@ -501,156 +463,74 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding } } -// **Warning**: the destructor of this struct is not normally run, make sure that all its fields: -// 1. don't have side effects tied to they destructors -// 2. if allocated, are allocated inside of the bumpalo -// -// Currently this is the case as: -// 1. BVec are inside of the bumaplo -// 2. All other fields are either trivial (u8) or references. 
-struct Chunks<'a, 'b, 'extractor> { - texts: BVec<'a, &'a str>, - ids: BVec<'a, DocumentId>, - - embedder: &'a Embedder, +pub struct OnEmbeddingDocumentUpdates<'doc, 'b> { embedder_id: u8, - embedder_name: &'a str, - dimensions: usize, - prompt: &'a Prompt, - possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - embedder_stats: &'a EmbedderStats, - user_provided: &'a RefCell>, - threads: &'a ThreadPoolNoAbort, - sender: EmbeddingSender<'a, 'b>, - has_manual_generation: Option<&'a str>, + sender: EmbeddingSender<'doc, 'b>, + possible_embedding_mistakes: &'doc PossibleEmbeddingMistakes, } -impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { - #[allow(clippy::too_many_arguments)] - pub fn new( - embedder: &'a Embedder, - embedder_id: u8, - embedder_name: &'a str, - prompt: &'a Prompt, - user_provided: &'a RefCell>, - possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - embedder_stats: &'a EmbedderStats, - threads: &'a ThreadPoolNoAbort, - sender: EmbeddingSender<'a, 'b>, - doc_alloc: &'a Bump, - ) -> Self { - let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); - let texts = BVec::with_capacity_in(capacity, doc_alloc); - let ids = BVec::with_capacity_in(capacity, doc_alloc); - let dimensions = embedder.dimensions(); - Self { - texts, - ids, - embedder, - prompt, - possible_embedding_mistakes, - embedder_stats, - threads, - sender, - embedder_id, - embedder_name, - user_provided, - has_manual_generation: None, - dimensions, - } +impl OnEmbeddingDocumentUpdates<'_, '_> { + fn clear_vectors(&self, docid: DocumentId) { + self.sender.set_vectors(docid, self.embedder_id, vec![]).unwrap(); } +} - pub fn set_autogenerated( +impl<'doc> OnEmbed<'doc> for OnEmbeddingDocumentUpdates<'doc, '_> { + type ErrorMetadata = UnusedVectorsDistributionBump<'doc>; + fn process_embedding_response( &mut self, - docid: DocumentId, - external_docid: &'a str, - rendered: &'a str, - unused_vectors_distribution: &UnusedVectorsDistributionBump, - ) -> Result<()> { - let is_manual = matches!(&self.embedder, &Embedder::UserProvided(_)); - if is_manual { - self.has_manual_generation.get_or_insert(external_docid); - } - - if self.texts.len() < self.texts.capacity() { - self.texts.push(rendered); - self.ids.push(docid); - return Ok(()); - } - - Self::embed_chunks( - &mut self.texts, - &mut self.ids, - self.embedder, - self.embedder_id, - self.embedder_name, - self.possible_embedding_mistakes, - self.embedder_stats, - unused_vectors_distribution, - self.threads, - self.sender, - self.has_manual_generation.take(), - ) + response: crate::vector::session::EmbeddingResponse<'doc>, + ) { + self.sender + .set_vector( + response.metadata.docid, + self.embedder_id, + response.metadata.extractor_id, + response.embedding, + ) + .unwrap(); } - pub fn drain( - mut self, - unused_vectors_distribution: &UnusedVectorsDistributionBump, - ) -> Result<()> { - let res = Self::embed_chunks( - &mut self.texts, - &mut self.ids, - self.embedder, - self.embedder_id, - self.embedder_name, - self.possible_embedding_mistakes, - self.embedder_stats, - unused_vectors_distribution, - self.threads, - self.sender, - self.has_manual_generation, - ); - // optimization: don't run bvec dtors as they only contain bumpalo allocated stuff - std::mem::forget(self); - res + fn process_embeddings(&mut self, metadata: Metadata<'doc>, embeddings: Vec) { + self.sender.set_vectors(metadata.docid, self.embedder_id, embeddings).unwrap(); } - #[allow(clippy::too_many_arguments)] - pub fn embed_chunks( - texts: &mut BVec<'a, 
&'a str>, - ids: &mut BVec<'a, DocumentId>, - embedder: &Embedder, - embedder_id: u8, - embedder_name: &str, - possible_embedding_mistakes: &PossibleEmbeddingMistakes, - embedder_stats: &EmbedderStats, + fn process_embedding_error( + &mut self, + error: crate::vector::hf::EmbedError, + embedder_name: &'doc str, unused_vectors_distribution: &UnusedVectorsDistributionBump, - threads: &ThreadPoolNoAbort, - sender: EmbeddingSender<'a, 'b>, - has_manual_generation: Option<&'a str>, - ) -> Result<()> { - if let Some(external_docid) = has_manual_generation { - let mut msg = format!( - r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{}", - external_docid, - if ids.len() > 1 { - format!(" and at least {} other document(s)", ids.len() - 1) - } else { - "".to_string() - } - ); - - msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); + metadata: &[Metadata<'doc>], + ) -> crate::Error { + if let FaultSource::Bug = error.fault { + crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(error.into())) + } else { + let mut msg = if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + format!( + r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{} +- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.", + if let Some(first) = metadata.first() { first.external_docid } else { "???" }, + if metadata.len() > 1 { + format!(" and at least {} other document(s)", metadata.len() - 1) + } else { + "".to_string() + } + ) + } else { + format!(r"While embedding documents for embedder `{embedder_name}`: {error}") + }; let mut hint_count = 0; - for (vector_misspelling, count) in possible_embedding_mistakes.vector_mistakes().take(2) + for (vector_misspelling, count) in + self.possible_embedding_mistakes.vector_mistakes().take(2) { msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); hint_count += 1; } - for (embedder_misspelling, count) in possible_embedding_mistakes + for (embedder_misspelling, count) in self + .possible_embedding_mistakes .embedder_mistakes_bump(embedder_name, unused_vectors_distribution) .take(2) { @@ -659,107 +539,413 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { } if hint_count == 0 { - msg += &format!( - "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" - ); - } - - return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))); - } - - let res = match embedder.embed_index_ref(texts.as_slice(), threads, embedder_stats) { - Ok(embeddings) => { - for (docid, embedding) in ids.into_iter().zip(embeddings) { - sender.set_vector(*docid, embedder_id, embedding).unwrap(); - } - Ok(()) - } - Err(error) => { - if let FaultSource::Bug = error.fault { - Err(crate::Error::InternalError(crate::InternalError::VectorEmbeddingError( - error.into(), - ))) - } else { - let mut msg = format!( - r"While embedding documents for embedder `{embedder_name}`: {error}" + if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + msg += &format!( + "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" ); - - if let EmbedErrorKind::ManualEmbed(_) = &error.kind { - msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); - 
} - - let mut hint_count = 0; - - for (vector_misspelling, count) in - possible_embedding_mistakes.vector_mistakes().take(2) - { - msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); - hint_count += 1; - } - - for (embedder_misspelling, count) in possible_embedding_mistakes - .embedder_mistakes_bump(embedder_name, unused_vectors_distribution) - .take(2) - { - msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); - hint_count += 1; - } - - if hint_count == 0 { - if let EmbedErrorKind::ManualEmbed(_) = &error.kind { - msg += &format!( - "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" - ); - } - } - - Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))) } } + + crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)) + } + } +} + +struct Chunks<'a, 'b, 'extractor> { + dimensions: usize, + status_delta: &'a RefCell>, + status: EmbeddingStatus, + kind: ChunkType<'a, 'b>, +} + +enum ChunkType<'a, 'b> { + DocumentTemplate { + document_template: &'a Prompt, + session: EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, &'a str>, + }, + Fragments { + fragments: &'a [RuntimeFragment], + session: EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, serde_json::Value>, + }, +} + +impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { + #[allow(clippy::too_many_arguments)] + pub fn new( + runtime: &'a RuntimeEmbedder, + embedder_info: EmbedderInfo, + embedder_name: &'a str, + status_delta: &'a RefCell>, + possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, + embedder_stats: &'a EmbedderStats, + threads: &'a ThreadPoolNoAbort, + sender: EmbeddingSender<'a, 'b>, + doc_alloc: &'a Bump, + ) -> Self { + let embedder = &runtime.embedder; + let dimensions = embedder.dimensions(); + + let fragments = runtime.fragments.as_slice(); + let kind = if fragments.is_empty() { + ChunkType::DocumentTemplate { + document_template: &runtime.document_template, + session: EmbedSession::new( + &runtime.embedder, + embedder_name, + threads, + doc_alloc, + embedder_stats, + OnEmbeddingDocumentUpdates { + embedder_id: embedder_info.embedder_id, + sender, + possible_embedding_mistakes, + }, + ), + } + } else { + ChunkType::Fragments { + fragments, + session: EmbedSession::new( + &runtime.embedder, + embedder_name, + threads, + doc_alloc, + embedder_stats, + OnEmbeddingDocumentUpdates { + embedder_id: embedder_info.embedder_id, + sender, + possible_embedding_mistakes, + }, + ), + } }; - texts.clear(); - ids.clear(); - res + + Self { dimensions, status: embedder_info.embedding_status, status_delta, kind } } - pub fn prompt(&self) -> &'a Prompt { - self.prompt + pub fn is_user_provided_must_regenerate(&self, docid: DocumentId) -> (bool, bool) { + self.status.is_user_provided_must_regenerate(docid) + } + + #[allow(clippy::too_many_arguments)] + pub fn update_autogenerated<'doc, OD: Document<'doc> + Debug, ND: Document<'doc> + Debug>( + &mut self, + docid: DocumentId, + external_docid: &'a str, + old_document: OD, + new_document: ND, + new_fields_ids_map: &'a RefCell, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_must_regenerate: bool, + ) -> Result<()> + where + 'a: 'doc, + { + let extracted = match &mut self.kind { + ChunkType::DocumentTemplate { document_template, session } => { + let doc_alloc = session.doc_alloc(); + let ex = 
DocumentTemplateExtractor::new( + document_template, + doc_alloc, + new_fields_ids_map, + ); + + if old_is_user_provided { + session.on_embed_mut().clear_vectors(docid); + } + + update_autogenerated( + docid, + external_docid, + [ex], + old_document, + new_document, + &external_docid, + old_must_regenerate, + session, + unused_vectors_distribution, + )? + } + ChunkType::Fragments { fragments, session } => { + let doc_alloc = session.doc_alloc(); + let extractors = fragments.iter().map(|fragment| { + RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors() + }); + + if old_is_user_provided { + session.on_embed_mut().clear_vectors(docid); + } + + update_autogenerated( + docid, + external_docid, + extractors, + old_document, + new_document, + &(), + old_must_regenerate, + session, + unused_vectors_distribution, + )? + } + }; + + self.set_status( + docid, + old_is_user_provided, + old_must_regenerate, + old_is_user_provided && !extracted, + new_must_regenerate, + ); + + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + pub fn insert_autogenerated + Debug>( + &mut self, + docid: DocumentId, + external_docid: &'a str, + new_document: D, + new_fields_ids_map: &'a RefCell, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, + new_must_regenerate: bool, + ) -> Result<()> { + let (default_is_user_provided, default_must_regenerate) = (false, true); + self.set_status( + docid, + default_is_user_provided, + default_must_regenerate, + false, + new_must_regenerate, + ); + + match &mut self.kind { + ChunkType::DocumentTemplate { document_template, session } => { + let doc_alloc = session.doc_alloc(); + let ex = DocumentTemplateExtractor::new( + document_template, + doc_alloc, + new_fields_ids_map, + ); + + insert_autogenerated( + docid, + external_docid, + [ex], + new_document, + &external_docid, + session, + unused_vectors_distribution, + )?; + } + ChunkType::Fragments { fragments, session } => { + let doc_alloc = session.doc_alloc(); + let extractors = fragments.iter().map(|fragment| { + RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors() + }); + + insert_autogenerated( + docid, + external_docid, + extractors, + new_document, + &(), + session, + unused_vectors_distribution, + )?; + } + } + Ok(()) + } + + pub fn drain(self, unused_vectors_distribution: &UnusedVectorsDistributionBump) -> Result<()> { + match self.kind { + ChunkType::DocumentTemplate { document_template: _, session } => { + session.drain(unused_vectors_distribution)?; + } + ChunkType::Fragments { fragments: _, session } => { + session.drain(unused_vectors_distribution)?; + } + } + Ok(()) } pub fn embedder_name(&self) -> &'a str { - self.embedder_name - } - - fn set_regenerate(&self, docid: DocumentId, regenerate: bool) { - let mut user_provided = self.user_provided.borrow_mut(); - let user_provided = user_provided.0.entry_ref(self.embedder_name).or_default(); - if regenerate { - // regenerate == !user_provided - user_provided.insert_del_u32(docid); - } else { - user_provided.insert_add_u32(docid); + match &self.kind { + ChunkType::DocumentTemplate { document_template: _, session } => { + session.embedder_name() + } + ChunkType::Fragments { fragments: _, session } => session.embedder_name(), } } - fn set_vectors( + fn set_status( &self, + docid: DocumentId, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_is_user_provided: bool, + new_must_regenerate: bool, + ) { + if EmbeddingStatusDelta::needs_change( + old_is_user_provided, + old_must_regenerate, + new_is_user_provided, + 
new_must_regenerate, + ) { + let mut status_delta = self.status_delta.borrow_mut(); + let status_delta = status_delta.0.entry_ref(self.embedder_name()).or_default(); + status_delta.push_delta( + docid, + old_is_user_provided, + old_must_regenerate, + new_is_user_provided, + new_must_regenerate, + ); + } + } + + pub fn clear_status(&self, docid: DocumentId, is_user_provided: bool, must_regenerate: bool) { + // these values ensure both roarings are at 0. + if EmbeddingStatusDelta::needs_clear(is_user_provided, must_regenerate) { + let mut status_delta = self.status_delta.borrow_mut(); + let status_delta = status_delta.0.entry_ref(self.embedder_name()).or_default(); + status_delta.clear_docid(docid, is_user_provided, must_regenerate); + } + } + + pub fn set_vectors( + &mut self, external_docid: &'a str, docid: DocumentId, embeddings: Vec, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_must_regenerate: bool, ) -> Result<()> { + self.set_status( + docid, + old_is_user_provided, + old_must_regenerate, + true, + new_must_regenerate, + ); for (embedding_index, embedding) in embeddings.iter().enumerate() { if embedding.len() != self.dimensions { return Err(UserError::InvalidIndexingVectorDimensions { expected: self.dimensions, found: embedding.len(), - embedder_name: self.embedder_name.to_string(), + embedder_name: self.embedder_name().to_string(), document_id: external_docid.to_string(), embedding_index, } .into()); } } - self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap(); + match &mut self.kind { + ChunkType::DocumentTemplate { document_template: _, session } => { + session.on_embed_mut().process_embeddings( + Metadata { docid, external_docid, extractor_id: 0 }, + embeddings, + ); + } + ChunkType::Fragments { fragments: _, session } => { + session.on_embed_mut().process_embeddings( + Metadata { docid, external_docid, extractor_id: 0 }, + embeddings, + ); + } + } + Ok(()) } } + +#[allow(clippy::too_many_arguments)] +fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>( + docid: DocumentId, + external_docid: &'a str, + extractors: impl IntoIterator, + old_document: OD, + new_document: ND, + meta: &E::DocumentMetadata, + old_must_regenerate: bool, + session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, +) -> Result +where + OD: Document<'doc> + Debug, + ND: Document<'doc> + Debug, + E: VectorExtractor<'a>, + E::Input: Input, + crate::Error: From, +{ + let mut extracted = false; + for extractor in extractors { + let new_rendered = extractor.extract(&new_document, meta)?; + let must_regenerate = if !old_must_regenerate { + // we just enabled `regenerate` + true + } else { + let old_rendered = extractor.extract(&old_document, meta); + + if let Ok(old_rendered) = old_rendered { + // must regenerate if the rendered changed + new_rendered != old_rendered + } else { + // cannot check previous rendered, better regenerate + true + } + }; + + if must_regenerate { + extracted = true; + let metadata = + Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; + + if let Some(new_rendered) = new_rendered { + session.request_embedding(metadata, new_rendered, unused_vectors_distribution)?
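+                // (the rendered input is only queued here; the embedder itself is invoked in batches when the session fills up or is drained)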
+ } else { + // remove any existing embedding + OnEmbed::process_embedding_response( + session.on_embed_mut(), + crate::vector::session::EmbeddingResponse { metadata, embedding: None }, + ); + } + } + } + + Ok(extracted) +} + +fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>( + docid: DocumentId, + external_docid: &'a str, + extractors: impl IntoIterator, + new_document: D, + meta: &E::DocumentMetadata, + session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, +) -> Result<()> +where + E: VectorExtractor<'a>, + E::Input: Input, + crate::Error: From, +{ + for extractor in extractors { + let new_rendered = extractor.extract(&new_document, meta)?; + + if let Some(new_rendered) = new_rendered { + session.request_embedding( + Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }, + new_rendered, + unused_vectors_distribution, + )?; + } + } + + Ok(()) +} diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index bb275d8aa..a3e7842c2 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -13,21 +13,17 @@ use super::super::thread_local::{FullySend, ThreadLocal}; use super::super::FacetFieldIdsDelta; use super::document_changes::{extract, DocumentChanges, IndexingContext}; use super::settings_changes::settings_change_extract; -use crate::documents::FieldIdMapper; -use crate::documents::PrimaryKey; -use crate::index::IndexEmbeddingConfig; -use crate::progress::EmbedderStats; -use crate::progress::MergingWordCache; +use crate::documents::{FieldIdMapper, PrimaryKey}; +use crate::progress::{EmbedderStats, MergingWordCache}; use crate::proximity::ProximityPrecision; use crate::update::new::extract::EmbeddingExtractor; use crate::update::new::indexer::settings_changes::DocumentsIndentifiers; use crate::update::new::merger::merge_and_send_rtree; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::SettingsDelta; -use crate::vector::EmbeddingConfigs; -use crate::Index; -use crate::InternalError; -use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; +use crate::vector::db::IndexEmbeddingConfig; +use crate::vector::RuntimeEmbedders; +use crate::{Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; #[allow(clippy::too_many_arguments)] pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( @@ -35,7 +31,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( indexing_context: IndexingContext, indexer_span: Span, extractor_sender: ExtractorBbqueueSender, - embedders: &EmbeddingConfigs, + embedders: &RuntimeEmbedders, extractor_allocs: &'extractor mut ThreadLocal>, finished_extraction: &AtomicBool, field_distribution: &mut BTreeMap, @@ -275,14 +271,19 @@ where let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors"); let _entered = span.enter(); + let embedder_configs = index.embedding_configs(); for config in &mut index_embeddings { + let mut infos = embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap(); + 'data: for data in datastore.iter_mut() { let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { + let Some(delta) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided, modified_docids); + delta.apply_to(&mut infos.embedding_status); } + + 
extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap(); } } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 0efef48fd..507d1a650 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -24,7 +24,7 @@ use crate::progress::{EmbedderStats, Progress}; use crate::update::settings::SettingsDelta; use crate::update::GrenadParameters; use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; -use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs}; +use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort}; pub(crate) mod de; @@ -54,7 +54,7 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>( new_fields_ids_map: FieldsIdsMap, new_primary_key: Option>, document_changes: &DC, - embedders: EmbeddingConfigs, + embedders: RuntimeEmbedders, must_stop_processing: &'indexer MSP, progress: &'indexer Progress, embedder_stats: &'indexer EmbedderStats, @@ -93,7 +93,7 @@ where grenad_parameters: &grenad_parameters, }; - let index_embeddings = index.embedding_configs(wtxn)?; + let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?; let mut modified_docids = roaring::RoaringBitmap::new(); @@ -133,20 +133,21 @@ where let arroy_writers: Result> = embedders .inner_as_ref() .iter() - .map(|(embedder_name, (embedder, _, was_quantized))| { - let embedder_index = index.embedder_category_id.get(wtxn, embedder_name)?.ok_or( - InternalError::DatabaseMissingEntry { + .map(|(embedder_name, runtime)| { + let embedder_index = index + .embedding_configs() + .embedder_id(wtxn, embedder_name)? + .ok_or(InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None, - }, - )?; + })?; - let dimensions = embedder.dimensions(); - let writer = ArroyWrapper::new(vector_arroy, embedder_index, *was_quantized); + let dimensions = runtime.embedder.dimensions(); + let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized); Ok(( embedder_index, - (embedder_name.as_str(), embedder.as_ref(), writer, dimensions), + (embedder_name.as_str(), &*runtime.embedder, writer, dimensions), )) }) .collect(); diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index fa48ff589..b8e3685f8 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -11,11 +11,11 @@ use super::super::channel::*; use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; -use crate::index::IndexEmbeddingConfig; use crate::progress::Progress; use crate::update::settings::InnerIndexSettings; +use crate::vector::db::IndexEmbeddingConfig; use crate::vector::settings::EmbedderAction; -use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings}; +use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders}; use crate::{Error, Index, InternalError, Result, UserError}; pub fn write_to_db( @@ -64,6 +64,14 @@ pub fn write_to_db( writer.del_items(wtxn, *dimensions, docid)?; writer.add_items(wtxn, docid, &embeddings)?; } + ReceiverAction::LargeVector( + large_vector @ LargeVector { docid, embedder_id, extractor_id, .. 
}, + ) => { + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let embedding = large_vector.read_embedding(*dimensions); + writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?; + } } // Every time there is a message in the channel we search @@ -137,7 +145,7 @@ where )?; } - index.put_embedding_configs(wtxn, index_embeddings)?; + index.embedding_configs().put_embedding_configs(wtxn, index_embeddings)?; Ok(()) } @@ -147,7 +155,7 @@ pub(super) fn update_index( wtxn: &mut RwTxn<'_>, new_fields_ids_map: FieldIdMapWithMetadata, new_primary_key: Option>, - embedders: EmbeddingConfigs, + embedders: RuntimeEmbedders, field_distribution: std::collections::BTreeMap, document_ids: roaring::RoaringBitmap, ) -> Result<()> { @@ -226,14 +234,36 @@ pub fn write_from_bbqueue( arroy_writers.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); - if embeddings.append(all_embeddings.to_vec()).is_err() { - return Err(Error::UserError(UserError::InvalidVectorDimensions { - expected: *dimensions, - found: all_embeddings.len(), - })); - } writer.del_items(wtxn, *dimensions, docid)?; - writer.add_items(wtxn, docid, &embeddings)?; + if !all_embeddings.is_empty() { + if embeddings.append(all_embeddings.to_vec()).is_err() { + return Err(Error::UserError(UserError::InvalidVectorDimensions { + expected: *dimensions, + found: all_embeddings.len(), + })); + } + writer.add_items(wtxn, docid, &embeddings)?; + } + } + EntryHeader::ArroySetVector( + asv @ ArroySetVector { docid, embedder_id, extractor_id, .. }, + ) => { + let frame = frame_with_header.frame(); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding); + + if embedding.is_empty() { + writer.del_item_in_store(wtxn, docid, extractor_id, *dimensions)?; + } else { + if embedding.len() != *dimensions { + return Err(Error::UserError(UserError::InvalidVectorDimensions { + expected: *dimensions, + found: embedding.len(), + })); + } + writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?; + } } } } diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index a52dab6a1..b59984248 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -12,9 +12,9 @@ use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions}; use super::indexer::de::DeserrRawValue; use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::documents::FieldIdMapper; -use crate::index::IndexEmbeddingConfig; +use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig}; use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors}; -use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs}; +use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders}; use crate::{DocumentId, Index, InternalError, Result, UserError}; #[derive(Serialize)] @@ -109,7 +109,7 @@ impl<'t> VectorDocumentFromDb<'t> { None => None, }; - let embedding_config = index.embedding_configs(rtxn)?; + let embedding_config = index.embedding_configs().embedding_configs(rtxn)?; Ok(Some(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc })) } @@ -118,6 +118,7 @@ impl<'t> VectorDocumentFromDb<'t> { &self, embedder_id:
u8, config: &IndexEmbeddingConfig, + status: &EmbeddingStatus, ) -> Result> { let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized()); @@ -126,7 +127,7 @@ impl<'t> VectorDocumentFromDb<'t> { Ok(VectorEntry { has_configured_embedder: true, embeddings: Some(Embeddings::FromDb(vectors)), - regenerate: !config.user_provided.contains(self.docid), + regenerate: status.must_regenerate(self.docid), implicit: false, }) } @@ -137,9 +138,9 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { self.embedding_config .iter() .map(|config| { - let embedder_id = - self.index.embedder_category_id.get(self.rtxn, &config.name)?.unwrap(); - let entry = self.entry_from_db(embedder_id, config)?; + let info = + self.index.embedding_configs().embedder_info(self.rtxn, &config.name)?.unwrap(); + let entry = self.entry_from_db(info.embedder_id, config, &info.embedding_status)?; let config_name = self.doc_alloc.alloc_str(config.name.as_str()); Ok((&*config_name, entry)) }) @@ -156,11 +157,11 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { } fn vectors_for_key(&self, key: &str) -> Result>> { - Ok(match self.index.embedder_category_id.get(self.rtxn, key)? { - Some(embedder_id) => { + Ok(match self.index.embedding_configs().embedder_info(self.rtxn, key)? { + Some(info) => { let config = self.embedding_config.iter().find(|config| config.name == key).unwrap(); - Some(self.entry_from_db(embedder_id, config)?) + Some(self.entry_from_db(info.embedder_id, config, &info.embedding_status)?) } None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) { Some(embedding_from_doc) => { @@ -222,7 +223,7 @@ fn entry_from_raw_value( pub struct VectorDocumentFromVersions<'doc> { external_document_id: &'doc str, vectors: RawMap<'doc, FxBuildHasher>, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, } impl<'doc> VectorDocumentFromVersions<'doc> { @@ -230,7 +231,7 @@ impl<'doc> VectorDocumentFromVersions<'doc> { external_document_id: &'doc str, versions: &Versions<'doc>, bump: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result> { let document = DocumentFromVersions::new(versions); if let Some(vectors_field) = document.vectors_field()? { @@ -283,7 +284,7 @@ impl<'doc> MergedVectorDocument<'doc> { db_fields_ids_map: &'doc Mapper, versions: &Versions<'doc>, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result> { let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?; let new_doc = @@ -295,7 +296,7 @@ impl<'doc> MergedVectorDocument<'doc> { external_document_id: &'doc str, versions: &Versions<'doc>, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result> { let Some(new_doc) = VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)? 
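The `session.rs` hunk below threads an `EmbedderStats` handle through `EmbedSession`, so that every batched call into an embedder is counted and its most recent error is recorded. A minimal sketch of what such a handle can look like, under assumptions: the field and method names below are illustrative, not the exact definitions in `crates/milli/src/progress.rs`.

    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::RwLock;

    /// Thread-safe counters updated by embedding requests as they complete.
    #[derive(Default)]
    pub struct EmbedderStats {
        /// Total number of requests sent to the embedder.
        pub total_count: AtomicUsize,
        /// Most recent error message, if any, and the number of failed requests.
        pub errors: RwLock<(Option<String>, u32)>,
    }

    impl EmbedderStats {
        /// Record the outcome of a single embedding request.
        pub fn on_response(&self, res: Result<(), String>) {
            self.total_count.fetch_add(1, Ordering::Relaxed);
            if let Err(msg) = res {
                let mut errors = self.errors.write().unwrap();
                errors.0 = Some(msg);
                errors.1 += 1;
            }
        }
    }

Because all updates go through `&self`, a single `Arc<EmbedderStats>` can be shared between the indexing threads that call the embedder and the progress reporter that reads the counters.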
diff --git a/crates/milli/src/vector/session.rs b/crates/milli/src/vector/session.rs index b6f229779..dd005e993 100644 --- a/crates/milli/src/vector/session.rs +++ b/crates/milli/src/vector/session.rs @@ -3,6 +3,7 @@ use bumpalo::Bump; use serde_json::Value; use super::{EmbedError, Embedder, Embedding}; +use crate::progress::EmbedderStats; use crate::{DocumentId, Result, ThreadPoolNoAbort}; type ExtractorId = u8; @@ -43,6 +44,8 @@ pub struct EmbedSession<'doc, C, I> { embedder_name: &'doc str, + embedder_stats: &'doc EmbedderStats, + on_embed: C, } @@ -51,6 +54,7 @@ pub trait Input: Sized { inputs: &[Self], embedder: &Embedder, threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, ) -> std::result::Result, EmbedError>; } @@ -59,8 +63,9 @@ impl Input for &'_ str { inputs: &[Self], embedder: &Embedder, threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, ) -> std::result::Result, EmbedError> { - embedder.embed_index_ref(inputs, threads) + embedder.embed_index_ref(inputs, threads, embedder_stats) } } @@ -69,8 +74,9 @@ impl Input for Value { inputs: &[Value], embedder: &Embedder, threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, ) -> std::result::Result, EmbedError> { - embedder.embed_index_ref_fragments(inputs, threads) + embedder.embed_index_ref_fragments(inputs, threads, embedder_stats) } } @@ -81,12 +87,21 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> { embedder_name: &'doc str, threads: &'doc ThreadPoolNoAbort, doc_alloc: &'doc Bump, + embedder_stats: &'doc EmbedderStats, on_embed: C, ) -> Self { let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); let texts = BVec::with_capacity_in(capacity, doc_alloc); let ids = BVec::with_capacity_in(capacity, doc_alloc); - Self { inputs: texts, metadata: ids, embedder, threads, embedder_name, on_embed } + Self { + inputs: texts, + metadata: ids, + embedder, + threads, + embedder_name, + embedder_stats, + on_embed, + } } pub fn request_embedding( @@ -114,7 +129,12 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> { if self.inputs.is_empty() { return Ok(()); } - let res = match I::embed_ref(self.inputs.as_slice(), self.embedder, self.threads) { + let res = match I::embed_ref( + self.inputs.as_slice(), + self.embedder, + self.threads, + self.embedder_stats, + ) { Ok(embeddings) => { for (metadata, embedding) in self.metadata.iter().copied().zip(embeddings) { self.on_embed.process_embedding_response(EmbeddingResponse { From cab5e35ff7b133b0743a852a76fdeac92c4b3f3f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:01:05 +0200 Subject: [PATCH 105/150] Implement in old settings indexer and old dump import indexer --- .../extract/extract_vector_points.rs | 771 ++++++++++++++---- .../src/update/index_documents/extract/mod.rs | 53 +- .../milli/src/update/index_documents/mod.rs | 100 ++- .../src/update/index_documents/transform.rs | 41 +- .../src/update/index_documents/typed_chunk.rs | 93 ++- crates/milli/src/vector/parsed_vectors.rs | 12 +- 6 files changed, 824 insertions(+), 246 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index e1981a615..0a179cfa5 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -1,4 +1,5 @@ use std::cmp::Ordering; +use std::collections::{BTreeMap, VecDeque}; use 
std::convert::{TryFrom, TryInto}; use std::fs::File; use std::io::{self, BufReader, BufWriter}; @@ -6,25 +7,29 @@ use std::mem::size_of; use std::str::from_utf8; use std::sync::Arc; +use bumpalo::Bump; use bytemuck::cast_slice; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use grenad::Writer; +use obkv::KvReaderU16; use ordered_float::OrderedFloat; -use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::error::FaultSource; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; -use crate::index::IndexEmbeddingConfig; use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; +use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta}; use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution}; +use crate::vector::extractor::{Extractor, ExtractorDiff, RequestFragmentExtractor}; use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState}; +use crate::vector::session::{EmbedSession, Metadata, OnEmbed}; use crate::vector::settings::ReindexAction; -use crate::vector::{Embedder, Embedding}; +use crate::vector::{Embedder, Embedding, RuntimeEmbedder, RuntimeFragment}; use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. @@ -37,12 +42,13 @@ pub struct ExtractedVectorPoints { pub remove_vectors: grenad::Reader>, // docid -> prompt pub prompts: grenad::Reader>, + // docid, extractor_id -> Option + pub inputs: grenad::Reader>, // embedder pub embedder_name: String, - pub embedder: Arc, - pub add_to_user_provided: RoaringBitmap, - pub remove_from_user_provided: RoaringBitmap, + pub runtime: Arc, + pub embedding_status_delta: EmbeddingStatusDelta, } enum VectorStateDelta { @@ -56,46 +62,74 @@ enum VectorStateDelta { // Remove any previous vector // Note: changing the value of the prompt **does require** recording this delta NowGenerated(String), + + // Add and remove the vectors computed from the fragments. + UpdateGeneratedFromFragments(Vec<(String, ExtractorDiff)>), + + /// Wasn't generated from fragments, but now is. 
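+    /// (for instance because fragments were just added to an embedder that previously had none)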
+ /// Delete any previous vectors and add the new vectors + NowGeneratedFromFragments(Vec<(String, Value)>), } impl VectorStateDelta { - fn into_values(self) -> (bool, String, Vec>) { + fn into_values(self) -> (bool, String, BTreeMap>, Vec>) { match self { VectorStateDelta::NoChange => Default::default(), - VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), - // We always delete the previous vectors - VectorStateDelta::NowManual(add) => (true, Default::default(), add), - VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), + VectorStateDelta::NowRemoved => { + (true, Default::default(), Default::default(), Default::default()) + } + VectorStateDelta::NowManual(add) => (true, Default::default(), Default::default(), add), + VectorStateDelta::NowGenerated(prompt) => { + (true, prompt, Default::default(), Default::default()) + } + VectorStateDelta::UpdateGeneratedFromFragments(fragments) => ( + false, + Default::default(), + ExtractorDiff::into_list_of_changes(fragments), + Default::default(), + ), + VectorStateDelta::NowGeneratedFromFragments(items) => ( + true, + Default::default(), + ExtractorDiff::into_list_of_changes( + items.into_iter().map(|(name, value)| (name, ExtractorDiff::Added(value))), + ), + Default::default(), + ), } } } -struct EmbedderVectorExtractor { +struct EmbedderVectorExtractor<'a> { embedder_name: String, - embedder: Arc, - prompt: Arc, + embedder_info: &'a EmbedderInfo, + runtime: Arc, // (docid) -> (prompt) prompts_writer: Writer>, + // (docid, extractor_id) -> (Option) + inputs_writer: Writer>, // (docid) -> () remove_vectors_writer: Writer>, // (docid, _index) -> KvWriterDelAdd -> Vector manual_vectors_writer: Writer>, - // The docids of the documents that contains a user defined embedding - add_to_user_provided: RoaringBitmap, + embedding_status_delta: EmbeddingStatusDelta, action: ExtractionAction, } -struct DocumentOperation { - // The docids of the documents that contains an auto-generated embedding - remove_from_user_provided: RoaringBitmap, -} - enum ExtractionAction { SettingsFullReindex, - SettingsRegeneratePrompts { old_prompt: Arc }, - DocumentOperation(DocumentOperation), + SettingsRegeneratePrompts { + old_runtime: Arc, + }, + /// List of fragments to update/add + SettingsRegenerateFragments { + // name and indices, respectively in old and new runtime, of the fragments to examine. 
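+        // a fragment that exists only in the new runtime carries `None` as its old index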
+ must_regenerate_fragments: BTreeMap, usize)>, + old_runtime: Arc, + }, + DocumentOperation, } struct ManualEmbedderErrors { @@ -183,8 +217,8 @@ impl ManualEmbedderErrors { pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, - embedders_configs: &[IndexEmbeddingConfig], settings_diff: &InnerIndexSettingsDiff, + embedder_info: &[(String, EmbedderInfo)], possible_embedding_mistakes: &PossibleEmbeddingMistakes, ) -> Result<(Vec, UnusedVectorsDistribution)> { let mut unused_vectors_distribution = UnusedVectorsDistribution::new(); @@ -204,13 +238,13 @@ pub fn extract_vector_points( let mut configs = settings_diff.new.embedding_configs.clone().into_inner(); let old_configs = &settings_diff.old.embedding_configs; - if reindex_vectors { for (name, action) in settings_diff.embedding_config_updates.iter() { if let Some(action) = action.reindex() { - let Some((embedder_name, (embedder, prompt, _quantized))) = - configs.remove_entry(name) - else { + let (_, embedder_info) = + embedder_info.iter().find(|(embedder_name, _)| embedder_name == name).unwrap(); + + let Some((embedder_name, runtime)) = configs.remove_entry(name) else { tracing::error!(embedder = name, "Requested embedder config not found"); continue; }; @@ -229,6 +263,12 @@ pub fn extract_vector_points( tempfile::tempfile()?, ); + let inputs_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + // (docid) -> () let remove_vectors_writer = create_writer( indexer.chunk_compression_type, @@ -238,24 +278,66 @@ pub fn extract_vector_points( let action = match action { ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex, - ReindexAction::RegeneratePrompts => { - let Some((_, old_prompt, _quantized)) = old_configs.get(name) else { + ReindexAction::RegenerateFragments(regenerate_fragments) => { + let Some(old_runtime) = old_configs.get(name) else { tracing::error!(embedder = name, "Old embedder config not found"); continue; }; - ExtractionAction::SettingsRegeneratePrompts { old_prompt } + let fragments = regenerate_fragments + .iter() + .filter_map(|(name, fragment)| match fragment { + crate::vector::settings::RegenerateFragment::Update => { + let old_value = old_runtime + .fragments + .binary_search_by_key(&name, |fragment| &fragment.name) + .ok(); + let Ok(new_value) = runtime + .fragments + .binary_search_by_key(&name, |fragment| &fragment.name) + else { + return None; + }; + Some((name.clone(), (old_value, new_value))) + } + // was already handled in transform + crate::vector::settings::RegenerateFragment::Remove => None, + crate::vector::settings::RegenerateFragment::Add => { + let Ok(new_value) = runtime + .fragments + .binary_search_by_key(&name, |fragment| &fragment.name) + else { + return None; + }; + Some((name.clone(), (None, new_value))) + } + }) + .collect(); + ExtractionAction::SettingsRegenerateFragments { + old_runtime, + must_regenerate_fragments: fragments, + } + } + + ReindexAction::RegeneratePrompts => { + let Some(old_runtime) = old_configs.get(name) else { + tracing::error!(embedder = name, "Old embedder config not found"); + continue; + }; + + ExtractionAction::SettingsRegeneratePrompts { old_runtime } } }; extractors.push(EmbedderVectorExtractor { embedder_name, - embedder, - prompt, + runtime, + embedder_info, prompts_writer, + inputs_writer, remove_vectors_writer, manual_vectors_writer, - add_to_user_provided: RoaringBitmap::new(), + embedding_status_delta: Default::default(), action, }); } else { @@ 
-264,8 +346,12 @@ pub fn extract_vector_points( } } else { // document operation + for (embedder_name, runtime) in configs.into_iter() { + let (_, embedder_info) = embedder_info + .iter() + .find(|(name, _)| embedder_name.as_str() == name.as_str()) + .unwrap(); - for (embedder_name, (embedder, prompt, _quantized)) in configs.into_iter() { // (docid, _index) -> KvWriterDelAdd -> Vector let manual_vectors_writer = create_writer( indexer.chunk_compression_type, @@ -280,6 +366,12 @@ pub fn extract_vector_points( tempfile::tempfile()?, ); + let inputs_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + // (docid) -> () let remove_vectors_writer = create_writer( indexer.chunk_compression_type, @@ -289,22 +381,23 @@ pub fn extract_vector_points( extractors.push(EmbedderVectorExtractor { embedder_name, - embedder, - prompt, + runtime, + embedder_info, prompts_writer, + inputs_writer, remove_vectors_writer, manual_vectors_writer, - add_to_user_provided: RoaringBitmap::new(), - action: ExtractionAction::DocumentOperation(DocumentOperation { - remove_from_user_provided: RoaringBitmap::new(), - }), + embedding_status_delta: Default::default(), + action: ExtractionAction::DocumentOperation, }); } } let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; + let mut doc_alloc = Bump::new(); while let Some((key, value)) = cursor.move_on_next()? { + doc_alloc.reset(); // this must always be serialized as (docid, external_docid); const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::(); let (docid_bytes, external_id_bytes) = @@ -320,9 +413,12 @@ pub fn extract_vector_points( // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; + let regenerate_for_embedders = embedder_info + .iter() + .filter(|&(_, infos)| infos.embedding_status.must_regenerate(docid)) + .map(|(name, _)| name.clone()); let mut parsed_vectors = ParsedVectorsDiff::new( - docid, - embedders_configs, + regenerate_for_embedders, obkv, old_vectors_fid, new_vectors_fid, @@ -331,44 +427,40 @@ pub fn extract_vector_points( for EmbedderVectorExtractor { embedder_name, - embedder, - prompt, + runtime, + embedder_info, prompts_writer, + inputs_writer, remove_vectors_writer, manual_vectors_writer, - add_to_user_provided, + embedding_status_delta, action, } in extractors.iter_mut() { - let embedder_is_manual = matches!(**embedder, Embedder::UserProvided(_)); + let embedder_is_manual = matches!(*runtime.embedder, Embedder::UserProvided(_)); let (old, new) = parsed_vectors.remove(embedder_name); + let new_must_regenerate = new.must_regenerate(); let delta = match action { ExtractionAction::SettingsFullReindex => match old { // A full reindex can be triggered either by: // 1. a new embedder // 2. an existing embedder changed so that it must regenerate all generated embeddings. 
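                        // In both cases, user-provided embeddings are never regenerated: the `Inline` arm extracts them manually and the `Manual` arm leaves them untouched.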
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB - VectorState::Inline(vectors) => { - if !vectors.must_regenerate() { - add_to_user_provided.insert(docid); - } - - match vectors.into_array_of_vectors() { - Some(add_vectors) => { - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError( - crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ), - )); - } - VectorStateDelta::NowManual(add_vectors) + VectorState::Inline(vectors) => match vectors.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError( + crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ), + )); } - None => VectorStateDelta::NoChange, + VectorStateDelta::NowManual(add_vectors) } - } + None => VectorStateDelta::NoChange, + }, // this happens only when an existing embedder changed. We cannot regenerate userProvided vectors VectorState::Manual => VectorStateDelta::NoChange, // generated vectors must be regenerated @@ -381,11 +473,79 @@ pub fn extract_vector_points( ); continue; } - regenerate_prompt(obkv, prompt, new_fields_ids_map)? + let has_fragments = !runtime.fragments.is_empty(); + + if has_fragments { + regenerate_all_fragments( + &runtime.fragments, + &doc_alloc, + new_fields_ids_map, + obkv, + ) + } else { + regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)? + } } }, + ExtractionAction::SettingsRegenerateFragments { + must_regenerate_fragments, + old_runtime, + } => { + if old.must_regenerate() { + let has_fragments = !runtime.fragments.is_empty(); + let old_has_fragments = !old_runtime.fragments.is_empty(); + + let is_adding_fragments = has_fragments && !old_has_fragments; + + if is_adding_fragments { + regenerate_all_fragments( + &runtime.fragments, + &doc_alloc, + new_fields_ids_map, + obkv, + ) + } else if !has_fragments { + // removing fragments + regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)? 
+ } else { + let mut fragment_diff = Vec::new(); + let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); + + let obkv_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Addition, + new_fields_ids_map, + ); + for (name, (old_index, new_index)) in must_regenerate_fragments { + let Some(new) = runtime.fragments.get(*new_index) else { continue }; + + let new = + RequestFragmentExtractor::new(new, &doc_alloc).ignore_errors(); + + let diff = { + let old = old_index.as_ref().and_then(|old| { + let old = old_runtime.fragments.get(*old)?; + Some( + RequestFragmentExtractor::new(old, &doc_alloc) + .ignore_errors(), + ) + }); + let old = old.as_ref(); + Extractor::diff_settings(&new, &obkv_document, &(), old) + } + .expect("ignoring errors so this cannot fail"); + fragment_diff.push((name.clone(), diff)); + } + VectorStateDelta::UpdateGeneratedFromFragments(fragment_diff) + } + } else { + // we can simply ignore user provided vectors as they are not regenerated and are + // already in the DB since this is an existing embedder + VectorStateDelta::NoChange + } + } // prompt regeneration is only triggered for existing embedders - ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { + ExtractionAction::SettingsRegeneratePrompts { old_runtime } => { if old.must_regenerate() { if embedder_is_manual { ManualEmbedderErrors::push_error( @@ -395,24 +555,32 @@ pub fn extract_vector_points( ); continue; } - regenerate_if_prompt_changed( - obkv, - (old_prompt, prompt), - (old_fields_ids_map, new_fields_ids_map), - )? + let has_fragments = !runtime.fragments.is_empty(); + + if has_fragments { + regenerate_all_fragments( + &runtime.fragments, + &doc_alloc, + new_fields_ids_map, + obkv, + ) + } else { + regenerate_if_prompt_changed( + obkv, + (&old_runtime.document_template, &runtime.document_template), + (old_fields_ids_map, new_fields_ids_map), + )? 
+ } } else { // we can simply ignore user provided vectors as they are not regenerated and are // already in the DB since this is an existing embedder VectorStateDelta::NoChange } } - ExtractionAction::DocumentOperation(DocumentOperation { - remove_from_user_provided, - }) => extract_vector_document_diff( - docid, + ExtractionAction::DocumentOperation => extract_vector_document_diff( obkv, - prompt, - (add_to_user_provided, remove_from_user_provided), + runtime, + &doc_alloc, (old, new), (old_fields_ids_map, new_fields_ids_map), document_id, @@ -421,13 +589,25 @@ pub fn extract_vector_points( &mut manual_errors, )?, }; + + // update the embedding status + push_embedding_status_delta( + embedding_status_delta, + docid, + &delta, + new_must_regenerate, + &embedder_info.embedding_status, + ); + // and we finally push the unique vectors into the writer push_vectors_diff( remove_vectors_writer, prompts_writer, + inputs_writer, manual_vectors_writer, &mut key_buffer, delta, + &runtime.fragments, )?; } @@ -444,45 +624,65 @@ pub fn extract_vector_points( for EmbedderVectorExtractor { embedder_name, - embedder, - prompt: _, + runtime, + embedder_info: _, prompts_writer, + inputs_writer, remove_vectors_writer, - action, + action: _, manual_vectors_writer, - add_to_user_provided, + embedding_status_delta, } in extractors { - let remove_from_user_provided = - if let ExtractionAction::DocumentOperation(DocumentOperation { - remove_from_user_provided, - }) = action - { - remove_from_user_provided - } else { - Default::default() - }; - results.push(ExtractedVectorPoints { manual_vectors: writer_into_reader(manual_vectors_writer)?, remove_vectors: writer_into_reader(remove_vectors_writer)?, prompts: writer_into_reader(prompts_writer)?, - embedder, + inputs: writer_into_reader(inputs_writer)?, + runtime, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, }) } Ok((results, unused_vectors_distribution)) } +fn push_embedding_status_delta( + embedding_status_delta: &mut EmbeddingStatusDelta, + docid: DocumentId, + delta: &VectorStateDelta, + new_must_regenerate: bool, + embedding_status: &EmbeddingStatus, +) { + let (old_is_user_provided, old_must_regenerate) = + embedding_status.is_user_provided_must_regenerate(docid); + let new_is_user_provided = match delta { + VectorStateDelta::NoChange => old_is_user_provided, + VectorStateDelta::NowRemoved => { + embedding_status_delta.clear_docid(docid, old_is_user_provided, old_must_regenerate); + return; + } + VectorStateDelta::NowManual(_) => true, + VectorStateDelta::NowGenerated(_) + | VectorStateDelta::UpdateGeneratedFromFragments(_) + | VectorStateDelta::NowGeneratedFromFragments(_) => false, + }; + + embedding_status_delta.push_delta( + docid, + old_is_user_provided, + old_must_regenerate, + new_is_user_provided, + new_must_regenerate, + ); +} + #[allow(clippy::too_many_arguments)] // feel free to find efficient way to factor arguments fn extract_vector_document_diff( - docid: DocumentId, obkv: &obkv::KvReader, - prompt: &Prompt, - (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), + runtime: &RuntimeEmbedder, + doc_alloc: &Bump, (old, new): (VectorState, VectorState), (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), document_id: impl Fn() -> Value, @@ -490,16 +690,6 @@ fn extract_vector_document_diff( embedder_is_manual: bool, manual_errors: &mut Option, ) -> Result { - match (old.must_regenerate(), new.must_regenerate()) { - (true, 
true) | (false, false) => {} - (true, false) => { - add_to_user_provided.insert(docid); - } - (false, true) => { - remove_from_user_provided.insert(docid); - } - } - let delta = match (old, new) { // regardless of the previous state, if a document now contains inline _vectors, they must // be extracted manually @@ -530,19 +720,52 @@ fn extract_vector_document_diff( ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id); return Ok(VectorStateDelta::NoChange); } - // Don't give up if the old prompt was failing - let old_prompt = Some(&prompt).map(|p| { - p.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) - .unwrap_or_default() - }); - let new_prompt = - prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt.as_ref() != Some(&new_prompt) { - let old_prompt = old_prompt.unwrap_or_default(); - tracing::trace!( - "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" - ); - VectorStateDelta::NowGenerated(new_prompt) + let has_fragments = !runtime.fragments.is_empty(); + if has_fragments { + let prompt = &runtime.document_template; + // Don't give up if the old prompt was failing + let old_prompt = Some(&prompt).map(|p| { + p.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) + .unwrap_or_default() + }); + let new_prompt = + prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); + tracing::trace!( + "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" + ); + VectorStateDelta::NowGenerated(new_prompt) + } else { + let mut fragment_diff = Vec::new(); + let old_fields_ids_map = old_fields_ids_map.as_fields_ids_map(); + let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); + + let old_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Deletion, + old_fields_ids_map, + ); + + let new_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Addition, + new_fields_ids_map, + ); + + for new in &runtime.fragments { + let name = &new.name; + let fragment = + RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); + + let diff = fragment + .diff_documents(&old_document, &new_document, &()) + .expect("ignoring errors so this cannot fail"); + + fragment_diff.push((name.clone(), diff)); + } + VectorStateDelta::UpdateGeneratedFromFragments(fragment_diff) + } } else { tracing::trace!("⏭️ Prompt unmodified, skipping"); VectorStateDelta::NoChange @@ -567,15 +790,25 @@ fn extract_vector_document_diff( ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id); return Ok(VectorStateDelta::NoChange); } - // becomes autogenerated - VectorStateDelta::NowGenerated(prompt.render_kvdeladd( - obkv, - DelAdd::Addition, - new_fields_ids_map, - )?) + + let has_fragments = !runtime.fragments.is_empty(); + + if has_fragments { + regenerate_all_fragments( + &runtime.fragments, + doc_alloc, + new_fields_ids_map, + obkv, + ) + } else { + // becomes autogenerated + VectorStateDelta::NowGenerated(runtime.document_template.render_kvdeladd( + obkv, + DelAdd::Addition, + new_fields_ids_map, + )?) 
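+                        // an embedder regenerates either from its request fragments or from its document template, never both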
+ } } else { - // make sure the document is always removed from user provided on removal - remove_from_user_provided.insert(docid); VectorStateDelta::NowRemoved } } @@ -593,8 +826,6 @@ fn extract_vector_document_diff( // then they are user-provided and nothing possibly changed VectorStateDelta::NoChange } else { - // make sure the document is always removed from user provided on removal - remove_from_user_provided.insert(docid); VectorStateDelta::NowRemoved } } @@ -629,16 +860,45 @@ fn regenerate_prompt( Ok(VectorStateDelta::NowGenerated(prompt)) } +fn regenerate_all_fragments<'a>( + fragments: impl IntoIterator, + doc_alloc: &Bump, + new_fields_ids_map: &FieldIdMapWithMetadata, + obkv: &KvReaderU16, +) -> VectorStateDelta { + let mut fragment_diff = Vec::new(); + let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); + + let obkv_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Addition, + new_fields_ids_map, + ); + for new in fragments { + let name = &new.name; + let new = RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); + + let diff = + { new.extract(&obkv_document, &()) }.expect("ignoring errors so this cannot fail"); + if let Some(value) = diff { + fragment_diff.push((name.clone(), value)); + } + } + VectorStateDelta::NowGeneratedFromFragments(fragment_diff) +} + /// We cannot compute the diff between both Del and Add vectors. /// We'll push every vector and compute the difference later in TypedChunk. fn push_vectors_diff( remove_vectors_writer: &mut Writer>, prompts_writer: &mut Writer>, + inputs_writer: &mut Writer>, manual_vectors_writer: &mut Writer>, key_buffer: &mut Vec, delta: VectorStateDelta, + fragments: &[RuntimeFragment], ) -> Result<()> { - let (must_remove, prompt, mut add_vectors) = delta.into_values(); + let (must_remove, prompt, mut fragment_delta, mut add_vectors) = delta.into_values(); if must_remove { key_buffer.truncate(TRUNCATE_SIZE); remove_vectors_writer.insert(&key_buffer, [])?; @@ -648,23 +908,49 @@ fn push_vectors_diff( prompts_writer.insert(&key_buffer, prompt.as_bytes())?; } - // We sort and dedup the vectors - add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); - add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); + if !fragment_delta.is_empty() { + let mut scratch = Vec::new(); + let mut fragment_delta: Vec<_> = fragments + .iter() + .filter_map(|fragment| { + let delta = fragment_delta.remove(&fragment.name)?; + Some((fragment.id, delta)) + }) + .collect(); - // insert vectors into the writer - for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { - // Generate the key by extending the unique index to it. - key_buffer.truncate(TRUNCATE_SIZE); - let index = u16::try_from(i).unwrap(); - key_buffer.extend_from_slice(&index.to_be_bytes()); + fragment_delta.sort_unstable_by_key(|(id, _)| *id); + for (id, value) in fragment_delta { + key_buffer.truncate(TRUNCATE_SIZE); + key_buffer.push(id); + if let Some(value) = value { + scratch.clear(); + serde_json::to_writer(&mut scratch, &value).unwrap(); + inputs_writer.insert(&key_buffer, &scratch)?; + } else { + inputs_writer.insert(&key_buffer, [])?; + } + } + } - // We insert only the Add part of the Obkv to inform - // that we only want to remove all those vectors. 
- let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Addition, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; + if !add_vectors.is_empty() { + // We sort and dedup the vectors + add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); + add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); + + // insert vectors into the writer + for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { + // Generate the key by extending the unique index to it. + key_buffer.truncate(TRUNCATE_SIZE); + let index = u16::try_from(i).unwrap(); + key_buffer.extend_from_slice(&index.to_be_bytes()); + + // We insert only the Add part of the Obkv to inform + // that we only want to remove all those vectors. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + manual_vectors_writer.insert(&key_buffer, bytes)?; + } } Ok(()) @@ -677,17 +963,18 @@ fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering { #[allow(clippy::too_many_arguments)] #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] -pub fn extract_embeddings( +pub fn extract_embeddings_from_prompts( // docid, prompt prompt_reader: grenad::Reader, indexer: GrenadParameters, - embedder: Arc, + runtime: Arc, embedder_name: &str, possible_embedding_mistakes: &PossibleEmbeddingMistakes, embedder_stats: &EmbedderStats, unused_vectors_distribution: &UnusedVectorsDistribution, request_threads: &ThreadPoolNoAbort, ) -> Result>> { + let embedder = &runtime.embedder; let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk @@ -723,7 +1010,7 @@ pub fn extract_embeddings( if chunks.len() == chunks.capacity() { let chunked_embeds = embed_chunks( - &embedder, + embedder, std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)), embedder_name, possible_embedding_mistakes, @@ -746,7 +1033,7 @@ pub fn extract_embeddings( // send last chunk if !chunks.is_empty() { let chunked_embeds = embed_chunks( - &embedder, + embedder, std::mem::take(&mut chunks), embedder_name, possible_embedding_mistakes, @@ -765,7 +1052,7 @@ pub fn extract_embeddings( if !current_chunk.is_empty() { let embeds = embed_chunks( - &embedder, + embedder, vec![std::mem::take(&mut current_chunk)], embedder_name, possible_embedding_mistakes, @@ -838,3 +1125,183 @@ fn embed_chunks( } } } + +#[allow(clippy::too_many_arguments)] +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] +pub fn extract_embeddings_from_fragments( + // (docid, extractor_id) -> (Option) + inputs_reader: grenad::Reader, + indexer: GrenadParameters, + runtime: Arc, + embedder_name: &str, + possible_embedding_mistakes: &PossibleEmbeddingMistakes, + embedder_stats: &EmbedderStats, + unused_vectors_distribution: &UnusedVectorsDistribution, + request_threads: &ThreadPoolNoAbort, +) -> Result>> { + let doc_alloc = Bump::new(); + + // (docid, extractor_id) -> (Option) + let vector_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + if inputs_reader.is_empty() { + return writer_into_reader(vector_writer); + } + + let on_embed = WriteGrenadOnEmbed { + waiting_responses: Default::default(), + vector_writer, + scratch: Default::default(), + possible_embedding_mistakes, + }; + + let mut session = EmbedSession::new( + 
&runtime.embedder,
+        embedder_name,
+        request_threads,
+        &doc_alloc,
+        embedder_stats,
+        on_embed,
+    );
+
+    let mut cursor = inputs_reader.into_cursor()?;
+
+    while let Some((mut key, value)) = cursor.move_on_next()? {
+        let docid = key.read_u32::<BigEndian>().unwrap();
+        let extractor_id = key.read_u8().unwrap();
+
+        if value.is_empty() {
+            // no value => removed fragment
+            session.on_embed_mut().push_response(docid, extractor_id);
+        } else {
+            // unwrap: the grenad value was saved as a serde_json::Value
+            let value: Value = serde_json::from_slice(value).unwrap();
+            session.request_embedding(
+                Metadata { docid, external_docid: "", extractor_id },
+                value,
+                unused_vectors_distribution,
+            )?;
+        }
+    }
+
+    // send last chunk
+    let on_embed = session.drain(unused_vectors_distribution)?;
+    on_embed.finish()
+}
+
+struct WriteGrenadOnEmbed<'a> {
+    // list of (document_id, extractor_id) for which vectors should be removed.
+    // these are written whenever a response arrives that has a larger (docid, extractor_id).
+    waiting_responses: VecDeque<(DocumentId, u8)>,
+
+    // grenad of (docid, extractor_id) -> (Option<Embedding>)
+    vector_writer: Writer<BufWriter<File>>,
+
+    possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
+
+    // scratch buffer used to write keys
+    scratch: Vec<u8>,
+}
+
+impl WriteGrenadOnEmbed<'_> {
+    pub fn push_response(&mut self, docid: DocumentId, extractor_id: u8) {
+        self.waiting_responses.push_back((docid, extractor_id));
+    }
+
+    pub fn finish(mut self) -> Result<grenad::Reader<BufReader<File>>> {
+        for (docid, extractor_id) in self.waiting_responses {
+            self.scratch.clear();
+            self.scratch.write_u32::<BigEndian>(docid).unwrap();
+            self.scratch.write_u8(extractor_id).unwrap();
+            self.vector_writer.insert(&self.scratch, []).unwrap();
+        }
+        writer_into_reader(self.vector_writer)
+    }
+}
+
+impl<'doc> OnEmbed<'doc> for WriteGrenadOnEmbed<'_> {
+    type ErrorMetadata = UnusedVectorsDistribution;
+    fn process_embedding_response(
+        &mut self,
+        response: crate::vector::session::EmbeddingResponse<'doc>,
+    ) {
+        let (docid, extractor_id) = (response.metadata.docid, response.metadata.extractor_id);
+        while let Some(waiting_response) = self.waiting_responses.pop_front() {
+            if (docid, extractor_id) > waiting_response {
+                self.scratch.clear();
+                self.scratch.write_u32::<BigEndian>(docid).unwrap();
+                self.scratch.write_u8(extractor_id).unwrap();
+                self.vector_writer.insert(&self.scratch, []).unwrap();
+            } else {
+                self.waiting_responses.push_front(waiting_response);
+                break;
+            }
+        }
+
+        if let Some(embedding) = response.embedding {
+            self.scratch.clear();
+            self.scratch.write_u32::<BigEndian>(docid).unwrap();
+            self.scratch.write_u8(extractor_id).unwrap();
+            self.vector_writer.insert(&self.scratch, cast_slice(embedding.as_slice())).unwrap();
+        }
+    }
+
+    fn process_embedding_error(
+        &mut self,
+        error: crate::vector::error::EmbedError,
+        embedder_name: &'doc str,
+        unused_vectors_distribution: &crate::vector::error::UnusedVectorsDistribution,
+        _metadata: &[crate::vector::session::Metadata<'doc>],
+    ) -> crate::Error {
+        if let FaultSource::Bug = error.fault {
+            crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(error.into()))
+        } else {
+            let mut msg =
+                format!(r"While embedding documents for embedder `{embedder_name}`: {error}");
+
+            if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
+                msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
+            }
+
+            let mut hint_count = 0;
+
+            for (vector_misspelling, count) in
+                self.possible_embedding_mistakes.vector_mistakes().take(2)
+            {
+                msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
+                hint_count += 1;
+            }
+
+            for (embedder_misspelling, count) in self
+                .possible_embedding_mistakes
+                .embedder_mistakes(embedder_name, unused_vectors_distribution)
+                .take(2)
+            {
+                msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
+                hint_count += 1;
+            }
+
+            if hint_count == 0 {
+                if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
+                    msg += &format!(
+                        "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
+                    );
+                }
+            }
+
+            crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))
+        }
+    }
+
+    fn process_embeddings(
+        &mut self,
+        _metadata: crate::vector::session::Metadata<'doc>,
+        _embeddings: Vec<Embedding>,
+    ) {
+        unimplemented!("unused")
+    }
+}
diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs
index d640bc075..cbf4ceba2 100644
--- a/crates/milli/src/update/index_documents/extract/mod.rs
+++ b/crates/milli/src/update/index_documents/extract/mod.rs
@@ -23,16 +23,17 @@ use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, Extra
 use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
 use self::extract_geo_points::extract_geo_points;
 use self::extract_vector_points::{
-    extract_embeddings, extract_vector_points, ExtractedVectorPoints,
+    extract_embeddings_from_prompts, extract_vector_points, ExtractedVectorPoints,
 };
 use self::extract_word_docids::extract_word_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
 use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
 use super::{helpers, TypedChunk};
-use crate::index::IndexEmbeddingConfig;
 use crate::progress::EmbedderStats;
+use crate::update::index_documents::extract::extract_vector_points::extract_embeddings_from_fragments;
 use crate::update::settings::InnerIndexSettingsDiff;
+use crate::vector::db::EmbedderInfo;
 use crate::vector::error::PossibleEmbeddingMistakes;
 use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
@@ -46,9 +47,9 @@ pub(crate) fn data_from_obkv_documents(
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     primary_key_id: FieldId,
-    embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
     settings_diff: Arc<InnerIndexSettingsDiff>,
     max_positions_per_attributes: Option<u32>,
+    embedder_info: Arc<Vec<(String, EmbedderInfo)>>,
     possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
     embedder_stats: &Arc<EmbedderStats>,
 ) -> Result<()> {
@@ -61,8 +62,8 @@
             original_documents_chunk,
             indexer,
             lmdb_writer_sx.clone(),
-            embedders_configs.clone(),
             settings_diff.clone(),
+            embedder_info.clone(),
             possible_embedding_mistakes.clone(),
             embedder_stats.clone(),
         )
@@ -231,8 +232,8 @@ fn send_original_documents_data(
     original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
-    embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
     settings_diff: Arc<InnerIndexSettingsDiff>,
+    embedder_info: Arc<Vec<(String, EmbedderInfo)>>,
     possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
     embedder_stats: Arc<EmbedderStats>,
 ) -> Result<()> {
@@ -245,7 +246,6 @@ fn send_original_documents_data(
     if index_vectors {
         let settings_diff = settings_diff.clone();
-        let embedders_configs = embedders_configs.clone();
         let original_documents_chunk = original_documents_chunk.clone();
         let lmdb_writer_sx = lmdb_writer_sx.clone();
@@ -253,8 +253,8 @@
            match extract_vector_points(
                original_documents_chunk.clone(),
                indexer,
-                &embedders_configs,
                &settings_diff,
+                embedder_info.as_slice(),
                &possible_embedding_mistakes,
            ) {
                Ok((extracted_vectors, unused_vectors_distribution)) => {
@@ -262,16 +262,16 @@
                        manual_vectors,
                        remove_vectors,
                        prompts,
+                        inputs,
                        embedder_name,
-                        embedder,
-                        add_to_user_provided,
-                        remove_from_user_provided,
+                        runtime,
+                        embedding_status_delta,
                    } in extracted_vectors
                    {
-                        let embeddings = match extract_embeddings(
+                        let embeddings_from_prompts = match extract_embeddings_from_prompts(
                            prompts,
                            indexer,
-                            embedder.clone(),
+                            runtime.clone(),
                            &embedder_name,
                            &possible_embedding_mistakes,
                            &embedder_stats,
@@ -284,18 +284,37 @@
                                None
                            }
                        };
+
+                        let embeddings_from_fragments = match extract_embeddings_from_fragments(
+                            inputs,
+                            indexer,
+                            runtime.clone(),
+                            &embedder_name,
+                            &possible_embedding_mistakes,
+                            &embedder_stats,
+                            &unused_vectors_distribution,
+                            request_threads(),
+                        ) {
+                            Ok(results) => Some(results),
+                            Err(error) => {
+                                let _ = lmdb_writer_sx.send(Err(error));
+                                None
+                            }
+                        };
+
                        if !(remove_vectors.is_empty()
                            && manual_vectors.is_empty()
-                            && embeddings.as_ref().is_none_or(|e| e.is_empty()))
+                            && embeddings_from_prompts.as_ref().is_none_or(|e| e.is_empty())
+                            && embeddings_from_fragments.as_ref().is_none_or(|e| e.is_empty()))
                        {
                            let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
                                remove_vectors,
-                                embeddings,
-                                expected_dimension: embedder.dimensions(),
+                                embeddings_from_prompts,
+                                embeddings_from_fragments,
+                                expected_dimension: runtime.embedder.dimensions(),
                                manual_vectors,
                                embedder_name,
-                                add_to_user_provided,
-                                remove_from_user_provided,
+                                embedding_status_delta,
                            }));
                        }
                    }
diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs
index 5ec6910f7..055b8bbad 100644
--- a/crates/milli/src/update/index_documents/mod.rs
+++ b/crates/milli/src/update/index_documents/mod.rs
@@ -38,7 +38,8 @@ pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
-use crate::vector::{ArroyWrapper, EmbeddingConfigs};
+use crate::vector::db::EmbedderInfo;
+use crate::vector::{ArroyWrapper, RuntimeEmbedders};
 use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
 
 static MERGED_DATABASE_COUNT: usize = 7;
@@ -81,7 +82,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
     should_abort: FA,
     added_documents: u64,
     deleted_documents: u64,
-    embedders: EmbeddingConfigs,
+    embedders: RuntimeEmbedders,
     embedder_stats: &'t Arc<EmbedderStats>,
 }
@@ -172,7 +173,7 @@ where
         Ok((self, Ok(indexed_documents)))
     }
 
-    pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self {
+    pub fn with_embedders(mut self, embedders: RuntimeEmbedders) -> Self {
         self.embedders = embedders;
         self
     }
@@ -226,7 +227,13 @@ where
         settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
         let settings_diff = Arc::new(settings_diff);
-        let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);
+        let embedder_infos: heed::Result<Vec<(String, EmbedderInfo)>> = self
+            .index
+            .embedding_configs()
+            .iter_embedder_info(self.wtxn)?
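
Note on the key layout used by the fragment grenads above: each entry is keyed by
a fixed 5-byte key, the document id as a big-endian u32 followed by a one-byte
extractor id. Big-endian encoding makes grenad's lexicographic key order coincide
with the numeric (docid, extractor_id) order that `WriteGrenadOnEmbed` relies on
when it flushes `waiting_responses`. The sketch below is an illustration only, not
part of the patch; it uses the same `byteorder` calls as the code above:

    use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};

    fn encode_key(docid: u32, extractor_id: u8) -> Vec<u8> {
        // 4 bytes of big-endian docid, then the extractor id
        let mut key = Vec::with_capacity(5);
        key.write_u32::<BigEndian>(docid).unwrap();
        key.write_u8(extractor_id).unwrap();
        key
    }

    fn decode_key(mut key: &[u8]) -> (u32, u8) {
        // mirrors the reads done on grenad cursors in the patch
        let docid = key.read_u32::<BigEndian>().unwrap();
        let extractor_id = key.read_u8().unwrap();
        (docid, extractor_id)
    }
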
+ .map(|res| res.map(|(name, info)| (name.to_owned(), info))) + .collect(); + let embedder_infos = Arc::new(embedder_infos?); let possible_embedding_mistakes = crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution); @@ -328,9 +335,9 @@ where pool_params, lmdb_writer_sx.clone(), primary_key_id, - embedders_configs.clone(), settings_diff_cloned, max_positions_per_attributes, + embedder_infos, Arc::new(possible_embedding_mistakes), &embedder_stats ) @@ -430,21 +437,21 @@ where TypedChunk::VectorPoints { expected_dimension, remove_vectors, - embeddings, + embeddings_from_prompts, + embeddings_from_fragments, manual_vectors, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { remove_vectors, - embeddings, + embeddings_from_prompts, + embeddings_from_fragments, expected_dimension, manual_vectors, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, } } otherwise => otherwise, @@ -480,7 +487,7 @@ where // we should insert it in `dimension` for (name, action) in settings_diff.embedding_config_updates.iter() { if action.is_being_quantized && !dimension.contains_key(name.as_str()) { - let index = self.index.embedder_category_id.get(self.wtxn, name)?.ok_or( + let index = self.index.embedding_configs().embedder_id(self.wtxn, name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None, @@ -488,7 +495,9 @@ where )?; let reader = ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized); - let dim = reader.dimensions(self.wtxn)?; + let Some(dim) = reader.dimensions(self.wtxn)? else { + continue; + }; dimension.insert(name.to_string(), dim); } } @@ -498,12 +507,19 @@ where let vector_arroy = self.index.vector_arroy; let cancel = &self.should_abort; - let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, - )?; + let embedder_index = + self.index.embedding_configs().embedder_id(wtxn, &embedder_name)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + }, + )?; let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name); - let was_quantized = - settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2); + let was_quantized = settings_diff + .old + .embedding_configs + .get(&embedder_name) + .is_some_and(|conf| conf.is_quantized); let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized); pool.install(|| { @@ -773,11 +789,11 @@ mod tests { use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::documents::mmap_from_objects; use crate::index::tests::TempIndex; - use crate::index::IndexEmbeddingConfig; use crate::progress::Progress; use crate::search::TermsMatchingStrategy; use crate::update::new::indexer; use crate::update::Setting; + use crate::vector::db::IndexEmbeddingConfig; use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError}; #[test] @@ -2028,7 +2044,7 @@ mod tests { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -2116,7 +2132,7 @@ mod tests { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, 
&Progress::default(), &Default::default(), @@ -2277,7 +2293,7 @@ mod tests { ]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.replace_documents(&documents).unwrap(); indexer.delete_documents(&["2"]); @@ -2343,7 +2359,7 @@ mod tests { indexer.delete_documents(&["1", "2"]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( &indexer_alloc, @@ -2394,7 +2410,7 @@ mod tests { { "id": 3, "name": "jean", "age": 25 }, ]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.update_documents(&documents).unwrap(); @@ -2446,7 +2462,7 @@ mod tests { { "id": 3, "legs": 4 }, ]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.update_documents(&documents).unwrap(); indexer.delete_documents(&["1", "2"]); @@ -2496,7 +2512,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1", "2"]); @@ -2552,7 +2568,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1", "2", "1", "2"]); @@ -2611,7 +2627,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let documents = documents!([ @@ -2661,7 +2677,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1"]); @@ -2775,6 +2791,8 @@ mod tests { document_template: Setting::NotSet, document_template_max_bytes: Setting::NotSet, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, distribution: Setting::NotSet, @@ -2801,17 +2819,27 @@ mod tests { .unwrap(); let rtxn = index.read_txn().unwrap(); - let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } = + let embedders = index.embedding_configs(); + let mut embedding_configs = embedders.embedding_configs(&rtxn).unwrap(); + let IndexEmbeddingConfig { name: embedder_name, config: embedder, fragments } = embedding_configs.pop().unwrap(); + let info = embedders.embedder_info(&rtxn, &embedder_name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0, 1, 2]>"); + 
insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0, 1, 2]>"); insta::assert_snapshot!(embedder_name, @"manual"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); + let embedder = std::sync::Arc::new( crate::vector::Embedder::new(embedder.embedder_options, 0).unwrap(), ); let res = index .search(&rtxn) - .semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec())) + .semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec()), None) .execute() .unwrap(); assert_eq!(res.documents_ids.len(), 3); @@ -2860,7 +2888,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); // OP @@ -2921,7 +2949,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1"]); @@ -2980,7 +3008,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let documents = documents!([ diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index e17625ad4..e07483aff 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -31,7 +31,7 @@ use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableIds, UpdateIndexingStep}; use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; -use crate::vector::settings::WriteBackToDocuments; +use crate::vector::settings::{RemoveFragments, WriteBackToDocuments}; use crate::vector::ArroyWrapper; use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result}; @@ -933,10 +933,47 @@ impl<'a, 'i> Transform<'a, 'i> { // delete all vectors from the embedders that need removal for (_, (reader, _)) in readers { - let dimensions = reader.dimensions(wtxn)?; + let Some(dimensions) = reader.dimensions(wtxn)? else { + continue; + }; reader.clear(wtxn, dimensions)?; } + // remove all vectors for the specified fragments + for (embedder_name, RemoveFragments { fragment_ids }, was_quantized) in + settings_diff.embedding_config_updates.iter().filter_map(|(name, action)| { + action.remove_fragments().map(|fragments| (name, fragments, action.was_quantized)) + }) + { + let Some(infos) = self.index.embedding_configs().embedder_info(wtxn, embedder_name)? + else { + continue; + }; + let arroy = + ArroyWrapper::new(self.index.vector_arroy, infos.embedder_id, was_quantized); + let Some(dimensions) = arroy.dimensions(wtxn)? 
else {
+                continue;
+            };
+            for fragment_id in fragment_ids {
+                // we must keep the user provided embeddings that ended up in this store
+
+                if infos.embedding_status.user_provided_docids().is_empty() {
+                    // no user provided: clear store
+                    arroy.clear_store(wtxn, *fragment_id, dimensions)?;
+                    continue;
+                }
+
+                // some user provided, remove only the ids that are not user provided
+                let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| {
+                    items - infos.embedding_status.user_provided_docids()
+                })?;
+
+                for to_delete in to_delete {
+                    arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
+                }
+            }
+        }
+
         let grenad_params = GrenadParameters {
             chunk_compression_type: self.indexer_settings.chunk_compression_type,
             chunk_compression_level: self.indexer_settings.chunk_compression_level,
diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs
index 6d575a98b..370579a6c 100644
--- a/crates/milli/src/update/index_documents/typed_chunk.rs
+++ b/crates/milli/src/update/index_documents/typed_chunk.rs
@@ -4,6 +4,7 @@
 use std::fs::File;
 use std::io::{self, BufReader};
 
 use bytemuck::allocation::pod_collect_to_vec;
+use byteorder::{BigEndian, ReadBytesExt as _};
 use grenad::{MergeFunction, Merger, MergerBuilder};
 use heed::types::Bytes;
 use heed::{BytesDecode, RwTxn};
@@ -18,7 +19,6 @@ use super::helpers::{
 use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
 use crate::index::db_name::DOCUMENTS;
-use crate::index::IndexEmbeddingConfig;
 use crate::proximity::MAX_DISTANCE;
 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
@@ -26,6 +26,7 @@ use crate::update::index_documents::helpers::{
     as_cloneable_grenad, try_split_array_at, KeepLatestObkv,
 };
 use crate::update::settings::InnerIndexSettingsDiff;
+use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig};
 use crate::vector::ArroyWrapper;
 use crate::{
     lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
@@ -86,12 +87,14 @@ pub(crate) enum TypedChunk {
     GeoPoints(grenad::Reader<BufReader<File>>),
     VectorPoints {
         remove_vectors: grenad::Reader<BufReader<File>>,
-        embeddings: Option<grenad::Reader<BufReader<File>>>,
+        // docid -> vector
+        embeddings_from_prompts: Option<grenad::Reader<BufReader<File>>>,
+        // docid, extractor_id -> Option<Embedding>,
+        embeddings_from_fragments: Option<grenad::Reader<BufReader<File>>>,
         expected_dimension: usize,
         manual_vectors: grenad::Reader<BufReader<File>>,
         embedder_name: String,
-        add_to_user_provided: RoaringBitmap,
-        remove_from_user_provided: RoaringBitmap,
+        embedding_status_delta: EmbeddingStatusDelta,
     },
 }
@@ -155,6 +158,7 @@ pub(crate) fn write_typed_chunk_into_index(
             let mut iter = merger.into_stream_merger_iter()?;
 
             let embedders: BTreeSet<_> = index
+                .embedding_configs()
                .embedding_configs(wtxn)?
                .into_iter()
                .map(|IndexEmbeddingConfig { name, ..
}| name) @@ -614,57 +618,66 @@ pub(crate) fn write_typed_chunk_into_index( let span = tracing::trace_span!(target: "indexing::write_db", "vector_points"); let _entered = span.enter(); + let embedders = index.embedding_configs(); + let mut remove_vectors_builder = MergerBuilder::new(KeepFirst); let mut manual_vectors_builder = MergerBuilder::new(KeepFirst); - let mut embeddings_builder = MergerBuilder::new(KeepFirst); - let mut add_to_user_provided = RoaringBitmap::new(); - let mut remove_from_user_provided = RoaringBitmap::new(); + let mut embeddings_from_prompts_builder = MergerBuilder::new(KeepFirst); + let mut embeddings_from_fragments_builder = MergerBuilder::new(KeepFirst); let mut params = None; + let mut infos = None; for typed_chunk in typed_chunks { let TypedChunk::VectorPoints { remove_vectors, manual_vectors, - embeddings, + embeddings_from_prompts, + embeddings_from_fragments, expected_dimension, embedder_name, - add_to_user_provided: aud, - remove_from_user_provided: rud, + embedding_status_delta, } = typed_chunk else { unreachable!(); }; + if infos.is_none() { + infos = Some(embedders.embedder_info(wtxn, &embedder_name)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + }, + )?); + } + params = Some((expected_dimension, embedder_name)); remove_vectors_builder.push(remove_vectors.into_cursor()?); manual_vectors_builder.push(manual_vectors.into_cursor()?); - if let Some(embeddings) = embeddings { - embeddings_builder.push(embeddings.into_cursor()?); + if let Some(embeddings) = embeddings_from_prompts { + embeddings_from_prompts_builder.push(embeddings.into_cursor()?); + } + if let Some(embeddings) = embeddings_from_fragments { + embeddings_from_fragments_builder.push(embeddings.into_cursor()?); + } + + if let Some(infos) = &mut infos { + embedding_status_delta.apply_to(&mut infos.embedding_status); } - add_to_user_provided |= aud; - remove_from_user_provided |= rud; } // typed chunks has always at least 1 chunk. let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; + let Some(infos) = infos else { unreachable!() }; - let mut embedding_configs = index.embedding_configs(wtxn)?; - let index_embedder_config = embedding_configs - .iter_mut() - .find(|IndexEmbeddingConfig { name, .. 
}| name == &embedder_name) - .unwrap(); - index_embedder_config.user_provided -= remove_from_user_provided; - index_embedder_config.user_provided |= add_to_user_provided; + embedders.put_embedder_info(wtxn, &embedder_name, &infos)?; - index.put_embedding_configs(wtxn, embedding_configs)?; - - let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, - )?; - let binary_quantized = - settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2); + let binary_quantized = settings_diff + .old + .embedding_configs + .get(&embedder_name) + .is_some_and(|conf| conf.is_quantized); // FIXME: allow customizing distance - let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized); + let writer = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, binary_quantized); // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); @@ -674,8 +687,8 @@ pub(crate) fn write_typed_chunk_into_index( writer.del_items(wtxn, expected_dimension, docid)?; } - // add generated embeddings - let merger = embeddings_builder.build(); + // add generated embeddings -- from prompts + let merger = embeddings_from_prompts_builder.build(); let mut iter = merger.into_stream_merger_iter()?; while let Some((key, value)) = iter.next()? { let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); @@ -702,6 +715,24 @@ pub(crate) fn write_typed_chunk_into_index( writer.add_items(wtxn, docid, &embeddings)?; } + // add generated embeddings -- from fragments + let merger = embeddings_from_fragments_builder.build(); + let mut iter = merger.into_stream_merger_iter()?; + while let Some((mut key, value)) = iter.next()? 
{
+                let docid = key.read_u32::<BigEndian>().unwrap();
+                let extractor_id = key.read_u8().unwrap();
+                if value.is_empty() {
+                    writer.del_item_in_store(wtxn, docid, extractor_id, expected_dimension)?;
+                } else {
+                    let data = pod_collect_to_vec(value);
+                    // it is a code error to have embeddings and not expected_dimension
+                    if data.len() != expected_dimension {
+                        panic!("wrong dimensions")
+                    }
+                    writer.add_item_in_store(wtxn, docid, extractor_id, &data)?;
+                }
+            }
+
             // perform the manual diff
             let merger = manual_vectors_builder.build();
             let mut iter = merger.into_stream_merger_iter()?;
diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs
index 5fcb2912b..36e80677a 100644
--- a/crates/milli/src/vector/parsed_vectors.rs
+++ b/crates/milli/src/vector/parsed_vectors.rs
@@ -6,9 +6,8 @@ use serde_json::value::RawValue;
 use serde_json::{from_slice, Value};
 
 use super::Embedding;
-use crate::index::IndexEmbeddingConfig;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
-use crate::{DocumentId, FieldId, InternalError, UserError};
+use crate::{FieldId, InternalError, UserError};
 
 #[derive(serde::Serialize, Debug)]
 #[serde(untagged)]
@@ -374,8 +373,7 @@ pub struct ParsedVectorsDiff {
 
 impl ParsedVectorsDiff {
     pub fn new(
-        docid: DocumentId,
-        embedders_configs: &[IndexEmbeddingConfig],
+        regenerate_for_embedders: impl Iterator<Item = String>,
         documents_diff: &KvReader<FieldId>,
         old_vectors_fid: Option<FieldId>,
         new_vectors_fid: Option<FieldId>,
@@ -396,10 +394,8 @@
        }
        }
        .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect());
-        for embedding_config in embedders_configs {
-            if embedding_config.user_provided.contains(docid) {
-                old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual);
-            }
+        for name in regenerate_for_embedders {
+            old.entry(name).or_insert(VectorState::Generated);
        }
 
        let new = 'new: {

From 46bceb91f19cea95fc902ca8ff9482d53ea41359 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Mon, 30 Jun 2025 00:08:48 +0200
Subject: [PATCH 106/150] New search errors

---
 crates/meilisearch-types/src/error.rs | 3 +++
 crates/meilisearch/src/error.rs       | 5 ++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs
index 30f6868f6..c57e2d042 100644
--- a/crates/meilisearch-types/src/error.rs
+++ b/crates/meilisearch-types/src/error.rs
@@ -301,6 +301,7 @@ InvalidFacetSearchQuery , InvalidRequest , BAD_REQUEST ;
 InvalidFacetSearchName , InvalidRequest , BAD_REQUEST ;
 FacetSearchDisabled , InvalidRequest , BAD_REQUEST ;
 InvalidSearchVector , InvalidRequest , BAD_REQUEST ;
+InvalidSearchMedia , InvalidRequest , BAD_REQUEST ;
 InvalidSearchShowMatchesPosition , InvalidRequest , BAD_REQUEST ;
 InvalidSearchShowRankingScore , InvalidRequest , BAD_REQUEST ;
 InvalidSimilarShowRankingScore , InvalidRequest , BAD_REQUEST ;
@@ -308,6 +309,7 @@ InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
 InvalidSimilarShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
 InvalidSearchSort , InvalidRequest , BAD_REQUEST ;
 InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ;
+InvalidSearchMediaAndVector , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ;
@@ -464,6 +466,7 @@ impl ErrorCode for milli::Error {
             | UserError::MissingSourceForNested { ..
} | UserError::InvalidSettingsEmbedder { .. } => Code::InvalidSettingsEmbedders, UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders, + UserError::TooManyFragments(_) => Code::InvalidSettingsEmbedders, UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders, UserError::NoPrimaryKeyCandidateFound => Code::IndexPrimaryKeyNoCandidateFound, UserError::MultiplePrimaryKeyCandidatesFound { .. } => { diff --git a/crates/meilisearch/src/error.rs b/crates/meilisearch/src/error.rs index b13eb8d7c..91c6c23fa 100644 --- a/crates/meilisearch/src/error.rs +++ b/crates/meilisearch/src/error.rs @@ -76,8 +76,10 @@ pub enum MeilisearchHttpError { DocumentFormat(#[from] DocumentFormatError), #[error(transparent)] Join(#[from] JoinError), - #[error("Invalid request: missing `hybrid` parameter when `vector` is present.")] + #[error("Invalid request: missing `hybrid` parameter when `vector` or `media` are present.")] MissingSearchHybrid, + #[error("Invalid request: both `media` and `vector` parameters are present.")] + MediaAndVector, } impl MeilisearchHttpError { @@ -111,6 +113,7 @@ impl ErrorCode for MeilisearchHttpError { MeilisearchHttpError::DocumentFormat(e) => e.error_code(), MeilisearchHttpError::Join(_) => Code::Internal, MeilisearchHttpError::MissingSearchHybrid => Code::MissingSearchHybrid, + MeilisearchHttpError::MediaAndVector => Code::InvalidSearchMediaAndVector, MeilisearchHttpError::FederationOptionsInNonFederatedRequest(_) => { Code::InvalidMultiSearchFederationOptions } From d14184f4da8114d532b5f8a7b13c955e204c5ebf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:10:46 +0200 Subject: [PATCH 107/150] Add `media` to search --- .../src/routes/indexes/facet_search.rs | 6 +++ .../meilisearch/src/routes/indexes/search.rs | 2 + crates/meilisearch/src/search/mod.rs | 36 ++++++++++++-- crates/milli/src/search/hybrid.rs | 27 +++++++---- crates/milli/src/search/mod.rs | 48 +++++++++++-------- 5 files changed, 86 insertions(+), 33 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/facet_search.rs b/crates/meilisearch/src/routes/indexes/facet_search.rs index 41f306746..18ad54ccf 100644 --- a/crates/meilisearch/src/routes/indexes/facet_search.rs +++ b/crates/meilisearch/src/routes/indexes/facet_search.rs @@ -56,6 +56,8 @@ pub struct FacetSearchQuery { pub q: Option, #[deserr(default, error = DeserrJsonError)] pub vector: Option>, + #[deserr(default, error = DeserrJsonError)] + pub media: Option, #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default, error = DeserrJsonError)] @@ -94,6 +96,7 @@ impl FacetSearchAggregator { facet_name, vector, q, + media, filter, matching_strategy, attributes_to_search_on, @@ -108,6 +111,7 @@ impl FacetSearchAggregator { facet_names: Some(facet_name.clone()).into_iter().collect(), additional_search_parameters_provided: q.is_some() || vector.is_some() + || media.is_some() || filter.is_some() || *matching_strategy != MatchingStrategy::default() || attributes_to_search_on.is_some() @@ -291,6 +295,7 @@ impl From for SearchQuery { facet_name: _, q, vector, + media, filter, matching_strategy, attributes_to_search_on, @@ -312,6 +317,7 @@ impl From for SearchQuery { SearchQuery { q, + media, offset: DEFAULT_SEARCH_OFFSET(), limit: DEFAULT_SEARCH_LIMIT(), page, diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 333ae1944..035ba71d8 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ 
b/crates/meilisearch/src/routes/indexes/search.rs @@ -205,6 +205,8 @@ impl TryFrom for SearchQuery { Ok(Self { q: other.q, + // `media` not supported for `GET` + media: None, vector: other.vector.map(CS::into_inner), offset: other.offset.0, limit: other.limit.0, diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 61ef3f813..6d8639504 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -64,6 +64,8 @@ pub struct SearchQuery { pub q: Option, #[deserr(default, error = DeserrJsonError)] pub vector: Option>, + #[deserr(default, error = DeserrJsonError)] + pub media: Option, #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError)] @@ -147,6 +149,7 @@ impl From for SearchQuery { ranking_score_threshold: ranking_score_threshold.map(RankingScoreThreshold::from), q: None, vector: None, + media: None, offset: DEFAULT_SEARCH_OFFSET(), page: None, hits_per_page: None, @@ -220,6 +223,7 @@ impl fmt::Debug for SearchQuery { let Self { q, vector, + media, hybrid, offset, limit, @@ -274,6 +278,9 @@ impl fmt::Debug for SearchQuery { ); } } + if let Some(media) = media { + debug.field("media", media); + } if let Some(hybrid) = hybrid { debug.field("hybrid", &hybrid); } @@ -482,8 +489,10 @@ pub struct SearchQueryWithIndex { pub index_uid: IndexUid, #[deserr(default, error = DeserrJsonError)] pub q: Option, - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub vector: Option>, + #[deserr(default, error = DeserrJsonError)] + pub media: Option, #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default, error = DeserrJsonError)] @@ -564,6 +573,7 @@ impl SearchQueryWithIndex { let SearchQuery { q, vector, + media, hybrid, offset, limit, @@ -594,6 +604,7 @@ impl SearchQueryWithIndex { index_uid, q, vector, + media, hybrid, offset: if offset == DEFAULT_SEARCH_OFFSET() { None } else { Some(offset) }, limit: if limit == DEFAULT_SEARCH_LIMIT() { None } else { Some(limit) }, @@ -628,6 +639,7 @@ impl SearchQueryWithIndex { federation_options, q, vector, + media, offset, limit, page, @@ -658,6 +670,7 @@ impl SearchQueryWithIndex { SearchQuery { q, vector, + media, offset: offset.unwrap_or(DEFAULT_SEARCH_OFFSET()), limit: limit.unwrap_or(DEFAULT_SEARCH_LIMIT()), page, @@ -984,14 +997,27 @@ pub fn prepare_search<'t>( let deadline = std::time::Instant::now() + std::time::Duration::from_secs(10); + let q = query.q.as_deref(); + let media = query.media.as_ref(); + + let search_query = match (q, media) { + (Some(text), None) => milli::vector::SearchQuery::Text(text), + (q, media) => milli::vector::SearchQuery::Media { q, media }, + }; + embedder - .embed_search(query.q.as_ref().unwrap(), Some(deadline)) + .embed_search(search_query, Some(deadline)) .map_err(milli::vector::Error::from) .map_err(milli::Error::from)? 
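
Note on the `(q, media)` dispatch above: this match is where a search request is
turned into a `milli::vector::SearchQuery` for the embedder. The snippet below is
a simplified, standalone restatement, not the crate's actual types; only the
dispatch logic is taken from the patch, the enum shape is assumed:

    enum SearchQuery<'a> {
        Text(&'a str),
        Media { q: Option<&'a str>, media: Option<&'a serde_json::Value> },
    }

    fn to_search_query<'a>(
        q: Option<&'a str>,
        media: Option<&'a serde_json::Value>,
    ) -> SearchQuery<'a> {
        match (q, media) {
            // plain text search: embed the text directly
            (Some(text), None) => SearchQuery::Text(text),
            // as soon as media is involved (or both inputs are absent),
            // fall through to the media variant
            (q, media) => SearchQuery::Media { q, media },
        }
    }
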
} }; - - search.semantic(embedder_name.clone(), embedder.clone(), *quantized, Some(vector)); + search.semantic( + embedder_name.clone(), + embedder.clone(), + *quantized, + Some(vector), + query.media.clone(), + ); } SearchKind::Hybrid { embedder_name, embedder, quantized, semantic_ratio: _ } => { if let Some(q) = &query.q { @@ -1003,6 +1029,7 @@ pub fn prepare_search<'t>( embedder.clone(), *quantized, query.vector.clone(), + query.media.clone(), ); } } @@ -1127,6 +1154,7 @@ pub fn perform_search( locales, // already used in prepare_search vector: _, + media: _, hybrid: _, offset: _, ranking_score_threshold: _, diff --git a/crates/milli/src/search/hybrid.rs b/crates/milli/src/search/hybrid.rs index b63f6288f..c906e1eb7 100644 --- a/crates/milli/src/search/hybrid.rs +++ b/crates/milli/src/search/hybrid.rs @@ -7,6 +7,7 @@ use roaring::RoaringBitmap; use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy}; use crate::search::new::{distinct_fid, distinct_single_docid}; use crate::search::SemanticSearch; +use crate::vector::SearchQuery; use crate::{Index, MatchingWords, Result, Search, SearchResult}; struct ScoreWithRatioResult { @@ -225,12 +226,9 @@ impl Search<'_> { return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); } - // no vector search against placeholder search - let Some(query) = search.query.take() else { - return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); - }; // no embedder, no semantic search - let Some(SemanticSearch { vector, embedder_name, embedder, quantized }) = semantic else { + let Some(SemanticSearch { vector, embedder_name, embedder, quantized, media }) = semantic + else { return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); }; @@ -241,9 +239,17 @@ impl Search<'_> { let span = tracing::trace_span!(target: "search::hybrid", "embed_one"); let _entered = span.enter(); + let q = search.query.as_deref(); + let media = media.as_ref(); + + let query = match (q, media) { + (Some(text), None) => SearchQuery::Text(text), + (q, media) => SearchQuery::Media { q, media }, + }; + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(3); - match embedder.embed_search(&query, Some(deadline)) { + match embedder.embed_search(query, Some(deadline)) { Ok(embedding) => embedding, Err(error) => { tracing::error!(error=%error, "Embedding failed"); @@ -257,8 +263,13 @@ impl Search<'_> { } }; - search.semantic = - Some(SemanticSearch { vector: Some(vector_query), embedder_name, embedder, quantized }); + search.semantic = Some(SemanticSearch { + vector: Some(vector_query), + embedder_name, + embedder, + quantized, + media, + }); // TODO: would be better to have two distinct functions at this point let vector_results = search.execute()?; diff --git a/crates/milli/src/search/mod.rs b/crates/milli/src/search/mod.rs index 62183afc3..97d542524 100644 --- a/crates/milli/src/search/mod.rs +++ b/crates/milli/src/search/mod.rs @@ -12,7 +12,7 @@ use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats}; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::index::MatchingStrategy; use crate::score_details::{ScoreDetails, ScoringStrategy}; -use crate::vector::Embedder; +use crate::vector::{Embedder, Embedding}; use crate::{ execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index, Result, SearchContext, TimeBudget, UserError, @@ -32,6 +32,7 @@ pub mod similar; #[derive(Debug, Clone)] pub struct 
SemanticSearch { vector: Option>, + media: Option, embedder_name: String, embedder: Arc, quantized: bool, @@ -93,9 +94,10 @@ impl<'a> Search<'a> { embedder_name: String, embedder: Arc, quantized: bool, - vector: Option>, + vector: Option, + media: Option, ) -> &mut Search<'a> { - self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector }); + self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector, media }); self } @@ -231,24 +233,28 @@ impl<'a> Search<'a> { degraded, used_negative_operator, } = match self.semantic.as_ref() { - Some(SemanticSearch { vector: Some(vector), embedder_name, embedder, quantized }) => { - execute_vector_search( - &mut ctx, - vector, - self.scoring_strategy, - universe, - &self.sort_criteria, - &self.distinct, - self.geo_param, - self.offset, - self.limit, - embedder_name, - embedder, - *quantized, - self.time_budget.clone(), - self.ranking_score_threshold, - )? - } + Some(SemanticSearch { + vector: Some(vector), + embedder_name, + embedder, + quantized, + media: _, + }) => execute_vector_search( + &mut ctx, + vector, + self.scoring_strategy, + universe, + &self.sort_criteria, + &self.distinct, + self.geo_param, + self.offset, + self.limit, + embedder_name, + embedder, + *quantized, + self.time_budget.clone(), + self.ranking_score_threshold, + )?, _ => execute_search( &mut ctx, self.query.as_deref(), From 2b3327ea74357cf6823bd8a89f447c2773c221d1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:11:00 +0200 Subject: [PATCH 108/150] Use `media` to determine search kind --- .../meilisearch/src/routes/indexes/search.rs | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 035ba71d8..697ae9241 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ b/crates/meilisearch/src/routes/indexes/search.rs @@ -483,28 +483,30 @@ pub fn search_kind( index_uid: String, index: &milli::Index, ) -> Result { + let is_placeholder_query = + if let Some(q) = query.q.as_deref() { q.trim().is_empty() } else { true }; + let non_placeholder_query = !is_placeholder_query; + let is_media = query.media.is_some(); // handle with care, the order of cases matters, the semantics is subtle - match (query.q.as_deref(), &query.hybrid, query.vector.as_deref()) { - // empty query, no vector => placeholder search - (Some(q), _, None) if q.trim().is_empty() => Ok(SearchKind::KeywordOnly), - // no query, no vector => placeholder search - (None, _, None) => Ok(SearchKind::KeywordOnly), - // hybrid.semantic_ratio == 1.0 => vector - (_, Some(HybridQuery { semantic_ratio, embedder }), v) if **semantic_ratio == 1.0 => { - SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) - } - // hybrid.semantic_ratio == 0.0 => keyword - (_, Some(HybridQuery { semantic_ratio, embedder: _ }), _) if **semantic_ratio == 0.0 => { + match (is_media, non_placeholder_query, &query.hybrid, query.vector.as_deref()) { + // media + vector => error + (true, _, _, Some(_)) => Err(MeilisearchHttpError::MediaAndVector.into()), + // media + !hybrid => error + (true, _, None, _) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), + // vector + !hybrid => error + (_, _, None, Some(_)) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), + // hybrid S0 => keyword + (_, _, Some(HybridQuery { semantic_ratio, embedder: _ }), _) if **semantic_ratio == 0.0 => { Ok(SearchKind::KeywordOnly) } - 
// no query, hybrid, vector => semantic - (None, Some(HybridQuery { semantic_ratio: _, embedder }), Some(v)) => { - SearchKind::semantic(index_scheduler, index_uid, index, embedder, Some(v.len())) + // !q + !vector => placeholder search + (false, false, _, None) => Ok(SearchKind::KeywordOnly), + // hybrid S100 => semantic + (_, _, Some(HybridQuery { semantic_ratio, embedder }), v) if **semantic_ratio == 1.0 => { + SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) } - // query, no hybrid, no vector => keyword - (Some(_), None, None) => Ok(SearchKind::KeywordOnly), - // query, hybrid, maybe vector => hybrid - (Some(_), Some(HybridQuery { semantic_ratio, embedder }), v) => SearchKind::hybrid( + // q + hybrid => hybrid + (_, true, Some(HybridQuery { semantic_ratio, embedder }), v) => SearchKind::hybrid( index_scheduler, index_uid, index, @@ -512,7 +514,11 @@ pub fn search_kind( **semantic_ratio, v.map(|v| v.len()), ), - - (_, None, Some(_)) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), + // !q + hybrid => semantic + (_, false, Some(HybridQuery { semantic_ratio: _, embedder }), v) => { + SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) + } + // q => keyword + (false, true, None, None) => Ok(SearchKind::KeywordOnly), } } From c593fbe648ec7aedf62285cd7aa8459e9ac068d8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:11:29 +0200 Subject: [PATCH 109/150] Analytics --- .../src/routes/indexes/search_analytics.rs | 12 ++++++++++++ .../meilisearch/src/routes/multi_search_analytics.rs | 1 + 2 files changed, 13 insertions(+) diff --git a/crates/meilisearch/src/routes/indexes/search_analytics.rs b/crates/meilisearch/src/routes/indexes/search_analytics.rs index b16e2636e..07f79eba7 100644 --- a/crates/meilisearch/src/routes/indexes/search_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/search_analytics.rs @@ -61,6 +61,8 @@ pub struct SearchAggregator { semantic_ratio: bool, hybrid: bool, retrieve_vectors: bool, + // Number of requests containing `media` + total_media: usize, // every time a search is done, we increment the counter linked to the used settings matching_strategy: HashMap, @@ -101,6 +103,7 @@ impl SearchAggregator { let SearchQuery { q, vector, + media, offset, limit, page, @@ -175,6 +178,11 @@ impl SearchAggregator { if let Some(ref vector) = vector { ret.max_vector_size = vector.len(); } + + if media.is_some() { + ret.total_media = 1; + } + ret.retrieve_vectors |= retrieve_vectors; if query.is_finite_pagination() { @@ -277,6 +285,7 @@ impl Aggregate for SearchAggregator { show_ranking_score_details, semantic_ratio, hybrid, + total_media, total_degraded, total_used_negative_operator, ranking_score_threshold, @@ -327,6 +336,7 @@ impl Aggregate for SearchAggregator { self.retrieve_vectors |= retrieve_vectors; self.semantic_ratio |= semantic_ratio; self.hybrid |= hybrid; + self.total_media += total_media; // pagination self.max_limit = self.max_limit.max(max_limit); @@ -403,6 +413,7 @@ impl Aggregate for SearchAggregator { show_ranking_score_details, semantic_ratio, hybrid, + total_media, total_degraded, total_used_negative_operator, ranking_score_threshold, @@ -450,6 +461,7 @@ impl Aggregate for SearchAggregator { "hybrid": { "enabled": hybrid, "semantic_ratio": semantic_ratio, + "total_media": total_media, }, "pagination": { "max_limit": max_limit, diff --git a/crates/meilisearch/src/routes/multi_search_analytics.rs b/crates/meilisearch/src/routes/multi_search_analytics.rs index 
3fa23f630..c24875797 100644 --- a/crates/meilisearch/src/routes/multi_search_analytics.rs +++ b/crates/meilisearch/src/routes/multi_search_analytics.rs @@ -42,6 +42,7 @@ impl MultiSearchAggregator { federation_options, q: _, vector: _, + media: _, offset: _, limit: _, page: _, From 11e7c0d75f53e8b2b798194daf38fa12d94e6a5a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:09:48 +0200 Subject: [PATCH 110/150] Fix tests --- crates/index-scheduler/src/scheduler/test.rs | 15 +- .../src/scheduler/test_embedders.rs | 215 ++++++++++++------ crates/meilisearch/tests/search/hybrid.rs | 2 +- 3 files changed, 155 insertions(+), 77 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index 2c492525f..e9f21dfe4 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -690,11 +690,20 @@ fn test_settings_update() { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); - let configs = index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap(); + let embedders = index.embedding_configs(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); + let IndexEmbeddingConfig { name, config, fragments } = configs.first().unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"default"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(config.embedder_options); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); } #[test] diff --git a/crates/index-scheduler/src/scheduler/test_embedders.rs b/crates/index-scheduler/src/scheduler/test_embedders.rs index 305894d0a..a9b920bd2 100644 --- a/crates/index-scheduler/src/scheduler/test_embedders.rs +++ b/crates/index-scheduler/src/scheduler/test_embedders.rs @@ -3,13 +3,14 @@ use std::collections::BTreeMap; use big_s::S; use insta::assert_json_snapshot; use meili_snap::{json_string, snapshot}; -use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; +use meilisearch_types::milli::vector::SearchQuery; use meilisearch_types::milli::{self, obkv_to_json}; use meilisearch_types::settings::{SettingEmbeddingSettings, Settings, Unchecked}; use meilisearch_types::tasks::KindWithContent; use milli::update::IndexDocumentsMethod::*; +use milli::vector::db::IndexEmbeddingConfig; use crate::insta_snapshot::snapshot_index_scheduler; use crate::test_utils::read_json; @@ -85,28 +86,51 @@ fn import_vectors() { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); - let configs = index.embedding_configs(&rtxn).unwrap(); + let embedders = index.embedding_configs(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } = + let IndexEmbeddingConfig { name, config: fakerest_config, fragments } = configs.get(0).unwrap(); + let info = embedders.embedder_info(&rtxn, 
name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } = + let IndexEmbeddingConfig { name, config: simple_hf_config, fragments } = configs.get(1).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"1"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); let configs = index_scheduler.embedders("doggos".to_string(), configs).unwrap(); - let (hf_embedder, _, _) = configs.get(&simple_hf_name).unwrap(); - let beagle_embed = hf_embedder.embed_search("Intel the beagle best doggo", None).unwrap(); - let lab_embed = hf_embedder.embed_search("Max the lab best doggo", None).unwrap(); - let patou_embed = hf_embedder.embed_search("kefir the patou best doggo", None).unwrap(); + let hf_runtime = configs.get(&simple_hf_name).unwrap(); + let hf_embedder = &hf_runtime.embedder; + let beagle_embed = hf_embedder + .embed_search(SearchQuery::Text("Intel the beagle best doggo"), None) + .unwrap(); + let lab_embed = + hf_embedder.embed_search(SearchQuery::Text("Max the lab best doggo"), None).unwrap(); + let patou_embed = hf_embedder + .embed_search(SearchQuery::Text("kefir the patou best doggo"), None) + .unwrap(); (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) }; @@ -166,22 +190,38 @@ fn import_vectors() { let rtxn = index.read_txn().unwrap(); // Ensure the document have been inserted into the relevant bitamp - let configs = index.embedding_configs(&rtxn).unwrap(); + let embedders = index.embedding_configs(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = - configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, fragments } = configs.get(0).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0]>"); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); - let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, 
fragments } = configs.get(1).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"1"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); let embeddings = index.embeddings(&rtxn, 0).unwrap(); - assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); - assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == lab_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -239,25 +279,41 @@ fn import_vectors() { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + let embedders = index.embedding_configs(); // Ensure the document have been inserted into the relevant bitamp - let configs = index.embedding_configs(&rtxn).unwrap(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = - configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, fragments } = configs.get(0).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0]>"); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); - let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, fragments } = configs.get(1).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"1"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); let embeddings = index.embeddings(&rtxn, 0).unwrap(); // automatically changed to patou because set to regenerate - assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == patou_embed, @"true"); // remained beagle - assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -400,7 +456,7 @@ fn 
import_vectors_first_and_embedder_later() { // the all the vectors linked to the new specified embedder have been removed // Only the unknown embedders stays in the document DB snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1,2,3]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4,5]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); - let conf = index.embedding_configs(&rtxn).unwrap(); + let conf = index.embedding_configs().embedding_configs(&rtxn).unwrap(); // even though we specified the vector for the ID 3, it shouldn't be marked // as user provided since we explicitely marked it as NOT user provided. snapshot!(format!("{conf:#?}"), @r###" @@ -426,19 +482,28 @@ fn import_vectors_first_and_embedder_later() { }, quantized: None, }, - user_provided: RoaringBitmap<[1, 2]>, + fragments: FragmentConfigs( + [], + ), }, ] "###); + let info = + index.embedding_configs().embedder_info(&rtxn, "my_doggo_embedder").unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[1, 2, 3]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[1, 2]>"); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; + let (embedding, _) = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty(), "{embedding:?}"); // the document with the id 3 should keep its original embedding let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embeddings = &embeddings["my_doggo_embedder"]; + let (embeddings, _) = &embeddings["my_doggo_embedder"]; snapshot!(embeddings.len(), @"1"); assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); @@ -493,7 +558,7 @@ fn import_vectors_first_and_embedder_later() { "###); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; + let (embedding, _) = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); @@ -501,7 +566,7 @@ fn import_vectors_first_and_embedder_later() { // the document with the id 4 should generate an embedding let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; + let (embedding, _) = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); } @@ -603,33 +668,35 @@ fn delete_document_containing_vector() { .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); - let conf = index.embedding_configs(&rtxn).unwrap(); + let conf = index.embedding_configs().embedding_configs(&rtxn).unwrap(); snapshot!(format!("{conf:#?}"), @r###" - [ - IndexEmbeddingConfig { - name: "manual", - config: EmbeddingConfig { - embedder_options: UserProvided( - EmbedderOptions { - dimensions: 3, - distribution: None, - }, - ), - prompt: PromptData { - template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", - 
max_bytes: Some( - 400, - ), + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, }, - quantized: None, + ), + prompt: PromptData { + template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", + max_bytes: Some( + 400, + ), }, - user_provided: RoaringBitmap<[0]>, + quantized: None, }, - ] - "###); + fragments: FragmentConfigs( + [], + ), + }, + ] + "###); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["manual"]; + let (embedding, _) = &embeddings["manual"]; assert!(!embedding.is_empty(), "{embedding:?}"); index_scheduler @@ -647,30 +714,32 @@ fn delete_document_containing_vector() { .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); - let conf = index.embedding_configs(&rtxn).unwrap(); + let conf = index.embedding_configs().embedding_configs(&rtxn).unwrap(); snapshot!(format!("{conf:#?}"), @r###" - [ - IndexEmbeddingConfig { - name: "manual", - config: EmbeddingConfig { - embedder_options: UserProvided( - EmbedderOptions { - dimensions: 3, - distribution: None, - }, - ), - prompt: PromptData { - template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", - max_bytes: Some( - 400, - ), + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, }, - quantized: None, + ), + prompt: PromptData { + template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", + max_bytes: Some( + 400, + ), }, - user_provided: RoaringBitmap<[]>, + quantized: None, }, - ] - "###); + fragments: FragmentConfigs( + [], + ), + }, + ] + "###); } #[test] diff --git a/crates/meilisearch/tests/search/hybrid.rs b/crates/meilisearch/tests/search/hybrid.rs index be2a724b0..d95e6fb64 100644 --- a/crates/meilisearch/tests/search/hybrid.rs +++ b/crates/meilisearch/tests/search/hybrid.rs @@ -499,7 +499,7 @@ async fn query_combination() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Invalid request: missing `hybrid` parameter when `vector` is present.", + "message": "Invalid request: missing `hybrid` parameter when `vector` or `media` are present.", "code": "missing_search_hybrid", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#missing_search_hybrid" From e54fc592485b19dbfb8f647b542cc5738a8057bf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:47:15 +0200 Subject: [PATCH 111/150] Fix snaps --- ...r__scheduler__test__settings_update-7.snap | 17 ++++++++++++++++ ...er__test_embedders__import_vectors-14.snap | 12 +++++++++++ ...er__test_embedders__import_vectors-27.snap | 15 ++++++++++++++ ...er__test_embedders__import_vectors-40.snap | 15 ++++++++++++++ ...ler__test_embedders__import_vectors-8.snap | 15 +++++++++----- .../after_registering_settings_task.snap | 2 +- .../settings_update_processed.snap | 2 +- .../Intel to kefir succeeds.snap | 2 +- .../import_vectors/Intel to kefir.snap | 2 +- .../import_vectors/adding Intel succeeds.snap | 2 +- 
.../import_vectors/after adding Intel.snap | 2 +- ...ter_registering_settings_task_vectors.snap | 2 +- .../settings_update_processed_vectors.snap | 2 +- crates/meilisearch/tests/dumps/mod.rs | 9 ++++++--- crates/meilisearch/tests/features/mod.rs | 20 ++++++++++++------- 15 files changed, 96 insertions(+), 23 deletions(-) create mode 100644 crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap create mode 100644 crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap create mode 100644 crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap create mode 100644 crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap new file mode 100644 index 000000000..82134b838 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap @@ -0,0 +1,17 @@ +--- +source: crates/index-scheduler/src/scheduler/test.rs +expression: config.embedder_options +--- +{ + "Rest": { + "api_key": "My super secret", + "distribution": null, + "dimensions": 4, + "url": "http://localhost:7777", + "request": "{{text}}", + "search_fragments": {}, + "indexing_fragments": {}, + "response": "{{embedding}}", + "headers": {} + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap new file mode 100644 index 000000000..19b5cab92 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap @@ -0,0 +1,12 @@ +--- +source: crates/index-scheduler/src/scheduler/test_embedders.rs +expression: simple_hf_config.embedder_options +--- +{ + "HuggingFace": { + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "distribution": null, + "pooling": "useModel" + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap new file mode 100644 index 000000000..0fc8bd531 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap @@ -0,0 +1,15 @@ +--- +source: crates/index-scheduler/src/scheduler/test_embedders.rs +expression: doc +--- +{ + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + "noise": [ + 0.1, + 0.2, + 0.3 + ] + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap new file mode 100644 index 000000000..0942e4d82 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap @@ -0,0 +1,15 @@ +--- +source: crates/index-scheduler/src/scheduler/test_embedders.rs +expression: doc +--- +{ + "doggo": 
"kefir", + "breed": "patou", + "_vectors": { + "noise": [ + 0.1, + 0.2, + 0.3 + ] + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap index 19b5cab92..29f35d9c1 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap @@ -1,12 +1,17 @@ --- source: crates/index-scheduler/src/scheduler/test_embedders.rs -expression: simple_hf_config.embedder_options +expression: fakerest_config.embedder_options --- { - "HuggingFace": { - "model": "sentence-transformers/all-MiniLM-L6-v2", - "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "Rest": { + "api_key": "My super secret", "distribution": null, - "pooling": "useModel" + "dimensions": 384, + "url": "http://localhost:7777", + "request": "{{text}}", + "search_fragments": {}, + "indexing_fragments": {}, + "response": "{{embedding}}", + "headers": {} } } diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap index c66a6b5b3..a52f18079 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: 
Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap index b7faefa8a..b99e15852 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test.rs [] 
---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { 
displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap index c8955e2b6..12e03a28b 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { 
displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: 
NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap index 23e43860f..2ea2ebb17 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, 
document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), 
"B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap index 732527fa8..a2a263b6f 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, 
status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, 
kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap index 5e01ffcdf..29fc6abf4 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), 
model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: 
NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap index 1172d1118..ae943bf48 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, 
separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: 
Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap index 3653eeb9a..9ada7580a 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: 
NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { 
displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index 3d3bc01db..9b111186d 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -2188,7 +2188,8 @@ async fn import_dump_v6_containing_experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -2314,7 +2315,8 @@ async fn import_dump_v6_containing_batches_and_enqueued_tasks() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -2420,7 +2422,8 @@ async fn generate_and_import_dump_containing_vectors() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); diff --git a/crates/meilisearch/tests/features/mod.rs b/crates/meilisearch/tests/features/mod.rs index d0d457d3e..ec5838d35 100644 --- a/crates/meilisearch/tests/features/mod.rs +++ b/crates/meilisearch/tests/features/mod.rs @@ -25,7 +25,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -41,7 +42,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -57,7 +59,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": 
false + "chatCompletions": false, + "multimodal": false } "###); @@ -74,7 +77,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -91,7 +95,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); } @@ -115,7 +120,8 @@ async fn experimental_feature_metrics() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -162,7 +168,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`, `compositeEmbedders`, `chatCompletions`", + "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`, `compositeEmbedders`, `chatCompletions`, `multimodal`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" From c1a132fa068e252d2554cd5acab489e9eea804b2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 13:54:04 +0200 Subject: [PATCH 112/150] `multimodal` experimental feature --- crates/index-scheduler/src/features.rs | 13 +++++++++++++ crates/meilisearch-types/src/features.rs | 1 + .../meilisearch/src/analytics/segment_analytics.rs | 3 +++ crates/meilisearch/src/routes/features.rs | 11 +++++++++++ crates/meilisearch/src/routes/indexes/settings.rs | 8 ++++++++ crates/meilisearch/src/search/mod.rs | 3 +++ 6 files changed, 39 insertions(+) diff --git a/crates/index-scheduler/src/features.rs b/crates/index-scheduler/src/features.rs index 78ffc0766..b52a659a6 100644 --- a/crates/index-scheduler/src/features.rs +++ b/crates/index-scheduler/src/features.rs @@ -144,6 +144,19 @@ impl RoFeatures { .into()) } } + + pub fn check_multimodal(&self, disabled_action: &'static str) -> Result<()> { + if self.runtime.multimodal { + Ok(()) + } else { + Err(FeatureNotEnabledError { + disabled_action, + feature: "multimodal", + issue_link: "https://github.com/orgs/meilisearch/discussions/846", + } + .into()) + } + } } impl FeatureData { diff --git a/crates/meilisearch-types/src/features.rs b/crates/meilisearch-types/src/features.rs index 9ec2d321f..3c78035e8 100644 --- a/crates/meilisearch-types/src/features.rs +++ b/crates/meilisearch-types/src/features.rs @@ -21,6 +21,7 @@ pub struct RuntimeTogglableFeatures { pub get_task_documents_route: bool, pub composite_embedders: bool, pub chat_completions: bool, + pub multimodal: bool, } #[derive(Default, Debug, Clone, Copy)] diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index 668a7fded..0abc5c817 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -197,6 +197,7 @@ struct Infos { experimental_max_number_of_batched_tasks: usize, experimental_limit_batched_tasks_total_size: u64, experimental_network: bool, + experimental_multimodal: bool, experimental_chat_completions: bool, 
experimental_get_task_documents_route: bool, experimental_composite_embedders: bool, @@ -303,6 +304,7 @@ impl Infos { get_task_documents_route, composite_embedders, chat_completions, + multimodal, } = features; // We're going to override every sensible information. @@ -322,6 +324,7 @@ impl Infos { experimental_reduce_indexing_memory_usage, experimental_network: network, experimental_chat_completions: chat_completions, + experimental_multimodal: multimodal, experimental_get_task_documents_route: get_task_documents_route, experimental_composite_embedders: composite_embedders, experimental_embedding_cache_entries, diff --git a/crates/meilisearch/src/routes/features.rs b/crates/meilisearch/src/routes/features.rs index 179b9cf68..1a1f89b2d 100644 --- a/crates/meilisearch/src/routes/features.rs +++ b/crates/meilisearch/src/routes/features.rs @@ -54,6 +54,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { get_task_documents_route: Some(false), composite_embedders: Some(false), chat_completions: Some(false), + multimodal: Some(false), })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { @@ -100,6 +101,8 @@ pub struct RuntimeTogglableFeatures { pub composite_embedders: Option, #[deserr(default)] pub chat_completions: Option, + #[deserr(default)] + pub multimodal: Option, } impl From for RuntimeTogglableFeatures { @@ -113,6 +116,7 @@ impl From for RuntimeTogg get_task_documents_route, composite_embedders, chat_completions, + multimodal, } = value; Self { @@ -124,6 +128,7 @@ impl From for RuntimeTogg get_task_documents_route: Some(get_task_documents_route), composite_embedders: Some(composite_embedders), chat_completions: Some(chat_completions), + multimodal: Some(multimodal), } } } @@ -138,6 +143,7 @@ pub struct PatchExperimentalFeatureAnalytics { get_task_documents_route: bool, composite_embedders: bool, chat_completions: bool, + multimodal: bool, } impl Aggregate for PatchExperimentalFeatureAnalytics { @@ -155,6 +161,7 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { get_task_documents_route: new.get_task_documents_route, composite_embedders: new.composite_embedders, chat_completions: new.chat_completions, + multimodal: new.multimodal, }) } @@ -181,6 +188,7 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { get_task_documents_route: Some(false), composite_embedders: Some(false), chat_completions: Some(false), + multimodal: Some(false), })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { @@ -223,6 +231,7 @@ async fn patch_features( .composite_embedders .unwrap_or(old_features.composite_embedders), chat_completions: new_features.0.chat_completions.unwrap_or(old_features.chat_completions), + multimodal: new_features.0.multimodal.unwrap_or(old_features.multimodal), }; // explicitly destructure for analytics rather than using the `Serialize` implementation, because @@ -237,6 +246,7 @@ async fn patch_features( get_task_documents_route, composite_embedders, chat_completions, + multimodal, } = new_features; analytics.publish( @@ -249,6 +259,7 @@ async fn patch_features( get_task_documents_route, composite_embedders, chat_completions, + multimodal, }, &req, ); diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index a4b7a5219..308977a6e 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ 
b/crates/meilisearch/src/routes/indexes/settings.rs @@ -755,6 +755,14 @@ fn validate_settings( if matches!(embedder.indexing_embedder, Setting::Set(_)) { features.check_composite_embedders("setting `indexingEmbedder`")?; } + + if matches!(embedder.indexing_fragments, Setting::Set(_)) { + features.check_multimodal("setting `indexingFragments`")?; + } + + if matches!(embedder.search_fragments, Setting::Set(_)) { + features.check_multimodal("setting `searchFragments`")?; + } } } diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 6d8639504..1c987a70c 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -972,6 +972,9 @@ pub fn prepare_search<'t>( time_budget: TimeBudget, features: RoFeatures, ) -> Result<(milli::Search<'t>, bool, usize, usize), ResponseError> { + if query.media.is_some() { + features.check_multimodal("passing `media` in a search query")?; + } let mut search = index.search(rtxn); search.time_budget(time_budget); if let Some(ranking_score_threshold) = query.ranking_score_threshold { From e30c24b5bfa6aa8e1782cfe9043c50b80f403222 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 1 Jul 2025 23:52:44 +0200 Subject: [PATCH 113/150] Prompt: relax lifetime constraints --- crates/milli/src/prompt/document.rs | 11 ++++++----- crates/milli/src/prompt/fields.rs | 24 ++++++++++++------------ crates/milli/src/prompt/mod.rs | 4 ++-- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/crates/milli/src/prompt/document.rs b/crates/milli/src/prompt/document.rs index b00c4cb42..1125c8fba 100644 --- a/crates/milli/src/prompt/document.rs +++ b/crates/milli/src/prompt/document.rs @@ -144,18 +144,19 @@ impl ValueView for Document<'_> { use crate::update::new::document::Document as DocumentTrait; #[derive(Debug)] -pub struct ParseableDocument<'doc, D> { +pub struct ParseableDocument<'a, 'doc, D: DocumentTrait<'a> + Debug> { document: D, doc_alloc: &'doc Bump, + _marker: std::marker::PhantomData<&'a ()>, } -impl<'doc, D> ParseableDocument<'doc, D> { +impl<'a, 'doc, D: DocumentTrait<'a> + Debug> ParseableDocument<'a, 'doc, D> { pub fn new(document: D, doc_alloc: &'doc Bump) -> Self { - Self { document, doc_alloc } + Self { document, doc_alloc, _marker: std::marker::PhantomData } } } -impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc, D> { +impl<'a, D: DocumentTrait<'a> + Debug> ObjectView for ParseableDocument<'a, '_, D> { fn as_value(&self) -> &dyn ValueView { self } @@ -195,7 +196,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc } } -impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> { +impl<'a, D: DocumentTrait<'a> + Debug> ValueView for ParseableDocument<'a, '_, D> { fn as_debug(&self) -> &dyn Debug { self } diff --git a/crates/milli/src/prompt/fields.rs b/crates/milli/src/prompt/fields.rs index 8d006f0b7..5a842268c 100644 --- a/crates/milli/src/prompt/fields.rs +++ b/crates/milli/src/prompt/fields.rs @@ -121,10 +121,10 @@ impl ObjectView for FieldValue<'_, D> { pub struct OwnedFields<'a, D: ObjectView>(Vec>); #[derive(Debug)] -pub struct BorrowedFields<'a, 'map, D: ObjectView> { +pub struct BorrowedFields<'a, 'doc, 'map, D: ObjectView> { document: &'a D, field_id_map: &'a RefCell>, - doc_alloc: &'a Bump, + doc_alloc: &'doc Bump, } impl<'a, D: ObjectView> OwnedFields<'a, D> { @@ -138,11 +138,11 @@ impl<'a, D: ObjectView> OwnedFields<'a, D> { } } -impl<'a, 'map, D: ObjectView> BorrowedFields<'a, 
'map, D> { +impl<'a, 'doc, 'map, D: ObjectView> BorrowedFields<'a, 'doc, 'map, D> { pub fn new( document: &'a D, field_id_map: &'a RefCell>, - doc_alloc: &'a Bump, + doc_alloc: &'doc Bump, ) -> Self { Self { document, field_id_map, doc_alloc } } @@ -170,7 +170,7 @@ impl ArrayView for OwnedFields<'_, D> { } } -impl ArrayView for BorrowedFields<'_, '_, D> { +impl ArrayView for BorrowedFields<'_, '_, '_, D> { fn as_value(&self) -> &dyn ValueView { self } @@ -212,7 +212,7 @@ impl ArrayView for BorrowedFields<'_, '_, D> { } } -impl ValueView for BorrowedFields<'_, '_, D> { +impl ValueView for BorrowedFields<'_, '_, '_, D> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } @@ -288,11 +288,11 @@ impl ValueView for OwnedFields<'_, D> { } } -struct ArraySource<'a, 'map, D: ObjectView> { - s: &'a BorrowedFields<'a, 'map, D>, +struct ArraySource<'a, 'doc, 'map, D: ObjectView> { + s: &'a BorrowedFields<'a, 'doc, 'map, D>, } -impl fmt::Display for ArraySource<'_, '_, D> { +impl fmt::Display for ArraySource<'_, '_, '_, D> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "[")?; for item in self.s.values() { @@ -303,11 +303,11 @@ impl fmt::Display for ArraySource<'_, '_, D> { } } -struct ArrayRender<'a, 'map, D: ObjectView> { - s: &'a BorrowedFields<'a, 'map, D>, +struct ArrayRender<'a, 'doc, 'map, D: ObjectView> { + s: &'a BorrowedFields<'a, 'doc, 'map, D>, } -impl fmt::Display for ArrayRender<'_, '_, D> { +impl fmt::Display for ArrayRender<'_, '_, '_, D> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { for item in self.s.values() { write!(f, "{}", item.render())?; diff --git a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index f1b4ddf89..03b20a090 100644 --- a/crates/milli/src/prompt/mod.rs +++ b/crates/milli/src/prompt/mod.rs @@ -107,8 +107,8 @@ impl Prompt { } pub fn render_document< - 'a, // lifetime of the borrow of the document - 'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents + 'a, // lifetime of the borrow of the document + 'doc, // lifetime of the allocator, will live for an entire chunk of documents >( &self, external_docid: &str, From 9ce5598fef9d966621710192934ebb6cd45bdbd2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 1 Jul 2025 23:55:07 +0200 Subject: [PATCH 114/150] parsed vectors: embeddings is None when read as null from the DB --- crates/milli/src/vector/parsed_vectors.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index 36e80677a..8ff5a2201 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -150,7 +150,8 @@ impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor { regenerate = Some(value); } Ok(Some("embeddings")) => { - let value: &RawValue = match map.next_value() { + let value: &RawValue = match map.next_value::<&RawValue>() { + Ok(value) if value.get() == "null" => continue, Ok(value) => value, Err(error) => { return Ok(Err(RawVectorsError::DeserializeEmbeddings { From b086c51a232dd76406525c7caa128daa9bc5b10d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 1 Jul 2025 23:57:14 +0200 Subject: [PATCH 115/150] new settings indexer --- .../src/update/new/extract/vectors/mod.rs | 294 ++++++++++++------ .../milli/src/update/new/indexer/extract.rs | 25 +- crates/milli/src/update/new/indexer/mod.rs | 67 +++- 3 files changed, 262 insertions(+), 124 deletions(-)
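Note: the core of this commit is that `SettingsChangeEmbeddingExtractor` now borrows a single `SettingsDelta` instead of receiving the old/new embedders, the embedder actions, and the embedder category ids as four separate arguments. Below is a minimal sketch of that shape, with placeholder types standing in for the real milli ones and illustrative signatures (the actual trait, shown in a later patch, carries more methods):

use std::collections::BTreeMap;

// Stand-ins for crate::vector::settings::EmbedderAction and
// crate::vector::RuntimeEmbedders; the real types carry much more state.
struct EmbedderAction;
struct RuntimeEmbedders;

// One object answers every question about the settings change, so the
// extractor no longer threads the individual pieces around by hand.
trait SettingsDelta {
    fn new_embedders(&self) -> &RuntimeEmbedders;
    fn old_embedders(&self) -> &RuntimeEmbedders;
    fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction>;
}

// The extractor keeps a single borrow of the delta; the sender, stats,
// and thread pool fields are elided here but unchanged in the real code.
struct SettingsChangeEmbeddingExtractor<'a, SD: SettingsDelta> {
    settings_delta: &'a SD,
}

impl<'a, SD: SettingsDelta> SettingsChangeEmbeddingExtractor<'a, SD> {
    fn new(settings_delta: &'a SD) -> Self {
        Self { settings_delta }
    }
}

Keeping one source of truth for the diff also lets the fragment-regeneration logic and the embedding extractor agree on which embedders changed.

diff --git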
a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 3b8f5fa58..c08fadb14 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -1,5 +1,4 @@ use std::cell::RefCell; -use std::collections::BTreeMap; use std::fmt::Debug; use bumpalo::collections::Vec as BVec; @@ -16,15 +15,17 @@ use crate::update::new::indexer::settings_changes::SettingsChangeExtractor; use crate::update::new::thread_local::MostlySend; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; +use crate::update::settings::SettingsDelta; use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta}; use crate::vector::error::{ EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistributionBump, }; use crate::vector::extractor::{ - DocumentTemplateExtractor, Extractor as VectorExtractor, RequestFragmentExtractor, + DocumentTemplateExtractor, Extractor as VectorExtractor, ExtractorDiff, + RequestFragmentExtractor, }; use crate::vector::session::{EmbedSession, Input, Metadata, OnEmbed}; -use crate::vector::settings::{EmbedderAction, ReindexAction}; +use crate::vector::settings::ReindexAction; use crate::vector::{Embedding, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment}; use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; @@ -260,44 +261,31 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { } } -pub struct SettingsChangeEmbeddingExtractor<'a, 'b> { - embedders: &'a EmbeddingConfigs, - old_embedders: &'a EmbeddingConfigs, - embedder_actions: &'a BTreeMap, - embedder_category_id: &'a std::collections::HashMap, +pub struct SettingsChangeEmbeddingExtractor<'a, 'b, SD> { + settings_delta: &'a SD, embedder_stats: &'a EmbedderStats, sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, } -impl<'a, 'b> SettingsChangeEmbeddingExtractor<'a, 'b> { +impl<'a, 'b, SD: SettingsDelta> SettingsChangeEmbeddingExtractor<'a, 'b, SD> { #[allow(clippy::too_many_arguments)] pub fn new( - embedders: &'a EmbeddingConfigs, - old_embedders: &'a EmbeddingConfigs, - embedder_actions: &'a BTreeMap, - embedder_category_id: &'a std::collections::HashMap, + settings_delta: &'a SD, embedder_stats: &'a EmbedderStats, sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, threads: &'a ThreadPoolNoAbort, ) -> Self { let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution); - Self { - embedders, - old_embedders, - embedder_actions, - embedder_category_id, - embedder_stats, - sender, - threads, - possible_embedding_mistakes, - } + Self { settings_delta, embedder_stats, sender, threads, possible_embedding_mistakes } } } -impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbeddingExtractor<'_, '_> { +impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor> + for SettingsChangeEmbeddingExtractor<'_, '_, SD> +{ type Data = RefCell>; fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result { @@ -309,44 +297,49 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding documents: impl Iterator>>, context: &'doc DocumentContext, ) -> crate::Result<()> { - let embedders = self.embedders.inner_as_ref(); - let old_embedders = self.old_embedders.inner_as_ref(); + let embedders = self.settings_delta.new_embedders(); 
+ let old_embedders = self.settings_delta.old_embedders(); let unused_vectors_distribution = UnusedVectorsDistributionBump::new_in(&context.doc_alloc); let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); - for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { - // if the embedder is not in the embedder_actions, we don't need to reindex. - if let Some((embedder_id, reindex_action)) = - self.embedder_actions - .get(embedder_name) - // keep only the reindex actions - .and_then(EmbedderAction::reindex) - // map the reindex action to the embedder_id - .map(|reindex| { - let embedder_id = self.embedder_category_id.get(embedder_name).expect( - "An embedder_category_id must exist for all reindexed embedders", - ); - (*embedder_id, reindex) - }) - { - all_chunks.push(( - Chunks::new( - embedder, - embedder_id, - embedder_name, - prompt, - context.data, - &self.possible_embedding_mistakes, - self.embedder_stats, - self.threads, - self.sender, - &context.doc_alloc, - ), - reindex_action, - )) - } + let embedder_configs = context.index.embedding_configs(); + for (embedder_name, action) in self.settings_delta.embedder_actions().iter() { + let Some(reindex_action) = action.reindex() else { + continue; + }; + let runtime = embedders + .get(embedder_name) + .expect("A runtime must exist for all reindexed embedder"); + let embedder_info = embedder_configs + .embedder_info(&context.rtxn, embedder_name)? + .unwrap_or_else(|| { + // new embedder + EmbedderInfo { + embedder_id: *self + .settings_delta + .new_embedder_category_id() + .get(embedder_name) + .expect( + "An embedder_category_id must exist for all reindexed embedders", + ), + embedding_status: EmbeddingStatus::new(), + } + }); + all_chunks.push(( + Chunks::new( + runtime, + embedder_info, + embedder_name.as_str(), + context.data, + &self.possible_embedding_mistakes, + self.embedder_stats, + self.threads, + self.sender, + &context.doc_alloc, + ), + reindex_action, + )); } - for document in documents { let document = document?; @@ -360,6 +353,16 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding for (chunks, reindex_action) in &mut all_chunks { let embedder_name = chunks.embedder_name(); let current_vectors = current_vectors.vectors_for_key(embedder_name)?; + let (old_is_user_provided, _) = + chunks.is_user_provided_must_regenerate(document.docid()); + let old_has_fragments = old_embedders + .get(embedder_name) + .map(|embedder| embedder.fragments().is_empty()) + .unwrap_or_default(); + + let new_has_fragments = chunks.has_fragments(); + + let fragments_changed = old_has_fragments ^ new_has_fragments; // if the vectors for this document have been already provided, we don't need to reindex. 
let (is_new_embedder, must_regenerate) = @@ -368,60 +371,33 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding }); match reindex_action { - ReindexAction::RegeneratePrompts => { + ReindexAction::RegeneratePrompts | ReindexAction::RegenerateFragments(_) => { if !must_regenerate { continue; } // we need to regenerate the prompts for the document - - // Get the old prompt and render the document with it - let Some((_, old_prompt, _)) = old_embedders.get(embedder_name) else { - unreachable!("ReindexAction::RegeneratePrompts implies that the embedder {embedder_name} is in the old_embedders") - }; - let old_rendered = old_prompt.render_document( + chunks.settings_change_autogenerated( + document.docid(), document.external_document_id(), document.current( &context.rtxn, context.index, context.db_fields_ids_map, )?, + self.settings_delta, context.new_fields_ids_map, - &context.doc_alloc, + &unused_vectors_distribution, + old_is_user_provided, + fragments_changed, )?; - - // Get the new prompt and render the document with it - let new_prompt = chunks.prompt(); - let new_rendered = new_prompt.render_document( - document.external_document_id(), - document.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - - // Compare the rendered documents - // if they are different, regenerate the vectors - if new_rendered != old_rendered { - chunks.set_autogenerated( - document.docid(), - document.external_document_id(), - new_rendered, - &unused_vectors_distribution, - )?; - } } ReindexAction::FullReindex => { - let prompt = chunks.prompt(); // if no inserted vectors, then regenerate: true + no embeddings => autogenerate if let Some(embeddings) = current_vectors .and_then(|vectors| vectors.embeddings) // insert the embeddings only for new embedders .filter(|_| is_new_embedder) { - chunks.set_regenerate(document.docid(), must_regenerate); chunks.set_vectors( document.external_document_id(), document.docid(), @@ -431,24 +407,27 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding error: error.to_string(), }, )?, + old_is_user_provided, + true, + must_regenerate, )?; } else if must_regenerate { - let rendered = prompt.render_document( + chunks.settings_change_autogenerated( + document.docid(), document.external_document_id(), document.current( &context.rtxn, context.index, context.db_fields_ids_map, )?, + self.settings_delta, context.new_fields_ids_map, - &context.doc_alloc, - )?; - chunks.set_autogenerated( - document.docid(), - document.external_document_id(), - rendered, &unused_vectors_distribution, + old_is_user_provided, + true, )?; + } else if is_new_embedder { + chunks.set_status(document.docid(), false, true, false, false); } } } @@ -585,7 +564,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { let embedder = &runtime.embedder; let dimensions = embedder.dimensions(); - let fragments = runtime.fragments.as_slice(); + let fragments = runtime.fragments(); let kind = if fragments.is_empty() { ChunkType::DocumentTemplate { document_template: &runtime.document_template, @@ -627,6 +606,117 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { self.status.is_user_provided_must_regenerate(docid) } + #[allow(clippy::too_many_arguments)] + pub fn settings_change_autogenerated<'doc, D: Document<'doc> + Debug, SD: SettingsDelta>( + &mut self, + docid: DocumentId, + external_docid: &'a str, + document: D, + settings_delta: &SD, + fields_ids_map: &'a RefCell, + 
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, + old_is_user_provided: bool, + full_reindex: bool, + ) -> Result<()> + where + 'a: 'doc, + { + match &mut self.kind { + ChunkType::Fragments { fragments: _, session } => { + let doc_alloc = session.doc_alloc(); + + if old_is_user_provided | full_reindex { + session.on_embed_mut().clear_vectors(docid); + } + + let mut extracted = false; + let extracted = &mut extracted; + + settings_delta.try_for_each_fragment_diff( + session.embedder_name(), + |fragment_diff| { + let extractor = RequestFragmentExtractor::new(fragment_diff.new, doc_alloc) + .ignore_errors(); + let old = if full_reindex { + None + } else { + fragment_diff.old.map(|old| { + RequestFragmentExtractor::new(old, doc_alloc).ignore_errors() + }) + }; + let metadata = Metadata { + docid, + external_docid, + extractor_id: extractor.extractor_id(), + }; + + match extractor.diff_settings(&document, &(), old.as_ref())? { + ExtractorDiff::Removed => { + OnEmbed::process_embedding_response( + session.on_embed_mut(), + crate::vector::session::EmbeddingResponse { + metadata, + embedding: None, + }, + ); + } + ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { + *extracted = true; + session.request_embedding( + metadata, + input, + unused_vectors_distribution, + )?; + } + ExtractorDiff::Unchanged => { /* nothing to do */ } + } + + Result::Ok(()) + }, + )?; + self.set_status( + docid, + old_is_user_provided, + true, + old_is_user_provided & !*extracted, + true, + ); + } + ChunkType::DocumentTemplate { document_template, session } => { + let doc_alloc = session.doc_alloc(); + + let old_embedder = settings_delta.old_embedders().get(session.embedder_name()); + let old_document_template = if full_reindex { + None + } else { + old_embedder.as_ref().map(|old_embedder| &old_embedder.document_template) + }; + let extractor = + DocumentTemplateExtractor::new(document_template, doc_alloc, fields_ids_map); + let old_extractor = old_document_template.map(|old_document_template| { + DocumentTemplateExtractor::new(old_document_template, doc_alloc, fields_ids_map) + }); + let metadata = + Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; + + match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? { + ExtractorDiff::Removed => { + OnEmbed::process_embedding_response( + session.on_embed_mut(), + crate::vector::session::EmbeddingResponse { metadata, embedding: None }, + ); + } + ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { + session.request_embedding(metadata, input, unused_vectors_distribution)?; + } + ExtractorDiff::Unchanged => { /* do nothing */ } + } + self.set_status(docid, old_is_user_provided, true, false, true); + } + } + Ok(()) + } + #[allow(clippy::too_many_arguments)] pub fn update_autogenerated<'doc, OD: Document<'doc> + Debug, ND: Document<'doc> + Debug>( &mut self, @@ -862,6 +952,10 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { Ok(()) } + + fn has_fragments(&self) -> bool { + matches!(self.kind, ChunkType::Fragments { .. 
}) + } } #[allow(clippy::too_many_arguments)] diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index a3e7842c2..abfb4d6da 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -21,7 +21,7 @@ use crate::update::new::indexer::settings_changes::DocumentsIndentifiers; use crate::update::new::merger::merge_and_send_rtree; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::SettingsDelta; -use crate::vector::db::IndexEmbeddingConfig; +use crate::vector::db::{EmbedderInfo, IndexEmbeddingConfig}; use crate::vector::RuntimeEmbedders; use crate::{Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; @@ -333,12 +333,11 @@ pub(super) fn extract_all_settings_changes( finished_extraction: &AtomicBool, field_distribution: &mut BTreeMap, mut index_embeddings: Vec, - modified_docids: &mut RoaringBitmap, embedder_stats: &EmbedderStats, ) -> Result> where MSP: Fn() -> bool + Sync, - SD: SettingsDelta, + SD: SettingsDelta + Sync, { // Create the list of document ids to extract let rtxn = indexing_context.index.read_txn()?; @@ -369,10 +368,7 @@ where // extract the remaining embeddings let extractor = SettingsChangeEmbeddingExtractor::new( - settings_delta.new_embedders(), - settings_delta.old_embedders(), - settings_delta.embedder_actions(), - settings_delta.new_embedder_category_id(), + settings_delta, embedder_stats, embedding_sender, field_distribution, @@ -396,14 +392,25 @@ where let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors"); let _entered = span.enter(); + let embedder_configs = indexing_context.index.embedding_configs(); for config in &mut index_embeddings { + // retrieve infos for existing embedder or create a fresh one + let mut infos = + embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap_or_else(|| { + let embedder_id = + *settings_delta.new_embedder_category_id().get(&config.name).unwrap(); + EmbedderInfo { embedder_id, embedding_status: Default::default() } + }); + 'data: for data in datastore.iter_mut() { let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { + let Some(delta) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided, modified_docids); + delta.apply_to(&mut infos.embedding_status); } + + extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap(); } } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 507d1a650..a6ba3a919 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -23,7 +23,7 @@ use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::progress::{EmbedderStats, Progress}; use crate::update::settings::SettingsDelta; use crate::update::GrenadParameters; -use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; +use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments}; use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort}; @@ -221,7 +221,7 @@ where MSP: Fn() -> bool + Sync, SD: SettingsDelta + Sync, { - delete_old_embedders(wtxn, index, settings_delta)?; + delete_old_embedders_and_fragments(wtxn, index, settings_delta)?; let mut bbbuffers = 
Vec::new(); let finished_extraction = AtomicBool::new(false); @@ -254,16 +254,14 @@ where grenad_parameters: &grenad_parameters, }; - let index_embeddings = index.embedding_configs(wtxn)?; + let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; - let mut modified_docids = roaring::RoaringBitmap::new(); let congestion = thread::scope(|s| -> Result { let indexer_span = tracing::Span::current(); let finished_extraction = &finished_extraction; // prevent moving the field_distribution and document_ids in the inner closure... let field_distribution = &mut field_distribution; - let modified_docids = &mut modified_docids; let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.install(move || { @@ -276,7 +274,6 @@ where finished_extraction, field_distribution, index_embeddings, - modified_docids, &embedder_stats, ) }) @@ -342,7 +339,7 @@ where fn arroy_writers_from_embedder_actions<'indexer>( index: &Index, embedder_actions: &'indexer BTreeMap, - embedders: &'indexer EmbeddingConfigs, + embedders: &'indexer RuntimeEmbedders, index_embedder_category_ids: &'indexer std::collections::HashMap, ) -> Result> { let vector_arroy = index.vector_arroy; @@ -350,7 +347,7 @@ fn arroy_writers_from_embedder_actions<'indexer>( embedders .inner_as_ref() .iter() - .filter_map(|(embedder_name, (embedder, _, _))| match embedder_actions.get(embedder_name) { + .filter_map(|(embedder_name, runtime)| match embedder_actions.get(embedder_name) { None => None, Some(action) if action.write_back().is_some() => None, Some(action) => { @@ -365,25 +362,65 @@ fn arroy_writers_from_embedder_actions<'indexer>( }; let writer = ArroyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized); - let dimensions = embedder.dimensions(); + let dimensions = runtime.embedder.dimensions(); Some(Ok(( embedder_category_id, - (embedder_name.as_str(), embedder.as_ref(), writer, dimensions), + (embedder_name.as_str(), runtime.embedder.as_ref(), writer, dimensions), ))) } }) .collect() } -fn delete_old_embedders(wtxn: &mut RwTxn<'_>, index: &Index, settings_delta: &SD) -> Result<()> +fn delete_old_embedders_and_fragments( + wtxn: &mut RwTxn<'_>, + index: &Index, + settings_delta: &SD, +) -> Result<()> where SD: SettingsDelta, { for action in settings_delta.embedder_actions().values() { - if let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() { - let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized); - let dimensions = reader.dimensions(wtxn)?; - reader.clear(wtxn, dimensions)?; + let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else { + continue; + }; + let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized); + let Some(dimensions) = reader.dimensions(wtxn)? else { + continue; + }; + reader.clear(wtxn, dimensions)?; + } + + // remove all vectors for the specified fragments + for (embedder_name, RemoveFragments { fragment_ids }, was_quantized) in + settings_delta.embedder_actions().iter().filter_map(|(name, action)| { + action.remove_fragments().map(|fragments| (name, fragments, action.was_quantized)) + }) + { + let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else { + continue; + }; + let arroy = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, was_quantized); + let Some(dimensions) = arroy.dimensions(wtxn)? 
else { + continue; + }; + for fragment_id in fragment_ids { + // we must keep the user provided embeddings that ended up in this store + + if infos.embedding_status.user_provided_docids().is_empty() { + // no user provided: clear store + arroy.clear_store(wtxn, *fragment_id, dimensions)?; + continue; + } + + // some user provided, remove only the ids that are not user provided + let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| { + items - infos.embedding_status.user_provided_docids() + })?; + + for to_delete in to_delete { + arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?; + } } } From e6329e77e1c0c470ed7d8db9bee9f6abc18bb01d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 00:00:39 +0200 Subject: [PATCH 116/150] settings fragment_diffs --- crates/milli/src/update/settings.rs | 93 +++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 3dae4f57c..03d44d785 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1578,6 +1578,7 @@ pub struct InnerIndexSettingsDiff { /// The set of only the additional searchable fields. /// If any other searchable field has been modified, is set to None. pub(crate) only_additional_fields: Option>, + fragment_diffs: BTreeMap, usize)>>, // Cache the check to see if all the stop_words, allowed_separators, dictionary, // exact_attributes, proximity_precision are different. @@ -1695,10 +1696,59 @@ impl InnerIndexSettingsDiff { } } + // build the fragment diffs + let mut fragment_diffs = BTreeMap::new(); + for (embedder_name, embedder_action) in &embedding_config_updates { + let Some(new_embedder) = new_settings.runtime_embedders.get(embedder_name) else { + continue; + }; + let regenerate_fragments = + if let Some(ReindexAction::RegenerateFragments(regenerate_fragments)) = + embedder_action.reindex() + { + either::Either::Left( + regenerate_fragments + .iter() + .filter(|(_, action)| { + !matches!( + action, + crate::vector::settings::RegenerateFragment::Remove + ) + }) + .map(|(name, _)| name), + ) + } else { + either::Either::Right( + new_embedder.fragments().iter().map(|fragment| &fragment.name), + ) + }; + + let old_embedder = old_settings.runtime_embedders.get(embedder_name); + + let mut fragments = Vec::new(); + for fragment_name in regenerate_fragments { + let Ok(new) = new_embedder + .fragments() + .binary_search_by_key(&fragment_name, |fragment| &fragment.name) + else { + continue; + }; + let old = old_embedder.as_ref().and_then(|old_embedder| { + old_embedder + .fragments() + .binary_search_by_key(&fragment_name, |fragment| &fragment.name) + .ok() + }); + fragments.push((old, new)); + } + fragment_diffs.insert(embedder_name.clone(), fragments); + } + InnerIndexSettingsDiff { old: old_settings, new: new_settings, primary_key_id, + fragment_diffs, embedding_config_updates, settings_update_only, only_additional_fields, @@ -2341,9 +2391,21 @@ pub trait SettingsDelta { fn old_embedders(&self) -> &EmbeddingConfigs; fn new_embedder_category_id(&self) -> &HashMap; fn embedder_actions(&self) -> &BTreeMap; + fn try_for_each_fragment_diff( + &self, + embedder_name: &str, + for_each: F, + ) -> std::result::Result<(), E> + where + F: FnMut(FragmentDiff) -> std::result::Result<(), E>; fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata; } +pub struct FragmentDiff<'a> { + pub old: Option<&'a RuntimeFragment>, + pub new: &'a RuntimeFragment, +} + impl SettingsDelta for 
InnerIndexSettingsDiff { fn new_embedders(&self) -> &EmbeddingConfigs { &self.new.embedding_configs @@ -2364,6 +2426,37 @@ impl SettingsDelta for InnerIndexSettingsDiff { fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata { &self.new.fields_ids_map } + + fn try_for_each_fragment_diff( + &self, + embedder_name: &str, + mut for_each: F, + ) -> std::result::Result<(), E> + where + F: FnMut(FragmentDiff) -> std::result::Result<(), E>, + { + let Some(fragment_diff) = self.fragment_diffs.get(embedder_name) else { return Ok(()) }; + for (old, new) in fragment_diff { + let Some(new_runtime) = self.new.runtime_embedders.get(embedder_name) else { + continue; + }; + + let new = new_runtime.fragments().get(*new).unwrap(); + + match old { + Some(old) => { + if let Some(old_runtime) = self.old.runtime_embedders.get(embedder_name) { + let old = &old_runtime.fragments().get(*old).unwrap(); + for_each(FragmentDiff { old: Some(old), new })?; + } else { + for_each(FragmentDiff { old: None, new })?; + } + } + None => for_each(FragmentDiff { old: None, new })?, + }; + } + Ok(()) + } } #[cfg(test)] From 2b2e6c0b3a278827e49de4131bfeeec48d39e7bd Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 00:01:59 +0200 Subject: [PATCH 117/150] Settings changes --- crates/index-scheduler/src/lib.rs | 16 +++--- crates/milli/src/test_index.rs | 4 +- .../extract/extract_vector_points.rs | 50 ++++++++++--------- .../src/update/index_documents/extract/mod.rs | 2 +- .../milli/src/update/index_documents/mod.rs | 2 +- .../src/update/index_documents/typed_chunk.rs | 2 +- crates/milli/src/update/settings.rs | 34 ++++++------- crates/milli/src/vector/mod.rs | 31 ++++++++++-- 8 files changed, 85 insertions(+), 56 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index f551652c1..b2f27d66b 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -882,12 +882,12 @@ impl IndexScheduler { { let embedders = self.embedders.read().unwrap(); if let Some(embedder) = embedders.get(&embedder_options) { - let runtime = Arc::new(RuntimeEmbedder { - embedder: embedder.clone(), + let runtime = Arc::new(RuntimeEmbedder::new( + embedder.clone(), document_template, fragments, - is_quantized: quantized.unwrap_or_default(), - }); + quantized.unwrap_or_default(), + )); return Ok((name, runtime)); } @@ -906,12 +906,12 @@ impl IndexScheduler { embedders.insert(embedder_options, embedder.clone()); } - let runtime = Arc::new(RuntimeEmbedder { - embedder: embedder.clone(), + let runtime = Arc::new(RuntimeEmbedder::new( + embedder.clone(), document_template, fragments, - is_quantized: quantized.unwrap_or_default(), - }); + quantized.unwrap_or_default(), + )); Ok((name, runtime)) }, diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index cfd8c8492..6bb6b1345 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -66,7 +66,7 @@ impl TempIndex { let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.embedding_configs; + let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.runtime_embedders; let mut indexer = indexer::DocumentOperation::new(); match self.index_documents_config.update_method { IndexDocumentsMethod::ReplaceDocuments => { @@ -151,7 +151,7 @@ impl TempIndex { let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = 
db_fields_ids_map.clone(); - let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.embedding_configs; + let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.runtime_embedders; let mut indexer = indexer::DocumentOperation::new(); let external_document_ids: Vec<_> = diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 0a179cfa5..d40e82b92 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -236,8 +236,8 @@ pub fn extract_vector_points( let mut extractors = Vec::new(); - let mut configs = settings_diff.new.embedding_configs.clone().into_inner(); - let old_configs = &settings_diff.old.embedding_configs; + let mut configs = settings_diff.new.runtime_embedders.clone().into_inner(); + let old_configs = &settings_diff.old.runtime_embedders; if reindex_vectors { for (name, action) in settings_diff.embedding_config_updates.iter() { if let Some(action) = action.reindex() { @@ -284,16 +284,16 @@ pub fn extract_vector_points( continue; }; - let fragments = regenerate_fragments + let fragment_diffs = regenerate_fragments .iter() .filter_map(|(name, fragment)| match fragment { crate::vector::settings::RegenerateFragment::Update => { let old_value = old_runtime - .fragments + .fragments() .binary_search_by_key(&name, |fragment| &fragment.name) .ok(); let Ok(new_value) = runtime - .fragments + .fragments() .binary_search_by_key(&name, |fragment| &fragment.name) else { return None; @@ -304,7 +304,7 @@ pub fn extract_vector_points( crate::vector::settings::RegenerateFragment::Remove => None, crate::vector::settings::RegenerateFragment::Add => { let Ok(new_value) = runtime - .fragments + .fragments() .binary_search_by_key(&name, |fragment| &fragment.name) else { return None; @@ -314,8 +314,8 @@ pub fn extract_vector_points( }) .collect(); ExtractionAction::SettingsRegenerateFragments { - old_runtime, - must_regenerate_fragments: fragments, + old_runtime: old_runtime.clone(), + must_regenerate_fragments: fragment_diffs, } } @@ -325,7 +325,9 @@ pub fn extract_vector_points( continue; }; - ExtractionAction::SettingsRegeneratePrompts { old_runtime } + ExtractionAction::SettingsRegeneratePrompts { + old_runtime: old_runtime.clone(), + } } }; @@ -473,11 +475,11 @@ pub fn extract_vector_points( ); continue; } - let has_fragments = !runtime.fragments.is_empty(); + let has_fragments = !runtime.fragments().is_empty(); if has_fragments { regenerate_all_fragments( - &runtime.fragments, + runtime.fragments(), &doc_alloc, new_fields_ids_map, obkv, @@ -492,14 +494,14 @@ pub fn extract_vector_points( old_runtime, } => { if old.must_regenerate() { - let has_fragments = !runtime.fragments.is_empty(); - let old_has_fragments = !old_runtime.fragments.is_empty(); + let has_fragments = !runtime.fragments().is_empty(); + let old_has_fragments = !old_runtime.fragments().is_empty(); let is_adding_fragments = has_fragments && !old_has_fragments; if is_adding_fragments { regenerate_all_fragments( - &runtime.fragments, + runtime.fragments(), &doc_alloc, new_fields_ids_map, obkv, @@ -517,14 +519,16 @@ pub fn extract_vector_points( new_fields_ids_map, ); for (name, (old_index, new_index)) in must_regenerate_fragments { - let Some(new) = runtime.fragments.get(*new_index) else { continue }; + let Some(new) = runtime.fragments().get(*new_index) else { + continue; + }; let new 
= RequestFragmentExtractor::new(new, &doc_alloc).ignore_errors(); let diff = { let old = old_index.as_ref().and_then(|old| { - let old = old_runtime.fragments.get(*old)?; + let old = old_runtime.fragments().get(*old)?; Some( RequestFragmentExtractor::new(old, &doc_alloc) .ignore_errors(), @@ -555,11 +559,11 @@ pub fn extract_vector_points( ); continue; } - let has_fragments = !runtime.fragments.is_empty(); + let has_fragments = !runtime.fragments().is_empty(); if has_fragments { regenerate_all_fragments( - &runtime.fragments, + runtime.fragments(), &doc_alloc, new_fields_ids_map, obkv, @@ -607,7 +611,7 @@ pub fn extract_vector_points( manual_vectors_writer, &mut key_buffer, delta, - &runtime.fragments, + runtime.fragments(), )?; } @@ -720,7 +724,7 @@ fn extract_vector_document_diff( ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id); return Ok(VectorStateDelta::NoChange); } - let has_fragments = !runtime.fragments.is_empty(); + let has_fragments = !runtime.fragments().is_empty(); if has_fragments { let prompt = &runtime.document_template; // Don't give up if the old prompt was failing @@ -753,7 +757,7 @@ fn extract_vector_document_diff( new_fields_ids_map, ); - for new in &runtime.fragments { + for new in runtime.fragments() { let name = &new.name; let fragment = RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); @@ -791,11 +795,11 @@ fn extract_vector_document_diff( return Ok(VectorStateDelta::NoChange); } - let has_fragments = !runtime.fragments.is_empty(); + let has_fragments = !runtime.fragments().is_empty(); if has_fragments { regenerate_all_fragments( - &runtime.fragments, + runtime.fragments(), doc_alloc, new_fields_ids_map, obkv, diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs index cbf4ceba2..b41fd59e1 100644 --- a/crates/milli/src/update/index_documents/extract/mod.rs +++ b/crates/milli/src/update/index_documents/extract/mod.rs @@ -242,7 +242,7 @@ fn send_original_documents_data( let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only()) // no point in indexing vectors without embedders - && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty()); + && (!settings_diff.new.runtime_embedders.inner_as_ref().is_empty()); if index_vectors { let settings_diff = settings_diff.clone(); diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 055b8bbad..658ff1923 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -517,7 +517,7 @@ where let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name); let was_quantized = settings_diff .old - .embedding_configs + .runtime_embedders .get(&embedder_name) .is_some_and(|conf| conf.is_quantized); let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized); diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index 370579a6c..c93e3e0f7 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -673,7 +673,7 @@ pub(crate) fn write_typed_chunk_into_index( let binary_quantized = settings_diff .old - .embedding_configs + .runtime_embedders .get(&embedder_name) .is_some_and(|conf| conf.is_quantized); // FIXME: allow customizing distance diff --git a/crates/milli/src/update/settings.rs 
b/crates/milli/src/update/settings.rs index 03d44d785..c9ab427ea 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1647,9 +1647,9 @@ impl InnerIndexSettingsDiff { // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { - for (embedder_name, runtime) in new_settings.embedding_configs.inner_as_ref() { + for (embedder_name, runtime) in new_settings.runtime_embedders.inner_as_ref() { let was_quantized = old_settings - .embedding_configs + .runtime_embedders .get(embedder_name) .is_some_and(|conf| conf.is_quantized); // skip embedders that don't use document templates @@ -1893,7 +1893,7 @@ pub(crate) struct InnerIndexSettings { pub exact_attributes: HashSet, pub disabled_typos_terms: DisabledTyposTerms, pub proximity_precision: ProximityPrecision, - pub embedding_configs: RuntimeEmbedders, + pub runtime_embedders: RuntimeEmbedders, pub embedder_category_id: HashMap, pub geo_fields_ids: Option<(FieldId, FieldId)>, pub prefix_search: PrefixSearch, @@ -1904,7 +1904,7 @@ impl InnerIndexSettings { pub fn from_index( index: &Index, rtxn: &heed::RoTxn<'_>, - embedding_configs: Option, + runtime_embedders: Option, ) -> Result { let stop_words = index.stop_words(rtxn)?; let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); @@ -1913,13 +1913,13 @@ impl InnerIndexSettings { let mut fields_ids_map = index.fields_ids_map(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); - let embedding_configs = match embedding_configs { + let runtime_embedders = match runtime_embedders { Some(embedding_configs) => embedding_configs, None => embedders(index.embedding_configs().embedding_configs(rtxn)?)?, }; let embedder_category_id = index - .embedder_category_id - .iter(rtxn)? + .embedding_configs() + .iter_embedder_id(rtxn)? 
.map(|r| r.map(|(k, v)| (k.to_string(), v))) .collect::>()?; let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default(); @@ -1960,7 +1960,7 @@ impl InnerIndexSettings { sortable_fields, exact_attributes, proximity_precision, - embedding_configs, + runtime_embedders, embedder_category_id, geo_fields_ids, prefix_search, @@ -2035,12 +2035,12 @@ fn embedders(embedding_configs: Vec) -> Result &EmbeddingConfigs; - fn old_embedders(&self) -> &EmbeddingConfigs; + fn new_embedders(&self) -> &RuntimeEmbedders; + fn old_embedders(&self) -> &RuntimeEmbedders; fn new_embedder_category_id(&self) -> &HashMap; fn embedder_actions(&self) -> &BTreeMap; fn try_for_each_fragment_diff( @@ -2407,12 +2407,12 @@ pub struct FragmentDiff<'a> { } impl SettingsDelta for InnerIndexSettingsDiff { - fn new_embedders(&self) -> &EmbeddingConfigs { - &self.new.embedding_configs + fn new_embedders(&self) -> &RuntimeEmbedders { + &self.new.runtime_embedders } - fn old_embedders(&self) -> &EmbeddingConfigs { - &self.old.embedding_configs + fn old_embedders(&self) -> &RuntimeEmbedders { + &self.old.runtime_embedders } fn new_embedder_category_id(&self) -> &HashMap { diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 87ecd2414..f64223e41 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -742,10 +742,27 @@ pub struct RuntimeEmbedders(HashMap>); pub struct RuntimeEmbedder { pub embedder: Arc, pub document_template: Prompt, - pub fragments: Vec, + fragments: Vec, pub is_quantized: bool, } +impl RuntimeEmbedder { + pub fn new( + embedder: Arc, + document_template: Prompt, + mut fragments: Vec, + is_quantized: bool, + ) -> Self { + fragments.sort_unstable_by(|left, right| left.name.cmp(&right.name)); + Self { embedder, document_template, fragments, is_quantized } + } + + /// The runtime fragments sorted by name. + pub fn fragments(&self) -> &[RuntimeFragment] { + self.fragments.as_slice() + } +} + pub struct RuntimeFragment { pub name: String, pub id: u8, @@ -763,8 +780,8 @@ impl RuntimeEmbedders { } /// Get an embedder configuration and template from its name. 
- pub fn get(&self, name: &str) -> Option> { - self.0.get(name).cloned() + pub fn get(&self, name: &str) -> Option<&Arc> { + self.0.get(name) } pub fn inner_as_ref(&self) -> &HashMap> { @@ -774,6 +791,14 @@ impl RuntimeEmbedders { pub fn into_inner(self) -> HashMap> { self.0 } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } } impl IntoIterator for RuntimeEmbedders { From 119d618a7630963be1ce4dcac9a32da8d32b5ffc Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 00:02:14 +0200 Subject: [PATCH 118/150] Do not "upgrade" regenerate fragments to regenerate prompt --- crates/milli/src/update/settings.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index c9ab427ea..242e083f1 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1679,9 +1679,6 @@ impl InnerIndexSettingsDiff { // fixup reindex to make sure we regenerate all fragments *reindex = match reindex.take() { - Some(ReindexAction::RegenerateFragments(_)) => { - Some(ReindexAction::RegeneratePrompts) - } Some(reindex) => Some(reindex), // We are at least regenerating prompts None => { if write_back.is_none() { From eda309d562701b9d91e3002ac7f6585dc46c2b7d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 00:02:48 +0200 Subject: [PATCH 119/150] make sure fragments are ordered --- crates/milli/src/vector/settings.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 93de37290..4bb4ed92c 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -1150,6 +1150,7 @@ impl SettingsDiff { (left, Setting::NotSet) => left, }; if !regenerate_fragments.is_empty() { + regenerate_fragments.sort_unstable_by(|(left, _), (right, _)| left.cmp(right)); ReindexAction::push_action( reindex_action, ReindexAction::RegenerateFragments(regenerate_fragments), From be640062114d12cfa6c073941f45530dcab988a7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 09:12:18 +0200 Subject: [PATCH 120/150] Fix process export --- .../index-scheduler/src/scheduler/process_export.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 30721065e..2062e1c28 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -150,9 +150,6 @@ impl IndexScheduler { let fields_ids_map = index.fields_ids_map(&index_rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index - .embedding_configs(&index_rtxn) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; // We don't need to keep this one alive as we will // spawn many threads to process the documents @@ -232,17 +229,12 @@ impl IndexScheduler { )); }; - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(docid)); - + for (embedder_name, (embeddings, regenerate)) in embeddings { let embeddings = ExplicitVectors { embeddings: Some( VectorOrArrayOfVectors::from_array_of_vectors(embeddings), ), - regenerate: !user_provided, + regenerate, }; vectors.insert( embedder_name, From
d72e5f5f697a8a0c0dc176284f02e4bb9cb5c767 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 11:29:50 +0200 Subject: [PATCH 121/150] Hide `documentTemplate` and `documentTemplateMaxBytes` when indexing_fragment is defined --- crates/milli/src/vector/settings.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 4bb4ed92c..9ea8d7703 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -1932,8 +1932,18 @@ impl EmbeddingSettings { pooling: Setting::NotSet, api_key: Setting::some_or_not_set(api_key), dimensions: Setting::some_or_not_set(dimensions), - document_template, - document_template_max_bytes, + document_template: if indexing_fragments.is_empty() && search_fragments.is_empty() { + document_template + } else { + Setting::NotSet + }, + document_template_max_bytes: if indexing_fragments.is_empty() + && search_fragments.is_empty() + { + document_template_max_bytes + } else { + Setting::NotSet + }, url: Setting::Set(url), indexing_fragments: if indexing_fragments.is_empty() { Setting::NotSet From 3f5b5df139070e42da0912c9910295985ec17e49 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 11:35:01 +0200 Subject: [PATCH 122/150] Check consistency of fragments --- crates/meilisearch-types/src/settings.rs | 7 ++- crates/milli/src/update/settings.rs | 65 +++++++++++++++++------- crates/milli/src/vector/settings.rs | 6 +++ 3 files changed, 58 insertions(+), 20 deletions(-) diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index d7b163448..9e107a5c3 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -501,8 +501,11 @@ impl Settings { let Setting::Set(mut configs) = self.embedders else { return Ok(self) }; for (name, config) in configs.iter_mut() { let config_to_check = std::mem::take(config); - let checked_config = - milli::update::validate_embedding_settings(config_to_check.inner, name)?; + let checked_config = milli::update::validate_embedding_settings( + config_to_check.inner, + name, + milli::vector::settings::EmbeddingValidationContext::SettingsPartialUpdate, + )?; *config = SettingEmbeddingSettings { inner: checked_config }; } self.embedders = Setting::Set(configs); diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 242e083f1..c2152022b 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -35,8 +35,8 @@ use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::vector::db::{FragmentConfigs, IndexEmbeddingConfig}; use crate::vector::json_template::JsonTemplate; use crate::vector::settings::{ - EmbedderAction, EmbedderSource, EmbeddingSettings, NestingContext, ReindexAction, - SubEmbeddingSettings, WriteBackToDocuments, + EmbedderAction, EmbedderSource, EmbeddingSettings, EmbeddingValidationContext, NestingContext, + ReindexAction, SubEmbeddingSettings, WriteBackToDocuments, }; use crate::vector::{ Embedder, EmbeddingConfig, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment, @@ -1181,13 +1181,20 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { }; embedder_actions.insert(name.clone(), embedder_action); - let new = validate_embedding_settings(updated_settings, &name)?; + let new = validate_embedding_settings( + updated_settings, + &name, + EmbeddingValidationContext::FullSettings, + )?; updated_configs.insert(name, (new, fragments)); } 
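// Illustration (a hedged sketch, not code from this patch): under
// `EmbeddingValidationContext::FullSettings` the fragment consistency rule
// checked below boils down to an XOR over the two fragment maps,
//
//     let inconsistent = indexing_fragments.is_empty() ^ search_fragments.is_empty();
//
// i.e. declaring fragments on only one side (indexing or search) is a user
// error, while `SettingsPartialUpdate` skips the check since a partial
// update may resend `request` without resending the fragments.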
SettingsDiff::UpdateWithoutReindex { updated_settings, quantize } => { tracing::debug!(embedder = name, "update without reindex embedder"); - let new = - validate_embedding_settings(Setting::Set(updated_settings), &name)?; + let new = validate_embedding_settings( + Setting::Set(updated_settings), + &name, + EmbeddingValidationContext::FullSettings, + )?; if quantize { embedder_actions.insert( name.clone(), @@ -1211,7 +1218,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { crate::vector::settings::EmbeddingSettings::apply_default_openai_model( &mut setting, ); - let setting = validate_embedding_settings(setting, &name)?; + let setting = validate_embedding_settings( + setting, + &name, + EmbeddingValidationContext::FullSettings, + )?; embedder_actions.insert( name.clone(), EmbedderAction::with_reindex(ReindexAction::FullReindex, false), @@ -2079,6 +2090,7 @@ fn validate_prompt( pub fn validate_embedding_settings( settings: Setting, name: &str, + context: EmbeddingValidationContext, ) -> Result> { let Setting::Set(settings) = settings else { return Ok(settings) }; let EmbeddingSettings { @@ -2119,10 +2131,10 @@ pub fn validate_embedding_settings( })?; } - if let Some(request) = request.as_ref().set() { - let request = crate::vector::rest::RequestData::new( - request.to_owned(), - indexing_fragments + // if we are working with partial settings, the user could have changed only the `request` and not given again the fragments + if context == EmbeddingValidationContext::FullSettings { + if let Some(request) = request.as_ref().set() { + let indexing_fragments: BTreeMap<_, _> = indexing_fragments .as_ref() .set() .iter() @@ -2130,8 +2142,8 @@ pub fn validate_embedding_settings( .filter_map(|(name, fragment)| { Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) }) - .collect(), - search_fragments + .collect(); + let search_fragments: BTreeMap<_, _> = search_fragments .as_ref() .set() .iter() @@ -2139,12 +2151,29 @@ pub fn validate_embedding_settings( .filter_map(|(name, fragment)| { Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) }) - .collect(), - ) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; - if let Some(response) = response.as_ref().set() { - crate::vector::rest::Response::new(response.to_owned(), &request) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + .collect(); + + let are_fragments_inconsistent = + indexing_fragments.is_empty() ^ search_fragments.is_empty(); + if are_fragments_inconsistent { + return Err(crate::vector::error::NewEmbedderError::rest_inconsistent_fragments( + indexing_fragments.is_empty(), + indexing_fragments, + search_fragments, + )) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into()); + } + + let request = crate::vector::rest::RequestData::new( + request.to_owned(), + indexing_fragments, + search_fragments, + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + if let Some(response) = response.as_ref().set() { + crate::vector::rest::Response::new(response.to_owned(), &request) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + } } } diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 9ea8d7703..b769ce277 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -615,6 +615,12 @@ pub struct SubEmbeddingSettings { pub indexing_embedder: Setting, } +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum 
EmbeddingValidationContext { + FullSettings, + SettingsPartialUpdate, +} + /// Indicates what action should take place during a reindexing operation for an embedder #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum ReindexAction { From ede456c5b0c6021ca9da607b3c7d3cf261a91aac Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 11:35:19 +0200 Subject: [PATCH 123/150] New error: rest inconsistent fragments --- crates/milli/src/vector/error.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index 00d4221e5..b56a5dce9 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; use bumpalo::Bump; use hf_hub::api::sync::ApiError; +use itertools::Itertools as _; use super::parsed_vectors::ParsedVectorsDiff; use super::rest::ConfigurationSource; @@ -453,6 +454,29 @@ impl NewEmbedderError { fault: FaultSource::User, } } + + pub(crate) fn rest_inconsistent_fragments( + indexing_fragments_is_empty: bool, + indexing_fragments: BTreeMap, + search_fragments: BTreeMap, + ) -> NewEmbedderError { + let message = if indexing_fragments_is_empty { + format!("`indexingFragments` is empty, but `searchFragments` declares {} fragments: {}{}\n - Hint: declare at least one fragment in `indexingFragments` or remove fragments from `searchFragments` by setting them to `null`", + search_fragments.len(), + search_fragments.keys().take(3).join(", "), if search_fragments.len() > 3 { ", ..." } else { "" } + ) + } else { + format!("`searchFragments` is empty, but `indexingFragments` declares {} fragments: {}{}\n - Hint: declare at least one fragment in `searchFragments` or remove fragments from `indexingFragments` by setting them to `null`", + indexing_fragments.len(), + indexing_fragments.keys().take(3).join(", "), if indexing_fragments.len() > 3 { ", ..." 
} else { "" } + ) + }; + + Self { + kind: NewEmbedderErrorKind::RestInconsistentFragments { message }, + fault: FaultSource::User, + } + } } #[derive(Debug, Clone, Copy)] @@ -572,6 +596,8 @@ pub enum NewEmbedderErrorKind { CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace }, #[error("cannot infer `dimensions` for an embedder using `indexingFragments`.\n - Note: Specify `dimensions` explicitly or don't use `indexingFragments`.")] RestCannotInferDimensionsForFragment, + #[error("inconsistent fragments: {message}")] + RestInconsistentFragments { message: String }, } pub struct PossibleEmbeddingMistakes { From f6287602e9bbbf69f1296d77db69572cdd1d5990 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 11:35:44 +0200 Subject: [PATCH 124/150] Improve error message when request contains the wrong type of placeholder --- crates/milli/src/vector/rest.rs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index 9477959ad..41e8ca9f9 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -561,6 +561,7 @@ impl Request { Err(error) => { let message = error.error_message("request", REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER); + let message = format!("{message}\n - Note: this template is using a document template, and so expects to contain the placeholder {REQUEST_PLACEHOLDER:?} rather than {REQUEST_FRAGMENT_PLACEHOLDER:?}"); return Err(NewEmbedderError::rest_could_not_parse_template(message)); } }; @@ -592,15 +593,23 @@ impl RequestFromFragments { request: Value, search_fragments: impl IntoIterator, ) -> Result { - let request = - match InjectableValue::new(request, REQUEST_FRAGMENT_PLACEHOLDER, REPEAT_PLACEHOLDER) { - Ok(template) => template, - Err(error) => { - let message = - error.error_message("request", REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER); - return Err(NewEmbedderError::rest_could_not_parse_template(message)); - } - }; + let request = match InjectableValue::new( + request, + REQUEST_FRAGMENT_PLACEHOLDER, + REPEAT_PLACEHOLDER, + ) { + Ok(template) => template, + Err(error) => { + let message = error.error_message( + "request", + REQUEST_FRAGMENT_PLACEHOLDER, + REPEAT_PLACEHOLDER, + ); + let message = format!("{message}\n - Note: this template is using fragments, and so expects to contain the placeholder {REQUEST_FRAGMENT_PLACEHOLDER:?} rather than {REQUEST_PLACEHOLDER:?}"); + + return Err(NewEmbedderError::rest_could_not_parse_template(message)); + } + }; let search_fragments: Result<_, NewEmbedderError> = search_fragments .into_iter() .map(|(name, value)| { From 82a796aea7e0402f605d46b9aabfffd984bdf2b0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 11:36:14 +0200 Subject: [PATCH 125/150] vector settings: fix bug where removed fragments were returned as new --- crates/milli/src/vector/settings.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index b769ce277..1b85dd503 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -2420,8 +2420,17 @@ pub(crate) fn fragments_from_settings( setting: &Setting, ) -> impl Iterator + '_ { let Some(setting) = setting.as_ref().set() else { return Either::Left(None.into_iter()) }; + + let filter_map = |(name, fragment): (&String, &Option)| { + if fragment.is_some() { + Some(name.clone()) + } else { + None + } + }; + + if let
Some(setting) = setting.indexing_fragments.as_ref().set() { - Either::Right(setting.keys().cloned()) + Either::Right(setting.iter().filter_map(filter_map)) } else { let Some(setting) = setting.indexing_embedder.as_ref().set() else { return Either::Left(None.into_iter()); @@ -2429,6 +2438,6 @@ pub(crate) fn fragments_from_settings( let Some(setting) = setting.indexing_fragments.as_ref().set() else { return Either::Left(None.into_iter()); }; - Either::Right(setting.keys().cloned()) + Either::Right(setting.iter().filter_map(filter_map)) } } From 91e77abf4fabfde895a8746fea605c8e87d6653d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 2 Jul 2025 12:15:11 +0200 Subject: [PATCH 126/150] Bump the mini-dashboard to v0.2.20 --- crates/meilisearch/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index fe00d9fee..83eb439d9 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -169,5 +169,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.19/build.zip" -sha1 = "7974430d5277c97f67cf6e95eec6faaac2788834" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.20/build.zip" +sha1 = "82a7ddd7bf14bb5323c3d235d2b62892a98b6a59" From 895db76a517e8b5a4e1d5c2e4457ddd9023453f3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 16:10:05 +0200 Subject: [PATCH 127/150] Fix snaps --- crates/meilisearch/tests/vector/rest.rs | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 87296c36a..e03563bcc 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -1,9 +1,9 @@ use std::collections::BTreeMap; use std::sync::atomic::AtomicUsize; +use std::time::Duration; use meili_snap::{json_string, snapshot}; use reqwest::IntoUrl; -use std::time::Duration; use tokio::sync::mpsc; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, Request, ResponseTemplate}; @@ -408,13 +408,13 @@ async fn bad_request() { .await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" - { - "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found", - "code": "vector_embedding_error", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#vector_embedding_error" - } - "###); + { + "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); // A repeat string appears inside a repeated value let (response, code) = index @@ -437,7 +437,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input.input`: \"{{..}}\" appears nested inside of a value that is itself repeated", + "message": "Error while generating embeddings: user error: in `request.input.input`: \"{{..}}\" appears nested inside of a value that is itself repeated\n - Note: this template 
is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -460,7 +460,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input.repeat`: \"{{..}}\" appears outside of an array", + "message": "Error while generating embeddings: user error: in `request.input.repeat`: \"{{..}}\" appears outside of an array\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -483,7 +483,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #0", + "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #0\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -506,7 +506,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #2", + "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #2\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -529,7 +529,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input[0]`: Expected \"{{text}}\" inside of the repeated value", + "message": "Error while generating embeddings: user error: in `request.input[0]`: Expected \"{{text}}\" inside of the repeated value\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -556,7 +556,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{..}}\", but it was already present in `request.input`", + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{..}}\", but it was already present in `request.input`\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -577,7 
+577,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input`", + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input`\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -598,7 +598,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.repeated.data[1]`: Found \"{{text}}\", but it was already present in `request.repeated.input`", + "message": "Error while generating embeddings: user error: in `request.repeated.data[1]`: Found \"{{text}}\", but it was already present in `request.repeated.input`\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -619,7 +619,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input[0]` (repeated)", + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input[0]` (repeated)\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -920,7 +920,7 @@ async fn bad_settings() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found", + "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" From aa6855cd4ff796f002a18885a00080ac24af31cf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 16:12:23 +0200 Subject: [PATCH 128/150] Vector settings: don't assume which kind of request is asked when looking at a settings update without fragments --- crates/milli/src/update/settings.rs | 122 +++++++++++++++++++++------- 1 file changed, 91 insertions(+), 31 deletions(-) diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index c2152022b..911f51865 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -2131,28 +2131,41 @@ pub fn validate_embedding_settings( })?; } - // if we are working with partial settings, the user could have changed only the `request` and not given again the fragments - if context == EmbeddingValidationContext::FullSettings { - if let Some(request) = 
request.as_ref().set() { - let indexing_fragments: BTreeMap<_, _> = indexing_fragments - .as_ref() - .set() - .iter() - .flat_map(|map| map.iter()) - .filter_map(|(name, fragment)| { - Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) - }) - .collect(); - let search_fragments: BTreeMap<_, _> = search_fragments - .as_ref() - .set() - .iter() - .flat_map(|map| map.iter()) - .filter_map(|(name, fragment)| { - Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) - }) - .collect(); + // used below + enum WithFragments { + Yes { + indexing_fragments: BTreeMap, + search_fragments: BTreeMap, + }, + No, + Maybe, + } + let with_fragments = { + let has_reset = matches!(indexing_fragments, Setting::Reset) + || matches!(search_fragments, Setting::Reset); + let indexing_fragments: BTreeMap<_, _> = indexing_fragments + .as_ref() + .set() + .iter() + .flat_map(|map| map.iter()) + .filter_map(|(name, fragment)| { + Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) + }) + .collect(); + let search_fragments: BTreeMap<_, _> = search_fragments + .as_ref() + .set() + .iter() + .flat_map(|map| map.iter()) + .filter_map(|(name, fragment)| { + Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) + }) + .collect(); + + let has_fragments = !indexing_fragments.is_empty() || !search_fragments.is_empty(); + + if context == EmbeddingValidationContext::FullSettings { let are_fragments_inconsistent = indexing_fragments.is_empty() ^ search_fragments.is_empty(); if are_fragments_inconsistent { @@ -2163,17 +2176,64 @@ pub fn validate_embedding_settings( )) .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into()); } - - let request = crate::vector::rest::RequestData::new( - request.to_owned(), - indexing_fragments, - search_fragments, - ) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; - if let Some(response) = response.as_ref().set() { - crate::vector::rest::Response::new(response.to_owned(), &request) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + } + if has_fragments { + if context == EmbeddingValidationContext::SettingsPartialUpdate + && matches!(document_template, Setting::Set(_)) + { + return Err( + crate::vector::error::NewEmbedderError::rest_document_template_and_fragments( + indexing_fragments.len(), + search_fragments.len(), + ), + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into()); } + WithFragments::Yes { indexing_fragments, search_fragments } + } else if has_reset || context == EmbeddingValidationContext::FullSettings { + WithFragments::No + } else { + // if we are working with partial settings, the user could have changed only the `request` and not given again the fragments + WithFragments::Maybe + } + }; + if let Some(request) = request.as_ref().set() { + let request = match with_fragments { + WithFragments::Yes { indexing_fragments, search_fragments } => { + crate::vector::rest::RequestData::new( + request.to_owned(), + indexing_fragments, + search_fragments, + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) + } + WithFragments::No => crate::vector::rest::RequestData::new( + request.to_owned(), + Default::default(), + Default::default(), + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())), + WithFragments::Maybe => { + let mut indexing_fragments = BTreeMap::new(); + indexing_fragments.insert("test".to_string(), serde_json::json!("test")); + 
crate::vector::rest::RequestData::new( + request.to_owned(), + indexing_fragments, + Default::default(), + ) + .or_else(|_| { + crate::vector::rest::RequestData::new( + request.to_owned(), + Default::default(), + Default::default(), + ) + }) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) + } + }?; + if let Some(response) = response.as_ref().set() { + crate::vector::rest::Response::new(response.to_owned(), &request) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; } } From 7113fcf63a6c344a9211fbbe7a7a8c23ff780689 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 16:17:12 +0200 Subject: [PATCH 129/150] New error --- crates/milli/src/vector/error.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index b56a5dce9..0d737cbfc 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -477,6 +477,19 @@ impl NewEmbedderError { fault: FaultSource::User, } } + + pub(crate) fn rest_document_template_and_fragments( + indexing_fragments_len: usize, + search_fragments_len: usize, + ) -> Self { + Self { + kind: NewEmbedderErrorKind::RestDocumentTemplateAndFragments { + indexing_fragments_len, + search_fragments_len, + }, + fault: FaultSource::User, + } + } } #[derive(Debug, Clone, Copy)] @@ -598,6 +611,8 @@ pub enum NewEmbedderErrorKind { RestCannotInferDimensionsForFragment, #[error("inconsistent fragments: {message}")] RestInconsistentFragments { message: String }, + #[error("cannot pass both fragments and a document template.\n - Note: {indexing_fragments_len} fragments declared in `indexingFragments` and {search_fragments_len} fragments declared in `searchFragments`.\n - Hint: remove the declared fragments or remove the `documentTemplate`")] + RestDocumentTemplateAndFragments { indexing_fragments_len: usize, search_fragments_len: usize }, } pub struct PossibleEmbeddingMistakes { From 428463e45c804a606b4576b500100407bbc5d02e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 16:17:22 +0200 Subject: [PATCH 130/150] Check indexing fragments as well as search fragments --- crates/milli/src/vector/rest.rs | 21 +++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index 41e8ca9f9..7a16f1a1e 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -110,6 +110,13 @@ impl RequestData { Ok(if indexing_fragments.is_empty() && search_fragments.is_empty() { RequestData::Single(Request::new(request)?) } else { + for (name, value) in indexing_fragments { + JsonTemplate::new(value).map_err(|error| { + NewEmbedderError::rest_could_not_parse_template( + error.parsing(&format!(".indexingFragments.{name}")), + ) + })?; + } RequestData::FromFragments(RequestFromFragments::new(request, search_fragments)?)
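// Annotation: the `for (name, value) in indexing_fragments` loop above
// parses each indexing fragment purely to validate it; the resulting
// `JsonTemplate` is dropped on the spot, and the request data is still
// built from the search fragments alone.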
}) } @@ -614,14 +621,12 @@ impl RequestFromFragments { let search_fragments: Result<_, NewEmbedderError> = search_fragments .into_iter() .map(|(name, value)| { - Ok(( - name, - JsonTemplate::new(value).map_err(|error| { - NewEmbedderError::rest_could_not_parse_template( - error.parsing("searchFragments"), - ) - })?, - )) + let json_template = JsonTemplate::new(value).map_err(|error| { + NewEmbedderError::rest_could_not_parse_template( + error.parsing(&format!(".searchFragments.{name}")), + ) + })?; + Ok((name, json_template)) }) .collect(); From 549dc985b8ae6e09306172aa350d5ec11c55cae5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 09:58:41 +0200 Subject: [PATCH 131/150] Old dump import indexer: fix the case where going from Generated to Generated --- .../extract/extract_vector_points.rs | 62 +++++++++---------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index d40e82b92..54fcca75f 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -726,6 +726,35 @@ fn extract_vector_document_diff( } let has_fragments = !runtime.fragments().is_empty(); if has_fragments { + let mut fragment_diff = Vec::new(); + let old_fields_ids_map = old_fields_ids_map.as_fields_ids_map(); + let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); + + let old_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Deletion, + old_fields_ids_map, + ); + + let new_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Addition, + new_fields_ids_map, + ); + + for new in runtime.fragments() { + let name = &new.name; + let fragment = + RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); + + let diff = fragment + .diff_documents(&old_document, &new_document, &()) + .expect("ignoring errors so this cannot fail"); + + fragment_diff.push((name.clone(), diff)); + } + VectorStateDelta::UpdateGeneratedFromFragments(fragment_diff) + } else { let prompt = &runtime.document_template; // Don't give up if the old prompt was failing let old_prompt = Some(&prompt).map(|p| { @@ -741,38 +770,9 @@ fn extract_vector_document_diff( ); VectorStateDelta::NowGenerated(new_prompt) } else { - let mut fragment_diff = Vec::new(); - let old_fields_ids_map = old_fields_ids_map.as_fields_ids_map(); - let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); - - let old_document = crate::update::new::document::KvDelAddDocument::new( - obkv, - DelAdd::Deletion, - old_fields_ids_map, - ); - - let new_document = crate::update::new::document::KvDelAddDocument::new( - obkv, - DelAdd::Addition, - new_fields_ids_map, - ); - - for new in runtime.fragments() { - let name = &new.name; - let fragment = - RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); - - let diff = fragment - .diff_documents(&old_document, &new_document, &()) - .expect("ignoring errors so this cannot fail"); - - fragment_diff.push((name.clone(), diff)); - } - VectorStateDelta::UpdateGeneratedFromFragments(fragment_diff) + tracing::trace!("⏭️ Prompt unmodified, skipping"); + VectorStateDelta::NoChange } - } else { - tracing::trace!("⏭️ Prompt unmodified, skipping"); - VectorStateDelta::NoChange } } else { VectorStateDelta::NowRemoved From a06cb1bfd6a21b283f5aeb7cee7ae0c605580b0c Mon Sep 17 00:00:00 2001 From: 
Louis Dureuil Date: Thu, 3 Jul 2025 10:02:16 +0200 Subject: [PATCH 132/150] Remove `Embed::process_embeddings` and have it be an inherent function of the type that uses it --- .../index_documents/extract/extract_vector_points.rs | 8 -------- crates/milli/src/update/new/extract/vectors/mod.rs | 9 ++++----- crates/milli/src/vector/session.rs | 2 -- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 54fcca75f..677ff93c9 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -1300,12 +1300,4 @@ impl<'doc> OnEmbed<'doc> for WriteGrenadOnEmbed<'_> { crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)) } } - - fn process_embeddings( - &mut self, - _metadata: crate::vector::session::Metadata<'doc>, - _embeddings: Vec, - ) { - unimplemented!("unused") - } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index c08fadb14..f8e0e7cb5 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -452,6 +452,10 @@ impl OnEmbeddingDocumentUpdates<'_, '_> { fn clear_vectors(&self, docid: DocumentId) { self.sender.set_vectors(docid, self.embedder_id, vec![]).unwrap(); } + + fn process_embeddings(&mut self, metadata: Metadata<'_>, embeddings: Vec) { + self.sender.set_vectors(metadata.docid, self.embedder_id, embeddings).unwrap(); + } } impl<'doc> OnEmbed<'doc> for OnEmbeddingDocumentUpdates<'doc, '_> { @@ -469,11 +473,6 @@ impl<'doc> OnEmbed<'doc> for OnEmbeddingDocumentUpdates<'doc, '_> { ) .unwrap(); } - - fn process_embeddings(&mut self, metadata: Metadata<'doc>, embeddings: Vec) { - self.sender.set_vectors(metadata.docid, self.embedder_id, embeddings).unwrap(); - } - fn process_embedding_error( &mut self, error: crate::vector::hf::EmbedError, diff --git a/crates/milli/src/vector/session.rs b/crates/milli/src/vector/session.rs index dd005e993..5f6d68879 100644 --- a/crates/milli/src/vector/session.rs +++ b/crates/milli/src/vector/session.rs @@ -30,8 +30,6 @@ pub trait OnEmbed<'doc> { unused_vectors_distribution: &Self::ErrorMetadata, metadata: &[Metadata<'doc>], ) -> crate::Error; - - fn process_embeddings(&mut self, metadata: Metadata<'doc>, embeddings: Vec); } pub struct EmbedSession<'doc, C, I> { From bbcabc47bda50573a0289b1ff48b3d18e794d8fb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 3 Jul 2025 08:06:38 +0000 Subject: [PATCH 133/150] Update version for the next release (v1.16.0) in Cargo.toml --- Cargo.lock | 34 +++++++++++++++++----------------- Cargo.toml | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index be6aa4b21..ceec0a05e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -580,7 +580,7 @@ source = "git+https://github.com/meilisearch/bbqueue#cbb87cc707b5af415ef203bdaf2 [[package]] name = "benchmarks" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "bumpalo", @@ -770,7 +770,7 @@ dependencies = [ [[package]] name = "build-info" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "time", @@ -1774,7 +1774,7 @@ dependencies = [ [[package]] name = "dump" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "big_s", @@ -2006,7 +2006,7 @@ checksum = 
"37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "file-store" -version = "1.15.2" +version = "1.16.0" dependencies = [ "tempfile", "thiserror 2.0.12", @@ -2028,7 +2028,7 @@ dependencies = [ [[package]] name = "filter-parser" -version = "1.15.2" +version = "1.16.0" dependencies = [ "insta", "nom", @@ -2049,7 +2049,7 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "1.15.2" +version = "1.16.0" dependencies = [ "criterion", "serde_json", @@ -2194,7 +2194,7 @@ dependencies = [ [[package]] name = "fuzzers" -version = "1.15.2" +version = "1.16.0" dependencies = [ "arbitrary", "bumpalo", @@ -2994,7 +2994,7 @@ dependencies = [ [[package]] name = "index-scheduler" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "backoff", @@ -3230,7 +3230,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.15.2" +version = "1.16.0" dependencies = [ "criterion", "serde_json", @@ -3724,7 +3724,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.15.2" +version = "1.16.0" dependencies = [ "insta", "md5", @@ -3735,7 +3735,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.15.2" +version = "1.16.0" dependencies = [ "actix-cors", "actix-http", @@ -3830,7 +3830,7 @@ dependencies = [ [[package]] name = "meilisearch-auth" -version = "1.15.2" +version = "1.16.0" dependencies = [ "base64 0.22.1", "enum-iterator", @@ -3849,7 +3849,7 @@ dependencies = [ [[package]] name = "meilisearch-types" -version = "1.15.2" +version = "1.16.0" dependencies = [ "actix-web", "anyhow", @@ -3884,7 +3884,7 @@ dependencies = [ [[package]] name = "meilitool" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "clap", @@ -3918,7 +3918,7 @@ dependencies = [ [[package]] name = "milli" -version = "1.15.2" +version = "1.16.0" dependencies = [ "allocator-api2 0.3.0", "arroy", @@ -4470,7 +4470,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" -version = "1.15.2" +version = "1.16.0" dependencies = [ "big_s", "serde_json", @@ -7258,7 +7258,7 @@ dependencies = [ [[package]] name = "xtask" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "build-info", diff --git a/Cargo.toml b/Cargo.toml index 835ef497c..3e57563b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ ] [workspace.package] -version = "1.15.2" +version = "1.16.0" authors = [ "Quentin de Quelen ", "Clément Renault ", From 3740755d9c05b6beee8b5c8537b1ba39112c18a8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 10:11:07 +0200 Subject: [PATCH 134/150] Compare to `RawValue::NULL` constant rather than explicit "null" --- crates/milli/src/vector/parsed_vectors.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index 8ff5a2201..b96922bc4 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -151,7 +151,7 @@ impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor { } Ok(Some("embeddings")) => { let value: &RawValue = match map.next_value::<&RawValue>() { - Ok(value) if value.get() == "null" => continue, + Ok(value) if value.get() == RawValue::NULL.get() => continue, Ok(value) => value, Err(error) => { return Ok(Err(RawVectorsError::DeserializeEmbeddings { From 735634e998943adc64a9289272edb3073a3d1e69 Mon Sep 17 
00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 10:32:57 +0200 Subject: [PATCH 135/150] Send owned metadata and clear inputs in case of error --- .../extract/extract_vector_points.rs | 2 +- crates/milli/src/update/new/extract/vectors/mod.rs | 2 +- crates/milli/src/vector/session.rs | 13 ++++++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 677ff93c9..9604c4823 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -1259,7 +1259,7 @@ impl<'doc> OnEmbed<'doc> for WriteGrenadOnEmbed<'_> { error: crate::vector::error::EmbedError, embedder_name: &'doc str, unused_vectors_distribution: &crate::vector::error::UnusedVectorsDistribution, - _metadata: &[crate::vector::session::Metadata<'doc>], + _metadata: bumpalo::collections::Vec<'doc, crate::vector::session::Metadata<'doc>>, ) -> crate::Error { if let FaultSource::Bug = error.fault { crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(error.into())) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index f8e0e7cb5..72a07dea6 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -478,7 +478,7 @@ impl<'doc> OnEmbed<'doc> for OnEmbeddingDocumentUpdates<'doc, '_> { error: crate::vector::hf::EmbedError, embedder_name: &'doc str, unused_vectors_distribution: &UnusedVectorsDistributionBump, - metadata: &[Metadata<'doc>], + metadata: BVec<'doc, Metadata<'doc>>, ) -> crate::Error { if let FaultSource::Bug = error.fault { crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(error.into())) diff --git a/crates/milli/src/vector/session.rs b/crates/milli/src/vector/session.rs index 5f6d68879..b582bd840 100644 --- a/crates/milli/src/vector/session.rs +++ b/crates/milli/src/vector/session.rs @@ -28,7 +28,7 @@ pub trait OnEmbed<'doc> { error: EmbedError, embedder_name: &'doc str, unused_vectors_distribution: &Self::ErrorMetadata, - metadata: &[Metadata<'doc>], + metadata: BVec<'doc, Metadata<'doc>>, ) -> crate::Error; } @@ -143,12 +143,19 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> { Ok(()) } Err(error) => { + // reset metadata and inputs, and send metadata to the error processing. 
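// Annotation: the `std::mem::replace` that follows hands the accumulated
// metadata to the error handler by value, leaving a fresh buffer (sized to
// the inputs' capacity) in `self.metadata`; the inputs themselves are
// cleared so the session can keep being used after a failed batch.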
+ let doc_alloc = self.metadata.bump(); + let metadata = std::mem::replace( + &mut self.metadata, + BVec::with_capacity_in(self.inputs.capacity(), doc_alloc), + ); + self.inputs.clear(); return Err(self.on_embed.process_embedding_error( error, self.embedder_name, unused_vectors_distribution, - &self.metadata, - )) + metadata, + )); } }; self.inputs.clear(); From 87f105747f857449e6fd0562c11eb1716db9bcb0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 10:41:20 +0200 Subject: [PATCH 136/150] Add documentation to `Extractor` trait --- crates/milli/src/vector/extractor.rs | 32 +++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/vector/extractor.rs b/crates/milli/src/vector/extractor.rs index cbfc62ee1..2ab541ac1 100644 --- a/crates/milli/src/vector/extractor.rs +++ b/crates/milli/src/vector/extractor.rs @@ -12,19 +12,41 @@ use crate::update::new::document::Document; use crate::vector::RuntimeFragment; use crate::GlobalFieldsIdsMap; +/// Trait for types that extract embedder inputs from a document. +/// +/// An embedder input can then be sent to an embedder by using an [`super::session::EmbedSession`]. pub trait Extractor<'doc> { - type DocumentMetadata; + /// The embedder input that is extracted from documents by this extractor. + /// + /// The inputs have to be comparable for equality so that diffing is possible. type Input: PartialEq; + + /// The error that can happen while extracting from a document. type Error; + /// Metadata associated with a document. + type DocumentMetadata; + + /// Extract the embedder input from a document and its metadata. fn extract<'a, D: Document<'a> + Debug>( &self, doc: D, meta: &Self::DocumentMetadata, ) -> Result, Self::Error>; + /// Unique `id` associated with this extractor. + /// + /// This will serve to decide where to store the vectors in the vector store. + /// The id should be stable for a given extractor. fn extractor_id(&self) -> u8; + /// The result of diffing the embedder inputs extracted from two versions of a document. + /// + /// # Parameters + /// + /// - `old`: old version of the document + /// - `new`: new version of the document + /// - `meta`: metadata associated to the document fn diff_documents<'a, OD: Document<'a> + Debug, ND: Document<'a> + Debug>( &self, old: OD, @@ -39,6 +61,13 @@ pub trait Extractor<'doc> { to_diff(old_input, new_input) } + /// The result of diffing the embedder inputs extracted from a document by two versions of this extractor. + /// + /// # Parameters + /// + /// - `doc`: the document from which to extract the embedder inputs + /// - `meta`: metadata associated to the document + /// - `old`: If `Some`, the old version of this extractor. If `None`, this is equivalent to calling `ExtractorDiff::Added(self.extract(_))`. fn diff_settings<'a, D: Document<'a> + Debug>( &self, doc: D, @@ -51,6 +80,7 @@ pub trait Extractor<'doc> { to_diff(old_input, new_input) } + /// Returns an extractor wrapping `self` and set to ignore all errors arising from extracting with this extractor. 
fn ignore_errors(self) -> IgnoreErrorExtractor where Self: Sized, From 0ca652de2811d16733018ffc0c9f203d16307eee Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 10:52:30 +0200 Subject: [PATCH 137/150] Extract vector points: remove the { --- .../update/index_documents/extract/extract_vector_points.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 9604c4823..064cfd154 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -882,8 +882,7 @@ fn regenerate_all_fragments<'a>( let name = &new.name; let new = RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); - let diff = - { new.extract(&obkv_document, &()) }.expect("ignoring errors so this cannot fail"); + let diff = new.extract(&obkv_document, &()).expect("ignoring errors so this cannot fail"); if let Some(value) = diff { fragment_diff.push((name.clone(), value)); } From dfe0c8664ee20300d562f6e059bcd33a4bb4c054 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 11:08:31 +0200 Subject: [PATCH 138/150] Add a version of prompt::Context that has no fields --- crates/milli/src/prompt/context.rs | 34 ++++++++++++++------ crates/milli/src/vector/json_template/mod.rs | 3 +- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/crates/milli/src/prompt/context.rs b/crates/milli/src/prompt/context.rs index 84523333a..8958cb693 100644 --- a/crates/milli/src/prompt/context.rs +++ b/crates/milli/src/prompt/context.rs @@ -6,12 +6,18 @@ use liquid::{ObjectView, ValueView}; #[derive(Debug, Clone)] pub struct Context<'a, D: ObjectView, F: ArrayView> { document: &'a D, - fields: &'a F, + fields: Option<&'a F>, } impl<'a, D: ObjectView, F: ArrayView> Context<'a, D, F> { pub fn new(document: &'a D, fields: &'a F) -> Self { - Self { document, fields } + Self { document, fields: Some(fields) } + } +} + +impl<'a, D: ObjectView> Context<'a, D, Vec> { + pub fn without_fields(document: &'a D) -> Self { + Self { document, fields: None } } } @@ -21,17 +27,27 @@ impl ObjectView for Context<'_, D, F> { } fn size(&self) -> i64 { - 2 + if self.fields.is_some() { + 2 + } else { + 1 + } } fn keys<'k>(&'k self) -> Box> + 'k> { - Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s))) + let keys = if self.fields.is_some() { + either::Either::Left(["doc", "fields"]) + } else { + either::Either::Right(["doc"]) + }; + + Box::new(keys.into_iter().map(KStringCow::from_static)) } fn values<'k>(&'k self) -> Box + 'k> { Box::new( std::iter::once(self.document.as_value()) - .chain(std::iter::once(self.fields.as_value())), + .chain(self.fields.iter().map(|fields| fields.as_value())), ) } @@ -40,13 +56,13 @@ impl ObjectView for Context<'_, D, F> { } fn contains_key(&self, index: &str) -> bool { - index == "doc" || index == "fields" + index == "doc" || (index == "fields" && self.fields.is_some()) } fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> { - match index { - "doc" => Some(self.document.as_value()), - "fields" => Some(self.fields.as_value()), + match (index, &self.fields) { + ("doc", _) => Some(self.document.as_value()), + ("fields", Some(fields)) => Some(fields.as_value()), _ => None, } } diff --git a/crates/milli/src/vector/json_template/mod.rs b/crates/milli/src/vector/json_template/mod.rs index 57a3b67b1..d7ce3e8f1 
100644 --- a/crates/milli/src/vector/json_template/mod.rs +++ b/crates/milli/src/vector/json_template/mod.rs @@ -115,8 +115,7 @@ impl JsonTemplate { doc_alloc: &'doc Bump, ) -> Result { let document = ParseableDocument::new(document, doc_alloc); - let v: Vec = vec![]; - let context = crate::prompt::Context::new(&document, &v); + let context = crate::prompt::Context::without_fields(&document); self.render(&context) } From 6b94033c978a86a85135d9bb3cee18d214483d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Jul 2025 11:30:24 +0200 Subject: [PATCH 139/150] Correctly export the chat completions settings in dumps --- crates/dump/src/writer.rs | 24 ++++++++++++++++++- crates/index-scheduler/src/processing.rs | 1 + .../src/scheduler/process_dump_creation.rs | 23 ++++++++++++------ 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/crates/dump/src/writer.rs b/crates/dump/src/writer.rs index 63b006b5c..9f828595a 100644 --- a/crates/dump/src/writer.rs +++ b/crates/dump/src/writer.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use flate2::write::GzEncoder; use flate2::Compression; use meilisearch_types::batches::Batch; -use meilisearch_types::features::{Network, RuntimeTogglableFeatures}; +use meilisearch_types::features::{ChatCompletionSettings, Network, RuntimeTogglableFeatures}; use meilisearch_types::keys::Key; use meilisearch_types::settings::{Checked, Settings}; use serde_json::{Map, Value}; @@ -51,6 +51,10 @@ impl DumpWriter { KeyWriter::new(self.dir.path().to_path_buf()) } + pub fn create_chat_completions_settings(&self) -> Result { + ChatCompletionsSettingsWriter::new(self.dir.path().join("chat-completions-settings")) + } + pub fn create_tasks_queue(&self) -> Result { TaskWriter::new(self.dir.path().join("tasks")) } @@ -104,6 +108,24 @@ impl KeyWriter { } } +pub struct ChatCompletionsSettingsWriter { + path: PathBuf, +} + +impl ChatCompletionsSettingsWriter { + pub(crate) fn new(path: PathBuf) -> Result { + std::fs::create_dir(&path)?; + Ok(ChatCompletionsSettingsWriter { path }) + } + + pub fn push_settings(&mut self, name: &str, settings: &ChatCompletionSettings) -> Result<()> { + let mut settings_file = File::create(self.path.join(name).with_extension("json"))?; + serde_json::to_writer(&mut settings_file, &settings)?; + settings_file.flush()?; + Ok(()) + } +} + pub struct TaskWriter { queue: BufWriter, update_files: PathBuf, diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 2aa7cf859..fdd8e42ef 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -103,6 +103,7 @@ make_enum_progress! { pub enum DumpCreationProgress { StartTheDumpCreation, DumpTheApiKeys, + DumpTheChatCompletionSettings, DumpTheTasks, DumpTheBatches, DumpTheIndexes, diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index a6d785b2f..a6907d739 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -43,7 +43,16 @@ impl IndexScheduler { let rtxn = self.env.read_txn()?; - // 2. dump the tasks + // 2. dump the chat completion settings + // TODO should I skip the export if the chat completion has been disabled? 
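// Annotation: each chat completion settings entry dumped in this step is
// written as a `<name>.json` file under `chat-completions-settings/` by
// the `ChatCompletionsSettingsWriter` introduced earlier in this patch;
// the numbered step comments below shift by one accordingly (tasks become
// step 3, batches step 4, and so on).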
+ progress.update_progress(DumpCreationProgress::DumpTheChatCompletionSettings); + let mut dump_chat_completion_settings = dump.create_chat_completions_settings()?; + for result in self.chat_settings.iter(&rtxn)? { + let (name, chat_settings) = result?; + dump_chat_completion_settings.push_settings(name, &chat_settings)?; + } + + // 3. dump the tasks progress.update_progress(DumpCreationProgress::DumpTheTasks); let mut dump_tasks = dump.create_tasks_queue()?; @@ -81,7 +90,7 @@ impl IndexScheduler { let mut dump_content_file = dump_tasks.push_task(&t.into())?; - // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. + // 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. if let Some(content_file) = content_file { if self.scheduler.must_stop_processing.get() { return Err(Error::AbortedTask); @@ -105,7 +114,7 @@ impl IndexScheduler { } dump_tasks.flush()?; - // 3. dump the batches + // 4. dump the batches progress.update_progress(DumpCreationProgress::DumpTheBatches); let mut dump_batches = dump.create_batches_queue()?; @@ -138,7 +147,7 @@ impl IndexScheduler { } dump_batches.flush()?; - // 4. Dump the indexes + // 5. Dump the indexes progress.update_progress(DumpCreationProgress::DumpTheIndexes); let nb_indexes = self.index_mapper.index_mapping.len(&rtxn)? as u32; let mut count = 0; @@ -178,7 +187,7 @@ impl IndexScheduler { let documents = index .all_documents(&rtxn) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - // 4.1. Dump the documents + // 5.1. Dump the documents for ret in documents { if self.scheduler.must_stop_processing.get() { return Err(Error::AbortedTask); @@ -240,7 +249,7 @@ impl IndexScheduler { atomic.fetch_add(1, Ordering::Relaxed); } - // 4.2. Dump the settings + // 5.2. Dump the settings let settings = meilisearch_types::settings::settings( index, &rtxn, @@ -251,7 +260,7 @@ impl IndexScheduler { Ok(()) })?; - // 5. Dump experimental feature settings + // 6. 
Dump experimental feature settings progress.update_progress(DumpCreationProgress::DumpTheExperimentalFeatures); let features = self.features().runtime_features(); dump.create_experimental_features(features)?; From a051ab3d9ae8ad7bf4262cbf608eb04383a6441d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Jul 2025 12:04:40 +0200 Subject: [PATCH 140/150] Support importing chat completions settings --- crates/dump/src/reader/mod.rs | 9 +++++++++ crates/dump/src/reader/v6/mod.rs | 26 ++++++++++++++++++++++++++ crates/meilisearch/src/lib.rs | 28 +++++++++++++++++----------- 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/crates/dump/src/reader/mod.rs b/crates/dump/src/reader/mod.rs index 2b4440ab7..23e7eec9e 100644 --- a/crates/dump/src/reader/mod.rs +++ b/crates/dump/src/reader/mod.rs @@ -116,6 +116,15 @@ impl DumpReader { } } + pub fn chat_completions_settings( + &mut self, + ) -> Result> + '_>> { + match self { + DumpReader::Current(current) => current.chat_completions_settings(), + DumpReader::Compat(_compat) => Ok(Box::new(std::iter::empty())), + } + } + pub fn features(&self) -> Result> { match self { DumpReader::Current(current) => Ok(current.features()), diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 0b4ba5bdd..0c920aadb 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -1,3 +1,4 @@ +use std::ffi::OsStr; use std::fs::{self, File}; use std::io::{BufRead, BufReader, ErrorKind}; use std::path::Path; @@ -21,6 +22,7 @@ pub type Unchecked = meilisearch_types::settings::Unchecked; pub type Task = crate::TaskDump; pub type Batch = meilisearch_types::batches::Batch; pub type Key = meilisearch_types::keys::Key; +pub type ChatCompletionSettings = meilisearch_types::features::ChatCompletionSettings; pub type RuntimeTogglableFeatures = meilisearch_types::features::RuntimeTogglableFeatures; pub type Network = meilisearch_types::features::Network; @@ -192,6 +194,30 @@ impl V6Reader { ) } + pub fn chat_completions_settings( + &mut self, + ) -> Result> + '_>> { + let entries = fs::read_dir(self.dump.path().join("chat-completions-settings"))?; + Ok(Box::new( + entries + .map(|entry| -> Result> { + let entry = entry?; + let file_name = entry.file_name(); + let path = Path::new(&file_name); + if entry.file_type()?.is_file() && path.extension() == Some(OsStr::new("json")) + { + let name = path.file_stem().unwrap().to_str().unwrap().to_string(); + let file = File::open(entry.path())?; + let settings = serde_json::from_reader(file)?; + Ok(Some((name, settings))) + } else { + Ok(None) + } + }) + .filter_map(|entry| entry.transpose()), + )) + } + pub fn features(&self) -> Option { self.features } diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 871bd688e..b11a4a76d 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -498,14 +498,20 @@ fn import_dump( keys.push(key); } - // 3. Import the runtime features and network + // 3. Import the `ChatCompletionSettings`s. + for result in dump_reader.chat_completions_settings()? { + let (name, settings) = result?; + index_scheduler.put_chat_settings(&name, &settings)?; + } + + // 4. 
From 6e6fd077d42802057198c523b9b39f4dd8a024e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 3 Jul 2025 13:33:56 +0200
Subject: [PATCH 141/150] Ignore unexisting chat completions settings folder

---
 crates/dump/src/reader/v6/mod.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs
index 0c920aadb..449a7e5fe 100644
--- a/crates/dump/src/reader/v6/mod.rs
+++ b/crates/dump/src/reader/v6/mod.rs
@@ -197,7 +197,11 @@ impl V6Reader {
     pub fn chat_completions_settings(
         &mut self,
     ) -> Result<Box<dyn Iterator<Item = Result<(String, ChatCompletionSettings)>> + '_>> {
-        let entries = fs::read_dir(self.dump.path().join("chat-completions-settings"))?;
+        let entries = match fs::read_dir(self.dump.path().join("chat-completions-settings")) {
+            Ok(entries) => entries,
+            Err(e) if e.kind() == ErrorKind::NotFound => return Ok(Box::new(std::iter::empty())),
+            Err(e) => return Err(e.into()),
+        };
         Ok(Box::new(
             entries
                 .map(|entry| -> Result<Option<(String, ChatCompletionSettings)>> {

From 2b75072b0976f6068511dd00fb5e2252ad08280f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3%A9ment=20Renault?=
Date: Thu, 3 Jul 2025 14:04:27 +0200
Subject: [PATCH 142/150] Expose the number of internal chat searches on the
 /metrics route

---
 crates/meilisearch/src/metrics.rs                       | 6 ++++++
 crates/meilisearch/src/routes/chats/chat_completions.rs | 6 ++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/crates/meilisearch/src/metrics.rs b/crates/meilisearch/src/metrics.rs
index 29c1aeae8..1c7d0c3a4 100644
--- a/crates/meilisearch/src/metrics.rs
+++ b/crates/meilisearch/src/metrics.rs
@@ -15,6 +15,12 @@ lazy_static! {
         "Meilisearch number of degraded search requests"
     ))
     .expect("Can't create a metric");
+    pub static ref MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS: IntGauge =
+        register_int_gauge!(opts!(
+            "meilisearch_chat_internal_search_requests",
+            "Meilisearch number of search requests performed by the chat route itself"
+        ))
+        .expect("Can't create a metric");
     pub static ref MEILISEARCH_DB_SIZE_BYTES: IntGauge =
         register_int_gauge!(opts!("meilisearch_db_size_bytes", "Meilisearch DB Size In Bytes"))
             .expect("Can't create a metric");
diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs
index ccbdccbbc..f6030f2bc 100644
--- a/crates/meilisearch/src/routes/chats/chat_completions.rs
+++ b/crates/meilisearch/src/routes/chats/chat_completions.rs
@@ -48,7 +48,9 @@ use crate::analytics::Analytics;
 use crate::error::MeilisearchHttpError;
 use crate::extractors::authentication::policies::ActionPolicy;
 use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _};
-use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
+use crate::metrics::{
+    MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS, MEILISEARCH_DEGRADED_SEARCH_REQUESTS,
+};
 use crate::routes::chats::utils::SseEventSender;
 use crate::routes::indexes::search::search_kind;
 use crate::search::{add_search_rules, prepare_search, search_from_kind, SearchQuery};
@@ -286,7 +288,7 @@ async fn process_search_request(
     let output = output?;
     let mut documents = Vec::new();
     if let Ok((ref rtxn, ref search_result)) = output {
-        // aggregate.succeed(search_result);
+        MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS.inc();
         if search_result.degraded {
             MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc();
         }
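For readers unfamiliar with the `prometheus` crate used in `metrics.rs`: registering through the default registry is what makes a value show up on the `/metrics` route. A freestanding sketch of the life cycle of the gauge added above (the metric name is reused purely for illustration; patch 148 later reworks this metric into a labeled counter):

    use prometheus::{opts, register_int_gauge, Encoder, TextEncoder};

    fn main() {
        // Register against the default registry, like lazy_static! does in metrics.rs.
        let gauge = register_int_gauge!(opts!(
            "meilisearch_chat_internal_search_requests",
            "Meilisearch number of search requests performed by the chat route itself"
        ))
        .expect("Can't create a metric");

        gauge.inc(); // one internal search performed

        // Roughly what a /metrics scrape serializes.
        let mut buffer = Vec::new();
        TextEncoder::new().encode(&prometheus::gather(), &mut buffer).unwrap();
        print!("{}", String::from_utf8(buffer).unwrap());
    }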
From 90e6b6416f301c95536b040bb0f9ae302b08a13a Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Thu, 3 Jul 2025 14:35:02 +0200
Subject: [PATCH 143/150] new extractor bugfixes:

- fix old_has_fragments
- new_is_user_provided is always false when generating fragments, even if no
  fragment ever matches
---
 .../src/update/new/extract/vectors/mod.rs | 24 +++++--------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs
index 72a07dea6..4ca68027c 100644
--- a/crates/milli/src/update/new/extract/vectors/mod.rs
+++ b/crates/milli/src/update/new/extract/vectors/mod.rs
@@ -357,7 +357,7 @@ impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
                     chunks.is_user_provided_must_regenerate(document.docid());
                 let old_has_fragments = old_embedders
                     .get(embedder_name)
-                    .map(|embedder| embedder.fragments().is_empty())
+                    .map(|embedder| !embedder.fragments().is_empty())
                     .unwrap_or_default();

                 let new_has_fragments = chunks.has_fragments();
@@ -628,9 +628,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
                     session.on_embed_mut().clear_vectors(docid);
                 }

-                let mut extracted = false;
-                let extracted = &mut extracted;
-
                 settings_delta.try_for_each_fragment_diff(
                     session.embedder_name(),
                     |fragment_diff| {
@@ -660,7 +657,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
                             );
                         }
                         ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => {
-                            *extracted = true;
                             session.request_embedding(
                                 metadata,
                                 input,
@@ -673,13 +669,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
                         Result::Ok(())
                     },
                 )?;
-                self.set_status(
-                    docid,
-                    old_is_user_provided,
-                    true,
-                    old_is_user_provided & !*extracted,
-                    true,
-                );
+                self.set_status(docid, old_is_user_provided, true, false, true);
             }
             ChunkType::DocumentTemplate { document_template, session } => {
                 let doc_alloc = session.doc_alloc();
@@ -732,7 +722,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
     where
         'a: 'doc,
     {
-        let extracted = match &mut self.kind {
+        match &mut self.kind {
             ChunkType::DocumentTemplate { document_template, session } => {
                 let doc_alloc = session.doc_alloc();
                 let ex = DocumentTemplateExtractor::new(
@@ -785,7 +775,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
             docid,
             old_is_user_provided,
             old_must_regenerate,
-            old_is_user_provided && !extracted,
+            false,
             new_must_regenerate,
         );

@@ -968,7 +958,7 @@ fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>(
     old_must_regenerate: bool,
     session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>,
     unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
-) -> Result<bool>
+) -> Result<()>
 where
     OD: Document<'doc> + Debug,
     ND: Document<'doc> + Debug,
     E: VectorExtractor<'doc>,
     E::Input: Input,
     crate::Error: From<E::Error>,
 {
-    let mut extracted = false;
     for extractor in extractors {
         let new_rendered = extractor.extract(&new_document, meta)?;
         let must_regenerate = if !old_must_regenerate {
@@ -995,7 +984,6 @@ where
         };

         if must_regenerate {
-            extracted = true;
             let metadata =
                 Metadata { docid, external_docid, extractor_id: extractor.extractor_id() };
@@ -1011,7 +999,7 @@ where
         }
     }

-    Ok(extracted)
+    Ok(())
 }

 fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>(
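The `old_has_fragments` fix above is easy to misread, so here is the pitfall in isolation: with `Option::map` plus `unwrap_or_default`, the closure must compute the final boolean directly, and the original closure computed its negation. A standalone demonstration (the struct is a simplified stand-in for an embedder config, not the real type):

    struct Embedder {
        fragments: Vec<String>,
    }

    fn main() {
        let old_embedder = Some(Embedder { fragments: vec!["f1".into()] });

        // Buggy: asks "are the fragments empty?", so an embedder WITH
        // fragments yields false for old_has_fragments.
        let buggy = old_embedder.as_ref().map(|e| e.fragments.is_empty()).unwrap_or_default();

        // Fixed: negate inside the closure; a missing embedder still
        // defaults to false ("no fragments") through unwrap_or_default.
        let fixed = old_embedder.as_ref().map(|e| !e.fragments.is_empty()).unwrap_or_default();

        assert!(!buggy);
        assert!(fixed);
    }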
From 9f0d33ec999920dfdec917aff14604df9f30e6b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 3 Jul 2025 15:05:15 +0200
Subject: [PATCH 144/150] Expose the number of tokens on the chat completions
 routes

---
 crates/meilisearch/src/metrics.rs              |  5 +++
 .../src/routes/chats/chat_completions.rs       | 35 ++++++++++++++++---
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/crates/meilisearch/src/metrics.rs b/crates/meilisearch/src/metrics.rs
index 1c7d0c3a4..9941bacae 100644
--- a/crates/meilisearch/src/metrics.rs
+++ b/crates/meilisearch/src/metrics.rs
@@ -21,6 +21,11 @@ lazy_static! {
         "Meilisearch number of search requests performed by the chat route itself"
     ))
     .expect("Can't create a metric");
+    pub static ref MEILISEARCH_CHAT_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!(
+        opts!("meilisearch_chat_tokens_usage", "Meilisearch Chat Tokens Usage"),
+        &["chat", "model", "type"]
+    )
+    .expect("Can't create a metric");
     pub static ref MEILISEARCH_DB_SIZE_BYTES: IntGauge =
         register_int_gauge!(opts!("meilisearch_db_size_bytes", "Meilisearch DB Size In Bytes"))
             .expect("Can't create a metric");
diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs
index f6030f2bc..a7d878c6e 100644
--- a/crates/meilisearch/src/routes/chats/chat_completions.rs
+++ b/crates/meilisearch/src/routes/chats/chat_completions.rs
@@ -13,9 +13,9 @@ use async_openai::types::{
     ChatCompletionRequestDeveloperMessageContent, ChatCompletionRequestMessage,
     ChatCompletionRequestSystemMessage, ChatCompletionRequestSystemMessageContent,
     ChatCompletionRequestToolMessage, ChatCompletionRequestToolMessageContent,
-    ChatCompletionStreamResponseDelta, ChatCompletionToolArgs, ChatCompletionToolType,
-    CreateChatCompletionRequest, CreateChatCompletionStreamResponse, FinishReason, FunctionCall,
-    FunctionCallStream, FunctionObjectArgs,
+    ChatCompletionStreamOptions, ChatCompletionStreamResponseDelta, ChatCompletionToolArgs,
+    ChatCompletionToolType, CreateChatCompletionRequest, CreateChatCompletionStreamResponse,
+    FinishReason, FunctionCall, FunctionCallStream, FunctionObjectArgs,
 };
 use async_openai::Client;
 use bumpalo::Bump;
@@ -49,7 +49,8 @@ use crate::error::MeilisearchHttpError;
 use crate::extractors::authentication::policies::ActionPolicy;
 use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _};
 use crate::metrics::{
-    MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS, MEILISEARCH_DEGRADED_SEARCH_REQUESTS,
+    MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS, MEILISEARCH_CHAT_TOKENS_USAGE,
+    MEILISEARCH_DEGRADED_SEARCH_REQUESTS,
 };
 use crate::routes::chats::utils::SseEventSender;
 use crate::routes::indexes::search::search_kind;
@@ -490,6 +491,7 @@ async fn streamed_chat(
     let (tx, rx) = tokio::sync::mpsc::channel(10);
     let tx = SseEventSender::new(tx);
+    let workspace_uid = workspace_uid.to_string();
     let _join_handle = Handle::current().spawn(async move {
         let client = Client::with_config(config.clone());
         let mut global_tool_calls = HashMap::<u32, Call>::new();
@@ -499,6 +501,7 @@ async fn streamed_chat(
             let output = run_conversation(
                 &index_scheduler,
                 &auth_ctrl,
+                &workspace_uid,
                 &search_queue,
                 &auth_token,
                 &client,
@@ -536,6 +539,7 @@ async fn run_conversation<C: Config>(
         Data<IndexScheduler>,
     >,
     auth_ctrl: &web::Data<AuthController>,
+    workspace_uid: &str,
     search_queue: &web::Data<SearchQueue>,
     auth_token: &str,
     client: &Client<C>,
@@ -546,12 +550,33 @@ async fn run_conversation<C: Config>(
     function_support: FunctionSupport,
 ) -> Result<ControlFlow<Option<FinishReason>, ()>, SendError<Event>> {
     let mut finish_reason = None;
+    chat_completion.stream_options = Some(ChatCompletionStreamOptions { include_usage: true });
     // safety: unwrap: can only happens if `stream` was set to `false`
     let mut response = client.chat().create_stream(chat_completion.clone()).await.unwrap();

     while let Some(result) = response.next().await {
         match result {
             Ok(resp) => {
-                let choice = &resp.choices[0];
+                let choice = match resp.choices.get(0) {
+                    Some(choice) => choice,
+                    None => {
+                        if let Some(usage) = resp.usage.as_ref() {
+                            for (r#type, value) in &[
+                                ("prompt", usage.prompt_tokens),
+                                ("completion", usage.completion_tokens),
+                                ("total", usage.total_tokens),
+                            ] {
+                                MEILISEARCH_CHAT_TOKENS_USAGE
+                                    .with_label_values(&[
+                                        workspace_uid,
+                                        &chat_completion.model,
+                                        r#type,
+                                    ])
+                                    .inc_by(*value as u64);
+                            }
+                        }
+                        break;
+                    }
+                };
                 finish_reason = choice.finish_reason;
                 let ChatCompletionStreamResponseDelta { ref tool_calls, .. } = &choice.delta;

From b5e41f0e4612eb4c665994a6a33064a2afac8c02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 3 Jul 2025 15:18:16 +0200
Subject: [PATCH 145/150] Fix the Mistral uncompatibility with the usage of
 OpenAI

---
 .../src/routes/chats/chat_completions.rs | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs
index a7d878c6e..ea3077e99 100644
--- a/crates/meilisearch/src/routes/chats/chat_completions.rs
+++ b/crates/meilisearch/src/routes/chats/chat_completions.rs
@@ -549,33 +549,33 @@ async fn run_conversation<C: Config>(
     global_tool_calls: &mut HashMap<u32, Call>,
     function_support: FunctionSupport,
 ) -> Result<ControlFlow<Option<FinishReason>, ()>, SendError<Event>> {
+    use DbChatCompletionSource::*;
+
     let mut finish_reason = None;
-    chat_completion.stream_options = Some(ChatCompletionStreamOptions { include_usage: true });
+    chat_completion.stream_options = match source {
+        OpenAi | AzureOpenAi => Some(ChatCompletionStreamOptions { include_usage: true }),
+        Mistral | VLlm => None,
+    };
+
     // safety: unwrap: can only happens if `stream` was set to `false`
     let mut response = client.chat().create_stream(chat_completion.clone()).await.unwrap();

     while let Some(result) = response.next().await {
         match result {
             Ok(resp) => {
-                let choice = match resp.choices.get(0) {
-                    Some(choice) => choice,
-                    None => {
-                        if let Some(usage) = resp.usage.as_ref() {
-                            for (r#type, value) in &[
-                                ("prompt", usage.prompt_tokens),
-                                ("completion", usage.completion_tokens),
-                                ("total", usage.total_tokens),
-                            ] {
-                                MEILISEARCH_CHAT_TOKENS_USAGE
-                                    .with_label_values(&[
-                                        workspace_uid,
-                                        &chat_completion.model,
-                                        r#type,
-                                    ])
-                                    .inc_by(*value as u64);
-                            }
-                        }
-                        break;
+                if let Some(usage) = resp.usage.as_ref() {
+                    for (r#type, value) in &[
+                        ("prompt", usage.prompt_tokens),
+                        ("completion", usage.completion_tokens),
+                        ("total", usage.total_tokens),
+                    ] {
+                        MEILISEARCH_CHAT_TOKENS_USAGE
+                            .with_label_values(&[workspace_uid, &chat_completion.model, r#type])
+                            .inc_by(*value as u64);
                     }
+                }
+                let choice = match resp.choices.first() {
+                    Some(choice) => choice,
+                    None => break,
                 };
                 finish_reason = choice.finish_reason;
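Patches 144 and 145 converge on one pattern: request token usage only from backends that accept `stream_options`, then read `usage` from whichever chunk carries it (OpenAI sends it in a final chunk whose `choices` array is empty, which is why indexing `choices[0]` unconditionally was unsafe). A condensed sketch of that control flow, with plain stand-in types instead of the `async_openai` ones:

    #[derive(Clone, Copy)]
    enum Source { OpenAi, AzureOpenAi, Mistral, VLlm }

    struct Usage { prompt_tokens: u32, completion_tokens: u32, total_tokens: u32 }
    struct Chunk { choices: Vec<&'static str>, usage: Option<Usage> }

    fn include_usage(source: Source) -> bool {
        // Mistral rejects `stream_options`, so only OpenAI-style backends get it.
        matches!(source, Source::OpenAi | Source::AzureOpenAi)
    }

    fn handle(chunk: &Chunk) {
        // Usage may arrive on any chunk; OpenAI sends it on the last, empty one.
        if let Some(usage) = chunk.usage.as_ref() {
            println!("prompt={} completion={} total={}",
                usage.prompt_tokens, usage.completion_tokens, usage.total_tokens);
        }
        match chunk.choices.first() {
            Some(delta) => println!("delta: {delta}"),
            None => println!("no choices: end of stream"),
        }
    }

    fn main() {
        assert!(include_usage(Source::OpenAi));
        assert!(!include_usage(Source::Mistral));
        let last = Chunk {
            choices: vec![],
            usage: Some(Usage { prompt_tokens: 3, completion_tokens: 5, total_tokens: 8 }),
        };
        handle(&last);
    }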
From 6397ef12a0f42aee7255b046109ed2a63f3f34d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 3 Jul 2025 15:56:56 +0200
Subject: [PATCH 146/150] Use three metrics for the three different tokens

---
 crates/meilisearch/src/metrics.rs              | 20 +++++++++++++++---
 .../src/routes/chats/chat_completions.rs       | 21 ++++++++++---------
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/crates/meilisearch/src/metrics.rs b/crates/meilisearch/src/metrics.rs
index 9941bacae..2207e69ff 100644
--- a/crates/meilisearch/src/metrics.rs
+++ b/crates/meilisearch/src/metrics.rs
@@ -21,9 +21,23 @@ lazy_static! {
         "Meilisearch number of search requests performed by the chat route itself"
     ))
     .expect("Can't create a metric");
-    pub static ref MEILISEARCH_CHAT_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!(
-        opts!("meilisearch_chat_tokens_usage", "Meilisearch Chat Tokens Usage"),
-        &["chat", "model", "type"]
+    pub static ref MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!(
+        opts!("meilisearch_chat_prompt_tokens_usage", "Meilisearch Chat Prompt Tokens Usage"),
+        &["workspace", "model"]
+    )
+    .expect("Can't create a metric");
+    pub static ref MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE: IntCounterVec =
+        register_int_counter_vec!(
+            opts!(
+                "meilisearch_chat_completion_tokens_usage",
+                "Meilisearch Chat Completion Tokens Usage"
+            ),
+            &["workspace", "model"]
+        )
+        .expect("Can't create a metric");
+    pub static ref MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!(
+        opts!("meilisearch_chat_total_tokens_usage", "Meilisearch Chat Total Tokens Usage"),
+        &["workspace", "model"]
     )
     .expect("Can't create a metric");
     pub static ref MEILISEARCH_DB_SIZE_BYTES: IntGauge =
diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs
index ea3077e99..9d132a96f 100644
--- a/crates/meilisearch/src/routes/chats/chat_completions.rs
+++ b/crates/meilisearch/src/routes/chats/chat_completions.rs
@@ -49,7 +49,8 @@ use crate::error::MeilisearchHttpError;
 use crate::extractors::authentication::policies::ActionPolicy;
 use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _};
 use crate::metrics::{
-    MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS, MEILISEARCH_CHAT_TOKENS_USAGE,
+    MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE, MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS,
+    MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE, MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE,
     MEILISEARCH_DEGRADED_SEARCH_REQUESTS,
 };
 use crate::routes::chats::utils::SseEventSender;
@@ -563,15 +564,15 @@ async fn run_conversation<C: Config>(
         match result {
             Ok(resp) => {
                 if let Some(usage) = resp.usage.as_ref() {
-                    for (r#type, value) in &[
-                        ("prompt", usage.prompt_tokens),
-                        ("completion", usage.completion_tokens),
-                        ("total", usage.total_tokens),
-                    ] {
-                        MEILISEARCH_CHAT_TOKENS_USAGE
-                            .with_label_values(&[workspace_uid, &chat_completion.model, r#type])
-                            .inc_by(*value as u64);
-                    }
+                    MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE
+                        .with_label_values(&[workspace_uid, &chat_completion.model])
+                        .inc_by(usage.prompt_tokens as u64);
+                    MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE
+                        .with_label_values(&[workspace_uid, &chat_completion.model])
+                        .inc_by(usage.completion_tokens as u64);
+                    MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE
+                        .with_label_values(&[workspace_uid, &chat_completion.model])
+                        .inc_by(usage.total_tokens as u64);
                 }
                 let choice = match resp.choices.first() {
                     Some(choice) => choice,

From 32dede35c75d91f63da3e6b6935665ac1d2bd941 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Thu, 3 Jul 2025 15:59:14 +0200
Subject: [PATCH 147/150] Update snapshots

---
 .../upgrade_failure/after_processing_everything.snap          | 4 ++--
 .../upgrade_failure/register_automatic_upgrade_task.snap      | 2 +-
 .../registered_a_task_while_the_upgrade_task_is_enqueued.snap | 2 +-
 .../test_failure.rs/upgrade_failure/upgrade_task_failed.snap  | 4 ++--
 .../upgrade_failure/upgrade_task_failed_again.snap            | 4 ++--
 .../upgrade_failure/upgrade_task_succeeded.snap               | 4 ++--
 crates/meilisearch/tests/upgrade/mod.rs                       | 4 ++--
 ...ches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap | 2 +-
 ...ches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap | 2 +-
 ...tches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap | 2 +-
 ...asks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap | 2 +-
 ...asks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap | 2 +-
 ...tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap | 2 +-
 ..._whole_batch_queue_once_everything_has_been_processed.snap | 2 +-
 ...e_whole_task_queue_once_everything_has_been_processed.snap | 2 +-
 15 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap
index ee18cd1db..0b5d4409d 100644
--- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap
+++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap
@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 1 {uid: 1, batch_uid: 1, status: succeeded, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
 2 {uid: 2, batch_uid: 2, status: succeeded, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
 3 {uid: 3, batch_uid: 3, status: failed, error: ResponseError { code: 200, message: "Index `doggo` already exists.", error_code: "index_already_exists", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_already_exists" }, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
@@ -57,7 +57,7 @@ girafo: { number_of_documents: 0, field_distribution: {} }
 [timestamp] [4,]
 ----------------------------------------------------------------------
 ### All Batches:
-0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.15.2"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
+0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.16.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
 1 {uid: 1, details: {"primaryKey":"mouse"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"catto":1}}, stop reason: "created batch containing only task with id 1 of type `indexCreation` that cannot be batched with any other task.", }
 2 {uid: 2, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, stop reason: "created batch containing only task with id 2 of type `indexCreation` that cannot be batched with any other task.", }
 3 {uid: 3, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, stop reason: "created batch containing only task with id 3 of type `indexCreation` that cannot be batched with any other task.", }
diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap
index 9fa30ee2a..0bfb9c6da 100644
--- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap
+++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap
@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 ----------------------------------------------------------------------
 ### Status:
 enqueued [0,]
diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap
index 162798cad..8d374479b 100644
--- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap
+++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap
@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
 ----------------------------------------------------------------------
 ### Status:
diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap
index 8f615cb1c..9fc28abbe 100644
--- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap
+++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap
@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
 ----------------------------------------------------------------------
 ### Status:
@@ -37,7 +37,7 @@ catto [1,]
 [timestamp] [0,]
 ----------------------------------------------------------------------
 ### All Batches:
-0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.15.2"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
+0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.16.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
 ----------------------------------------------------------------------
 ### Batch to tasks mapping:
 0 [0,]
diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap
index a5f9be6e1..33ddf7193 100644
--- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap
+++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap
@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
 2 {uid: 2, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
 ----------------------------------------------------------------------
@@ -40,7 +40,7 @@ doggo [2,]
 [timestamp] [0,]
 ----------------------------------------------------------------------
 ### All Batches:
-0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.15.2"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
+0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.16.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
 ----------------------------------------------------------------------
 ### Batch to tasks mapping:
 0 [0,]
diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap
index eb738d626..05d366d1e 100644
--- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap
+++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap
@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
 2 {uid: 2, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
 3 {uid: 3, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
@@ -43,7 +43,7 @@ doggo [2,3,]
 [timestamp] [0,]
 ----------------------------------------------------------------------
 ### All Batches:
-0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.15.2"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
+0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.16.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
 ----------------------------------------------------------------------
 ### Batch to tasks mapping:
 0 [0,]
diff --git a/crates/meilisearch/tests/upgrade/mod.rs b/crates/meilisearch/tests/upgrade/mod.rs
index 4faa7e0c0..8114ed58b 100644
--- a/crates/meilisearch/tests/upgrade/mod.rs
+++ b/crates/meilisearch/tests/upgrade/mod.rs
@@ -43,7 +43,7 @@ async fn version_too_old() {
     std::fs::write(db_path.join("VERSION"), "1.11.9999").unwrap();
     let options = Opt { experimental_dumpless_upgrade: true, ..default_settings };
     let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err();
-    snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.15.2");
+    snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.16.0");
 }

 #[actix_rt::test]
@@ -58,7 +58,7 @@ async fn version_requires_downgrade() {
     std::fs::write(db_path.join("VERSION"), format!("{major}.{minor}.{patch}")).unwrap();
     let options = Opt { experimental_dumpless_upgrade: true, ..default_settings };
     let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err();
-    snapshot!(err, @"Database version 1.15.3 is higher than the Meilisearch version 1.15.2. Downgrade is not supported");
+    snapshot!(err, @"Database version 1.16.1 is higher than the Meilisearch version 1.16.0. Downgrade is not supported");
 }

 #[actix_rt::test]
diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap
index 4355b9213..f4edae51b 100644
--- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap
+++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap
@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
       "progress": null,
       "details": {
         "upgradeFrom": "v1.12.0",
-        "upgradeTo": "v1.15.2"
+        "upgradeTo": "v1.16.0"
       },
       "stats": {
         "totalNbTasks": 1,
diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap
index 4355b9213..f4edae51b 100644
--- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap
+++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap
@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
       "progress": null,
       "details": {
         "upgradeFrom": "v1.12.0",
-        "upgradeTo": "v1.15.2"
+        "upgradeTo": "v1.16.0"
       },
       "stats": {
         "totalNbTasks": 1,
diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap
index 4355b9213..f4edae51b 100644
--- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap
+++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap
@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
       "progress": null,
       "details": {
         "upgradeFrom": "v1.12.0",
-        "upgradeTo": "v1.15.2"
+        "upgradeTo": "v1.16.0"
       },
       "stats": {
         "totalNbTasks": 1,
diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap
index ebe246ee5..01d2ea341 100644
--- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap
+++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap
@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
       "canceledBy": null,
       "details": {
         "upgradeFrom": "v1.12.0",
-        "upgradeTo": "v1.15.2"
+        "upgradeTo": "v1.16.0"
       },
       "error": null,
       "duration": "[duration]",
diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap
index ebe246ee5..01d2ea341 100644
--- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap
+++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap
@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
       "canceledBy": null,
       "details": {
         "upgradeFrom": "v1.12.0",
-        "upgradeTo": "v1.15.2"
+        "upgradeTo": "v1.16.0"
       },
       "error": null,
       "duration": "[duration]",
diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap
index ebe246ee5..01d2ea341 100644
--- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap
+++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap
@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
       "canceledBy": null,
       "details": {
         "upgradeFrom": "v1.12.0",
-        "upgradeTo": "v1.15.2"
+        "upgradeTo": "v1.16.0"
       },
       "error": null,
       "duration": "[duration]",
diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap
index c2d7967f0..fb62b35da 100644
--- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap
+++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap
@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
       "progress": null,
       "details": {
         "upgradeFrom": "v1.12.0",
-        "upgradeTo": "v1.15.2"
+        "upgradeTo": "v1.16.0"
       },
       "stats": {
         "totalNbTasks": 1,
diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap
index 52da67fef..abb4dcdd9 100644
--- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap
+++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap
@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
       "canceledBy": null,
       "details": {
         "upgradeFrom": "v1.12.0",
-        "upgradeTo": "v1.15.2"
+        "upgradeTo": "v1.16.0"
       },
       "error": null,
       "duration": "[duration]",

From a76a3e8f118a48d1bd57775cf9d509e8374305e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 3 Jul 2025 16:01:31 +0200
Subject: [PATCH 148/150] Change the metric name for the search to use a label

---
 crates/meilisearch/src/metrics.rs                    | 12 +++++++-----
 .../meilisearch/src/routes/chats/chat_completions.rs |  6 +++---
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/crates/meilisearch/src/metrics.rs b/crates/meilisearch/src/metrics.rs
index 2207e69ff..d52e04cc6 100644
--- a/crates/meilisearch/src/metrics.rs
+++ b/crates/meilisearch/src/metrics.rs
@@ -15,12 +15,14 @@ lazy_static! {
         "Meilisearch number of degraded search requests"
     ))
     .expect("Can't create a metric");
-    pub static ref MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS: IntGauge =
-        register_int_gauge!(opts!(
-            "meilisearch_chat_internal_search_requests",
+    pub static ref MEILISEARCH_CHAT_SEARCH_REQUESTS: IntCounterVec = register_int_counter_vec!(
+        opts!(
+            "meilisearch_chat_search_requests",
             "Meilisearch number of search requests performed by the chat route itself"
-        ))
-        .expect("Can't create a metric");
+        ),
+        &["type"]
+    )
+    .expect("Can't create a metric");
     pub static ref MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!(
         opts!("meilisearch_chat_prompt_tokens_usage", "Meilisearch Chat Prompt Tokens Usage"),
         &["workspace", "model"]
diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs
index 9d132a96f..4f7087ae8 100644
--- a/crates/meilisearch/src/routes/chats/chat_completions.rs
+++ b/crates/meilisearch/src/routes/chats/chat_completions.rs
@@ -49,8 +49,8 @@ use crate::error::MeilisearchHttpError;
 use crate::extractors::authentication::policies::ActionPolicy;
 use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _};
 use crate::metrics::{
-    MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE, MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS,
-    MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE, MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE,
+    MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE, MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE,
+    MEILISEARCH_CHAT_SEARCH_REQUESTS, MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE,
     MEILISEARCH_DEGRADED_SEARCH_REQUESTS,
 };
 use crate::routes::chats::utils::SseEventSender;
@@ -290,7 +290,7 @@ async fn process_search_request(
     let output = output?;
     let mut documents = Vec::new();
     if let Ok((ref rtxn, ref search_result)) = output {
-        MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS.inc();
+        MEILISEARCH_CHAT_SEARCH_REQUESTS.with_label_values(&["internal"]).inc();
         if search_result.degraded {
             MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc();
         }
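What the relabeling in patch 148 buys: a single metric family where each request type is one time series, so new kinds of chat-triggered searches need no new registration. A small sketch with the same `prometheus` macros (the "degraded" label value is invented here for illustration; the route itself only emits "internal"):

    use prometheus::{opts, register_int_counter_vec};

    fn main() {
        let requests = register_int_counter_vec!(
            opts!(
                "meilisearch_chat_search_requests",
                "Meilisearch number of search requests performed by the chat route itself"
            ),
            &["type"]
        )
        .expect("Can't create a metric");

        // The chat route increments the "internal" series; other request
        // kinds would become new series under the same metric name.
        requests.with_label_values(&["internal"]).inc();
        requests.with_label_values(&["degraded"]).inc(); // invented label value

        assert_eq!(requests.with_label_values(&["internal"]).get(), 1);
    }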
From 07bfed99e65360d62d17c051f6050c7d422ae455 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Fri, 4 Jul 2025 11:03:14 +0200
Subject: [PATCH 149/150] Expose the host in the analytics

---
 crates/meilisearch/src/routes/export_analytics.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs
index b66a5133b..bf1f667e0 100644
--- a/crates/meilisearch/src/routes/export_analytics.rs
+++ b/crates/meilisearch/src/routes/export_analytics.rs
@@ -1,3 +1,5 @@
+use url::Url;
+
 use crate::analytics::Aggregate;
 use crate::routes::export::Export;

@@ -5,6 +7,7 @@ use crate::routes::export::Export;
 pub struct ExportAnalytics {
     total_received: usize,
     has_api_key: bool,
+    hosts: Vec<String>,
     sum_index_patterns: usize,
     sum_patterns_with_filter: usize,
     sum_patterns_with_override_settings: usize,
@@ -13,8 +16,10 @@ pub struct ExportAnalytics {

 impl ExportAnalytics {
     pub fn from_export(export: &Export) -> Self {
-        let Export { url: _, api_key, payload_size, indexes } = export;
+        let Export { url, api_key, payload_size, indexes } = export;

+        let url = Url::parse(url).ok();
+        let host = url.as_ref().and_then(Url::host_str);
         let has_api_key = api_key.is_some();
         let index_patterns_count = indexes.as_ref().map_or(0, |indexes| indexes.len());
         let patterns_with_filter_count = indexes.as_ref().map_or(0, |indexes| {
@@ -33,6 +38,7 @@ impl ExportAnalytics {
         Self {
             total_received: 1,
             has_api_key,
+            hosts: host.map(ToOwned::to_owned).map_or_else(Default::default, |h| vec![h]),
             sum_index_patterns: index_patterns_count,
             sum_patterns_with_filter: patterns_with_filter_count,
             sum_patterns_with_override_settings: patterns_with_override_settings_count,
@@ -49,6 +55,7 @@ impl Aggregate for ExportAnalytics {
     fn aggregate(mut self: Box<Self>, other: Box<Self>) -> Box<Self> {
         self.total_received += other.total_received;
         self.has_api_key |= other.has_api_key;
+        self.hosts.extend(other.hosts);
         self.sum_index_patterns += other.sum_index_patterns;
         self.sum_patterns_with_filter += other.sum_patterns_with_filter;
         self.sum_patterns_with_override_settings += other.sum_patterns_with_override_settings;
@@ -84,6 +91,7 @@ impl Aggregate for ExportAnalytics {
         serde_json::json!({
             "total_received": self.total_received,
             "has_api_key": self.has_api_key,
+            "hosts": self.hosts,
             "avg_index_patterns": avg_index_patterns,
             "avg_patterns_with_filter": avg_patterns_with_filter,
             "avg_patterns_with_override_settings": avg_patterns_with_override_settings,

From 4c7a6e5c1bd25114dab164fb0fcfd67403012ea9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 7 Jul 2025 10:59:39 +0200
Subject: [PATCH 150/150] Do not leak private URLs

---
 .../src/routes/export_analytics.rs | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs
index bf1f667e0..a2f0a129d 100644
--- a/crates/meilisearch/src/routes/export_analytics.rs
+++ b/crates/meilisearch/src/routes/export_analytics.rs
@@ -7,7 +7,7 @@ use crate::routes::export::Export;
 pub struct ExportAnalytics {
     total_received: usize,
     has_api_key: bool,
-    hosts: Vec<String>,
+    sum_exports_meilisearch_cloud: usize,
     sum_index_patterns: usize,
     sum_patterns_with_filter: usize,
     sum_patterns_with_override_settings: usize,
@@ -19,7 +19,11 @@ impl ExportAnalytics {
         let Export { url, api_key, payload_size, indexes } = export;

         let url = Url::parse(url).ok();
-        let host = url.as_ref().and_then(Url::host_str);
+        let is_meilisearch_cloud = url.as_ref().and_then(Url::host_str).is_some_and(|host| {
+            host.ends_with("meilisearch.dev")
+                || host.ends_with("meilisearch.com")
+                || host.ends_with("meilisearch.io")
+        });
         let has_api_key = api_key.is_some();
         let index_patterns_count = indexes.as_ref().map_or(0, |indexes| indexes.len());
         let patterns_with_filter_count = indexes.as_ref().map_or(0, |indexes| {
@@ -38,7 +42,7 @@ impl ExportAnalytics {
         Self {
             total_received: 1,
             has_api_key,
-            hosts: host.map(ToOwned::to_owned).map_or_else(Default::default, |h| vec![h]),
+            sum_exports_meilisearch_cloud: is_meilisearch_cloud as usize,
             sum_index_patterns: index_patterns_count,
             sum_patterns_with_filter: patterns_with_filter_count,
             sum_patterns_with_override_settings: patterns_with_override_settings_count,
@@ -55,7 +59,7 @@ impl Aggregate for ExportAnalytics {
     fn aggregate(mut self: Box<Self>, other: Box<Self>) -> Box<Self> {
         self.total_received += other.total_received;
         self.has_api_key |= other.has_api_key;
-        self.hosts.extend(other.hosts);
+        self.sum_exports_meilisearch_cloud += other.sum_exports_meilisearch_cloud;
         self.sum_index_patterns += other.sum_index_patterns;
         self.sum_patterns_with_filter += other.sum_patterns_with_filter;
         self.sum_patterns_with_override_settings += other.sum_patterns_with_override_settings;
@@ -70,6 +74,12 @@ impl Aggregate for ExportAnalytics {
             Some(self.payload_sizes.iter().sum::<u64>() / self.payload_sizes.len() as u64)
         };

+        let avg_exports_meilisearch_cloud = if self.total_received == 0 {
+            None
+        } else {
+            Some(self.sum_exports_meilisearch_cloud as f64 / self.total_received as f64)
+        };
+
         let avg_index_patterns = if self.total_received == 0 {
             None
         } else {
@@ -91,7 +101,7 @@ impl Aggregate for ExportAnalytics {
         serde_json::json!({
             "total_received": self.total_received,
             "has_api_key": self.has_api_key,
-            "hosts": self.hosts,
+            "avg_exports_meilisearch_cloud": avg_exports_meilisearch_cloud,
             "avg_index_patterns": avg_index_patterns,
             "avg_patterns_with_filter": avg_patterns_with_filter,
             "avg_patterns_with_override_settings": avg_patterns_with_override_settings,
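A standalone check of the cloud-detection logic above, using the `url` crate exactly as the patch does. Note that plain `ends_with` is deliberately loose suffix matching: it also accepts unrelated domains that merely end in the same string, which is presumably acceptable for coarse, non-leaking analytics (the example URLs are illustrative only):

    use url::Url;

    fn is_meilisearch_cloud(raw: &str) -> bool {
        Url::parse(raw).ok().as_ref().and_then(Url::host_str).is_some_and(|host| {
            host.ends_with("meilisearch.dev")
                || host.ends_with("meilisearch.com")
                || host.ends_with("meilisearch.io")
        })
    }

    fn main() {
        assert!(is_meilisearch_cloud("https://ms-1234abcd.fra.meilisearch.io/indexes"));
        assert!(!is_meilisearch_cloud("https://search.example.org"));
        // Loose suffix matching: this one is counted as cloud too.
        assert!(is_meilisearch_cloud("https://notmeilisearch.io"));
    }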